DPDK patches and discussions
* [dpdk-dev] [PATCH 1/2] net/mlx4: get back RX flow functionality
@ 2017-08-03  8:49 Vasily Philipov
  2017-08-03  8:49 ` [dpdk-dev] [PATCH 2/2] net/mlx4: get back RX offloads Vasily Philipov
  2017-09-25 13:42 ` [dpdk-dev] [PATCH 1/2] net/mlx4: get back RX flow functionality Ferruh Yigit
  0 siblings, 2 replies; 3+ messages in thread
From: Vasily Philipov @ 2017-08-03  8:49 UTC (permalink / raw)
  To: dev; +Cc: Vasily Philipov, Adrien Mazarguil, Nelio Laranjeiro

Access the hardware directly on the RX fast path instead of going
through Verbs calls.

The number of scatter entries is now calculated on the fly, according
to the maximum expected packet size.
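
To illustrate the idea, here is a minimal standalone sketch, not the
exact code added by this patch: log2above() below mirrors the helper
introduced in mlx4_utils.h, while rx_sge_n() and its parameters are
made-up names for the example.

/* Smallest n such that (1 << n) >= v; same as the log2above() helper
 * added to mlx4_utils.h. */
static unsigned int
log2above(unsigned int v)
{
	unsigned int l;
	unsigned int r;

	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
		r |= (v & 1);
	return l + r;
}

/*
 * Log-2 number of SGEs needed for a frame of max_rx_pkt_len bytes,
 * assuming only the first segment loses "headroom" bytes of room.
 */
static unsigned int
rx_sge_n(unsigned int max_rx_pkt_len, unsigned int mb_len,
	 unsigned int headroom)
{
	unsigned int first = mb_len - headroom;
	unsigned int sges = 1;

	if (max_rx_pkt_len > first)
		sges += (max_rx_pkt_len - first + mb_len - 1) / mb_len;
	/* The SGE count is rounded up to a power of two; return its
	 * log-2 so the fast path can scale indices with shifts. */
	return log2above(sges);
}

With a 2048-byte data room, 128 bytes of headroom and a 9000-byte
maximum frame this gives sge_n = 3, i.e. 8 segments per packet.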

Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
The series depends on:

http://dpdk.org/dev/patchwork/patch/27313/
---
 drivers/net/mlx4/mlx4.h       |   3 +
 drivers/net/mlx4/mlx4_prm.h   | 405 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_rxq.c   | 205 ++++++++++-----------
 drivers/net/mlx4/mlx4_rxtx.c  | 266 ++++++++++++++++-----------
 drivers/net/mlx4/mlx4_rxtx.h  |  18 +-
 drivers/net/mlx4/mlx4_utils.h |  20 +++
 6 files changed, 688 insertions(+), 229 deletions(-)
 create mode 100644 drivers/net/mlx4/mlx4_prm.h

diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 1cd4db3..4b7f98b 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -42,6 +42,7 @@
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
 #include <infiniband/verbs.h>
+#include "mlx4_prm.h"
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
@@ -57,6 +58,8 @@
 /* Maximum size for inline data. */
 #define MLX4_PMD_MAX_INLINE 0
 
+#include <rte_ethdev.h>
+
 /*
  * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
  * from which buffers are to be transmitted will have to be mapped by this
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
new file mode 100644
index 0000000..03c1192
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -0,0 +1,405 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX4_PRM_H_
+#define RTE_PMD_MLX4_PRM_H_
+
+#include <arpa/inet.h>
+
+#include <infiniband/arch.h>
+#include <infiniband/driver.h>
+#include <infiniband/verbs.h>
+
+#define MLX4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#if MLX4_GCC_VERSION >= 403
+#	define __MLX4_ALGN_FUNC__ __attribute__((noinline, aligned(64)))
+#	define __MLX4_ALGN_DATA__ __attribute__((aligned(64)))
+#else
+#	define __MLX4_ALGN_FUNC__
+#	define __MLX4_ALGN_DATA__
+#endif
+
+/* Maximum number of physical ports. */
+#define MLX4_PMD_MAX_PHYS_PORTS 2
+
+/* Generic macro to transpose a flag from one bit position to another. */
+#define MLX4_TRANSPOSE(val, from, to) \
+	(((from) >= (to)) ? \
+	 (((val) & (from)) / ((from) / (to))) : \
+	 (((val) & (from)) * ((to) / (from))))
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+enum {
+	MLX4_INVALID_LKEY = 0x100,
+};
+
+enum {
+	MLX4_MAX_BFS_IN_PAGE = 8,
+	MLX4_BFS_STRIDE	     = 512,
+};
+
+enum {
+	MLX4_CQE_L2_TUNNEL_IPV4	   = 1U << 25,
+	MLX4_CQE_L2_TUNNEL_L4_CSUM = 1U << 26,
+	MLX4_CQE_L2_TUNNEL         = 1U << 27,
+	MLX4_CQE_VLAN_PRESENT_MASK = 1U << 29,
+	MLX4_CQE_L2_TUNNEL_IPOK	   = 1U << 31,
+	MLX4_CQE_QPN_MASK          = 0xffffff,
+};
+
+enum {
+	MLX4_QP_TABLE_BITS = 8,
+	MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS,
+	MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
+};
+
+enum {
+	MLX4_XSRQ_TABLE_BITS = 8,
+	MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS,
+	MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1
+};
+
+struct mlx4_wqe_data_seg {
+	uint32_t byte_count;
+	uint32_t lkey;
+	uint64_t addr;
+};
+
+struct mlx4_xsrq_table {
+	struct {
+		struct mlx4_srq **table;
+		int             refcnt;
+	} xsrq_table[MLX4_XSRQ_TABLE_SIZE];
+	pthread_mutex_t         mutex;
+	int                     num_xsrq;
+	int                     shift;
+	int                     mask;
+};
+
+enum qp_cap_cache {
+	MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1 << 1,
+	MLX4_RX_VXLAN                          = 1 << 2
+};
+
+enum mlx4_db_type {
+	MLX4_DB_TYPE_CQ,
+	MLX4_DB_TYPE_RQ,
+	MLX4_NUM_DB_TYPE,
+};
+
+enum mlx4_lock_type {
+	MLX4_SPIN_LOCK = 0,
+	MLX4_MUTEX     = 1,
+};
+
+enum mlx4_lock_state {
+	MLX4_USE_LOCK,
+	MLX4_LOCKED,
+	MLX4_UNLOCKED
+};
+
+struct mlx4_spinlock {
+	pthread_spinlock_t   lock;
+	enum mlx4_lock_state state;
+};
+
+struct mlx4_lock {
+	pthread_mutex_t	     mutex;
+	pthread_spinlock_t   slock;
+	enum mlx4_lock_state state;
+	enum mlx4_lock_type  type;
+};
+
+/* Struct for a BF dedicated to one QP. */
+struct mlx4_dedic_bf {
+	void *address;
+};
+
+/* struct for the common BF which may be shared by many QPs */
+struct mlx4_cmn_bf {
+	void	         *address;
+	/*
+	 * Protect usage of BF address field including data written
+	 * to the BF and the BF buffer toggling.
+	 */
+	struct mlx4_lock lock;
+};
+
+union mlx4_bf {
+	struct mlx4_dedic_bf dedic;
+	struct mlx4_cmn_bf cmn;
+};
+
+struct mlx4_bfs_data {
+	struct mlx4_dedic_bf dedic_bf[MLX4_MAX_BFS_IN_PAGE - 1];
+	struct mlx4_cmn_bf   cmn_bf;
+	uint8_t              dedic_bf_used[MLX4_MAX_BFS_IN_PAGE - 1];
+	uint8_t              dedic_bf_free;
+	/*
+	 * Protect management of dedicated BFs,
+	 * including the dedic_bf_used and
+	 * dedic_bf_free fields.
+	 */
+	struct mlx4_spinlock dedic_bf_lock;
+	void                 *page;
+	uint16_t             buf_size;
+	uint8_t              num_dedic_bfs;
+};
+
+struct mlx4_db_page;
+
+struct mlx4_context {
+	union {
+		struct ibv_context      ibv_ctx;
+	};
+	/* protects send_db_list and send_db_num_uars */
+	struct mlx4_spinlock            send_db_lock;
+	struct list_head                send_db_list;
+	unsigned int                    send_db_num_uars;
+	void                            *uar;
+	struct mlx4_spinlock            uar_lock;
+	struct mlx4_bfs_data            bfs;
+	int                             bf_regs_per_page;
+	int                             max_ctx_res_domain;
+	struct {
+		struct mlx4_qp          **table;
+		int                     refcnt;
+	} qp_table[MLX4_QP_TABLE_SIZE];
+	pthread_mutex_t	                qp_table_mutex;
+	int                             num_qps;
+	int                             qp_table_shift;
+	int                             qp_table_mask;
+	int                             max_qp_wr;
+	int                             max_sge;
+	int                             max_cqe;
+	uint64_t                        exp_device_cap_flags;
+	struct {
+		int                     offset;
+		int                     mult;
+		int                     shift;
+		uint64_t                mask;
+	} core_clk;
+	void                            *hca_core_clock;
+	struct mlx4_xsrq_table          xsrq_table;
+	struct mlx4_db_page             *db_list[MLX4_NUM_DB_TYPE];
+	pthread_mutex_t	                db_list_mutex;
+	int                             cqe_size;
+	int                             prefer_bf;
+	struct mlx4_spinlock            hugetlb_lock;
+	struct list_head                hugetlb_list;
+	int                             stall_enable;
+	pthread_mutex_t	                task_mutex;
+	struct {
+		uint8_t	                valid;
+		uint8_t                 link_layer;
+		enum ibv_port_cap_flags	caps;
+	} port_query_cache[MLX4_PMD_MAX_PHYS_PORTS];
+	pthread_mutex_t	                env_mtx;
+	int                             env_initialized;
+};
+
+struct mlx4_buf {
+	void   *buf;
+	void   *hmem;
+	size_t length;
+	int    base;
+};
+
+struct mlx4_pd {
+	struct ibv_pd ibv_pd;
+	uint32_t      pdn;
+};
+
+struct mlx4_cq {
+	struct ibv_cq    ibv_cq __MLX4_ALGN_DATA__;
+	uint32_t         pattern;
+	struct mlx4_buf	 buf;
+	struct mlx4_buf	 resize_buf;
+	struct mlx4_lock lock;
+	uint32_t         cqn;
+	uint32_t         cons_index;
+	uint32_t         wait_index;
+	uint32_t         wait_count;
+	uint32_t         *set_ci_db;
+	uint32_t         *arm_db;
+	int              arm_sn;
+	int              stall_next_poll;
+	int              stall_enable;
+	int              cqe_size;
+	int              creation_flags;
+	struct mlx4_qp   *last_qp;
+	uint32_t         model_flags; /* use mlx4_cq_model_flags */
+};
+
+struct mlx4_wq {
+	uint64_t         *wrid;
+	struct mlx4_lock lock;
+	int              wqe_cnt;
+	unsigned         max_post;
+	char             *buf;
+	unsigned         head;
+	unsigned         tail;
+	int              max_gs;
+	int              wqe_shift;
+	unsigned         head_en_index;
+	unsigned         head_en_count;
+};
+
+struct mlx4_inlr_rbuff {
+	void *rbuff;
+	int rlen;
+};
+
+struct mlx4_inlr_sg_list {
+	struct mlx4_inlr_rbuff *sg_list;
+	int list_len;
+};
+
+struct mlx4_inlr_buff {
+	struct mlx4_inlr_sg_list *buff;
+	int len;
+};
+
+struct mlx4_qp {
+	struct verbs_qp       verbs_qp;
+	uint32_t              pattern;
+	int                   buf_size;
+	uint32_t              model_flags; /* use mlx4_qp_model_flags */
+	/* hot post send data */
+	struct mlx4_wq        sq __MLX4_ALGN_DATA__;
+	int                   (*post_send_one)(struct ibv_send_wr *wr,
+					       struct mlx4_qp *qp,
+					       void *wqe, int *total_size,
+					       int *inl, unsigned int ind);
+	union mlx4_bf         *bf;
+	uint32_t              *sdb; /* send DB */
+	struct mlx4_buf       buf;
+	unsigned              last_db_head;
+	uint32_t              doorbell_qpn;
+	uint32_t              create_flags;
+	uint16_t              max_inline_data;
+	uint16_t              bf_buf_size;
+	uint16_t              sq_spare_wqes;
+	uint8_t               srcrb_flags_tbl[16];
+	uint8_t               db_method;
+	uint8_t	              qp_type;
+	/* RAW_PACKET hot data */
+	uint8_t               link_layer;
+	uint8_t               is_masked_atomic;
+	/* post receive hot data */
+	struct mlx4_wq        rq __MLX4_ALGN_DATA__;
+	uint32_t              *db;
+	uint32_t              max_inlr_sg;
+	int32_t               cached_rx_csum_flags;
+	int32_t               transposed_rx_csum_flags;
+	struct mlx4_inlr_buff inlr_buff;
+	uint8_t               qp_cap_cache;
+};
+
+struct mlx4_cqe {
+	uint32_t	vlan_my_qpn;
+	uint32_t	immed_rss_invalid;
+	uint32_t	g_mlpath_rqpn;
+	union {
+		struct {
+			union {
+				struct {
+					uint16_t  sl_vid;
+					uint16_t  rlid;
+				};
+				uint32_t  timestamp_16_47;
+			};
+			uint16_t  status;
+			uint8_t   reserved2;
+			uint8_t   badfcs_enc;
+		};
+		struct {
+			uint16_t reserved4;
+			uint8_t  smac[6];
+		};
+	};
+	uint32_t	byte_cnt;
+	uint16_t	wqe_index;
+	uint16_t	checksum;
+	uint8_t		reserved5[1];
+	uint16_t	timestamp_0_15;
+	uint8_t		owner_sr_opcode;
+} __attribute__((packed));
+
+enum {
+	MLX4_CQE_OWNER_MASK       = 0x80,
+	MLX4_CQE_IS_SEND_MASK     = 0x40,
+	MLX4_CQE_INL_SCATTER_MASK = 0x20,
+	MLX4_CQE_OPCODE_MASK      = 0x1f
+};
+
+enum {
+	MLX4_CQE_OPCODE_ERROR  = 0x1e,
+	MLX4_CQE_OPCODE_RESIZE = 0x16,
+};
+
+enum {
+	MLX4_CQE_STATUS_L4_CSUM	= 1 << 2,
+	MLX4_CQE_STATUS_IPV4    = 1 << 6,
+	MLX4_CQE_STATUS_IPV4F   = 1 << 7,
+	MLX4_CQE_STATUS_IPV6    = 1 << 8,
+	MLX4_CQE_STATUS_IPV4OPT	= 1 << 9,
+	MLX4_CQE_STATUS_TCP     = 1 << 10,
+	MLX4_CQE_STATUS_UDP     = 1 << 11,
+	MLX4_CQE_STATUS_IPOK    = 1 << 12
+};
+
+#define to_mxxx(xxx, type) \
+	((struct mlx4_##type *)	\
+	 ((uint8_t *)ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx)))
+
+static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx)
+{
+	return to_mxxx(ctx, context);
+}
+
+static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq)
+{
+	return to_mxxx(cq, cq);
+}
+
+static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
+{
+	return container_of(container_of(ibqp, struct verbs_qp, qp),
+			    struct mlx4_qp, verbs_qp);
+}
+
+#endif /* RTE_PMD_MLX4_PRM_H_ */
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 1456b5f..bbe9c89 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -78,103 +78,73 @@
  */
 static int
 mlx4_rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n,
-		    struct rte_mbuf **pool)
+		    struct rte_mbuf *(*pool)[])
 {
-	unsigned int i;
-	struct rxq_elt (*elts)[elts_n] =
-		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
-				  rxq->socket);
+	unsigned int i = 0;
+	const unsigned int sge_n = 1 << rxq->sge_n;
+	struct rte_mbuf *(*elts)[elts_n] =
+		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, rxq->socket);
 
 	if (elts == NULL) {
 		rte_errno = ENOMEM;
 		ERROR("%p: can't allocate packets array", (void *)rxq);
 		goto error;
 	}
-	/* For each WR (packet). */
-	for (i = 0; (i != elts_n); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct ibv_recv_wr *wr = &elt->wr;
-		struct ibv_sge *sge = &(*elts)[i].sge;
+	rxq->elts = elts;
+	for (; i != elts_n; ++i) {
 		struct rte_mbuf *buf;
+		volatile struct mlx4_wqe_data_seg *scat =
+			&(*rxq->hw.wqes)[i];
 
 		if (pool != NULL) {
-			buf = *(pool++);
+			buf = (*pool)[i];
 			assert(buf != NULL);
 			rte_pktmbuf_reset(buf);
-		} else {
+			rte_pktmbuf_refcnt_update(buf, 1);
+		} else
 			buf = rte_pktmbuf_alloc(rxq->mp);
-		}
 		if (buf == NULL) {
 			rte_errno = ENOMEM;
 			assert(pool == NULL);
 			ERROR("%p: empty mbuf pool", (void *)rxq);
 			goto error;
 		}
-		/*
-		 * Configure WR. Work request ID contains its own index in
-		 * the elts array and the offset between SGE buffer header and
-		 * its data.
-		 */
-		WR_ID(wr->wr_id).id = i;
-		WR_ID(wr->wr_id).offset =
-			(((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) -
-			 (uintptr_t)buf);
-		wr->next = &(*elts)[(i + 1)].wr;
-		wr->sg_list = sge;
-		wr->num_sge = 1;
 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
 		assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
 		/* Buffer is supposed to be empty. */
 		assert(rte_pktmbuf_data_len(buf) == 0);
 		assert(rte_pktmbuf_pkt_len(buf) == 0);
-		/* sge->addr must be able to store a pointer. */
-		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
-		/* SGE keeps its headroom. */
-		sge->addr = (uintptr_t)
-			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
-		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
-		sge->lkey = rxq->mr->lkey;
-		/* Redundant check for tailroom. */
-		assert(sge->length == rte_pktmbuf_tailroom(buf));
-		/*
-		 * Make sure elts index and SGE mbuf pointer can be deduced
-		 * from WR ID.
-		 */
-		if ((WR_ID(wr->wr_id).id != i) ||
-		    ((void *)((uintptr_t)sge->addr -
-			WR_ID(wr->wr_id).offset) != buf)) {
-			rte_errno = EOVERFLOW;
-			ERROR("%p: cannot store index and offset in WR ID",
-			      (void *)rxq);
-			sge->addr = 0;
-			rte_pktmbuf_free(buf);
-			goto error;
-		}
+		assert(!buf->next);
+		/* Only the first segment keeps headroom. */
+		if (i % sge_n)
+			buf->data_off = 0;
+		buf->port = rxq->port_id;
+		buf->data_len = rte_pktmbuf_tailroom(buf);
+		buf->pkt_len = rte_pktmbuf_tailroom(buf);
+		buf->nb_segs = 1;
+		/* scat->addr must be able to store a pointer. */
+		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+		*scat = (struct mlx4_wqe_data_seg){
+			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+			.byte_count = htonl(buf->data_len),
+			.lkey = htonl(rxq->mr->lkey),
+		};
+		(*rxq->elts)[i] = buf;
 	}
-	/* The last WR pointer must be NULL. */
-	(*elts)[(i - 1)].wr.next = NULL;
-	DEBUG("%p: allocated and configured %u single-segment WRs",
-	      (void *)rxq, elts_n);
-	rxq->elts_n = elts_n;
-	rxq->elts_head = 0;
-	rxq->elts = elts;
+	DEBUG("%p: allocated and configured %u segments (max %u packets)",
+	      (void *)rxq, elts_n, elts_n >> rxq->sge_n);
+	rxq->elts_n = log2above(elts_n);
 	return 0;
 error:
-	if (elts != NULL) {
-		assert(pool == NULL);
-		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-			struct rxq_elt *elt = &(*elts)[i];
-			struct rte_mbuf *buf;
-
-			if (elt->sge.addr == 0)
-				continue;
-			assert(WR_ID(elt->wr.wr_id).id == i);
-			buf = (void *)((uintptr_t)elt->sge.addr -
-				WR_ID(elt->wr.wr_id).offset);
-			rte_pktmbuf_free_seg(buf);
-		}
-		rte_free(elts);
+	assert(pool == NULL);
+	elts_n = i;
+	for (i = 0; i != elts_n; ++i) {
+		if ((*rxq->elts)[i] != NULL)
+			rte_pktmbuf_free_seg((*rxq->elts)[i]);
+		(*rxq->elts)[i] = NULL;
 	}
+	rte_free(rxq->elts);
+	rxq->elts = NULL;
 	DEBUG("%p: failed, freed everything", (void *)rxq);
 	assert(rte_errno > 0);
 	return -rte_errno;
@@ -190,26 +160,17 @@
 mlx4_rxq_free_elts(struct rxq *rxq)
 {
 	unsigned int i;
-	unsigned int elts_n = rxq->elts_n;
-	struct rxq_elt (*elts)[elts_n] = rxq->elts;
 
 	DEBUG("%p: freeing WRs", (void *)rxq);
-	rxq->elts_n = 0;
-	rxq->elts = NULL;
-	if (elts == NULL)
+	if (rxq->elts == NULL)
 		return;
-	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct rte_mbuf *buf;
 
-		if (elt->sge.addr == 0)
-			continue;
-		assert(WR_ID(elt->wr.wr_id).id == i);
-		buf = (void *)((uintptr_t)elt->sge.addr -
-			WR_ID(elt->wr.wr_id).offset);
-		rte_pktmbuf_free_seg(buf);
+	for (i = 0; i != (1u << rxq->elts_n); ++i) {
+		if ((*rxq->elts)[i] != NULL)
+			rte_pktmbuf_free_seg((*rxq->elts)[i]);
+		(*rxq->elts)[i] = NULL;
 	}
-	rte_free(elts);
+	rte_free(rxq->elts);
 }
 
 /**
@@ -251,7 +212,8 @@
  *   QP pointer or NULL in case of error and rte_errno is set.
  */
 static struct ibv_qp *
-mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
+mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq,
+		  uint16_t desc, unsigned int sge_n)
 {
 	struct ibv_qp *qp;
 	struct ibv_qp_init_attr attr = {
@@ -265,7 +227,7 @@
 					priv->device_attr.max_qp_wr :
 					desc),
 			/* Max number of scatter/gather elements in a WR. */
-			.max_recv_sge = 1,
+			.max_recv_sge = sge_n,
 		},
 		.qp_type = IBV_QPT_RAW_PACKET,
 	};
@@ -307,26 +269,34 @@
 		.socket = socket
 	};
 	struct ibv_qp_attr mod;
-	struct ibv_recv_wr *bad_wr;
 	unsigned int mb_len;
 	int ret;
 
 	(void)conf; /* Thresholds configuration (ignored). */
 	mb_len = rte_pktmbuf_data_room_size(mp);
-	if (desc == 0) {
-		rte_errno = EINVAL;
-		ERROR("%p: invalid number of RX descriptors", (void *)dev);
-		goto error;
-	}
 	/* Enable scattered packets support for this queue if necessary. */
 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
 	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
 	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
-		;
+		tmpl.sge_n = 0;
 	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
-		WARN("%p: scattered mode has been requested but is"
-		     " not supported, this may lead to packet loss",
-		     (void *)dev);
+		unsigned int sges_n;
+		unsigned int rx_pkt_len =
+				dev->data->dev_conf.rxmode.jumbo_frame ?
+				dev->data->dev_conf.rxmode.max_rx_pkt_len :
+				ETHER_MTU;
+
+		if (rx_pkt_len < ETHER_MTU)
+			rx_pkt_len = ETHER_MTU;
+		/* Only the first mbuf keeps headroom. */
+		rx_pkt_len = rx_pkt_len - mb_len + RTE_PKTMBUF_HEADROOM;
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 */
+		sges_n = (rx_pkt_len / mb_len) + !!(rx_pkt_len / mb_len) + 1;
+		tmpl.sge_n = log2above(sges_n);
+		desc >>= tmpl.sge_n;
 	} else {
 		WARN("%p: the requested maximum Rx packet size (%u) is"
 		     " larger than a single mbuf (%u) and scattered"
@@ -335,6 +305,8 @@
 		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
 		     mb_len - RTE_PKTMBUF_HEADROOM);
 	}
+	DEBUG("%p: number of sges %u (%u WRs)",
+	      (void *)dev, 1 << tmpl.sge_n, desc);
 	/* Use the entire RX mempool as the memory region. */
 	tmpl.mr = mlx4_mp2mr(priv->pd, mp);
 	if (tmpl.mr == NULL) {
@@ -370,7 +342,7 @@
 	      priv->device_attr.max_qp_wr);
 	DEBUG("priv->device_attr.max_sge is %d",
 	      priv->device_attr.max_sge);
-	tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc);
+	tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc, 1 << tmpl.sge_n);
 	if (tmpl.qp == NULL) {
 		ERROR("%p: QP creation failure: %s",
 		      (void *)dev, strerror(rte_errno));
@@ -389,21 +361,6 @@
 		      (void *)dev, strerror(rte_errno));
 		goto error;
 	}
-	ret = mlx4_rxq_alloc_elts(&tmpl, desc, NULL);
-	if (ret) {
-		ERROR("%p: RXQ allocation failed: %s",
-		      (void *)dev, strerror(rte_errno));
-		goto error;
-	}
-	ret = ibv_post_recv(tmpl.qp, &(*tmpl.elts)[0].wr, &bad_wr);
-	if (ret) {
-		rte_errno = ret;
-		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
-		      (void *)dev,
-		      (void *)bad_wr,
-		      strerror(rte_errno));
-		goto error;
-	}
 	mod = (struct ibv_qp_attr){
 		.qp_state = IBV_QPS_RTR
 	};
@@ -414,14 +371,32 @@
 		      (void *)dev, strerror(rte_errno));
 		goto error;
 	}
+	/* Initialize HW-dependent fields. */
+	tmpl.hw.wqes =
+		(volatile struct mlx4_wqe_data_seg (*)[])
+		(uintptr_t)to_mqp(tmpl.qp)->rq.buf;
+	tmpl.hw.rq_db =
+		(volatile uint32_t *)
+		(uintptr_t)to_mqp(tmpl.qp)->db;
+	tmpl.hw.rq_ci = 0;
 	/* Save port ID. */
 	tmpl.port_id = dev->data->port_id;
 	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+	ret = mlx4_rxq_alloc_elts(&tmpl, desc << tmpl.sge_n, NULL);
+	if (ret) {
+		ERROR("%p: RXQ allocation failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
 	/* Clean up rxq in case we're reinitializing it. */
 	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
 	mlx4_rxq_cleanup(rxq);
 	*rxq = tmpl;
 	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+	/* Update doorbell counter. */
+	rxq->hw.rq_ci = desc;
+	rte_wmb();
+	*rxq->hw.rq_db = htonl(rxq->hw.rq_ci);
 	return 0;
 error:
 	ret = rte_errno;
@@ -459,6 +434,12 @@
 	struct rxq *rxq = (*priv->rxqs)[idx];
 	int ret;
 
+	if (!rte_is_power_of_2(desc)) {
+		desc = 1 << log2above(desc);
+		WARN("%p: increased number of descriptors in RX queue %u"
+		     " to the next power of two (%d)",
+		     (void *)dev, idx, desc);
+	}
 	DEBUG("%p: configuring queue %u for %u descriptors",
 	      (void *)dev, idx, desc);
 	if (idx >= priv->rxqs_n) {
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 944cf48..f11c84c 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -348,9 +348,73 @@
 }
 
 /**
- * DPDK callback for Rx.
+ * Get next cqe from HW.
  *
- * The following function doesn't manage scattered packets.
+ * @param cq
+ *   Pointer to CQ structure.
+ *
+ * @return
+ *   Pointer to the CQ element or NULL in case there is no one.
+ */
+static inline struct mlx4_cqe *
+mlx4_cq_get_next_cqe(struct mlx4_cq *cq)
+{
+	int cqe_off;
+	struct mlx4_cqe *cqe;
+	const int cqe_size = cq->cqe_size;
+
+	/* CQE offset is 32 bytes when cqe_size is 64. */
+	cqe_off = (cqe_size & 64) >> 1;
+	cqe = (struct mlx4_cqe *)
+		((uint8_t *)cq->buf.buf +
+		(cq->cons_index & cq->ibv_cq.cqe) * cqe_size +
+		cqe_off);
+	/* Return NULL if HW hasn't produced cqe */
+	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+	    !!(cq->cons_index & (cq->ibv_cq.cqe + 1)))
+		return NULL;
+	return cqe;
+}
+
+/**
+ * Poll one CQE from CQ.
+ *
+ * @param rxq
+ *   Pointer to the RX queue structure.
+ * @param[out] out
+ *   CQE that has just been polled.
+ *
+ * @return
+ *   byte_cnt of the cqe, 0 in case there is no completion,
+ *   negative on failure.
+ */
+static int
+mlx4_cq_poll_one(struct rxq *rxq,
+		 struct mlx4_cqe **out)
+{
+	int ret = 0;
+	struct mlx4_cqe *cqe;
+	struct mlx4_cq *cq = to_mcq(rxq->cq);
+
+	cqe = mlx4_cq_get_next_cqe(cq);
+	if (cqe) {
+		/*
+		 * Make sure we read CQ entry contents after we've checked the
+		 * ownership bit.
+		 */
+		rte_rmb();
+		assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+		assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+		       MLX4_CQE_OPCODE_ERROR);
+		ret = ntohl(cqe->byte_cnt);
+		++cq->cons_index;
+	}
+	*out = cqe;
+	return ret;
+}
+
+/**
+ * DPDK callback for RX with scattered packets support.
  *
  * @param dpdk_rxq
  *   Generic pointer to Rx queue structure.
@@ -365,121 +429,109 @@
 uint16_t
 mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_wc wcs[pkts_n];
-	struct ibv_recv_wr *wr_head = NULL;
-	struct ibv_recv_wr **wr_next = &wr_head;
-	struct ibv_recv_wr *wr_bad = NULL;
-	unsigned int i;
-	unsigned int pkts_ret = 0;
-	int ret;
+	struct rxq *rxq = dpdk_rxq;
+	const unsigned int wr_cnt = (1 << rxq->elts_n) - 1;
+	const unsigned int sge_n = rxq->sge_n;
+	struct rte_mbuf *pkt = NULL;
+	struct rte_mbuf *seg = NULL;
+	unsigned int i = 0;
+	unsigned int rq_ci = (rxq->hw.rq_ci << sge_n);
+	int len = 0;
 
-	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-	if (unlikely(ret == 0))
-		return 0;
-	if (unlikely(ret < 0)) {
-		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-		      (void *)rxq, ret);
-		return 0;
-	}
-	assert(ret <= (int)pkts_n);
-	/* For each work completion. */
-	for (i = 0; i != (unsigned int)ret; ++i) {
-		struct ibv_wc *wc = &wcs[i];
-		struct rxq_elt *elt = &(*elts)[elts_head];
-		struct ibv_recv_wr *wr = &elt->wr;
-		uint64_t wr_id = wr->wr_id;
-		uint32_t len = wc->byte_len;
-		struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
-			WR_ID(wr_id).offset);
-		struct rte_mbuf *rep;
+	while (pkts_n) {
+		struct mlx4_cqe *cqe;
+		unsigned int idx = rq_ci & wr_cnt;
+		struct rte_mbuf *rep = (*rxq->elts)[idx];
+		volatile struct mlx4_wqe_data_seg *scat =
+					&(*rxq->hw.wqes)[idx];
 
-		/* Sanity checks. */
-		assert(WR_ID(wr_id).id < rxq->elts_n);
-		assert(wr_id == wc->wr_id);
-		assert(wr->sg_list == &elt->sge);
-		assert(wr->num_sge == 1);
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		/*
-		 * Fetch initial bytes of packet descriptor into a
-		 * cacheline while allocating rep.
-		 */
-		rte_mbuf_prefetch_part1(seg);
-		rte_mbuf_prefetch_part2(seg);
-		/* Link completed WRs together for repost. */
-		*wr_next = wr;
-		wr_next = &wr->next;
-		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-			/* Whatever, just repost the offending WR. */
-			DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work completion"
-			      " status (%d): %s",
-			      (void *)rxq, wr_id, wc->status,
-			      ibv_wc_status_str(wc->status));
-			/* Increment dropped packets counter. */
-			++rxq->stats.idropped;
-			goto repost;
-		}
+		/* Update the 'next' pointer of the previous segment */
+		if (pkt)
+			seg->next = rep;
+		seg = rep;
+		rte_prefetch0(seg);
+		rte_prefetch0(scat);
 		rep = rte_mbuf_raw_alloc(rxq->mp);
 		if (unlikely(rep == NULL)) {
-			/*
-			 * Unable to allocate a replacement mbuf,
-			 * repost WR.
-			 */
-			DEBUG("rxq=%p, wr_id=%" PRIu32 ":"
-			      " can't allocate a new mbuf",
-			      (void *)rxq, WR_ID(wr_id).id);
-			/* Increase out of memory counters. */
 			++rxq->stats.rx_nombuf;
-			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-			goto repost;
+			if (!pkt) {
+				/*
+				 * no buffers before we even started,
+				 * bail out silently.
+				 */
+				break;
+			}
+			while (pkt != seg) {
+				assert(pkt != (*rxq->elts)[idx]);
+				rep = pkt->next;
+				pkt->next = NULL;
+				pkt->nb_segs = 1;
+				rte_mbuf_raw_free(pkt);
+				pkt = rep;
+			}
+			break;
 		}
-		/* Reconfigure sge to use rep instead of seg. */
-		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-		assert(elt->sge.lkey == rxq->mr->lkey);
-		WR_ID(wr->wr_id).offset =
-			(((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) -
-			 (uintptr_t)rep);
-		assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);
-		/* Update seg information. */
-		seg->data_off = RTE_PKTMBUF_HEADROOM;
-		seg->nb_segs = 1;
-		seg->port = rxq->port_id;
-		seg->next = NULL;
-		seg->pkt_len = len;
+		if (!pkt) {
+			/* Look for a new packet. */
+			len = mlx4_cq_poll_one(rxq, &cqe);
+			if (!len) {
+				rte_mbuf_raw_free(rep);
+				break;
+			}
+			if (unlikely(len < 0)) {
+				/* RX error, packet is likely too large. */
+				rte_mbuf_raw_free(rep);
+				++rxq->stats.idropped;
+				goto skip;
+			}
+			pkt = seg;
+			pkt->packet_type = 0;
+			pkt->ol_flags = 0;
+			pkt->pkt_len = len;
+		}
+		rep->nb_segs = 1;
+		rep->port = rxq->port_id;
+		rep->data_len = seg->data_len;
+		rep->data_off = seg->data_off;
+		(*rxq->elts)[idx] = rep;
+		/*
+		 * Fill NIC descriptor with the new buffer. The lkey and size
+		 * of the buffers are already known, only the buffer address
+		 * changes.
+		 */
+		scat->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
+		if (len > seg->data_len) {
+			len -= seg->data_len;
+			++pkt->nb_segs;
+			++rq_ci;
+			continue;
+		}
+		/* The last segment. */
 		seg->data_len = len;
-		seg->packet_type = 0;
-		seg->ol_flags = 0;
+		/* Increment bytes counter. */
+		rxq->stats.ibytes += pkt->pkt_len;
 		/* Return packet. */
-		*(pkts++) = seg;
-		++pkts_ret;
-		/* Increase bytes counter. */
-		rxq->stats.ibytes += len;
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
+		*(pkts++) = pkt;
+		pkt = NULL;
+		--pkts_n;
+		++i;
+skip:
+		/* Align consumer index to the next stride. */
+		rq_ci >>= sge_n;
+		++rq_ci;
+		rq_ci <<= sge_n;
 	}
-	if (unlikely(i == 0))
+	if (unlikely((i == 0) && ((rq_ci >> sge_n) == rxq->hw.rq_ci)))
 		return 0;
-	/* Repost WRs. */
-	*wr_next = NULL;
-	assert(wr_head);
-	ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: recv_burst(): failed (ret=%d)",
-		      (void *)rxq->priv,
-		      ret);
-		abort();
-	}
-	rxq->elts_head = elts_head;
-	/* Increase packets counter. */
-	rxq->stats.ipackets += pkts_ret;
-	return pkts_ret;
+	/* Update the consumer index. */
+	rxq->hw.rq_ci = rq_ci >> sge_n;
+	rte_wmb();
+	*rxq->hw.rq_db = htonl(rxq->hw.rq_ci);
+	*to_mcq(rxq->cq)->set_ci_db =
+		htonl(to_mcq(rxq->cq)->cons_index & 0xffffff);
+	/* Increment packets counter. */
+	rxq->stats.ipackets += i;
+	return i;
 }
 
 /**
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index a3d972b..077fdd8 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -70,13 +70,6 @@ struct mlx4_rxq_stats {
 	uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
 };
 
-/** Rx element. */
-struct rxq_elt {
-	struct ibv_recv_wr wr; /**< Work request. */
-	struct ibv_sge sge; /**< Scatter/gather element. */
-	/* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */
-};
-
 /** Rx queue descriptor. */
 struct rxq {
 	struct priv *priv; /**< Back pointer to private data. */
@@ -86,9 +79,14 @@ struct rxq {
 	struct ibv_qp *qp; /**< Queue pair. */
 	struct ibv_comp_channel *channel; /**< Rx completion channel. */
 	unsigned int port_id; /**< Port ID for incoming packets. */
-	unsigned int elts_n; /**< (*elts)[] length. */
-	unsigned int elts_head; /**< Current index in (*elts)[]. */
-	struct rxq_elt (*elts)[]; /**< Rx elements. */
+	unsigned int elts_n; /**< Log 2 of Mbufs. */
+	struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+	struct {
+		volatile struct mlx4_wqe_data_seg(*wqes)[];
+		volatile uint32_t *rq_db;
+		uint16_t rq_ci;
+	} hw;
+	unsigned int sge_n; /**< Log 2 of SGEs number. */
 	struct mlx4_rxq_stats stats; /**< Rx queue counters. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
 };
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index e74b61b..a37a3e5 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -102,4 +102,24 @@
 
 int mlx4_fd_set_non_blocking(int fd);
 
+/**
+ * Return the ceiling of the binary logarithm of the input value.
+ *
+ * @param v
+ *   Input value.
+ *
+ * @return
+ *   Log-2 of the nearest power of two at or above the input value.
+ */
+static inline unsigned int
+log2above(unsigned int v)
+{
+	unsigned int l;
+	unsigned int r;
+
+	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
+		r |= (v & 1);
+	return l + r;
+}
+
 #endif /* MLX4_UTILS_H_ */
-- 
1.8.3.1

* [dpdk-dev] [PATCH 2/2] net/mlx4: get back RX offloads
  2017-08-03  8:49 [dpdk-dev] [PATCH 1/2] net/mlx4: get back RX flow functionality Vasily Philipov
@ 2017-08-03  8:49 ` Vasily Philipov
  2017-09-25 13:42 ` [dpdk-dev] [PATCH 1/2] net/mlx4: get back RX flow functionality Ferruh Yigit
  1 sibling, 0 replies; 3+ messages in thread
From: Vasily Philipov @ 2017-08-03  8:49 UTC (permalink / raw)
  To: dev; +Cc: Vasily Philipov, Adrien Mazarguil, Nelio Laranjeiro

Add support for RX checksum and packet type HW offloads.
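
The translation from completion flags to mbuf flags relies on the
branch-free bit transposition macro added in the previous patch. Below
is a minimal standalone sketch of that construct; only the CQE bit
value matches MLX4_CQE_STATUS_IPOK from mlx4_prm.h, the mbuf flag bit
position is made up for illustration.

#include <inttypes.h>
#include <stdio.h>

/* Move the bits selected by "from" to the position of "to" by scaling;
 * same construct as the MLX4_TRANSPOSE() macro in mlx4_prm.h. */
#define TRANSPOSE(val, from, to) \
	(((from) >= (to)) ? \
	 (((val) & (from)) / ((from) / (to))) : \
	 (((val) & (from)) * ((to) / (from))))

int main(void)
{
	const uint32_t CQE_STATUS_IPOK = UINT32_C(1) << 12; /* CQE bit */
	const uint32_t IP_CKSUM_GOOD = UINT32_C(1) << 7; /* hypothetical */
	uint32_t flags = CQE_STATUS_IPOK; /* as gathered from a CQE */
	uint32_t ol = TRANSPOSE(flags, CQE_STATUS_IPOK, IP_CKSUM_GOOD);

	printf("ol_flags = 0x%" PRIx32 "\n", ol); /* prints 0x80 */
	return 0;
}

The same pattern sets several mbuf bits at once without branching,
which is why rxq_cq_to_pkt_type() and rxq_cq_to_ol_flags() below boil
down to an OR of transposed flags.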

Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
 drivers/net/mlx4/mlx4.c        |  11 ++++
 drivers/net/mlx4/mlx4.h        |   2 +
 drivers/net/mlx4/mlx4_ethdev.c |   7 ++-
 drivers/net/mlx4/mlx4_prm.h    |   9 ++++
 drivers/net/mlx4/mlx4_rxq.c    |   5 ++
 drivers/net/mlx4/mlx4_rxtx.c   | 120 ++++++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h   |   2 +
 7 files changed, 153 insertions(+), 3 deletions(-)

diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 8573e14..6842df1 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -571,6 +571,17 @@ struct mlx4_conf {
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
 		priv->vf = vf;
+		priv->hw_csum =
+			((to_mctx(ctx)->exp_device_cap_flags &
+			  MLX4_DEVICE_RX_CSUM_TCP_UDP_PKT) &&
+			 (to_mctx(ctx)->exp_device_cap_flags &
+			  MLX4_DEVICE_RX_CSUM_IP_PKT));
+		DEBUG("checksum offloading is %ssupported",
+		      (priv->hw_csum ? "" : "not "));
+		priv->hw_csum_l2tun = !!(to_mctx(ctx)->exp_device_cap_flags &
+					 MLX4_DEVICE_VXLAN_SUPPORT);
+		DEBUG("L2 tunnel checksum offloads are %ssupported",
+		      (priv->hw_csum_l2tun ? "" : "not "));
 		/* Configure the first MAC address by default. */
 		if (mlx4_get_mac(priv, &mac.addr_bytes)) {
 			ERROR("cannot get MAC address, is mlx4_en loaded?"
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 4b7f98b..b81dcb0 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -107,6 +107,8 @@ struct priv {
 	unsigned int vf:1; /* This is a VF device. */
 	unsigned int intr_alarm:1; /* An interrupt alarm is scheduled. */
 	unsigned int isolated:1; /* Toggle isolated mode. */
+	unsigned int hw_csum:1; /* Checksum offload is supported. */
+	unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c
index 8c6b1fd..9b5ba31 100644
--- a/drivers/net/mlx4/mlx4_ethdev.c
+++ b/drivers/net/mlx4/mlx4_ethdev.c
@@ -551,7 +551,12 @@
 	info->max_tx_queues = max;
 	/* Last array entry is reserved for broadcast. */
 	info->max_mac_addrs = 1;
-	info->rx_offload_capa = 0;
+	info->rx_offload_capa =
+		(priv->hw_csum ?
+		 (DEV_RX_OFFLOAD_IPV4_CKSUM |
+		  DEV_RX_OFFLOAD_UDP_CKSUM |
+		  DEV_RX_OFFLOAD_TCP_CKSUM) :
+		 0);
 	info->tx_offload_capa = 0;
 	if (mlx4_get_ifname(priv, &ifname) == 0)
 		info->if_index = if_nametoindex(ifname);
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index 03c1192..a06781e 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -402,4 +402,13 @@ static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
 			    struct mlx4_qp, verbs_qp);
 }
 
+enum mlx4_device_cap_flags {
+	MLX4_START_FLAG_LOC = 0x20,
+	MLX4_START_FLAG	    = (1ULL << MLX4_START_FLAG_LOC),
+
+	MLX4_DEVICE_VXLAN_SUPPORT	= (MLX4_START_FLAG << 10),
+	MLX4_DEVICE_RX_CSUM_TCP_UDP_PKT = (MLX4_START_FLAG << 11),
+	MLX4_DEVICE_RX_CSUM_IP_PKT	= (MLX4_START_FLAG << 12),
+};
+
 #endif /* RTE_PMD_MLX4_PRM_H_ */
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index bbe9c89..c431033 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -274,6 +274,11 @@
 
 	(void)conf; /* Thresholds configuration (ignored). */
 	mb_len = rte_pktmbuf_data_room_size(mp);
+	/* Toggle RX checksum offload if hardware supports it. */
+	if (priv->hw_csum)
+		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+	if (priv->hw_csum_l2tun)
+		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
 	/* Enable scattered packets support for this queue if necessary. */
 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
 	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index f11c84c..f090354 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -348,6 +348,115 @@
 }
 
 /**
+ * Translate RX completion flags to packet type.
+ *
+ * @param flags
+ *   RX completion flags returned by poll_length_flags().
+ *
+ * @return
+ *   Packet type for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_pkt_type(uint32_t flags)
+{
+	uint32_t pkt_type;
+
+	if (flags & MLX4_CQE_L2_TUNNEL)
+		pkt_type =
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_L2_TUNNEL_IPV4,
+				       RTE_PTYPE_L3_IPV4_EXT_UNKNOWN) |
+			MLX4_TRANSPOSE(~flags,
+				       MLX4_CQE_L2_TUNNEL_IPV4,
+				       RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_STATUS_IPV4,
+				       RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) |
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_STATUS_IPV6,
+				       RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
+	else
+		pkt_type =
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_STATUS_IPV4,
+				       RTE_PTYPE_L3_IPV4_EXT_UNKNOWN) |
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_STATUS_IPV6,
+				       RTE_PTYPE_L3_IPV6_EXT_UNKNOWN);
+	return pkt_type;
+}
+
+/**
+ * Translate RX completion flags to offload flags.
+ *
+ * @param[in] rxq
+ *   Pointer to RX queue structure.
+ * @param flags
+ *   RX completion flags returned by poll_length_flags().
+ *
+ * @return
+ *   Offload flags (ol_flags) for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
+{
+	uint32_t ol_flags = 0;
+
+	if (rxq->csum)
+		ol_flags |=
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_STATUS_IPOK,
+				       PKT_RX_IP_CKSUM_GOOD) |
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_STATUS_L4_CSUM,
+				       PKT_RX_L4_CKSUM_GOOD);
+	if ((flags & MLX4_CQE_L2_TUNNEL) && (rxq->csum_l2tun))
+		ol_flags |=
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_L2_TUNNEL_IPOK,
+				       PKT_RX_IP_CKSUM_GOOD) |
+			MLX4_TRANSPOSE(flags,
+				       MLX4_CQE_L2_TUNNEL_L4_CSUM,
+				       PKT_RX_L4_CKSUM_GOOD);
+	return ol_flags;
+}
+
+/**
+ * Get cqe flags.
+ *
+ * @param cqe
+ *   Pointer to cqe structure.
+ * @param mlx4_qp
+ *   Pointer to QP structure.
+ *
+ * @return
+ *   CQE's flags.
+ */
+static inline uint32_t
+mlx4_cqe_flags(struct mlx4_cqe *cqe, struct ibv_qp *qp)
+{
+	uint32_t flags = 0;
+	struct mlx4_qp *mlx4_qp = to_mqp(qp);
+
+	/*
+	 * The relevant bits are in different locations within their
+	 * CQE fields, therefore they can be joined in a single 32-bit
+	 * variable.
+	 */
+	if (mlx4_qp->qp_cap_cache & MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP)
+		flags = (cqe->badfcs_enc & MLX4_CQE_STATUS_L4_CSUM) |
+			(ntohs(cqe->status) & (MLX4_CQE_STATUS_IPOK |
+					       MLX4_CQE_STATUS_IPV4 |
+					       MLX4_CQE_STATUS_IPV6));
+	if (mlx4_qp->qp_cap_cache & MLX4_RX_VXLAN)
+		flags |= ntohl(cqe->vlan_my_qpn) & (MLX4_CQE_L2_TUNNEL     |
+						MLX4_CQE_L2_TUNNEL_IPOK    |
+						MLX4_CQE_L2_TUNNEL_L4_CSUM |
+						MLX4_CQE_L2_TUNNEL_IPV4);
+	return flags;
+}
+
+/**
  * Get next cqe from HW.
  *
  * @param cq
@@ -485,8 +594,15 @@
 				goto skip;
 			}
 			pkt = seg;
-			pkt->packet_type = 0;
-			pkt->ol_flags = 0;
+			if (rxq->csum | rxq->csum_l2tun) {
+				uint32_t flags = mlx4_cqe_flags(cqe, rxq->qp);
+
+				pkt->packet_type = rxq_cq_to_pkt_type(flags);
+				pkt->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
+			} else {
+				pkt->packet_type = 0;
+				pkt->ol_flags = 0;
+			}
 			pkt->pkt_len = len;
 		}
 		rep->nb_segs = 1;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 077fdd8..2f03a94 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -87,6 +87,8 @@ struct rxq {
 		uint16_t rq_ci;
 	} hw;
 	unsigned int sge_n; /**< Log 2 of SGEs number. */
+	unsigned int csum:1; /**< Enable checksum offloading. */
+	unsigned int csum_l2tun:1; /**< Same for L2 tunnels. */
 	struct mlx4_rxq_stats stats; /**< Rx queue counters. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
 };
-- 
1.8.3.1

* Re: [dpdk-dev] [PATCH 1/2] net/mlx4: get back RX flow functionality
  2017-08-03  8:49 [dpdk-dev] [PATCH 1/2] net/mlx4: get back RX flow functionality Vasily Philipov
  2017-08-03  8:49 ` [dpdk-dev] [PATCH 2/2] net/mlx4: get back RX offloads Vasily Philipov
@ 2017-09-25 13:42 ` Ferruh Yigit
  1 sibling, 0 replies; 3+ messages in thread
From: Ferruh Yigit @ 2017-09-25 13:42 UTC (permalink / raw)
  To: Vasily Philipov, dev; +Cc: Adrien Mazarguil, Nelio Laranjeiro

On 8/3/2017 9:49 AM, Vasily Philipov wrote:
> Access the hardware directly on the RX fast path instead of going
> through Verbs calls.
> 
> The number of scatter entries is now calculated on the fly, according
> to the maximum expected packet size.
> 
> Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>

I will mark this series as superseded by [1]; please shout if this is wrong.

Thanks,
ferruh

[1]
http://dpdk.org/dev/patchwork/patch/27881/
Moti's mlx4 patchset with 5 patches.
