DPDK patches and discussions
* [dpdk-dev] [PATCH v2] net/mlx4: support hardware TSO
       [not found] <1530184583-30166-1-git-send-email-motih@mellanox.com>
@ 2018-06-28 11:57 ` Moti Haimovsky
  2018-06-28 12:48   ` [dpdk-dev] [PATCH v3] " Moti Haimovsky
  0 siblings, 1 reply; 14+ messages in thread
From: Moti Haimovsky @ 2018-06-28 11:57 UTC (permalink / raw)
  To: adrien.mazarguil, matan; +Cc: dev, Moti Haimovsky

Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---

 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/mlx4.c           |  16 ++
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  12 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 372 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 8 files changed, 415 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@ Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@ Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..61b7844 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -519,6 +519,8 @@ struct mlx4_conf {
 		.ports.present = 0,
 	};
 	unsigned int vf;
+	struct rte_mbuf mbuf;
+	uint64_t size_test = UINT_MAX;
 	int i;
 
 	(void)pci_drv;
@@ -677,6 +679,20 @@ struct mlx4_conf {
 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DEBUG("FCS stripping toggling is %ssupported",
 		      priv->hw_fcs_strip ? "" : "not ");
+		/*
+		 * No TSO SIZE is defined in DPDK, need to figure it out
+		 * in order to see if we can support it.
+		 */
+		mbuf.tso_segsz = size_test;
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso >= mbuf.tso_segsz) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
 		/* Configure the first MAC address by default. */
 		err = mlx4_get_mac(priv, &mac.addr_bytes);
 		if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..742d741 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/* Maximum Packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192  // TODO: find the real value
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@ struct priv {
 	uint32_t hw_csum:1; /**< Checksum offload is supported. */
 	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
 	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported */
+	uint32_t tso_max_payload_sz; /* Max TSO payload size being supported */
 	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
 	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index e15a3c1..915796b 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -40,6 +40,7 @@
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -97,6 +98,17 @@ struct mlx4_cq {
 	int arm_sn; /**< Rx event counter. */
 };
 
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg_ {
+	__be32 mss_hdr_size;
+	__be32 header[0];
+};
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index a92da66..992d193 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,25 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	volatile struct mlx4_wqe_data_seg *dseg;
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
 	uint32_t val;
 };
 
+/** A helper struct for TSO packet handling. */
+struct tso_info {
+	/* Total size of the WQE including padding */
+	uint32_t wqe_size;
+	/* size of TSO header to prepend to each packet to send */
+	uint16_t tso_header_sz;
+	/* Total size of the TSO entry in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/* Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
 	/*
@@ -377,6 +392,349 @@ struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 || tinfo->tso_header_sz == 0)) {
+		DEBUG("%p: Invalid TSO parameters", (void *)txq);
+		return -EINVAL;
+	}
+	/* First segment must contain all TSO headers. */
+	if (unlikely(tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER) ||
+		     tinfo->tso_header_sz > buf->data_len) {
+		DEBUG("%p: Invalid TSO header length", (void *)txq);
+		return -EINVAL;
+	}
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg_) +
+					    tinfo->tso_header_sz,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit .
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ * @param pv
+ *   Pointer to a stash area for saving the first 32bit word of each TXBB
+ *   used for the TSO WQE.
+ * @param pv_counter
+ *   Current location in the stash.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_fill_tso_segs(struct rte_mbuf *buf,
+			    struct txq *txq,
+			    const struct tso_info *tinfo,
+			    volatile struct mlx4_wqe_data_seg *dseg,
+			    struct pv *pv, int *pv_counter)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	uint16_t sb_of = tinfo->tso_header_sz;
+	uint16_t data_len;
+
+	while (nb_segs > 0) {
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					(volatile uint8_t *)dseg - sq->size;
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb =
+			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
+			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
+			sizeof(struct mlx4_wqe_data_seg);
+		switch (nb_segs_txbb) {
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			dseg->addr =
+			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
+								     uintptr_t,
+								     sb_of));
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			data_len = sbuf->data_len - sb_of;
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			--nb_segs;
+			break;
+		default:
+			/* Should never happen */
+			ERROR("%p: invalid number of txbb data segments %d",
+			      (void *)txq, nb_segs_txbb);
+			return -EINVAL;
+		}
+	}
+	return 0;
+lkey_err:
+	DEBUG("%p: unable to get MP <-> MR association",
+	      (void *)txq);
+	return -EFAULT;
+}
+
+/**
+ * Fill the packet's l2, l3 and l4 headers to the WQE.
+ *  This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param tseg
+ *   Pointer to the TSO header field in the TSO WQE.
+ * @param pv
+ *   Pointer to a stash area for saving the first 32bit word of each TXBB
+ *   used for the TSO WQE.
+ * @param pv_counter
+ *   Current location in the stash.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   const struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_lso_seg_ *tseg,
+			    struct pv *pv, int *pv_counter)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	int remain_sz = tinfo->tso_header_sz;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	int copy_sz;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg_));
+	copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
+	rte_memcpy(thdr.to, from, copy_sz);
+	remain_sz -= copy_sz;
+	while (remain_sz > 0) {
+		from += copy_sz;
+		thdr.to += copy_sz;
+		/* Start of TXBB need to check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
+		pv[(*pv_counter)++].val = *((uint32_t *)from);
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_sz -= sizeof(uint32_t);
+		if (remain_sz <= 0)
+			break;
+		/* Now copy the rest */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+		copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
+		rte_memcpy(thdr.to, from, copy_sz);
+		remain_sz -= copy_sz;
+	}
+	/* TODO: handle PID and IPID ? */
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_sz);
+	return 0;
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_lso_seg_ *tseg =
+		(volatile struct mlx4_wqe_lso_seg_ *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	int pv_counter = 0;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (ret)
+		goto error;
+	ret = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo,
+					 tseg, pv, &pv_counter);
+	if (ret)
+		goto error;
+	/* Calculate data segment location */
+	dseg = (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo.wqe_tso_seg_size);
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ret = mlx4_tx_burst_fill_tso_segs(buf, txq, &tinfo,
+					  dseg, pv, &pv_counter);
+	if (ret)
+		goto error;
+	/* Write the first DWORD of each TXBB save earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the first TXBB word. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			*pv[pv_counter].dst = pv[pv_counter].val;
+	}
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	/* Align next WQE address to the next TXBB. */
+	return (volatile struct mlx4_wqe_ctrl_seg *)
+		((volatile uint8_t *)ctrl + tinfo.wqe_size);
+error:
+	txq->stats.odropped++;
+	rte_errno = ret;
+	return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -569,6 +927,7 @@ struct pv {
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -587,7 +946,16 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@ struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }
 
-- 
1.8.3.1

* [dpdk-dev] [PATCH v3] net/mlx4: support hardware TSO
  2018-06-28 11:57 ` [dpdk-dev] [PATCH v2] net/mlx4: support hardware TSO Moti Haimovsky
@ 2018-06-28 12:48   ` Moti Haimovsky
  2018-06-28 14:15     ` Adrien Mazarguil
                       ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Moti Haimovsky @ 2018-06-28 12:48 UTC (permalink / raw)
  To: adrien.mazarguil, matan; +Cc: dev, Moti Haimovsky

Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v3:
* Fixed compilation errors in compilers without GNU C extensions
  caused by a declaration of zero-length array in the code.
in reply to
1530187032-6489-1-git-send-email-motih@mellanox.com

v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---
 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/mlx4.c           |  16 ++
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  12 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 372 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 8 files changed, 415 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@ Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@ Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..61b7844 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -519,6 +519,8 @@ struct mlx4_conf {
 		.ports.present = 0,
 	};
 	unsigned int vf;
+	struct rte_mbuf mbuf;
+	uint64_t size_test = UINT_MAX;
 	int i;
 
 	(void)pci_drv;
@@ -677,6 +679,20 @@ struct mlx4_conf {
 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DEBUG("FCS stripping toggling is %ssupported",
 		      priv->hw_fcs_strip ? "" : "not ");
+		/*
+		 * No TSO SIZE is defined in DPDK, need to figure it out
+		 * in order to see if we can support it.
+		 */
+		mbuf.tso_segsz = size_test;
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso >= mbuf.tso_segsz) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
 		/* Configure the first MAC address by default. */
 		err = mlx4_get_mac(priv, &mac.addr_bytes);
 		if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..742d741 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/* Maximum Packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192  // TODO: find the real value
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@ struct priv {
 	uint32_t hw_csum:1; /**< Checksum offload is supported. */
 	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
 	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported */
+	uint32_t tso_max_payload_sz; /* Max TSO payload size being supported */
 	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
 	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index e15a3c1..0484878 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -40,6 +40,7 @@
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -97,6 +98,17 @@ struct mlx4_cq {
 	int arm_sn; /**< Rx event counter. */
 };
 
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg_ {
+	__be32 mss_hdr_size;
+	__be32 header[];
+};
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index a92da66..992d193 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,25 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	volatile struct mlx4_wqe_data_seg *dseg;
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
 	uint32_t val;
 };
 
+/** A helper struct for TSO packet handling. */
+struct tso_info {
+	/* Total size of the WQE including padding */
+	uint32_t wqe_size;
+	/* size of TSO header to prepend to each packet to send */
+	uint16_t tso_header_sz;
+	/* Total size of the TSO entry in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/* Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
 	/*
@@ -377,6 +392,349 @@ struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 || tinfo->tso_header_sz == 0)) {
+		DEBUG("%p: Invalid TSO parameters", (void *)txq);
+		return -EINVAL;
+	}
+	/* First segment must contain all TSO headers. */
+	if (unlikely(tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER) ||
+		     tinfo->tso_header_sz > buf->data_len) {
+		DEBUG("%p: Invalid TSO header length", (void *)txq);
+		return -EINVAL;
+	}
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg_) +
+					    tinfo->tso_header_sz,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit .
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ * @param pv
+ *   Pointer to a stash area for saving the first 32bit word of each TXBB
+ *   used for the TSO WQE.
+ * @param pv_counter
+ *   Current location in the stash.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_fill_tso_segs(struct rte_mbuf *buf,
+			    struct txq *txq,
+			    const struct tso_info *tinfo,
+			    volatile struct mlx4_wqe_data_seg *dseg,
+			    struct pv *pv, int *pv_counter)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	uint16_t sb_of = tinfo->tso_header_sz;
+	uint16_t data_len;
+
+	while (nb_segs > 0) {
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					(volatile uint8_t *)dseg - sq->size;
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb =
+			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
+			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
+			sizeof(struct mlx4_wqe_data_seg);
+		switch (nb_segs_txbb) {
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			dseg->addr =
+			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
+								     uintptr_t,
+								     sb_of));
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			data_len = sbuf->data_len - sb_of;
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			--nb_segs;
+			break;
+		default:
+			/* Should never happen */
+			ERROR("%p: invalid number of txbb data segments %d",
+			      (void *)txq, nb_segs_txbb);
+			return -EINVAL;
+		}
+	}
+	return 0;
+lkey_err:
+	DEBUG("%p: unable to get MP <-> MR association",
+	      (void *)txq);
+	return -EFAULT;
+}
+
+/**
+ * Fill the packet's l2, l3 and l4 headers to the WQE.
+ *  This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param tseg
+ *   Pointer to the TSO header field in the TSO WQE.
+ * @param pv
+ *   Pointer to a stash area for saving the first 32bit word of each TXBB
+ *   used for the TSO WQE.
+ * @param pv_counter
+ *   Current location in the stash.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   const struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_lso_seg_ *tseg,
+			    struct pv *pv, int *pv_counter)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	int remain_sz = tinfo->tso_header_sz;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	int copy_sz;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg_));
+	copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
+	rte_memcpy(thdr.to, from, copy_sz);
+	remain_sz -= copy_sz;
+	while (remain_sz > 0) {
+		from += copy_sz;
+		thdr.to += copy_sz;
+		/* Start of TXBB need to check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
+		pv[(*pv_counter)++].val = *((uint32_t *)from);
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_sz -= sizeof(uint32_t);
+		if (remain_sz <= 0)
+			break;
+		/* Now copy the rest */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+		copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
+		rte_memcpy(thdr.to, from, copy_sz);
+		remain_sz -= copy_sz;
+	}
+	/* TODO: handle PID and IPID ? */
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_sz);
+	return 0;
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_lso_seg_ *tseg =
+		(volatile struct mlx4_wqe_lso_seg_ *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	int pv_counter = 0;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (ret)
+		goto error;
+	ret = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo,
+					 tseg, pv, &pv_counter);
+	if (ret)
+		goto error;
+	/* Calculate data segment location */
+	dseg = (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo.wqe_tso_seg_size);
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ret = mlx4_tx_burst_fill_tso_segs(buf, txq, &tinfo,
+					  dseg, pv, &pv_counter);
+	if (ret)
+		goto error;
+	/* Write the first DWORD of each TXBB save earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the first TXBB word. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			*pv[pv_counter].dst = pv[pv_counter].val;
+	}
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	/* Align next WQE address to the next TXBB. */
+	return (volatile struct mlx4_wqe_ctrl_seg *)
+		((volatile uint8_t *)ctrl + tinfo.wqe_size);
+error:
+	txq->stats.odropped++;
+	rte_errno = ret;
+	return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -569,6 +927,7 @@ struct pv {
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -587,7 +946,16 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@ struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }
 
-- 
1.8.3.1

* Re: [dpdk-dev] [PATCH v3] net/mlx4: support hardware TSO
  2018-06-28 12:48   ` [dpdk-dev] [PATCH v3] " Moti Haimovsky
@ 2018-06-28 14:15     ` Adrien Mazarguil
  2018-06-28 15:19     ` Matan Azrad
  2018-07-04 14:53     ` [dpdk-dev] [PATCH v4] " Moti Haimovsky
  2 siblings, 0 replies; 14+ messages in thread
From: Adrien Mazarguil @ 2018-06-28 14:15 UTC (permalink / raw)
  To: Moti Haimovsky; +Cc: matan, dev

Hi Moti,

On Thu, Jun 28, 2018 at 03:48:57PM +0300, Moti Haimovsky wrote:
> Implement support for hardware TSO.
> 
> Signed-off-by: Moti Haimovsky <motih@mellanox.com>

I only glanced at the code but overall the TSO logic appears to be sound;
assuming it went through non-regression testing I think it's OK. Please see
below for a bunch of cosmetic comments.

> ---
> v3:
> * Fixed compilation errors in compilers without GNU C extensions
>   caused by a declaration of zero-length array in the code.
> in reply to
> 1530187032-6489-1-git-send-email-motih@mellanox.com
> 
> v2:
> * Fixed coding style warning.
> in reply to
> 1530184583-30166-1-git-send-email-motih@mellanox.com
> 
> v1:
> * Fixed coding style warnings.
> in reply to
> 1530181779-19716-1-git-send-email-motih@mellanox.com
> ---
>  doc/guides/nics/features/mlx4.ini |   1 +
>  doc/guides/nics/mlx4.rst          |   3 +
>  drivers/net/mlx4/mlx4.c           |  16 ++
>  drivers/net/mlx4/mlx4.h           |   5 +
>  drivers/net/mlx4/mlx4_prm.h       |  12 ++
>  drivers/net/mlx4/mlx4_rxtx.c      | 372 +++++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
>  drivers/net/mlx4/mlx4_txq.c       |   8 +-
>  8 files changed, 415 insertions(+), 4 deletions(-)
> 
> diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
> index f6efd21..98a3f61 100644
> --- a/doc/guides/nics/features/mlx4.ini
> +++ b/doc/guides/nics/features/mlx4.ini
> @@ -13,6 +13,7 @@ Queue start/stop     = Y
>  MTU update           = Y
>  Jumbo frame          = Y
>  Scattered Rx         = Y
> +TSO                  = Y
>  Promiscuous mode     = Y
>  Allmulticast mode    = Y
>  Unicast MAC filter   = Y
> diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
> index 491106a..12adaeb 100644
> --- a/doc/guides/nics/mlx4.rst
> +++ b/doc/guides/nics/mlx4.rst
> @@ -142,6 +142,9 @@ Limitations
>    The ability to enable/disable CRC stripping requires OFED version
>    4.3-1.5.0.0 and above  or rdma-core version v18 and above.
>  
> +- TSO (Transmit Segmentation Offload) is supported in OFED version
> +  4.4 and above or in rdma-core version v18 and above.
> +
>  Prerequisites
>  -------------
>  
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> index d151a90..61b7844 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -519,6 +519,8 @@ struct mlx4_conf {
>  		.ports.present = 0,
>  	};
>  	unsigned int vf;
> +	struct rte_mbuf mbuf;
> +	uint64_t size_test = UINT_MAX;

This requires #include <limits.h>

>  	int i;
>  
>  	(void)pci_drv;
> @@ -677,6 +679,20 @@ struct mlx4_conf {
>  					IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DEBUG("FCS stripping toggling is %ssupported",
>  		      priv->hw_fcs_strip ? "" : "not ");
> +		/*
> +		 * No TSO SIZE is defined in DPDK, need to figure it out
> +		 * in order to see if we can support it.
> +		 */
> +		mbuf.tso_segsz = size_test;

I understand that you expect UINT_MAX to be truncated to the size of the
underlying type, but this looks convoluted.

Keep it simple, if both PMD and HW support TSO, just enable the
capability. Ideally the maximum size should be provided by the application
during dev_configure. If the API lacks such information, then this will be
checked during TX possibly causing tx_burst() to bail out early.
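
Roughly what I have in mind — only a sketch, untested:

	priv->tso =
		(device_attr_ex.tso_caps.max_tso > 0 &&
		 (device_attr_ex.tso_caps.supported_qpts &
		  (1 << IBV_QPT_RAW_PACKET)));
	if (priv->tso)
		priv->tso_max_payload_sz = device_attr_ex.tso_caps.max_tso;

No dummy mbuf and no UINT_MAX; each packet's tso_segsz can then be checked
against tso_max_payload_sz in the Tx path.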

> +		priv->tso =
> +			((device_attr_ex.tso_caps.max_tso >= mbuf.tso_segsz) &&
> +			 (device_attr_ex.tso_caps.supported_qpts &
> +			  (1 << IBV_QPT_RAW_PACKET)));
> +		if (priv->tso)
> +			priv->tso_max_payload_sz =
> +					device_attr_ex.tso_caps.max_tso;
> +		DEBUG("TSO is %ssupported",
> +		      priv->tso ? "" : "not ");
>  		/* Configure the first MAC address by default. */
>  		err = mlx4_get_mac(priv, &mac.addr_bytes);
>  		if (err) {
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
> index 300cb4d..742d741 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -47,6 +47,9 @@
>  /** Interrupt alarm timeout value in microseconds. */
>  #define MLX4_INTR_ALARM_TIMEOUT 100000
>  
> +/* Maximum Packet headers size (L2+L3+L4) for TSO. */

Packet => packet

> +#define MLX4_MAX_TSO_HEADER 192  // TODO: find the real value
> +

No "//" comments. Can this TODO be fixed before applying this patch?

>  /** Port parameter. */
>  #define MLX4_PMD_PORT_KVARG "port"
>  
> @@ -90,6 +93,8 @@ struct priv {
>  	uint32_t hw_csum:1; /**< Checksum offload is supported. */
>  	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
>  	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
> +	uint32_t tso:1; /**< Transmit segmentation offload is supported */
> +	uint32_t tso_max_payload_sz; /* Max TSO payload size being supported */

Please use Doxygen format ("/**<").

>  	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
>  	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
>  	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
> diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> index e15a3c1..0484878 100644
> --- a/drivers/net/mlx4/mlx4_prm.h
> +++ b/drivers/net/mlx4/mlx4_prm.h
> @@ -40,6 +40,7 @@
>  /* Work queue element (WQE) flags. */
>  #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
>  #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> +#define MLX4_WQE_CTRL_RR (1 << 6)
>  
>  /* CQE checksum flags. */
>  enum {
> @@ -97,6 +98,17 @@ struct mlx4_cq {
>  	int arm_sn; /**< Rx event counter. */
>  };
>  
> +/*
> + * WQE LSO segment structure.
> + * Defined here as backward compatibility for rdma-core v17 and below.
> + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
> + * and above.
> + */
> +struct mlx4_wqe_lso_seg_ {

Is the purpose of the trailing underscore to avoid a conflict with v18+?

In that case, you should define this structure under #ifdef HAVE_SOMETHING
generated by auto-config-h.sh. See drivers/net/mlx5/Makefile.
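
Something along these lines could work — the HAVE_* symbol name is only an
example and the exact probe needs to be double-checked against
infiniband/mlx4dv.h — by extending the mlx4_autoconf.h.new rule in
drivers/net/mlx4/Makefile (or adding one, mirroring mlx5's):

	$Q sh -- '$<' '$@' \
		HAVE_IBV_MLX4_WQE_LSO_SEG \
		infiniband/mlx4dv.h \
		type 'struct mlx4_wqe_lso_seg' \
		$(AUTOCONF_OUTPUT)

and guarding the fallback definition in mlx4_prm.h (with rte_byteorder.h
included):

	#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
	struct mlx4_wqe_lso_seg {
		rte_be32_t mss_hdr_size;
		rte_be32_t header[];
	};
	#endif

so the rdma-core v18+ definition is picked up when available and the local
one (without the trailing underscore) is used only otherwise.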

> +	__be32 mss_hdr_size;
> +	__be32 header[];
> +};

Replace __be32 with DPDK types, that is, rte_be32_t.

> +
>  /**
>   * Retrieve a CQE entry from a CQ.
>   *
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index a92da66..992d193 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -38,10 +38,25 @@
>   * DWORD (32 byte) of a TXBB.
>   */
>  struct pv {
> -	volatile struct mlx4_wqe_data_seg *dseg;
> +	union {
> +		volatile struct mlx4_wqe_data_seg *dseg;
> +		volatile uint32_t *dst;
> +	};
>  	uint32_t val;
>  };
>  
> +/** A helper struct for TSO packet handling. */

Since you chose Doxygen format, the description of each field below must
also start with "/**" when it precedes the field, or "/**<" otherwise. Also
make sure sentences are properly capitalized and end with a period to keep
the generated documentation neat.
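
For instance, only to illustrate the expected comment style on the struct
below:

	/** A helper structure for TSO packet handling. */
	struct tso_info {
		/** Total size of the WQE including padding. */
		uint32_t wqe_size;
		/** Size of TSO header to prepend to each packet to send. */
		uint16_t tso_header_sz;
		/** Total size of the TSO entry in the WQE. */
		uint16_t wqe_tso_seg_size;
		/** Raw WQE size in units of 16 bytes, without padding. */
		uint8_t fence_size;
	};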

> +struct tso_info {
> +	/* Total size of the WQE including padding */
> +	uint32_t wqe_size;
> +	/* size of TSO header to prepend to each packet to send */
> +	uint16_t tso_header_sz;
> +	/* Total size of the TSO entry in the WQE. */
> +	uint16_t wqe_tso_seg_size;
> +	/* Raw WQE size in units of 16 Bytes and without padding. */
> +	uint8_t fence_size;
> +};
> +
>  /** A table to translate Rx completion flags to packet type. */
>  uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
>  	/*
> @@ -377,6 +392,349 @@ struct pv {
>  }
>  
>  /**
> + * Obtain and calculate TSO information needed for assembling a TSO WQE.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to a structure to fill the info with.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo)
> +{
> +	struct mlx4_sq *sq = &txq->msq;
> +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> +
> +	tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
> +	if (tunneled)
> +		tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
> +	if (unlikely(buf->tso_segsz == 0 || tinfo->tso_header_sz == 0)) {
> +		DEBUG("%p: Invalid TSO parameters", (void *)txq);

Please keep the data plane free of any error messages and other logs. Think
about millions of such messages occurring each second, not all that useful.

> +		return -EINVAL;
> +	}
> +	/* First segment must contain all TSO headers. */
> +	if (unlikely(tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER) ||
> +		     tinfo->tso_header_sz > buf->data_len) {
> +		DEBUG("%p: Invalid TSO header length", (void *)txq);

Ditto.

> +		return -EINVAL;
> +	}
> +	/*
> +	 * Calculate the WQE TSO segment size
> +	 * Note:
> +	 * 1. An LSO segment must be padded such that the subsequent data
> +	 *    segment is 16-byte aligned.
> +	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
> +	 */
> +	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg_) +
> +					    tinfo->tso_header_sz,
> +					    sizeof(struct mlx4_wqe_data_seg));
> +	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> +			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> +			     buf->nb_segs;
> +	tinfo->wqe_size =
> +		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
> +			  MLX4_TXBB_SIZE);
> +	/* Validate WQE size and WQE space in the send queue. */
> +	if (sq->remain_size < tinfo->wqe_size ||
> +	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> +		return -ENOMEM;
> +	return 0;
> +}
> +
> +/**
> + * Fill the TSO WQE data segments with info on buffers to transmit .
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param dseg
> + *   Pointer to the first data segment in the TSO WQE.
> + * @param pv
> + *   Pointer to a stash area for saving the first 32bit word of each TXBB
> + *   used for the TSO WQE.
> + * @param pv_counter
> + *   Current location in the stash.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_fill_tso_segs(struct rte_mbuf *buf,
> +			    struct txq *txq,
> +			    const struct tso_info *tinfo,
> +			    volatile struct mlx4_wqe_data_seg *dseg,
> +			    struct pv *pv, int *pv_counter)
> +{
> +	uint32_t lkey;
> +	int nb_segs = buf->nb_segs;
> +	int nb_segs_txbb;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct rte_mbuf *sbuf = buf;
> +	uint16_t sb_of = tinfo->tso_header_sz;
> +	uint16_t data_len;
> +
> +	while (nb_segs > 0) {
> +		/* Wrap dseg if it points at the end of the queue. */
> +		if ((volatile uint8_t *)dseg >= sq->eob)
> +			dseg = (volatile struct mlx4_wqe_data_seg *)
> +					(volatile uint8_t *)dseg - sq->size;
> +		/* how many dseg entries do we have in the current TXBB ? */
> +		nb_segs_txbb =
> +			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
> +			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
> +			sizeof(struct mlx4_wqe_data_seg);
> +		switch (nb_segs_txbb) {
> +		case 4:
> +			/* Memory region key for this memory pool. */
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			dseg->addr =
> +			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> +								     uintptr_t,
> +								     sb_of));
> +			dseg->lkey = lkey;
> +			/*
> +			 * This data segment starts at the beginning of a new
> +			 * TXBB, so we need to postpone its byte_count writing
> +			 * for later.
> +			 */
> +			pv[*pv_counter].dseg = dseg;
> +			/*
> +			 * Zero length segment is treated as inline segment
> +			 * with zero data.
> +			 */
> +			data_len = sbuf->data_len - sb_of;
> +			pv[(*pv_counter)++].val =
> +				rte_cpu_to_be_32(data_len ?
> +						 data_len :
> +						 0x80000000);
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 3:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 2:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 1:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			--nb_segs;
> +			break;
> +		default:
> +			/* Should never happen */

Then assert() is in order if it can only happen due to a programming
mistake. Crashing the application early is the best approach.

> +			ERROR("%p: invalid number of txbb data segments %d",
> +			      (void *)txq, nb_segs_txbb);

Please remove this error message.

> +			return -EINVAL;

You could replace this by rte_panic(), present when compiled in DEBUG mode.
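
A sketch of the assert() variant that keeps the error return for non-debug
builds (with <assert.h> included):

	default:
		/* Can only be reached through a programming error. */
		assert(!"invalid number of TXBB data segments");
		return -EINVAL;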

> +		}
> +	}
> +	return 0;
> +lkey_err:
> +	DEBUG("%p: unable to get MP <-> MR association",
> +	      (void *)txq);

Ditto re log messages in the data plane (please check all remaining
occurrences).

> +	return -EFAULT;
> +}
> +
> +/**
> + * Fill the packet's l2, l3 and l4 headers to the WQE.
> + *  This will be used as the header for each TSO segment that is transmitted.

Extra space; I also suggest adding an empty line between them.

> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param tseg
> + *   Pointer to the TSO header field in the TSO WQE.
> + * @param pv
> + *   Pointer to a stash area for saving the first 32bit word of each TXBB
> + *   used for the TSO WQE.
> + * @param pv_counter
> + *   Current location in the stash.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> +			   struct txq *txq,
> +			   const struct tso_info *tinfo,
> +			   volatile struct mlx4_wqe_lso_seg_ *tseg,
> +			    struct pv *pv, int *pv_counter)
> +{
> +	struct mlx4_sq *sq = &txq->msq;
> +	int remain_sz = tinfo->tso_header_sz;
> +	char *from = rte_pktmbuf_mtod(buf, char *);
> +	uint16_t txbb_avail_space;
> +	int copy_sz;
> +	/* Union to overcome volatile constraints when copying TSO header. */
> +	union {
> +		volatile uint8_t *vto;
> +		uint8_t *to;
> +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> +
> +	/*
> +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> +	 * we can write the first 44 TSO header bytes without worry for TxQ
> +	 * wrapping or overwriting the first TXBB 32bit word.
> +	 */
> +	txbb_avail_space = MLX4_TXBB_SIZE -
> +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> +			    sizeof(struct mlx4_wqe_lso_seg_));
> +	copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
> +	rte_memcpy(thdr.to, from, copy_sz);
> +	remain_sz -= copy_sz;
> +	while (remain_sz > 0) {
> +		from += copy_sz;
> +		thdr.to += copy_sz;
> +		/* Start of TXBB need to check for TxQ wrap. */
> +		if (thdr.to >= sq->eob)
> +			thdr.vto = sq->buf;
> +		/* New TXBB, stash the first 32bits for later use. */
> +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
> +		pv[(*pv_counter)++].val = *((uint32_t *)from);
> +		from += sizeof(uint32_t);
> +		thdr.to += sizeof(uint32_t);
> +		remain_sz -= sizeof(uint32_t);
> +		if (remain_sz <= 0)
> +			break;
> +		/* Now copy the rest */
> +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> +		copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
> +		rte_memcpy(thdr.to, from, copy_sz);
> +		remain_sz -= copy_sz;
> +	}
> +	/* TODO: handle PID and IPID ? */

Who's supposed to answer this and when? Please clear TODOs or at least write
down a full description of the work that needs to be done, with any
suggestions you might have.

> +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> +					      tinfo->tso_header_sz);
> +	return 0;
> +}
> +
> +/**
> + * Write data segments and header for TSO uni/multi segment packet.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param ctrl
> + *   Pointer to the WQE control segment.
> + *
> + * @return
> + *   Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
> +		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> +	volatile struct mlx4_wqe_data_seg *dseg;
> +	volatile struct mlx4_wqe_lso_seg_ *tseg =
> +		(volatile struct mlx4_wqe_lso_seg_ *)(ctrl + 1);
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct tso_info tinfo;
> +	struct pv *pv = (struct pv *)txq->bounce_buf;
> +	int pv_counter = 0;
> +	int ret;
> +
> +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> +	if (ret)
> +		goto error;
> +	ret = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo,
> +					 tseg, pv, &pv_counter);
> +	if (ret)
> +		goto error;
> +	/* Calculate data segment location */
> +	dseg = (volatile struct mlx4_wqe_data_seg *)
> +				((uintptr_t)tseg + tinfo.wqe_tso_seg_size);
> +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> +		dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((uintptr_t)dseg - sq->size);
> +	ret = mlx4_tx_burst_fill_tso_segs(buf, txq, &tinfo,
> +					  dseg, pv, &pv_counter);
> +	if (ret)
> +		goto error;
> +	/* Write the first DWORD of each TXBB save earlier. */
> +	if (pv_counter) {
> +		/* Need a barrier here before writing the first TXBB word. */
> +		rte_io_wmb();
> +		for (--pv_counter; pv_counter  >= 0; pv_counter--)
> +			*pv[pv_counter].dst = pv[pv_counter].val;
> +	}
> +	ctrl->fence_size = tinfo.fence_size;
> +	sq->remain_size -= tinfo.wqe_size;
> +	/* Align next WQE address to the next TXBB. */
> +	return (volatile struct mlx4_wqe_ctrl_seg *)
> +		((volatile uint8_t *)ctrl + tinfo.wqe_size);
> +error:
> +	txq->stats.odropped++;
> +	rte_errno = ret;
> +	return NULL;
> +}
> +
> +/**
>   * Write data segments of multi-segment packet.
>   *
>   * @param buf
> @@ -569,6 +927,7 @@ struct pv {
>  			uint16_t flags16[2];
>  		} srcrb;
>  		uint32_t lkey;
> +		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
>  
>  		/* Clean up old buffer. */
>  		if (likely(elt->buf != NULL)) {
> @@ -587,7 +946,16 @@ struct pv {
>  			} while (tmp != NULL);
>  		}
>  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -		if (buf->nb_segs == 1) {
> +		if (tso) {
> +			/* Change opcode to TSO */
> +			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> +			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
> +			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> +			if (!ctrl_next) {
> +				elt->buf = NULL;
> +				break;
> +			}
> +		} else if (buf->nb_segs == 1) {
>  			/* Validate WQE space in the send queue. */
>  			if (sq->remain_size < MLX4_TXBB_SIZE) {
>  				elt->buf = NULL;
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 4c025e3..ffa8abf 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
>  	unsigned int idx; /**< Mapping index. */
>  	uint64_t opackets; /**< Total of successfully sent packets. */
>  	uint64_t obytes; /**< Total of successfully sent bytes. */
> -	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
> +	uint64_t odropped; /**< Total number of packets failed to transmit. */
>  };
>  
>  /** Tx queue descriptor. */
> diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> index 6edaadb..9aa7440 100644
> --- a/drivers/net/mlx4/mlx4_txq.c
> +++ b/drivers/net/mlx4/mlx4_txq.c
> @@ -116,8 +116,14 @@
>  			     DEV_TX_OFFLOAD_UDP_CKSUM |
>  			     DEV_TX_OFFLOAD_TCP_CKSUM);
>  	}
> -	if (priv->hw_csum_l2tun)
> +	if (priv->tso)
> +		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> +	if (priv->hw_csum_l2tun) {
>  		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +		if (priv->tso)
> +			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> +				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> +	}
>  	return offloads;
>  }
>  
> -- 
> 1.8.3.1
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v3] net/mlx4: support hardware TSO
  2018-06-28 12:48   ` [dpdk-dev] [PATCH v3] " Moti Haimovsky
  2018-06-28 14:15     ` Adrien Mazarguil
@ 2018-06-28 15:19     ` Matan Azrad
  2018-07-04 14:53     ` [dpdk-dev] [PATCH v4] " Moti Haimovsky
  2 siblings, 0 replies; 14+ messages in thread
From: Matan Azrad @ 2018-06-28 15:19 UTC (permalink / raw)
  To: Mordechay Haimovsky, Adrien Mazarguil; +Cc: dev

Hi Moti

I started to review it but have not finished it all yet :)
Please see some comments/questions below.
I will continue the review on the next version, after the comments below and Adrien's comments are addressed.

From: Mordechay Haimovsky
> +		 * No TSO SIZE is defined in DPDK, need to figure it out
> +		 * in order to see if we can support it.
> +		 */
> +		mbuf.tso_segsz = size_test;
> +		priv->tso =
> +			((device_attr_ex.tso_caps.max_tso >= mbuf.tso_segsz) &&
> +			 (device_attr_ex.tso_caps.supported_qpts &
> +			  (1 << IBV_QPT_RAW_PACKET)));
> +		if (priv->tso)
> +			priv->tso_max_payload_sz =
> +					device_attr_ex.tso_caps.max_tso;

Do all the tso_caps fields exist in old rdma-core versions?

> +		DEBUG("TSO is %ssupported",
> +		      priv->tso ? "" : "not ");
>  		/* Configure the first MAC address by default. */
>  		err = mlx4_get_mac(priv, &mac.addr_bytes);
>  		if (err) {
...
> +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo)
> +{
> +	struct mlx4_sq *sq = &txq->msq;
> +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> +
> +	tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
> +	if (tunneled)
> +		tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;

Do those tunnel sizes include the outer and inner VLAN sizes?

> +	if (unlikely(buf->tso_segsz == 0 || tinfo->tso_header_sz == 0)) {
> +		DEBUG("%p: Invalid TSO parameters", (void *)txq);
> +		return -EINVAL;
> +	}
> +	/* First segment must contain all TSO headers. */
> +	if (unlikely(tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER) ||
> +		     tinfo->tso_header_sz > buf->data_len) {
> +		DEBUG("%p: Invalid TSO header length", (void *)txq);
> +		return -EINVAL;
> +	}
> +	/*
> +	 * Calculate the WQE TSO segment size
> +	 * Note:
> +	 * 1. An LSO segment must be padded such that the subsequent data
> +	 *    segment is 16-byte aligned.
> +	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
> +	 */
...
> +static inline int
> +mlx4_tx_burst_fill_tso_segs(struct rte_mbuf *buf,
> +			    struct txq *txq,
> +			    const struct tso_info *tinfo,
> +			    volatile struct mlx4_wqe_data_seg *dseg,
> +			    struct pv *pv, int *pv_counter)
> +{
> +	uint32_t lkey;
> +	int nb_segs = buf->nb_segs;
> +	int nb_segs_txbb;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct rte_mbuf *sbuf = buf;
> +	uint16_t sb_of = tinfo->tso_header_sz;
> +	uint16_t data_len;
> +
> +	while (nb_segs > 0) {
> +		/* Wrap dseg if it points at the end of the queue. */
> +		if ((volatile uint8_t *)dseg >= sq->eob)
> +			dseg = (volatile struct mlx4_wqe_data_seg *)
> +					(volatile uint8_t *)dseg - sq->size;

I don't think we need this check on the first iteration, so moving it to the end of the loop may be better.
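
Something like this (a rough sketch only, reusing the existing dseg/sq locals of the loop):

	while (nb_segs > 0) {
		/* ... fill up to nb_segs_txbb data segments here ... */
		/* Wrap dseg only after it may have stepped past the queue end. */
		if ((volatile uint8_t *)dseg >= sq->eob)
			dseg = (volatile struct mlx4_wqe_data_seg *)
					((volatile uint8_t *)dseg - sq->size);
	}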

> +		/* how many dseg entries do we have in the current TXBB ? */
> +		nb_segs_txbb =
> +			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
> +			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
> +			sizeof(struct mlx4_wqe_data_seg);
> +		switch (nb_segs_txbb) {

I think this switch statement is broken: what happens if we have only 1/2/3 segments but we are at the start of a TXBB?
You are going to write the first byte of the TXBB first, no?

> +		case 4:
> +			/* Memory region key for this memory pool. */
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			dseg->addr =
> +			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> +								     uintptr_t,
> +								     sb_of));
> +			dseg->lkey = lkey;
> +			/*
> +			 * This data segment starts at the beginning of a new
> +			 * TXBB, so we need to postpone its byte_count writing
> +			 * for later.
> +			 */
> +			pv[*pv_counter].dseg = dseg;
> +			/*
> +			 * Zero length segment is treated as inline segment
> +			 * with zero data.
> +			 */
> +			data_len = sbuf->data_len - sb_of;
> +			pv[(*pv_counter)++].val =
> +				rte_cpu_to_be_32(data_len ?
> +						 data_len :
> +						 0x80000000);
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 3:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 2:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 1:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			--nb_segs;
> +			break;
> +		default:
> +			/* Should never happen */
> +			ERROR("%p: invalid number of txbb data segments %d",
> +			      (void *)txq, nb_segs_txbb);
> +			return -EINVAL;
> +		}
> +	}
> +	return 0;
> +lkey_err:
> +	DEBUG("%p: unable to get MP <-> MR association",
> +	      (void *)txq);
> +	return -EFAULT;
> +}

Matan

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [dpdk-dev] [PATCH v4] net/mlx4: support hardware TSO
  2018-06-28 12:48   ` [dpdk-dev] [PATCH v3] " Moti Haimovsky
  2018-06-28 14:15     ` Adrien Mazarguil
  2018-06-28 15:19     ` Matan Azrad
@ 2018-07-04 14:53     ` Moti Haimovsky
  2018-07-05 12:30       ` Matan Azrad
  2018-07-09 10:43       ` [dpdk-dev] [PATCH v5] " Moti Haimovsky
  2 siblings, 2 replies; 14+ messages in thread
From: Moti Haimovsky @ 2018-07-04 14:53 UTC (permalink / raw)
  To: adrien.mazarguil, matan; +Cc: dev, Moti Haimovsky

Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v4:
* Bug fixes in filling TSO data segments.
* Modifications according to review inputs from Adrien Mazarguil
  and Matan Azrad.
in reply to
1530190137-17848-1-git-send-email-motih@mellanox.com

v3:
* Fixed compilation errors in compilers without GNU C extensions
  caused by a declaration of zero-length array in the code.
in reply to
1530187032-6489-1-git-send-email-motih@mellanox.com

v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---
 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/Makefile         |   5 +
 drivers/net/mlx4/mlx4.c           |   9 +
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  15 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 362 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 9 files changed, 406 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@ Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@ Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 73f9d40..63bc003 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
 mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
 	$Q : > '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_WQE_LSO_SEG \
+		infiniband/mlx4dv.h \
+		type 'struct mlx4_wqe_lso_seg' \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx4_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..5d8c76d 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -677,6 +677,15 @@ struct mlx4_conf {
 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DEBUG("FCS stripping toggling is %ssupported",
 		      priv->hw_fcs_strip ? "" : "not ");
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso > 0) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
 		/* Configure the first MAC address by default. */
 		err = mlx4_get_mac(priv, &mac.addr_bytes);
 		if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..89d8c38 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/* Maximum packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@ struct priv {
 	uint32_t hw_csum:1; /**< Checksum offload is supported. */
 	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
 	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
+	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
 	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
 	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index b771d8c..aef77ba 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -19,6 +19,7 @@
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
+#include "mlx4_autoconf.h"
 
 /* ConnectX-3 Tx queue basic block. */
 #define MLX4_TXBB_SHIFT 6
@@ -40,6 +41,7 @@
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -98,6 +100,19 @@ struct mlx4_cq {
 	int arm_sn; /**< Rx event counter. */
 };
 
+#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg {
+	rte_be32_t mss_hdr_size;
+	rte_be32_t header[];
+};
+#endif
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 78b6dd5..750ad6d 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,29 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	volatile struct mlx4_wqe_data_seg *dseg;
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
 	uint32_t val;
 };
 
+/** A helper structure for TSO packet handling. */
+struct tso_info {
+	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
+	struct pv *pv;
+	/** Current entry in the pv array. */
+	int pv_counter;
+	/** Total size of the WQE including padding. */
+	uint32_t wqe_size;
+	/** size of TSO header to prepend to each packet to send. */
+	uint16_t tso_header_sz;
+	/** Total size of the TSO segment in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/** Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
 	/*
@@ -368,6 +387,335 @@ struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 ||
+		     tinfo->tso_header_sz == 0 ||
+		     tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER ||
+		     tinfo->tso_header_sz > buf->data_len))
+		return -EINVAL;
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+					    tinfo->tso_header_sz,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	/* Init pv. */
+	tinfo->pv = (struct pv *)txq->bounce_buf;
+	tinfo->pv_counter = 0;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo,
+			     volatile struct mlx4_wqe_data_seg *dseg,
+			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	uint16_t sb_of = tinfo->tso_header_sz;
+	uint16_t data_len;
+
+	while (nb_segs > 0) {
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb =
+			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
+			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
+			sizeof(struct mlx4_wqe_data_seg);
+		switch (nb_segs_txbb) {
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			dseg->addr =
+			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
+								     uintptr_t,
+								     sb_of));
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			data_len = sbuf->data_len - sb_of;
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			--nb_segs;
+			break;
+		default:
+			/* Should never happen */
+			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
+				  (void *)txq, nb_segs_txbb);
+		}
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((volatile uint8_t *)dseg - sq->size);
+	}
+	/* Align next WQE address to the next TXBB. */
+	return (volatile struct mlx4_wqe_ctrl_seg *)
+		((volatile uint8_t *)ctrl + tinfo->wqe_size);
+lkey_err:
+	return NULL;
+}
+
+/**
+ * Fill the packet's l2, l3 and l4 headers to the WQE.
+ *
+ * This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_lso_seg *tseg =
+		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	int remain_sz = tinfo->tso_header_sz;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	int copy_sz;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg));
+	do {
+		copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
+		rte_memcpy(thdr.to, from, copy_sz);
+		remain_sz -= copy_sz;
+		if (remain_sz <= 0)
+			break;
+		from += copy_sz;
+		thdr.to += copy_sz;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
+		rte_memcpy(&pv[*pv_counter].val, from,
+			   RTE_MIN((size_t)remain_sz, sizeof(uint32_t)));
+		(*pv_counter)++;
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_sz -= sizeof(uint32_t);
+		/* Space in current TXBB is TXBB size - 4 */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+	} while (remain_sz > 0);
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_sz);
+	/* Calculate data segment location */
+	return (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv;
+	int pv_counter;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (unlikely(ret))
+		goto error;
+	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+	if (unlikely(dseg == NULL))
+		goto error;
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+	if (unlikely(ctrl_next == NULL))
+		goto error;
+	/* Write the first DWORD of each TXBB saved earlier. */
+	if (tinfo.pv_counter) {
+		pv = tinfo.pv;
+		pv_counter = tinfo.pv_counter;
+		/* Need a barrier here before writing the first TXBB word. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			*pv[pv_counter].dst = pv[pv_counter].val;
+	}
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	return ctrl_next;
+error:
+	txq->stats.odropped++;
+	return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -560,6 +908,7 @@ struct pv {
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -578,7 +927,16 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@ struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }
 
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v4] net/mlx4: support hardware TSO
  2018-07-04 14:53     ` [dpdk-dev] [PATCH v4] " Moti Haimovsky
@ 2018-07-05 12:30       ` Matan Azrad
  2018-07-09 10:43       ` [dpdk-dev] [PATCH v5] " Moti Haimovsky
  1 sibling, 0 replies; 14+ messages in thread
From: Matan Azrad @ 2018-07-05 12:30 UTC (permalink / raw)
  To: Mordechay Haimovsky, Adrien Mazarguil; +Cc: dev

Hi Moti

Please see inline.

From: Mordechay Haimovsky
> Implement support for hardware TSO.
> 
> Signed-off-by: Moti Haimovsky <motih@mellanox.com>
> ---
> v4:
> * Bug fixes in filling TSO data segments.
> * Modifications according to review inputs from Adrien Mazarguil
>   and Matan Azrad.
> in reply to
> 1530190137-17848-1-git-send-email-motih@mellanox.com
> 
> v3:
> * Fixed compilation errors in compilers without GNU C extensions
>   caused by a declaration of zero-length array in the code.
> in reply to
> 1530187032-6489-1-git-send-email-motih@mellanox.com
> 
> v2:
> * Fixed coding style warning.
> in reply to
> 1530184583-30166-1-git-send-email-motih@mellanox.com
> 
> v1:
> * Fixed coding style warnings.
> in reply to
> 1530181779-19716-1-git-send-email-motih@mellanox.com
> ---
>  doc/guides/nics/features/mlx4.ini |   1 +
>  doc/guides/nics/mlx4.rst          |   3 +
>  drivers/net/mlx4/Makefile         |   5 +
>  drivers/net/mlx4/mlx4.c           |   9 +
>  drivers/net/mlx4/mlx4.h           |   5 +
>  drivers/net/mlx4/mlx4_prm.h       |  15 ++
>  drivers/net/mlx4/mlx4_rxtx.c      | 362
> +++++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
>  drivers/net/mlx4/mlx4_txq.c       |   8 +-
>  9 files changed, 406 insertions(+), 4 deletions(-)
> 
> diff --git a/doc/guides/nics/features/mlx4.ini
> b/doc/guides/nics/features/mlx4.ini
> index f6efd21..98a3f61 100644
> --- a/doc/guides/nics/features/mlx4.ini
> +++ b/doc/guides/nics/features/mlx4.ini
> @@ -13,6 +13,7 @@ Queue start/stop     = Y
>  MTU update           = Y
>  Jumbo frame          = Y
>  Scattered Rx         = Y
> +TSO                  = Y
>  Promiscuous mode     = Y
>  Allmulticast mode    = Y
>  Unicast MAC filter   = Y
> diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst index
> 491106a..12adaeb 100644
> --- a/doc/guides/nics/mlx4.rst
> +++ b/doc/guides/nics/mlx4.rst
> @@ -142,6 +142,9 @@ Limitations
>    The ability to enable/disable CRC stripping requires OFED version
>    4.3-1.5.0.0 and above  or rdma-core version v18 and above.
> 
> +- TSO (Transmit Segmentation Offload) is supported in OFED version
> +  4.4 and above or in rdma-core version v18 and above.
> +
>  Prerequisites
>  -------------
> 
> diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile index
> 73f9d40..63bc003 100644
> --- a/drivers/net/mlx4/Makefile
> +++ b/drivers/net/mlx4/Makefile
> @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
>  mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
>  	$Q $(RM) -f -- '$@'
>  	$Q : > '$@'
> +	$Q sh -- '$<' '$@' \
> +		HAVE_IBV_MLX4_WQE_LSO_SEG \
> +		infiniband/mlx4dv.h \
> +		type 'struct mlx4_wqe_lso_seg' \
> +		$(AUTOCONF_OUTPUT)
> 
>  # Create mlx4_autoconf.h or update it in case it differs from the new one.
> 
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index
> d151a90..5d8c76d 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -677,6 +677,15 @@ struct mlx4_conf {
> 
> 	IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DEBUG("FCS stripping toggling is %ssupported",
>  		      priv->hw_fcs_strip ? "" : "not ");
> +		priv->tso =
> +			((device_attr_ex.tso_caps.max_tso > 0) &&
> +			 (device_attr_ex.tso_caps.supported_qpts &
> +			  (1 << IBV_QPT_RAW_PACKET)));
> +		if (priv->tso)
> +			priv->tso_max_payload_sz =
> +					device_attr_ex.tso_caps.max_tso;
> +		DEBUG("TSO is %ssupported",
> +		      priv->tso ? "" : "not ");
>  		/* Configure the first MAC address by default. */
>  		err = mlx4_get_mac(priv, &mac.addr_bytes);
>  		if (err) {
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index
> 300cb4d..89d8c38 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -47,6 +47,9 @@
>  /** Interrupt alarm timeout value in microseconds. */  #define
> MLX4_INTR_ALARM_TIMEOUT 100000
> 
> +/* Maximum packet headers size (L2+L3+L4) for TSO. */ #define
> +MLX4_MAX_TSO_HEADER 192
> +
>  /** Port parameter. */
>  #define MLX4_PMD_PORT_KVARG "port"
> 
> @@ -90,6 +93,8 @@ struct priv {
>  	uint32_t hw_csum:1; /**< Checksum offload is supported. */
>  	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels.
> */
>  	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
> +	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> +	uint32_t tso_max_payload_sz; /**< Max supported TSO payload
> size. */
>  	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs
> format). */
>  	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
>  	struct mlx4_drop *drop; /**< Shared resources for drop flow rules.
> */ diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> index b771d8c..aef77ba 100644
> --- a/drivers/net/mlx4/mlx4_prm.h
> +++ b/drivers/net/mlx4/mlx4_prm.h
> @@ -19,6 +19,7 @@
>  #ifdef PEDANTIC
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> +#include "mlx4_autoconf.h"
> 
>  /* ConnectX-3 Tx queue basic block. */
>  #define MLX4_TXBB_SHIFT 6
> @@ -40,6 +41,7 @@
>  /* Work queue element (WQE) flags. */
>  #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)  #define
> MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> +#define MLX4_WQE_CTRL_RR (1 << 6)
> 
>  /* CQE checksum flags. */
>  enum {
> @@ -98,6 +100,19 @@ struct mlx4_cq {
>  	int arm_sn; /**< Rx event counter. */
>  };
> 
> +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> +/*
> + * WQE LSO segment structure.
> + * Defined here as backward compatibility for rdma-core v17 and below.
> + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
> + * and above.
> + */
> +struct mlx4_wqe_lso_seg {
> +	rte_be32_t mss_hdr_size;
> +	rte_be32_t header[];
> +};
> +#endif
> +
>  /**
>   * Retrieve a CQE entry from a CQ.
>   *
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 78b6dd5..750ad6d 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -38,10 +38,29 @@
>   * DWORD (32 byte) of a TXBB.
>   */
>  struct pv {
> -	volatile struct mlx4_wqe_data_seg *dseg;
> +	union {
> +		volatile struct mlx4_wqe_data_seg *dseg;
> +		volatile uint32_t *dst;
> +	};
>  	uint32_t val;
>  };
> 
> +/** A helper structure for TSO packet handling. */
> +struct tso_info {
> +	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
> +	struct pv *pv;
> +	/** Current entry in the pv array. */
> +	int pv_counter;
> +	/** Total size of the WQE including padding. */
> +	uint32_t wqe_size;
> +	/** size of TSO header to prepend to each packet to send. */

size => Size

> +	uint16_t tso_header_sz;
tso_header_sz => tso_header_size

"size" like the next fields name.

> +	/** Total size of the TSO segment in the WQE. */
> +	uint16_t wqe_tso_seg_size;
> +	/** Raw WQE size in units of 16 Bytes and without padding. */
> +	uint8_t fence_size;
> +};
> +
>  /** A table to translate Rx completion flags to packet type. */  uint32_t
> mlx4_ptype_table[0x100] __rte_cache_aligned = {
>  	/*
> @@ -368,6 +387,335 @@ struct pv {
>  }
> 
>  /**
> + * Obtain and calculate TSO information needed for assembling a TSO WQE.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to a structure to fill the info with.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo)
> +{
> +	struct mlx4_sq *sq = &txq->msq;
> +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> +
> +	tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
> +	if (tunneled)
> +		tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
> +	if (unlikely(buf->tso_segsz == 0 ||
> +		     tinfo->tso_header_sz == 0 ||
> +		     tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER ||
> +		     tinfo->tso_header_sz > buf->data_len))
> +		return -EINVAL;
> +	/*
> +	 * Calculate the WQE TSO segment size
> +	 * Note:
> +	 * 1. An LSO segment must be padded such that the subsequent data
> +	 *    segment is 16-byte aligned.
> +	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
> +	 */
> +	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +    tinfo->tso_header_sz, sizeof(struct mlx4_wqe_data_seg));
> +	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> +			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> +			     buf->nb_segs;
> +	tinfo->wqe_size =
> +		RTE_ALIGN((uint32_t)(tinfo->fence_size <<
> MLX4_SEG_SHIFT),
> +			  MLX4_TXBB_SIZE);
> +	/* Validate WQE size and WQE space in the send queue. */
> +	if (sq->remain_size < tinfo->wqe_size ||
> +	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> +		return -ENOMEM;
> +	/* Init pv. */
> +	tinfo->pv = (struct pv *)txq->bounce_buf;
> +	tinfo->pv_counter = 0;
> +	return 0;
> +}
> +
> +/**
> + * Fill the TSO WQE data segments with info on buffers to transmit.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param dseg
> + *   Pointer to the first data segment in the TSO WQE.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo,
> +			     volatile struct mlx4_wqe_data_seg *dseg,
> +			     volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> +	uint32_t lkey;
> +	int nb_segs = buf->nb_segs;
> +	int nb_segs_txbb;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct rte_mbuf *sbuf = buf;
> +	struct pv *pv = tinfo->pv;
> +	int *pv_counter = &tinfo->pv_counter;
> +	uint16_t sb_of = tinfo->tso_header_sz;
> +	uint16_t data_len;
> +
> +	while (nb_segs > 0) {

I think a do-while statement is better here (no need for the check on the first iteration).

> +		/* how many dseg entries do we have in the current TXBB ? */
> +		nb_segs_txbb =
> +			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
> +			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
> +			sizeof(struct mlx4_wqe_data_seg);

Division may be expensive; you can avoid it as follows:
nb_segs_txbb = (MLX4_TXBB_SIZE - ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >> MLX4_SEG_SHIFT;

> +		switch (nb_segs_txbb) {
> +		case 4:
> +			/* Memory region key for this memory pool. */
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			dseg->addr =
> +			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> +								     uintptr_t,
> +								     sb_of));
> +			dseg->lkey = lkey;
> +			/*
> +			 * This data segment starts at the beginning of a new
> +			 * TXBB, so we need to postpone its byte_count writing
> +			 * for later.
> +			 */
> +			pv[*pv_counter].dseg = dseg;
> +			/*
> +			 * Zero length segment is treated as inline segment
> +			 * with zero data.
> +			 */
> +			data_len = sbuf->data_len - sb_of;
> +			pv[(*pv_counter)++].val =
> +				rte_cpu_to_be_32(data_len ?
> +						 data_len :
> +						 0x80000000);
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;

I think that here, and in all the other cases, it is better to do "return X" instead of break,
where X is the same return value as now and can be calculated at the start.
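
For example, something like this (a sketch only, not a full diff):

	/* Computed once before the switch: */
	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
		(volatile struct mlx4_wqe_ctrl_seg *)
			((volatile uint8_t *)ctrl + tinfo->wqe_size);

	/* Then inside each case, instead of "break": */
	if (--nb_segs == 0)
		return ctrl_next;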

> +			/* fallthrough */
> +		case 3:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 2:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 1:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			--nb_segs;
> +			break;
> +		default:
> +			/* Should never happen */
> +			rte_panic("%p: Invalid number of SGEs(%d) for a
> TXBB",
> +				  (void *)txq, nb_segs_txbb);

I don't think we need the default case here. Do you have any scenario where it can really happen?

> +		}
> +		/* Wrap dseg if it points at the end of the queue. */
> +		if ((volatile uint8_t *)dseg >= sq->eob)
> +			dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((volatile uint8_t *)dseg - sq->size);
> +	}
> +	/* Align next WQE address to the next TXBB. */
> +	return (volatile struct mlx4_wqe_ctrl_seg *)
> +		((volatile uint8_t *)ctrl + tinfo->wqe_size);
> +lkey_err:
> +	return NULL;
> +}
> +
> +/**
> + * Fill the packet's l2, l3 and l4 headers to the WQE.
> + *
> + * This will be used as the header for each TSO segment that is transmitted.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param ctrl
> + *   Pointer to the control segment in the TSO WQE.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline volatile struct mlx4_wqe_data_seg *
> +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> +			   struct txq *txq,
> +			   struct tso_info *tinfo,
> +			   volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> +	volatile struct mlx4_wqe_lso_seg *tseg =
> +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct pv *pv = tinfo->pv;
> +	int *pv_counter = &tinfo->pv_counter;
> +	int remain_sz = tinfo->tso_header_sz;
> +	char *from = rte_pktmbuf_mtod(buf, char *);
> +	uint16_t txbb_avail_space;
> +	int copy_sz;
> +	/* Union to overcome volatile constraints when copying TSO header. */
> +	union {
> +		volatile uint8_t *vto;
> +		uint8_t *to;
> +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> +
> +	/*
> +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> +	 * we can write the first 44 TSO header bytes without worry for TxQ
> +	 * wrapping or overwriting the first TXBB 32bit word.
> +	 */
> +	txbb_avail_space = MLX4_TXBB_SIZE -
> +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> +			    sizeof(struct mlx4_wqe_lso_seg));
> +	do {
> +		copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
> +		rte_memcpy(thdr.to, from, copy_sz);
> +		remain_sz -= copy_sz;
> +		if (remain_sz <= 0)
> +			break;
> +		from += copy_sz;
> +		thdr.to += copy_sz;
> +		/* New TXBB, Check for TxQ wrap. */
> +		if (thdr.to >= sq->eob)
> +			thdr.vto = sq->buf;
> +		/* New TXBB, stash the first 32bits for later use. */
> +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
> +		rte_memcpy(&pv[*pv_counter].val, from,
> +			   RTE_MIN((size_t)remain_sz, sizeof(uint32_t)));
> +		(*pv_counter)++;
> +		from += sizeof(uint32_t);
> +		thdr.to += sizeof(uint32_t);
> +		remain_sz -= sizeof(uint32_t);
> +		/* Space in current TXBB is TXBB size - 4 */
> +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> +	} while (remain_sz > 0);

I think the loop can be improved - you now have 5 checks per TXBB; we can reduce it to 2, as follows:

txbb_data_space = 44 // does not include the first 4 bytes of the current txbb

while (remain_size >= txbb_data_space + 4) // write the tail of the current txbb + the head of the next txbb
	write txbb_data_space bytes to the WQE
	check for wrap-around
	write 4 bytes for the next txbb to pv
	remain_size -= txbb_data_space + 4
	txbb_data_space = 60

if (remain_size > txbb_data_space) // write the tail and part of the head
	write txbb_data_space bytes to the WQE
	check for wrap-around
	write (remain_size - txbb_data_space) bytes to pv
else // write only the tail
	write remain_size bytes from the header

Am I missing something?
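
In C this would look roughly like the sketch below (reusing the locals of
mlx4_tx_burst_fill_tso_hdr() from the patch; untested):

	uint16_t txbb_data_space = MLX4_TXBB_SIZE -
				   sizeof(struct mlx4_wqe_ctrl_seg) -
				   sizeof(struct mlx4_wqe_lso_seg); /* 44 bytes */

	/* Tail of the current TXBB plus the stashed head of the next one. */
	while (remain_sz >= txbb_data_space + (int)sizeof(uint32_t)) {
		rte_memcpy(thdr.to, from, txbb_data_space);
		from += txbb_data_space;
		thdr.to += txbb_data_space;
		/* New TXBB, check for TxQ wrap. */
		if (thdr.to >= sq->eob)
			thdr.vto = sq->buf;
		/* Stash the first DWORD of the new TXBB for later. */
		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
		rte_memcpy(&pv[*pv_counter].val, from, sizeof(uint32_t));
		(*pv_counter)++;
		from += sizeof(uint32_t);
		thdr.to += sizeof(uint32_t);
		remain_sz -= txbb_data_space + (int)sizeof(uint32_t);
		txbb_data_space = MLX4_TXBB_SIZE - sizeof(uint32_t); /* 60 bytes */
	}
	if (remain_sz > txbb_data_space) {
		/* Tail of the current TXBB plus a partial stashed head. */
		rte_memcpy(thdr.to, from, txbb_data_space);
		from += txbb_data_space;
		thdr.to += txbb_data_space;
		if (thdr.to >= sq->eob)
			thdr.vto = sq->buf;
		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
		rte_memcpy(&pv[*pv_counter].val, from,
			   remain_sz - txbb_data_space);
		(*pv_counter)++;
	} else if (remain_sz > 0) {
		/* Only a tail left in the current TXBB. */
		rte_memcpy(thdr.to, from, remain_sz);
	}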
	
> +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> +					      tinfo->tso_header_sz);
> +	/* Calculate data segment location */
> +	return (volatile struct mlx4_wqe_data_seg *)
> +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> }
> +
> +/**
> + * Write data segments and header for TSO uni/multi segment packet.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param ctrl
> + *   Pointer to the WQE control segment.
> + *
> + * @return
> + *   Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
> +		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> +	volatile struct mlx4_wqe_data_seg *dseg;
> +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct tso_info tinfo;
> +	struct pv *pv;
> +	int pv_counter;
> +	int ret;
> +
> +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> +	if (unlikely(ret))
> +		goto error;
> +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> +	if (unlikely(dseg == NULL))
> +		goto error;
> +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> +		dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((uintptr_t)dseg - sq->size);
> +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> +	if (unlikely(ctrl_next == NULL))
> +		goto error;
> +	/* Write the first DWORD of each TXBB saved earlier. */
> +	if (tinfo.pv_counter) {

I think you can add likely() here:

The minimum set of segments is:
1. ctrl
2. Ethernet header
3. IP header
4. IP/TCP header
5. at least 1 data segment.

Maybe we don't even need this check.

> +		pv = tinfo.pv;
> +		pv_counter = tinfo.pv_counter;
> +		/* Need a barrier here before writing the first TXBB word. */
> +		rte_io_wmb();
> +		for (--pv_counter; pv_counter  >= 0; pv_counter--)
> +			*pv[pv_counter].dst = pv[pv_counter].val;
> +	}
> +	ctrl->fence_size = tinfo.fence_size;
> +	sq->remain_size -= tinfo.wqe_size;
> +	return ctrl_next;
> +error:
> +	txq->stats.odropped++;
> +	return NULL;
> +}
> +
> +/**
>   * Write data segments of multi-segment packet.
>   *
>   * @param buf
> @@ -560,6 +908,7 @@ struct pv {
>  			uint16_t flags16[2];
>  		} srcrb;
>  		uint32_t lkey;
> +		bool tso = txq->priv->tso && (buf->ol_flags &
> PKT_TX_TCP_SEG);
> 
>  		/* Clean up old buffer. */
>  		if (likely(elt->buf != NULL)) {
> @@ -578,7 +927,16 @@ struct pv {
>  			} while (tmp != NULL);
>  		}
>  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -		if (buf->nb_segs == 1) {
> +		if (tso) {
> +			/* Change opcode to TSO */
> +			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> +			owner_opcode |= MLX4_OPCODE_LSO |
> MLX4_WQE_CTRL_RR;
> +			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> +			if (!ctrl_next) {
> +				elt->buf = NULL;
> +				break;
> +			}
> +		} else if (buf->nb_segs == 1) {
>  			/* Validate WQE space in the send queue. */
>  			if (sq->remain_size < MLX4_TXBB_SIZE) {
>  				elt->buf = NULL;
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 4c025e3..ffa8abf 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
>  	unsigned int idx; /**< Mapping index. */
>  	uint64_t opackets; /**< Total of successfully sent packets. */
>  	uint64_t obytes; /**< Total of successfully sent bytes. */
> -	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
> +	uint64_t odropped; /**< Total number of packets failed to transmit. */
>  };
> 
>  /** Tx queue descriptor. */
> diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> index 6edaadb..9aa7440 100644
> --- a/drivers/net/mlx4/mlx4_txq.c
> +++ b/drivers/net/mlx4/mlx4_txq.c
> @@ -116,8 +116,14 @@
>  			     DEV_TX_OFFLOAD_UDP_CKSUM |
>  			     DEV_TX_OFFLOAD_TCP_CKSUM);
>  	}
> -	if (priv->hw_csum_l2tun)
> +	if (priv->tso)
> +		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> +	if (priv->hw_csum_l2tun) {
>  		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +		if (priv->tso)
> +			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> +				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> +	}
>  	return offloads;
>  }
> 
> --
> 1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [dpdk-dev] [PATCH v5] net/mlx4: support hardware TSO
  2018-07-04 14:53     ` [dpdk-dev] [PATCH v4] " Moti Haimovsky
  2018-07-05 12:30       ` Matan Azrad
@ 2018-07-09 10:43       ` Moti Haimovsky
  2018-07-09 13:07         ` Matan Azrad
  2018-07-09 16:33         ` [dpdk-dev] [PATCH v6] " Moti Haimovsky
  1 sibling, 2 replies; 14+ messages in thread
From: Moti Haimovsky @ 2018-07-09 10:43 UTC (permalink / raw)
  To: adrien.mazarguil, matan; +Cc: dev, Moti Haimovsky

Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v5:
* Modification to the code according to review inputs from Matan
  Azrad.
* Code optimization to the TSO header copy routine.
* Rearranged the TSO data-segments creation routine.
in reply to 
1530715998-15703-1-git-send-email-motih@mellanox.com

v4:
* Bug fixes in filling TSO data segments.
* Modifications according to review inputs from Adrien Mazarguil
  and Matan Azrad.
in reply to
1530190137-17848-1-git-send-email-motih@mellanox.com

v3:
* Fixed compilation errors in compilers without GNU C extensions
  caused by a declaration of zero-length array in the code.
in reply to
1530187032-6489-1-git-send-email-motih@mellanox.com

v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---
 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/Makefile         |   5 +
 drivers/net/mlx4/mlx4.c           |   9 +
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  15 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 372 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 9 files changed, 416 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@ Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@ Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 73f9d40..63bc003 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
 mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
 	$Q : > '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_WQE_LSO_SEG \
+		infiniband/mlx4dv.h \
+		type 'struct mlx4_wqe_lso_seg' \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx4_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..5d8c76d 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -677,6 +677,15 @@ struct mlx4_conf {
 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DEBUG("FCS stripping toggling is %ssupported",
 		      priv->hw_fcs_strip ? "" : "not ");
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso > 0) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
 		/* Configure the first MAC address by default. */
 		err = mlx4_get_mac(priv, &mac.addr_bytes);
 		if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..89d8c38 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/* Maximum packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@ struct priv {
 	uint32_t hw_csum:1; /**< Checksum offload is supported. */
 	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
 	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
+	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
 	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
 	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index b771d8c..aef77ba 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -19,6 +19,7 @@
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
+#include "mlx4_autoconf.h"
 
 /* ConnectX-3 Tx queue basic block. */
 #define MLX4_TXBB_SHIFT 6
@@ -40,6 +41,7 @@
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -98,6 +100,19 @@ struct mlx4_cq {
 	int arm_sn; /**< Rx event counter. */
 };
 
+#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg {
+	rte_be32_t mss_hdr_size;
+	rte_be32_t header[];
+};
+#endif
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 78b6dd5..b695539 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,29 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	volatile struct mlx4_wqe_data_seg *dseg;
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
 	uint32_t val;
 };
 
+/** A helper structure for TSO packet handling. */
+struct tso_info {
+	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
+	struct pv *pv;
+	/** Current entry in the pv array. */
+	int pv_counter;
+	/** Total size of the WQE including padding. */
+	uint32_t wqe_size;
+	/** Size of TSO header to prepend to each packet to send. */
+	uint16_t tso_header_size;
+	/** Total size of the TSO segment in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/** Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
 	/*
@@ -368,6 +387,345 @@ struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_size +=
+				buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 ||
+		     tinfo->tso_header_size == 0 ||
+		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
+		     tinfo->tso_header_size > buf->data_len))
+		return -EINVAL;
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+					    tinfo->tso_header_size,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	/* Init pv. */
+	tinfo->pv = (struct pv *)txq->bounce_buf;
+	tinfo->pv_counter = 0;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo,
+			     volatile struct mlx4_wqe_data_seg *dseg,
+			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
+			(volatile struct mlx4_wqe_ctrl_seg *)
+				((volatile uint8_t *)ctrl + tinfo->wqe_size);
+	uint16_t sb_of = tinfo->tso_header_size;
+	uint16_t data_len;
+
+	do {
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb = (MLX4_TXBB_SIZE -
+				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
+			       MLX4_SEG_SHIFT;
+		switch (nb_segs_txbb) {
+		default:
+			/* Should never happen. */
+			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
+			(void *)txq, nb_segs_txbb);
+			/* rte_panic never returns. */
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			dseg->addr =
+			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
+								     uintptr_t,
+								     sb_of));
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			data_len = sbuf->data_len - sb_of;
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				return ctrl_next;
+		}
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((volatile uint8_t *)dseg - sq->size);
+	} while (true);
+err:
+	return NULL;
+}
+
+/**
+ * Fill the packet's l2, l3 and l4 headers to the WQE.
+ *
+ * This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the location of the first data segment in the TSO WQE.
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_lso_seg *tseg =
+		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	int remain_size = tinfo->tso_header_size;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg));
+	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
+		/* Copy to end of txbb. */
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		pv[(*pv_counter)++].val = *(uint32_t *)from,
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_size -= (txbb_avail_space + sizeof(uint32_t));
+		/* Avail space in new TXBB is TXBB size - 4 */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+	}
+	if (remain_size > txbb_avail_space) {
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		remain_size -= txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
+		(*pv_counter)++;
+	} else {
+		rte_memcpy(thdr.to, from, remain_size);
+	}
+
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_size);
+	/* Calculate data segment location */
+	return (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv;
+	int pv_counter;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (unlikely(ret))
+		goto error;
+	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+	if (unlikely(dseg == NULL))
+		goto error;
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+	if (unlikely(ctrl_next == NULL))
+		goto error;
+	/* Write the first DWORD of each TXBB saved earlier. */
+	pv = tinfo.pv;
+	pv_counter = tinfo.pv_counter;
+	/* Need a barrier here before writing the first TXBB word. */
+	rte_io_wmb();
+	for (--pv_counter; pv_counter  >= 0; pv_counter--)
+		*pv[pv_counter].dst = pv[pv_counter].val;
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	return ctrl_next;
+error:
+	txq->stats.odropped++;
+	return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -560,6 +918,7 @@ struct pv {
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -578,7 +937,16 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@ struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }
 
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/mlx4: support hardware TSO
  2018-07-09 10:43       ` [dpdk-dev] [PATCH v5] " Moti Haimovsky
@ 2018-07-09 13:07         ` Matan Azrad
  2018-07-09 16:22           ` Mordechay Haimovsky
  2018-07-09 16:33         ` [dpdk-dev] [PATCH v6] " Moti Haimovsky
  1 sibling, 1 reply; 14+ messages in thread
From: Matan Azrad @ 2018-07-09 13:07 UTC (permalink / raw)
  To: Mordechay Haimovsky, Adrien Mazarguil; +Cc: dev



Hi Moti 

Please see some comments below.

From: Mordechay Haimovsky
> Implement support for hardware TSO.
> 
> Signed-off-by: Moti Haimovsky <motih@mellanox.com>
> ---
> v5:
> * Modification to the code according to review inputs from Matan
>   Azrad.
> * Code optimization to the TSO header copy routine.
> * Rearranged the TSO data-segments creation routine.
> in reply to
> 1530715998-15703-1-git-send-email-motih@mellanox.com
> 
> v4:
> * Bug fixes in filling TSO data segments.
> * Modifications according to review inputs from Adrien Mazarguil
>   and Matan Azrad.
> in reply to
> 1530190137-17848-1-git-send-email-motih@mellanox.com
> 
> v3:
> * Fixed compilation errors in compilers without GNU C extensions
>   caused by a declaration of zero-length array in the code.
> in reply to
> 1530187032-6489-1-git-send-email-motih@mellanox.com
> 
> v2:
> * Fixed coding style warning.
> in reply to
> 1530184583-30166-1-git-send-email-motih@mellanox.com
> 
> v1:
> * Fixed coding style warnings.
> in reply to
> 1530181779-19716-1-git-send-email-motih@mellanox.com
> ---
>  doc/guides/nics/features/mlx4.ini |   1 +
>  doc/guides/nics/mlx4.rst          |   3 +
>  drivers/net/mlx4/Makefile         |   5 +
>  drivers/net/mlx4/mlx4.c           |   9 +
>  drivers/net/mlx4/mlx4.h           |   5 +
>  drivers/net/mlx4/mlx4_prm.h       |  15 ++
>  drivers/net/mlx4/mlx4_rxtx.c      | 372
> +++++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
>  drivers/net/mlx4/mlx4_txq.c       |   8 +-
>  9 files changed, 416 insertions(+), 4 deletions(-)
> 
> diff --git a/doc/guides/nics/features/mlx4.ini
> b/doc/guides/nics/features/mlx4.ini
> index f6efd21..98a3f61 100644
> --- a/doc/guides/nics/features/mlx4.ini
> +++ b/doc/guides/nics/features/mlx4.ini
> @@ -13,6 +13,7 @@ Queue start/stop     = Y
>  MTU update           = Y
>  Jumbo frame          = Y
>  Scattered Rx         = Y
> +TSO                  = Y
>  Promiscuous mode     = Y
>  Allmulticast mode    = Y
>  Unicast MAC filter   = Y
> diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst index
> 491106a..12adaeb 100644
> --- a/doc/guides/nics/mlx4.rst
> +++ b/doc/guides/nics/mlx4.rst
> @@ -142,6 +142,9 @@ Limitations
>    The ability to enable/disable CRC stripping requires OFED version
>    4.3-1.5.0.0 and above  or rdma-core version v18 and above.
> 
> +- TSO (Transmit Segmentation Offload) is supported in OFED version
> +  4.4 and above or in rdma-core version v18 and above.
> +
>  Prerequisites
>  -------------
> 
> diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile index
> 73f9d40..63bc003 100644
> --- a/drivers/net/mlx4/Makefile
> +++ b/drivers/net/mlx4/Makefile
> @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
>  mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
>  	$Q $(RM) -f -- '$@'
>  	$Q : > '$@'
> +	$Q sh -- '$<' '$@' \
> +		HAVE_IBV_MLX4_WQE_LSO_SEG \
> +		infiniband/mlx4dv.h \
> +		type 'struct mlx4_wqe_lso_seg' \
> +		$(AUTOCONF_OUTPUT)
> 
>  # Create mlx4_autoconf.h or update it in case it differs from the new one.
> 
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index
> d151a90..5d8c76d 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -677,6 +677,15 @@ struct mlx4_conf {
> 
> 	IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DEBUG("FCS stripping toggling is %ssupported",
>  		      priv->hw_fcs_strip ? "" : "not ");
> +		priv->tso =
> +			((device_attr_ex.tso_caps.max_tso > 0) &&
> +			 (device_attr_ex.tso_caps.supported_qpts &
> +			  (1 << IBV_QPT_RAW_PACKET)));
> +		if (priv->tso)
> +			priv->tso_max_payload_sz =
> +					device_attr_ex.tso_caps.max_tso;
> +		DEBUG("TSO is %ssupported",
> +		      priv->tso ? "" : "not ");
>  		/* Configure the first MAC address by default. */
>  		err = mlx4_get_mac(priv, &mac.addr_bytes);
>  		if (err) {
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index
> 300cb4d..89d8c38 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -47,6 +47,9 @@
>  /** Interrupt alarm timeout value in microseconds. */  #define
> MLX4_INTR_ALARM_TIMEOUT 100000
> 
> +/* Maximum packet headers size (L2+L3+L4) for TSO. */ #define
> +MLX4_MAX_TSO_HEADER 192
> +
>  /** Port parameter. */
>  #define MLX4_PMD_PORT_KVARG "port"
> 
> @@ -90,6 +93,8 @@ struct priv {
>  	uint32_t hw_csum:1; /**< Checksum offload is supported. */
>  	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels.
> */
>  	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
> +	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> +	uint32_t tso_max_payload_sz; /**< Max supported TSO payload
> size. */
>  	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs
> format). */
>  	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
>  	struct mlx4_drop *drop; /**< Shared resources for drop flow rules.
> */ diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> index b771d8c..aef77ba 100644
> --- a/drivers/net/mlx4/mlx4_prm.h
> +++ b/drivers/net/mlx4/mlx4_prm.h
> @@ -19,6 +19,7 @@
>  #ifdef PEDANTIC
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> +#include "mlx4_autoconf.h"
> 
>  /* ConnectX-3 Tx queue basic block. */
>  #define MLX4_TXBB_SHIFT 6
> @@ -40,6 +41,7 @@
>  /* Work queue element (WQE) flags. */
>  #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)  #define
> MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> +#define MLX4_WQE_CTRL_RR (1 << 6)
> 
>  /* CQE checksum flags. */
>  enum {
> @@ -98,6 +100,19 @@ struct mlx4_cq {
>  	int arm_sn; /**< Rx event counter. */
>  };
> 
> +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> +/*
> + * WQE LSO segment structure.
> + * Defined here as backward compatibility for rdma-core v17 and below.
> + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
> + * and above.
> + */
> +struct mlx4_wqe_lso_seg {
> +	rte_be32_t mss_hdr_size;
> +	rte_be32_t header[];
> +};
> +#endif
> +
>  /**
>   * Retrieve a CQE entry from a CQ.
>   *
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 78b6dd5..b695539 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -38,10 +38,29 @@
>   * DWORD (32 byte) of a TXBB.
>   */
>  struct pv {
> -	volatile struct mlx4_wqe_data_seg *dseg;
> +	union {
> +		volatile struct mlx4_wqe_data_seg *dseg;
> +		volatile uint32_t *dst;
> +	};
>  	uint32_t val;
>  };
> 
> +/** A helper structure for TSO packet handling. */ struct tso_info {
> +	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
> +	struct pv *pv;
> +	/** Current entry in the pv array. */
> +	int pv_counter;
> +	/** Total size of the WQE including padding. */
> +	uint32_t wqe_size;
> +	/** Size of TSO header to prepend to each packet to send. */
> +	uint16_t tso_header_size;
> +	/** Total size of the TSO segment in the WQE. */
> +	uint16_t wqe_tso_seg_size;
> +	/** Raw WQE size in units of 16 Bytes and without padding. */
> +	uint8_t fence_size;
> +};
> +
>  /** A table to translate Rx completion flags to packet type. */  uint32_t
> mlx4_ptype_table[0x100] __rte_cache_aligned = {
>  	/*
> @@ -368,6 +387,345 @@ struct pv {
>  }
> 
>  /**
> + * Obtain and calculate TSO information needed for assembling a TSO WQE.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to a structure to fill the info with.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo)
> +{
> +	struct mlx4_sq *sq = &txq->msq;
> +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> +
> +	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
> +	if (tunneled)
> +		tinfo->tso_header_size +=
> +				buf->outer_l2_len + buf->outer_l3_len;
> +	if (unlikely(buf->tso_segsz == 0 ||
> +		     tinfo->tso_header_size == 0 ||
> +		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
> +		     tinfo->tso_header_size > buf->data_len))
> +		return -EINVAL;
> +	/*
> +	 * Calculate the WQE TSO segment size
> +	 * Note:
> +	 * 1. An LSO segment must be padded such that the subsequent data
> +	 *    segment is 16-byte aligned.
> +	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
> +	 */
> +	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct
> mlx4_wqe_lso_seg) +
> +					    tinfo->tso_header_size,
> +					    sizeof(struct
> mlx4_wqe_data_seg));
> +	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> +			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> +			     buf->nb_segs;
> +	tinfo->wqe_size =
> +		RTE_ALIGN((uint32_t)(tinfo->fence_size <<
> MLX4_SEG_SHIFT),
> +			  MLX4_TXBB_SIZE);
> +	/* Validate WQE size and WQE space in the send queue. */
> +	if (sq->remain_size < tinfo->wqe_size ||
> +	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> +		return -ENOMEM;
> +	/* Init pv. */
> +	tinfo->pv = (struct pv *)txq->bounce_buf;
> +	tinfo->pv_counter = 0;
> +	return 0;
> +}
> +
> +/**
> + * Fill the TSO WQE data segments with info on buffers to transmit .
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param dseg
> + *   Pointer to the first data segment in the TSO WQE.
> + * @param ctrl
> + *   Pointer to the control segment in the TSO WQE.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo,
> +			     volatile struct mlx4_wqe_data_seg *dseg,
> +			     volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> +	uint32_t lkey;
> +	int nb_segs = buf->nb_segs;
> +	int nb_segs_txbb;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct rte_mbuf *sbuf = buf;
> +	struct pv *pv = tinfo->pv;
> +	int *pv_counter = &tinfo->pv_counter;
> +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
> +			(volatile struct mlx4_wqe_ctrl_seg *)
> +				((volatile uint8_t *)ctrl + tinfo->wqe_size);
> +	uint16_t sb_of = tinfo->tso_header_size;
> +	uint16_t data_len;
> +
> +	do {
> +		/* how many dseg entries do we have in the current TXBB ?
> */
> +		nb_segs_txbb = (MLX4_TXBB_SIZE -
> +				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
> +			       MLX4_SEG_SHIFT;
> +		switch (nb_segs_txbb) {
> +		default:
> +			/* Should never happen. */
> +			rte_panic("%p: Invalid number of SGEs(%d) for a
> TXBB",
> +			(void *)txq, nb_segs_txbb);
> +			/* rte_panic never returns. */

Since this default case should not happen because of the above calculation, I think we don't need it.
Just "break" if the compiler complains about the missing default case.

> +		case 4:
> +			/* Memory region key for this memory pool. */
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto err;
> +			dseg->addr =
> +
> rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> +								     uintptr_t,
> +								     sb_of));
> +			dseg->lkey = lkey;
> +			/*
> +			 * This data segment starts at the beginning of a new
> +			 * TXBB, so we need to postpone its byte_count
> writing
> +			 * for later.
> +			 */
> +			pv[*pv_counter].dseg = dseg;
> +			/*
> +			 * Zero length segment is treated as inline segment
> +			 * with zero data.
> +			 */
> +			data_len = sbuf->data_len - sb_of;

Is there a chance that data_len will be negative? Since it is unsigned, it would wrap around in this case.
Maybe it is better to change it to int16_t and to replace the next check with:
data_len > 0 ? data_len : 0x80000000
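
Untested, just to illustrate where the change would go:

	int16_t data_len = sbuf->data_len - sb_of;
	...
	pv[(*pv_counter)++].val =
		rte_cpu_to_be_32(data_len > 0 ?
				 data_len :
				 0x80000000);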


And I think I found a way to remove the sb_of calculations for each segment:

Each segment can prepare the parameters of the next segment, so that only the pre-loop calculation for the first segment has to account for the header offset:

The parameters: data_len and sb_of.

So before the loop:
sb_of = tinfo->tso_header_size;
data_len = sbuf->data_len - sb_of;

And inside the loop (after the check of nb_segs):
sb_of = 0;
data_len = sbuf->data_len; (i.e. the data_len of the next sbuf)

So each segment calculates the next segment's parameters and we don't need the "- sb_of" calculation per segment.
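
A rough, untested sketch of what I mean (only case 4 shown, the other cases
follow the same pattern):

	uint16_t sb_of = tinfo->tso_header_size;
	uint16_t data_len = sbuf->data_len - sb_of;
	...
		case 4:
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			dseg->addr =
			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
								     uintptr_t,
								     sb_of));
			dseg->lkey = lkey;
			pv[*pv_counter].dseg = dseg;
			/*
			 * Zero length segment is treated as inline segment
			 * with zero data.
			 */
			pv[(*pv_counter)++].val =
				rte_cpu_to_be_32(data_len ?
						 data_len :
						 0x80000000);
			dseg++;
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare the parameters of the next segment. */
			sbuf = sbuf->next;
			sb_of = 0;
			data_len = sbuf->data_len;
			/* fallthrough */
	...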

> +			pv[(*pv_counter)++].val =
> +				rte_cpu_to_be_32(data_len ?
> +						 data_len :
> +						 0x80000000);
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				return ctrl_next;
> +			/* fallthrough */
> +		case 3:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				return ctrl_next;
> +			/* fallthrough */
> +		case 2:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				return ctrl_next;
> +			/* fallthrough */
> +		case 1:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				return ctrl_next;
> +		}
> +		/* Wrap dseg if it points at the end of the queue. */
> +		if ((volatile uint8_t *)dseg >= sq->eob)
> +			dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((volatile uint8_t *)dseg - sq->size);
> +	} while (true);
> +err:
> +	return NULL;
> +}
> +
> +/**
> + * Fill the packet's l2, l3 and l4 headers to the WQE.
> + *
> + * This will be used as the header for each TSO segment that is transmitted.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param ctrl
> + *   Pointer to the control segment in the TSO WQE.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline volatile struct mlx4_wqe_data_seg *
> +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> +			   struct txq *txq,
> +			   struct tso_info *tinfo,
> +			   volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> +	volatile struct mlx4_wqe_lso_seg *tseg =
> +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct pv *pv = tinfo->pv;
> +	int *pv_counter = &tinfo->pv_counter;
> +	int remain_size = tinfo->tso_header_size;
> +	char *from = rte_pktmbuf_mtod(buf, char *);
> +	uint16_t txbb_avail_space;
> +	/* Union to overcome volatile constraints when copying TSO header.
> */
> +	union {
> +		volatile uint8_t *vto;
> +		uint8_t *to;
> +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> +
> +	/*
> +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> +	 * we can write the first 44 TSO header bytes without worry for TxQ
> +	 * wrapping or overwriting the first TXBB 32bit word.
> +	 */
> +	txbb_avail_space = MLX4_TXBB_SIZE -
> +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> +			    sizeof(struct mlx4_wqe_lso_seg));

I think a better name is txbb_tail_size.

> +	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
> +		/* Copy to end of txbb. */
> +		rte_memcpy(thdr.to, from, txbb_avail_space);
> +		from += txbb_avail_space;
> +		thdr.to += txbb_avail_space;
> +		/* New TXBB, Check for TxQ wrap. */
> +		if (thdr.to >= sq->eob)
> +			thdr.vto = sq->buf;
> +		/* New TXBB, stash the first 32bits for later use. */
> +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> +		pv[(*pv_counter)++].val = *(uint32_t *)from,
> +		from += sizeof(uint32_t);
> +		thdr.to += sizeof(uint32_t);
> +		remain_size -= (txbb_avail_space + sizeof(uint32_t));

You don't need the () here.

> +		/* Avail space in new TXBB is TXBB size - 4 */
> +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> +	}
> +	if (remain_size > txbb_avail_space) {
> +		rte_memcpy(thdr.to, from, txbb_avail_space);
> +		from += txbb_avail_space;
> +		thdr.to += txbb_avail_space;
> +		remain_size -= txbb_avail_space;
> +		/* New TXBB, Check for TxQ wrap. */
> +		if (thdr.to >= sq->eob)
> +			thdr.vto = sq->buf;
> +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> +		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
> +		(*pv_counter)++;
> +	} else {

Here it should be else if (remain_size > 0).
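
I.e. something like this (sketch), so the copy is skipped entirely when
nothing is left:

	} else if (remain_size > 0) {
		rte_memcpy(thdr.to, from, remain_size);
	}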

> +		rte_memcpy(thdr.to, from, remain_size);
> +	}
> +
> +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> +					      tinfo->tso_header_size);
> +	/* Calculate data segment location */
> +	return (volatile struct mlx4_wqe_data_seg *)
> +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> }
> +
> +/**
> + * Write data segments and header for TSO uni/multi segment packet.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param ctrl
> + *   Pointer to the WQE control segment.
> + *
> + * @return
> + *   Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static volatile struct mlx4_wqe_ctrl_seg * mlx4_tx_burst_tso(struct
> +rte_mbuf *buf, struct txq *txq,
> +		  volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> +	volatile struct mlx4_wqe_data_seg *dseg;
> +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct tso_info tinfo;
> +	struct pv *pv;
> +	int pv_counter;
> +	int ret;
> +
> +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> +	if (unlikely(ret))
> +		goto error;
> +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> +	if (unlikely(dseg == NULL))
> +		goto error;
> +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> +		dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((uintptr_t)dseg - sq->size);
> +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> +	if (unlikely(ctrl_next == NULL))
> +		goto error;
> +	/* Write the first DWORD of each TXBB save earlier. */
> +	pv = tinfo.pv;
> +	pv_counter = tinfo.pv_counter;
> +	/* Need a barrier here before writing the first TXBB word. */
> +	rte_io_wmb();

> +	for (--pv_counter; pv_counter  >= 0; pv_counter--)

Since we don't need the first check, a do-while statement is better.
To be fully safe you can add a likely() check before the memory barrier.
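
For example (untested, just to show the idea):

	if (likely(pv_counter)) {
		/* Need a barrier here before writing the first TXBB word. */
		rte_io_wmb();
		do {
			--pv_counter;
			*pv[pv_counter].dst = pv[pv_counter].val;
		} while (pv_counter > 0);
	}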

> +		*pv[pv_counter].dst = pv[pv_counter].val;
> +	ctrl->fence_size = tinfo.fence_size;
> +	sq->remain_size -= tinfo.wqe_size;
> +	return ctrl_next;
> +error:
> +	txq->stats.odropped++;
> +	return NULL;
> +}
> +
> +/**
>   * Write data segments of multi-segment packet.
>   *
>   * @param buf
> @@ -560,6 +918,7 @@ struct pv {
>  			uint16_t flags16[2];
>  		} srcrb;
>  		uint32_t lkey;
> +		bool tso = txq->priv->tso && (buf->ol_flags &
> PKT_TX_TCP_SEG);
> 
>  		/* Clean up old buffer. */
>  		if (likely(elt->buf != NULL)) {
> @@ -578,7 +937,16 @@ struct pv {
>  			} while (tmp != NULL);
>  		}
>  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -		if (buf->nb_segs == 1) {
> +		if (tso) {
> +			/* Change opcode to TSO */
> +			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> +			owner_opcode |= MLX4_OPCODE_LSO |
> MLX4_WQE_CTRL_RR;
> +			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> +			if (!ctrl_next) {
> +				elt->buf = NULL;
> +				break;
> +			}
> +		} else if (buf->nb_segs == 1) {
>  			/* Validate WQE space in the send queue. */
>  			if (sq->remain_size < MLX4_TXBB_SIZE) {
>  				elt->buf = NULL;
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 4c025e3..ffa8abf 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
>  	unsigned int idx; /**< Mapping index. */
>  	uint64_t opackets; /**< Total of successfully sent packets. */
>  	uint64_t obytes; /**< Total of successfully sent bytes. */
> -	uint64_t odropped; /**< Total of packets not sent when Tx ring full.
> */
> +	uint64_t odropped; /**< Total number of packets failed to transmit.
> */
>  };
> 
>  /** Tx queue descriptor. */
> diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> index 6edaadb..9aa7440 100644
> --- a/drivers/net/mlx4/mlx4_txq.c
> +++ b/drivers/net/mlx4/mlx4_txq.c
> @@ -116,8 +116,14 @@
>  			     DEV_TX_OFFLOAD_UDP_CKSUM |
>  			     DEV_TX_OFFLOAD_TCP_CKSUM);
>  	}
> -	if (priv->hw_csum_l2tun)
> +	if (priv->tso)
> +		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> +	if (priv->hw_csum_l2tun) {
>  		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +		if (priv->tso)
> +			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> +				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> +	}
>  	return offloads;
>  }
> 
> --
> 1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/mlx4: support hardware TSO
  2018-07-09 13:07         ` Matan Azrad
@ 2018-07-09 16:22           ` Mordechay Haimovsky
  2018-07-09 18:18             ` Matan Azrad
  0 siblings, 1 reply; 14+ messages in thread
From: Mordechay Haimovsky @ 2018-07-09 16:22 UTC (permalink / raw)
  To: Matan Azrad, Adrien Mazarguil; +Cc: dev

inline

> -----Original Message-----
> From: Matan Azrad
> Sent: Monday, July 9, 2018 4:07 PM
> To: Mordechay Haimovsky <motih@mellanox.com>; Adrien Mazarguil
> <adrien.mazarguil@6wind.com>
> Cc: dev@dpdk.org
> Subject: RE: [PATCH v5] net/mlx4: support hardware TSO
> 
> 
> 
> Hi Moti
> 
> Please see some comments below.
> 
> From: Mordechay Haimovsky
> > Implement support for hardware TSO.
> >
> > Signed-off-by: Moti Haimovsky <motih@mellanox.com>
> > ---
> > v5:
> > * Modification to the code according to review inputs from Matan
> >   Azrad.
> > * Code optimization to the TSO header copy routine.
> > * Rearranged the TSO data-segments creation routine.
> > in reply to
> > 1530715998-15703-1-git-send-email-motih@mellanox.com
> >
> > v4:
> > * Bug fixes in filling TSO data segments.
> > * Modifications according to review inputs from Adrien Mazarguil
> >   and Matan Azrad.
> > in reply to
> > 1530190137-17848-1-git-send-email-motih@mellanox.com
> >
> > v3:
> > * Fixed compilation errors in compilers without GNU C extensions
> >   caused by a declaration of zero-length array in the code.
> > in reply to
> > 1530187032-6489-1-git-send-email-motih@mellanox.com
> >
> > v2:
> > * Fixed coding style warning.
> > in reply to
> > 1530184583-30166-1-git-send-email-motih@mellanox.com
> >
> > v1:
> > * Fixed coding style warnings.
> > in reply to
> > 1530181779-19716-1-git-send-email-motih@mellanox.com
> > ---
> >  doc/guides/nics/features/mlx4.ini |   1 +
> >  doc/guides/nics/mlx4.rst          |   3 +
> >  drivers/net/mlx4/Makefile         |   5 +
> >  drivers/net/mlx4/mlx4.c           |   9 +
> >  drivers/net/mlx4/mlx4.h           |   5 +
> >  drivers/net/mlx4/mlx4_prm.h       |  15 ++
> >  drivers/net/mlx4/mlx4_rxtx.c      | 372
> > +++++++++++++++++++++++++++++++++++++-
> >  drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
> >  drivers/net/mlx4/mlx4_txq.c       |   8 +-
> >  9 files changed, 416 insertions(+), 4 deletions(-)
> >
> > diff --git a/doc/guides/nics/features/mlx4.ini
> > b/doc/guides/nics/features/mlx4.ini
> > index f6efd21..98a3f61 100644
> > --- a/doc/guides/nics/features/mlx4.ini
> > +++ b/doc/guides/nics/features/mlx4.ini
> > @@ -13,6 +13,7 @@ Queue start/stop     = Y
> >  MTU update           = Y
> >  Jumbo frame          = Y
> >  Scattered Rx         = Y
> > +TSO                  = Y
> >  Promiscuous mode     = Y
> >  Allmulticast mode    = Y
> >  Unicast MAC filter   = Y
> > diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst index
> > 491106a..12adaeb 100644
> > --- a/doc/guides/nics/mlx4.rst
> > +++ b/doc/guides/nics/mlx4.rst
> > @@ -142,6 +142,9 @@ Limitations
> >    The ability to enable/disable CRC stripping requires OFED version
> >    4.3-1.5.0.0 and above  or rdma-core version v18 and above.
> >
> > +- TSO (Transmit Segmentation Offload) is supported in OFED version
> > +  4.4 and above or in rdma-core version v18 and above.
> > +
> >  Prerequisites
> >  -------------
> >
> > diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
> > index
> > 73f9d40..63bc003 100644
> > --- a/drivers/net/mlx4/Makefile
> > +++ b/drivers/net/mlx4/Makefile
> > @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
> >  mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
> >  	$Q $(RM) -f -- '$@'
> >  	$Q : > '$@'
> > +	$Q sh -- '$<' '$@' \
> > +		HAVE_IBV_MLX4_WQE_LSO_SEG \
> > +		infiniband/mlx4dv.h \
> > +		type 'struct mlx4_wqe_lso_seg' \
> > +		$(AUTOCONF_OUTPUT)
> >
> >  # Create mlx4_autoconf.h or update it in case it differs from the new one.
> >
> > diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index
> > d151a90..5d8c76d 100644
> > --- a/drivers/net/mlx4/mlx4.c
> > +++ b/drivers/net/mlx4/mlx4.c
> > @@ -677,6 +677,15 @@ struct mlx4_conf {
> >
> > 	IBV_RAW_PACKET_CAP_SCATTER_FCS);
> >  		DEBUG("FCS stripping toggling is %ssupported",
> >  		      priv->hw_fcs_strip ? "" : "not ");
> > +		priv->tso =
> > +			((device_attr_ex.tso_caps.max_tso > 0) &&
> > +			 (device_attr_ex.tso_caps.supported_qpts &
> > +			  (1 << IBV_QPT_RAW_PACKET)));
> > +		if (priv->tso)
> > +			priv->tso_max_payload_sz =
> > +					device_attr_ex.tso_caps.max_tso;
> > +		DEBUG("TSO is %ssupported",
> > +		      priv->tso ? "" : "not ");
> >  		/* Configure the first MAC address by default. */
> >  		err = mlx4_get_mac(priv, &mac.addr_bytes);
> >  		if (err) {
> > diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index
> > 300cb4d..89d8c38 100644
> > --- a/drivers/net/mlx4/mlx4.h
> > +++ b/drivers/net/mlx4/mlx4.h
> > @@ -47,6 +47,9 @@
> >  /** Interrupt alarm timeout value in microseconds. */  #define
> > MLX4_INTR_ALARM_TIMEOUT 100000
> >
> > +/* Maximum packet headers size (L2+L3+L4) for TSO. */ #define
> > +MLX4_MAX_TSO_HEADER 192
> > +
> >  /** Port parameter. */
> >  #define MLX4_PMD_PORT_KVARG "port"
> >
> > @@ -90,6 +93,8 @@ struct priv {
> >  	uint32_t hw_csum:1; /**< Checksum offload is supported. */
> >  	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels.
> > */
> >  	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported.
> > */
> > +	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> > +	uint32_t tso_max_payload_sz; /**< Max supported TSO payload
> > size. */
> >  	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs
> format).
> > */
> >  	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
> >  	struct mlx4_drop *drop; /**< Shared resources for drop flow rules.
> > */ diff --git a/drivers/net/mlx4/mlx4_prm.h
> > b/drivers/net/mlx4/mlx4_prm.h index b771d8c..aef77ba 100644
> > --- a/drivers/net/mlx4/mlx4_prm.h
> > +++ b/drivers/net/mlx4/mlx4_prm.h
> > @@ -19,6 +19,7 @@
> >  #ifdef PEDANTIC
> >  #pragma GCC diagnostic error "-Wpedantic"
> >  #endif
> > +#include "mlx4_autoconf.h"
> >
> >  /* ConnectX-3 Tx queue basic block. */  #define MLX4_TXBB_SHIFT 6 @@
> > -40,6 +41,7 @@
> >  /* Work queue element (WQE) flags. */  #define
> > MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)  #define
> > MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> > +#define MLX4_WQE_CTRL_RR (1 << 6)
> >
> >  /* CQE checksum flags. */
> >  enum {
> > @@ -98,6 +100,19 @@ struct mlx4_cq {
> >  	int arm_sn; /**< Rx event counter. */  };
> >
> > +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> > +/*
> > + * WQE LSO segment structure.
> > + * Defined here as backward compatibility for rdma-core v17 and below.
> > + * Similar definition is found in infiniband/mlx4dv.h in rdma-core
> > +v18
> > + * and above.
> > + */
> > +struct mlx4_wqe_lso_seg {
> > +	rte_be32_t mss_hdr_size;
> > +	rte_be32_t header[];
> > +};
> > +#endif
> > +
> >  /**
> >   * Retrieve a CQE entry from a CQ.
> >   *
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c index 78b6dd5..b695539 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -38,10 +38,29 @@
> >   * DWORD (32 byte) of a TXBB.
> >   */
> >  struct pv {
> > -	volatile struct mlx4_wqe_data_seg *dseg;
> > +	union {
> > +		volatile struct mlx4_wqe_data_seg *dseg;
> > +		volatile uint32_t *dst;
> > +	};
> >  	uint32_t val;
> >  };
> >
> > +/** A helper structure for TSO packet handling. */ struct tso_info {
> > +	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
> > +	struct pv *pv;
> > +	/** Current entry in the pv array. */
> > +	int pv_counter;
> > +	/** Total size of the WQE including padding. */
> > +	uint32_t wqe_size;
> > +	/** Size of TSO header to prepend to each packet to send. */
> > +	uint16_t tso_header_size;
> > +	/** Total size of the TSO segment in the WQE. */
> > +	uint16_t wqe_tso_seg_size;
> > +	/** Raw WQE size in units of 16 Bytes and without padding. */
> > +	uint8_t fence_size;
> > +};
> > +
> >  /** A table to translate Rx completion flags to packet type. */
> > uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
> >  	/*
> > @@ -368,6 +387,345 @@ struct pv {
> >  }
> >
> >  /**
> > + * Obtain and calculate TSO information needed for assembling a TSO
> WQE.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to a structure to fill the info with.
> > + *
> > + * @return
> > + *   0 on success, negative value upon error.
> > + */
> > +static inline int
> > +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> > +			     struct txq *txq,
> > +			     struct tso_info *tinfo)
> > +{
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> > +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> > +
> > +	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
> > +	if (tunneled)
> > +		tinfo->tso_header_size +=
> > +				buf->outer_l2_len + buf->outer_l3_len;
> > +	if (unlikely(buf->tso_segsz == 0 ||
> > +		     tinfo->tso_header_size == 0 ||
> > +		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
> > +		     tinfo->tso_header_size > buf->data_len))
> > +		return -EINVAL;
> > +	/*
> > +	 * Calculate the WQE TSO segment size
> > +	 * Note:
> > +	 * 1. An LSO segment must be padded such that the subsequent data
> > +	 *    segment is 16-byte aligned.
> > +	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
> > +	 */
> > +	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct
> > mlx4_wqe_lso_seg) +
> > +					    tinfo->tso_header_size,
> > +					    sizeof(struct
> > mlx4_wqe_data_seg));
> > +	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> > +			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> > +			     buf->nb_segs;
> > +	tinfo->wqe_size =
> > +		RTE_ALIGN((uint32_t)(tinfo->fence_size <<
> > MLX4_SEG_SHIFT),
> > +			  MLX4_TXBB_SIZE);
> > +	/* Validate WQE size and WQE space in the send queue. */
> > +	if (sq->remain_size < tinfo->wqe_size ||
> > +	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> > +		return -ENOMEM;
> > +	/* Init pv. */
> > +	tinfo->pv = (struct pv *)txq->bounce_buf;
> > +	tinfo->pv_counter = 0;
> > +	return 0;
> > +}
> > +
> > +/**
> > + * Fill the TSO WQE data segments with info on buffers to transmit .
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to TSO info to use.
> > + * @param dseg
> > + *   Pointer to the first data segment in the TSO WQE.
> > + * @param ctrl
> > + *   Pointer to the control segment in the TSO WQE.
> > + *
> > + * @return
> > + *   0 on success, negative value upon error.
> > + */
> > +static inline volatile struct mlx4_wqe_ctrl_seg *
> > +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> > +			     struct txq *txq,
> > +			     struct tso_info *tinfo,
> > +			     volatile struct mlx4_wqe_data_seg *dseg,
> > +			     volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > +	uint32_t lkey;
> > +	int nb_segs = buf->nb_segs;
> > +	int nb_segs_txbb;
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct rte_mbuf *sbuf = buf;
> > +	struct pv *pv = tinfo->pv;
> > +	int *pv_counter = &tinfo->pv_counter;
> > +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
> > +			(volatile struct mlx4_wqe_ctrl_seg *)
> > +				((volatile uint8_t *)ctrl + tinfo->wqe_size);
> > +	uint16_t sb_of = tinfo->tso_header_size;
> > +	uint16_t data_len;
> > +
> > +	do {
> > +		/* how many dseg entries do we have in the current TXBB ?
> > */
> > +		nb_segs_txbb = (MLX4_TXBB_SIZE -
> > +				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
> > +			       MLX4_SEG_SHIFT;
> > +		switch (nb_segs_txbb) {
> > +		default:
> > +			/* Should never happen. */
> > +			rte_panic("%p: Invalid number of SGEs(%d) for a
> > TXBB",
> > +			(void *)txq, nb_segs_txbb);
> > +			/* rte_panic never returns. */
> 
> Since this default case should not happen because of the above calculation, I
> think we don't need it.
> Just "break" if the compiler complains about the missing default case.
> 
Although "default" is not mandatory in a switch-case statement, it is good practice to have it, even if
just for code clarity, so I will keep it there.

> > +		case 4:
> > +			/* Memory region key for this memory pool. */
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			dseg->addr =
> > +
> > rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> > +								     uintptr_t,
> > +								     sb_of));
> > +			dseg->lkey = lkey;
> > +			/*
> > +			 * This data segment starts at the beginning of a new
> > +			 * TXBB, so we need to postpone its byte_count
> > writing
> > +			 * for later.
> > +			 */
> > +			pv[*pv_counter].dseg = dseg;
> > +			/*
> > +			 * Zero length segment is treated as inline segment
> > +			 * with zero data.
> > +			 */
> > +			data_len = sbuf->data_len - sb_of;
> 
> Is there a chance that data_len will be negative? Since it is unsigned, it
> would wrap around in this case.
Since we verify in advance that all the L2, L3 and L4 headers reside in the same fragment, there is no
reason for data_len to become negative. This is why I use uint16_t, which is the same data type used in
struct rte_mbuf for representing data_len, and the same as we do in mlx4_tx_burst_segs.
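
For reference, the check in mlx4_tx_burst_tso_get_params already rejects a
packet whose headers do not fit in the first data fragment:

	if (unlikely(buf->tso_segsz == 0 ||
		     tinfo->tso_header_size == 0 ||
		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
		     tinfo->tso_header_size > buf->data_len))
		return -EINVAL;

so for the first mbuf sbuf->data_len - sb_of cannot go below zero, and for the
following mbufs sb_of is 0.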

> Maybe it is better to change it to int16_t and to replace the next check
> with:
> data_len > 0 ? data_len : 0x80000000
> 
I will keep this the way it is for 2 reasons:
1. The suggestion seems to me more cumbersome than what I wrote.
2. Code-consistency-wise, this is how we also wrote it in mlx4_tx_burst_segs,
     and what's good there is also good here.

> 
> And I think I found a way to remove the sb_of calculations for each segment:
> 
> Each segment can prepare the parameters of the next segment, so that only
> the pre-loop calculation for the first segment has to account for the header
> offset:
> 
> The parameters: data_len and sb_of.
> 
> So before the loop:
> sb_of = tinfo->tso_header_size;
> data_len = sbuf->data_len - sb_of;
> 
> And inside the loop (after the check of nb_segs):
> sb_of = 0;
> data_len = sbuf->data_len; (i.e. the data_len of the next sbuf)
> 
> So each segment calculates the next segment's parameters and we don't need
> the "- sb_of" calculation per segment.
> 
NICE :)

> > +			pv[(*pv_counter)++].val =
> > +				rte_cpu_to_be_32(data_len ?
> > +						 data_len :
> > +						 0x80000000);
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 3:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					lkey,
> > +					rte_pktmbuf_mtod_offset(sbuf,
> > +								uintptr_t,
> > +								sb_of),
> > +					rte_cpu_to_be_32(data_len ?
> > +							 data_len :
> > +							 0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 2:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					lkey,
> > +					rte_pktmbuf_mtod_offset(sbuf,
> > +								uintptr_t,
> > +								sb_of),
> > +					rte_cpu_to_be_32(data_len ?
> > +							 data_len :
> > +							 0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 1:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					lkey,
> > +					rte_pktmbuf_mtod_offset(sbuf,
> > +								uintptr_t,
> > +								sb_of),
> > +					rte_cpu_to_be_32(data_len ?
> > +							 data_len :
> > +							 0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +		}
> > +		/* Wrap dseg if it points at the end of the queue. */
> > +		if ((volatile uint8_t *)dseg >= sq->eob)
> > +			dseg = (volatile struct mlx4_wqe_data_seg *)
> > +					((volatile uint8_t *)dseg - sq->size);
> > +	} while (true);
> > +err:
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * Fill the packet's l2, l3 and l4 headers to the WQE.
> > + *
> > + * This will be used as the header for each TSO segment that is
> transmitted.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to TSO info to use.
> > + * @param ctrl
> > + *   Pointer to the control segment in the TSO WQE.
> > + *
> > + * @return
> > + *   0 on success, negative value upon error.
> > + */
> > +static inline volatile struct mlx4_wqe_data_seg *
> > +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> > +			   struct txq *txq,
> > +			   struct tso_info *tinfo,
> > +			   volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > +	volatile struct mlx4_wqe_lso_seg *tseg =
> > +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct pv *pv = tinfo->pv;
> > +	int *pv_counter = &tinfo->pv_counter;
> > +	int remain_size = tinfo->tso_header_size;
> > +	char *from = rte_pktmbuf_mtod(buf, char *);
> > +	uint16_t txbb_avail_space;
> > +	/* Union to overcome volatile constraints when copying TSO header.
> > */
> > +	union {
> > +		volatile uint8_t *vto;
> > +		uint8_t *to;
> > +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> > +
> > +	/*
> > +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> > +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> > +	 * we can write the first 44 TSO header bytes without worry for TxQ
> > +	 * wrapping or overwriting the first TXBB 32bit word.
> > +	 */
> > +	txbb_avail_space = MLX4_TXBB_SIZE -
> > +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> > +			    sizeof(struct mlx4_wqe_lso_seg));
> 
> I think a better name is txbb_tail_size.
I think that txbb_avail_space is good enough, so no change here.

> 
> > +	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
> > +		/* Copy to end of txbb. */
> > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > +		from += txbb_avail_space;
> > +		thdr.to += txbb_avail_space;
> > +		/* New TXBB, Check for TxQ wrap. */
> > +		if (thdr.to >= sq->eob)
> > +			thdr.vto = sq->buf;
> > +		/* New TXBB, stash the first 32bits for later use. */
> > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > +		pv[(*pv_counter)++].val = *(uint32_t *)from,
> > +		from += sizeof(uint32_t);
> > +		thdr.to += sizeof(uint32_t);
> > +		remain_size -= (txbb_avail_space + sizeof(uint32_t));
> 
> You don't need the () here.
True
> 
> > +		/* Avail space in new TXBB is TXBB size - 4 */
> > +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> > +	}
> > +	if (remain_size > txbb_avail_space) {
> > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > +		from += txbb_avail_space;
> > +		thdr.to += txbb_avail_space;
> > +		remain_size -= txbb_avail_space;
> > +		/* New TXBB, Check for TxQ wrap. */
> > +		if (thdr.to >= sq->eob)
> > +			thdr.vto = sq->buf;
> > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > +		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
> > +		(*pv_counter)++;
> > +	} else {
> 
> Here it should be else if (remain_size > 0).
true
> 
> > +		rte_memcpy(thdr.to, from, remain_size);
> > +	}
> > +
> > +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> > +					      tinfo->tso_header_size);
> > +	/* Calculate data segment location */
> > +	return (volatile struct mlx4_wqe_data_seg *)
> > +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> > }
> > +
> > +/**
> > + * Write data segments and header for TSO uni/multi segment packet.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param ctrl
> > + *   Pointer to the WQE control segment.
> > + *
> > + * @return
> > + *   Pointer to the next WQE control segment on success, NULL otherwise.
> > + */
> > +static volatile struct mlx4_wqe_ctrl_seg * mlx4_tx_burst_tso(struct
> > +rte_mbuf *buf, struct txq *txq,
> > +		  volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > +	volatile struct mlx4_wqe_data_seg *dseg;
> > +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct tso_info tinfo;
> > +	struct pv *pv;
> > +	int pv_counter;
> > +	int ret;
> > +
> > +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> > +	if (unlikely(ret))
> > +		goto error;
> > +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> > +	if (unlikely(dseg == NULL))
> > +		goto error;
> > +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> > +		dseg = (volatile struct mlx4_wqe_data_seg *)
> > +					((uintptr_t)dseg - sq->size);
> > +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> > +	if (unlikely(ctrl_next == NULL))
> > +		goto error;
> > +	/* Write the first DWORD of each TXBB save earlier. */
> > +	pv = tinfo.pv;
> > +	pv_counter = tinfo.pv_counter;
> > +	/* Need a barrier here before writing the first TXBB word. */
> > +	rte_io_wmb();
> 
> > +	for (--pv_counter; pv_counter  >= 0; pv_counter--)
> 
> Since we don't need the first check, a do-while statement is better.
> To be fully safe you can add a likely() check before the memory barrier.
> 
I will bring back the if statement, but will not change the loop, as it is the same as in
mlx4_tx_burst_segs and I do want to keep the code consistent.

> > +		*pv[pv_counter].dst = pv[pv_counter].val;
> > +	ctrl->fence_size = tinfo.fence_size;
> > +	sq->remain_size -= tinfo.wqe_size;
> > +	return ctrl_next;
> > +error:
> > +	txq->stats.odropped++;
> > +	return NULL;
> > +}
> > +
> > +/**
> >   * Write data segments of multi-segment packet.
> >   *
> >   * @param buf
> > @@ -560,6 +918,7 @@ struct pv {
> >  			uint16_t flags16[2];
> >  		} srcrb;
> >  		uint32_t lkey;
> > +		bool tso = txq->priv->tso && (buf->ol_flags &
> > PKT_TX_TCP_SEG);
> >
> >  		/* Clean up old buffer. */
> >  		if (likely(elt->buf != NULL)) {
> > @@ -578,7 +937,16 @@ struct pv {
> >  			} while (tmp != NULL);
> >  		}
> >  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> > -		if (buf->nb_segs == 1) {
> > +		if (tso) {
> > +			/* Change opcode to TSO */
> > +			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> > +			owner_opcode |= MLX4_OPCODE_LSO |
> > MLX4_WQE_CTRL_RR;
> > +			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> > +			if (!ctrl_next) {
> > +				elt->buf = NULL;
> > +				break;
> > +			}
> > +		} else if (buf->nb_segs == 1) {
> >  			/* Validate WQE space in the send queue. */
> >  			if (sq->remain_size < MLX4_TXBB_SIZE) {
> >  				elt->buf = NULL;
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.h
> > b/drivers/net/mlx4/mlx4_rxtx.h index 4c025e3..ffa8abf 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
> >  	unsigned int idx; /**< Mapping index. */
> >  	uint64_t opackets; /**< Total of successfully sent packets. */
> >  	uint64_t obytes; /**< Total of successfully sent bytes. */
> > -	uint64_t odropped; /**< Total of packets not sent when Tx ring full.
> > */
> > +	uint64_t odropped; /**< Total number of packets failed to transmit.
> > */
> >  };
> >
> >  /** Tx queue descriptor. */
> > diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> > index 6edaadb..9aa7440 100644
> > --- a/drivers/net/mlx4/mlx4_txq.c
> > +++ b/drivers/net/mlx4/mlx4_txq.c
> > @@ -116,8 +116,14 @@
> >  			     DEV_TX_OFFLOAD_UDP_CKSUM |
> >  			     DEV_TX_OFFLOAD_TCP_CKSUM);
> >  	}
> > -	if (priv->hw_csum_l2tun)
> > +	if (priv->tso)
> > +		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> > +	if (priv->hw_csum_l2tun) {
> >  		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> > +		if (priv->tso)
> > +			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> > +				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> > +	}
> >  	return offloads;
> >  }
> >
> > --
> > 1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [dpdk-dev] [PATCH v6] net/mlx4: support hardware TSO
  2018-07-09 10:43       ` [dpdk-dev] [PATCH v5] " Moti Haimovsky
  2018-07-09 13:07         ` Matan Azrad
@ 2018-07-09 16:33         ` Moti Haimovsky
  2018-07-10 10:45           ` [dpdk-dev] [PATCH v7] " Moti Haimovsky
  1 sibling, 1 reply; 14+ messages in thread
From: Moti Haimovsky @ 2018-07-09 16:33 UTC (permalink / raw)
  To: adrien.mazarguil, matan; +Cc: dev, Moti Haimovsky

Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v6:
* Minor bug fixes from previous commit.
* More optimizations on TSO data-segments creation routine.
in reply to
1531132986-5054-1-git-send-email-motih@mellanox.com

v5:
* Modification to the code according to review inputs from Matan
  Azrad.
* Code optimization to the TSO header copy routine.
* Rearranged the TSO data-segments creation routine.
in reply to
1530715998-15703-1-git-send-email-motih@mellanox.com

v4:
* Bug fixes in filling TSO data segments.
* Modifications according to review inputs from Adrien Mazarguil
  and Matan Azrad.
in reply to
1530190137-17848-1-git-send-email-motih@mellanox.com

v3:
* Fixed compilation errors in compilers without GNU C extensions
  caused by a declaration of zero-length array in the code.
in reply to
1530187032-6489-1-git-send-email-motih@mellanox.com

v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---

 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/Makefile         |   5 +
 drivers/net/mlx4/mlx4.c           |   9 +
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  15 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 378 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 9 files changed, 422 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@ Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@ Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 73f9d40..63bc003 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
 mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
 	$Q : > '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_WQE_LSO_SEG \
+		infiniband/mlx4dv.h \
+		type 'struct mlx4_wqe_lso_seg' \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx4_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..5d8c76d 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -677,6 +677,15 @@ struct mlx4_conf {
 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DEBUG("FCS stripping toggling is %ssupported",
 		      priv->hw_fcs_strip ? "" : "not ");
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso > 0) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
 		/* Configure the first MAC address by default. */
 		err = mlx4_get_mac(priv, &mac.addr_bytes);
 		if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..89d8c38 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/* Maximum packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@ struct priv {
 	uint32_t hw_csum:1; /**< Checksum offload is supported. */
 	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
 	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
+	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
 	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
 	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index b771d8c..aef77ba 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -19,6 +19,7 @@
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
+#include "mlx4_autoconf.h"
 
 /* ConnectX-3 Tx queue basic block. */
 #define MLX4_TXBB_SHIFT 6
@@ -40,6 +41,7 @@
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -98,6 +100,19 @@ struct mlx4_cq {
 	int arm_sn; /**< Rx event counter. */
 };
 
+#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg {
+	rte_be32_t mss_hdr_size;
+	rte_be32_t header[];
+};
+#endif
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 78b6dd5..6654843 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,29 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	volatile struct mlx4_wqe_data_seg *dseg;
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
 	uint32_t val;
 };
 
+/** A helper structure for TSO packet handling. */
+struct tso_info {
+	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
+	struct pv *pv;
+	/** Current entry in the pv array. */
+	int pv_counter;
+	/** Total size of the WQE including padding. */
+	uint32_t wqe_size;
+	/** Size of TSO header to prepend to each packet to send. */
+	uint16_t tso_header_size;
+	/** Total size of the TSO segment in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/** Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
 	/*
@@ -368,6 +387,351 @@ struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_size +=
+				buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 ||
+		     tinfo->tso_header_size == 0 ||
+		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
+		     tinfo->tso_header_size > buf->data_len))
+		return -EINVAL;
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+					    tinfo->tso_header_size,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	/* Init pv. */
+	tinfo->pv = (struct pv *)txq->bounce_buf;
+	tinfo->pv_counter = 0;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo,
+			     volatile struct mlx4_wqe_data_seg *dseg,
+			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
+			(volatile struct mlx4_wqe_ctrl_seg *)
+				((volatile uint8_t *)ctrl + tinfo->wqe_size);
+	uint16_t sb_of = tinfo->tso_header_size;
+	uint16_t data_len = sbuf->data_len - sb_of;
+
+	do {
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb = (MLX4_TXBB_SIZE -
+				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
+			       MLX4_SEG_SHIFT;
+		switch (nb_segs_txbb) {
+		default:
+			/* Should never happen. */
+			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
+			(void *)txq, nb_segs_txbb);
+			/* rte_panic never returns. */
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			dseg->addr =
+			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
+								     uintptr_t,
+								     sb_of));
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			sb_of = 0;
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			sb_of = 0;
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			sb_of = 0;
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			sb_of = 0;
+			/* fallthrough */
+		}
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((volatile uint8_t *)dseg - sq->size);
+	} while (true);
+err:
+	return NULL;
+}
+
+/**
+ * Fill the packet's L2, L3 and L4 headers into the WQE.
+ *
+ * This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the first data segment in the TSO WQE.
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_lso_seg *tseg =
+		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	int remain_size = tinfo->tso_header_size;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg));
+	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
+		/* Copy to end of txbb. */
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		pv[(*pv_counter)++].val = *(uint32_t *)from,
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_size -= txbb_avail_space + sizeof(uint32_t);
+		/* Avail space in new TXBB is TXBB size - 4 */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+	}
+	if (remain_size > txbb_avail_space) {
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		remain_size -= txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
+		(*pv_counter)++;
+	} else if (remain_size) {
+		rte_memcpy(thdr.to, from, remain_size);
+	}
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_size);
+	/* Calculate data segment location */
+	return (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv;
+	int pv_counter;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (unlikely(ret))
+		goto error;
+	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+	if (unlikely(dseg == NULL))
+		goto error;
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+	if (unlikely(ctrl_next == NULL))
+		goto error;
+	/* Write the first DWORD of each TXBB saved earlier. */
+	if (likely(tinfo.pv_counter)) {
+		pv = tinfo.pv;
+		pv_counter = tinfo.pv_counter;
+		/* Need a barrier here before writing the first TXBB word. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			*pv[pv_counter].dst = pv[pv_counter].val;
+	}
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	return ctrl_next;
+error:
+	txq->stats.odropped++;
+	return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -560,6 +924,7 @@ struct pv {
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -578,7 +943,16 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@ struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets that failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }
 
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/mlx4: support hardware TSO
  2018-07-09 16:22           ` Mordechay Haimovsky
@ 2018-07-09 18:18             ` Matan Azrad
  0 siblings, 0 replies; 14+ messages in thread
From: Matan Azrad @ 2018-07-09 18:18 UTC (permalink / raw)
  To: Mordechay Haimovsky, Adrien Mazarguil; +Cc: dev

Hi Moti

I continue the discussion here even though the next version is already out, just to keep the full discussion in one place.

From: Mordechay Haimovsky
> inline
> 
> > From: Matan Azrad
> > Hi Moti
> >
> > Please see some comments below.
> >
> > From: Mordechay Haimovsky
> > > Implement support for hardware TSO.
> > >
> > > Signed-off-by: Moti Haimovsky <motih@mellanox.com>
...
> > > +	do {
> > > +		/* how many dseg entries do we have in the current TXBB ?
> > > */
> > > +		nb_segs_txbb = (MLX4_TXBB_SIZE -
> > > +				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
> > > +			       MLX4_SEG_SHIFT;
> > > +		switch (nb_segs_txbb) {
> > > +		default:
> > > +			/* Should never happen. */
> > > +			rte_panic("%p: Invalid number of SGEs(%d) for a
> > > TXBB",
> > > +			(void *)txq, nb_segs_txbb);
> > > +			/* rte_panic never returns. */
> >
> > Since this default case should not happen because of the above
> > calculation, I think we don't need it.
> > Just "break" if the compiler complains about the missing default case.
> >
> Although "default" is not mandatory in switch case statement it is a good
> practice to have it even just for code clarity.
> so I will keep it there.

But the rte_panic code (and the whole default block) is redundant, and we don't need redundant code in our data-path.
You can leave a comment there if you want, for clarity.
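
Just to illustrate why the default cannot be reached, a minimal standalone
sketch (not taken from the patch; it only assumes the 64-byte TXBB and
16-byte WQE segment sizes used by mlx4_prm.h):

#include <assert.h>
#include <stdint.h>

#define MLX4_TXBB_SIZE 64 /* TXBB size in bytes. */
#define MLX4_SEG_SHIFT 4  /* log2 of a 16-byte WQE segment. */

/* Remaining 16-byte segment slots in the TXBB that holds dseg. */
static inline int
nb_segs_in_txbb(uintptr_t dseg)
{
	int nb = (MLX4_TXBB_SIZE - (dseg & (MLX4_TXBB_SIZE - 1))) >>
		 MLX4_SEG_SHIFT;

	/* dseg is always 16-byte aligned, so nb is 1, 2, 3 or 4. */
	assert(nb >= 1 && nb <= 4);
	return nb;
}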
 

> > > +		case 4:
> > > +			/* Memory region key for this memory pool. */
> > > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > > +			if (unlikely(lkey == (uint32_t)-1))
> > > +				goto err;
> > > +			dseg->addr =
> > > +
> > > rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> > > +								     uintptr_t,
> > > +								     sb_of));
> > > +			dseg->lkey = lkey;
> > > +			/*
> > > +			 * This data segment starts at the beginning of a new
> > > +			 * TXBB, so we need to postpone its byte_count
> > > writing
> > > +			 * for later.
> > > +			 */
> > > +			pv[*pv_counter].dseg = dseg;
> > > +			/*
> > > +			 * Zero length segment is treated as inline segment
> > > +			 * with zero data.
> > > +			 */
> > > +			data_len = sbuf->data_len - sb_of;
> >
> > Is there a chance that data_len will be negative? Wrapped around in this case?
> Since we verify ahead of time that all the L2, L3 and L4 headers reside in the
> same fragment, there is no reason for data_len to become negative. This is why
> I use uint16_t, which is the same data type used in struct rte_mbuf for
> representing data_len, and the same as we do in mlx4_tx_burst_segs.
> 
> > Maybe it is better to change it to int16_t and to replace the next check
> > with:
> > data_len > 0 ? data_len : 0x80000000
> >
> I will keep this the way it is for 2 reasons:
> 1. It seems to me more cumbersome than what I wrote.

OK, you are right here, if it cannot be negative we shouldn't change it :)

> 2. Code-consistency-wise, this is how we also wrote it in mlx4_tx_burst_segs,
>      and what's good there is also good here.

I don't agree; this is really a different case from that one, a lot of the assumptions are different and the code may reflect that.

> > And I think I found a way to remove the sb_of calculations for each
> > segment:
> >
> > Each segment will create the next segment's parameters, while only the
> > pre-loop calculation for the first segment's parameters will calculate
> > the header offset:
> >
> > The parameters: data_len and sb_of.
> >
> > So before the loop:
> > sb_of = tinfo->tso_header_size;
> > data_len = sbuf->data_len - sb_of;
> >
> > And inside the loop (after the check of nb_segs):
> > sb_of = 0;
> > data_len = sbuf->data_len(the next sbuf);
> >
> > so each segment calculates the next segment's parameters and we don't
> > need the "- sb_of" calculation per segment.
> >
> NICE :)
> 

Sorry for seeing this only now, but we don't even need the "sb_of = 0" per segment:
We can add one more parameter for the next segment,
addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t, tinfo->tso_header_size)
before the loop
and
addr = rte_pktmbuf_mtod(sbuf, uintptr_t)
inside the loop

so finally we save 2 cycles per segment :)
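
To put the whole suggestion together, a minimal sketch of the resulting
per-segment work (assuming sbuf, dseg, nb_segs, txq and tinfo as in the
patch, and leaving out the TXBB-boundary switch and queue wrap handling
for brevity):

	uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t,
						      tinfo->tso_header_size);
	uint16_t data_len = sbuf->data_len - tinfo->tso_header_size;
	uint32_t lkey;

	do {
		lkey = mlx4_tx_mb2mr(txq, sbuf);
		if (unlikely(lkey == (uint32_t)-1))
			goto err;
		mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
				      rte_cpu_to_be_32(data_len ?
						       data_len :
						       0x80000000));
		if (--nb_segs == 0)
			break;
		/* Prepare the next buffer; no "- sb_of" per segment. */
		sbuf = sbuf->next;
		dseg++;
		data_len = sbuf->data_len;
		data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
	} while (true);
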
...
> > > +static inline volatile struct mlx4_wqe_data_seg *
> > > +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> > > +			   struct txq *txq,
> > > +			   struct tso_info *tinfo,
> > > +			   volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > > +	volatile struct mlx4_wqe_lso_seg *tseg =
> > > +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> > > +	struct mlx4_sq *sq = &txq->msq;
> > > +	struct pv *pv = tinfo->pv;
> > > +	int *pv_counter = &tinfo->pv_counter;
> > > +	int remain_size = tinfo->tso_header_size;
> > > +	char *from = rte_pktmbuf_mtod(buf, char *);
> > > +	uint16_t txbb_avail_space;
> > > +	/* Union to overcome volatile constraints when copying TSO header.
> > > */
> > > +	union {
> > > +		volatile uint8_t *vto;
> > > +		uint8_t *to;
> > > +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> > > +
> > > +	/*
> > > +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> > > +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> > > +	 * we can write the first 44 TSO header bytes without worry for TxQ
> > > +	 * wrapping or overwriting the first TXBB 32bit word.
> > > +	 */
> > > +	txbb_avail_space = MLX4_TXBB_SIZE -
> > > +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> > > +			    sizeof(struct mlx4_wqe_lso_seg));
> >
> > I think that a better name is txbb_tail_size.
> I think that txbb_avail_space is good enough, so no change here.

My suggestion is because this size is only the tail size of the TXBB, without the first 4 bytes, so the name may be more descriptive.
I can also understand your suggestion, since "avail" points to the available TXBB space to write (the first 4 bytes are not available now, only later).
I'm not going to argue about it :)
 
> 
> >
> > > +	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
> > > +		/* Copy to end of txbb. */
> > > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > > +		from += txbb_avail_space;
> > > +		thdr.to += txbb_avail_space;
> > > +		/* New TXBB, Check for TxQ wrap. */
> > > +		if (thdr.to >= sq->eob)
> > > +			thdr.vto = sq->buf;
> > > +		/* New TXBB, stash the first 32bits for later use. */
> > > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > > +		pv[(*pv_counter)++].val = *(uint32_t *)from,
> > > +		from += sizeof(uint32_t);
> > > +		thdr.to += sizeof(uint32_t);
> > > +		remain_size -= (txbb_avail_space + sizeof(uint32_t));
> >
> > You don't need the () here.
> True
> >
> > > +		/* Avail space in new TXBB is TXBB size - 4 */
> > > +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> > > +	}
> > > +	if (remain_size > txbb_avail_space) {
> > > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > > +		from += txbb_avail_space;
> > > +		thdr.to += txbb_avail_space;
> > > +		remain_size -= txbb_avail_space;
> > > +		/* New TXBB, Check for TxQ wrap. */
> > > +		if (thdr.to >= sq->eob)
> > > +			thdr.vto = sq->buf;
> > > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > > +		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
> > > +		(*pv_counter)++;
> > > +	} else {
> >
> > Here it should be else if (remain_size > 0).
> true
> >
> > > +		rte_memcpy(thdr.to, from, remain_size);
> > > +	}
> > > +
> > > +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> > > +					      tinfo->tso_header_size);
> > > +	/* Calculate data segment location */
> > > +	return (volatile struct mlx4_wqe_data_seg *)
> > > +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> > > }
> > > +
> > > +/**
> > > + * Write data segments and header for TSO uni/multi segment packet.
> > > + *
> > > + * @param buf
> > > + *   Pointer to the first packet mbuf.
> > > + * @param txq
> > > + *   Pointer to Tx queue structure.
> > > + * @param ctrl
> > > + *   Pointer to the WQE control segment.
> > > + *
> > > + * @return
> > > + *   Pointer to the next WQE control segment on success, NULL
> otherwise.
> > > + */
> > > +static volatile struct mlx4_wqe_ctrl_seg * mlx4_tx_burst_tso(struct
> > > +rte_mbuf *buf, struct txq *txq,
> > > +		  volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > > +	volatile struct mlx4_wqe_data_seg *dseg;
> > > +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> > > +	struct mlx4_sq *sq = &txq->msq;
> > > +	struct tso_info tinfo;
> > > +	struct pv *pv;
> > > +	int pv_counter;
> > > +	int ret;
> > > +
> > > +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> > > +	if (unlikely(ret))
> > > +		goto error;
> > > +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> > > +	if (unlikely(dseg == NULL))
> > > +		goto error;
> > > +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> > > +		dseg = (volatile struct mlx4_wqe_data_seg *)
> > > +					((uintptr_t)dseg - sq->size);
> > > +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> > > +	if (unlikely(ctrl_next == NULL))
> > > +		goto error;
> > > +	/* Write the first DWORD of each TXBB saved earlier. */
> > > +	pv = tinfo.pv;
> > > +	pv_counter = tinfo.pv_counter;
> > > +	/* Need a barrier here before writing the first TXBB word. */
> > > +	rte_io_wmb();
> >
> > > +	for (--pv_counter; pv_counter  >= 0; pv_counter--)
> >
> > Since we don't need the first check, a do-while statement is better.
> > To be fully safe you can use a likely() check before the memory barrier.
> >
> I will add the if statement but will not change the loop, as it is the same as in
> mlx4_tx_burst_segs and I do want to have consistent code.

I don't agree with this statement, as above - different assumptions call for different optimized code.

Here, and in the mlx4_tx_burst_segs code, we don't need the first check of the for loop, and we don't need redundant checks in our data-path.

So both should be updated.

The difference is in the prior check: here it should be likely() and there it should not be.
So actually, there the "for" loop can stay as-is, but we don't need the first if check of pv_counter because it is already covered by the for loop condition.

I suggest optimizing both (a prior patch for the mlx4_tx_burst_segs optimization).
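
For the TSO routine, a minimal sketch of what I mean (assuming the pv array
and pv_counter filled into tinfo by the TSO helpers above):

	/* Write the first DWORD of each TXBB saved earlier. */
	if (likely(tinfo.pv_counter)) {
		pv = tinfo.pv;
		pv_counter = tinfo.pv_counter;
		/* Need a barrier here before writing the first TXBB word. */
		rte_io_wmb();
		do {
			--pv_counter;
			*pv[pv_counter].dst = pv[pv_counter].val;
		} while (pv_counter > 0);
	}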

> > > +		*pv[pv_counter].dst = pv[pv_counter].val;
> > > +	ctrl->fence_size = tinfo.fence_size;
> > > +	sq->remain_size -= tinfo.wqe_size;
> > > +	return ctrl_next;

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [dpdk-dev] [PATCH v7] net/mlx4: support hardware TSO
  2018-07-09 16:33         ` [dpdk-dev] [PATCH v6] " Moti Haimovsky
@ 2018-07-10 10:45           ` Moti Haimovsky
  2018-07-10 11:02             ` Matan Azrad
  0 siblings, 1 reply; 14+ messages in thread
From: Moti Haimovsky @ 2018-07-10 10:45 UTC (permalink / raw)
  To: adrien.mazarguil, matan; +Cc: dev, Moti Haimovsky

Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v7:
* More optimizations on TSO data-segments creation routine.
in reply to
1531153995-26627-1-git-send-email-motih@mellanox.com

v6:
* Minor bug fixes from previous commit.
* More optimizations on TSO data-segments creation routine.
in reply to
1531132986-5054-1-git-send-email-motih@mellanox.com

v5:
* Modification to the code according to review inputs from Matan
  Azrad.
* Code optimization to the TSO header copy routine.
* Rearranged the TSO data-segments creation routine.
in reply to
1530715998-15703-1-git-send-email-motih@mellanox.com

v4:
* Bug fixes in filling TSO data segments.
* Modifications according to review inputs from Adrien Mazarguil
  and Matan Azrad.
in reply to
1530190137-17848-1-git-send-email-motih@mellanox.com

v3:
* Fixed compilation errors in compilers without GNU C extensions
  caused by a declaration of zero-length array in the code.
in reply to
1530187032-6489-1-git-send-email-motih@mellanox.com

v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---
 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/Makefile         |   5 +
 drivers/net/mlx4/mlx4.c           |   9 +
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  15 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 369 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 9 files changed, 413 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@ Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@ Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 73f9d40..63bc003 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
 mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
 	$Q : > '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_WQE_LSO_SEG \
+		infiniband/mlx4dv.h \
+		type 'struct mlx4_wqe_lso_seg' \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx4_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..5d8c76d 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -677,6 +677,15 @@ struct mlx4_conf {
 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DEBUG("FCS stripping toggling is %ssupported",
 		      priv->hw_fcs_strip ? "" : "not ");
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso > 0) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
 		/* Configure the first MAC address by default. */
 		err = mlx4_get_mac(priv, &mac.addr_bytes);
 		if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..89d8c38 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/* Maximum packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@ struct priv {
 	uint32_t hw_csum:1; /**< Checksum offload is supported. */
 	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
 	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
+	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
 	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
 	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index b771d8c..aef77ba 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -19,6 +19,7 @@
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
+#include "mlx4_autoconf.h"
 
 /* ConnectX-3 Tx queue basic block. */
 #define MLX4_TXBB_SHIFT 6
@@ -40,6 +41,7 @@
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -98,6 +100,19 @@ struct mlx4_cq {
 	int arm_sn; /**< Rx event counter. */
 };
 
+#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg {
+	rte_be32_t mss_hdr_size;
+	rte_be32_t header[];
+};
+#endif
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 78b6dd5..8c88eff 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,29 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	volatile struct mlx4_wqe_data_seg *dseg;
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
 	uint32_t val;
 };
 
+/** A helper structure for TSO packet handling. */
+struct tso_info {
+	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
+	struct pv *pv;
+	/** Current entry in the pv array. */
+	int pv_counter;
+	/** Total size of the WQE including padding. */
+	uint32_t wqe_size;
+	/** Size of TSO header to prepend to each packet to send. */
+	uint16_t tso_header_size;
+	/** Total size of the TSO segment in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/** Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
 	/*
@@ -368,6 +387,342 @@ struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_size +=
+				buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 ||
+		     tinfo->tso_header_size == 0 ||
+		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
+		     tinfo->tso_header_size > buf->data_len))
+		return -EINVAL;
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+					    tinfo->tso_header_size,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	/* Init pv. */
+	tinfo->pv = (struct pv *)txq->bounce_buf;
+	tinfo->pv_counter = 0;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo,
+			     volatile struct mlx4_wqe_data_seg *dseg,
+			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
+			(volatile struct mlx4_wqe_ctrl_seg *)
+				((volatile uint8_t *)ctrl + tinfo->wqe_size);
+	uint16_t data_len = sbuf->data_len - tinfo->tso_header_size;
+	uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t,
+						      tinfo->tso_header_size);
+
+	do {
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb = (MLX4_TXBB_SIZE -
+				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
+			       MLX4_SEG_SHIFT;
+		switch (nb_segs_txbb) {
+#ifndef NDEBUG
+		default:
+			/* Should never happen. */
+			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
+			(void *)txq, nb_segs_txbb);
+			/* rte_panic never returns. */
+			break;
+#endif /* NDEBUG */
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			dseg->addr = rte_cpu_to_be_64(data_addr);
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			/* fallthrough */
+		}
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((volatile uint8_t *)dseg - sq->size);
+	} while (true);
+err:
+	return NULL;
+}
+
+/**
+ * Fill the packet's L2, L3 and L4 headers into the WQE.
+ *
+ * This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the first data segment in the TSO WQE.
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_lso_seg *tseg =
+		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	int remain_size = tinfo->tso_header_size;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg));
+	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
+		/* Copy to end of txbb. */
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		pv[(*pv_counter)++].val = *(uint32_t *)from,
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_size -= txbb_avail_space + sizeof(uint32_t);
+		/* Avail space in new TXBB is TXBB size - 4 */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+	}
+	if (remain_size > txbb_avail_space) {
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		remain_size -= txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
+		(*pv_counter)++;
+	} else if (remain_size) {
+		rte_memcpy(thdr.to, from, remain_size);
+	}
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_size);
+	/* Calculate data segment location */
+	return (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv;
+	int pv_counter;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (unlikely(ret))
+		goto error;
+	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+	if (unlikely(dseg == NULL))
+		goto error;
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+	if (unlikely(ctrl_next == NULL))
+		goto error;
+	/* Write the first DWORD of each TXBB saved earlier. */
+	if (likely(tinfo.pv_counter)) {
+		pv = tinfo.pv;
+		pv_counter = tinfo.pv_counter;
+		/* Need a barrier here before writing the first TXBB word. */
+		rte_io_wmb();
+		do {
+			--pv_counter;
+			*pv[pv_counter].dst = pv[pv_counter].val;
+		} while (pv_counter > 0);
+	}
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	return ctrl_next;
+error:
+	txq->stats.odropped++;
+	return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -560,6 +915,7 @@ struct pv {
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -578,7 +934,16 @@ struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@ struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets that failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }
 
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v7] net/mlx4: support hardware TSO
  2018-07-10 10:45           ` [dpdk-dev] [PATCH v7] " Moti Haimovsky
@ 2018-07-10 11:02             ` Matan Azrad
  2018-07-10 12:03               ` Shahaf Shuler
  0 siblings, 1 reply; 14+ messages in thread
From: Matan Azrad @ 2018-07-10 11:02 UTC (permalink / raw)
  To: Mordechay Haimovsky, Adrien Mazarguil; +Cc: dev



From: Mordechay Haimovsky
> Implement support for hardware TSO.
> 
> Signed-off-by: Moti Haimovsky <motih@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>

Thanks Moti!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v7] net/mlx4: support hardware TSO
  2018-07-10 11:02             ` Matan Azrad
@ 2018-07-10 12:03               ` Shahaf Shuler
  0 siblings, 0 replies; 14+ messages in thread
From: Shahaf Shuler @ 2018-07-10 12:03 UTC (permalink / raw)
  To: Matan Azrad, Mordechay Haimovsky, Adrien Mazarguil; +Cc: dev

Tuesday, July 10, 2018 2:03 PM, Matan Azrad:
> Subject: Re: [dpdk-dev] [PATCH v7] net/mlx4: support hardware TSO
> 
> 
> 
> From: Mordechay Haimovsky
> > Implement support for hardware TSO.
> >
> > Signed-off-by: Moti Haimovsky <motih@mellanox.com>
> Acked-by: Matan Azrad <matan@mellanox.com>

Applied to next-net-mlx, thanks. 

> 
> Thanks Moti!

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-07-10 12:03 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <0~1530184583-30166-1-git-send-email-motih@mellanox.com>
2018-06-28 11:57 ` [dpdk-dev] [PATCH v2] net/mlx4: support hardware TSO Moti Haimovsky
2018-06-28 12:48   ` [dpdk-dev] [PATCH v3] " Moti Haimovsky
2018-06-28 14:15     ` Adrien Mazarguil
2018-06-28 15:19     ` Matan Azrad
2018-07-04 14:53     ` [dpdk-dev] [PATCH v4] " Moti Haimovsky
2018-07-05 12:30       ` Matan Azrad
2018-07-09 10:43       ` [dpdk-dev] [PATCH v5] " Moti Haimovsky
2018-07-09 13:07         ` Matan Azrad
2018-07-09 16:22           ` Mordechay Haimovsky
2018-07-09 18:18             ` Matan Azrad
2018-07-09 16:33         ` [dpdk-dev] [PATCH v6] " Moti Haimovsky
2018-07-10 10:45           ` [dpdk-dev] [PATCH v7] " Moti Haimovsky
2018-07-10 11:02             ` Matan Azrad
2018-07-10 12:03               ` Shahaf Shuler
