DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
@ 2015-08-20 15:37 Vlad Zolotarov
  2015-08-24  8:11 ` Vlad Zolotarov
                   ` (6 more replies)
  0 siblings, 7 replies; 15+ messages in thread
From: Vlad Zolotarov @ 2015-08-20 15:37 UTC (permalink / raw)
  To: dev

According to 82599 and x540 HW specifications RS bit *must* be
set in the last descriptor of *every* packet.

Before this patch there were 3 types of Tx callbacks, each of which set
the RS bit only once every tx_rs_thresh descriptors. This patch introduces
a set of new callbacks, one for each type mentioned above, that set the RS
bit in every EOP descriptor.

ixgbe_set_tx_function() will set the appropriate Tx callback according
to the device family.

This patch fixes the Tx hang we were constantly hitting with a
seastar-based application on x540 NIC.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
---
New in v4:
   - Styling (white spaces) fixes.

New in v3:
   - Enforce the RS bit setting instead of enforcing tx_rs_thresh to be 1.
---
 drivers/net/ixgbe/ixgbe_ethdev.c   |  14 +++-
 drivers/net/ixgbe/ixgbe_ethdev.h   |   4 ++
 drivers/net/ixgbe/ixgbe_rxtx.c     | 139 ++++++++++++++++++++++++++++---------
 drivers/net/ixgbe/ixgbe_rxtx.h     |   2 +
 drivers/net/ixgbe/ixgbe_rxtx_vec.c |  29 ++++++--
 5 files changed, 149 insertions(+), 39 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index b8ee1e9..355882c 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -866,12 +866,17 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
 	uint32_t ctrl_ext;
 	uint16_t csum;
 	int diag, i;
+	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
 
 	PMD_INIT_FUNC_TRACE();
 
 	eth_dev->dev_ops = &ixgbe_eth_dev_ops;
 	eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;
-	eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
+
+	if (rs_deferring_allowed)
+		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
+	else
+		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts_always_rs;
 
 	/*
 	 * For secondary processes, we don't initialise any further as primary
@@ -1147,12 +1152,17 @@ eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev)
 	struct ixgbe_hwstrip *hwstrip =
 		IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(eth_dev->data->dev_private);
 	struct ether_addr *perm_addr = (struct ether_addr *) hw->mac.perm_addr;
+	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
 
 	PMD_INIT_FUNC_TRACE();
 
 	eth_dev->dev_ops = &ixgbevf_eth_dev_ops;
 	eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;
-	eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
+
+	if (rs_deferring_allowed)
+		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
+	else
+		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts_always_rs;
 
 	/* for secondary processes, we don't initialise any further as primary
 	 * has already done this work. Only check we don't need a different
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index c3d4f4f..390356d 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -367,9 +367,13 @@ uint16_t ixgbe_recv_pkts_lro_bulk_alloc(void *rx_queue,
 
 uint16_t ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
+uint16_t ixgbe_xmit_pkts_always_rs(
+	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 
 uint16_t ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
+uint16_t ixgbe_xmit_pkts_simple_always_rs(
+	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 
 int ixgbe_dev_rss_hash_update(struct rte_eth_dev *dev,
 			      struct rte_eth_rss_conf *rss_conf);
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 91023b9..044f72c 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -164,11 +164,16 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
 
 /* Populate 4 descriptors with data from 4 mbufs */
 static inline void
-tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
+tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts,
+    bool always_rs)
 {
 	uint64_t buf_dma_addr;
 	uint32_t pkt_len;
 	int i;
+	uint32_t flags = DCMD_DTYP_FLAGS;
+
+	if (always_rs)
+		flags |= IXGBE_ADVTXD_DCMD_RS;
 
 	for (i = 0; i < 4; ++i, ++txdp, ++pkts) {
 		buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(*pkts);
@@ -178,7 +183,7 @@ tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
 		txdp->read.buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
 
 		txdp->read.cmd_type_len =
-			rte_cpu_to_le_32((uint32_t)DCMD_DTYP_FLAGS | pkt_len);
+			rte_cpu_to_le_32(flags | pkt_len);
 
 		txdp->read.olinfo_status =
 			rte_cpu_to_le_32(pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
@@ -189,10 +194,15 @@ tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
 
 /* Populate 1 descriptor with data from 1 mbuf */
 static inline void
-tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
+tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts,
+    bool always_rs)
 {
 	uint64_t buf_dma_addr;
 	uint32_t pkt_len;
+	uint32_t flags = DCMD_DTYP_FLAGS;
+
+	if (always_rs)
+		flags |= IXGBE_ADVTXD_DCMD_RS;
 
 	buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(*pkts);
 	pkt_len = (*pkts)->data_len;
@@ -200,7 +210,7 @@ tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
 	/* write data to descriptor */
 	txdp->read.buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
 	txdp->read.cmd_type_len =
-			rte_cpu_to_le_32((uint32_t)DCMD_DTYP_FLAGS | pkt_len);
+			rte_cpu_to_le_32(flags | pkt_len);
 	txdp->read.olinfo_status =
 			rte_cpu_to_le_32(pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
 	rte_prefetch0(&(*pkts)->pool);
@@ -212,7 +222,7 @@ tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
  */
 static inline void
 ixgbe_tx_fill_hw_ring(struct ixgbe_tx_queue *txq, struct rte_mbuf **pkts,
-		      uint16_t nb_pkts)
+		      uint16_t nb_pkts, bool always_rs)
 {
 	volatile union ixgbe_adv_tx_desc *txdp = &(txq->tx_ring[txq->tx_tail]);
 	struct ixgbe_tx_entry *txep = &(txq->sw_ring[txq->tx_tail]);
@@ -232,20 +242,21 @@ ixgbe_tx_fill_hw_ring(struct ixgbe_tx_queue *txq, struct rte_mbuf **pkts,
 		for (j = 0; j < N_PER_LOOP; ++j) {
 			(txep + i + j)->mbuf = *(pkts + i + j);
 		}
-		tx4(txdp + i, pkts + i);
+		tx4(txdp + i, pkts + i, always_rs);
 	}
 
 	if (unlikely(leftover > 0)) {
 		for (i = 0; i < leftover; ++i) {
 			(txep + mainpart + i)->mbuf = *(pkts + mainpart + i);
-			tx1(txdp + mainpart + i, pkts + mainpart + i);
+			tx1(txdp + mainpart + i, pkts + mainpart + i,
+			    always_rs);
 		}
 	}
 }
 
 static inline uint16_t
 tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
-	     uint16_t nb_pkts)
+	     uint16_t nb_pkts, bool always_rs)
 {
 	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
 	volatile union ixgbe_adv_tx_desc *tx_r = txq->tx_ring;
@@ -281,22 +292,25 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	 */
 	if ((txq->tx_tail + nb_pkts) > txq->nb_tx_desc) {
 		n = (uint16_t)(txq->nb_tx_desc - txq->tx_tail);
-		ixgbe_tx_fill_hw_ring(txq, tx_pkts, n);
+		ixgbe_tx_fill_hw_ring(txq, tx_pkts, n, always_rs);
 
-		/*
-		 * We know that the last descriptor in the ring will need to
-		 * have its RS bit set because tx_rs_thresh has to be
-		 * a divisor of the ring size
-		 */
-		tx_r[txq->tx_next_rs].read.cmd_type_len |=
-			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
-		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+		if (!always_rs) {
+			/*
+			 * We know that the last descriptor in the ring will
+			 * need to have its RS bit set because tx_rs_thresh has
+			 * to be a divisor of the ring size
+			 */
+			tx_r[txq->tx_next_rs].read.cmd_type_len |=
+				rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
+			txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+		}
 
 		txq->tx_tail = 0;
 	}
 
 	/* Fill H/W descriptor ring with mbuf data */
-	ixgbe_tx_fill_hw_ring(txq, tx_pkts + n, (uint16_t)(nb_pkts - n));
+	ixgbe_tx_fill_hw_ring(txq, tx_pkts + n, (uint16_t)(nb_pkts - n),
+			      always_rs);
 	txq->tx_tail = (uint16_t)(txq->tx_tail + (nb_pkts - n));
 
 	/*
@@ -306,7 +320,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	 * but instead of subtracting 1 and doing >=, we can just do
 	 * greater than without subtracting.
 	 */
-	if (txq->tx_tail > txq->tx_next_rs) {
+	if (!always_rs && txq->tx_tail > txq->tx_next_rs) {
 		tx_r[txq->tx_next_rs].read.cmd_type_len |=
 			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
 		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
@@ -329,22 +343,23 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	return nb_pkts;
 }
 
-uint16_t
-ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
-		       uint16_t nb_pkts)
+/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
+static inline uint16_t
+_ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
+			uint16_t nb_pkts, bool always_rs)
 {
 	uint16_t nb_tx;
 
 	/* Try to transmit at least chunks of TX_MAX_BURST pkts */
 	if (likely(nb_pkts <= RTE_PMD_IXGBE_TX_MAX_BURST))
-		return tx_xmit_pkts(tx_queue, tx_pkts, nb_pkts);
+		return tx_xmit_pkts(tx_queue, tx_pkts, nb_pkts, always_rs);
 
 	/* transmit more than the max burst, in chunks of TX_MAX_BURST */
 	nb_tx = 0;
 	while (nb_pkts) {
 		uint16_t ret, n;
 		n = (uint16_t)RTE_MIN(nb_pkts, RTE_PMD_IXGBE_TX_MAX_BURST);
-		ret = tx_xmit_pkts(tx_queue, &(tx_pkts[nb_tx]), n);
+		ret = tx_xmit_pkts(tx_queue, &(tx_pkts[nb_tx]), n, always_rs);
 		nb_tx = (uint16_t)(nb_tx + ret);
 		nb_pkts = (uint16_t)(nb_pkts - ret);
 		if (ret < n)
@@ -354,6 +369,20 @@ ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 	return nb_tx;
 }
 
+uint16_t
+ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
+		       uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts_simple(tx_queue, tx_pkts, nb_pkts, false);
+}
+
+uint16_t
+ixgbe_xmit_pkts_simple_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
+				 uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts_simple(tx_queue, tx_pkts, nb_pkts, true);
+}
+
 static inline void
 ixgbe_set_xmit_ctx(struct ixgbe_tx_queue *txq,
 		volatile struct ixgbe_adv_tx_context_desc *ctx_txd,
@@ -565,9 +594,10 @@ ixgbe_xmit_cleanup(struct ixgbe_tx_queue *txq)
 	return (0);
 }
 
-uint16_t
-ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
-		uint16_t nb_pkts)
+/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
+static inline uint16_t
+_ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
+		 uint16_t nb_pkts, bool always_rs)
 {
 	struct ixgbe_tx_queue *txq;
 	struct ixgbe_tx_entry *sw_ring;
@@ -827,11 +857,20 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		 * The last packet data descriptor needs End Of Packet (EOP)
 		 */
 		cmd_type_len |= IXGBE_TXD_CMD_EOP;
-		txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
+
+		if (always_rs) {
+			PMD_TX_FREE_LOG(DEBUG,
+					"Setting RS bit on TXD id=%4u (port=%d queue=%d)",
+					tx_last, txq->port_id, txq->queue_id);
+			cmd_type_len |= IXGBE_TXD_CMD_RS;
+		} else {
+			txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
+		}
+
 		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);
 
 		/* Set RS bit only on threshold packets' last descriptor */
-		if (txq->nb_tx_used >= txq->tx_rs_thresh) {
+		if (!always_rs && txq->nb_tx_used >= txq->tx_rs_thresh) {
 			PMD_TX_FREE_LOG(DEBUG,
 					"Setting RS bit on TXD id="
 					"%4u (port=%d queue=%d)",
@@ -859,6 +898,20 @@ end_of_tx:
 	return (nb_tx);
 }
 
+uint16_t
+ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts(tx_queue, tx_pkts, nb_pkts, false);
+}
+
+uint16_t
+ixgbe_xmit_pkts_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts(tx_queue, tx_pkts, nb_pkts, true);
+}
+
 /*********************************************************************
  *
  *  RX functions
@@ -2047,6 +2100,22 @@ static const struct ixgbe_txq_ops def_txq_ops = {
 void __attribute__((cold))
 ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
 {
+	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	/*
+	 * According to 82599 and x540 specifications RS bit *must* be set on
+	 * the last descriptor of *every* packet. Therefore all NICs newer than
+	 * 82598 get a Tx callback that sets RS on every EOP descriptor instead
+	 * of deferring it by tx_rs_thresh. Since VFs are available only on
+	 * devices starting from 82599, this applies to ALL VF devices too.
+	 */
+	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
+
+	if (rs_deferring_allowed)
+		PMD_INIT_LOG(DEBUG, "Using an RS bit setting deferring callback");
+	else
+		PMD_INIT_LOG(DEBUG, "Using an always setting RS bit callback");
+
 	/* Use a simple Tx queue (no offloads, no multi segs) if possible */
 	if (((txq->txq_flags & IXGBE_SIMPLE_FLAGS) == IXGBE_SIMPLE_FLAGS)
 			&& (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)) {
@@ -2056,10 +2125,14 @@ ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
 				(rte_eal_process_type() != RTE_PROC_PRIMARY ||
 					ixgbe_txq_vec_setup(txq) == 0)) {
 			PMD_INIT_LOG(DEBUG, "Vector tx enabled.");
-			dev->tx_pkt_burst = ixgbe_xmit_pkts_vec;
+			dev->tx_pkt_burst = (rs_deferring_allowed ?
+					     ixgbe_xmit_pkts_vec :
+					     ixgbe_xmit_pkts_vec_always_rs);
 		} else
 #endif
-		dev->tx_pkt_burst = ixgbe_xmit_pkts_simple;
+		dev->tx_pkt_burst = (rs_deferring_allowed ?
+				     ixgbe_xmit_pkts_simple :
+				     ixgbe_xmit_pkts_simple_always_rs);
 	} else {
 		PMD_INIT_LOG(DEBUG, "Using full-featured tx code path");
 		PMD_INIT_LOG(DEBUG,
@@ -2070,7 +2143,9 @@ ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
 				" - tx_rs_thresh = %lu " "[RTE_PMD_IXGBE_TX_MAX_BURST=%lu]",
 				(unsigned long)txq->tx_rs_thresh,
 				(unsigned long)RTE_PMD_IXGBE_TX_MAX_BURST);
-		dev->tx_pkt_burst = ixgbe_xmit_pkts;
+		dev->tx_pkt_burst = (rs_deferring_allowed ?
+				     ixgbe_xmit_pkts :
+				     ixgbe_xmit_pkts_always_rs);
 	}
 }
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
index 113682a..c7ed71b 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.h
+++ b/drivers/net/ixgbe/ixgbe_rxtx.h
@@ -285,6 +285,8 @@ void ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq);
 
 uint16_t ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
+uint16_t ixgbe_xmit_pkts_vec_always_rs(
+	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 int ixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq);
 
 #endif /* RTE_IXGBE_INC_VECTOR */
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index cf25a53..a680fc2 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -723,9 +723,10 @@ tx_backlog_entry(struct ixgbe_tx_entry_v *txep,
 		txep[i].mbuf = tx_pkts[i];
 }
 
-uint16_t
-ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-		       uint16_t nb_pkts)
+/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
+static inline uint16_t
+_ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		     uint16_t nb_pkts, bool always_rs)
 {
 	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
 	volatile union ixgbe_adv_tx_desc *txdp;
@@ -735,6 +736,9 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t rs = IXGBE_ADVTXD_DCMD_RS|DCMD_DTYP_FLAGS;
 	int i;
 
+	if (always_rs)
+		flags = rs;
+
 	if (unlikely(nb_pkts > RTE_IXGBE_VPMD_TX_BURST))
 		nb_pkts = RTE_IXGBE_VPMD_TX_BURST;
 
@@ -764,7 +768,8 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 		nb_commit = (uint16_t)(nb_commit - n);
 
 		tx_id = 0;
-		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+		if (!always_rs)
+			txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
 
 		/* avoid reach the end of ring */
 		txdp = &(txq->tx_ring[tx_id]);
@@ -776,7 +781,7 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	vtx(txdp, tx_pkts, nb_commit, flags);
 
 	tx_id = (uint16_t)(tx_id + nb_commit);
-	if (tx_id > txq->tx_next_rs) {
+	if (!always_rs && tx_id > txq->tx_next_rs) {
 		txq->tx_ring[txq->tx_next_rs].read.cmd_type_len |=
 			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
 		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
@@ -790,6 +795,20 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	return nb_pkts;
 }
 
+uint16_t
+ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		    uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts_vec(tx_queue, tx_pkts, nb_pkts, false);
+}
+
+uint16_t
+ixgbe_xmit_pkts_vec_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
+			      uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts_vec(tx_queue, tx_pkts, nb_pkts, true);
+}
+
 static void __attribute__((cold))
 ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
 {
-- 
2.1.0

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
  2015-08-20 15:37 [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598 Vlad Zolotarov
@ 2015-08-24  8:11 ` Vlad Zolotarov
  2015-10-27 18:09   ` Thomas Monjalon
  2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 0/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 15+ messages in thread
From: Vlad Zolotarov @ 2015-08-24  8:11 UTC (permalink / raw)
  To: dev; +Cc: jeffrey.t.kirsher, jesse.brandeburg



On 08/20/15 18:37, Vlad Zolotarov wrote:
> According to 82599 and x540 HW specifications RS bit *must* be
> set in the last descriptor of *every* packet.
>
> Before this patch there were 3 types of Tx callbacks that were setting
> RS bit every tx_rs_thresh descriptors. This patch introduces a set of
> new callbacks, one for each type mentioned above, that will set the RS
> bit in every EOP descriptor.
>
> ixgbe_set_tx_function() will set the appropriate Tx callback according
> to the device family.

[+Jesse and Jeff]

I've started to look at the i40e PMD and it has the same RS bit
deferring logic as the ixgbe PMD has (surprise, surprise!.. ;)).
As a reminder, the i40e PMD uses a descriptor write-back
completion mode.

It's unclear from the HW Spec whether the RS bit should be set on *every*
descriptor with the EOP bit. However, I noticed that the Linux driver,
before it moved to HEAD write-back mode, was setting the RS bit on every
EOP descriptor.

So, here is a question for the Intel guys: could you please clarify this point?

Thanks in advance,
vlad

>
> This patch fixes the Tx hang we were constantly hitting with a
> seastar-based application on x540 NIC.
>
> Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
> ---
> New in v4:
>     - Styling (white spaces) fixes.
>
> New in v3:
>     - Enforce the RS bit setting instead of enforcing tx_rs_thresh to be 1.
> ---
>   drivers/net/ixgbe/ixgbe_ethdev.c   |  14 +++-
>   drivers/net/ixgbe/ixgbe_ethdev.h   |   4 ++
>   drivers/net/ixgbe/ixgbe_rxtx.c     | 139 ++++++++++++++++++++++++++++---------
>   drivers/net/ixgbe/ixgbe_rxtx.h     |   2 +
>   drivers/net/ixgbe/ixgbe_rxtx_vec.c |  29 ++++++--
>   5 files changed, 149 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
> index b8ee1e9..355882c 100644
> --- a/drivers/net/ixgbe/ixgbe_ethdev.c
> +++ b/drivers/net/ixgbe/ixgbe_ethdev.c
> @@ -866,12 +866,17 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
>   	uint32_t ctrl_ext;
>   	uint16_t csum;
>   	int diag, i;
> +	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
>   
>   	PMD_INIT_FUNC_TRACE();
>   
>   	eth_dev->dev_ops = &ixgbe_eth_dev_ops;
>   	eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;
> -	eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
> +
> +	if (rs_deferring_allowed)
> +		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
> +	else
> +		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts_always_rs;
>   
>   	/*
>   	 * For secondary processes, we don't initialise any further as primary
> @@ -1147,12 +1152,17 @@ eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev)
>   	struct ixgbe_hwstrip *hwstrip =
>   		IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(eth_dev->data->dev_private);
>   	struct ether_addr *perm_addr = (struct ether_addr *) hw->mac.perm_addr;
> +	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
>   
>   	PMD_INIT_FUNC_TRACE();
>   
>   	eth_dev->dev_ops = &ixgbevf_eth_dev_ops;
>   	eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;
> -	eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
> +
> +	if (rs_deferring_allowed)
> +		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
> +	else
> +		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts_always_rs;
>   
>   	/* for secondary processes, we don't initialise any further as primary
>   	 * has already done this work. Only check we don't need a different
> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
> index c3d4f4f..390356d 100644
> --- a/drivers/net/ixgbe/ixgbe_ethdev.h
> +++ b/drivers/net/ixgbe/ixgbe_ethdev.h
> @@ -367,9 +367,13 @@ uint16_t ixgbe_recv_pkts_lro_bulk_alloc(void *rx_queue,
>   
>   uint16_t ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>   		uint16_t nb_pkts);
> +uint16_t ixgbe_xmit_pkts_always_rs(
> +	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
>   
>   uint16_t ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
>   		uint16_t nb_pkts);
> +uint16_t ixgbe_xmit_pkts_simple_always_rs(
> +	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
>   
>   int ixgbe_dev_rss_hash_update(struct rte_eth_dev *dev,
>   			      struct rte_eth_rss_conf *rss_conf);
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index 91023b9..044f72c 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -164,11 +164,16 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
>   
>   /* Populate 4 descriptors with data from 4 mbufs */
>   static inline void
> -tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
> +tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts,
> +    bool always_rs)
>   {
>   	uint64_t buf_dma_addr;
>   	uint32_t pkt_len;
>   	int i;
> +	uint32_t flags = DCMD_DTYP_FLAGS;
> +
> +	if (always_rs)
> +		flags |= IXGBE_ADVTXD_DCMD_RS;
>   
>   	for (i = 0; i < 4; ++i, ++txdp, ++pkts) {
>   		buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(*pkts);
> @@ -178,7 +183,7 @@ tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
>   		txdp->read.buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
>   
>   		txdp->read.cmd_type_len =
> -			rte_cpu_to_le_32((uint32_t)DCMD_DTYP_FLAGS | pkt_len);
> +			rte_cpu_to_le_32(flags | pkt_len);
>   
>   		txdp->read.olinfo_status =
>   			rte_cpu_to_le_32(pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
> @@ -189,10 +194,15 @@ tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
>   
>   /* Populate 1 descriptor with data from 1 mbuf */
>   static inline void
> -tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
> +tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts,
> +    bool always_rs)
>   {
>   	uint64_t buf_dma_addr;
>   	uint32_t pkt_len;
> +	uint32_t flags = DCMD_DTYP_FLAGS;
> +
> +	if (always_rs)
> +		flags |= IXGBE_ADVTXD_DCMD_RS;
>   
>   	buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(*pkts);
>   	pkt_len = (*pkts)->data_len;
> @@ -200,7 +210,7 @@ tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
>   	/* write data to descriptor */
>   	txdp->read.buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
>   	txdp->read.cmd_type_len =
> -			rte_cpu_to_le_32((uint32_t)DCMD_DTYP_FLAGS | pkt_len);
> +			rte_cpu_to_le_32(flags | pkt_len);
>   	txdp->read.olinfo_status =
>   			rte_cpu_to_le_32(pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
>   	rte_prefetch0(&(*pkts)->pool);
> @@ -212,7 +222,7 @@ tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
>    */
>   static inline void
>   ixgbe_tx_fill_hw_ring(struct ixgbe_tx_queue *txq, struct rte_mbuf **pkts,
> -		      uint16_t nb_pkts)
> +		      uint16_t nb_pkts, bool always_rs)
>   {
>   	volatile union ixgbe_adv_tx_desc *txdp = &(txq->tx_ring[txq->tx_tail]);
>   	struct ixgbe_tx_entry *txep = &(txq->sw_ring[txq->tx_tail]);
> @@ -232,20 +242,21 @@ ixgbe_tx_fill_hw_ring(struct ixgbe_tx_queue *txq, struct rte_mbuf **pkts,
>   		for (j = 0; j < N_PER_LOOP; ++j) {
>   			(txep + i + j)->mbuf = *(pkts + i + j);
>   		}
> -		tx4(txdp + i, pkts + i);
> +		tx4(txdp + i, pkts + i, always_rs);
>   	}
>   
>   	if (unlikely(leftover > 0)) {
>   		for (i = 0; i < leftover; ++i) {
>   			(txep + mainpart + i)->mbuf = *(pkts + mainpart + i);
> -			tx1(txdp + mainpart + i, pkts + mainpart + i);
> +			tx1(txdp + mainpart + i, pkts + mainpart + i,
> +			    always_rs);
>   		}
>   	}
>   }
>   
>   static inline uint16_t
>   tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> -	     uint16_t nb_pkts)
> +	     uint16_t nb_pkts, bool always_rs)
>   {
>   	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
>   	volatile union ixgbe_adv_tx_desc *tx_r = txq->tx_ring;
> @@ -281,22 +292,25 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>   	 */
>   	if ((txq->tx_tail + nb_pkts) > txq->nb_tx_desc) {
>   		n = (uint16_t)(txq->nb_tx_desc - txq->tx_tail);
> -		ixgbe_tx_fill_hw_ring(txq, tx_pkts, n);
> +		ixgbe_tx_fill_hw_ring(txq, tx_pkts, n, always_rs);
>   
> -		/*
> -		 * We know that the last descriptor in the ring will need to
> -		 * have its RS bit set because tx_rs_thresh has to be
> -		 * a divisor of the ring size
> -		 */
> -		tx_r[txq->tx_next_rs].read.cmd_type_len |=
> -			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
> -		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
> +		if (!always_rs) {
> +			/*
> +			 * We know that the last descriptor in the ring will
> +			 * need to have its RS bit set because tx_rs_thresh has
> +			 * to be a divisor of the ring size
> +			 */
> +			tx_r[txq->tx_next_rs].read.cmd_type_len |=
> +				rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
> +			txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
> +		}
>   
>   		txq->tx_tail = 0;
>   	}
>   
>   	/* Fill H/W descriptor ring with mbuf data */
> -	ixgbe_tx_fill_hw_ring(txq, tx_pkts + n, (uint16_t)(nb_pkts - n));
> +	ixgbe_tx_fill_hw_ring(txq, tx_pkts + n, (uint16_t)(nb_pkts - n),
> +			      always_rs);
>   	txq->tx_tail = (uint16_t)(txq->tx_tail + (nb_pkts - n));
>   
>   	/*
> @@ -306,7 +320,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>   	 * but instead of subtracting 1 and doing >=, we can just do
>   	 * greater than without subtracting.
>   	 */
> -	if (txq->tx_tail > txq->tx_next_rs) {
> +	if (!always_rs && txq->tx_tail > txq->tx_next_rs) {
>   		tx_r[txq->tx_next_rs].read.cmd_type_len |=
>   			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
>   		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
> @@ -329,22 +343,23 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>   	return nb_pkts;
>   }
>   
> -uint16_t
> -ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
> -		       uint16_t nb_pkts)
> +/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
> +static inline uint16_t
> +_ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
> +			uint16_t nb_pkts, bool always_rs)
>   {
>   	uint16_t nb_tx;
>   
>   	/* Try to transmit at least chunks of TX_MAX_BURST pkts */
>   	if (likely(nb_pkts <= RTE_PMD_IXGBE_TX_MAX_BURST))
> -		return tx_xmit_pkts(tx_queue, tx_pkts, nb_pkts);
> +		return tx_xmit_pkts(tx_queue, tx_pkts, nb_pkts, always_rs);
>   
>   	/* transmit more than the max burst, in chunks of TX_MAX_BURST */
>   	nb_tx = 0;
>   	while (nb_pkts) {
>   		uint16_t ret, n;
>   		n = (uint16_t)RTE_MIN(nb_pkts, RTE_PMD_IXGBE_TX_MAX_BURST);
> -		ret = tx_xmit_pkts(tx_queue, &(tx_pkts[nb_tx]), n);
> +		ret = tx_xmit_pkts(tx_queue, &(tx_pkts[nb_tx]), n, always_rs);
>   		nb_tx = (uint16_t)(nb_tx + ret);
>   		nb_pkts = (uint16_t)(nb_pkts - ret);
>   		if (ret < n)
> @@ -354,6 +369,20 @@ ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
>   	return nb_tx;
>   }
>   
> +uint16_t
> +ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		       uint16_t nb_pkts)
> +{
> +	return _ixgbe_xmit_pkts_simple(tx_queue, tx_pkts, nb_pkts, false);
> +}
> +
> +uint16_t
> +ixgbe_xmit_pkts_simple_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
> +				 uint16_t nb_pkts)
> +{
> +	return _ixgbe_xmit_pkts_simple(tx_queue, tx_pkts, nb_pkts, true);
> +}
> +
>   static inline void
>   ixgbe_set_xmit_ctx(struct ixgbe_tx_queue *txq,
>   		volatile struct ixgbe_adv_tx_context_desc *ctx_txd,
> @@ -565,9 +594,10 @@ ixgbe_xmit_cleanup(struct ixgbe_tx_queue *txq)
>   	return (0);
>   }
>   
> -uint16_t
> -ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> -		uint16_t nb_pkts)
> +/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
> +static inline uint16_t
> +_ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		 uint16_t nb_pkts, bool always_rs)
>   {
>   	struct ixgbe_tx_queue *txq;
>   	struct ixgbe_tx_entry *sw_ring;
> @@ -827,11 +857,20 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>   		 * The last packet data descriptor needs End Of Packet (EOP)
>   		 */
>   		cmd_type_len |= IXGBE_TXD_CMD_EOP;
> -		txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
> +
> +		if (always_rs) {
> +			PMD_TX_FREE_LOG(DEBUG,
> +					"Setting RS bit on TXD id=%4u (port=%d queue=%d)",
> +					tx_last, txq->port_id, txq->queue_id);
> +			cmd_type_len |= IXGBE_TXD_CMD_RS;
> +		} else {
> +			txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
> +		}
> +
>   		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);
>   
>   		/* Set RS bit only on threshold packets' last descriptor */
> -		if (txq->nb_tx_used >= txq->tx_rs_thresh) {
> +		if (!always_rs && txq->nb_tx_used >= txq->tx_rs_thresh) {
>   			PMD_TX_FREE_LOG(DEBUG,
>   					"Setting RS bit on TXD id="
>   					"%4u (port=%d queue=%d)",
> @@ -859,6 +898,20 @@ end_of_tx:
>   	return (nb_tx);
>   }
>   
> +uint16_t
> +ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		uint16_t nb_pkts)
> +{
> +	return _ixgbe_xmit_pkts(tx_queue, tx_pkts, nb_pkts, false);
> +}
> +
> +uint16_t
> +ixgbe_xmit_pkts_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
> +			  uint16_t nb_pkts)
> +{
> +	return _ixgbe_xmit_pkts(tx_queue, tx_pkts, nb_pkts, true);
> +}
> +
>   /*********************************************************************
>    *
>    *  RX functions
> @@ -2047,6 +2100,22 @@ static const struct ixgbe_txq_ops def_txq_ops = {
>   void __attribute__((cold))
>   ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
>   {
> +	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +
> +	/*
> +	 * According to 82599 and x540 specifications RS bit *must* be set on
> +	 * the last descriptor of *every* packet. Therefore we will not allow
> +	 * the tx_rs_thresh above 1 for all NICs newer than 82598. Since VFs are
> +	 * available only on devices starting from 82599, tx_rs_thresh should be
> +	 * set to 1 for ALL VF devices.
> +	 */
> +	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
> +
> +	if (rs_deferring_allowed)
> +		PMD_INIT_LOG(DEBUG, "Using an RS bit setting deferring callback");
> +	else
> +		PMD_INIT_LOG(DEBUG, "Using an always setting RS bit callback");
> +
>   	/* Use a simple Tx queue (no offloads, no multi segs) if possible */
>   	if (((txq->txq_flags & IXGBE_SIMPLE_FLAGS) == IXGBE_SIMPLE_FLAGS)
>   			&& (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)) {
> @@ -2056,10 +2125,14 @@ ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
>   				(rte_eal_process_type() != RTE_PROC_PRIMARY ||
>   					ixgbe_txq_vec_setup(txq) == 0)) {
>   			PMD_INIT_LOG(DEBUG, "Vector tx enabled.");
> -			dev->tx_pkt_burst = ixgbe_xmit_pkts_vec;
> +			dev->tx_pkt_burst = (rs_deferring_allowed ?
> +					     ixgbe_xmit_pkts_vec :
> +					     ixgbe_xmit_pkts_vec_always_rs);
>   		} else
>   #endif
> -		dev->tx_pkt_burst = ixgbe_xmit_pkts_simple;
> +		dev->tx_pkt_burst = (rs_deferring_allowed ?
> +				     ixgbe_xmit_pkts_simple :
> +				     ixgbe_xmit_pkts_simple_always_rs);
>   	} else {
>   		PMD_INIT_LOG(DEBUG, "Using full-featured tx code path");
>   		PMD_INIT_LOG(DEBUG,
> @@ -2070,7 +2143,9 @@ ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
>   				" - tx_rs_thresh = %lu " "[RTE_PMD_IXGBE_TX_MAX_BURST=%lu]",
>   				(unsigned long)txq->tx_rs_thresh,
>   				(unsigned long)RTE_PMD_IXGBE_TX_MAX_BURST);
> -		dev->tx_pkt_burst = ixgbe_xmit_pkts;
> +		dev->tx_pkt_burst = (rs_deferring_allowed ?
> +				     ixgbe_xmit_pkts :
> +				     ixgbe_xmit_pkts_always_rs);
>   	}
>   }
>   
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
> index 113682a..c7ed71b 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.h
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.h
> @@ -285,6 +285,8 @@ void ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq);
>   
>   uint16_t ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
>   		uint16_t nb_pkts);
> +uint16_t ixgbe_xmit_pkts_vec_always_rs(
> +	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
>   int ixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq);
>   
>   #endif /* RTE_IXGBE_INC_VECTOR */
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> index cf25a53..a680fc2 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> @@ -723,9 +723,10 @@ tx_backlog_entry(struct ixgbe_tx_entry_v *txep,
>   		txep[i].mbuf = tx_pkts[i];
>   }
>   
> -uint16_t
> -ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> -		       uint16_t nb_pkts)
> +/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
> +static inline uint16_t
> +_ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		     uint16_t nb_pkts, bool always_rs)
>   {
>   	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
>   	volatile union ixgbe_adv_tx_desc *txdp;
> @@ -735,6 +736,9 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
>   	uint64_t rs = IXGBE_ADVTXD_DCMD_RS|DCMD_DTYP_FLAGS;
>   	int i;
>   
> +	if (always_rs)
> +		flags = rs;
> +
>   	if (unlikely(nb_pkts > RTE_IXGBE_VPMD_TX_BURST))
>   		nb_pkts = RTE_IXGBE_VPMD_TX_BURST;
>   
> @@ -764,7 +768,8 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
>   		nb_commit = (uint16_t)(nb_commit - n);
>   
>   		tx_id = 0;
> -		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
> +		if (!always_rs)
> +			txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
>   
>   		/* avoid reach the end of ring */
>   		txdp = &(txq->tx_ring[tx_id]);
> @@ -776,7 +781,7 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
>   	vtx(txdp, tx_pkts, nb_commit, flags);
>   
>   	tx_id = (uint16_t)(tx_id + nb_commit);
> -	if (tx_id > txq->tx_next_rs) {
> +	if (!always_rs && tx_id > txq->tx_next_rs) {
>   		txq->tx_ring[txq->tx_next_rs].read.cmd_type_len |=
>   			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
>   		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
> @@ -790,6 +795,20 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
>   	return nb_pkts;
>   }
>   
> +uint16_t
> +ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		    uint16_t nb_pkts)
> +{
> +	return _ixgbe_xmit_pkts_vec(tx_queue, tx_pkts, nb_pkts, false);
> +}
> +
> +uint16_t
> +ixgbe_xmit_pkts_vec_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
> +			      uint16_t nb_pkts)
> +{
> +	return _ixgbe_xmit_pkts_vec(tx_queue, tx_pkts, nb_pkts, true);
> +}
> +
>   static void __attribute__((cold))
>   ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
>   {

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
  2015-08-24  8:11 ` Vlad Zolotarov
@ 2015-10-27 18:09   ` Thomas Monjalon
  2015-10-27 18:47     ` Vlad Zolotarov
  0 siblings, 1 reply; 15+ messages in thread
From: Thomas Monjalon @ 2015-10-27 18:09 UTC (permalink / raw)
  To: Vlad Zolotarov, Konstantin Ananyev, helin.zhang
  Cc: dev, jeffrey.t.kirsher, jesse.brandeburg

Any Follow-up to this discussion?
Should we mark this patch as rejected?

2015-08-24 11:11, Vlad Zolotarov:
> On 08/20/15 18:37, Vlad Zolotarov wrote:
> > According to 82599 and x540 HW specifications RS bit *must* be
> > set in the last descriptor of *every* packet.
> >
> > Before this patch there were 3 types of Tx callbacks that were setting
> > RS bit every tx_rs_thresh descriptors. This patch introduces a set of
> > new callbacks, one for each type mentioned above, that will set the RS
> > bit in every EOP descriptor.
> >
> > ixgbe_set_tx_function() will set the appropriate Tx callback according
> > to the device family.
> 
> [+Jesse and Jeff]
> 
> I've started to look at the i40e PMD and it has the same RS bit 
> deferring logic
> as ixgbe PMD has (surprise, surprise!.. ;)). To recall, i40e PMD uses a 
> descriptor write-back
> completion mode.
> 
>  From the HW Spec it's unclear if RS bit should be set on *every* descriptor
> with EOP bit. However I noticed that Linux driver, before it moved to 
> HEAD write-back mode, was setting RS
> bit on every EOP descriptor.
> 
> So, here is a question to Intel guys: could u, pls., clarify this point?
> 
> Thanks in advance,
> vlad

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
  2015-10-27 18:09   ` Thomas Monjalon
@ 2015-10-27 18:47     ` Vlad Zolotarov
  2015-10-27 18:50       ` Brandeburg, Jesse
  2015-10-27 19:10       ` Ananyev, Konstantin
  0 siblings, 2 replies; 15+ messages in thread
From: Vlad Zolotarov @ 2015-10-27 18:47 UTC (permalink / raw)
  To: Thomas Monjalon, Konstantin Ananyev, helin.zhang
  Cc: dev, jeffrey.t.kirsher, jesse.brandeburg



On 10/27/15 20:09, Thomas Monjalon wrote:
> Any Follow-up to this discussion?
> Should we mark this patch as rejected?

Hmmm... This patch fixes an obvious spec violation. Why would it be 
rejected?

>
> 2015-08-24 11:11, Vlad Zolotarov:
>> On 08/20/15 18:37, Vlad Zolotarov wrote:
>>> According to 82599 and x540 HW specifications RS bit *must* be
>>> set in the last descriptor of *every* packet.
>>>
>>> Before this patch there were 3 types of Tx callbacks that were setting
>>> RS bit every tx_rs_thresh descriptors. This patch introduces a set of
>>> new callbacks, one for each type mentioned above, that will set the RS
>>> bit in every EOP descriptor.
>>>
>>> ixgbe_set_tx_function() will set the appropriate Tx callback according
>>> to the device family.
>> [+Jesse and Jeff]
>>
>> I've started to look at the i40e PMD and it has the same RS bit
>> deferring logic
>> as ixgbe PMD has (surprise, surprise!.. ;)). To recall, i40e PMD uses a
>> descriptor write-back
>> completion mode.
>>
>>   From the HW Spec it's unclear if RS bit should be set on *every* descriptor
>> with EOP bit. However I noticed that Linux driver, before it moved to
>> HEAD write-back mode, was setting RS
>> bit on every EOP descriptor.
>>
>> So, here is a question to Intel guys: could u, pls., clarify this point?
>>
>> Thanks in advance,
>> vlad
>
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
  2015-10-27 18:47     ` Vlad Zolotarov
@ 2015-10-27 18:50       ` Brandeburg, Jesse
  2015-10-27 19:10       ` Ananyev, Konstantin
  1 sibling, 0 replies; 15+ messages in thread
From: Brandeburg, Jesse @ 2015-10-27 18:50 UTC (permalink / raw)
  To: Vlad Zolotarov, Thomas Monjalon, Ananyev, Konstantin, Zhang,
	Helin, Rustad, Mark D, Skidmore, Donald C, Tantilov, Emil S
  Cc: dev, Kirsher, Jeffrey T

+ixgbe developers.

-----Original Message-----
From: Vlad Zolotarov [mailto:vladz@cloudius-systems.com] 
Sent: Tuesday, October 27, 2015 11:48 AM
To: Thomas Monjalon; Ananyev, Konstantin; Zhang, Helin
Cc: dev@dpdk.org; Kirsher, Jeffrey T; Brandeburg, Jesse
Subject: Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598



On 10/27/15 20:09, Thomas Monjalon wrote:
> Any Follow-up to this discussion?
> Should we mark this patch as rejected?

Hmmm... This patch fixes an obvious spec violation. Why would it be 
rejected?

>
> 2015-08-24 11:11, Vlad Zolotarov:
>> On 08/20/15 18:37, Vlad Zolotarov wrote:
>>> According to 82599 and x540 HW specifications RS bit *must* be
>>> set in the last descriptor of *every* packet.
>>>
>>> Before this patch there were 3 types of Tx callbacks that were setting
>>> RS bit every tx_rs_thresh descriptors. This patch introduces a set of
>>> new callbacks, one for each type mentioned above, that will set the RS
>>> bit in every EOP descriptor.
>>>
>>> ixgbe_set_tx_function() will set the appropriate Tx callback according
>>> to the device family.
>> [+Jesse and Jeff]
>>
>> I've started to look at the i40e PMD and it has the same RS bit
>> deferring logic
>> as ixgbe PMD has (surprise, surprise!.. ;)). To recall, i40e PMD uses a
>> descriptor write-back
>> completion mode.
>>
>>   From the HW Spec it's unclear if RS bit should be set on *every* descriptor
>> with EOP bit. However I noticed that Linux driver, before it moved to
>> HEAD write-back mode, was setting RS
>> bit on every EOP descriptor.
>>
>> So, here is a question to Intel guys: could u, pls., clarify this point?
>>
>> Thanks in advance,
>> vlad
>
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
  2015-10-27 18:47     ` Vlad Zolotarov
  2015-10-27 18:50       ` Brandeburg, Jesse
@ 2015-10-27 19:10       ` Ananyev, Konstantin
  2015-10-27 19:14         ` Vlad Zolotarov
  1 sibling, 1 reply; 15+ messages in thread
From: Ananyev, Konstantin @ 2015-10-27 19:10 UTC (permalink / raw)
  To: Vlad Zolotarov, Thomas Monjalon, Zhang, Helin
  Cc: dev, Kirsher, Jeffrey T, Brandeburg, Jesse


Hi lads,

> -----Original Message-----
> From: Vlad Zolotarov [mailto:vladz@cloudius-systems.com]
> Sent: Tuesday, October 27, 2015 6:48 PM
> To: Thomas Monjalon; Ananyev, Konstantin; Zhang, Helin
> Cc: dev@dpdk.org; Kirsher, Jeffrey T; Brandeburg, Jesse
> Subject: Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
> 
> 
> 
> On 10/27/15 20:09, Thomas Monjalon wrote:
> > Any Follow-up to this discussion?
> > Should we mark this patch as rejected?
> 
> Hmmm... This patch fixes an obvious spec violation. Why would it be
> rejected?

No I don't think we can reject the patch:
There is a reproducible  TX hang on ixgbe PMD on described conditions.
Though, as I explained here:
http://dpdk.org/ml/archives/dev/2015-September/023574.html
Vlad's patch would cause quite a big slowdown.
We are still in the process of getting an answer from the HW guys on whether there are any
alternatives that would fix the problem and avoid the slowdown.
Konstantin  

> 
> >
> > 2015-08-24 11:11, Vlad Zolotarov:
> >> On 08/20/15 18:37, Vlad Zolotarov wrote:
> >>> According to 82599 and x540 HW specifications RS bit *must* be
> >>> set in the last descriptor of *every* packet.
> >>>
> >>> Before this patch there were 3 types of Tx callbacks that were setting
> >>> RS bit every tx_rs_thresh descriptors. This patch introduces a set of
> >>> new callbacks, one for each type mentioned above, that will set the RS
> >>> bit in every EOP descriptor.
> >>>
> >>> ixgbe_set_tx_function() will set the appropriate Tx callback according
> >>> to the device family.
> >> [+Jesse and Jeff]
> >>
> >> I've started to look at the i40e PMD and it has the same RS bit
> >> deferring logic
> >> as ixgbe PMD has (surprise, surprise!.. ;)). To recall, i40e PMD uses a
> >> descriptor write-back
> >> completion mode.
> >>
> >>   From the HW Spec it's unclear if RS bit should be set on *every* descriptor
> >> with EOP bit. However I noticed that Linux driver, before it moved to
> >> HEAD write-back mode, was setting RS
> >> bit on every EOP descriptor.
> >>
> >> So, here is a question to Intel guys: could u, pls., clarify this point?
> >>
> >> Thanks in advance,
> >> vlad
> >
> >

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
  2015-10-27 19:10       ` Ananyev, Konstantin
@ 2015-10-27 19:14         ` Vlad Zolotarov
  0 siblings, 0 replies; 15+ messages in thread
From: Vlad Zolotarov @ 2015-10-27 19:14 UTC (permalink / raw)
  To: Ananyev, Konstantin, Thomas Monjalon, Zhang, Helin
  Cc: dev, Kirsher, Jeffrey T, Brandeburg, Jesse



On 10/27/15 21:10, Ananyev, Konstantin wrote:
> Hi lads,
>
>> -----Original Message-----
>> From: Vlad Zolotarov [mailto:vladz@cloudius-systems.com]
>> Sent: Tuesday, October 27, 2015 6:48 PM
>> To: Thomas Monjalon; Ananyev, Konstantin; Zhang, Helin
>> Cc: dev@dpdk.org; Kirsher, Jeffrey T; Brandeburg, Jesse
>> Subject: Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
>>
>>
>>
>> On 10/27/15 20:09, Thomas Monjalon wrote:
>>> Any Follow-up to this discussion?
>>> Should we mark this patch as rejected?
>> Hmmm... This patch fixes an obvious spec violation. Why would it be
>> rejected?
> No I don't think we can reject the patch:
> There is a reproducible  TX hang on ixgbe PMD on described conditions.
> Though, as I explained here:
> http://dpdk.org/ml/archives/dev/2015-September/023574.html
> Vlad's patch would cause quite a big slowdown.
> We are still in the process to get an answer from HW guys are there any
> alternatives that will allow to fix the problem and avoid the slowdown.

+1

> Konstantin
>
>>> 2015-08-24 11:11, Vlad Zolotarov:
>>>> On 08/20/15 18:37, Vlad Zolotarov wrote:
>>>>> According to 82599 and x540 HW specifications RS bit *must* be
>>>>> set in the last descriptor of *every* packet.
>>>>>
>>>>> Before this patch there were 3 types of Tx callbacks that were setting
>>>>> RS bit every tx_rs_thresh descriptors. This patch introduces a set of
>>>>> new callbacks, one for each type mentioned above, that will set the RS
>>>>> bit in every EOP descriptor.
>>>>>
>>>>> ixgbe_set_tx_function() will set the appropriate Tx callback according
>>>>> to the device family.
>>>> [+Jesse and Jeff]
>>>>
>>>> I've started to look at the i40e PMD and it has the same RS bit
>>>> deferring logic
>>>> as ixgbe PMD has (surprise, surprise!.. ;)). To recall, i40e PMD uses a
>>>> descriptor write-back
>>>> completion mode.
>>>>
>>>>    From the HW Spec it's unclear if RS bit should be set on *every* descriptor
>>>> with EOP bit. However I noticed that Linux driver, before it moved to
>>>> HEAD write-back mode, was setting RS
>>>> bit on every EOP descriptor.
>>>>
>>>> So, here is a question to Intel guys: could u, pls., clarify this point?
>>>>
>>>> Thanks in advance,
>>>> vlad
>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [dpdk-dev] [PATCHv6 0/2] ixgbe: fix TX hang when RS distance exceeds HW limit
  2015-08-20 15:37 [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598 Vlad Zolotarov
  2015-08-24  8:11 ` Vlad Zolotarov
@ 2015-11-09 19:21 ` Konstantin Ananyev
  2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 1/2] testpmd: add ability to split outgoing packets Konstantin Ananyev
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Konstantin Ananyev @ 2015-11-09 19:21 UTC (permalink / raw)
  To: dev

The first patch contains changes in testpmd that allow reproducing the issue.
Second patch is the actual fix.

Konstantin Ananyev (2):
  testpmd: add ability to split outgoing packets
  ixgbe: fix TX hang when RS distance exceeds HW limit

 app/test-pmd/cmdline.c                      |  57 +++++++++-
 app/test-pmd/config.c                       |  61 +++++++++++
 app/test-pmd/csumonly.c                     | 163 +++++++++++++++++++++++++++-
 app/test-pmd/testpmd.c                      |   3 +
 app/test-pmd/testpmd.h                      |  10 ++
 app/test-pmd/txonly.c                       |  13 ++-
 app/test/test_pmd_perf.c                    |   8 +-
 doc/guides/rel_notes/release_2_2.rst        |   7 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  21 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  32 +++++-
 10 files changed, 357 insertions(+), 18 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [dpdk-dev] [PATCHv6 1/2] testpmd: add ability to split outgoing packets
  2015-08-20 15:37 [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598 Vlad Zolotarov
  2015-08-24  8:11 ` Vlad Zolotarov
  2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 0/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev
@ 2015-11-09 19:21 ` Konstantin Ananyev
  2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 2/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Konstantin Ananyev @ 2015-11-09 19:21 UTC (permalink / raw)
  To: dev

For CSUM forwarding mode add ability to copy & split outgoing packet
into the new mbuf that consists of multiple segments.
For TXONLY and CSUM forwarding modes add ability to make number of
segments in the outgoing packet to vary on a per packet basis.
Number of segments and size of each segment is controlled by
'set txpkts' command.
Split policy is controlled by 'set txsplit' command.
Possible values are: on | off | rand.
That allows increasing test coverage for TX PMD codepaths.

v6 changes:
- fix typos
- testpmd guide: fix invalid command description

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test-pmd/cmdline.c                      |  57 +++++++++-
 app/test-pmd/config.c                       |  61 +++++++++++
 app/test-pmd/csumonly.c                     | 163 +++++++++++++++++++++++++++-
 app/test-pmd/testpmd.c                      |   3 +
 app/test-pmd/testpmd.h                      |  10 ++
 app/test-pmd/txonly.c                       |  13 ++-
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  21 +++-
 7 files changed, 319 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index c637198..a92fe0b 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -199,7 +199,7 @@ static void cmd_help_long_parsed(void *parsed_result,
 			"clear port (info|stats|xstats|fdir|stat_qmap) (port_id|all)\n"
 			"    Clear information for port_id, or all.\n\n"
 
-			"show config (rxtx|cores|fwd)\n"
+			"show config (rxtx|cores|fwd|txpkts)\n"
 			"    Display the given configuration.\n\n"
 
 			"read rxd (port_id) (queue_id) (rxd_id)\n"
@@ -246,7 +246,12 @@ static void cmd_help_long_parsed(void *parsed_result,
 
 			"set txpkts (x[,y]*)\n"
 			"    Set the length of each segment of TXONLY"
-			" packets.\n\n"
+			" and optionally CSUM packets.\n\n"
+
+			"set txsplit (off|on|rand)\n"
+			"    Set the split policy for the TX packets."
+			" Right now only applicable for CSUM and TXONLY"
+			" modes\n\n"
 
 			"set corelist (x[,y]*)\n"
 			"    Set the list of forwarding cores.\n\n"
@@ -2621,6 +2626,47 @@ cmdline_parse_inst_t cmd_set_txpkts = {
 	},
 };
 
+/* *** SET COPY AND SPLIT POLICY ON TX PACKETS *** */
+
+struct cmd_set_txsplit_result {
+	cmdline_fixed_string_t cmd_keyword;
+	cmdline_fixed_string_t txsplit;
+	cmdline_fixed_string_t mode;
+};
+
+static void
+cmd_set_txsplit_parsed(void *parsed_result,
+		      __attribute__((unused)) struct cmdline *cl,
+		      __attribute__((unused)) void *data)
+{
+	struct cmd_set_txsplit_result *res;
+
+	res = parsed_result;
+	set_tx_pkt_split(res->mode);
+}
+
+cmdline_parse_token_string_t cmd_set_txsplit_keyword =
+	TOKEN_STRING_INITIALIZER(struct cmd_set_txsplit_result,
+				 cmd_keyword, "set");
+cmdline_parse_token_string_t cmd_set_txsplit_name =
+	TOKEN_STRING_INITIALIZER(struct cmd_set_txsplit_result,
+				 txsplit, "txsplit");
+cmdline_parse_token_string_t cmd_set_txsplit_mode =
+	TOKEN_STRING_INITIALIZER(struct cmd_set_txsplit_result,
+				 mode, NULL);
+
+cmdline_parse_inst_t cmd_set_txsplit = {
+	.f = cmd_set_txsplit_parsed,
+	.data = NULL,
+	.help_str = "set txsplit on|off|rand",
+	.tokens = {
+		(void *)&cmd_set_txsplit_keyword,
+		(void *)&cmd_set_txsplit_name,
+		(void *)&cmd_set_txsplit_mode,
+		NULL,
+	},
+};
+
 /* *** ADD/REMOVE ALL VLAN IDENTIFIERS TO/FROM A PORT VLAN RX FILTER *** */
 struct cmd_rx_vlan_filter_all_result {
 	cmdline_fixed_string_t rx_vlan;
@@ -5233,6 +5279,8 @@ static void cmd_showcfg_parsed(void *parsed_result,
 		fwd_lcores_config_display();
 	else if (!strcmp(res->what, "fwd"))
 		fwd_config_display();
+	else if (!strcmp(res->what, "txpkts"))
+		show_tx_pkt_segments();
 }
 
 cmdline_parse_token_string_t cmd_showcfg_show =
@@ -5241,12 +5289,12 @@ cmdline_parse_token_string_t cmd_showcfg_port =
 	TOKEN_STRING_INITIALIZER(struct cmd_showcfg_result, cfg, "config");
 cmdline_parse_token_string_t cmd_showcfg_what =
 	TOKEN_STRING_INITIALIZER(struct cmd_showcfg_result, what,
-				 "rxtx#cores#fwd");
+				 "rxtx#cores#fwd#txpkts");
 
 cmdline_parse_inst_t cmd_showcfg = {
 	.f = cmd_showcfg_parsed,
 	.data = NULL,
-	.help_str = "show config rxtx|cores|fwd",
+	.help_str = "show config rxtx|cores|fwd|txpkts",
 	.tokens = {
 		(void *)&cmd_showcfg_show,
 		(void *)&cmd_showcfg_port,
@@ -9574,6 +9622,7 @@ cmdline_parse_ctx_t main_ctx[] = {
 	(cmdline_parse_inst_t *)&cmd_reset,
 	(cmdline_parse_inst_t *)&cmd_set_numbers,
 	(cmdline_parse_inst_t *)&cmd_set_txpkts,
+	(cmdline_parse_inst_t *)&cmd_set_txsplit,
 	(cmdline_parse_inst_t *)&cmd_set_fwd_list,
 	(cmdline_parse_inst_t *)&cmd_set_fwd_mask,
 	(cmdline_parse_inst_t *)&cmd_set_fwd_mode,
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 938b456..8ec7d83 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -97,6 +97,24 @@
 
 static char *flowtype_to_str(uint16_t flow_type);
 
+static const struct {
+	enum tx_pkt_split split;
+	const char *name;
+} tx_split_name[] = {
+	{
+		.split = TX_PKT_SPLIT_OFF,
+		.name = "off",
+	},
+	{
+		.split = TX_PKT_SPLIT_ON,
+		.name = "on",
+	},
+	{
+		.split = TX_PKT_SPLIT_RND,
+		.name = "rand",
+	},
+};
+
 struct rss_type_info {
 	char str[32];
 	uint64_t rss_type;
@@ -1582,6 +1600,49 @@ set_nb_pkt_per_burst(uint16_t nb)
 	       (unsigned int) nb_pkt_per_burst);
 }
 
+static const char *
+tx_split_get_name(enum tx_pkt_split split)
+{
+	uint32_t i;
+
+	for (i = 0; i != RTE_DIM(tx_split_name); i++) {
+		if (tx_split_name[i].split == split)
+			return tx_split_name[i].name;
+	}
+	return NULL;
+}
+
+void
+set_tx_pkt_split(const char *name)
+{
+	uint32_t i;
+
+	for (i = 0; i != RTE_DIM(tx_split_name); i++) {
+		if (strcmp(tx_split_name[i].name, name) == 0) {
+			tx_pkt_split = tx_split_name[i].split;
+			return;
+		}
+	}
+	printf("unknown value: \"%s\"\n", name);
+}
+
+void
+show_tx_pkt_segments(void)
+{
+	uint32_t i, n;
+	const char *split;
+
+	n = tx_pkt_nb_segs;
+	split = tx_split_get_name(tx_pkt_split);
+
+	printf("Number of segments: %u\n", n);
+	printf("Segment sizes: ");
+	for (i = 0; i != n - 1; i++)
+		printf("%hu,", tx_pkt_seg_lengths[i]);
+	printf("%hu\n", tx_pkt_seg_lengths[i]);
+	printf("Split packet: %s\n", split);
+}
+
 void
 set_tx_pkt_segments(unsigned *seg_lengths, unsigned nb_segs)
 {
diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index c9c095d..7e4f662 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -457,6 +457,155 @@ process_outer_cksums(void *outer_l3_hdr, struct testpmd_offload_info *info,
 }
 
 /*
+ * Helper function.
+ * Performs actual copying.
+ * Returns number of segments in the destination mbuf on success,
+ * or negative error code on failure.
+ */
+static int
+mbuf_copy_split(const struct rte_mbuf *ms, struct rte_mbuf *md[],
+	uint16_t seglen[], uint8_t nb_seg)
+{
+	uint32_t dlen, slen, tlen;
+	uint32_t i, len;
+	const struct rte_mbuf *m;
+	const uint8_t *src;
+	uint8_t *dst;
+
+	dlen = 0;
+	slen = 0;
+	tlen = 0;
+
+	dst = NULL;
+	src = NULL;
+
+	m = ms;
+	i = 0;
+	while (ms != NULL && i != nb_seg) {
+
+		if (slen == 0) {
+			slen = rte_pktmbuf_data_len(ms);
+			src = rte_pktmbuf_mtod(ms, const uint8_t *);
+		}
+
+		if (dlen == 0) {
+			dlen = RTE_MIN(seglen[i], slen);
+			md[i]->data_len = dlen;
+			md[i]->next = (i + 1 == nb_seg) ? NULL : md[i + 1];
+			dst = rte_pktmbuf_mtod(md[i], uint8_t *);
+		}
+
+		len = RTE_MIN(slen, dlen);
+		memcpy(dst, src, len);
+		tlen += len;
+		slen -= len;
+		dlen -= len;
+		src += len;
+		dst += len;
+
+		if (slen == 0)
+			ms = ms->next;
+		if (dlen == 0)
+			i++;
+	}
+
+	if (ms != NULL)
+		return -ENOBUFS;
+	else if (tlen != m->pkt_len)
+		return -EINVAL;
+
+	md[0]->nb_segs = nb_seg;
+	md[0]->pkt_len = tlen;
+	md[0]->vlan_tci = m->vlan_tci;
+	md[0]->vlan_tci_outer = m->vlan_tci_outer;
+	md[0]->ol_flags = m->ol_flags;
+	md[0]->tx_offload = m->tx_offload;
+
+	return nb_seg;
+}
+
+/*
+ * Allocate a new mbuf with up to tx_pkt_nb_segs segments.
+ * Copy packet contents and offload information into the new segmented mbuf.
+ */
+static struct rte_mbuf *
+pkt_copy_split(const struct rte_mbuf *pkt)
+{
+	int32_t n, rc;
+	uint32_t i, len, nb_seg;
+	struct rte_mempool *mp;
+	uint16_t seglen[RTE_MAX_SEGS_PER_PKT];
+	struct rte_mbuf *p, *md[RTE_MAX_SEGS_PER_PKT];
+
+	mp = current_fwd_lcore()->mbp;
+
+	if (tx_pkt_split == TX_PKT_SPLIT_RND)
+		nb_seg = random() % tx_pkt_nb_segs + 1;
+	else
+		nb_seg = tx_pkt_nb_segs;
+
+	memcpy(seglen, tx_pkt_seg_lengths, nb_seg * sizeof(seglen[0]));
+
+	/* calculate number of segments to use and their length. */
+	len = 0;
+	for (i = 0; i != nb_seg && len < pkt->pkt_len; i++) {
+		len += seglen[i];
+		md[i] = NULL;
+	}
+
+	n = pkt->pkt_len - len;
+
+	/* update size of the last segment to fit rest of the packet */
+	if (n >= 0) {
+		seglen[i - 1] += n;
+		len += n;
+	}
+
+	nb_seg = i;
+	while (i != 0) {
+		p = rte_pktmbuf_alloc(mp);
+		if (p == NULL) {
+			RTE_LOG(ERR, USER1,
+				"failed to allocate %u-th of %u mbuf "
+				"from mempool: %s\n",
+				nb_seg - i, nb_seg, mp->name);
+			break;
+		}
+
+		md[--i] = p;
+		if (rte_pktmbuf_tailroom(md[i]) < seglen[i]) {
+			RTE_LOG(ERR, USER1, "mempool %s, %u-th segment: "
+				"expected seglen: %u, "
+				"actual mbuf tailroom: %u\n",
+				mp->name, i, seglen[i],
+				rte_pktmbuf_tailroom(md[i]));
+			break;
+		}
+	}
+
+	/* all mbufs successfully allocated, do copy */
+	if (i == 0) {
+		rc = mbuf_copy_split(pkt, md, seglen, nb_seg);
+		if (rc < 0)
+			RTE_LOG(ERR, USER1,
+				"mbuf_copy_split for %p(len=%u, nb_seg=%hhu) "
+				"into %u segments failed with error code: %d\n",
+				pkt, pkt->pkt_len, pkt->nb_segs, nb_seg, rc);
+
+		/* figure out how many mbufs to free. */
+		i = RTE_MAX(rc, 0);
+	}
+
+	/* free unused mbufs */
+	for (; i != nb_seg; i++) {
+		rte_pktmbuf_free_seg(md[i]);
+		md[i] = NULL;
+	}
+
+	return md[0];
+}
+
+/*
  * Receive a burst of packets, and for each packet:
  *  - parse packet, and try to recognize a supported packet type (1)
  *  - if it's not a supported packet type, don't touch the packet, else:
@@ -486,7 +635,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
 {
 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
 	struct rte_port *txp;
-	struct rte_mbuf *m;
+	struct rte_mbuf *m, *p;
 	struct ether_hdr *eth_hdr;
 	void *l3_hdr = NULL, *outer_l3_hdr = NULL; /* can be IPv4 or IPv6 */
 	uint16_t nb_rx;
@@ -627,6 +776,16 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
 		m->tso_segsz = info.tso_segsz;
 		m->ol_flags = ol_flags;
 
+		/* Do split & copy for the packet. */
+		if (tx_pkt_split != TX_PKT_SPLIT_OFF) {
+			p = pkt_copy_split(m);
+			if (p != NULL) {
+				rte_pktmbuf_free(m);
+				m = p;
+				pkts_burst[i] = m;
+			}
+		}
+
 		/* if verbose mode is enabled, dump debug info */
 		if (verbose_level > 0) {
 			struct {
@@ -648,6 +807,8 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
 			const char *name;
 
 			printf("-----------------\n");
+			printf("mbuf=%p, pkt_len=%u, nb_segs=%hhu:\n",
+				m, m->pkt_len, m->nb_segs);
 			/* dump rx parsed packet info */
 			printf("rx: l2_len=%d ethertype=%x l3_len=%d "
 				"l4_proto=%d l4_len=%d\n",
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 2e302bb..1ee935b 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -173,6 +173,9 @@ uint16_t tx_pkt_seg_lengths[RTE_MAX_SEGS_PER_PKT] = {
 };
 uint8_t  tx_pkt_nb_segs = 1; /**< Number of segments in TXONLY packets */
 
+enum tx_pkt_split tx_pkt_split = TX_PKT_SPLIT_OFF;
+/**< Split policy for packets to TX. */
+
 uint16_t nb_pkt_per_burst = DEF_PKT_BURST; /**< Number of packets per burst. */
 uint16_t mb_mempool_cache = DEF_MBUF_CACHE; /**< Size of mbuf mempool cache. */
 
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index d6742d6..ee7de98 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -361,6 +361,14 @@ extern uint16_t tx_pkt_length; /**< Length of TXONLY packet */
 extern uint16_t tx_pkt_seg_lengths[RTE_MAX_SEGS_PER_PKT]; /**< Seg. lengths */
 extern uint8_t  tx_pkt_nb_segs; /**< Number of segments in TX packets */
 
+enum tx_pkt_split {
+	TX_PKT_SPLIT_OFF,
+	TX_PKT_SPLIT_ON,
+	TX_PKT_SPLIT_RND,
+};
+
+extern enum tx_pkt_split tx_pkt_split;
+
 extern uint16_t nb_pkt_per_burst;
 extern uint16_t mb_mempool_cache;
 extern int8_t rx_pthresh;
@@ -509,6 +517,8 @@ void set_qmap(portid_t port_id, uint8_t is_rx, uint16_t queue_id, uint8_t map_va
 
 void set_verbose_level(uint16_t vb_level);
 void set_tx_pkt_segments(unsigned *seg_lengths, unsigned nb_segs);
+void show_tx_pkt_segments(void);
+void set_tx_pkt_split(const char *name);
 void set_nb_pkt_per_burst(uint16_t pkt_burst);
 char *list_pkt_forwarding_modes(void);
 void set_pkt_forwarding_mode(const char *fwd_mode);
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index db8f37a..a903d4f 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -210,6 +210,7 @@ pkt_burst_transmit(struct fwd_stream *fs)
 	uint64_t end_tsc;
 	uint64_t core_cycles;
 #endif
+	uint32_t nb_segs, pkt_len;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	start_tsc = rte_rdtsc();
@@ -233,7 +234,12 @@ pkt_burst_transmit(struct fwd_stream *fs)
 		}
 		pkt->data_len = tx_pkt_seg_lengths[0];
 		pkt_seg = pkt;
-		for (i = 1; i < tx_pkt_nb_segs; i++) {
+		if (tx_pkt_split == TX_PKT_SPLIT_RND)
+			nb_segs = random() % tx_pkt_nb_segs + 1;
+		else
+			nb_segs = tx_pkt_nb_segs;
+		pkt_len = pkt->data_len;
+		for (i = 1; i < nb_segs; i++) {
 			pkt_seg->next = tx_mbuf_alloc(mbp);
 			if (pkt_seg->next == NULL) {
 				pkt->nb_segs = i;
@@ -242,6 +248,7 @@ pkt_burst_transmit(struct fwd_stream *fs)
 			}
 			pkt_seg = pkt_seg->next;
 			pkt_seg->data_len = tx_pkt_seg_lengths[i];
+			pkt_len += pkt_seg->data_len;
 		}
 		pkt_seg->next = NULL; /* Last segment of packet. */
 
@@ -266,8 +273,8 @@ pkt_burst_transmit(struct fwd_stream *fs)
 		 * Complete first mbuf of packet and append it to the
 		 * burst of packets to be transmitted.
 		 */
-		pkt->nb_segs = tx_pkt_nb_segs;
-		pkt->pkt_len = tx_pkt_length;
+		pkt->nb_segs = nb_segs;
+		pkt->pkt_len = pkt_len;
 		pkt->ol_flags = ol_flags;
 		pkt->vlan_tci = vlan_tci;
 		pkt->vlan_tci_outer = vlan_tci_outer;
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 4fb1e0b..3a326f9 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -211,7 +211,7 @@ show config
 Displays the configuration of the application.
 The configuration comes from the command-line, the runtime or the application defaults::
 
-   testpmd> show config (rxtx|cores|fwd)
+   testpmd> show config (rxtx|cores|fwd|txpkts)
 
 The available information categories are:
 
@@ -221,6 +221,8 @@ The available information categories are:
 
 * ``fwd``: Packet forwarding configuration.
 
+* ``txpkts``: Packets to TX configuration.
+
 For example:
 
 .. code-block:: console
@@ -396,6 +398,23 @@ Set the length of each segment of the TX-ONLY packets::
 
 Where x[,y]* represents a CSV list of values, without white space.
 
+set txsplit
+~~~~~~~~~~~
+
+Set the split policy for the TX packets, applicable for TX-ONLY and CSUM forwarding modes::
+
+   testpmd> set txsplit (off|on|rand)
+
+Where:
+
+* ``off`` disable packet copy & split for CSUM mode.
+
+* ``on`` split outgoing packet into multiple segments. Size of each segment
+  and number of segments per packet is determined by ``set txpkts`` command
+  (see above).
+
+* ``rand`` same as 'on', but number of segments per each packet is a random value between 1 and total number of segments.
+
 set corelist
 ~~~~~~~~~~~~
 
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [dpdk-dev] [PATCHv6 2/2] ixgbe: fix TX hang when RS distance exceeds HW limit
  2015-08-20 15:37 [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598 Vlad Zolotarov
                   ` (2 preceding siblings ...)
  2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 1/2] testpmd: add ability to split outgoing packets Konstantin Ananyev
@ 2015-11-09 19:21 ` Konstantin Ananyev
  2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 0/2] " Konstantin Ananyev
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Konstantin Ananyev @ 2015-11-09 19:21 UTC (permalink / raw)
  To: dev

One of the ways to reproduce the issue:

testpmd <EAL-OPTIONS> -- -i --txqflags=0
testpmd> set fwd txonly
testpmd> set txpkts 64,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
testpmd> set txsplit rand
testpmd> start

After some time TX on ixgbe queue will hang,
and all packet transmission on that queue will stop.

This bug was first reported and investigated by
Vlad Zolotarov <vladz@cloudius-systems.com>:
"We can reproduce this issue when stressed the xmit path with a lot of highly
fragmented TCP frames (packets with up to 33 fragments with non-headers
fragments as small as 4 bytes) with all offload features enabled."

The root cause is that ixgbe_xmit_pkts() in some cases violates the HW rule
that the distance between TDs with RS bit set should not exceed 40 TDs.

From the latest 82599 spec update:
"When WTHRESH is set to zero, the software device driver should set the RS bit
in the Tx descriptors with the EOP bit set and at least once in the 40
descriptors."

The fix is to make sure that the distance between TDs with RS bit set
would never exceed HW limit.
As part of that fix, tx_rs_thresh for ixgbe PMD is not allowed to be greater
than 32 to comply with HW restrictions.

With that fix slight slowdown for the full-featured ixgbe TX path
might be observed (from our testing - up to 4%).

ixgbe simple TX path is unaffected by that patch.

v5 changes:
- rework the patch to avoid setting RS bit on every EOP descriptor
 (while that approach is valid, it causes significant slowdown
  on the TX path: up to 25%).

v6 changes:
- fix pmd_perf_autotest
- fix error description
- update RN

Reported-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_pmd_perf.c             |  8 ++++----
 doc/guides/rel_notes/release_2_2.rst |  7 +++++++
 drivers/net/ixgbe/ixgbe_rxtx.c       | 32 +++++++++++++++++++++++++++-----
 3 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/app/test/test_pmd_perf.c b/app/test/test_pmd_perf.c
index 1fd6843..ef9262c 100644
--- a/app/test/test_pmd_perf.c
+++ b/app/test/test_pmd_perf.c
@@ -841,10 +841,10 @@ test_set_rxtx_conf(cmdline_fixed_string_t mode)
 		port_conf.rxmode.enable_scatter = 0;
 		return 0;
 	} else if (!strcmp(mode, "scalar")) {
-		/* bulk alloc rx, simple tx */
-		tx_conf.txq_flags = 0xf01;
-		tx_conf.tx_rs_thresh = 128;
-		tx_conf.tx_free_thresh = 128;
+		/* bulk alloc rx, full-featured tx */
+		tx_conf.txq_flags = 0;
+		tx_conf.tx_rs_thresh = 32;
+		tx_conf.tx_free_thresh = 32;
 		port_conf.rxmode.hw_ip_checksum = 1;
 		port_conf.rxmode.enable_scatter = 0;
 		return 0;
diff --git a/doc/guides/rel_notes/release_2_2.rst b/doc/guides/rel_notes/release_2_2.rst
index 59dda59..62e225b 100644
--- a/doc/guides/rel_notes/release_2_2.rst
+++ b/doc/guides/rel_notes/release_2_2.rst
@@ -134,6 +134,13 @@ Drivers
 
   VF needs the PF interrupt support initialized even if not started.
 
+* **ixgbe: Fixed TX hang when RS distance exceeds HW limit.**
+
+  Fixed an issue when TX queue can hang when a lot of highly fragmented
+  packets have to be sent.
+  As part of that fix, tx_rs_thresh for ixgbe PMD is not allowed to be greater
+  than 32 to comply with HW restrictions.
+
 * **i40e: Fixed base driver allocation when not using first numa node.**
 
   Fixed i40e issue that occurred when a DPDK application didn't initialize
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 5561195..ca6fb69 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -572,7 +572,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	struct ixgbe_tx_entry *sw_ring;
 	struct ixgbe_tx_entry *txe, *txn;
 	volatile union ixgbe_adv_tx_desc *txr;
-	volatile union ixgbe_adv_tx_desc *txd;
+	volatile union ixgbe_adv_tx_desc *txd, *txp;
 	struct rte_mbuf     *tx_pkt;
 	struct rte_mbuf     *m_seg;
 	uint64_t buf_dma_addr;
@@ -595,6 +595,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	txr     = txq->tx_ring;
 	tx_id   = txq->tx_tail;
 	txe = &sw_ring[tx_id];
+	txp = NULL;
 
 	/* Determine if the descriptor ring needs to be cleaned. */
 	if (txq->nb_tx_free < txq->tx_free_thresh)
@@ -638,6 +639,12 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		 */
 		nb_used = (uint16_t)(tx_pkt->nb_segs + new_ctx);
 
+		if (txp != NULL &&
+				nb_used + txq->nb_tx_used >= txq->tx_rs_thresh)
+			/* set RS on the previous packet in the burst */
+			txp->read.cmd_type_len |=
+				rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
+
 		/*
 		 * The number of descriptors that must be allocated for a
 		 * packet is the number of segments of that packet, plus 1
@@ -840,10 +847,18 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 			/* Update txq RS bit counters */
 			txq->nb_tx_used = 0;
-		}
+			txp = NULL;
+		} else
+			txp = txd;
+
 		txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
 	}
+
 end_of_tx:
+	/* set RS on last packet in the burst */
+	if (txp != NULL)
+		txp->read.cmd_type_len |= rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
+
 	rte_wmb();
 
 	/*
@@ -2019,9 +2034,16 @@ ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
 			tx_conf->tx_free_thresh : DEFAULT_TX_FREE_THRESH);
 	if (tx_rs_thresh >= (nb_desc - 2)) {
 		PMD_INIT_LOG(ERR, "tx_rs_thresh must be less than the number "
-			     "of TX descriptors minus 2. (tx_rs_thresh=%u "
-			     "port=%d queue=%d)", (unsigned int)tx_rs_thresh,
-			     (int)dev->data->port_id, (int)queue_idx);
+			"of TX descriptors minus 2. (tx_rs_thresh=%u "
+			"port=%d queue=%d)", (unsigned int)tx_rs_thresh,
+			(int)dev->data->port_id, (int)queue_idx);
+		return -(EINVAL);
+	}
+	if (tx_rs_thresh > DEFAULT_TX_RS_THRESH) {
+		PMD_INIT_LOG(ERR, "tx_rs_thresh must be less or equal than %u. "
+			"(tx_rs_thresh=%u port=%d queue=%d)",
+			DEFAULT_TX_RS_THRESH, (unsigned int)tx_rs_thresh,
+			(int)dev->data->port_id, (int)queue_idx);
 		return -(EINVAL);
 	}
 	if (tx_free_thresh >= (nb_desc - 3)) {
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [dpdk-dev] [PATCHv7 0/2] ixgbe: fix TX hang when RS distance exceeds HW limit
  2015-08-20 15:37 [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598 Vlad Zolotarov
                   ` (3 preceding siblings ...)
  2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 2/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev
@ 2015-11-10 13:48 ` Konstantin Ananyev
  2015-11-10 14:06   ` De Lara Guarch, Pablo
  2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 1/2] testpmd: add ability to split outgoing packets Konstantin Ananyev
  2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 2/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev
  6 siblings, 1 reply; 15+ messages in thread
From: Konstantin Ananyev @ 2015-11-10 13:48 UTC (permalink / raw)
  To: dev

First patch contains changes in testpmd that allow reproducing the issue.
Second patch is the actual fix.

Konstantin Ananyev (2):
  testpmd: add ability to split outgoing packets
  ixgbe: fix TX hang when RS distance exceeds HW limit

 app/test-pmd/cmdline.c                      |  57 +++++++++-
 app/test-pmd/config.c                       |  61 +++++++++++
 app/test-pmd/csumonly.c                     | 163 +++++++++++++++++++++++++++-
 app/test-pmd/testpmd.c                      |   3 +
 app/test-pmd/testpmd.h                      |  10 ++
 app/test-pmd/txonly.c                       |  13 ++-
 app/test/test_pmd_perf.c                    |   8 +-
 doc/guides/rel_notes/release_2_2.rst        |   7 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  21 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  32 +++++-
 10 files changed, 357 insertions(+), 18 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [dpdk-dev] [PATCHv7 1/2] testpmd: add ability to split outgoing packets
  2015-08-20 15:37 [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598 Vlad Zolotarov
                   ` (4 preceding siblings ...)
  2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 0/2] " Konstantin Ananyev
@ 2015-11-10 13:48 ` Konstantin Ananyev
  2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 2/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev
  6 siblings, 0 replies; 15+ messages in thread
From: Konstantin Ananyev @ 2015-11-10 13:48 UTC (permalink / raw)
  To: dev

For CSUM forwarding mode add ability to copy & split outgoing packet
into the new mbuf that consists of multiple segments.
For TXONLY and CSUM forwarding modes add ability to make number of
segments in the outgoing packet to vary on a per packet basis.
Number of segments and size of each segment is controlled by
'set txpkts' command.
Split policy is controlled by 'set txsplit' command.
Possible values are: on | off | rand.
That allows increasing test coverage for TX PMD codepaths.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test-pmd/cmdline.c                      |  57 +++++++++-
 app/test-pmd/config.c                       |  61 +++++++++++
 app/test-pmd/csumonly.c                     | 163 +++++++++++++++++++++++++++-
 app/test-pmd/testpmd.c                      |   3 +
 app/test-pmd/testpmd.h                      |  10 ++
 app/test-pmd/txonly.c                       |  13 ++-
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  21 +++-
 7 files changed, 319 insertions(+), 9 deletions(-)

v6 changes:
- fix typos
- testpmd guide: fix invalid command description

v7 changes:
- move vN changes after the changed file list

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index c637198..a92fe0b 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -199,7 +199,7 @@ static void cmd_help_long_parsed(void *parsed_result,
 			"clear port (info|stats|xstats|fdir|stat_qmap) (port_id|all)\n"
 			"    Clear information for port_id, or all.\n\n"
 
-			"show config (rxtx|cores|fwd)\n"
+			"show config (rxtx|cores|fwd|txpkts)\n"
 			"    Display the given configuration.\n\n"
 
 			"read rxd (port_id) (queue_id) (rxd_id)\n"
@@ -246,7 +246,12 @@ static void cmd_help_long_parsed(void *parsed_result,
 
 			"set txpkts (x[,y]*)\n"
 			"    Set the length of each segment of TXONLY"
-			" packets.\n\n"
+			" and optionally CSUM packets.\n\n"
+
+			"set txsplit (off|on|rand)\n"
+			"    Set the split policy for the TX packets."
+			" Right now only applicable for CSUM and TXONLY"
+			" modes\n\n"
 
 			"set corelist (x[,y]*)\n"
 			"    Set the list of forwarding cores.\n\n"
@@ -2621,6 +2626,47 @@ cmdline_parse_inst_t cmd_set_txpkts = {
 	},
 };
 
+/* *** SET COPY AND SPLIT POLICY ON TX PACKETS *** */
+
+struct cmd_set_txsplit_result {
+	cmdline_fixed_string_t cmd_keyword;
+	cmdline_fixed_string_t txsplit;
+	cmdline_fixed_string_t mode;
+};
+
+static void
+cmd_set_txsplit_parsed(void *parsed_result,
+		      __attribute__((unused)) struct cmdline *cl,
+		      __attribute__((unused)) void *data)
+{
+	struct cmd_set_txsplit_result *res;
+
+	res = parsed_result;
+	set_tx_pkt_split(res->mode);
+}
+
+cmdline_parse_token_string_t cmd_set_txsplit_keyword =
+	TOKEN_STRING_INITIALIZER(struct cmd_set_txsplit_result,
+				 cmd_keyword, "set");
+cmdline_parse_token_string_t cmd_set_txsplit_name =
+	TOKEN_STRING_INITIALIZER(struct cmd_set_txsplit_result,
+				 txsplit, "txsplit");
+cmdline_parse_token_string_t cmd_set_txsplit_mode =
+	TOKEN_STRING_INITIALIZER(struct cmd_set_txsplit_result,
+				 mode, NULL);
+
+cmdline_parse_inst_t cmd_set_txsplit = {
+	.f = cmd_set_txsplit_parsed,
+	.data = NULL,
+	.help_str = "set txsplit on|off|rand",
+	.tokens = {
+		(void *)&cmd_set_txsplit_keyword,
+		(void *)&cmd_set_txsplit_name,
+		(void *)&cmd_set_txsplit_mode,
+		NULL,
+	},
+};
+
 /* *** ADD/REMOVE ALL VLAN IDENTIFIERS TO/FROM A PORT VLAN RX FILTER *** */
 struct cmd_rx_vlan_filter_all_result {
 	cmdline_fixed_string_t rx_vlan;
@@ -5233,6 +5279,8 @@ static void cmd_showcfg_parsed(void *parsed_result,
 		fwd_lcores_config_display();
 	else if (!strcmp(res->what, "fwd"))
 		fwd_config_display();
+	else if (!strcmp(res->what, "txpkts"))
+		show_tx_pkt_segments();
 }
 
 cmdline_parse_token_string_t cmd_showcfg_show =
@@ -5241,12 +5289,12 @@ cmdline_parse_token_string_t cmd_showcfg_port =
 	TOKEN_STRING_INITIALIZER(struct cmd_showcfg_result, cfg, "config");
 cmdline_parse_token_string_t cmd_showcfg_what =
 	TOKEN_STRING_INITIALIZER(struct cmd_showcfg_result, what,
-				 "rxtx#cores#fwd");
+				 "rxtx#cores#fwd#txpkts");
 
 cmdline_parse_inst_t cmd_showcfg = {
 	.f = cmd_showcfg_parsed,
 	.data = NULL,
-	.help_str = "show config rxtx|cores|fwd",
+	.help_str = "show config rxtx|cores|fwd|txpkts",
 	.tokens = {
 		(void *)&cmd_showcfg_show,
 		(void *)&cmd_showcfg_port,
@@ -9574,6 +9622,7 @@ cmdline_parse_ctx_t main_ctx[] = {
 	(cmdline_parse_inst_t *)&cmd_reset,
 	(cmdline_parse_inst_t *)&cmd_set_numbers,
 	(cmdline_parse_inst_t *)&cmd_set_txpkts,
+	(cmdline_parse_inst_t *)&cmd_set_txsplit,
 	(cmdline_parse_inst_t *)&cmd_set_fwd_list,
 	(cmdline_parse_inst_t *)&cmd_set_fwd_mask,
 	(cmdline_parse_inst_t *)&cmd_set_fwd_mode,
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 938b456..8ec7d83 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -97,6 +97,24 @@
 
 static char *flowtype_to_str(uint16_t flow_type);
 
+static const struct {
+	enum tx_pkt_split split;
+	const char *name;
+} tx_split_name[] = {
+	{
+		.split = TX_PKT_SPLIT_OFF,
+		.name = "off",
+	},
+	{
+		.split = TX_PKT_SPLIT_ON,
+		.name = "on",
+	},
+	{
+		.split = TX_PKT_SPLIT_RND,
+		.name = "rand",
+	},
+};
+
 struct rss_type_info {
 	char str[32];
 	uint64_t rss_type;
@@ -1582,6 +1600,49 @@ set_nb_pkt_per_burst(uint16_t nb)
 	       (unsigned int) nb_pkt_per_burst);
 }
 
+static const char *
+tx_split_get_name(enum tx_pkt_split split)
+{
+	uint32_t i;
+
+	for (i = 0; i != RTE_DIM(tx_split_name); i++) {
+		if (tx_split_name[i].split == split)
+			return tx_split_name[i].name;
+	}
+	return NULL;
+}
+
+void
+set_tx_pkt_split(const char *name)
+{
+	uint32_t i;
+
+	for (i = 0; i != RTE_DIM(tx_split_name); i++) {
+		if (strcmp(tx_split_name[i].name, name) == 0) {
+			tx_pkt_split = tx_split_name[i].split;
+			return;
+		}
+	}
+	printf("unknown value: \"%s\"\n", name);
+}
+
+void
+show_tx_pkt_segments(void)
+{
+	uint32_t i, n;
+	const char *split;
+
+	n = tx_pkt_nb_segs;
+	split = tx_split_get_name(tx_pkt_split);
+
+	printf("Number of segments: %u\n", n);
+	printf("Segment sizes: ");
+	for (i = 0; i != n - 1; i++)
+		printf("%hu,", tx_pkt_seg_lengths[i]);
+	printf("%hu\n", tx_pkt_seg_lengths[i]);
+	printf("Split packet: %s\n", split);
+}
+
 void
 set_tx_pkt_segments(unsigned *seg_lengths, unsigned nb_segs)
 {
diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index c9c095d..7e4f662 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -457,6 +457,155 @@ process_outer_cksums(void *outer_l3_hdr, struct testpmd_offload_info *info,
 }
 
 /*
+ * Helper function.
+ * Performs actual copying.
+ * Returns number of segments in the destination mbuf on success,
+ * or negative error code on failure.
+ */
+static int
+mbuf_copy_split(const struct rte_mbuf *ms, struct rte_mbuf *md[],
+	uint16_t seglen[], uint8_t nb_seg)
+{
+	uint32_t dlen, slen, tlen;
+	uint32_t i, len;
+	const struct rte_mbuf *m;
+	const uint8_t *src;
+	uint8_t *dst;
+
+	dlen = 0;
+	slen = 0;
+	tlen = 0;
+
+	dst = NULL;
+	src = NULL;
+
+	m = ms;
+	i = 0;
+	while (ms != NULL && i != nb_seg) {
+
+		if (slen == 0) {
+			slen = rte_pktmbuf_data_len(ms);
+			src = rte_pktmbuf_mtod(ms, const uint8_t *);
+		}
+
+		if (dlen == 0) {
+			dlen = RTE_MIN(seglen[i], slen);
+			md[i]->data_len = dlen;
+			md[i]->next = (i + 1 == nb_seg) ? NULL : md[i + 1];
+			dst = rte_pktmbuf_mtod(md[i], uint8_t *);
+		}
+
+		len = RTE_MIN(slen, dlen);
+		memcpy(dst, src, len);
+		tlen += len;
+		slen -= len;
+		dlen -= len;
+		src += len;
+		dst += len;
+
+		if (slen == 0)
+			ms = ms->next;
+		if (dlen == 0)
+			i++;
+	}
+
+	if (ms != NULL)
+		return -ENOBUFS;
+	else if (tlen != m->pkt_len)
+		return -EINVAL;
+
+	md[0]->nb_segs = nb_seg;
+	md[0]->pkt_len = tlen;
+	md[0]->vlan_tci = m->vlan_tci;
+	md[0]->vlan_tci_outer = m->vlan_tci_outer;
+	md[0]->ol_flags = m->ol_flags;
+	md[0]->tx_offload = m->tx_offload;
+
+	return nb_seg;
+}
+
+/*
+ * Allocate a new mbuf with up to tx_pkt_nb_segs segments.
+ * Copy packet contents and offload information into then new segmented mbuf.
+ */
+static struct rte_mbuf *
+pkt_copy_split(const struct rte_mbuf *pkt)
+{
+	int32_t n, rc;
+	uint32_t i, len, nb_seg;
+	struct rte_mempool *mp;
+	uint16_t seglen[RTE_MAX_SEGS_PER_PKT];
+	struct rte_mbuf *p, *md[RTE_MAX_SEGS_PER_PKT];
+
+	mp = current_fwd_lcore()->mbp;
+
+	if (tx_pkt_split == TX_PKT_SPLIT_RND)
+		nb_seg = random() % tx_pkt_nb_segs + 1;
+	else
+		nb_seg = tx_pkt_nb_segs;
+
+	memcpy(seglen, tx_pkt_seg_lengths, nb_seg * sizeof(seglen[0]));
+
+	/* calculate number of segments to use and their length. */
+	len = 0;
+	for (i = 0; i != nb_seg && len < pkt->pkt_len; i++) {
+		len += seglen[i];
+		md[i] = NULL;
+	}
+
+	n = pkt->pkt_len - len;
+
+	/* update size of the last segment to fit rest of the packet */
+	if (n >= 0) {
+		seglen[i - 1] += n;
+		len += n;
+	}
+
+	nb_seg = i;
+	while (i != 0) {
+		p = rte_pktmbuf_alloc(mp);
+		if (p == NULL) {
+			RTE_LOG(ERR, USER1,
+				"failed to allocate %u-th of %u mbuf "
+				"from mempool: %s\n",
+				nb_seg - i, nb_seg, mp->name);
+			break;
+		}
+
+		md[--i] = p;
+		if (rte_pktmbuf_tailroom(md[i]) < seglen[i]) {
+			RTE_LOG(ERR, USER1, "mempool %s, %u-th segment: "
+				"expected seglen: %u, "
+				"actual mbuf tailroom: %u\n",
+				mp->name, i, seglen[i],
+				rte_pktmbuf_tailroom(md[i]));
+			break;
+		}
+	}
+
+	/* all mbufs successfully allocated, do copy */
+	if (i == 0) {
+		rc = mbuf_copy_split(pkt, md, seglen, nb_seg);
+		if (rc < 0)
+			RTE_LOG(ERR, USER1,
+				"mbuf_copy_split for %p(len=%u, nb_seg=%hhu) "
+				"into %u segments failed with error code: %d\n",
+				pkt, pkt->pkt_len, pkt->nb_segs, nb_seg, rc);
+
+		/* figure out how many mbufs to free. */
+		i = RTE_MAX(rc, 0);
+	}
+
+	/* free unused mbufs */
+	for (; i != nb_seg; i++) {
+		rte_pktmbuf_free_seg(md[i]);
+		md[i] = NULL;
+	}
+
+	return md[0];
+}
+
+/*
  * Receive a burst of packets, and for each packet:
  *  - parse packet, and try to recognize a supported packet type (1)
  *  - if it's not a supported packet type, don't touch the packet, else:
@@ -486,7 +635,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
 {
 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
 	struct rte_port *txp;
-	struct rte_mbuf *m;
+	struct rte_mbuf *m, *p;
 	struct ether_hdr *eth_hdr;
 	void *l3_hdr = NULL, *outer_l3_hdr = NULL; /* can be IPv4 or IPv6 */
 	uint16_t nb_rx;
@@ -627,6 +776,16 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
 		m->tso_segsz = info.tso_segsz;
 		m->ol_flags = ol_flags;
 
+		/* Do split & copy for the packet. */
+		if (tx_pkt_split != TX_PKT_SPLIT_OFF) {
+			p = pkt_copy_split(m);
+			if (p != NULL) {
+				rte_pktmbuf_free(m);
+				m = p;
+				pkts_burst[i] = m;
+			}
+		}
+
 		/* if verbose mode is enabled, dump debug info */
 		if (verbose_level > 0) {
 			struct {
@@ -648,6 +807,8 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
 			const char *name;
 
 			printf("-----------------\n");
+			printf("mbuf=%p, pkt_len=%u, nb_segs=%hhu:\n",
+				m, m->pkt_len, m->nb_segs);
 			/* dump rx parsed packet info */
 			printf("rx: l2_len=%d ethertype=%x l3_len=%d "
 				"l4_proto=%d l4_len=%d\n",
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 2e302bb..1ee935b 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -173,6 +173,9 @@ uint16_t tx_pkt_seg_lengths[RTE_MAX_SEGS_PER_PKT] = {
 };
 uint8_t  tx_pkt_nb_segs = 1; /**< Number of segments in TXONLY packets */
 
+enum tx_pkt_split tx_pkt_split = TX_PKT_SPLIT_OFF;
+/**< Split policy for packets to TX. */
+
 uint16_t nb_pkt_per_burst = DEF_PKT_BURST; /**< Number of packets per burst. */
 uint16_t mb_mempool_cache = DEF_MBUF_CACHE; /**< Size of mbuf mempool cache. */
 
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index d6742d6..ee7de98 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -361,6 +361,14 @@ extern uint16_t tx_pkt_length; /**< Length of TXONLY packet */
 extern uint16_t tx_pkt_seg_lengths[RTE_MAX_SEGS_PER_PKT]; /**< Seg. lengths */
 extern uint8_t  tx_pkt_nb_segs; /**< Number of segments in TX packets */
 
+enum tx_pkt_split {
+	TX_PKT_SPLIT_OFF,
+	TX_PKT_SPLIT_ON,
+	TX_PKT_SPLIT_RND,
+};
+
+extern enum tx_pkt_split tx_pkt_split;
+
 extern uint16_t nb_pkt_per_burst;
 extern uint16_t mb_mempool_cache;
 extern int8_t rx_pthresh;
@@ -509,6 +517,8 @@ void set_qmap(portid_t port_id, uint8_t is_rx, uint16_t queue_id, uint8_t map_va
 
 void set_verbose_level(uint16_t vb_level);
 void set_tx_pkt_segments(unsigned *seg_lengths, unsigned nb_segs);
+void show_tx_pkt_segments(void);
+void set_tx_pkt_split(const char *name);
 void set_nb_pkt_per_burst(uint16_t pkt_burst);
 char *list_pkt_forwarding_modes(void);
 void set_pkt_forwarding_mode(const char *fwd_mode);
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index db8f37a..a903d4f 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -210,6 +210,7 @@ pkt_burst_transmit(struct fwd_stream *fs)
 	uint64_t end_tsc;
 	uint64_t core_cycles;
 #endif
+	uint32_t nb_segs, pkt_len;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	start_tsc = rte_rdtsc();
@@ -233,7 +234,12 @@ pkt_burst_transmit(struct fwd_stream *fs)
 		}
 		pkt->data_len = tx_pkt_seg_lengths[0];
 		pkt_seg = pkt;
-		for (i = 1; i < tx_pkt_nb_segs; i++) {
+		if (tx_pkt_split == TX_PKT_SPLIT_RND)
+			nb_segs = random() % tx_pkt_nb_segs + 1;
+		else
+			nb_segs = tx_pkt_nb_segs;
+		pkt_len = pkt->data_len;
+		for (i = 1; i < nb_segs; i++) {
 			pkt_seg->next = tx_mbuf_alloc(mbp);
 			if (pkt_seg->next == NULL) {
 				pkt->nb_segs = i;
@@ -242,6 +248,7 @@ pkt_burst_transmit(struct fwd_stream *fs)
 			}
 			pkt_seg = pkt_seg->next;
 			pkt_seg->data_len = tx_pkt_seg_lengths[i];
+			pkt_len += pkt_seg->data_len;
 		}
 		pkt_seg->next = NULL; /* Last segment of packet. */
 
@@ -266,8 +273,8 @@ pkt_burst_transmit(struct fwd_stream *fs)
 		 * Complete first mbuf of packet and append it to the
 		 * burst of packets to be transmitted.
 		 */
-		pkt->nb_segs = tx_pkt_nb_segs;
-		pkt->pkt_len = tx_pkt_length;
+		pkt->nb_segs = nb_segs;
+		pkt->pkt_len = pkt_len;
 		pkt->ol_flags = ol_flags;
 		pkt->vlan_tci = vlan_tci;
 		pkt->vlan_tci_outer = vlan_tci_outer;
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 4fb1e0b..3a326f9 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -211,7 +211,7 @@ show config
 Displays the configuration of the application.
 The configuration comes from the command-line, the runtime or the application defaults::
 
-   testpmd> show config (rxtx|cores|fwd)
+   testpmd> show config (rxtx|cores|fwd|txpkts)
 
 The available information categories are:
 
@@ -221,6 +221,8 @@ The available information categories are:
 
 * ``fwd``: Packet forwarding configuration.
 
+* ``txpkts``: Packets to TX configuration.
+
 For example:
 
 .. code-block:: console
@@ -396,6 +398,23 @@ Set the length of each segment of the TX-ONLY packets::
 
 Where x[,y]* represents a CSV list of values, without white space.
 
+set txsplit
+~~~~~~~~~~~
+
+Set the split policy for the TX packets, applicable for TX-ONLY and CSUM forwarding modes::
+
+   testpmd> set txsplit (off|on|rand)
+
+Where:
+
+* ``off`` disable packet copy & split for CSUM mode.
+
+* ``on`` split outgoing packet into multiple segments. Size of each segment
+  and number of segments per packet is determined by ``set txpkts`` command
+  (see above).
+
+* ``rand`` same as 'on', but number of segments per each packet is a random value between 1 and total number of segments.
+
 set corelist
 ~~~~~~~~~~~~
 
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [dpdk-dev] [PATCHv7 2/2] ixgbe: fix TX hang when RS distance exceeds HW limit
  2015-08-20 15:37 [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598 Vlad Zolotarov
                   ` (5 preceding siblings ...)
  2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 1/2] testpmd: add ability to split outgoing packets Konstantin Ananyev
@ 2015-11-10 13:48 ` Konstantin Ananyev
  6 siblings, 0 replies; 15+ messages in thread
From: Konstantin Ananyev @ 2015-11-10 13:48 UTC (permalink / raw)
  To: dev

One of the ways to reproduce the issue:

testpmd <EAL-OPTIONS> -- -i --txqflags=0
testpmd> set fwd txonly
testpmd> set txpkts 64,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
testpmd> set txsplit rand
testpmd> start

After some time TX on ixgbe queue will hang,
and all packet transmission on that queue will stop.

This bug was first reported and investigated by
Vlad Zolotarov <vladz@cloudius-systems.com>:
"We can reproduce this issue when stressed the xmit path with a lot of highly
fragmented TCP frames (packets with up to 33 fragments with non-headers
fragments as small as 4 bytes) with all offload features enabled."

The root cause is that ixgbe_xmit_pkts() in some cases violates the HW rule
that the distance between TDs with RS bit set should not exceed 40 TDs.

From the latest 82599 spec update:
"When WTHRESH is set to zero, the software device driver should set the RS bit
in the Tx descriptors with the EOP bit set and at least once in the 40
descriptors."

The fix is to make sure that the distance between TDs with RS bit set
would never exceed HW limit.
As part of that fix, tx_rs_thresh for ixgbe PMD is not allowed to be greater
than 32 to comply with HW restrictions.

With that fix slight slowdown for the full-featured ixgbe TX path
might be observed (from our testing - up to 4%).

ixgbe simple TX path is unaffected by that patch.

Reported-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_pmd_perf.c             |  8 ++++----
 doc/guides/rel_notes/release_2_2.rst |  7 +++++++
 drivers/net/ixgbe/ixgbe_rxtx.c       | 32 +++++++++++++++++++++++++++-----
 3 files changed, 38 insertions(+), 9 deletions(-)

v5 changes:
- rework the patch to avoid setting RS bit on every EOP descriptor
 (while that approach is valid, it causes significant slowdown
  on the TX path: up to 25%).

v6 changes:
- fix pmd_perf_autotest
- fix error description
- update RN

v7 changes:
- move vN changes after the changed file list

diff --git a/app/test/test_pmd_perf.c b/app/test/test_pmd_perf.c
index 1fd6843..ef9262c 100644
--- a/app/test/test_pmd_perf.c
+++ b/app/test/test_pmd_perf.c
@@ -841,10 +841,10 @@ test_set_rxtx_conf(cmdline_fixed_string_t mode)
 		port_conf.rxmode.enable_scatter = 0;
 		return 0;
 	} else if (!strcmp(mode, "scalar")) {
-		/* bulk alloc rx, simple tx */
-		tx_conf.txq_flags = 0xf01;
-		tx_conf.tx_rs_thresh = 128;
-		tx_conf.tx_free_thresh = 128;
+		/* bulk alloc rx, full-featured tx */
+		tx_conf.txq_flags = 0;
+		tx_conf.tx_rs_thresh = 32;
+		tx_conf.tx_free_thresh = 32;
 		port_conf.rxmode.hw_ip_checksum = 1;
 		port_conf.rxmode.enable_scatter = 0;
 		return 0;
diff --git a/doc/guides/rel_notes/release_2_2.rst b/doc/guides/rel_notes/release_2_2.rst
index 59dda59..62e225b 100644
--- a/doc/guides/rel_notes/release_2_2.rst
+++ b/doc/guides/rel_notes/release_2_2.rst
@@ -134,6 +134,13 @@ Drivers
 
   VF needs the PF interrupt support initialized even if not started.
 
+* **ixgbe: Fixed TX hang when RS distance exceeds HW limit.**
+
+  Fixed an issue when TX queue can hang when a lot of highly fragmented
+  packets have to be sent.
+  As part of that fix, tx_rs_thresh for ixgbe PMD is not allowed to be greater
+  than 32 to comply with HW restrictions.
+
 * **i40e: Fixed base driver allocation when not using first numa node.**
 
   Fixed i40e issue that occurred when a DPDK application didn't initialize
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 5561195..ca6fb69 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -572,7 +572,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	struct ixgbe_tx_entry *sw_ring;
 	struct ixgbe_tx_entry *txe, *txn;
 	volatile union ixgbe_adv_tx_desc *txr;
-	volatile union ixgbe_adv_tx_desc *txd;
+	volatile union ixgbe_adv_tx_desc *txd, *txp;
 	struct rte_mbuf     *tx_pkt;
 	struct rte_mbuf     *m_seg;
 	uint64_t buf_dma_addr;
@@ -595,6 +595,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	txr     = txq->tx_ring;
 	tx_id   = txq->tx_tail;
 	txe = &sw_ring[tx_id];
+	txp = NULL;
 
 	/* Determine if the descriptor ring needs to be cleaned. */
 	if (txq->nb_tx_free < txq->tx_free_thresh)
@@ -638,6 +639,12 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		 */
 		nb_used = (uint16_t)(tx_pkt->nb_segs + new_ctx);
 
+		if (txp != NULL &&
+				nb_used + txq->nb_tx_used >= txq->tx_rs_thresh)
+			/* set RS on the previous packet in the burst */
+			txp->read.cmd_type_len |=
+				rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
+
 		/*
 		 * The number of descriptors that must be allocated for a
 		 * packet is the number of segments of that packet, plus 1
@@ -840,10 +847,18 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 			/* Update txq RS bit counters */
 			txq->nb_tx_used = 0;
-		}
+			txp = NULL;
+		} else
+			txp = txd;
+
 		txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
 	}
+
 end_of_tx:
+	/* set RS on last packet in the burst */
+	if (txp != NULL)
+		txp->read.cmd_type_len |= rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
+
 	rte_wmb();
 
 	/*
@@ -2019,9 +2034,16 @@ ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
 			tx_conf->tx_free_thresh : DEFAULT_TX_FREE_THRESH);
 	if (tx_rs_thresh >= (nb_desc - 2)) {
 		PMD_INIT_LOG(ERR, "tx_rs_thresh must be less than the number "
-			     "of TX descriptors minus 2. (tx_rs_thresh=%u "
-			     "port=%d queue=%d)", (unsigned int)tx_rs_thresh,
-			     (int)dev->data->port_id, (int)queue_idx);
+			"of TX descriptors minus 2. (tx_rs_thresh=%u "
+			"port=%d queue=%d)", (unsigned int)tx_rs_thresh,
+			(int)dev->data->port_id, (int)queue_idx);
+		return -(EINVAL);
+	}
+	if (tx_rs_thresh > DEFAULT_TX_RS_THRESH) {
+		PMD_INIT_LOG(ERR, "tx_rs_thresh must be less or equal than %u. "
+			"(tx_rs_thresh=%u port=%d queue=%d)",
+			DEFAULT_TX_RS_THRESH, (unsigned int)tx_rs_thresh,
+			(int)dev->data->port_id, (int)queue_idx);
 		return -(EINVAL);
 	}
 	if (tx_free_thresh >= (nb_desc - 3)) {
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [dpdk-dev] [PATCHv7 0/2] ixgbe: fix TX hang when RS distance exceeds HW limit
  2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 0/2] " Konstantin Ananyev
@ 2015-11-10 14:06   ` De Lara Guarch, Pablo
  2015-11-11 23:15     ` Thomas Monjalon
  0 siblings, 1 reply; 15+ messages in thread
From: De Lara Guarch, Pablo @ 2015-11-10 14:06 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Konstantin
> Ananyev
> Sent: Tuesday, November 10, 2015 1:48 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCHv7 0/2] ixgbe: fix TX hang when RS distance
> exceeds HW limit
> 
> First patch contains changes in testpmd that allow to reproduce the issue.
> Second patch is the actual fix.
> 
> Konstantin Ananyev (2):
>   testpmd: add ability to split outgoing packets
>   ixgbe: fix TX hang when RS distance exceeds HW limit
> 
>  app/test-pmd/cmdline.c                      |  57 +++++++++-
>  app/test-pmd/config.c                       |  61 +++++++++++
>  app/test-pmd/csumonly.c                     | 163
> +++++++++++++++++++++++++++-
>  app/test-pmd/testpmd.c                      |   3 +
>  app/test-pmd/testpmd.h                      |  10 ++
>  app/test-pmd/txonly.c                       |  13 ++-
>  app/test/test_pmd_perf.c                    |   8 +-
>  doc/guides/rel_notes/release_2_2.rst        |   7 ++
>  doc/guides/testpmd_app_ug/testpmd_funcs.rst |  21 +++-
>  drivers/net/ixgbe/ixgbe_rxtx.c              |  32 +++++-
>  10 files changed, 357 insertions(+), 18 deletions(-)
> 
> --
> 1.8.3.1

Series-acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [dpdk-dev] [PATCHv7 0/2] ixgbe: fix TX hang when RS distance exceeds HW limit
  2015-11-10 14:06   ` De Lara Guarch, Pablo
@ 2015-11-11 23:15     ` Thomas Monjalon
  0 siblings, 0 replies; 15+ messages in thread
From: Thomas Monjalon @ 2015-11-11 23:15 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev

> > First patch contains changes in testpmd that allow to reproduce the issue.
> > Second patch is the actual fix.
> > 
> > Konstantin Ananyev (2):
> >   testpmd: add ability to split outgoing packets
> >   ixgbe: fix TX hang when RS distance exceeds HW limit
> 
> Series-acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2015-11-11 23:17 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-20 15:37 [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598 Vlad Zolotarov
2015-08-24  8:11 ` Vlad Zolotarov
2015-10-27 18:09   ` Thomas Monjalon
2015-10-27 18:47     ` Vlad Zolotarov
2015-10-27 18:50       ` Brandeburg, Jesse
2015-10-27 19:10       ` Ananyev, Konstantin
2015-10-27 19:14         ` Vlad Zolotarov
2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 0/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev
2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 1/2] testpmd: add ability to split outgoing packets Konstantin Ananyev
2015-11-09 19:21 ` [dpdk-dev] [PATCHv6 2/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev
2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 0/2] " Konstantin Ananyev
2015-11-10 14:06   ` De Lara Guarch, Pablo
2015-11-11 23:15     ` Thomas Monjalon
2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 1/2] testpmd: add ability to split outgoing packets Konstantin Ananyev
2015-11-10 13:48 ` [dpdk-dev] [PATCHv7 2/2] ixgbe: fix TX hang when RS distance exceeds HW limit Konstantin Ananyev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).