From: Matan Azrad <matan@mellanox.com>
To: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: dev@dpdk.org
Subject: [dpdk-dev] [PATCH v2 4/8] net/mlx4: optimize Tx multi-segment case
Date: Wed,  6 Dec 2017 14:48:09 +0000	[thread overview]
Message-ID: <1512571693-15338-5-git-send-email-matan@mellanox.com> (raw)
In-Reply-To: <1512571693-15338-1-git-send-email-matan@mellanox.com>

An mlx4 Tx block (TXBB) can hold up to 4 data segments, or a control
segment plus up to 3 data segments. The first data segment in every
TXBB except the first one must check for Tx queue wraparound and must
use an IO memory barrier before writing the byte count.
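
For reference, the TXBB accounting this layout implies can be sketched
as follows (a minimal illustration; wqe_txbb_count() is a hypothetical
stand-in for the driver's wqe_real_size / MLX4_SIZE_TO_TXBBS()
calculation, and the 16B/64B sizes are stated assumptions):

#define TXBB_SIZE	64	/* assumed mlx4 basic block size */
#define CTRL_SEG_SIZE	16	/* assumed control segment size */
#define DATA_SEG_SIZE	16	/* assumed data segment size */

/* TXBBs needed for one packet made of nb_segs data segments. */
static inline int
wqe_txbb_count(int nb_segs)
{
	int wqe_size = CTRL_SEG_SIZE + nb_segs * DATA_SEG_SIZE;

	/* Round up to whole TXBBs. */
	return (wqe_size + TXBB_SIZE - 1) / TXBB_SIZE;
}

With these sizes, 1 to 3 data segments fit in a single TXBB together
with the control segment, matching the "control segment + up to 3 data
segments" rule above.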

The previous multi-segment code used a "for" loop to iterate over all
the packet's segments and separated the first-TXBB data case with "if"
statements.

Optimize this case by using a switch statement and unconditional
branches instead of the "for" loop; this avoids the unnecessary checks
for each data segment and hints the compiler to generate an optimized
jump table.
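
The resulting control flow is easiest to see in a simplified,
self-contained sketch (process_head() and process_tail() are
hypothetical stand-ins for the data segment writes; only the head of
each new TXBB pays for the wraparound check and the IO barrier):

#include <stdio.h>

static void process_head(int i) { printf("head seg %d\n", i); }
static void process_tail(int i) { printf("tail seg %d\n", i); }

static void
fill_segments(int nb_segs)
{
	int i = 0;

	/* The first TXBB starts with the control segment, so its
	 * data segments are all tail slots. */
	goto tail;
head:
	/* First data segment of a new TXBB: the only place needing
	 * the wraparound check and the IO barrier. */
	process_head(i++);
	nb_segs--;
tail:
	/* At most 3 tail slots; "default" covers more than two
	 * remaining segments and falls through. */
	switch (nb_segs) {
	default:
		process_tail(i++);
		nb_segs--;
		/* fallthrough */
	case 2:
		process_tail(i++);
		nb_segs--;
		/* fallthrough */
	case 1:
		process_tail(i++);
		if (--nb_segs)
			goto head;
		/* fallthrough */
	case 0:
		break;
	}
}

int main(void)
{
	fill_segments(5);	/* 3 tail, 1 head, 1 tail */
	return 0;
}

Unlike the loop, none of the tail writes re-tests a loop condition;
the cost of the TXBB boundary handling is paid only at "head".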

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 198 ++++++++++++++++++++++++++++---------------
 1 file changed, 128 insertions(+), 70 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 1d8240a..adf02c0 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -421,6 +421,39 @@ struct pv {
 	return buf->pool;
 }
 
+/**
+ * Write Tx data segment to the SQ.
+ *
+ * @param dseg
+ *   Pointer to data segment in SQ.
+ * @param lkey
+ *   Memory region lkey.
+ * @param addr
+ *   Data address.
+ * @param byte_count
+ *   Big endian byte count of the data to send.
+ */
+static inline void
+mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
+		       uint32_t lkey, uintptr_t addr, rte_be32_t  byte_count)
+{
+	dseg->addr = rte_cpu_to_be_64(addr);
+	dseg->lkey = rte_cpu_to_be_32(lkey);
+#if RTE_CACHE_LINE_SIZE < 64
+	/*
+	 * Need a barrier here before writing the byte_count
+	 * fields to make sure that all the data is visible
+	 * before the byte_count field is set.
+	 * Otherwise, if the segment begins a new cacheline,
+	 * the HCA prefetcher could grab the 64-byte chunk and
+	 * get a valid (!= 0xffffffff) byte count but stale
+	 * data, and end up sending the wrong data.
+	 */
+	rte_io_wmb();
+#endif /* RTE_CACHE_LINE_SIZE */
+	dseg->byte_count = byte_count;
+}
+
 static int
 mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
 		   volatile struct mlx4_wqe_ctrl_seg **pctrl)
@@ -432,15 +465,14 @@ struct pv {
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 	volatile struct mlx4_wqe_ctrl_seg *ctrl;
 	volatile struct mlx4_wqe_data_seg *dseg;
-	struct rte_mbuf *sbuf;
+	struct rte_mbuf *sbuf = buf;
 	uint32_t lkey;
-	uintptr_t addr;
-	uint32_t byte_count;
 	int pv_counter = 0;
+	int nb_segs = buf->nb_segs;
 
 	/* Calculate the needed work queue entry size for this packet. */
 	wqe_real_size = sizeof(volatile struct mlx4_wqe_ctrl_seg) +
-		buf->nb_segs * sizeof(volatile struct mlx4_wqe_data_seg);
+		nb_segs * sizeof(volatile struct mlx4_wqe_data_seg);
 	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
 	/*
 	 * Check that there is room for this WQE in the send queue and that
@@ -457,67 +489,99 @@ struct pv {
 	dseg = (volatile struct mlx4_wqe_data_seg *)
 			((uintptr_t)ctrl + sizeof(struct mlx4_wqe_ctrl_seg));
 	*pctrl = ctrl;
-	/* Fill the data segments with buffer information. */
-	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
-		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
-		rte_prefetch0((volatile void *)addr);
-		/* Memory region key (big endian) for this memory pool. */
+	/*
+	 * Fill the data segments with buffer information.
+	 * The first WQE TXBB head segment is always a control segment,
+	 * so jump to the tail TXBB data segment code to fill the
+	 * first WQE's data segments.
+	 */
+	goto txbb_tail_segs;
+txbb_head_seg:
+	/* Memory region key (big endian) for this memory pool. */
+	lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+	if (unlikely(lkey == (uint32_t)-1)) {
+		DEBUG("%p: unable to get MP <-> MR association",
+		      (void *)txq);
+		return -1;
+	}
+	/* Handle WQE wraparound. */
+	if (dseg >=
+		(volatile struct mlx4_wqe_data_seg *)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+			sq->buf;
+	dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t));
+	dseg->lkey = rte_cpu_to_be_32(lkey);
+	/*
+	 * This data segment starts at the beginning of a new
+	 * TXBB, so we need to postpone its byte_count writing
+	 * for later.
+	 */
+	pv[pv_counter].dseg = dseg;
+	/*
+	 * Zero length segment is treated as inline segment
+	 * with zero data.
+	 */
+	pv[pv_counter++].val = rte_cpu_to_be_32(sbuf->data_len ?
+						sbuf->data_len : 0x80000000);
+	sbuf = sbuf->next;
+	dseg++;
+	nb_segs--;
+txbb_tail_segs:
+	/* Jump to default if there are more than two segments remaining. */
+	switch (nb_segs) {
+	default:
 		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
-		dseg->lkey = rte_cpu_to_be_32(lkey);
-		/* Calculate the needed work queue entry size for this packet */
-		if (unlikely(lkey == rte_cpu_to_be_32((uint32_t)-1))) {
-			/* MR does not exist. */
+		if (unlikely(lkey == (uint32_t)-1)) {
 			DEBUG("%p: unable to get MP <-> MR association",
 			      (void *)txq);
 			return -1;
 		}
-		if (likely(sbuf->data_len)) {
-			byte_count = rte_cpu_to_be_32(sbuf->data_len);
-		} else {
-			/*
-			 * Zero length segment is treated as inline segment
-			 * with zero data.
-			 */
-			byte_count = RTE_BE32(0x80000000);
+		mlx4_fill_tx_data_seg(dseg, lkey,
+				      rte_pktmbuf_mtod(sbuf, uintptr_t),
+				      rte_cpu_to_be_32(sbuf->data_len ?
+						       sbuf->data_len :
+						       0x80000000));
+		sbuf = sbuf->next;
+		dseg++;
+		nb_segs--;
+		/* fallthrough */
+	case 2:
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+		if (unlikely(lkey == (uint32_t)-1)) {
+			DEBUG("%p: unable to get MP <-> MR association",
+			      (void *)txq);
+			return -1;
 		}
-		/*
-		 * If the data segment is not at the beginning of a
-		 * Tx basic block (TXBB) then write the byte count,
-		 * else postpone the writing to just before updating the
-		 * control segment.
-		 */
-		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
-			dseg->addr = rte_cpu_to_be_64(addr);
-			dseg->lkey = rte_cpu_to_be_32(lkey);
-#if RTE_CACHE_LINE_SIZE < 64
-			/*
-			 * Need a barrier here before writing the byte_count
-			 * fields to make sure that all the data is visible
-			 * before the byte_count field is set.
-			 * Otherwise, if the segment begins a new cacheline,
-			 * the HCA prefetcher could grab the 64-byte chunk and
-			 * get a valid (!= 0xffffffff) byte count but stale
-			 * data, and end up sending the wrong data.
-			 */
-			rte_io_wmb();
-#endif /* RTE_CACHE_LINE_SIZE */
-			dseg->byte_count = byte_count;
-		} else {
-			/*
-			 * This data segment starts at the beginning of a new
-			 * TXBB, so we need to postpone its byte_count writing
-			 * for later.
-			 */
-			/* Handle WQE wraparound. */
-			if (dseg >=
-			    (volatile struct mlx4_wqe_data_seg *)sq->eob)
-				dseg = (volatile struct mlx4_wqe_data_seg *)
-					sq->buf;
-			dseg->addr = rte_cpu_to_be_64(addr);
-			dseg->lkey = rte_cpu_to_be_32(lkey);
-			pv[pv_counter].dseg = dseg;
-			pv[pv_counter++].val = byte_count;
+		mlx4_fill_tx_data_seg(dseg, lkey,
+				      rte_pktmbuf_mtod(sbuf, uintptr_t),
+				      rte_cpu_to_be_32(sbuf->data_len ?
+						       sbuf->data_len :
+						       0x80000000));
+		sbuf = sbuf->next;
+		dseg++;
+		nb_segs--;
+		/* fallthrough */
+	case 1:
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+		if (unlikely(lkey == (uint32_t)-1)) {
+			DEBUG("%p: unable to get MP <-> MR association",
+			      (void *)txq);
+			return -1;
+		}
+		mlx4_fill_tx_data_seg(dseg, lkey,
+				      rte_pktmbuf_mtod(sbuf, uintptr_t),
+				      rte_cpu_to_be_32(sbuf->data_len ?
+						       sbuf->data_len :
+						       0x80000000));
+		nb_segs--;
+		if (nb_segs) {
+			sbuf = sbuf->next;
+			dseg++;
+			goto txbb_head_seg;
 		}
+		/* fallthrough */
+	case 0:
+		break;
 	}
 	/* Write the first DWORD of each TXBB saved earlier. */
 	if (pv_counter) {
@@ -583,7 +647,6 @@ struct pv {
 		} srcrb;
 		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 		uint32_t lkey;
-		uintptr_t addr;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -618,24 +681,19 @@ struct pv {
 			dseg = (volatile struct mlx4_wqe_data_seg *)
 					((uintptr_t)ctrl +
 					sizeof(struct mlx4_wqe_ctrl_seg));
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			rte_prefetch0((volatile void *)addr);
-			dseg->addr = rte_cpu_to_be_64(addr);
-			/* Memory region key (big endian). */
+
+			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
 			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-			dseg->lkey = rte_cpu_to_be_32(lkey);
-			if (unlikely(dseg->lkey ==
-				rte_cpu_to_be_32((uint32_t)-1))) {
+			if (unlikely(lkey == (uint32_t)-1)) {
 				/* MR does not exist. */
 				DEBUG("%p: unable to get MP <-> MR association",
 				      (void *)txq);
 				elt->buf = NULL;
 				break;
 			}
-			/* Never be TXBB aligned, no need compiler barrier. */
-			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
-			/* Fill the control parameters for this packet. */
-			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+			mlx4_fill_tx_data_seg(dseg, lkey,
+					      rte_pktmbuf_mtod(buf, uintptr_t),
+					      rte_cpu_to_be_32(buf->data_len));
 			nr_txbbs = 1;
 		} else {
 			nr_txbbs = mlx4_tx_burst_segs(buf, txq, &ctrl);
-- 
1.8.3.1


Thread overview: 47+ messages
2017-11-28 12:19 [dpdk-dev] [PATCH 0/8] improve mlx4 Tx performance Matan Azrad
2017-11-28 12:19 ` [dpdk-dev] [PATCH 1/8] net/mlx4: fix Tx packet drop application report Matan Azrad
2017-12-06 10:57   ` Adrien Mazarguil
2017-11-28 12:19 ` [dpdk-dev] [PATCH 2/8] net/mlx4: remove unnecessary Tx wraparound checks Matan Azrad
2017-12-06 10:57   ` Adrien Mazarguil
2017-11-28 12:19 ` [dpdk-dev] [PATCH 3/8] net/mlx4: remove restamping from Tx error path Matan Azrad
2017-12-06 10:58   ` Adrien Mazarguil
2017-11-28 12:19 ` [dpdk-dev] [PATCH 4/8] net/mlx4: optimize Tx multi-segment case Matan Azrad
2017-12-06 10:58   ` Adrien Mazarguil
2017-12-06 11:29     ` Matan Azrad
2017-12-06 11:55       ` Adrien Mazarguil
2017-11-28 12:19 ` [dpdk-dev] [PATCH 5/8] net/mlx4: merge Tx queue rings management Matan Azrad
2017-12-06 10:58   ` Adrien Mazarguil
2017-12-06 11:43     ` Matan Azrad
2017-12-06 12:09       ` Adrien Mazarguil
2017-11-28 12:19 ` [dpdk-dev] [PATCH 6/8] net/mlx4: mitigate Tx send entry size calculations Matan Azrad
2017-12-06 10:59   ` Adrien Mazarguil
2017-11-28 12:19 ` [dpdk-dev] [PATCH 7/8] net/mlx4: align Tx descriptors number Matan Azrad
2017-12-06 10:59   ` Adrien Mazarguil
2017-12-06 11:44     ` Matan Azrad
2017-11-28 12:19 ` [dpdk-dev] [PATCH 8/8] net/mlx4: remove Tx completion elements counter Matan Azrad
2017-12-06 10:59   ` Adrien Mazarguil
2017-12-06 14:48 ` [dpdk-dev] [PATCH v2 0/8] improve mlx4 Tx performance Matan Azrad
2017-12-06 14:48   ` [dpdk-dev] [PATCH v2 1/8] net/mlx4: fix Tx packet drop application report Matan Azrad
2017-12-06 14:48   ` [dpdk-dev] [PATCH v2 2/8] net/mlx4: remove unnecessary Tx wraparound checks Matan Azrad
2017-12-06 14:48   ` [dpdk-dev] [PATCH v2 3/8] net/mlx4: remove restamping from Tx error path Matan Azrad
2017-12-06 14:48   ` Matan Azrad [this message]
2017-12-06 16:22     ` [dpdk-dev] [PATCH v2 4/8] net/mlx4: optimize Tx multi-segment case Adrien Mazarguil
2017-12-06 14:48   ` [dpdk-dev] [PATCH v2 5/8] net/mlx4: merge Tx queue rings management Matan Azrad
2017-12-06 16:22     ` Adrien Mazarguil
2017-12-06 14:48   ` [dpdk-dev] [PATCH v2 6/8] net/mlx4: mitigate Tx send entry size calculations Matan Azrad
2017-12-06 14:48   ` [dpdk-dev] [PATCH v2 7/8] net/mlx4: align Tx descriptors number Matan Azrad
2017-12-06 16:22     ` Adrien Mazarguil
2017-12-06 17:24       ` Matan Azrad
2017-12-06 14:48   ` [dpdk-dev] [PATCH v2 8/8] net/mlx4: remove Tx completion elements counter Matan Azrad
2017-12-06 16:22     ` Adrien Mazarguil
2017-12-06 17:57   ` [dpdk-dev] [PATCH v3 0/8] improve mlx4 Tx performance Matan Azrad
2017-12-06 17:57     ` [dpdk-dev] [PATCH v3 1/8] net/mlx4: fix Tx packet drop application report Matan Azrad
2017-12-06 17:57     ` [dpdk-dev] [PATCH v3 2/8] net/mlx4: remove unnecessary Tx wraparound checks Matan Azrad
2017-12-06 17:57     ` [dpdk-dev] [PATCH v3 3/8] net/mlx4: remove restamping from Tx error path Matan Azrad
2017-12-06 17:57     ` [dpdk-dev] [PATCH v3 4/8] net/mlx4: optimize Tx multi-segment case Matan Azrad
2017-12-06 17:57     ` [dpdk-dev] [PATCH v3 5/8] net/mlx4: merge Tx queue rings management Matan Azrad
2017-12-06 17:57     ` [dpdk-dev] [PATCH v3 6/8] net/mlx4: mitigate Tx send entry size calculations Matan Azrad
2017-12-06 17:57     ` [dpdk-dev] [PATCH v3 7/8] net/mlx4: align Tx descriptors number Matan Azrad
2017-12-06 17:57     ` [dpdk-dev] [PATCH v3 8/8] net/mlx4: remove Tx completion elements counter Matan Azrad
2017-12-07 10:56     ` [dpdk-dev] [PATCH v3 0/8] improve mlx4 Tx performance Adrien Mazarguil
2017-12-10 10:22       ` Shahaf Shuler
