DPDK patches and discussions
 help / color / mirror / Atom feed
From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
Cc: cristian.dumitrescu@intel.com, jasvinder.singh@intel.com,
	Konstantin Ananyev <konstantin.ananyev@intel.com>
Subject: [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure
Date: Thu, 18 Mar 2021 11:56:13 +0000	[thread overview]
Message-ID: <20210318115613.5503-2-konstantin.ananyev@intel.com> (raw)
In-Reply-To: <20210318115613.5503-1-konstantin.ananyev@intel.com>

In many usage scenarios input mbufs for rte_sched_port_enqueue()
are not yet in the CPU cache(s). That causes quite significant stalls
due to memory latency. Current implementation tries to migitate it
using SW pipeline and SW prefetch techniques, but stalls are still present.
Rework rte_sched_port_enqueue() to do actual fetch of all mbufs
metadata as a first stage of that function.
That helps to minimise load stalls at further stages of enqueue()
and improves overall enqueue performance.
With examples/qos_sched I observed:
on ICX box: up to 30% cycles reduction
on CSX AND BDX: 20-15% cycles reduction
I also run tests with mbufs already in the cache
(one core doing RX, QOS and TX).
With such scenario, on all mentioned above IA boxes
no performance drop was observed.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
v2: fix clang and checkpatch complains
---
 lib/librte_sched/rte_sched.c | 219 +++++------------------------------
 1 file changed, 31 insertions(+), 188 deletions(-)

diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
index 7c5688068..41ef147e0 100644
--- a/lib/librte_sched/rte_sched.c
+++ b/lib/librte_sched/rte_sched.c
@@ -1861,24 +1861,23 @@ debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
 #endif /* RTE_SCHED_DEBUG */
 
 static inline struct rte_sched_subport *
-rte_sched_port_subport(struct rte_sched_port *port,
-	struct rte_mbuf *pkt)
+sched_port_subport(const struct rte_sched_port *port, struct rte_mbuf_sched sch)
 {
-	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
+	uint32_t queue_id = sch.queue_id;
 	uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
 
 	return port->subports[subport_id];
 }
 
 static inline uint32_t
-rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
-	struct rte_mbuf *pkt, uint32_t subport_qmask)
+sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport *subport,
+	struct rte_mbuf_sched sch, uint32_t subport_qmask)
 {
 	struct rte_sched_queue *q;
 #ifdef RTE_SCHED_COLLECT_STATS
 	struct rte_sched_queue_extra *qe;
 #endif
-	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
+	uint32_t qindex = sch.queue_id;
 	uint32_t subport_queue_id = subport_qmask & qindex;
 
 	q = subport->queue + subport_queue_id;
@@ -1971,197 +1970,41 @@ int
 rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
 		       uint32_t n_pkts)
 {
-	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
-		*pkt30, *pkt31, *pkt_last;
-	struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
-		**q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
-	struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
-		*subport20, *subport21, *subport30, *subport31, *subport_last;
-	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
-	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
-	uint32_t subport_qmask;
 	uint32_t result, i;
+	struct rte_mbuf_sched sch[n_pkts];
+	struct rte_sched_subport *subports[n_pkts];
+	struct rte_mbuf **q_base[n_pkts];
+	uint32_t q[n_pkts];
+
+	const uint32_t subport_qmask =
+		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
 
 	result = 0;
-	subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
 
-	/*
-	 * Less then 6 input packets available, which is not enough to
-	 * feed the pipeline
-	 */
-	if (unlikely(n_pkts < 6)) {
-		struct rte_sched_subport *subports[5];
-		struct rte_mbuf **q_base[5];
-		uint32_t q[5];
-
-		/* Prefetch the mbuf structure of each packet */
-		for (i = 0; i < n_pkts; i++)
-			rte_prefetch0(pkts[i]);
-
-		/* Prefetch the subport structure for each packet */
-		for (i = 0; i < n_pkts; i++)
-			subports[i] = rte_sched_port_subport(port, pkts[i]);
-
-		/* Prefetch the queue structure for each queue */
-		for (i = 0; i < n_pkts; i++)
-			q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
-					pkts[i], subport_qmask);
-
-		/* Prefetch the write pointer location of each queue */
-		for (i = 0; i < n_pkts; i++) {
-			q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
-			rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
-				q[i], q_base[i]);
-		}
+	/* Prefetch the mbuf structure of each packet */
+	for (i = 0; i < n_pkts; i++)
+		sch[i] = pkts[i]->hash.sched;
 
-		/* Write each packet to its queue */
-		for (i = 0; i < n_pkts; i++)
-			result += rte_sched_port_enqueue_qwa(port, subports[i],
-						q[i], q_base[i], pkts[i]);
+	/* Prefetch the subport structure for each packet */
+	for (i = 0; i < n_pkts; i++)
+		subports[i] = sched_port_subport(port, sch[i]);
 
-		return result;
-	}
+	/* Prefetch the queue structure for each queue */
+	for (i = 0; i < n_pkts; i++)
+		q[i] = sched_port_enqueue_qptrs_prefetch0(subports[i],
+				sch[i], subport_qmask);
 
-	/* Feed the first 3 stages of the pipeline (6 packets needed) */
-	pkt20 = pkts[0];
-	pkt21 = pkts[1];
-	rte_prefetch0(pkt20);
-	rte_prefetch0(pkt21);
-
-	pkt10 = pkts[2];
-	pkt11 = pkts[3];
-	rte_prefetch0(pkt10);
-	rte_prefetch0(pkt11);
-
-	subport20 = rte_sched_port_subport(port, pkt20);
-	subport21 = rte_sched_port_subport(port, pkt21);
-	q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
-			pkt20, subport_qmask);
-	q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
-			pkt21, subport_qmask);
-
-	pkt00 = pkts[4];
-	pkt01 = pkts[5];
-	rte_prefetch0(pkt00);
-	rte_prefetch0(pkt01);
-
-	subport10 = rte_sched_port_subport(port, pkt10);
-	subport11 = rte_sched_port_subport(port, pkt11);
-	q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
-			pkt10, subport_qmask);
-	q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
-			pkt11, subport_qmask);
-
-	q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
-	q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
-	/* Run the pipeline */
-	for (i = 6; i < (n_pkts & (~1)); i += 2) {
-		/* Propagate stage inputs */
-		pkt30 = pkt20;
-		pkt31 = pkt21;
-		pkt20 = pkt10;
-		pkt21 = pkt11;
-		pkt10 = pkt00;
-		pkt11 = pkt01;
-		q30 = q20;
-		q31 = q21;
-		q20 = q10;
-		q21 = q11;
-		subport30 = subport20;
-		subport31 = subport21;
-		subport20 = subport10;
-		subport21 = subport11;
-		q30_base = q20_base;
-		q31_base = q21_base;
-
-		/* Stage 0: Get packets in */
-		pkt00 = pkts[i];
-		pkt01 = pkts[i + 1];
-		rte_prefetch0(pkt00);
-		rte_prefetch0(pkt01);
-
-		/* Stage 1: Prefetch subport and queue structure storing queue pointers */
-		subport10 = rte_sched_port_subport(port, pkt10);
-		subport11 = rte_sched_port_subport(port, pkt11);
-		q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
-				pkt10, subport_qmask);
-		q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
-				pkt11, subport_qmask);
-
-		/* Stage 2: Prefetch queue write location */
-		q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
-		q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
-		rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
-		rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
-		/* Stage 3: Write packet to queue and activate queue */
-		r30 = rte_sched_port_enqueue_qwa(port, subport30,
-				q30, q30_base, pkt30);
-		r31 = rte_sched_port_enqueue_qwa(port, subport31,
-				q31, q31_base, pkt31);
-		result += r30 + r31;
-	}
-
-	/*
-	 * Drain the pipeline (exactly 6 packets).
-	 * Handle the last packet in the case
-	 * of an odd number of input packets.
-	 */
-	pkt_last = pkts[n_pkts - 1];
-	rte_prefetch0(pkt_last);
-
-	subport00 = rte_sched_port_subport(port, pkt00);
-	subport01 = rte_sched_port_subport(port, pkt01);
-	q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
-			pkt00, subport_qmask);
-	q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
-			pkt01, subport_qmask);
-
-	q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
-	q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
-
-	r20 = rte_sched_port_enqueue_qwa(port, subport20,
-			q20, q20_base, pkt20);
-	r21 = rte_sched_port_enqueue_qwa(port, subport21,
-			q21, q21_base, pkt21);
-	result += r20 + r21;
-
-	subport_last = rte_sched_port_subport(port, pkt_last);
-	q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
-				pkt_last, subport_qmask);
-
-	q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
-	q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
-
-	r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
-			q10_base, pkt10);
-	r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
-			q11_base, pkt11);
-	result += r10 + r11;
-
-	q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
-		q_last, q_last_base);
-
-	r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
-			q00_base, pkt00);
-	r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
-			q01_base, pkt01);
-	result += r00 + r01;
-
-	if (n_pkts & 1) {
-		r_last = rte_sched_port_enqueue_qwa(port, subport_last,
-					q_last,	q_last_base, pkt_last);
-		result += r_last;
+	/* Prefetch the write pointer location of each queue */
+	for (i = 0; i < n_pkts; i++) {
+		q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
+		rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
+			q[i], q_base[i]);
 	}
 
+	/* Write each packet to its queue */
+	for (i = 0; i < n_pkts; i++)
+		result += rte_sched_port_enqueue_qwa(port, subports[i],
+					q[i], q_base[i], pkts[i]);
 	return result;
 }
 
-- 
2.25.1


  reply	other threads:[~2021-03-18 11:57 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-16 17:07 [dpdk-dev] [PATCH 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
2021-03-16 17:07 ` [dpdk-dev] [PATCH 2/2] qos: rearrange enqueue procedure Konstantin Ananyev
2021-03-18 11:56 ` [dpdk-dev] [PATCH v2 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
2021-03-18 11:56   ` Konstantin Ananyev [this message]
2021-04-02 14:14     ` [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure Singh, Jasvinder
2021-04-02 21:12       ` Dumitrescu, Cristian
2021-04-03 23:53         ` Ananyev, Konstantin
2021-11-17 10:33           ` Thomas Monjalon
2021-11-17 10:53             ` Dumitrescu, Cristian

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210318115613.5503-2-konstantin.ananyev@intel.com \
    --to=konstantin.ananyev@intel.com \
    --cc=cristian.dumitrescu@intel.com \
    --cc=dev@dpdk.org \
    --cc=jasvinder.singh@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).