From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
Cc: cristian.dumitrescu@intel.com, jasvinder.singh@intel.com,
Konstantin Ananyev <konstantin.ananyev@intel.com>
Subject: [dpdk-dev] [PATCH 2/2] qos: rearrange enqueue procedure
Date: Tue, 16 Mar 2021 17:07:23 +0000 [thread overview]
Message-ID: <20210316170723.22036-2-konstantin.ananyev@intel.com> (raw)
In-Reply-To: <20210316170723.22036-1-konstantin.ananyev@intel.com>
Rework rte_sched_port_enqueue() to do actual fetch of all mbufs
metadata as a first stage of that function.
That helps to avoid load stalls at further stages of enqueue()
and improves overall enqueue performance.
With examples/qos_sched I observed:
on ICX box: up to 30% cycles reduction
on CSX and BDX: 15-20% cycles reduction
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
lib/librte_sched/rte_sched.c | 233 +++++++----------------------------
1 file changed, 45 insertions(+), 188 deletions(-)
diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
index 7c56880681..f608617988 100644
--- a/lib/librte_sched/rte_sched.c
+++ b/lib/librte_sched/rte_sched.c
@@ -1861,24 +1861,29 @@ debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
#endif /* RTE_SCHED_DEBUG */
static inline struct rte_sched_subport *
-rte_sched_port_subport(struct rte_sched_port *port,
- struct rte_mbuf *pkt)
+sched_port_subport(const struct rte_sched_port *port, struct rte_mbuf_sched sch)
{
- uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
+ uint32_t queue_id = sch.queue_id;
uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
return port->subports[subport_id];
}
+static inline struct rte_sched_subport *
+rte_sched_port_subport(const struct rte_sched_port *port, struct rte_mbuf *pkt)
+{
+ return sched_port_subport(port, pkt->hash.sched);
+}
+
static inline uint32_t
-rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
- struct rte_mbuf *pkt, uint32_t subport_qmask)
+sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport *subport,
+ struct rte_mbuf_sched sch, uint32_t subport_qmask)
{
struct rte_sched_queue *q;
#ifdef RTE_SCHED_COLLECT_STATS
struct rte_sched_queue_extra *qe;
#endif
- uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
+ uint32_t qindex = sch.queue_id;
uint32_t subport_queue_id = subport_qmask & qindex;
q = subport->queue + subport_queue_id;
@@ -1891,6 +1896,14 @@ rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
return subport_queue_id;
}
+static inline uint32_t
+rte_sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport *subport,
+ struct rte_mbuf *pkt, uint32_t subport_qmask)
+{
+ return sched_port_enqueue_qptrs_prefetch0(subport, pkt->hash.sched,
+ subport_qmask);
+}
+
static inline void
rte_sched_port_enqueue_qwa_prefetch0(struct rte_sched_port *port,
struct rte_sched_subport *subport,
@@ -1971,197 +1984,41 @@ int
rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
uint32_t n_pkts)
{
- struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
- *pkt30, *pkt31, *pkt_last;
- struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
- **q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
- struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
- *subport20, *subport21, *subport30, *subport31, *subport_last;
- uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
- uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
- uint32_t subport_qmask;
uint32_t result, i;
+ struct rte_mbuf_sched sch[n_pkts];
+ struct rte_sched_subport *subports[n_pkts];
+ struct rte_mbuf **q_base[n_pkts];
+ uint32_t q[n_pkts];
+
+ const uint32_t subport_qmask =
+ (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
result = 0;
- subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
- /*
- * Less then 6 input packets available, which is not enough to
- * feed the pipeline
- */
- if (unlikely(n_pkts < 6)) {
- struct rte_sched_subport *subports[5];
- struct rte_mbuf **q_base[5];
- uint32_t q[5];
-
- /* Prefetch the mbuf structure of each packet */
- for (i = 0; i < n_pkts; i++)
- rte_prefetch0(pkts[i]);
-
- /* Prefetch the subport structure for each packet */
- for (i = 0; i < n_pkts; i++)
- subports[i] = rte_sched_port_subport(port, pkts[i]);
-
- /* Prefetch the queue structure for each queue */
- for (i = 0; i < n_pkts; i++)
- q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
- pkts[i], subport_qmask);
-
- /* Prefetch the write pointer location of each queue */
- for (i = 0; i < n_pkts; i++) {
- q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
- rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
- q[i], q_base[i]);
- }
+ /* Prefetch the mbuf structure of each packet */
+ for (i = 0; i < n_pkts; i++)
+ sch[i] = pkts[i]->hash.sched;
- /* Write each packet to its queue */
- for (i = 0; i < n_pkts; i++)
- result += rte_sched_port_enqueue_qwa(port, subports[i],
- q[i], q_base[i], pkts[i]);
+ /* Prefetch the subport structure for each packet */
+ for (i = 0; i < n_pkts; i++)
+ subports[i] = sched_port_subport(port, sch[i]);
- return result;
- }
+ /* Prefetch the queue structure for each queue */
+ for (i = 0; i < n_pkts; i++)
+ q[i] = sched_port_enqueue_qptrs_prefetch0(subports[i],
+ sch[i], subport_qmask);
- /* Feed the first 3 stages of the pipeline (6 packets needed) */
- pkt20 = pkts[0];
- pkt21 = pkts[1];
- rte_prefetch0(pkt20);
- rte_prefetch0(pkt21);
-
- pkt10 = pkts[2];
- pkt11 = pkts[3];
- rte_prefetch0(pkt10);
- rte_prefetch0(pkt11);
-
- subport20 = rte_sched_port_subport(port, pkt20);
- subport21 = rte_sched_port_subport(port, pkt21);
- q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
- pkt20, subport_qmask);
- q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
- pkt21, subport_qmask);
-
- pkt00 = pkts[4];
- pkt01 = pkts[5];
- rte_prefetch0(pkt00);
- rte_prefetch0(pkt01);
-
- subport10 = rte_sched_port_subport(port, pkt10);
- subport11 = rte_sched_port_subport(port, pkt11);
- q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
- pkt10, subport_qmask);
- q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
- pkt11, subport_qmask);
-
- q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
- q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
- /* Run the pipeline */
- for (i = 6; i < (n_pkts & (~1)); i += 2) {
- /* Propagate stage inputs */
- pkt30 = pkt20;
- pkt31 = pkt21;
- pkt20 = pkt10;
- pkt21 = pkt11;
- pkt10 = pkt00;
- pkt11 = pkt01;
- q30 = q20;
- q31 = q21;
- q20 = q10;
- q21 = q11;
- subport30 = subport20;
- subport31 = subport21;
- subport20 = subport10;
- subport21 = subport11;
- q30_base = q20_base;
- q31_base = q21_base;
-
- /* Stage 0: Get packets in */
- pkt00 = pkts[i];
- pkt01 = pkts[i + 1];
- rte_prefetch0(pkt00);
- rte_prefetch0(pkt01);
-
- /* Stage 1: Prefetch subport and queue structure storing queue pointers */
- subport10 = rte_sched_port_subport(port, pkt10);
- subport11 = rte_sched_port_subport(port, pkt11);
- q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
- pkt10, subport_qmask);
- q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
- pkt11, subport_qmask);
-
- /* Stage 2: Prefetch queue write location */
- q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
- q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
- /* Stage 3: Write packet to queue and activate queue */
- r30 = rte_sched_port_enqueue_qwa(port, subport30,
- q30, q30_base, pkt30);
- r31 = rte_sched_port_enqueue_qwa(port, subport31,
- q31, q31_base, pkt31);
- result += r30 + r31;
- }
-
- /*
- * Drain the pipeline (exactly 6 packets).
- * Handle the last packet in the case
- * of an odd number of input packets.
- */
- pkt_last = pkts[n_pkts - 1];
- rte_prefetch0(pkt_last);
-
- subport00 = rte_sched_port_subport(port, pkt00);
- subport01 = rte_sched_port_subport(port, pkt01);
- q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
- pkt00, subport_qmask);
- q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
- pkt01, subport_qmask);
-
- q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
- q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
-
- r20 = rte_sched_port_enqueue_qwa(port, subport20,
- q20, q20_base, pkt20);
- r21 = rte_sched_port_enqueue_qwa(port, subport21,
- q21, q21_base, pkt21);
- result += r20 + r21;
-
- subport_last = rte_sched_port_subport(port, pkt_last);
- q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
- pkt_last, subport_qmask);
-
- q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
- q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
-
- r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
- q10_base, pkt10);
- r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
- q11_base, pkt11);
- result += r10 + r11;
-
- q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
- rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
- q_last, q_last_base);
-
- r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
- q00_base, pkt00);
- r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
- q01_base, pkt01);
- result += r00 + r01;
-
- if (n_pkts & 1) {
- r_last = rte_sched_port_enqueue_qwa(port, subport_last,
- q_last, q_last_base, pkt_last);
- result += r_last;
+ /* Prefetch the write pointer location of each queue */
+ for (i = 0; i < n_pkts; i++) {
+ q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
+ rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
+ q[i], q_base[i]);
}
+ /* Write each packet to its queue */
+ for (i = 0; i < n_pkts; i++)
+ result += rte_sched_port_enqueue_qwa(port, subports[i],
+ q[i], q_base[i], pkts[i]);
return result;
}
--
2.26.2
next prev parent reply other threads:[~2021-03-16 17:07 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-03-16 17:07 [dpdk-dev] [PATCH 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
2021-03-16 17:07 ` Konstantin Ananyev [this message]
2021-03-18 11:56 ` [dpdk-dev] [PATCH v2 " Konstantin Ananyev
2021-03-18 11:56 ` [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure Konstantin Ananyev
2021-04-02 14:14 ` Singh, Jasvinder
2021-04-02 21:12 ` Dumitrescu, Cristian
2021-04-03 23:53 ` Ananyev, Konstantin
2021-11-17 10:33 ` Thomas Monjalon
2021-11-17 10:53 ` Dumitrescu, Cristian
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210316170723.22036-2-konstantin.ananyev@intel.com \
--to=konstantin.ananyev@intel.com \
--cc=cristian.dumitrescu@intel.com \
--cc=dev@dpdk.org \
--cc=jasvinder.singh@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).