DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 1/2] examples/qos_sched: fixup colors value overrun
@ 2021-03-16 17:07 Konstantin Ananyev
  2021-03-16 17:07 ` [dpdk-dev] [PATCH 2/2] qos: rearrange enqueue procedure Konstantin Ananyev
  2021-03-18 11:56 ` [dpdk-dev] [PATCH v2 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
  0 siblings, 2 replies; 9+ messages in thread
From: Konstantin Ananyev @ 2021-03-16 17:07 UTC (permalink / raw)
  To: dev; +Cc: cristian.dumitrescu, jasvinder.singh, Konstantin Ananyev, stable

'3' is not valid RTE_COLOR_ enum value.

Fixes: de3cfa2c9823 ("sched: initial import")
Cc: stable@dpdk.org

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 examples/qos_sched/app_thread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/qos_sched/app_thread.c b/examples/qos_sched/app_thread.c
index dbc878b553..a5f402190c 100644
--- a/examples/qos_sched/app_thread.c
+++ b/examples/qos_sched/app_thread.c
@@ -53,7 +53,7 @@ get_pkt_sched(struct rte_mbuf *m, uint32_t *subport, uint32_t *pipe,
 	*queue = pipe_queue - *traffic_class;
 
 	/* Color (Destination IP) */
-	*color = pdata[COLOR_OFFSET] & 0x03;
+	*color = pdata[COLOR_OFFSET] % RTE_COLORS;
 
 	return 0;
 }
-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH 2/2] qos: rearrange enqueue procedure
  2021-03-16 17:07 [dpdk-dev] [PATCH 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
@ 2021-03-16 17:07 ` Konstantin Ananyev
  2021-03-18 11:56 ` [dpdk-dev] [PATCH v2 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
  1 sibling, 0 replies; 9+ messages in thread
From: Konstantin Ananyev @ 2021-03-16 17:07 UTC (permalink / raw)
  To: dev; +Cc: cristian.dumitrescu, jasvinder.singh, Konstantin Ananyev

Rework rte_sched_port_enqueue() to do actual fetch of all mbufs
metadata as a first stage of that function.
That helps to avoid load stalls at further stages of enqueue()
and improves overall enqueue performance.
With examples/qos_sched I observed:
on ICX box: up to 30% cycles reduction
on CSX and BDX: 15-20% cycles reduction

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/librte_sched/rte_sched.c | 233 +++++++----------------------------
 1 file changed, 45 insertions(+), 188 deletions(-)

diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
index 7c56880681..f608617988 100644
--- a/lib/librte_sched/rte_sched.c
+++ b/lib/librte_sched/rte_sched.c
@@ -1861,24 +1861,29 @@ debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
 #endif /* RTE_SCHED_DEBUG */
 
 static inline struct rte_sched_subport *
-rte_sched_port_subport(struct rte_sched_port *port,
-	struct rte_mbuf *pkt)
+sched_port_subport(const struct rte_sched_port *port, struct rte_mbuf_sched sch)
 {
-	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
+	uint32_t queue_id = sch.queue_id;
 	uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
 
 	return port->subports[subport_id];
 }
 
+static inline struct rte_sched_subport *
+rte_sched_port_subport(const struct rte_sched_port *port, struct rte_mbuf *pkt)
+{
+	return sched_port_subport(port, pkt->hash.sched);
+}
+
 static inline uint32_t
-rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
-	struct rte_mbuf *pkt, uint32_t subport_qmask)
+sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport *subport,
+	struct rte_mbuf_sched sch, uint32_t subport_qmask)
 {
 	struct rte_sched_queue *q;
 #ifdef RTE_SCHED_COLLECT_STATS
 	struct rte_sched_queue_extra *qe;
 #endif
-	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
+	uint32_t qindex = sch.queue_id;
 	uint32_t subport_queue_id = subport_qmask & qindex;
 
 	q = subport->queue + subport_queue_id;
@@ -1891,6 +1896,14 @@ rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
 	return subport_queue_id;
 }
 
+static inline uint32_t
+rte_sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport *subport,
+	struct rte_mbuf *pkt, uint32_t subport_qmask)
+{
+	return sched_port_enqueue_qptrs_prefetch0(subport, pkt->hash.sched,
+			subport_qmask);
+}
+
 static inline void
 rte_sched_port_enqueue_qwa_prefetch0(struct rte_sched_port *port,
 	struct rte_sched_subport *subport,
@@ -1971,197 +1984,41 @@ int
 rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
 		       uint32_t n_pkts)
 {
-	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
-		*pkt30, *pkt31, *pkt_last;
-	struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
-		**q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
-	struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
-		*subport20, *subport21, *subport30, *subport31, *subport_last;
-	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
-	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
-	uint32_t subport_qmask;
 	uint32_t result, i;
+	struct rte_mbuf_sched sch[n_pkts];
+	struct rte_sched_subport *subports[n_pkts];
+	struct rte_mbuf **q_base[n_pkts];
+	uint32_t q[n_pkts];
+
+	const uint32_t subport_qmask =
+		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
 
 	result = 0;
-	subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
 
-	/*
-	 * Less then 6 input packets available, which is not enough to
-	 * feed the pipeline
-	 */
-	if (unlikely(n_pkts < 6)) {
-		struct rte_sched_subport *subports[5];
-		struct rte_mbuf **q_base[5];
-		uint32_t q[5];
-
-		/* Prefetch the mbuf structure of each packet */
-		for (i = 0; i < n_pkts; i++)
-			rte_prefetch0(pkts[i]);
-
-		/* Prefetch the subport structure for each packet */
-		for (i = 0; i < n_pkts; i++)
-			subports[i] = rte_sched_port_subport(port, pkts[i]);
-
-		/* Prefetch the queue structure for each queue */
-		for (i = 0; i < n_pkts; i++)
-			q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
-					pkts[i], subport_qmask);
-
-		/* Prefetch the write pointer location of each queue */
-		for (i = 0; i < n_pkts; i++) {
-			q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
-			rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
-				q[i], q_base[i]);
-		}
+	/* Prefetch the mbuf structure of each packet */
+	for (i = 0; i < n_pkts; i++)
+		sch[i] = pkts[i]->hash.sched;
 
-		/* Write each packet to its queue */
-		for (i = 0; i < n_pkts; i++)
-			result += rte_sched_port_enqueue_qwa(port, subports[i],
-						q[i], q_base[i], pkts[i]);
+	/* Prefetch the subport structure for each packet */
+	for (i = 0; i < n_pkts; i++)
+		subports[i] = sched_port_subport(port, sch[i]);
 
-		return result;
-	}
+	/* Prefetch the queue structure for each queue */
+	for (i = 0; i < n_pkts; i++)
+		q[i] = sched_port_enqueue_qptrs_prefetch0(subports[i],
+				sch[i], subport_qmask);
 
-	/* Feed the first 3 stages of the pipeline (6 packets needed) */
-	pkt20 = pkts[0];
-	pkt21 = pkts[1];
-	rte_prefetch0(pkt20);
-	rte_prefetch0(pkt21);
-
-	pkt10 = pkts[2];
-	pkt11 = pkts[3];
-	rte_prefetch0(pkt10);
-	rte_prefetch0(pkt11);
-
-	subport20 = rte_sched_port_subport(port, pkt20);
-	subport21 = rte_sched_port_subport(port, pkt21);
-	q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
-			pkt20, subport_qmask);
-	q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
-			pkt21, subport_qmask);
-
-	pkt00 = pkts[4];
-	pkt01 = pkts[5];
-	rte_prefetch0(pkt00);
-	rte_prefetch0(pkt01);
-
-	subport10 = rte_sched_port_subport(port, pkt10);
-	subport11 = rte_sched_port_subport(port, pkt11);
-	q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
-			pkt10, subport_qmask);
-	q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
-			pkt11, subport_qmask);
-
-	q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
-	q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
-	/* Run the pipeline */
-	for (i = 6; i < (n_pkts & (~1)); i += 2) {
-		/* Propagate stage inputs */
-		pkt30 = pkt20;
-		pkt31 = pkt21;
-		pkt20 = pkt10;
-		pkt21 = pkt11;
-		pkt10 = pkt00;
-		pkt11 = pkt01;
-		q30 = q20;
-		q31 = q21;
-		q20 = q10;
-		q21 = q11;
-		subport30 = subport20;
-		subport31 = subport21;
-		subport20 = subport10;
-		subport21 = subport11;
-		q30_base = q20_base;
-		q31_base = q21_base;
-
-		/* Stage 0: Get packets in */
-		pkt00 = pkts[i];
-		pkt01 = pkts[i + 1];
-		rte_prefetch0(pkt00);
-		rte_prefetch0(pkt01);
-
-		/* Stage 1: Prefetch subport and queue structure storing queue pointers */
-		subport10 = rte_sched_port_subport(port, pkt10);
-		subport11 = rte_sched_port_subport(port, pkt11);
-		q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
-				pkt10, subport_qmask);
-		q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
-				pkt11, subport_qmask);
-
-		/* Stage 2: Prefetch queue write location */
-		q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
-		q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
-		rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
-		rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
-		/* Stage 3: Write packet to queue and activate queue */
-		r30 = rte_sched_port_enqueue_qwa(port, subport30,
-				q30, q30_base, pkt30);
-		r31 = rte_sched_port_enqueue_qwa(port, subport31,
-				q31, q31_base, pkt31);
-		result += r30 + r31;
-	}
-
-	/*
-	 * Drain the pipeline (exactly 6 packets).
-	 * Handle the last packet in the case
-	 * of an odd number of input packets.
-	 */
-	pkt_last = pkts[n_pkts - 1];
-	rte_prefetch0(pkt_last);
-
-	subport00 = rte_sched_port_subport(port, pkt00);
-	subport01 = rte_sched_port_subport(port, pkt01);
-	q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
-			pkt00, subport_qmask);
-	q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
-			pkt01, subport_qmask);
-
-	q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
-	q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
-
-	r20 = rte_sched_port_enqueue_qwa(port, subport20,
-			q20, q20_base, pkt20);
-	r21 = rte_sched_port_enqueue_qwa(port, subport21,
-			q21, q21_base, pkt21);
-	result += r20 + r21;
-
-	subport_last = rte_sched_port_subport(port, pkt_last);
-	q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
-				pkt_last, subport_qmask);
-
-	q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
-	q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
-
-	r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
-			q10_base, pkt10);
-	r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
-			q11_base, pkt11);
-	result += r10 + r11;
-
-	q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
-		q_last, q_last_base);
-
-	r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
-			q00_base, pkt00);
-	r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
-			q01_base, pkt01);
-	result += r00 + r01;
-
-	if (n_pkts & 1) {
-		r_last = rte_sched_port_enqueue_qwa(port, subport_last,
-					q_last,	q_last_base, pkt_last);
-		result += r_last;
+	/* Prefetch the write pointer location of each queue */
+	for (i = 0; i < n_pkts; i++) {
+		q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
+		rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
+			q[i], q_base[i]);
 	}
 
+	/* Write each packet to its queue */
+	for (i = 0; i < n_pkts; i++)
+		result += rte_sched_port_enqueue_qwa(port, subports[i],
+					q[i], q_base[i], pkts[i]);
 	return result;
 }
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH v2 1/2] examples/qos_sched: fixup colors value overrun
  2021-03-16 17:07 [dpdk-dev] [PATCH 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
  2021-03-16 17:07 ` [dpdk-dev] [PATCH 2/2] qos: rearrange enqueue procedure Konstantin Ananyev
@ 2021-03-18 11:56 ` Konstantin Ananyev
  2021-03-18 11:56   ` [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure Konstantin Ananyev
  1 sibling, 1 reply; 9+ messages in thread
From: Konstantin Ananyev @ 2021-03-18 11:56 UTC (permalink / raw)
  To: dev; +Cc: cristian.dumitrescu, jasvinder.singh, Konstantin Ananyev, stable

'3' is not valid RTE_COLOR_ enum value.

Fixes: de3cfa2c9823 ("sched: initial import")
Cc: stable@dpdk.org

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 examples/qos_sched/app_thread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/qos_sched/app_thread.c b/examples/qos_sched/app_thread.c
index dbc878b55..a5f402190 100644
--- a/examples/qos_sched/app_thread.c
+++ b/examples/qos_sched/app_thread.c
@@ -53,7 +53,7 @@ get_pkt_sched(struct rte_mbuf *m, uint32_t *subport, uint32_t *pipe,
 	*queue = pipe_queue - *traffic_class;
 
 	/* Color (Destination IP) */
-	*color = pdata[COLOR_OFFSET] & 0x03;
+	*color = pdata[COLOR_OFFSET] % RTE_COLORS;
 
 	return 0;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure
  2021-03-18 11:56 ` [dpdk-dev] [PATCH v2 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
@ 2021-03-18 11:56   ` Konstantin Ananyev
  2021-04-02 14:14     ` Singh, Jasvinder
  0 siblings, 1 reply; 9+ messages in thread
From: Konstantin Ananyev @ 2021-03-18 11:56 UTC (permalink / raw)
  To: dev; +Cc: cristian.dumitrescu, jasvinder.singh, Konstantin Ananyev

In many usage scenarios input mbufs for rte_sched_port_enqueue()
are not yet in the CPU cache(s). That causes quite significant stalls
due to memory latency. Current implementation tries to mitigate it
using SW pipeline and SW prefetch techniques, but stalls are still present.
Rework rte_sched_port_enqueue() to do actual fetch of all mbufs
metadata as a first stage of that function.
That helps to minimise load stalls at further stages of enqueue()
and improves overall enqueue performance.
With examples/qos_sched I observed:
on ICX box: up to 30% cycles reduction
on CSX and BDX: 15-20% cycles reduction
I also run tests with mbufs already in the cache
(one core doing RX, QOS and TX).
With such scenario, on all mentioned above IA boxes
no performance drop was observed.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
v2: fix clang and checkpatch complaints
---
 lib/librte_sched/rte_sched.c | 219 +++++------------------------------
 1 file changed, 31 insertions(+), 188 deletions(-)

diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
index 7c5688068..41ef147e0 100644
--- a/lib/librte_sched/rte_sched.c
+++ b/lib/librte_sched/rte_sched.c
@@ -1861,24 +1861,23 @@ debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
 #endif /* RTE_SCHED_DEBUG */
 
 static inline struct rte_sched_subport *
-rte_sched_port_subport(struct rte_sched_port *port,
-	struct rte_mbuf *pkt)
+sched_port_subport(const struct rte_sched_port *port, struct rte_mbuf_sched sch)
 {
-	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
+	uint32_t queue_id = sch.queue_id;
 	uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
 
 	return port->subports[subport_id];
 }
 
 static inline uint32_t
-rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
-	struct rte_mbuf *pkt, uint32_t subport_qmask)
+sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport *subport,
+	struct rte_mbuf_sched sch, uint32_t subport_qmask)
 {
 	struct rte_sched_queue *q;
 #ifdef RTE_SCHED_COLLECT_STATS
 	struct rte_sched_queue_extra *qe;
 #endif
-	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
+	uint32_t qindex = sch.queue_id;
 	uint32_t subport_queue_id = subport_qmask & qindex;
 
 	q = subport->queue + subport_queue_id;
@@ -1971,197 +1970,41 @@ int
 rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
 		       uint32_t n_pkts)
 {
-	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
-		*pkt30, *pkt31, *pkt_last;
-	struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
-		**q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
-	struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
-		*subport20, *subport21, *subport30, *subport31, *subport_last;
-	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
-	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
-	uint32_t subport_qmask;
 	uint32_t result, i;
+	struct rte_mbuf_sched sch[n_pkts];
+	struct rte_sched_subport *subports[n_pkts];
+	struct rte_mbuf **q_base[n_pkts];
+	uint32_t q[n_pkts];
+
+	const uint32_t subport_qmask =
+		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
 
 	result = 0;
-	subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
 
-	/*
-	 * Less then 6 input packets available, which is not enough to
-	 * feed the pipeline
-	 */
-	if (unlikely(n_pkts < 6)) {
-		struct rte_sched_subport *subports[5];
-		struct rte_mbuf **q_base[5];
-		uint32_t q[5];
-
-		/* Prefetch the mbuf structure of each packet */
-		for (i = 0; i < n_pkts; i++)
-			rte_prefetch0(pkts[i]);
-
-		/* Prefetch the subport structure for each packet */
-		for (i = 0; i < n_pkts; i++)
-			subports[i] = rte_sched_port_subport(port, pkts[i]);
-
-		/* Prefetch the queue structure for each queue */
-		for (i = 0; i < n_pkts; i++)
-			q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
-					pkts[i], subport_qmask);
-
-		/* Prefetch the write pointer location of each queue */
-		for (i = 0; i < n_pkts; i++) {
-			q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
-			rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
-				q[i], q_base[i]);
-		}
+	/* Prefetch the mbuf structure of each packet */
+	for (i = 0; i < n_pkts; i++)
+		sch[i] = pkts[i]->hash.sched;
 
-		/* Write each packet to its queue */
-		for (i = 0; i < n_pkts; i++)
-			result += rte_sched_port_enqueue_qwa(port, subports[i],
-						q[i], q_base[i], pkts[i]);
+	/* Prefetch the subport structure for each packet */
+	for (i = 0; i < n_pkts; i++)
+		subports[i] = sched_port_subport(port, sch[i]);
 
-		return result;
-	}
+	/* Prefetch the queue structure for each queue */
+	for (i = 0; i < n_pkts; i++)
+		q[i] = sched_port_enqueue_qptrs_prefetch0(subports[i],
+				sch[i], subport_qmask);
 
-	/* Feed the first 3 stages of the pipeline (6 packets needed) */
-	pkt20 = pkts[0];
-	pkt21 = pkts[1];
-	rte_prefetch0(pkt20);
-	rte_prefetch0(pkt21);
-
-	pkt10 = pkts[2];
-	pkt11 = pkts[3];
-	rte_prefetch0(pkt10);
-	rte_prefetch0(pkt11);
-
-	subport20 = rte_sched_port_subport(port, pkt20);
-	subport21 = rte_sched_port_subport(port, pkt21);
-	q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
-			pkt20, subport_qmask);
-	q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
-			pkt21, subport_qmask);
-
-	pkt00 = pkts[4];
-	pkt01 = pkts[5];
-	rte_prefetch0(pkt00);
-	rte_prefetch0(pkt01);
-
-	subport10 = rte_sched_port_subport(port, pkt10);
-	subport11 = rte_sched_port_subport(port, pkt11);
-	q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
-			pkt10, subport_qmask);
-	q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
-			pkt11, subport_qmask);
-
-	q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
-	q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
-	/* Run the pipeline */
-	for (i = 6; i < (n_pkts & (~1)); i += 2) {
-		/* Propagate stage inputs */
-		pkt30 = pkt20;
-		pkt31 = pkt21;
-		pkt20 = pkt10;
-		pkt21 = pkt11;
-		pkt10 = pkt00;
-		pkt11 = pkt01;
-		q30 = q20;
-		q31 = q21;
-		q20 = q10;
-		q21 = q11;
-		subport30 = subport20;
-		subport31 = subport21;
-		subport20 = subport10;
-		subport21 = subport11;
-		q30_base = q20_base;
-		q31_base = q21_base;
-
-		/* Stage 0: Get packets in */
-		pkt00 = pkts[i];
-		pkt01 = pkts[i + 1];
-		rte_prefetch0(pkt00);
-		rte_prefetch0(pkt01);
-
-		/* Stage 1: Prefetch subport and queue structure storing queue pointers */
-		subport10 = rte_sched_port_subport(port, pkt10);
-		subport11 = rte_sched_port_subport(port, pkt11);
-		q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
-				pkt10, subport_qmask);
-		q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
-				pkt11, subport_qmask);
-
-		/* Stage 2: Prefetch queue write location */
-		q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
-		q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
-		rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
-		rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
-		/* Stage 3: Write packet to queue and activate queue */
-		r30 = rte_sched_port_enqueue_qwa(port, subport30,
-				q30, q30_base, pkt30);
-		r31 = rte_sched_port_enqueue_qwa(port, subport31,
-				q31, q31_base, pkt31);
-		result += r30 + r31;
-	}
-
-	/*
-	 * Drain the pipeline (exactly 6 packets).
-	 * Handle the last packet in the case
-	 * of an odd number of input packets.
-	 */
-	pkt_last = pkts[n_pkts - 1];
-	rte_prefetch0(pkt_last);
-
-	subport00 = rte_sched_port_subport(port, pkt00);
-	subport01 = rte_sched_port_subport(port, pkt01);
-	q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
-			pkt00, subport_qmask);
-	q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
-			pkt01, subport_qmask);
-
-	q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
-	q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
-
-	r20 = rte_sched_port_enqueue_qwa(port, subport20,
-			q20, q20_base, pkt20);
-	r21 = rte_sched_port_enqueue_qwa(port, subport21,
-			q21, q21_base, pkt21);
-	result += r20 + r21;
-
-	subport_last = rte_sched_port_subport(port, pkt_last);
-	q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
-				pkt_last, subport_qmask);
-
-	q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
-	q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
-
-	r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
-			q10_base, pkt10);
-	r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
-			q11_base, pkt11);
-	result += r10 + r11;
-
-	q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
-		q_last, q_last_base);
-
-	r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
-			q00_base, pkt00);
-	r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
-			q01_base, pkt01);
-	result += r00 + r01;
-
-	if (n_pkts & 1) {
-		r_last = rte_sched_port_enqueue_qwa(port, subport_last,
-					q_last,	q_last_base, pkt_last);
-		result += r_last;
+	/* Prefetch the write pointer location of each queue */
+	for (i = 0; i < n_pkts; i++) {
+		q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
+		rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
+			q[i], q_base[i]);
 	}
 
+	/* Write each packet to its queue */
+	for (i = 0; i < n_pkts; i++)
+		result += rte_sched_port_enqueue_qwa(port, subports[i],
+					q[i], q_base[i], pkts[i]);
 	return result;
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure
  2021-03-18 11:56   ` [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure Konstantin Ananyev
@ 2021-04-02 14:14     ` Singh, Jasvinder
  2021-04-02 21:12       ` Dumitrescu, Cristian
  0 siblings, 1 reply; 9+ messages in thread
From: Singh, Jasvinder @ 2021-04-02 14:14 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev; +Cc: Dumitrescu, Cristian



> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Thursday, March 18, 2021 11:56 AM
> To: dev@dpdk.org
> Cc: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>
> Subject: [PATCH v2 2/2] qos: rearrange enqueue procedure
> 
> In many usage scenarios input mbufs for rte_sched_port_enqueue() are not
> yet in the CPU cache(s). That causes quite significant stalls due to memory
> latency. Current implementation tries to migitate it using SW pipeline and SW
> prefetch techniques, but stalls are still present.
> Rework rte_sched_port_enqueue() to do actual fetch of all mbufs metadata
> as a first stage of that function.
> That helps to minimise load stalls at further stages of enqueue() and
> improves overall enqueue performance.
> With examples/qos_sched I observed:
> on ICX box: up to 30% cycles reduction
> on CSX AND BDX: 20-15% cycles reduction
> I also run tests with mbufs already in the cache (one core doing RX, QOS and
> TX).
> With such scenario, on all mentioned above IA boxes no performance drop
> was observed.
> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> v2: fix clang and checkpatch complains
> ---
>  lib/librte_sched/rte_sched.c | 219 +++++------------------------------
>  1 file changed, 31 insertions(+), 188 deletions(-)
> 
> diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c index
> 7c5688068..41ef147e0 100644
> --- a/lib/librte_sched/rte_sched.c
> +++ b/lib/librte_sched/rte_sched.c
> @@ -1861,24 +1861,23 @@ debug_check_queue_slab(struct
> rte_sched_subport *subport, uint32_t bmp_pos,  #endif /*
> RTE_SCHED_DEBUG */
> 
>  static inline struct rte_sched_subport * -rte_sched_port_subport(struct
> rte_sched_port *port,
> -	struct rte_mbuf *pkt)
> +sched_port_subport(const struct rte_sched_port *port, struct
> +rte_mbuf_sched sch)
>  {
> -	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
> +	uint32_t queue_id = sch.queue_id;
>  	uint32_t subport_id = queue_id >> (port-
> >n_pipes_per_subport_log2 + 4);
> 
>  	return port->subports[subport_id];
>  }
> 
>  static inline uint32_t
> -rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport
> *subport,
> -	struct rte_mbuf *pkt, uint32_t subport_qmask)
> +sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport
> *subport,
> +	struct rte_mbuf_sched sch, uint32_t subport_qmask)
>  {
>  	struct rte_sched_queue *q;
>  #ifdef RTE_SCHED_COLLECT_STATS
>  	struct rte_sched_queue_extra *qe;
>  #endif
> -	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
> +	uint32_t qindex = sch.queue_id;
>  	uint32_t subport_queue_id = subport_qmask & qindex;
> 
>  	q = subport->queue + subport_queue_id; @@ -1971,197 +1970,41
> @@ int  rte_sched_port_enqueue(struct rte_sched_port *port, struct
> rte_mbuf **pkts,
>  		       uint32_t n_pkts)
>  {
> -	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
> -		*pkt30, *pkt31, *pkt_last;
> -	struct rte_mbuf **q00_base, **q01_base, **q10_base,
> **q11_base,
> -		**q20_base, **q21_base, **q30_base, **q31_base,
> **q_last_base;
> -	struct rte_sched_subport *subport00, *subport01, *subport10,
> *subport11,
> -		*subport20, *subport21, *subport30, *subport31,
> *subport_last;
> -	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
> -	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
> -	uint32_t subport_qmask;
>  	uint32_t result, i;
> +	struct rte_mbuf_sched sch[n_pkts];
> +	struct rte_sched_subport *subports[n_pkts];
> +	struct rte_mbuf **q_base[n_pkts];
> +	uint32_t q[n_pkts];
> +
> +	const uint32_t subport_qmask =
> +		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> 
>  	result = 0;
> -	subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> 
> -	/*
> -	 * Less then 6 input packets available, which is not enough to
> -	 * feed the pipeline
> -	 */
> -	if (unlikely(n_pkts < 6)) {
> -		struct rte_sched_subport *subports[5];
> -		struct rte_mbuf **q_base[5];
> -		uint32_t q[5];
> -
> -		/* Prefetch the mbuf structure of each packet */
> -		for (i = 0; i < n_pkts; i++)
> -			rte_prefetch0(pkts[i]);
> -
> -		/* Prefetch the subport structure for each packet */
> -		for (i = 0; i < n_pkts; i++)
> -			subports[i] = rte_sched_port_subport(port, pkts[i]);
> -
> -		/* Prefetch the queue structure for each queue */
> -		for (i = 0; i < n_pkts; i++)
> -			q[i] =
> rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
> -					pkts[i], subport_qmask);
> -
> -		/* Prefetch the write pointer location of each queue */
> -		for (i = 0; i < n_pkts; i++) {
> -			q_base[i] =
> rte_sched_subport_pipe_qbase(subports[i], q[i]);
> -			rte_sched_port_enqueue_qwa_prefetch0(port,
> subports[i],
> -				q[i], q_base[i]);
> -		}
> +	/* Prefetch the mbuf structure of each packet */
> +	for (i = 0; i < n_pkts; i++)
> +		sch[i] = pkts[i]->hash.sched;
> 

Hi Konstantin, thanks for the patch. In the above case, all packets are touched straight away without any prefetch. If we consider an input burst size of 64 pkts, that means 512 bytes of packet addresses (8 cache lines), which are likely to be available in the cache. For larger burst sizes, e.g. 128 or 256, there might be instances when some addresses are not available in the cache, which may stall the core. How about adding an explicit prefetch before starting to iterate through the packets, if that helps?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure
  2021-04-02 14:14     ` Singh, Jasvinder
@ 2021-04-02 21:12       ` Dumitrescu, Cristian
  2021-04-03 23:53         ` Ananyev, Konstantin
  0 siblings, 1 reply; 9+ messages in thread
From: Dumitrescu, Cristian @ 2021-04-02 21:12 UTC (permalink / raw)
  To: Singh, Jasvinder, Ananyev, Konstantin, dev



> -----Original Message-----
> From: Singh, Jasvinder <jasvinder.singh@intel.com>
> Sent: Friday, April 2, 2021 3:15 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
> Cc: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> Subject: RE: [PATCH v2 2/2] qos: rearrange enqueue procedure
> 
> 
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Sent: Thursday, March 18, 2021 11:56 AM
> > To: dev@dpdk.org
> > Cc: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>; Singh, Jasvinder
> > <jasvinder.singh@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>
> > Subject: [PATCH v2 2/2] qos: rearrange enqueue procedure
> >
> > In many usage scenarios input mbufs for rte_sched_port_enqueue() are
> not
> > yet in the CPU cache(s). That causes quite significant stalls due to memory
> > latency. Current implementation tries to mitigate it using SW pipeline and
> SW
> > prefetch techniques, but stalls are still present.
> > Rework rte_sched_port_enqueue() to do actual fetch of all mbufs
> metadata
> > as a first stage of that function.
> > That helps to minimise load stalls at further stages of enqueue() and
> > improves overall enqueue performance.
> > With examples/qos_sched I observed:
> > on ICX box: up to 30% cycles reduction
> > on CSX AND BDX: 20-15% cycles reduction
> > I also run tests with mbufs already in the cache (one core doing RX, QOS
> and
> > TX).
> > With such scenario, on all mentioned above IA boxes no performance drop
> > was observed.
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > ---
> > v2: fix clang and checkpatch complains
> > ---
> >  lib/librte_sched/rte_sched.c | 219 +++++------------------------------
> >  1 file changed, 31 insertions(+), 188 deletions(-)
> >
> > diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
> index
> > 7c5688068..41ef147e0 100644
> > --- a/lib/librte_sched/rte_sched.c
> > +++ b/lib/librte_sched/rte_sched.c
> > @@ -1861,24 +1861,23 @@ debug_check_queue_slab(struct
> > rte_sched_subport *subport, uint32_t bmp_pos,  #endif /*
> > RTE_SCHED_DEBUG */
> >
> >  static inline struct rte_sched_subport * -rte_sched_port_subport(struct
> > rte_sched_port *port,
> > -	struct rte_mbuf *pkt)
> > +sched_port_subport(const struct rte_sched_port *port, struct
> > +rte_mbuf_sched sch)
> >  {
> > -	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
> > +	uint32_t queue_id = sch.queue_id;
> >  	uint32_t subport_id = queue_id >> (port-
> > >n_pipes_per_subport_log2 + 4);
> >
> >  	return port->subports[subport_id];
> >  }
> >
> >  static inline uint32_t
> > -rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport
> > *subport,
> > -	struct rte_mbuf *pkt, uint32_t subport_qmask)
> > +sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport
> > *subport,
> > +	struct rte_mbuf_sched sch, uint32_t subport_qmask)
> >  {
> >  	struct rte_sched_queue *q;
> >  #ifdef RTE_SCHED_COLLECT_STATS
> >  	struct rte_sched_queue_extra *qe;
> >  #endif
> > -	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
> > +	uint32_t qindex = sch.queue_id;
> >  	uint32_t subport_queue_id = subport_qmask & qindex;
> >
> >  	q = subport->queue + subport_queue_id; @@ -1971,197 +1970,41
> > @@ int  rte_sched_port_enqueue(struct rte_sched_port *port, struct
> > rte_mbuf **pkts,
> >  		       uint32_t n_pkts)
> >  {
> > -	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
> > -		*pkt30, *pkt31, *pkt_last;
> > -	struct rte_mbuf **q00_base, **q01_base, **q10_base,
> > **q11_base,
> > -		**q20_base, **q21_base, **q30_base, **q31_base,
> > **q_last_base;
> > -	struct rte_sched_subport *subport00, *subport01, *subport10,
> > *subport11,
> > -		*subport20, *subport21, *subport30, *subport31,
> > *subport_last;
> > -	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
> > -	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
> > -	uint32_t subport_qmask;
> >  	uint32_t result, i;
> > +	struct rte_mbuf_sched sch[n_pkts];
> > +	struct rte_sched_subport *subports[n_pkts];
> > +	struct rte_mbuf **q_base[n_pkts];
> > +	uint32_t q[n_pkts];
> > +
> > +	const uint32_t subport_qmask =
> > +		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> >
> >  	result = 0;
> > -	subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> >
> > -	/*
> > -	 * Less then 6 input packets available, which is not enough to
> > -	 * feed the pipeline
> > -	 */
> > -	if (unlikely(n_pkts < 6)) {
> > -		struct rte_sched_subport *subports[5];
> > -		struct rte_mbuf **q_base[5];
> > -		uint32_t q[5];
> > -
> > -		/* Prefetch the mbuf structure of each packet */
> > -		for (i = 0; i < n_pkts; i++)
> > -			rte_prefetch0(pkts[i]);
> > -
> > -		/* Prefetch the subport structure for each packet */
> > -		for (i = 0; i < n_pkts; i++)
> > -			subports[i] = rte_sched_port_subport(port, pkts[i]);
> > -
> > -		/* Prefetch the queue structure for each queue */
> > -		for (i = 0; i < n_pkts; i++)
> > -			q[i] =
> > rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
> > -					pkts[i], subport_qmask);
> > -
> > -		/* Prefetch the write pointer location of each queue */
> > -		for (i = 0; i < n_pkts; i++) {
> > -			q_base[i] =
> > rte_sched_subport_pipe_qbase(subports[i], q[i]);
> > -			rte_sched_port_enqueue_qwa_prefetch0(port,
> > subports[i],
> > -				q[i], q_base[i]);
> > -		}
> > +	/* Prefetch the mbuf structure of each packet */
> > +	for (i = 0; i < n_pkts; i++)
> > +		sch[i] = pkts[i]->hash.sched;
> >
> 
> Hi Konstantin,  thanks for the patch. In above case, all packets are touched
> straight without any prefetch. If we consider the input burst size of 64 pkts, it
> means 512 bytes of packet addresses  (8 cache-lines) which is likely to be
> available in cache. For larger size burst, e.g. 128 or 256, there might be
> instances when some addresses are not available in the cache, which may stall the core.
> How about adding explicit prefetch before starting to iterate through the
> packets if that helps?

Exactly. Konstantin, you might not be a fan of prefetches, but the current enqueue implementation (as well as the dequeue) uses a prefetch state machine. Please keep the prefetch state machine in the scalar code. Even if the examples/qos_sched might not show an advantage, this is just a sample app and there are some more relevant use-cases as well.

Thanks,
Cristian

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure
  2021-04-02 21:12       ` Dumitrescu, Cristian
@ 2021-04-03 23:53         ` Ananyev, Konstantin
  2021-11-17 10:33           ` Thomas Monjalon
  0 siblings, 1 reply; 9+ messages in thread
From: Ananyev, Konstantin @ 2021-04-03 23:53 UTC (permalink / raw)
  To: Dumitrescu, Cristian, Singh, Jasvinder, dev


Hi guys,

> > > In many usage scenarios input mbufs for rte_sched_port_enqueue() are
> > not
> > > yet in the CPU cache(s). That causes quite significant stalls due to memory
> > > latency. Current implementation tries to mitigate it using SW pipeline and
> > SW
> > > prefetch techniques, but stalls are still present.
> > > Rework rte_sched_port_enqueue() to do actual fetch of all mbufs
> > metadata
> > > as a first stage of that function.
> > > That helps to minimise load stalls at further stages of enqueue() and
> > > improves overall enqueue performance.
> > > With examples/qos_sched I observed:
> > > on ICX box: up to 30% cycles reduction
> > > on CSX AND BDX: 20-15% cycles reduction
> > > I also run tests with mbufs already in the cache (one core doing RX, QOS
> > and
> > > TX).
> > > With such scenario, on all mentioned above IA boxes no performance drop
> > > was observed.
> > >
> > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > > ---
> > > v2: fix clang and checkpatch complains
> > > ---
> > >  lib/librte_sched/rte_sched.c | 219 +++++------------------------------
> > >  1 file changed, 31 insertions(+), 188 deletions(-)
> > >
> > > diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
> > index
> > > 7c5688068..41ef147e0 100644
> > > --- a/lib/librte_sched/rte_sched.c
> > > +++ b/lib/librte_sched/rte_sched.c
> > > @@ -1861,24 +1861,23 @@ debug_check_queue_slab(struct
> > > rte_sched_subport *subport, uint32_t bmp_pos,  #endif /*
> > > RTE_SCHED_DEBUG */
> > >
> > >  static inline struct rte_sched_subport * -rte_sched_port_subport(struct
> > > rte_sched_port *port,
> > > -	struct rte_mbuf *pkt)
> > > +sched_port_subport(const struct rte_sched_port *port, struct
> > > +rte_mbuf_sched sch)
> > >  {
> > > -	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
> > > +	uint32_t queue_id = sch.queue_id;
> > >  	uint32_t subport_id = queue_id >> (port-
> > > >n_pipes_per_subport_log2 + 4);
> > >
> > >  	return port->subports[subport_id];
> > >  }
> > >
> > >  static inline uint32_t
> > > -rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport
> > > *subport,
> > > -	struct rte_mbuf *pkt, uint32_t subport_qmask)
> > > +sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport
> > > *subport,
> > > +	struct rte_mbuf_sched sch, uint32_t subport_qmask)
> > >  {
> > >  	struct rte_sched_queue *q;
> > >  #ifdef RTE_SCHED_COLLECT_STATS
> > >  	struct rte_sched_queue_extra *qe;
> > >  #endif
> > > -	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
> > > +	uint32_t qindex = sch.queue_id;
> > >  	uint32_t subport_queue_id = subport_qmask & qindex;
> > >
> > >  	q = subport->queue + subport_queue_id; @@ -1971,197 +1970,41
> > > @@ int  rte_sched_port_enqueue(struct rte_sched_port *port, struct
> > > rte_mbuf **pkts,
> > >  		       uint32_t n_pkts)
> > >  {
> > > -	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
> > > -		*pkt30, *pkt31, *pkt_last;
> > > -	struct rte_mbuf **q00_base, **q01_base, **q10_base,
> > > **q11_base,
> > > -		**q20_base, **q21_base, **q30_base, **q31_base,
> > > **q_last_base;
> > > -	struct rte_sched_subport *subport00, *subport01, *subport10,
> > > *subport11,
> > > -		*subport20, *subport21, *subport30, *subport31,
> > > *subport_last;
> > > -	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
> > > -	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
> > > -	uint32_t subport_qmask;
> > >  	uint32_t result, i;
> > > +	struct rte_mbuf_sched sch[n_pkts];
> > > +	struct rte_sched_subport *subports[n_pkts];
> > > +	struct rte_mbuf **q_base[n_pkts];
> > > +	uint32_t q[n_pkts];
> > > +
> > > +	const uint32_t subport_qmask =
> > > +		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> > >
> > >  	result = 0;
> > > -	subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> > >
> > > -	/*
> > > -	 * Less then 6 input packets available, which is not enough to
> > > -	 * feed the pipeline
> > > -	 */
> > > -	if (unlikely(n_pkts < 6)) {
> > > -		struct rte_sched_subport *subports[5];
> > > -		struct rte_mbuf **q_base[5];
> > > -		uint32_t q[5];
> > > -
> > > -		/* Prefetch the mbuf structure of each packet */
> > > -		for (i = 0; i < n_pkts; i++)
> > > -			rte_prefetch0(pkts[i]);
> > > -
> > > -		/* Prefetch the subport structure for each packet */
> > > -		for (i = 0; i < n_pkts; i++)
> > > -			subports[i] = rte_sched_port_subport(port, pkts[i]);
> > > -
> > > -		/* Prefetch the queue structure for each queue */
> > > -		for (i = 0; i < n_pkts; i++)
> > > -			q[i] =
> > > rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
> > > -					pkts[i], subport_qmask);
> > > -
> > > -		/* Prefetch the write pointer location of each queue */
> > > -		for (i = 0; i < n_pkts; i++) {
> > > -			q_base[i] =
> > > rte_sched_subport_pipe_qbase(subports[i], q[i]);
> > > -			rte_sched_port_enqueue_qwa_prefetch0(port,
> > > subports[i],
> > > -				q[i], q_base[i]);
> > > -		}
> > > +	/* Prefetch the mbuf structure of each packet */
> > > +	for (i = 0; i < n_pkts; i++)
> > > +		sch[i] = pkts[i]->hash.sched;
> > >
> >
> > Hi Konstantin,  thanks for the patch. In above case, all packets are touched
> > straight without any prefetch. If we consider the input burst size of 64 pkts, it
> > means 512 bytes of packet addresses  (8 cache-lines) which is likely to be
> > available in cache. For larger size burst, e.g. 128 or 256, there might be
> > instances when some addresses are not available in the cache, which may stall the core.
> > How about adding explicit prefetch before starting to iterate through the
> > packets if that helps?

I don't think we need any prefetch here.
pkts[] is a sequential array, HW prefetcher should be able to do good job here.
Again in majority of use-cases pkts[] contents will already present in the cache.
Though there is a valid concern here: n_pkts can be big, in that case we probably
don't want to store too much on the stack and read too much from pkts[].
It is better to work in some fixed chunks (64 or so).
I can prepare v2 with these changes, if there still is an interest in this patch.   

> Exactly. Konstantin, you might not be a fan of prefetches, but the current enqueue implementation (as well as the dequeue) uses a prefetch
> state machine. Please keep the prefetch state machine in the scalar code.

It is not about our own preferences.
From my measurements new version is faster and it is definitely simpler.

> Even if the examples/qos_sched might not show an advantage,
> this is just a sample app and there are some more relevant use-cases as well.

Well, I hope that examples/qos_sched reflects at least some real-world use-cases for QOS library.
Otherwise why do we have it inside DPDK codebase? 
About 'more relevant use-cases': if you do know such, can you try them with the patch?
I would really appreciate that.
In fact, it is an ask not only to Cristian, but to all other interested parties:
if your app does use librte_sched - please try this patch and provide the feedback.
If some tests would flag a regression - I am absolutely ok to drop the patch.
Konstantin




^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure
  2021-04-03 23:53         ` Ananyev, Konstantin
@ 2021-11-17 10:33           ` Thomas Monjalon
  2021-11-17 10:53             ` Dumitrescu, Cristian
  0 siblings, 1 reply; 9+ messages in thread
From: Thomas Monjalon @ 2021-11-17 10:33 UTC (permalink / raw)
  To: Dumitrescu, Cristian, Singh, Jasvinder, Ananyev, Konstantin; +Cc: dev

Hi all,
What is the conclusion for this patch and the number 1/2?

04/04/2021 01:53, Ananyev, Konstantin:
> 
> Hi guys,
> 
> > > > In many usage scenarios input mbufs for rte_sched_port_enqueue() are
> > > not
> > > > yet in the CPU cache(s). That causes quite significant stalls due to memory
> > > > latency. Current implementation tries to mitigate it using SW pipeline and
> > > SW
> > > > prefetch techniques, but stalls are still present.
> > > > Rework rte_sched_port_enqueue() to do actual fetch of all mbufs
> > > metadata
> > > > as a first stage of that function.
> > > > That helps to minimise load stalls at further stages of enqueue() and
> > > > improves overall enqueue performance.
> > > > With examples/qos_sched I observed:
> > > > on ICX box: up to 30% cycles reduction
> > > > on CSX AND BDX: 20-15% cycles reduction
> > > > I also run tests with mbufs already in the cache (one core doing RX, QOS
> > > and
> > > > TX).
> > > > With such scenario, on all mentioned above IA boxes no performance drop
> > > > was observed.
> > > >
> > > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > > > ---
> > > > v2: fix clang and checkpatch complains
> > > > ---
> > > >  lib/librte_sched/rte_sched.c | 219 +++++------------------------------
> > > >  1 file changed, 31 insertions(+), 188 deletions(-)
> > > >
> > > > diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
> > > index
> > > > 7c5688068..41ef147e0 100644
> > > > --- a/lib/librte_sched/rte_sched.c
> > > > +++ b/lib/librte_sched/rte_sched.c
> > > > @@ -1861,24 +1861,23 @@ debug_check_queue_slab(struct
> > > > rte_sched_subport *subport, uint32_t bmp_pos,  #endif /*
> > > > RTE_SCHED_DEBUG */
> > > >
> > > >  static inline struct rte_sched_subport * -rte_sched_port_subport(struct
> > > > rte_sched_port *port,
> > > > -	struct rte_mbuf *pkt)
> > > > +sched_port_subport(const struct rte_sched_port *port, struct
> > > > +rte_mbuf_sched sch)
> > > >  {
> > > > -	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
> > > > +	uint32_t queue_id = sch.queue_id;
> > > >  	uint32_t subport_id = queue_id >> (port-
> > > > >n_pipes_per_subport_log2 + 4);
> > > >
> > > >  	return port->subports[subport_id];
> > > >  }
> > > >
> > > >  static inline uint32_t
> > > > -rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport
> > > > *subport,
> > > > -	struct rte_mbuf *pkt, uint32_t subport_qmask)
> > > > +sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport
> > > > *subport,
> > > > +	struct rte_mbuf_sched sch, uint32_t subport_qmask)
> > > >  {
> > > >  	struct rte_sched_queue *q;
> > > >  #ifdef RTE_SCHED_COLLECT_STATS
> > > >  	struct rte_sched_queue_extra *qe;
> > > >  #endif
> > > > -	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
> > > > +	uint32_t qindex = sch.queue_id;
> > > >  	uint32_t subport_queue_id = subport_qmask & qindex;
> > > >
> > > >  	q = subport->queue + subport_queue_id; @@ -1971,197 +1970,41
> > > > @@ int  rte_sched_port_enqueue(struct rte_sched_port *port, struct
> > > > rte_mbuf **pkts,
> > > >  		       uint32_t n_pkts)
> > > >  {
> > > > -	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
> > > > -		*pkt30, *pkt31, *pkt_last;
> > > > -	struct rte_mbuf **q00_base, **q01_base, **q10_base,
> > > > **q11_base,
> > > > -		**q20_base, **q21_base, **q30_base, **q31_base,
> > > > **q_last_base;
> > > > -	struct rte_sched_subport *subport00, *subport01, *subport10,
> > > > *subport11,
> > > > -		*subport20, *subport21, *subport30, *subport31,
> > > > *subport_last;
> > > > -	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
> > > > -	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
> > > > -	uint32_t subport_qmask;
> > > >  	uint32_t result, i;
> > > > +	struct rte_mbuf_sched sch[n_pkts];
> > > > +	struct rte_sched_subport *subports[n_pkts];
> > > > +	struct rte_mbuf **q_base[n_pkts];
> > > > +	uint32_t q[n_pkts];
> > > > +
> > > > +	const uint32_t subport_qmask =
> > > > +		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> > > >
> > > >  	result = 0;
> > > > -	subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> > > >
> > > > -	/*
> > > > -	 * Less then 6 input packets available, which is not enough to
> > > > -	 * feed the pipeline
> > > > -	 */
> > > > -	if (unlikely(n_pkts < 6)) {
> > > > -		struct rte_sched_subport *subports[5];
> > > > -		struct rte_mbuf **q_base[5];
> > > > -		uint32_t q[5];
> > > > -
> > > > -		/* Prefetch the mbuf structure of each packet */
> > > > -		for (i = 0; i < n_pkts; i++)
> > > > -			rte_prefetch0(pkts[i]);
> > > > -
> > > > -		/* Prefetch the subport structure for each packet */
> > > > -		for (i = 0; i < n_pkts; i++)
> > > > -			subports[i] = rte_sched_port_subport(port, pkts[i]);
> > > > -
> > > > -		/* Prefetch the queue structure for each queue */
> > > > -		for (i = 0; i < n_pkts; i++)
> > > > -			q[i] =
> > > > rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
> > > > -					pkts[i], subport_qmask);
> > > > -
> > > > -		/* Prefetch the write pointer location of each queue */
> > > > -		for (i = 0; i < n_pkts; i++) {
> > > > -			q_base[i] =
> > > > rte_sched_subport_pipe_qbase(subports[i], q[i]);
> > > > -			rte_sched_port_enqueue_qwa_prefetch0(port,
> > > > subports[i],
> > > > -				q[i], q_base[i]);
> > > > -		}
> > > > +	/* Prefetch the mbuf structure of each packet */
> > > > +	for (i = 0; i < n_pkts; i++)
> > > > +		sch[i] = pkts[i]->hash.sched;
> > > >
> > >
> > > Hi Konstantin,  thanks for the patch. In above case, all packets are touched
> > > straight without any prefetch. If we consider the input burst size of 64 pkts, it
> > > means 512 bytes of packet addresses  (8 cache-lines) which is likely to be
> > > available in cache. For larger size burst, e.g. 128 or 256, there might be
> > > instances when some addresses are not available in the cache, which may stall the core.
> > > How about adding explicit prefetch before starting to iterate through the
> > > packets if that helps?
> 
> I don't think we need any prefetch here.
> pkts[] is a sequential array, HW prefetcher should be able to do good job here.
> Again in majority of use-cases pkts[] contents will already present in the cache.
> Though there is a valid concern here: n_pkts can be big, in that case we probably
> don't want to store too much on the stack and read too much from pkts[].
> It is better to work in some fixed chunks (64 or so).
> I can prepare v2 with these changes, if there still is an interest in this patch.   
> 
> > Exactly. Konstantin, you might not be a fan of prefetches, but the current enqueue implementation (as well as the dequeue) uses a prefetch
> > state machine. Please keep the prefetch state machine in the scalar code.
> 
> It is not about our own preferences.
> From my measurements new version is faster and it is definitely simpler.
> 
> > Even if the examples/qos_sched might not show an advantage,
> > this is just a sample app and there are some more relevant use-cases as well.
> 
> Well, I hope that examples/qos_sched reflects at least some real-world use-cases for QOS library.
> Otherwise why do we have it inside DPDK codebase? 
> About 'more relevant use-cases': if you do know such, can you try them with the patch?
> I would really appreciate that.
> In fact, it is an ask not only to Cristian, but to all other interested parties:
> if your app does use librte_sched - please try this patch and provide the feedback.
> If some tests would flag a regression - I am absolutely ok to drop the patch.
> Konstantin






^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure
  2021-11-17 10:33           ` Thomas Monjalon
@ 2021-11-17 10:53             ` Dumitrescu, Cristian
  0 siblings, 0 replies; 9+ messages in thread
From: Dumitrescu, Cristian @ 2021-11-17 10:53 UTC (permalink / raw)
  To: Thomas Monjalon, Singh, Jasvinder, Ananyev, Konstantin; +Cc: dev



> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Wednesday, November 17, 2021 10:33 AM
> To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure
> 
> Hi all,
> What is the conclusion for this patch and the number 1/2?
> 
> 04/04/2021 01:53, Ananyev, Konstantin:
> >
> > Hi guys,
> >
> > > > > In many usage scenarios input mbufs for rte_sched_port_enqueue()
> are
> > > > not
> > > > > yet in the CPU cache(s). That causes quite significant stalls due to
> memory
> > > > > latency. Current implementation tries to mitigate it using SW pipeline
> and
> > > > SW
> > > > > prefetch techniques, but stalls are still present.
> > > > > Rework rte_sched_port_enqueue() to do actual fetch of all mbufs
> > > > metadata
> > > > > as a first stage of that function.
> > > > > That helps to minimise load stalls at further stages of enqueue() and
> > > > > improves overall enqueue performance.
> > > > > With examples/qos_sched I observed:
> > > > > on ICX box: up to 30% cycles reduction
> > > > > on CSX AND BDX: 20-15% cycles reduction
> > > > > I also run tests with mbufs already in the cache (one core doing RX,
> QOS
> > > > and
> > > > > TX).
> > > > > With such scenario, on all mentioned above IA boxes no performance
> drop
> > > > > was observed.
> > > > >
> > > > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > > > > ---
> > > > > v2: fix clang and checkpatch complains
> > > > > ---
> > > > >  lib/librte_sched/rte_sched.c | 219 +++++------------------------------
> > > > >  1 file changed, 31 insertions(+), 188 deletions(-)
> > > > >
> > > > > diff --git a/lib/librte_sched/rte_sched.c
> b/lib/librte_sched/rte_sched.c
> > > > index
> > > > > 7c5688068..41ef147e0 100644
> > > > > --- a/lib/librte_sched/rte_sched.c
> > > > > +++ b/lib/librte_sched/rte_sched.c
> > > > > @@ -1861,24 +1861,23 @@ debug_check_queue_slab(struct
> > > > > rte_sched_subport *subport, uint32_t bmp_pos,  #endif /*
> > > > > RTE_SCHED_DEBUG */
> > > > >
> > > > >  static inline struct rte_sched_subport * -
> rte_sched_port_subport(struct
> > > > > rte_sched_port *port,
> > > > > -	struct rte_mbuf *pkt)
> > > > > +sched_port_subport(const struct rte_sched_port *port, struct
> > > > > +rte_mbuf_sched sch)
> > > > >  {
> > > > > -	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
> > > > > +	uint32_t queue_id = sch.queue_id;
> > > > >  	uint32_t subport_id = queue_id >> (port-
> > > > > >n_pipes_per_subport_log2 + 4);
> > > > >
> > > > >  	return port->subports[subport_id];
> > > > >  }
> > > > >
> > > > >  static inline uint32_t
> > > > > -rte_sched_port_enqueue_qptrs_prefetch0(struct
> rte_sched_subport
> > > > > *subport,
> > > > > -	struct rte_mbuf *pkt, uint32_t subport_qmask)
> > > > > +sched_port_enqueue_qptrs_prefetch0(const struct
> rte_sched_subport
> > > > > *subport,
> > > > > +	struct rte_mbuf_sched sch, uint32_t subport_qmask)
> > > > >  {
> > > > >  	struct rte_sched_queue *q;
> > > > >  #ifdef RTE_SCHED_COLLECT_STATS
> > > > >  	struct rte_sched_queue_extra *qe;
> > > > >  #endif
> > > > > -	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
> > > > > +	uint32_t qindex = sch.queue_id;
> > > > >  	uint32_t subport_queue_id = subport_qmask & qindex;
> > > > >
> > > > >  	q = subport->queue + subport_queue_id; @@ -1971,197
> +1970,41
> > > > > @@ int  rte_sched_port_enqueue(struct rte_sched_port *port,
> struct
> > > > > rte_mbuf **pkts,
> > > > >  		       uint32_t n_pkts)
> > > > >  {
> > > > > -	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20,
> *pkt21,
> > > > > -		*pkt30, *pkt31, *pkt_last;
> > > > > -	struct rte_mbuf **q00_base, **q01_base, **q10_base,
> > > > > **q11_base,
> > > > > -		**q20_base, **q21_base, **q30_base, **q31_base,
> > > > > **q_last_base;
> > > > > -	struct rte_sched_subport *subport00, *subport01,
> *subport10,
> > > > > *subport11,
> > > > > -		*subport20, *subport21, *subport30, *subport31,
> > > > > *subport_last;
> > > > > -	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
> > > > > -	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
> > > > > -	uint32_t subport_qmask;
> > > > >  	uint32_t result, i;
> > > > > +	struct rte_mbuf_sched sch[n_pkts];
> > > > > +	struct rte_sched_subport *subports[n_pkts];
> > > > > +	struct rte_mbuf **q_base[n_pkts];
> > > > > +	uint32_t q[n_pkts];
> > > > > +
> > > > > +	const uint32_t subport_qmask =
> > > > > +		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
> > > > >
> > > > >  	result = 0;
> > > > > -	subport_qmask = (1 << (port->n_pipes_per_subport_log2 +
> 4)) - 1;
> > > > >
> > > > > -	/*
> > > > > -	 * Less then 6 input packets available, which is not enough to
> > > > > -	 * feed the pipeline
> > > > > -	 */
> > > > > -	if (unlikely(n_pkts < 6)) {
> > > > > -		struct rte_sched_subport *subports[5];
> > > > > -		struct rte_mbuf **q_base[5];
> > > > > -		uint32_t q[5];
> > > > > -
> > > > > -		/* Prefetch the mbuf structure of each packet */
> > > > > -		for (i = 0; i < n_pkts; i++)
> > > > > -			rte_prefetch0(pkts[i]);
> > > > > -
> > > > > -		/* Prefetch the subport structure for each packet */
> > > > > -		for (i = 0; i < n_pkts; i++)
> > > > > -			subports[i] = rte_sched_port_subport(port,
> pkts[i]);
> > > > > -
> > > > > -		/* Prefetch the queue structure for each queue */
> > > > > -		for (i = 0; i < n_pkts; i++)
> > > > > -			q[i] =
> > > > > rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
> > > > > -					pkts[i], subport_qmask);
> > > > > -
> > > > > -		/* Prefetch the write pointer location of each queue
> */
> > > > > -		for (i = 0; i < n_pkts; i++) {
> > > > > -			q_base[i] =
> > > > > rte_sched_subport_pipe_qbase(subports[i], q[i]);
> > > > > -
> 	rte_sched_port_enqueue_qwa_prefetch0(port,
> > > > > subports[i],
> > > > > -				q[i], q_base[i]);
> > > > > -		}
> > > > > +	/* Prefetch the mbuf structure of each packet */
> > > > > +	for (i = 0; i < n_pkts; i++)
> > > > > +		sch[i] = pkts[i]->hash.sched;
> > > > >
> > > >
> > > > Hi Konstantin,  thanks for the patch. In above case, all packets are
> touched
> > > > straight without any prefetch. If we consider the input burst size of 64 pkts,
> it
> > > > means 512 bytes of packet addresses  (8 cache-lines) which is likely to
> be
> > > > available in cache. For larger size burst, e.g. 128 or 256, there might be
> > > > instances when some addresses are not available in the cache, which may stall
> core.
> > > > How about adding explicit prefetch before starting to iterate through
> the
> > > > packets if that helps?
> >
> > I don't think we need any prefetch here.
> > pkts[] is a sequential array, HW prefetcher should be able to do good job
> here.
> > Again in majority of use-cases pkts[] contents will already present in the
> cache.
> > Though there is a valid concern here: n_pkts can be big, in that case we
> probably
> > don't want to store too much on the stack and read too much from pkts[].
> > It is better to work in some fixed chunks (64 or so).
> > I can prepare v2 with these changes, if there still is an interest in this patch.
> >
> > > Exactly. Konstantin, you might not be a fan of prefetches, but the current
> enqueue implementation (as well as the dequeue) uses a prefetch
> > > state machine. Please keep the prefetch state machine in the scalar code.
> >
> > It is not about our own preferences.
> > From my measurements new version is faster and it is definitely simpler.
> >
> > > Even if the examples/qos_sched might not show an advantage,
> > > this is just a sample app and there are some more relevant use-cases as
> well.
> >
> > Well, I hope that examples/qos_sched reflects at least some real-world
> use-cases for QOS library.
> > Otherwise why do we have it inside DPDK codebase?
> > About 'more relevant use-cases': if you do know such, can you try them
> with the patch?
> > I would really appreciate that.
> > In fact, it is an ask not only to Cristian, but to all other interested parties:
> > if your app does use librte_sched - please try this patch and provide the
> feedback.
> > If some tests would flag a regression - I am absolutely ok to drop the patch.
> > Konstantin
> 
> 
> 
Hi Thomas,

We are not going ahead with this at this moment. 

Regards,
Cristian

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-11-17 10:53 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-16 17:07 [dpdk-dev] [PATCH 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
2021-03-16 17:07 ` [dpdk-dev] [PATCH 2/2] qos: rearrange enqueue procedure Konstantin Ananyev
2021-03-18 11:56 ` [dpdk-dev] [PATCH v2 1/2] examples/qos_sched: fixup colors value overrun Konstantin Ananyev
2021-03-18 11:56   ` [dpdk-dev] [PATCH v2 2/2] qos: rearrange enqueue procedure Konstantin Ananyev
2021-04-02 14:14     ` Singh, Jasvinder
2021-04-02 21:12       ` Dumitrescu, Cristian
2021-04-03 23:53         ` Ananyev, Konstantin
2021-11-17 10:33           ` Thomas Monjalon
2021-11-17 10:53             ` Dumitrescu, Cristian

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ https://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git