From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
Cc: cristian.dumitrescu@intel.com, jasvinder.singh@intel.com,
	Konstantin Ananyev <konstantin.ananyev@intel.com>
Date: Tue, 16 Mar 2021 17:07:23 +0000
Message-Id: <20210316170723.22036-2-konstantin.ananyev@intel.com>
X-Mailer: git-send-email 2.18.0
In-Reply-To: <20210316170723.22036-1-konstantin.ananyev@intel.com>
References: <20210316170723.22036-1-konstantin.ananyev@intel.com>
Subject: [dpdk-dev] [PATCH 2/2] qos: rearrange enqueue procedure

Rework rte_sched_port_enqueue() to do the actual fetch of all mbuf
metadata as the first stage of that function. That helps to avoid load
stalls at further stages of enqueue() and improves overall enqueue
performance.
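For illustration only (not part of the patch itself), here is a
stand-alone sketch of the reworked flow: copy the scheduling metadata
of every packet into a local array first, then run one plain loop per
stage. All names below (toy_pkt, toy_queue, toy_port, toy_enqueue,
TOY_QUEUES, TOY_QUEUE_SIZE) are made up for the example, and
__builtin_prefetch() stands in for rte_prefetch0():

#include <stdint.h>

#define TOY_QUEUES	64	/* must be a power of two */
#define TOY_QUEUE_SIZE	128

struct toy_pkt {
	uint32_t sched;		/* packed queue id, analogous to mbuf->hash.sched */
};

struct toy_queue {
	uint32_t write;				/* next free slot */
	struct toy_pkt *slots[TOY_QUEUE_SIZE];
};

struct toy_port {
	struct toy_queue queues[TOY_QUEUES];
};

static uint32_t
toy_enqueue(struct toy_port *port, struct toy_pkt **pkts, uint32_t n_pkts)
{
	uint32_t sched[n_pkts];		/* metadata snapshot, one load per packet */
	struct toy_queue *q[n_pkts];	/* resolved destination queues */
	uint32_t i, result = 0;

	/* Stage 0: fetch all per-packet metadata up front */
	for (i = 0; i < n_pkts; i++)
		sched[i] = pkts[i]->sched;

	/* Stage 1: resolve and prefetch the destination queue of each packet */
	for (i = 0; i < n_pkts; i++) {
		q[i] = &port->queues[sched[i] & (TOY_QUEUES - 1)];
		__builtin_prefetch(q[i]);
	}

	/* Stage 2: write each packet to its queue, dropping on overflow */
	for (i = 0; i < n_pkts; i++) {
		if (q[i]->write < TOY_QUEUE_SIZE) {
			q[i]->slots[q[i]->write++] = pkts[i];
			result++;
		}
	}

	return result;
}

The patch itself keeps the existing rte_sched_* helpers for the
per-stage work; only the unrolled software pipeline is replaced by
plain per-stage loops, as shown in the diff below.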
With examples/qos_sched I observed:
on ICX box: up to 30% cycles reduction
on CSX and BDX: 15-20% cycles reduction

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/librte_sched/rte_sched.c | 233 +++++++----------------------------
 1 file changed, 45 insertions(+), 188 deletions(-)

diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
index 7c56880681..f608617988 100644
--- a/lib/librte_sched/rte_sched.c
+++ b/lib/librte_sched/rte_sched.c
@@ -1861,24 +1861,29 @@ debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
 #endif /* RTE_SCHED_DEBUG */
 
 static inline struct rte_sched_subport *
-rte_sched_port_subport(struct rte_sched_port *port,
-	struct rte_mbuf *pkt)
+sched_port_subport(const struct rte_sched_port *port, struct rte_mbuf_sched sch)
 {
-	uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
+	uint32_t queue_id = sch.queue_id;
 	uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
 
 	return port->subports[subport_id];
 }
 
+static inline struct rte_sched_subport *
+rte_sched_port_subport(const struct rte_sched_port *port, struct rte_mbuf *pkt)
+{
+	return sched_port_subport(port, pkt->hash.sched);
+}
+
 static inline uint32_t
-rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
-	struct rte_mbuf *pkt, uint32_t subport_qmask)
+sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport *subport,
+	struct rte_mbuf_sched sch, uint32_t subport_qmask)
 {
 	struct rte_sched_queue *q;
 #ifdef RTE_SCHED_COLLECT_STATS
 	struct rte_sched_queue_extra *qe;
 #endif
-	uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
+	uint32_t qindex = sch.queue_id;
 	uint32_t subport_queue_id = subport_qmask & qindex;
 
 	q = subport->queue + subport_queue_id;
@@ -1891,6 +1896,14 @@ rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
 	return subport_queue_id;
 }
 
+static inline uint32_t
+rte_sched_port_enqueue_qptrs_prefetch0(const struct rte_sched_subport *subport,
+	struct rte_mbuf *pkt, uint32_t subport_qmask)
+{
+	return sched_port_enqueue_qptrs_prefetch0(subport, pkt->hash.sched,
+		subport_qmask);
+}
+
 static inline void
 rte_sched_port_enqueue_qwa_prefetch0(struct rte_sched_port *port,
 	struct rte_sched_subport *subport,
@@ -1971,197 +1984,41 @@ int
 rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
 	uint32_t n_pkts)
 {
-	struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
-		*pkt30, *pkt31, *pkt_last;
-	struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
-		**q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
-	struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
-		*subport20, *subport21, *subport30, *subport31, *subport_last;
-	uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
-	uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
-	uint32_t subport_qmask;
 	uint32_t result, i;
+	struct rte_mbuf_sched sch[n_pkts];
+	struct rte_sched_subport *subports[n_pkts];
+	struct rte_mbuf **q_base[n_pkts];
+	uint32_t q[n_pkts];
+
+	const uint32_t subport_qmask =
+		(1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
 
 	result = 0;
-	subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
 
-	/*
-	 * Less then 6 input packets available, which is not enough to
-	 * feed the pipeline
-	 */
-	if (unlikely(n_pkts < 6)) {
-		struct rte_sched_subport *subports[5];
-		struct rte_mbuf **q_base[5];
-		uint32_t q[5];
-
-		/* Prefetch the mbuf structure of each packet */
-		for (i = 0; i < n_pkts; i++)
-			rte_prefetch0(pkts[i]);
-
-		/* Prefetch the subport structure for each packet */
-		for (i = 0; i < n_pkts; i++)
-			subports[i] = rte_sched_port_subport(port, pkts[i]);
-
-		/* Prefetch the queue structure for each queue */
-		for (i = 0; i < n_pkts; i++)
-			q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
-					pkts[i], subport_qmask);
-
-		/* Prefetch the write pointer location of each queue */
-		for (i = 0; i < n_pkts; i++) {
-			q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
-			rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
-				q[i], q_base[i]);
-		}
+	/* Prefetch the mbuf structure of each packet */
+	for (i = 0; i < n_pkts; i++)
+		sch[i] = pkts[i]->hash.sched;
 
-		/* Write each packet to its queue */
-		for (i = 0; i < n_pkts; i++)
-			result += rte_sched_port_enqueue_qwa(port, subports[i],
-					q[i], q_base[i], pkts[i]);
+	/* Prefetch the subport structure for each packet */
+	for (i = 0; i < n_pkts; i++)
+		subports[i] = sched_port_subport(port, sch[i]);
 
-		return result;
-	}
+	/* Prefetch the queue structure for each queue */
+	for (i = 0; i < n_pkts; i++)
+		q[i] = sched_port_enqueue_qptrs_prefetch0(subports[i],
+				sch[i], subport_qmask);
 
-	/* Feed the first 3 stages of the pipeline (6 packets needed) */
-	pkt20 = pkts[0];
-	pkt21 = pkts[1];
-	rte_prefetch0(pkt20);
-	rte_prefetch0(pkt21);
-
-	pkt10 = pkts[2];
-	pkt11 = pkts[3];
-	rte_prefetch0(pkt10);
-	rte_prefetch0(pkt11);
-
-	subport20 = rte_sched_port_subport(port, pkt20);
-	subport21 = rte_sched_port_subport(port, pkt21);
-	q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
-			pkt20, subport_qmask);
-	q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
-			pkt21, subport_qmask);
-
-	pkt00 = pkts[4];
-	pkt01 = pkts[5];
-	rte_prefetch0(pkt00);
-	rte_prefetch0(pkt01);
-
-	subport10 = rte_sched_port_subport(port, pkt10);
-	subport11 = rte_sched_port_subport(port, pkt11);
-	q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
-			pkt10, subport_qmask);
-	q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
-			pkt11, subport_qmask);
-
-	q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
-	q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
-	/* Run the pipeline */
-	for (i = 6; i < (n_pkts & (~1)); i += 2) {
-		/* Propagate stage inputs */
-		pkt30 = pkt20;
-		pkt31 = pkt21;
-		pkt20 = pkt10;
-		pkt21 = pkt11;
-		pkt10 = pkt00;
-		pkt11 = pkt01;
-		q30 = q20;
-		q31 = q21;
-		q20 = q10;
-		q21 = q11;
-		subport30 = subport20;
-		subport31 = subport21;
-		subport20 = subport10;
-		subport21 = subport11;
-		q30_base = q20_base;
-		q31_base = q21_base;
-
-		/* Stage 0: Get packets in */
-		pkt00 = pkts[i];
-		pkt01 = pkts[i + 1];
-		rte_prefetch0(pkt00);
-		rte_prefetch0(pkt01);
-
-		/* Stage 1: Prefetch subport and queue structure storing queue pointers */
-		subport10 = rte_sched_port_subport(port, pkt10);
-		subport11 = rte_sched_port_subport(port, pkt11);
-		q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
-				pkt10, subport_qmask);
-		q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
-				pkt11, subport_qmask);
-
-		/* Stage 2: Prefetch queue write location */
-		q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
-		q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
-		rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
-		rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
-
-		/* Stage 3: Write packet to queue and activate queue */
-		r30 = rte_sched_port_enqueue_qwa(port, subport30,
-				q30, q30_base, pkt30);
-		r31 = rte_sched_port_enqueue_qwa(port, subport31,
-				q31, q31_base, pkt31);
-		result += r30 + r31;
-	}
-
-	/*
-	 * Drain the pipeline (exactly 6 packets).
-	 * Handle the last packet in the case
-	 * of an odd number of input packets.
-	 */
-	pkt_last = pkts[n_pkts - 1];
-	rte_prefetch0(pkt_last);
-
-	subport00 = rte_sched_port_subport(port, pkt00);
-	subport01 = rte_sched_port_subport(port, pkt01);
-	q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
-			pkt00, subport_qmask);
-	q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
-			pkt01, subport_qmask);
-
-	q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
-	q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
-
-	r20 = rte_sched_port_enqueue_qwa(port, subport20,
-			q20, q20_base, pkt20);
-	r21 = rte_sched_port_enqueue_qwa(port, subport21,
-			q21, q21_base, pkt21);
-	result += r20 + r21;
-
-	subport_last = rte_sched_port_subport(port, pkt_last);
-	q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
-			pkt_last, subport_qmask);
-
-	q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
-	q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
-
-	r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
-			q10_base, pkt10);
-	r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
-			q11_base, pkt11);
-	result += r10 + r11;
-
-	q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
-	rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
-			q_last, q_last_base);
-
-	r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
-			q00_base, pkt00);
-	r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
-			q01_base, pkt01);
-	result += r00 + r01;
-
-	if (n_pkts & 1) {
-		r_last = rte_sched_port_enqueue_qwa(port, subport_last,
-			q_last, q_last_base, pkt_last);
-		result += r_last;
+	/* Prefetch the write pointer location of each queue */
+	for (i = 0; i < n_pkts; i++) {
+		q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
+		rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
+				q[i], q_base[i]);
 	}
 
+	/* Write each packet to its queue */
+	for (i = 0; i < n_pkts; i++)
+		result += rte_sched_port_enqueue_qwa(port, subports[i],
+				q[i], q_base[i], pkts[i]);
 	return result;
 }
-- 
2.26.2