From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: <dev-bounces@dpdk.org> Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 8B4E4A0547; Thu, 28 Oct 2021 12:20:49 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 748BD41144; Thu, 28 Oct 2021 12:20:48 +0200 (CEST) Received: from mga12.intel.com (mga12.intel.com [192.55.52.136]) by mails.dpdk.org (Postfix) with ESMTP id D0C604067B for <dev@dpdk.org>; Thu, 28 Oct 2021 12:20:41 +0200 (CEST) X-IronPort-AV: E=McAfee;i="6200,9189,10150"; a="210449395" X-IronPort-AV: E=Sophos;i="5.87,189,1631602800"; d="scan'208";a="210449395" Received: from fmsmga004.fm.intel.com ([10.253.24.48]) by fmsmga106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 28 Oct 2021 03:18:09 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.87,189,1631602800"; d="scan'208";a="555889323" Received: from silpixa00400629.ir.intel.com ([10.237.213.30]) by fmsmga004.fm.intel.com with ESMTP; 28 Oct 2021 03:18:07 -0700 From: "Liguzinski, WojciechX" <wojciechx.liguzinski@intel.com> To: dev@dpdk.org, jasvinder.singh@intel.com, cristian.dumitrescu@intel.com Cc: megha.ajmera@intel.com Date: Thu, 28 Oct 2021 10:17:58 +0000 Message-Id: <20211028101802.4127295-2-wojciechx.liguzinski@intel.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20211028101802.4127295-1-wojciechx.liguzinski@intel.com> References: <20211025113208.3910951-1-wojciechx.liguzinski@intel.com> <20211028101802.4127295-1-wojciechx.liguzinski@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [dpdk-dev] [PATCH v19 1/5] sched: add PIE based congestion management X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions <dev.dpdk.org> List-Unsubscribe: <https://mails.dpdk.org/options/dev>, <mailto:dev-request@dpdk.org?subject=unsubscribe> List-Archive: <http://mails.dpdk.org/archives/dev/> List-Post: <mailto:dev@dpdk.org> List-Help: <mailto:dev-request@dpdk.org?subject=help> List-Subscribe: <https://mails.dpdk.org/listinfo/dev>, <mailto:dev-request@dpdk.org?subject=subscribe> Errors-To: dev-bounces@dpdk.org Sender: "dev" <dev-bounces@dpdk.org> Implement PIE based congestion management based on rfc8033 Signed-off-by: Liguzinski, WojciechX <wojciechx.liguzinski@intel.com> Acked-by: Cristian Dumitrescu <cristian.dumitrescu@intel.com> Acked-by: Jasvinder Singh <jasvinder.singh@intel.com> -- Changes in V19: - ACKs included in patches Changes in V18: - Resolved merge conflict in lib/sched/meson.build after rebasing ontop of main - Reverted whitespace change in app_thread.c - comment from Stephen Hemminger Changes in V17: - Corrected paragraph link naming in qos_framework.rst to fix CI builds Changes in V16: - Fixed 'title underline too short' error in qos_framework.rst - Applied __rte_unused macro to parameters in rte_sched_port_pie_dequeue() --- drivers/net/softnic/rte_eth_softnic_tm.c | 6 +- lib/sched/meson.build | 3 +- lib/sched/rte_pie.c | 82 +++++ lib/sched/rte_pie.h | 393 +++++++++++++++++++++++ lib/sched/rte_sched.c | 241 +++++++++----- lib/sched/rte_sched.h | 63 +++- lib/sched/version.map | 4 + 7 files changed, 702 insertions(+), 90 deletions(-) create mode 100644 lib/sched/rte_pie.c create mode 100644 lib/sched/rte_pie.h diff --git a/drivers/net/softnic/rte_eth_softnic_tm.c b/drivers/net/softnic/rte_eth_softnic_tm.c index 90baba15ce..e74092ce7f 100644 --- a/drivers/net/softnic/rte_eth_softnic_tm.c +++ b/drivers/net/softnic/rte_eth_softnic_tm.c @@ -420,7 +420,7 @@ pmd_tm_node_type_get(struct rte_eth_dev *dev, return 0; } -#ifdef RTE_SCHED_RED +#ifdef RTE_SCHED_CMAN #define WRED_SUPPORTED 1 #else #define WRED_SUPPORTED 0 @@ -2306,7 +2306,7 @@ tm_tc_wred_profile_get(struct rte_eth_dev *dev, uint32_t tc_id) return NULL; } -#ifdef RTE_SCHED_RED +#ifdef RTE_SCHED_CMAN static void wred_profiles_set(struct rte_eth_dev *dev, uint32_t subport_id) @@ -2321,7 +2321,7 @@ wred_profiles_set(struct rte_eth_dev *dev, uint32_t subport_id) for (tc_id = 0; tc_id < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; tc_id++) for (color = RTE_COLOR_GREEN; color < RTE_COLORS; color++) { struct rte_red_params *dst = - &pp->red_params[tc_id][color]; + &pp->cman_params->red_params[tc_id][color]; struct tm_wred_profile *src_wp = tm_tc_wred_profile_get(dev, tc_id); struct rte_tm_red_params *src = diff --git a/lib/sched/meson.build b/lib/sched/meson.build index 8ced4547aa..df75db51ed 100644 --- a/lib/sched/meson.build +++ b/lib/sched/meson.build @@ -7,11 +7,12 @@ if is_windows subdir_done() endif -sources = files('rte_sched.c', 'rte_red.c', 'rte_approx.c') +sources = files('rte_sched.c', 'rte_red.c', 'rte_approx.c', 'rte_pie.c') headers = files( 'rte_approx.h', 'rte_red.h', 'rte_sched.h', 'rte_sched_common.h', + 'rte_pie.h', ) deps += ['mbuf', 'meter'] diff --git a/lib/sched/rte_pie.c b/lib/sched/rte_pie.c new file mode 100644 index 0000000000..2fcecb2db4 --- /dev/null +++ b/lib/sched/rte_pie.c @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#include <stdlib.h> + +#include "rte_pie.h" +#include <rte_common.h> +#include <rte_cycles.h> +#include <rte_malloc.h> + +#ifdef __INTEL_COMPILER +#pragma warning(disable:2259) /* conversion may lose significant bits */ +#endif + +void +rte_pie_rt_data_init(struct rte_pie *pie) +{ + if (pie == NULL) { + /* Allocate memory to use the PIE data structure */ + pie = rte_malloc(NULL, sizeof(struct rte_pie), 0); + + if (pie == NULL) + RTE_LOG(ERR, SCHED, "%s: Memory allocation fails\n", __func__); + } + + pie->active = 0; + pie->in_measurement = 0; + pie->departed_bytes_count = 0; + pie->start_measurement = 0; + pie->last_measurement = 0; + pie->qlen = 0; + pie->avg_dq_time = 0; + pie->burst_allowance = 0; + pie->qdelay_old = 0; + pie->drop_prob = 0; + pie->accu_prob = 0; +} + +int +rte_pie_config_init(struct rte_pie_config *pie_cfg, + const uint16_t qdelay_ref, + const uint16_t dp_update_interval, + const uint16_t max_burst, + const uint16_t tailq_th) +{ + uint64_t tsc_hz = rte_get_tsc_hz(); + + if (pie_cfg == NULL) + return -1; + + if (qdelay_ref <= 0) { + RTE_LOG(ERR, SCHED, + "%s: Incorrect value for qdelay_ref\n", __func__); + return -EINVAL; + } + + if (dp_update_interval <= 0) { + RTE_LOG(ERR, SCHED, + "%s: Incorrect value for dp_update_interval\n", __func__); + return -EINVAL; + } + + if (max_burst <= 0) { + RTE_LOG(ERR, SCHED, + "%s: Incorrect value for max_burst\n", __func__); + return -EINVAL; + } + + if (tailq_th <= 0) { + RTE_LOG(ERR, SCHED, + "%s: Incorrect value for tailq_th\n", __func__); + return -EINVAL; + } + + pie_cfg->qdelay_ref = (tsc_hz * qdelay_ref) / 1000; + pie_cfg->dp_update_interval = (tsc_hz * dp_update_interval) / 1000; + pie_cfg->max_burst = (tsc_hz * max_burst) / 1000; + pie_cfg->tailq_th = tailq_th; + + return 0; +} diff --git a/lib/sched/rte_pie.h b/lib/sched/rte_pie.h new file mode 100644 index 0000000000..f83c95664f --- /dev/null +++ b/lib/sched/rte_pie.h @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#ifndef __RTE_PIE_H_INCLUDED__ +#define __RTE_PIE_H_INCLUDED__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Proportional Integral controller Enhanced (PIE) + * + * + ***/ + +#include <stdint.h> + +#include <rte_random.h> +#include <rte_debug.h> + +#define RTE_DQ_THRESHOLD 16384 /**< Queue length threshold (2^14) + * to start measurement cycle (bytes) + */ +#define RTE_DQ_WEIGHT 0.25 /**< Weight (RTE_DQ_THRESHOLD/2^16) to compute dequeue rate */ +#define RTE_ALPHA 0.125 /**< Weights in drop probability calculations */ +#define RTE_BETA 1.25 /**< Weights in drop probability calculations */ +#define RTE_RAND_MAX ~0LLU /**< Max value of the random number */ + + +/** + * PIE configuration parameters passed by user + * + */ +struct rte_pie_params { + uint16_t qdelay_ref; /**< Latency Target (milliseconds) */ + uint16_t dp_update_interval; /**< Update interval for drop probability (milliseconds) */ + uint16_t max_burst; /**< Max Burst Allowance (milliseconds) */ + uint16_t tailq_th; /**< Tailq drop threshold (packet counts) */ +}; + +/** + * PIE configuration parameters + * + */ +struct rte_pie_config { + uint64_t qdelay_ref; /**< Latency Target (in CPU cycles.) */ + uint64_t dp_update_interval; /**< Update interval for drop probability (in CPU cycles) */ + uint64_t max_burst; /**< Max Burst Allowance (in CPU cycles.) */ + uint16_t tailq_th; /**< Tailq drop threshold (packet counts) */ +}; + +/** + * RED run-time data + */ +struct rte_pie { + uint16_t active; /**< Flag for activating/deactivating pie */ + uint16_t in_measurement; /**< Flag for activation of measurement cycle */ + uint32_t departed_bytes_count; /**< Number of bytes departed in current measurement cycle */ + uint64_t start_measurement; /**< Time to start to measurement cycle (in cpu cycles) */ + uint64_t last_measurement; /**< Time of last measurement (in cpu cycles) */ + uint64_t qlen; /**< Queue length (packets count) */ + uint64_t qlen_bytes; /**< Queue length (bytes count) */ + uint64_t avg_dq_time; /**< Time averaged dequeue rate (in cpu cycles) */ + uint32_t burst_allowance; /**< Current burst allowance (bytes) */ + uint64_t qdelay_old; /**< Old queue delay (bytes) */ + double drop_prob; /**< Current packet drop probability */ + double accu_prob; /**< Accumulated packet drop probability */ +}; + +/** + * @brief Initialises run-time data + * + * @param pie [in,out] data pointer to PIE runtime data + */ +void +__rte_experimental +rte_pie_rt_data_init(struct rte_pie *pie); + +/** + * @brief Configures a single PIE configuration parameter structure. + * + * @param pie_cfg [in,out] config pointer to a PIE configuration parameter structure + * @param qdelay_ref [in] latency target(milliseconds) + * @param dp_update_interval [in] update interval for drop probability (milliseconds) + * @param max_burst [in] maximum burst allowance (milliseconds) + * @param tailq_th [in] tail drop threshold for the queue (number of packets) + * + * @return Operation status + * @retval 0 success + * @retval !0 error + */ +int +__rte_experimental +rte_pie_config_init(struct rte_pie_config *pie_cfg, + const uint16_t qdelay_ref, + const uint16_t dp_update_interval, + const uint16_t max_burst, + const uint16_t tailq_th); + +/** + * @brief Decides packet enqueue when queue is empty + * + * Note: packet is never dropped in this particular case. + * + * @param pie_cfg [in] config pointer to a PIE configuration parameter structure + * @param pie [in, out] data pointer to PIE runtime data + * @param pkt_len [in] packet length in bytes + * + * @return Operation status + * @retval 0 enqueue the packet + * @retval !0 drop the packet + */ +static inline int +__rte_experimental +rte_pie_enqueue_empty(const struct rte_pie_config *pie_cfg, + struct rte_pie *pie, + uint32_t pkt_len) +{ + RTE_ASSERT(pkt_len != NULL); + + /* Update the PIE qlen parameter */ + pie->qlen++; + pie->qlen_bytes += pkt_len; + + /** + * If the queue has been idle for a while, turn off PIE and Reset counters + */ + if ((pie->active == 1) && + (pie->qlen < (pie_cfg->tailq_th * 0.1))) { + pie->active = 0; + pie->in_measurement = 0; + } + + return 0; +} + +/** + * @brief make a decision to drop or enqueue a packet based on probability + * criteria + * + * @param pie_cfg [in] config pointer to a PIE configuration parameter structure + * @param pie [in, out] data pointer to PIE runtime data + * @param time [in] current time (measured in cpu cycles) + */ +static inline void +__rte_experimental +_calc_drop_probability(const struct rte_pie_config *pie_cfg, + struct rte_pie *pie, uint64_t time) +{ + uint64_t qdelay_ref = pie_cfg->qdelay_ref; + + /* Note: can be implemented using integer multiply. + * DQ_THRESHOLD is power of 2 value. + */ + double current_qdelay = pie->qlen * (pie->avg_dq_time / RTE_DQ_THRESHOLD); + + double p = RTE_ALPHA * (current_qdelay - qdelay_ref) + + RTE_BETA * (current_qdelay - pie->qdelay_old); + + if (pie->drop_prob < 0.000001) + p = p * 0.00048828125; /* (1/2048) = 0.00048828125 */ + else if (pie->drop_prob < 0.00001) + p = p * 0.001953125; /* (1/512) = 0.001953125 */ + else if (pie->drop_prob < 0.0001) + p = p * 0.0078125; /* (1/128) = 0.0078125 */ + else if (pie->drop_prob < 0.001) + p = p * 0.03125; /* (1/32) = 0.03125 */ + else if (pie->drop_prob < 0.01) + p = p * 0.125; /* (1/8) = 0.125 */ + else if (pie->drop_prob < 0.1) + p = p * 0.5; /* (1/2) = 0.5 */ + + if (pie->drop_prob >= 0.1 && p > 0.02) + p = 0.02; + + pie->drop_prob += p; + + double qdelay = qdelay_ref * 0.5; + + /* Exponentially decay drop prob when congestion goes away */ + if (current_qdelay < qdelay && pie->qdelay_old < qdelay) + pie->drop_prob *= 0.98; /* 1 - 1/64 is sufficient */ + + /* Bound drop probability */ + if (pie->drop_prob < 0) + pie->drop_prob = 0; + if (pie->drop_prob > 1) + pie->drop_prob = 1; + + pie->qdelay_old = current_qdelay; + pie->last_measurement = time; + + uint64_t burst_allowance = pie->burst_allowance - pie_cfg->dp_update_interval; + + pie->burst_allowance = (burst_allowance > 0) ? burst_allowance : 0; +} + +/** + * @brief make a decision to drop or enqueue a packet based on probability + * criteria + * + * @param pie_cfg [in] config pointer to a PIE configuration parameter structure + * @param pie [in, out] data pointer to PIE runtime data + * + * @return operation status + * @retval 0 enqueue the packet + * @retval 1 drop the packet + */ +static inline int +__rte_experimental +_rte_pie_drop(const struct rte_pie_config *pie_cfg, + struct rte_pie *pie) +{ + uint64_t rand_value; + double qdelay = pie_cfg->qdelay_ref * 0.5; + + /* PIE is active but the queue is not congested: return 0 */ + if (((pie->qdelay_old < qdelay) && (pie->drop_prob < 0.2)) || + (pie->qlen <= (pie_cfg->tailq_th * 0.1))) + return 0; + + if (pie->drop_prob == 0) + pie->accu_prob = 0; + + /* For practical reasons, drop probability can be further scaled according + * to packet size, but one needs to set a bound to avoid unnecessary bias + * Random drop + */ + pie->accu_prob += pie->drop_prob; + + if (pie->accu_prob < 0.85) + return 0; + + if (pie->accu_prob >= 8.5) + return 1; + + rand_value = rte_rand()/RTE_RAND_MAX; + + if ((double)rand_value < pie->drop_prob) { + pie->accu_prob = 0; + return 1; + } + + /* No drop */ + return 0; +} + +/** + * @brief Decides if new packet should be enqeued or dropped for non-empty queue + * + * @param pie_cfg [in] config pointer to a PIE configuration parameter structure + * @param pie [in,out] data pointer to PIE runtime data + * @param pkt_len [in] packet length in bytes + * @param time [in] current time (measured in cpu cycles) + * + * @return Operation status + * @retval 0 enqueue the packet + * @retval 1 drop the packet based on max threshold criterion + * @retval 2 drop the packet based on mark probability criterion + */ +static inline int +__rte_experimental +rte_pie_enqueue_nonempty(const struct rte_pie_config *pie_cfg, + struct rte_pie *pie, + uint32_t pkt_len, + const uint64_t time) +{ + /* Check queue space against the tail drop threshold */ + if (pie->qlen >= pie_cfg->tailq_th) { + + pie->accu_prob = 0; + return 1; + } + + if (pie->active) { + /* Update drop probability after certain interval */ + if ((time - pie->last_measurement) >= pie_cfg->dp_update_interval) + _calc_drop_probability(pie_cfg, pie, time); + + /* Decide whether packet to be dropped or enqueued */ + if (_rte_pie_drop(pie_cfg, pie) && pie->burst_allowance == 0) + return 2; + } + + /* When queue occupancy is over a certain threshold, turn on PIE */ + if ((pie->active == 0) && + (pie->qlen >= (pie_cfg->tailq_th * 0.1))) { + pie->active = 1; + pie->qdelay_old = 0; + pie->drop_prob = 0; + pie->in_measurement = 1; + pie->departed_bytes_count = 0; + pie->avg_dq_time = 0; + pie->last_measurement = time; + pie->burst_allowance = pie_cfg->max_burst; + pie->accu_prob = 0; + pie->start_measurement = time; + } + + /* when queue has been idle for a while, turn off PIE and Reset counters */ + if (pie->active == 1 && + pie->qlen < (pie_cfg->tailq_th * 0.1)) { + pie->active = 0; + pie->in_measurement = 0; + } + + /* Update PIE qlen parameter */ + pie->qlen++; + pie->qlen_bytes += pkt_len; + + /* No drop */ + return 0; +} + +/** + * @brief Decides if new packet should be enqeued or dropped + * Updates run time data and gives verdict whether to enqueue or drop the packet. + * + * @param pie_cfg [in] config pointer to a PIE configuration parameter structure + * @param pie [in,out] data pointer to PIE runtime data + * @param qlen [in] queue length + * @param pkt_len [in] packet length in bytes + * @param time [in] current time stamp (measured in cpu cycles) + * + * @return Operation status + * @retval 0 enqueue the packet + * @retval 1 drop the packet based on drop probility criteria + */ +static inline int +__rte_experimental +rte_pie_enqueue(const struct rte_pie_config *pie_cfg, + struct rte_pie *pie, + const unsigned int qlen, + uint32_t pkt_len, + const uint64_t time) +{ + RTE_ASSERT(pie_cfg != NULL); + RTE_ASSERT(pie != NULL); + + if (qlen != 0) + return rte_pie_enqueue_nonempty(pie_cfg, pie, pkt_len, time); + else + return rte_pie_enqueue_empty(pie_cfg, pie, pkt_len); +} + +/** + * @brief PIE rate estimation method + * Called on each packet departure. + * + * @param pie [in] data pointer to PIE runtime data + * @param pkt_len [in] packet length in bytes + * @param time [in] current time stamp in cpu cycles + */ +static inline void +__rte_experimental +rte_pie_dequeue(struct rte_pie *pie, + uint32_t pkt_len, + uint64_t time) +{ + /* Dequeue rate estimation */ + if (pie->in_measurement) { + pie->departed_bytes_count += pkt_len; + + /* Start a new measurement cycle when enough packets */ + if (pie->departed_bytes_count >= RTE_DQ_THRESHOLD) { + uint64_t dq_time = time - pie->start_measurement; + + if (pie->avg_dq_time == 0) + pie->avg_dq_time = dq_time; + else + pie->avg_dq_time = dq_time * RTE_DQ_WEIGHT + pie->avg_dq_time + * (1 - RTE_DQ_WEIGHT); + + pie->in_measurement = 0; + } + } + + /* Start measurement cycle when enough data in the queue */ + if ((pie->qlen_bytes >= RTE_DQ_THRESHOLD) && (pie->in_measurement == 0)) { + pie->in_measurement = 1; + pie->start_measurement = time; + pie->departed_bytes_count = 0; + } +} + +#ifdef __cplusplus +} +#endif + +#endif /* __RTE_PIE_H_INCLUDED__ */ diff --git a/lib/sched/rte_sched.c b/lib/sched/rte_sched.c index a858f61f95..0db5335bb6 100644 --- a/lib/sched/rte_sched.c +++ b/lib/sched/rte_sched.c @@ -89,8 +89,12 @@ struct rte_sched_queue { struct rte_sched_queue_extra { struct rte_sched_queue_stats stats; -#ifdef RTE_SCHED_RED - struct rte_red red; +#ifdef RTE_SCHED_CMAN + RTE_STD_C11 + union { + struct rte_red red; + struct rte_pie pie; + }; #endif }; @@ -183,8 +187,14 @@ struct rte_sched_subport { /* Pipe queues size */ uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; -#ifdef RTE_SCHED_RED - struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS]; +#ifdef RTE_SCHED_CMAN + enum rte_sched_cman_mode cman; + + RTE_STD_C11 + union { + struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS]; + struct rte_pie_config pie_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + }; #endif /* Scheduling loop detection */ @@ -1078,6 +1088,90 @@ rte_sched_free_memory(struct rte_sched_port *port, uint32_t n_subports) rte_free(port); } +#ifdef RTE_SCHED_CMAN +static int +rte_sched_red_config(struct rte_sched_port *port, + struct rte_sched_subport *s, + struct rte_sched_subport_params *params, + uint32_t n_subports) +{ + uint32_t i; + + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + + uint32_t j; + + for (j = 0; j < RTE_COLORS; j++) { + /* if min/max are both zero, then RED is disabled */ + if ((params->cman_params->red_params[i][j].min_th | + params->cman_params->red_params[i][j].max_th) == 0) { + continue; + } + + if (rte_red_config_init(&s->red_config[i][j], + params->cman_params->red_params[i][j].wq_log2, + params->cman_params->red_params[i][j].min_th, + params->cman_params->red_params[i][j].max_th, + params->cman_params->red_params[i][j].maxp_inv) != 0) { + rte_sched_free_memory(port, n_subports); + + RTE_LOG(NOTICE, SCHED, + "%s: RED configuration init fails\n", __func__); + return -EINVAL; + } + } + } + s->cman = RTE_SCHED_CMAN_RED; + return 0; +} + +static int +rte_sched_pie_config(struct rte_sched_port *port, + struct rte_sched_subport *s, + struct rte_sched_subport_params *params, + uint32_t n_subports) +{ + uint32_t i; + + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + if (params->cman_params->pie_params[i].tailq_th > params->qsize[i]) { + RTE_LOG(NOTICE, SCHED, + "%s: PIE tailq threshold incorrect\n", __func__); + return -EINVAL; + } + + if (rte_pie_config_init(&s->pie_config[i], + params->cman_params->pie_params[i].qdelay_ref, + params->cman_params->pie_params[i].dp_update_interval, + params->cman_params->pie_params[i].max_burst, + params->cman_params->pie_params[i].tailq_th) != 0) { + rte_sched_free_memory(port, n_subports); + + RTE_LOG(NOTICE, SCHED, + "%s: PIE configuration init fails\n", __func__); + return -EINVAL; + } + } + s->cman = RTE_SCHED_CMAN_PIE; + return 0; +} + +static int +rte_sched_cman_config(struct rte_sched_port *port, + struct rte_sched_subport *s, + struct rte_sched_subport_params *params, + uint32_t n_subports) +{ + if (params->cman_params->cman_mode == RTE_SCHED_CMAN_RED) + return rte_sched_red_config(port, s, params, n_subports); + + else if (params->cman_params->cman_mode == RTE_SCHED_CMAN_PIE) + return rte_sched_pie_config(port, s, params, n_subports); + + return -EINVAL; +} +#endif + int rte_sched_subport_config(struct rte_sched_port *port, uint32_t subport_id, @@ -1167,29 +1261,11 @@ rte_sched_subport_config(struct rte_sched_port *port, s->n_pipe_profiles = params->n_pipe_profiles; s->n_max_pipe_profiles = params->n_max_pipe_profiles; -#ifdef RTE_SCHED_RED - for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { - uint32_t j; - - for (j = 0; j < RTE_COLORS; j++) { - /* if min/max are both zero, then RED is disabled */ - if ((params->red_params[i][j].min_th | - params->red_params[i][j].max_th) == 0) { - continue; - } - - if (rte_red_config_init(&s->red_config[i][j], - params->red_params[i][j].wq_log2, - params->red_params[i][j].min_th, - params->red_params[i][j].max_th, - params->red_params[i][j].maxp_inv) != 0) { - RTE_LOG(NOTICE, SCHED, - "%s: RED configuration init fails\n", - __func__); - ret = -EINVAL; - goto out; - } - } +#ifdef RTE_SCHED_CMAN + status = rte_sched_cman_config(port, s, params, n_subports); + if (status) { + RTE_LOG(NOTICE, SCHED, "%s: CMAN configuration fails\n", __func__); + return status; } #endif @@ -1718,30 +1794,19 @@ rte_sched_port_update_subport_stats(struct rte_sched_port *port, subport->stats.n_bytes_tc[tc_index] += pkt_len; } -#ifdef RTE_SCHED_RED -static inline void -rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port, - struct rte_sched_subport *subport, - uint32_t qindex, - struct rte_mbuf *pkt, - uint32_t red) -#else static inline void rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port, struct rte_sched_subport *subport, uint32_t qindex, struct rte_mbuf *pkt, - __rte_unused uint32_t red) -#endif + __rte_unused uint32_t n_pkts_cman_dropped) { uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex); uint32_t pkt_len = pkt->pkt_len; subport->stats.n_pkts_tc_dropped[tc_index] += 1; subport->stats.n_bytes_tc_dropped[tc_index] += pkt_len; -#ifdef RTE_SCHED_RED - subport->stats.n_pkts_red_dropped[tc_index] += red; -#endif + subport->stats.n_pkts_cman_dropped[tc_index] += n_pkts_cman_dropped; } static inline void @@ -1756,73 +1821,93 @@ rte_sched_port_update_queue_stats(struct rte_sched_subport *subport, qe->stats.n_bytes += pkt_len; } -#ifdef RTE_SCHED_RED -static inline void -rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport, - uint32_t qindex, - struct rte_mbuf *pkt, - uint32_t red) -#else static inline void rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport, uint32_t qindex, struct rte_mbuf *pkt, - __rte_unused uint32_t red) -#endif + __rte_unused uint32_t n_pkts_cman_dropped) { struct rte_sched_queue_extra *qe = subport->queue_extra + qindex; uint32_t pkt_len = pkt->pkt_len; qe->stats.n_pkts_dropped += 1; qe->stats.n_bytes_dropped += pkt_len; -#ifdef RTE_SCHED_RED - qe->stats.n_pkts_red_dropped += red; +#ifdef RTE_SCHED_CMAN + qe->stats.n_pkts_cman_dropped += n_pkts_cman_dropped; #endif } #endif /* RTE_SCHED_COLLECT_STATS */ -#ifdef RTE_SCHED_RED +#ifdef RTE_SCHED_CMAN static inline int -rte_sched_port_red_drop(struct rte_sched_port *port, +rte_sched_port_cman_drop(struct rte_sched_port *port, struct rte_sched_subport *subport, struct rte_mbuf *pkt, uint32_t qindex, uint16_t qlen) { struct rte_sched_queue_extra *qe; - struct rte_red_config *red_cfg; - struct rte_red *red; uint32_t tc_index; - enum rte_color color; tc_index = rte_sched_port_pipe_tc(port, qindex); - color = rte_sched_port_pkt_read_color(pkt); - red_cfg = &subport->red_config[tc_index][color]; + qe = subport->queue_extra + qindex; - if ((red_cfg->min_th | red_cfg->max_th) == 0) - return 0; + /* RED */ + if (subport->cman == RTE_SCHED_CMAN_RED) { + struct rte_red_config *red_cfg; + struct rte_red *red; + enum rte_color color; - qe = subport->queue_extra + qindex; - red = &qe->red; + color = rte_sched_port_pkt_read_color(pkt); + red_cfg = &subport->red_config[tc_index][color]; + + if ((red_cfg->min_th | red_cfg->max_th) == 0) + return 0; + + red = &qe->red; + + return rte_red_enqueue(red_cfg, red, qlen, port->time); + } - return rte_red_enqueue(red_cfg, red, qlen, port->time); + /* PIE */ + struct rte_pie_config *pie_cfg = &subport->pie_config[tc_index]; + struct rte_pie *pie = &qe->pie; + + return rte_pie_enqueue(pie_cfg, pie, qlen, pkt->pkt_len, port->time_cpu_cycles); } static inline void -rte_sched_port_set_queue_empty_timestamp(struct rte_sched_port *port, +rte_sched_port_red_set_queue_empty_timestamp(struct rte_sched_port *port, struct rte_sched_subport *subport, uint32_t qindex) { struct rte_sched_queue_extra *qe = subport->queue_extra + qindex; - struct rte_red *red = &qe->red; + if (subport->cman == RTE_SCHED_CMAN_RED) { + struct rte_red *red = &qe->red; - rte_red_mark_queue_empty(red, port->time); + rte_red_mark_queue_empty(red, port->time); + } +} + +static inline void +rte_sched_port_pie_dequeue(struct rte_sched_subport *subport, +uint32_t qindex, uint32_t pkt_len, uint64_t time) { + if (subport->cman == RTE_SCHED_CMAN_PIE) { + struct rte_sched_queue_extra *qe = subport->queue_extra + qindex; + struct rte_pie *pie = &qe->pie; + + /* Update queue length */ + pie->qlen -= 1; + pie->qlen_bytes -= pkt_len; + + rte_pie_dequeue(pie, pkt_len, time); + } } #else -static inline int rte_sched_port_red_drop(struct rte_sched_port *port __rte_unused, +static inline int rte_sched_port_cman_drop(struct rte_sched_port *port __rte_unused, struct rte_sched_subport *subport __rte_unused, struct rte_mbuf *pkt __rte_unused, uint32_t qindex __rte_unused, @@ -1831,9 +1916,17 @@ static inline int rte_sched_port_red_drop(struct rte_sched_port *port __rte_unus return 0; } -#define rte_sched_port_set_queue_empty_timestamp(port, subport, qindex) +#define rte_sched_port_red_set_queue_empty_timestamp(port, subport, qindex) + +static inline void +rte_sched_port_pie_dequeue(struct rte_sched_subport *subport __rte_unused, + uint32_t qindex __rte_unused, + uint32_t pkt_len __rte_unused, + uint64_t time __rte_unused) { + /* do-nothing when RTE_SCHED_CMAN not defined */ +} -#endif /* RTE_SCHED_RED */ +#endif /* RTE_SCHED_CMAN */ #ifdef RTE_SCHED_DEBUG @@ -1929,7 +2022,7 @@ rte_sched_port_enqueue_qwa(struct rte_sched_port *port, qlen = q->qw - q->qr; /* Drop the packet (and update drop stats) when queue is full */ - if (unlikely(rte_sched_port_red_drop(port, subport, pkt, qindex, qlen) || + if (unlikely(rte_sched_port_cman_drop(port, subport, pkt, qindex, qlen) || (qlen >= qsize))) { rte_pktmbuf_free(pkt); #ifdef RTE_SCHED_COLLECT_STATS @@ -2402,6 +2495,7 @@ grinder_schedule(struct rte_sched_port *port, { struct rte_sched_grinder *grinder = subport->grinder + pos; struct rte_sched_queue *queue = grinder->queue[grinder->qpos]; + uint32_t qindex = grinder->qindex[grinder->qpos]; struct rte_mbuf *pkt = grinder->pkt; uint32_t pkt_len = pkt->pkt_len + port->frame_overhead; uint32_t be_tc_active; @@ -2421,15 +2515,16 @@ grinder_schedule(struct rte_sched_port *port, (pkt_len * grinder->wrr_cost[grinder->qpos]) & be_tc_active; if (queue->qr == queue->qw) { - uint32_t qindex = grinder->qindex[grinder->qpos]; - rte_bitmap_clear(subport->bmp, qindex); grinder->qmask &= ~(1 << grinder->qpos); if (be_tc_active) grinder->wrr_mask[grinder->qpos] = 0; - rte_sched_port_set_queue_empty_timestamp(port, subport, qindex); + + rte_sched_port_red_set_queue_empty_timestamp(port, subport, qindex); } + rte_sched_port_pie_dequeue(subport, qindex, pkt_len, port->time_cpu_cycles); + /* Reset pipe loop detection */ subport->pipe_loop = RTE_SCHED_PIPE_INVALID; grinder->productive = 1; diff --git a/lib/sched/rte_sched.h b/lib/sched/rte_sched.h index cb851301e9..9381b253b7 100644 --- a/lib/sched/rte_sched.h +++ b/lib/sched/rte_sched.h @@ -61,9 +61,10 @@ extern "C" { #include <rte_mbuf.h> #include <rte_meter.h> -/** Random Early Detection (RED) */ -#ifdef RTE_SCHED_RED +/** Congestion Management */ +#ifdef RTE_SCHED_CMAN #include "rte_red.h" +#include "rte_pie.h" #endif /** Maximum number of queues per pipe. @@ -110,6 +111,28 @@ extern "C" { #define RTE_SCHED_FRAME_OVERHEAD_DEFAULT 24 #endif +/** + * Congestion Management (CMAN) mode + * + * This is used for controlling the admission of packets into a packet queue or + * group of packet queues on congestion. + * + * The *Random Early Detection (RED)* algorithm works by proactively dropping + * more and more input packets as the queue occupancy builds up. When the queue + * is full or almost full, RED effectively works as *tail drop*. The *Weighted + * RED* algorithm uses a separate set of RED thresholds for each packet color. + * + * Similar to RED, Proportional Integral Controller Enhanced (PIE) randomly + * drops a packet at the onset of the congestion and tries to control the + * latency around the target value. The congestion detection, however, is based + * on the queueing latency instead of the queue length like RED. For more + * information, refer RFC8033. + */ +enum rte_sched_cman_mode { + RTE_SCHED_CMAN_RED, /**< Random Early Detection (RED) */ + RTE_SCHED_CMAN_PIE, /**< Proportional Integral Controller Enhanced (PIE) */ +}; + /* * Pipe configuration parameters. The period and credits_per_period * parameters are measured in bytes, with one byte meaning the time @@ -139,6 +162,24 @@ struct rte_sched_pipe_params { uint8_t wrr_weights[RTE_SCHED_BE_QUEUES_PER_PIPE]; }; +#ifdef RTE_SCHED_CMAN +/* + * Congestion Management configuration parameters. + */ +struct rte_sched_cman_params { + /** Congestion Management mode */ + enum rte_sched_cman_mode cman_mode; + + union { + /** RED parameters */ + struct rte_red_params red_params[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS]; + + /** PIE parameters */ + struct rte_pie_params pie_params[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + }; +}; +#endif + /* * Subport configuration parameters. The period and credits_per_period * parameters are measured in bytes, with one byte meaning the time @@ -174,9 +215,9 @@ struct rte_sched_subport_params { /** Max allowed profiles in the pipe profile table */ uint32_t n_max_pipe_profiles; -#ifdef RTE_SCHED_RED - /** RED parameters */ - struct rte_red_params red_params[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS]; +#ifdef RTE_SCHED_CMAN + /** Congestion Management parameters */ + struct rte_sched_cman_params *cman_params; #endif }; @@ -208,10 +249,8 @@ struct rte_sched_subport_stats { /** Number of bytes dropped for each traffic class */ uint64_t n_bytes_tc_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; -#ifdef RTE_SCHED_RED - /** Number of packets dropped by red */ - uint64_t n_pkts_red_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; -#endif + /** Number of packets dropped by congestion management scheme */ + uint64_t n_pkts_cman_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; }; /** Queue statistics */ @@ -222,10 +261,8 @@ struct rte_sched_queue_stats { /** Packets dropped */ uint64_t n_pkts_dropped; -#ifdef RTE_SCHED_RED - /** Packets dropped by RED */ - uint64_t n_pkts_red_dropped; -#endif + /** Packets dropped by congestion management scheme */ + uint64_t n_pkts_cman_dropped; /** Bytes successfully written */ uint64_t n_bytes; diff --git a/lib/sched/version.map b/lib/sched/version.map index a6e505c8ac..d22c07fc9f 100644 --- a/lib/sched/version.map +++ b/lib/sched/version.map @@ -30,4 +30,8 @@ EXPERIMENTAL { # added in 20.11 rte_sched_port_subport_profile_add; + + # added in 21.11 + rte_pie_rt_data_init; + rte_pie_config_init; }; -- 2.25.1