From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail.lysator.liu.se (mail.lysator.liu.se [130.236.254.3]) by dpdk.org (Postfix) with ESMTP id A0CA54CBB for ; Tue, 11 Sep 2018 10:03:16 +0200 (CEST) Received: from mail.lysator.liu.se (localhost [127.0.0.1]) by mail.lysator.liu.se (Postfix) with ESMTP id 51D034008D for ; Tue, 11 Sep 2018 10:03:16 +0200 (CEST) Received: by mail.lysator.liu.se (Postfix, from userid 1004) id 3B81040087; Tue, 11 Sep 2018 10:03:16 +0200 (CEST) X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on bernadotte.lysator.liu.se X-Spam-Level: X-Spam-Status: No, score=-0.9 required=5.0 tests=ALL_TRUSTED,AWL autolearn=disabled version=3.4.1 X-Spam-Score: -0.9 Received: from isengard.friendlyfire.se (host-90-232-156-190.mobileonline.telia.com [90.232.156.190]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.lysator.liu.se (Postfix) with ESMTPSA id A40F240080; Tue, 11 Sep 2018 10:03:12 +0200 (CEST) From: =?UTF-8?q?Mattias=20R=C3=B6nnblom?= To: jerin.jacob@caviumnetworks.com Cc: bruce.richardson@intel.com, dev@dpdk.org, =?UTF-8?q?Mattias=20R=C3=B6nnblom?= Date: Tue, 11 Sep 2018 10:02:12 +0200 Message-Id: <20180911080216.3017-7-mattias.ronnblom@ericsson.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20180911080216.3017-1-mattias.ronnblom@ericsson.com> References: <20180911080216.3017-1-mattias.ronnblom@ericsson.com> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-Virus-Scanned: ClamAV using ClamSMTP Subject: [dpdk-dev] [PATCH v3 06/10] event/dsw: add DSW port load measurements X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 11 Sep 2018 08:03:17 -0000 The DSW event device port now attempts to estimate its load (i.e. how busy it is). 
This is required for load balancing to work (although load balancing is not included in this patch), and may also be useful for debugging purposes. Signed-off-by: Mattias Rönnblom --- drivers/event/dsw/dsw_evdev.c | 14 +++++ drivers/event/dsw/dsw_evdev.h | 40 +++++++++++++ drivers/event/dsw/dsw_event.c | 109 ++++++++++++++++++++++++++++++++++ 3 files changed, 163 insertions(+) diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c index 40a7435be..bcfa17bab 100644 --- a/drivers/event/dsw/dsw_evdev.c +++ b/drivers/event/dsw/dsw_evdev.c @@ -4,6 +4,7 @@ #include +#include #include #include #include @@ -43,6 +44,11 @@ dsw_port_setup(struct rte_eventdev *dev, uint8_t port_id, port->in_ring = in_ring; + rte_atomic16_init(&port->load); + + port->load_update_interval = + (DSW_LOAD_UPDATE_INTERVAL * rte_get_timer_hz()) / US_PER_S; + dev->data->ports[port_id] = port; return 0; @@ -240,11 +246,19 @@ static int dsw_start(struct rte_eventdev *dev) { struct dsw_evdev *dsw = dsw_pmd_priv(dev); + uint16_t i; + uint64_t now; rte_atomic32_init(&dsw->credits_on_loan); initial_flow_to_port_assignment(dsw); + now = rte_get_timer_cycles(); + for (i = 0; i < dsw->num_ports; i++) { + dsw->ports[i].measurement_start = now; + dsw->ports[i].busy_start = now; + } + return 0; } diff --git a/drivers/event/dsw/dsw_evdev.h b/drivers/event/dsw/dsw_evdev.h index f8e94e4a4..a5399dda5 100644 --- a/drivers/event/dsw/dsw_evdev.h +++ b/drivers/event/dsw/dsw_evdev.h @@ -36,6 +36,15 @@ */ #define DSW_PARALLEL_FLOWS (1024) +/* 'Background tasks' are polling the control rings for + * migration-related messages, or flush the output buffer (so + * buffered events don't linger too long). Shouldn't be too low, + * since the system won't benefit from the 'batching' effects from + * the output buffer, and shouldn't be too high, since it will make + * buffered events linger too long in case the port goes idle. 
+ */ +#define DSW_MAX_PORT_OPS_PER_BG_TASK (128) + /* Avoid making small 'loans' from the central in-flight event credit * pool, to improve efficiency. */ @@ -50,6 +59,22 @@ */ #define DSW_IN_RING_SIZE (DSW_MAX_EVENTS) +#define DSW_MAX_LOAD (INT16_MAX) +#define DSW_LOAD_FROM_PERCENT(x) ((int16_t)(((x)*DSW_MAX_LOAD)/100)) +#define DSW_LOAD_TO_PERCENT(x) ((100*x)/DSW_MAX_LOAD) + +/* The thought behind keeping the load update interval shorter than + * the migration interval is that the load from newly migrated flows + * should 'show up' on the load measurement before new migrations are + * considered. This is to avoid having too many flows, from too many + * source ports, to be migrated too quickly to a lightly loaded port - + * in particular since this might cause the system to oscillate. + */ +#define DSW_LOAD_UPDATE_INTERVAL (DSW_MIGRATION_INTERVAL/4) +#define DSW_OLD_LOAD_WEIGHT (1) + +#define DSW_MIGRATION_INTERVAL (1000) + struct dsw_port { uint16_t id; @@ -71,10 +96,25 @@ struct dsw_port { uint16_t next_parallel_flow_id; + uint16_t ops_since_bg_task; + + uint64_t last_bg; + + /* For port load measurement. */ + uint64_t next_load_update; + uint64_t load_update_interval; + uint64_t measurement_start; + uint64_t busy_start; + uint64_t busy_cycles; + uint64_t total_busy_cycles; + uint16_t out_buffer_len[DSW_MAX_PORTS]; struct rte_event out_buffer[DSW_MAX_PORTS][DSW_MAX_PORT_OUT_BUFFER]; struct rte_event_ring *in_ring __rte_cache_aligned; + + /* Estimate of current port load. 
*/ + rte_atomic16_t load __rte_cache_aligned; } __rte_cache_aligned; struct dsw_queue { diff --git a/drivers/event/dsw/dsw_event.c b/drivers/event/dsw/dsw_event.c index 4a3af8ecd..f326147c9 100644 --- a/drivers/event/dsw/dsw_event.c +++ b/drivers/event/dsw/dsw_event.c @@ -7,6 +7,7 @@ #include #include +#include #include static bool @@ -75,6 +76,70 @@ dsw_port_return_credits(struct dsw_evdev *dsw, struct dsw_port *port, } } +static void +dsw_port_load_record(struct dsw_port *port, unsigned int dequeued) +{ + if (dequeued > 0 && port->busy_start == 0) + /* work period begins */ + port->busy_start = rte_get_timer_cycles(); + else if (dequeued == 0 && port->busy_start > 0) { + /* work period ends */ + uint64_t work_period = + rte_get_timer_cycles() - port->busy_start; + port->busy_cycles += work_period; + port->busy_start = 0; + } +} + +static int16_t +dsw_port_load_close_period(struct dsw_port *port, uint64_t now) +{ + uint64_t passed = now - port->measurement_start; + uint64_t busy_cycles = port->busy_cycles; + + if (port->busy_start > 0) { + busy_cycles += (now - port->busy_start); + port->busy_start = now; + } + + int16_t load = (DSW_MAX_LOAD * busy_cycles) / passed; + + port->measurement_start = now; + port->busy_cycles = 0; + + port->total_busy_cycles += busy_cycles; + + return load; +} + +static void +dsw_port_load_update(struct dsw_port *port, uint64_t now) +{ + int16_t old_load; + int16_t period_load; + int16_t new_load; + + old_load = rte_atomic16_read(&port->load); + + period_load = dsw_port_load_close_period(port, now); + + new_load = (period_load + old_load*DSW_OLD_LOAD_WEIGHT) / + (DSW_OLD_LOAD_WEIGHT+1); + + rte_atomic16_set(&port->load, new_load); +} + +static void +dsw_port_consider_load_update(struct dsw_port *port, uint64_t now) +{ + if (now < port->next_load_update) + return; + + port->next_load_update = now + port->load_update_interval; + + dsw_port_load_update(port, now); +} + static uint8_t dsw_schedule(struct dsw_evdev *dsw, uint8_t queue_id, 
uint16_t flow_hash) { @@ -196,6 +261,39 @@ dsw_port_buffer_event(struct dsw_evdev *dsw, struct dsw_port *source_port, dsw_port_buffer_non_paused(dsw, source_port, dest_port_id, event); } +static void +dsw_port_note_op(struct dsw_port *port, uint16_t num_events) +{ + /* To pull the control ring reasonably often on busy ports, + * each dequeued/enqueued event is considered an 'op' too. + */ + port->ops_since_bg_task += (num_events+1); +} + +static void +dsw_port_flush_out_buffers(struct dsw_evdev *dsw, struct dsw_port *source_port); + +static void +dsw_port_bg_process(struct dsw_evdev *dsw, struct dsw_port *port) +{ + if (unlikely(port->ops_since_bg_task >= DSW_MAX_PORT_OPS_PER_BG_TASK)) { + uint64_t now; + + now = rte_get_timer_cycles(); + + port->last_bg = now; + + /* Logic to avoid having events linger in the output + * buffer too long. + */ + dsw_port_flush_out_buffers(dsw, port); + + dsw_port_consider_load_update(port, now); + + port->ops_since_bg_task = 0; + } +} + static void dsw_port_flush_out_buffers(struct dsw_evdev *dsw, struct dsw_port *source_port) { @@ -225,6 +323,8 @@ dsw_event_enqueue_burst_generic(void *port, const struct rte_event events[], DSW_LOG_DP_PORT(DEBUG, source_port->id, "Attempting to enqueue %d " "events to port %d.\n", events_len, source_port->id); + dsw_port_bg_process(dsw, source_port); + /* XXX: For performance (=ring efficiency) reasons, the * scheduler relies on internal non-ring buffers instead of * immediately sending the event to the destination ring. For @@ -238,6 +338,7 @@ dsw_event_enqueue_burst_generic(void *port, const struct rte_event events[], * considered. 
*/ if (unlikely(events_len == 0)) { + dsw_port_note_op(source_port, DSW_MAX_PORT_OPS_PER_BG_TASK); dsw_port_flush_out_buffers(dsw, source_port); return 0; } @@ -245,6 +346,8 @@ dsw_event_enqueue_burst_generic(void *port, const struct rte_event events[], if (unlikely(events_len > source_port->enqueue_depth)) events_len = source_port->enqueue_depth; + dsw_port_note_op(source_port, events_len); + if (!op_types_known) for (i = 0; i < events_len; i++) { switch (events[i].op) { @@ -337,6 +440,8 @@ dsw_event_dequeue_burst(void *port, struct rte_event *events, uint16_t num, source_port->pending_releases = 0; + dsw_port_bg_process(dsw, source_port); + if (unlikely(num > source_port->dequeue_depth)) num = source_port->dequeue_depth; @@ -344,6 +449,10 @@ dsw_event_dequeue_burst(void *port, struct rte_event *events, uint16_t num, source_port->pending_releases = dequeued; + dsw_port_load_record(source_port, dequeued); + + dsw_port_note_op(source_port, dequeued); + if (dequeued > 0) { DSW_LOG_DP_PORT(DEBUG, source_port->id, "Dequeued %d events.\n", dequeued); -- 2.17.1