* [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
@ 2017-08-23 14:47 Gabriel Carrillo
2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo
` (4 more replies)
0 siblings, 5 replies; 30+ messages in thread
From: Gabriel Carrillo @ 2017-08-23 14:47 UTC (permalink / raw)
To: rsanford; +Cc: dev
In the current implementation of the DPDK timer library, timers can be
created and set to be handled by a target lcore by adding it to a
skiplist that corresponds to that lcore. However, if an application
enables multiple lcores, and each of these lcores repeatedly attempts
to install timers on the same target lcore, overall application
throughput will be reduced as all lcores contend to acquire the lock
guarding the single skiplist of pending timers.
This patchset addresses this scenario by adding an array of skiplists
to each lcore's priv_timer struct, such that when lcore i installs a
timer on lcore k, the timer will be added to the ith skiplist for
lcore k. If lcore j installs a timer on lcore k simultaneously,
lcores i and j can both proceed since they will be acquiring different
locks for different lists.
When lcore k processes its pending timers, it will traverse each skiplist
in its array and acquire a skiplist's lock while a run list is broken
out; meanwhile, all other lists can continue to be modified. Then, all
run lists for lcore k are collected and traversed together so timers are
executed in their global order.
Gabriel Carrillo (3):
timer: add per-installer pending lists for each lcore
timer: handle timers installed from non-EAL threads
doc: update timer lib docs
doc/guides/prog_guide/timer_lib.rst | 19 ++-
lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++-------------
lib/librte_timer/rte_timer.h | 9 +-
3 files changed, 231 insertions(+), 126 deletions(-)
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore
2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo
@ 2017-08-23 14:47 ` Gabriel Carrillo
2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
2017-08-29 15:11 ` [dpdk-dev] [PATCH " Stephen Hemminger
2017-08-23 14:47 ` [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads Gabriel Carrillo
` (3 subsequent siblings)
4 siblings, 2 replies; 30+ messages in thread
From: Gabriel Carrillo @ 2017-08-23 14:47 UTC (permalink / raw)
To: rsanford; +Cc: dev
Instead of each priv_timer struct containing a single skiplist, this
commit adds a skiplist for each enabled lcore to priv_timer. In the case
that multiple lcores repeatedly install timers on the same target lcore,
this change reduces lock contention for the target lcore's skiplists and
increases performance.
Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com>
---
lib/librte_timer/rte_timer.c | 309 +++++++++++++++++++++++++++----------------
lib/librte_timer/rte_timer.h | 9 +-
2 files changed, 202 insertions(+), 116 deletions(-)
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 5ee0840..2db74c1 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -37,6 +37,7 @@
#include <inttypes.h>
#include <assert.h>
#include <sys/queue.h>
+#include <stdbool.h>
#include <rte_atomic.h>
#include <rte_common.h>
@@ -56,17 +57,19 @@
LIST_HEAD(rte_timer_list, rte_timer);
+struct skiplist {
+ struct rte_timer head; /**< dummy timer instance to head up list */
+ rte_spinlock_t lock; /**< lock to protect list access */
+ unsigned depth; /**< track the current depth of the skiplist */
+} __rte_cache_aligned;
+
struct priv_timer {
- struct rte_timer pending_head; /**< dummy timer instance to head up list */
- rte_spinlock_t list_lock; /**< lock to protect list access */
+ /** one pending list per enabled lcore */
+ struct skiplist pending_lists[RTE_MAX_LCORE];
/** per-core variable that true if a timer was updated on this
* core since last reset of the variable */
int updated;
-
- /** track the current depth of the skiplist */
- unsigned curr_skiplist_depth;
-
unsigned prev_lcore; /**< used for lcore round robin */
/** running timer on this lcore now */
@@ -81,6 +84,10 @@ struct priv_timer {
/** per-lcore private info for timers */
static struct priv_timer priv_timer[RTE_MAX_LCORE];
+/** cache of IDs of enabled lcores */
+static unsigned enabled_lcores[RTE_MAX_LCORE];
+static int n_enabled_lcores;
+
/* when debug is enabled, store some statistics */
#ifdef RTE_LIBRTE_TIMER_DEBUG
#define __TIMER_STAT_ADD(name, n) do { \
@@ -96,14 +103,25 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE];
void
rte_timer_subsystem_init(void)
{
- unsigned lcore_id;
+ unsigned lcore_id1, lcore_id2;
+ struct skiplist *list;
+ int i, j;
+
+ RTE_LCORE_FOREACH(lcore_id1)
+ enabled_lcores[n_enabled_lcores++] = lcore_id1;
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
- for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
- rte_spinlock_init(&priv_timer[lcore_id].list_lock);
- priv_timer[lcore_id].prev_lcore = lcore_id;
+ for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores;
+ lcore_id1 = enabled_lcores[++i]) {
+ priv_timer[lcore_id1].prev_lcore = lcore_id1;
+
+ for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores;
+ lcore_id2 = enabled_lcores[++j]) {
+ list = &priv_timer[lcore_id1].pending_lists[lcore_id2];
+ rte_spinlock_init(&list->lock);
+ }
}
}
@@ -114,7 +132,8 @@ rte_timer_init(struct rte_timer *tim)
union rte_timer_status status;
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.installer = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
tim->status.u32 = status.u32;
}
@@ -142,7 +161,7 @@ timer_set_config_state(struct rte_timer *tim,
* or ready to run on local core, exit
*/
if (prev_status.state == RTE_TIMER_RUNNING &&
- (prev_status.owner != (uint16_t)lcore_id ||
+ (prev_status.owner != (int)lcore_id ||
tim != priv_timer[lcore_id].running_tim))
return -1;
@@ -153,7 +172,8 @@ timer_set_config_state(struct rte_timer *tim,
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_CONFIG;
- status.owner = (int16_t)lcore_id;
+ status.installer = RTE_TIMER_NO_LCORE;
+ status.owner = lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
@@ -185,7 +205,8 @@ timer_set_running_state(struct rte_timer *tim)
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_RUNNING;
- status.owner = (int16_t)lcore_id;
+ status.installer = prev_status.installer;
+ status.owner = lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
@@ -236,11 +257,11 @@ timer_get_skiplist_level(unsigned curr_depth)
* are <= that time value.
*/
static void
-timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
+timer_get_prev_entries(uint64_t time_val, struct skiplist *list,
struct rte_timer **prev)
{
- unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
- prev[lvl] = &priv_timer[tim_lcore].pending_head;
+ unsigned lvl = list->depth;
+ prev[lvl] = &list->head;
while(lvl != 0) {
lvl--;
prev[lvl] = prev[lvl+1];
@@ -255,15 +276,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
* all skiplist levels.
*/
static void
-timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
+timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list,
struct rte_timer **prev)
{
int i;
/* to get a specific entry in the list, look for just lower than the time
* values, and then increment on each level individually if necessary
*/
- timer_get_prev_entries(tim->expire - 1, tim_lcore, prev);
- for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
+ timer_get_prev_entries(tim->expire - 1, list, prev);
+ for (i = list->depth - 1; i >= 0; i--) {
while (prev[i]->sl_next[i] != NULL &&
prev[i]->sl_next[i] != tim &&
prev[i]->sl_next[i]->expire <= tim->expire)
@@ -282,25 +303,25 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
unsigned lcore_id = rte_lcore_id();
unsigned lvl;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+ struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id];
/* if timer needs to be scheduled on another core, we need to
* lock the list; if it is on local core, we need to lock if
* we are not called from rte_timer_manage() */
if (tim_lcore != lcore_id || !local_is_locked)
- rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
+ rte_spinlock_lock(&list->lock);
/* find where exactly this element goes in the list of elements
* for each depth. */
- timer_get_prev_entries(tim->expire, tim_lcore, prev);
+ timer_get_prev_entries(tim->expire, list, prev);
/* now assign it a new level and add at that level */
- const unsigned tim_level = timer_get_skiplist_level(
- priv_timer[tim_lcore].curr_skiplist_depth);
- if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
- priv_timer[tim_lcore].curr_skiplist_depth++;
+ const unsigned tim_level = timer_get_skiplist_level(list->depth);
+ if (tim_level == list->depth)
+ list->depth++;
lvl = tim_level;
- while (lvl > 0) {
+ while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) {
tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
prev[lvl]->sl_next[lvl] = tim;
lvl--;
@@ -310,11 +331,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
/* save the lowest list entry into the expire field of the dummy hdr
* NOTE: this is not atomic on 32-bit*/
- priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\
- pending_head.sl_next[0]->expire;
+ list->head.expire = list->head.sl_next[0]->expire;
if (tim_lcore != lcore_id || !local_is_locked)
- rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
+ rte_spinlock_unlock(&list->lock);
}
/*
@@ -330,35 +350,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
unsigned prev_owner = prev_status.owner;
int i;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+ struct skiplist *list;
+
+ list = &priv_timer[prev_owner].pending_lists[prev_status.installer];
/* if timer needs is pending another core, we need to lock the
* list; if it is on local core, we need to lock if we are not
* called from rte_timer_manage() */
if (prev_owner != lcore_id || !local_is_locked)
- rte_spinlock_lock(&priv_timer[prev_owner].list_lock);
+ rte_spinlock_lock(&list->lock);
/* save the lowest list entry into the expire field of the dummy hdr.
* NOTE: this is not atomic on 32-bit */
- if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
- priv_timer[prev_owner].pending_head.expire =
- ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);
+ if (tim == list->head.sl_next[0])
+ list->head.expire = ((tim->sl_next[0] == NULL) ?
+ 0 : tim->sl_next[0]->expire);
/* adjust pointers from previous entries to point past this */
- timer_get_prev_entries_for_node(tim, prev_owner, prev);
- for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
+ timer_get_prev_entries_for_node(tim, list, prev);
+ for (i = list->depth - 1; i >= 0; i--) {
if (prev[i]->sl_next[i] == tim)
prev[i]->sl_next[i] = tim->sl_next[i];
}
/* in case we deleted last entry at a level, adjust down max level */
- for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
- if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
- priv_timer[prev_owner].curr_skiplist_depth --;
+ for (i = list->depth - 1; i >= 0; i--)
+ if (list->head.sl_next[i] == NULL)
+ list->depth--;
else
break;
if (prev_owner != lcore_id || !local_is_locked)
- rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
+ rte_spinlock_unlock(&list->lock);
}
/* Reset and start the timer associated with the timer handle (private func) */
@@ -416,7 +439,8 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
* the state so we don't need to use cmpset() here */
rte_wmb();
status.state = RTE_TIMER_PENDING;
- status.owner = (int16_t)tim_lcore;
+ status.installer = lcore_id;
+ status.owner = tim_lcore;
tim->status.u32 = status.u32;
return 0;
@@ -484,7 +508,8 @@ rte_timer_stop(struct rte_timer *tim)
/* mark timer as stopped */
rte_wmb();
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.installer = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
tim->status.u32 = status.u32;
return 0;
@@ -506,119 +531,179 @@ rte_timer_pending(struct rte_timer *tim)
}
/* must be called periodically, run all timer that expired */
-void rte_timer_manage(void)
+void
+rte_timer_manage(void)
{
union rte_timer_status status;
- struct rte_timer *tim, *next_tim;
- struct rte_timer *run_first_tim, **pprev;
- unsigned lcore_id = rte_lcore_id();
+ struct rte_timer *tim, *next_tim, **pprev;
+ struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1];
struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
- uint64_t cur_time;
- int i, ret;
+ struct priv_timer *priv_tim;
+ unsigned installer_lcore, lcore_id = rte_lcore_id();
+ uint64_t cur_time, min_expire;
+ int i, j, min_idx, ret;
+ int nrunlists = 0;
+ int local_is_locked = 0;
+ struct skiplist *dest_list, *list = NULL;
+ bool done;
/* timer manager only runs on EAL thread with valid lcore_id */
assert(lcore_id < RTE_MAX_LCORE);
+ priv_tim = &priv_timer[lcore_id];
+
__TIMER_STAT_ADD(manage, 1);
- /* optimize for the case where per-cpu list is empty */
- if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
- return;
- cur_time = rte_get_timer_cycles();
+ for (i = 0, installer_lcore = enabled_lcores[i]; i < n_enabled_lcores;
+ installer_lcore = enabled_lcores[++i]) {
+ list = &priv_tim->pending_lists[installer_lcore];
+
+ /* optimize for the case where list is empty */
+ if (list->head.sl_next[0] == NULL)
+ continue;
+ cur_time = rte_get_timer_cycles();
#ifdef RTE_ARCH_X86_64
- /* on 64-bit the value cached in the pending_head.expired will be
- * updated atomically, so we can consult that for a quick check here
- * outside the lock */
- if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
- return;
+ /* on 64-bit the value cached in the list->head.expired will be
+ * updated atomically, so we can consult that for a quick check
+ * here outside the lock
+ */
+ if (likely(list->head.expire > cur_time))
+ continue;
#endif
- /* browse ordered list, add expired timers in 'expired' list */
- rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
+ /* browse ordered list, add expired timers in 'expired' list */
+ rte_spinlock_lock(&list->lock);
- /* if nothing to do just unlock and return */
- if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
- priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
- return;
- }
+ /* if nothing to do just unlock and continue */
+ if (list->head.sl_next[0] == NULL ||
+ list->head.sl_next[0]->expire > cur_time) {
+ rte_spinlock_unlock(&list->lock);
+ continue;
+ }
- /* save start of list of expired timers */
- tim = priv_timer[lcore_id].pending_head.sl_next[0];
+ /* save start of list of expired timers */
+ tim = list->head.sl_next[0];
+
+ /* break the existing list at current time point */
+ timer_get_prev_entries(cur_time, list, prev);
+ for (j = list->depth - 1; j >= 0; j--) {
+ if (prev[j] == &list->head)
+ continue;
+ list->head.sl_next[j] =
+ prev[j]->sl_next[j];
+ if (prev[j]->sl_next[j] == NULL)
+ list->depth--;
+ prev[j]->sl_next[j] = NULL;
+ }
- /* break the existing list at current time point */
- timer_get_prev_entries(cur_time, lcore_id, prev);
- for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) {
- if (prev[i] == &priv_timer[lcore_id].pending_head)
- continue;
- priv_timer[lcore_id].pending_head.sl_next[i] =
- prev[i]->sl_next[i];
- if (prev[i]->sl_next[i] == NULL)
- priv_timer[lcore_id].curr_skiplist_depth--;
- prev[i] ->sl_next[i] = NULL;
- }
+ /* transition run-list from PENDING to RUNNING */
+ run_first_tims[nrunlists] = tim;
+ pprev = &run_first_tims[nrunlists];
+ nrunlists++;
+
+ for (; tim != NULL; tim = next_tim) {
+ next_tim = tim->sl_next[0];
+
+ ret = timer_set_running_state(tim);
+ if (likely(ret == 0)) {
+ pprev = &tim->sl_next[0];
+ } else {
+ /* another core is trying to re-config this one,
+ * remove it from local expired list
+ */
+ *pprev = next_tim;
+ }
+ }
- /* transition run-list from PENDING to RUNNING */
- run_first_tim = tim;
- pprev = &run_first_tim;
+ /* update the next to expire timer value */
+ list->head.expire = (list->head.sl_next[0] == NULL) ?
+ 0 : list->head.sl_next[0]->expire;
- for ( ; tim != NULL; tim = next_tim) {
- next_tim = tim->sl_next[0];
+ rte_spinlock_unlock(&list->lock);
+ }
- ret = timer_set_running_state(tim);
- if (likely(ret == 0)) {
- pprev = &tim->sl_next[0];
- } else {
- /* another core is trying to re-config this one,
- * remove it from local expired list
- */
- *pprev = next_tim;
+ /* Now process the run lists */
+ while (1) {
+ done = true;
+ min_expire = UINT64_MAX;
+ min_idx = 0;
+
+ /* Find the next oldest timer to process */
+ for (i = 0; i < nrunlists; i++) {
+ tim = run_first_tims[i];
+
+ if (tim != NULL && tim->expire < min_expire) {
+ min_expire = tim->expire;
+ min_idx = i;
+ done = false;
+ }
}
- }
- /* update the next to expire timer value */
- priv_timer[lcore_id].pending_head.expire =
- (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
- priv_timer[lcore_id].pending_head.sl_next[0]->expire;
+ if (done)
+ break;
+
+ tim = run_first_tims[min_idx];
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+ /* Move down the runlist from which we picked a timer to
+ * execute
+ */
+ run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0];
- /* now scan expired list and call callbacks */
- for (tim = run_first_tim; tim != NULL; tim = next_tim) {
- next_tim = tim->sl_next[0];
- priv_timer[lcore_id].updated = 0;
- priv_timer[lcore_id].running_tim = tim;
+ priv_tim->updated = 0;
+ priv_tim->running_tim = tim;
/* execute callback function with list unlocked */
tim->f(tim, tim->arg);
__TIMER_STAT_ADD(pending, -1);
/* the timer was stopped or reloaded by the callback
- * function, we have nothing to do here */
- if (priv_timer[lcore_id].updated == 1)
+ * function, we have nothing to do here
+ */
+ if (priv_tim->updated == 1)
continue;
if (tim->period == 0) {
/* remove from done list and mark timer as stopped */
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.installer = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
rte_wmb();
tim->status.u32 = status.u32;
}
else {
- /* keep it in list and mark timer as pending */
- rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
+ dest_list = &priv_tim->pending_lists[lcore_id];
+
+ /* If the destination list is the current list
+ * we can acquire the lock here, and hold it
+ * across removal and insertion of the timer.
+ */
+ if (list == dest_list) {
+ rte_spinlock_lock(&list->lock);
+ local_is_locked = 1;
+ }
+
+ /* set timer state back to PENDING and
+ * reinsert it in pending list
+ */
status.state = RTE_TIMER_PENDING;
__TIMER_STAT_ADD(pending, 1);
- status.owner = (int16_t)lcore_id;
+ status.installer = tim->status.installer;
+ status.owner = lcore_id;
rte_wmb();
tim->status.u32 = status.u32;
- __rte_timer_reset(tim, tim->expire + tim->period,
- tim->period, lcore_id, tim->f, tim->arg, 1);
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+
+ __rte_timer_reset(tim,
+ tim->expire + tim->period,
+ tim->period, lcore_id,
+ tim->f, tim->arg, local_is_locked);
+
+ if (local_is_locked) {
+ rte_spinlock_unlock(&list->lock);
+ local_is_locked = 0;
+ }
}
}
- priv_timer[lcore_id].running_tim = NULL;
+ priv_tim->running_tim = NULL;
}
/* dump statistics about timers */
diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h
index a276a73..4cc6baf 100644
--- a/lib/librte_timer/rte_timer.h
+++ b/lib/librte_timer/rte_timer.h
@@ -77,7 +77,7 @@ extern "C" {
#define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */
#define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */
-#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */
+#define RTE_TIMER_NO_LCORE -2
/**
* Timer type: Periodic or single (one-shot).
@@ -94,10 +94,11 @@ enum rte_timer_type {
union rte_timer_status {
RTE_STD_C11
struct {
- uint16_t state; /**< Stop, pending, running, config. */
- int16_t owner; /**< The lcore that owns the timer. */
+ unsigned state : 2;
+ int installer : 15;
+ int owner : 15;
};
- uint32_t u32; /**< To atomic-set status + owner. */
+ uint32_t u32; /**< To atomic-set status, installer, owner */
};
#ifdef RTE_LIBRTE_TIMER_DEBUG
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads
2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo
2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo
@ 2017-08-23 14:47 ` Gabriel Carrillo
2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
2017-08-23 14:47 ` [dpdk-dev] [PATCH 3/3] doc: update timer lib docs Gabriel Carrillo
` (2 subsequent siblings)
4 siblings, 1 reply; 30+ messages in thread
From: Gabriel Carrillo @ 2017-08-23 14:47 UTC (permalink / raw)
To: rsanford; +Cc: dev
This commit adds support for timers being created from
non-EAL threads; it maps timers from all such threads to
lcore id RTE_MAX_LCORE, and puts them all in a corresponding
skiplist.
Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com>
---
lib/librte_timer/rte_timer.c | 48 ++++++++++++++++++++++++++++++--------------
1 file changed, 33 insertions(+), 15 deletions(-)
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 2db74c1..a493874 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -64,8 +64,8 @@ struct skiplist {
} __rte_cache_aligned;
struct priv_timer {
- /** one pending list per enabled lcore */
- struct skiplist pending_lists[RTE_MAX_LCORE];
+ /** one pending list per lcore, plus one for non-EAL threads */
+ struct skiplist pending_lists[RTE_MAX_LCORE + 1];
/** per-core variable that true if a timer was updated on this
* core since last reset of the variable */
@@ -85,7 +85,7 @@ struct priv_timer {
static struct priv_timer priv_timer[RTE_MAX_LCORE];
/** cache of IDs of enabled lcores */
-static unsigned enabled_lcores[RTE_MAX_LCORE];
+static unsigned enabled_lcores[RTE_MAX_LCORE + 1];
static int n_enabled_lcores;
/* when debug is enabled, store some statistics */
@@ -103,23 +103,33 @@ static int n_enabled_lcores;
void
rte_timer_subsystem_init(void)
{
- unsigned lcore_id1, lcore_id2;
+ unsigned target_lcore, installer_lcore;
struct skiplist *list;
+ struct priv_timer *priv_tim;
int i, j;
- RTE_LCORE_FOREACH(lcore_id1)
- enabled_lcores[n_enabled_lcores++] = lcore_id1;
+ RTE_LCORE_FOREACH(target_lcore)
+ enabled_lcores[n_enabled_lcores++] = target_lcore;
+
+ /* To handle timers coming from non-EAL threads */
+ enabled_lcores[n_enabled_lcores++] = RTE_MAX_LCORE;
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
- for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores;
- lcore_id1 = enabled_lcores[++i]) {
- priv_timer[lcore_id1].prev_lcore = lcore_id1;
+ for (i = 0, target_lcore = enabled_lcores[i]; i < n_enabled_lcores;
+ target_lcore = enabled_lcores[++i]) {
+ /* Don't use this value to index the priv_timer array */
+ if (target_lcore == RTE_MAX_LCORE)
+ continue;
- for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores;
- lcore_id2 = enabled_lcores[++j]) {
- list = &priv_timer[lcore_id1].pending_lists[lcore_id2];
+ priv_tim = &priv_timer[target_lcore];
+ priv_tim->prev_lcore = target_lcore;
+
+ for (j = 0, installer_lcore = enabled_lcores[j];
+ j < n_enabled_lcores;
+ installer_lcore = enabled_lcores[++j]) {
+ list = &priv_tim->pending_lists[installer_lcore];
rte_spinlock_init(&list->lock);
}
}
@@ -300,10 +310,16 @@ timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list,
static void
timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
{
- unsigned lcore_id = rte_lcore_id();
+ unsigned installer_lcore, lcore_id = rte_lcore_id();
unsigned lvl;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
- struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id];
+ struct skiplist *list;
+
+ /* Check if timer being installed from non-EAL thread */
+ installer_lcore = (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE :
+ lcore_id;
+
+ list = &priv_timer[tim_lcore].pending_lists[installer_lcore];
/* if timer needs to be scheduled on another core, we need to
* lock the list; if it is on local core, we need to lock if
@@ -439,7 +455,9 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
* the state so we don't need to use cmpset() here */
rte_wmb();
status.state = RTE_TIMER_PENDING;
- status.installer = lcore_id;
+ /* Check if installer is non-EAL thread */
+ status.installer = (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE :
+ lcore_id;
status.owner = tim_lcore;
tim->status.u32 = status.u32;
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH 3/3] doc: update timer lib docs
2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo
2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo
2017-08-23 14:47 ` [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads Gabriel Carrillo
@ 2017-08-23 14:47 ` Gabriel Carrillo
2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
2017-08-23 15:02 ` [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Wiles, Keith
2017-08-25 20:26 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
4 siblings, 1 reply; 30+ messages in thread
From: Gabriel Carrillo @ 2017-08-23 14:47 UTC (permalink / raw)
To: rsanford; +Cc: dev
This change updates the timer library documentation to
reflect a change to the organization of the skiplists
in the implementation.
Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com>
---
doc/guides/prog_guide/timer_lib.rst | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/doc/guides/prog_guide/timer_lib.rst b/doc/guides/prog_guide/timer_lib.rst
index f437417..f94ffaa 100644
--- a/doc/guides/prog_guide/timer_lib.rst
+++ b/doc/guides/prog_guide/timer_lib.rst
@@ -1,5 +1,5 @@
.. BSD LICENSE
- Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -53,16 +53,17 @@ Refer to the `callout manual <http://www.daemon-systems.org/man/callout.9.html>`
Implementation Details
----------------------
-Timers are tracked on a per-lcore basis,
-with all pending timers for a core being maintained in order of timer expiry in a skiplist data structure.
-The skiplist used has ten levels and each entry in the table appears in each level with probability ¼^level.
+Timers are tracked in a per-lcore array of skiplist data structures; each
+lcore has one skiplist corresponding to each other lcore that could load a timer on it. All pending
+timers in each skiplist are maintained in order of timer expiry.
+Each skiplist has ten levels and each entry in the table appears in each level with probability ¼^level.
This means that all entries are present in level 0, 1 in every 4 entries is present at level 1,
one in every 16 at level 2 and so on up to level 9.
This means that adding and removing entries from the timer list for a core can be done in log(n) time,
up to 4^10 entries, that is, approximately 1,000,000 timers per lcore.
A timer structure contains a special field called status,
-which is a union of a timer state (stopped, pending, running, config) and an owner (lcore id).
+which is a union of a timer state (stopped, pending, running, config), an installer (lcore id), and an owner (lcore id).
Depending on the timer state, we know if a timer is present in a list or not:
* STOPPED: no owner, not in a list
@@ -77,17 +78,17 @@ Resetting or stopping a timer while it is in a CONFIG or RUNNING state is not al
When modifying the state of a timer,
a Compare And Swap instruction should be used to guarantee that the status (state+owner) is modified atomically.
-Inside the rte_timer_manage() function,
-the skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries,
+Inside the rte_timer_manage() function, each of an lcore's skiplists is traversed in sequence.
+Each skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries,
until an entry which has not yet expired has been encountered.
-To improve performance in the case where there are entries in the timer list but none of those timers have yet expired,
+To improve performance in the case where there are entries in a skiplist but none of those timers have yet expired,
the expiry time of the first list entry is maintained within the per-core timer list structure itself.
On 64-bit platforms, this value can be checked without the need to take a lock on the overall structure.
(Since expiry times are maintained as 64-bit values,
a check on the value cannot be done on 32-bit platforms without using either a compare-and-swap (CAS) instruction or using a lock,
so this additional check is skipped in favor of checking as normal once the lock has been taken.)
On both 64-bit and 32-bit platforms,
-a call to rte_timer_manage() returns without taking a lock in the case where the timer list for the calling core is empty.
+rte_timer_manage() can continue on to an lcore's next skiplist without taking a lock in the case where a timer list is empty.
Use Cases
---------
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo
` (2 preceding siblings ...)
2017-08-23 14:47 ` [dpdk-dev] [PATCH 3/3] doc: update timer lib docs Gabriel Carrillo
@ 2017-08-23 15:02 ` Wiles, Keith
2017-08-23 16:19 ` Carrillo, Erik G
2017-08-25 20:26 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
4 siblings, 1 reply; 30+ messages in thread
From: Wiles, Keith @ 2017-08-23 15:02 UTC (permalink / raw)
To: Carrillo, Erik G; +Cc: rsanford, dev
> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo <erik.g.carrillo@intel.com> wrote:
>
> In the current implementation of the DPDK timer library, timers can be
> created and set to be handled by a target lcore by adding it to a
> skiplist that corresponds to that lcore. However, if an application
> enables multiple lcores, and each of these lcores repeatedly attempts
> to install timers on the same target lcore, overall application
> throughput will be reduced as all lcores contend to acquire the lock
> guarding the single skiplist of pending timers.
>
> This patchset addresses this scenario by adding an array of skiplists
> to each lcore's priv_timer struct, such that when lcore i installs a
> timer on lcore k, the timer will be added to the ith skiplist for
> lcore k. If lcore j installs a timer on lcore k simultaneously,
> lcores i and j can both proceed since they will be acquiring different
> locks for different lists.
>
> When lcore k processes its pending timers, it will traverse each skiplist
> in its array and acquire a skiplist's lock while a run list is broken
> out; meanwhile, all other lists can continue to be modified. Then, all
> run lists for lcore k are collected and traversed together so timers are
> executed in their global order.
What is the performance and/or latency added to the timeout now?
I worry about the case when just about all of the cores are enabled, which could be as high was 128 or more now.
One option is to have the lcore j that wants to install a timer on lcore k to pass a message via a ring to lcore k to add that timer. We could even add that logic into setting a timer on a different lcore then the caller in the current API. The ring would be a multi-producer and single consumer, we still have the lock. What am I missing here?
>
> Gabriel Carrillo (3):
> timer: add per-installer pending lists for each lcore
> timer: handle timers installed from non-EAL threads
> doc: update timer lib docs
>
> doc/guides/prog_guide/timer_lib.rst | 19 ++-
> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++-------------
> lib/librte_timer/rte_timer.h | 9 +-
> 3 files changed, 231 insertions(+), 126 deletions(-)
>
> --
> 2.6.4
>
Regards,
Keith
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
2017-08-23 15:02 ` [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Wiles, Keith
@ 2017-08-23 16:19 ` Carrillo, Erik G
2017-08-23 16:50 ` Wiles, Keith
0 siblings, 1 reply; 30+ messages in thread
From: Carrillo, Erik G @ 2017-08-23 16:19 UTC (permalink / raw)
To: Wiles, Keith; +Cc: rsanford, dev
> -----Original Message-----
> From: Wiles, Keith
> Sent: Wednesday, August 23, 2017 10:02 AM
> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
> Cc: rsanford@akamai.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
>
>
> > On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo <erik.g.carrillo@intel.com>
> wrote:
> >
> > In the current implementation of the DPDK timer library, timers can be
> > created and set to be handled by a target lcore by adding it to a
> > skiplist that corresponds to that lcore. However, if an application
> > enables multiple lcores, and each of these lcores repeatedly attempts
> > to install timers on the same target lcore, overall application
> > throughput will be reduced as all lcores contend to acquire the lock
> > guarding the single skiplist of pending timers.
> >
> > This patchset addresses this scenario by adding an array of skiplists
> > to each lcore's priv_timer struct, such that when lcore i installs a
> > timer on lcore k, the timer will be added to the ith skiplist for
> > lcore k. If lcore j installs a timer on lcore k simultaneously,
> > lcores i and j can both proceed since they will be acquiring different
> > locks for different lists.
> >
> > When lcore k processes its pending timers, it will traverse each
> > skiplist in its array and acquire a skiplist's lock while a run list
> > is broken out; meanwhile, all other lists can continue to be modified.
> > Then, all run lists for lcore k are collected and traversed together
> > so timers are executed in their global order.
>
> What is the performance and/or latency added to the timeout now?
>
> I worry about the case when just about all of the cores are enabled, which
> could be as high was 128 or more now.
There is a case in the timer_perf_autotest that runs rte_timer_manage with zero timers that can give a sense of the added latency. When run with one lcore, it completes in around 25 cycles. When run with 43 lcores (the highest I have access to at the moment), rte_timer_mange completes in around 155 cycles. So it looks like each added lcore adds around 3 cycles of overhead for checking empty lists in my testing.
>
> One option is to have the lcore j that wants to install a timer on lcore k to pass
> a message via a ring to lcore k to add that timer. We could even add that logic
> into setting a timer on a different lcore then the caller in the current API. The
> ring would be a multi-producer and single consumer, we still have the lock.
> What am I missing here?
>
I did try this approach: initially I had a multi-producer single-consumer ring that would hold requests to add or delete a timer from lcore k's skiplist, but it didn't really give an appreciable increase in my test application throughput. In profiling this solution, the hotspot had moved from acquiring the skiplist's spinlock to the rte_atomic32_cmpset that the multiple-producer ring code uses to manipulate the head pointer.
Then, I tried multiple single-producer single-consumer rings per target lcore. This removed the ring hotspot, but the performance didn't increase as much as with the proposed solution. These solutions also add overhead to rte_timer_manage, as it would have to process the rings and then process the skiplists.
One other thing to note is that a solution that uses such messages changes the use models for the timer. One interesting example is:
- lcore I enqueues a message to install a timer on lcore k
- lcore k runs rte_timer_manage, processes its messages and adds the timer to its list
- lcore I then enqueues a message to stop the same timer, now owned by lcore k
- lcore k does not run rte_timer_manage again
- lcore I wants to free the timer but it might not be safe
Even though lcore I has successfully enqueued the request to stop the timer (and delete it from lcore k's pending list), it hasn't actually been deleted from the list yet, so freeing it could corrupt the list. This case exists in the existing timer stress tests.
Another interesting scenario is:
- lcore I resets a timer to install it on lcore k
- lcore j resets the same timer to install it on lcore k
- then, lcore k runs timer_manage
Lcore j's message obviates lcore i's message, and it would be wasted work for lcore k to process it, so we should mark it to be skipped over. Handling all the edge cases was more complex than the solution proposed.
> >
> > Gabriel Carrillo (3):
> > timer: add per-installer pending lists for each lcore
> > timer: handle timers installed from non-EAL threads
> > doc: update timer lib docs
> >
> > doc/guides/prog_guide/timer_lib.rst | 19 ++-
> > lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++---------
> ----
> > lib/librte_timer/rte_timer.h | 9 +-
> > 3 files changed, 231 insertions(+), 126 deletions(-)
> >
> > --
> > 2.6.4
> >
>
> Regards,
> Keith
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
2017-08-23 16:19 ` Carrillo, Erik G
@ 2017-08-23 16:50 ` Wiles, Keith
2017-08-23 19:28 ` Carrillo, Erik G
0 siblings, 1 reply; 30+ messages in thread
From: Wiles, Keith @ 2017-08-23 16:50 UTC (permalink / raw)
To: Carrillo, Erik G; +Cc: rsanford, dev
> On Aug 23, 2017, at 11:19 AM, Carrillo, Erik G <erik.g.carrillo@intel.com> wrote:
>
>
>
>> -----Original Message-----
>> From: Wiles, Keith
>> Sent: Wednesday, August 23, 2017 10:02 AM
>> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
>> Cc: rsanford@akamai.com; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
>>
>>
>>> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo <erik.g.carrillo@intel.com>
>> wrote:
>>>
>>> In the current implementation of the DPDK timer library, timers can be
>>> created and set to be handled by a target lcore by adding it to a
>>> skiplist that corresponds to that lcore. However, if an application
>>> enables multiple lcores, and each of these lcores repeatedly attempts
>>> to install timers on the same target lcore, overall application
>>> throughput will be reduced as all lcores contend to acquire the lock
>>> guarding the single skiplist of pending timers.
>>>
>>> This patchset addresses this scenario by adding an array of skiplists
>>> to each lcore's priv_timer struct, such that when lcore i installs a
>>> timer on lcore k, the timer will be added to the ith skiplist for
>>> lcore k. If lcore j installs a timer on lcore k simultaneously,
>>> lcores i and j can both proceed since they will be acquiring different
>>> locks for different lists.
>>>
>>> When lcore k processes its pending timers, it will traverse each
>>> skiplist in its array and acquire a skiplist's lock while a run list
>>> is broken out; meanwhile, all other lists can continue to be modified.
>>> Then, all run lists for lcore k are collected and traversed together
>>> so timers are executed in their global order.
>>
>> What is the performance and/or latency added to the timeout now?
>>
>> I worry about the case when just about all of the cores are enabled, which
>> could be as high was 128 or more now.
>
> There is a case in the timer_perf_autotest that runs rte_timer_manage with zero timers that can give a sense of the added latency. When run with one lcore, it completes in around 25 cycles. When run with 43 lcores (the highest I have access to at the moment), rte_timer_mange completes in around 155 cycles. So it looks like each added lcore adds around 3 cycles of overhead for checking empty lists in my testing.
Does this mean we have only 25 cycles on the current design or is the 25 cycles for the new design?
If for the new design, then what is the old design cost compared to the new cost.
I also think we need the call to a timer function in the calculation, just to make sure we have at least one timer in the list and we account for any short cuts in the code for no timers active.
>
>>
>> One option is to have the lcore j that wants to install a timer on lcore k to pass
>> a message via a ring to lcore k to add that timer. We could even add that logic
>> into setting a timer on a different lcore then the caller in the current API. The
>> ring would be a multi-producer and single consumer, we still have the lock.
>> What am I missing here?
>>
>
> I did try this approach: initially I had a multi-producer single-consumer ring that would hold requests to add or delete a timer from lcore k's skiplist, but it didn't really give an appreciable increase in my test application throughput. In profiling this solution, the hotspot had moved from acquiring the skiplist's spinlock to the rte_atomic32_cmpset that the multiple-producer ring code uses to manipulate the head pointer.
>
> Then, I tried multiple single-producer single-consumer rings per target lcore. This removed the ring hotspot, but the performance didn't increase as much as with the proposed solution. These solutions also add overhead to rte_timer_manage, as it would have to process the rings and then process the skiplists.
>
> One other thing to note is that a solution that uses such messages changes the use models for the timer. One interesting example is:
> - lcore I enqueues a message to install a timer on lcore k
> - lcore k runs rte_timer_manage, processes its messages and adds the timer to its list
> - lcore I then enqueues a message to stop the same timer, now owned by lcore k
> - lcore k does not run rte_timer_manage again
> - lcore I wants to free the timer but it might not be safe
This case seems like a mistake to me as lcore k should continue to call rte_timer_manager() to process any new timers from other lcores not just the case where the list becomes empty and lcore k does not add timer to his list.
>
> Even though lcore I has successfully enqueued the request to stop the timer (and delete it from lcore k's pending list), it hasn't actually been deleted from the list yet, so freeing it could corrupt the list. This case exists in the existing timer stress tests.
>
> Another interesting scenario is:
> - lcore I resets a timer to install it on lcore k
> - lcore j resets the same timer to install it on lcore k
> - then, lcore k runs timer_manage
This one also seems like a mistake, more then one lcore setting the same timer seems like a problem and should not be done. A lcore should own a timer and no other lcore should be able to change that timer. If multiple lcores need a timer then they should not share the same timer structure.
>
> Lcore j's message obviates lcore i's message, and it would be wasted work for lcore k to process it, so we should mark it to be skipped over. Handling all the edge cases was more complex than the solution proposed.
Hmmm, to me it seems simple here as long as the lcores follow the same rules and sharing a timer structure is very risky and avoidable IMO.
Once you have lcores adding timers to another lcore then all accesses to that skip list must be serialized or you get unpredictable results. This should also fix most of the edge cases you are talking about.
Also it seems to me the case with an lcore adding timers to another lcore timer list is a specific use case and could be handled by a different set of APIs for that specific use case. Then we do not need to change the current design and all of the overhead is placed on the new APIs/design. IMO we are turning the current timer design into a global timer design as it really is a per lcore design today and I beleive that is a mistake.
>
>>>
>>> Gabriel Carrillo (3):
>>> timer: add per-installer pending lists for each lcore
>>> timer: handle timers installed from non-EAL threads
>>> doc: update timer lib docs
>>>
>>> doc/guides/prog_guide/timer_lib.rst | 19 ++-
>>> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++---------
>> ----
>>> lib/librte_timer/rte_timer.h | 9 +-
>>> 3 files changed, 231 insertions(+), 126 deletions(-)
>>>
>>> --
>>> 2.6.4
>>>
>>
>> Regards,
>> Keith
Regards,
Keith
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
2017-08-23 16:50 ` Wiles, Keith
@ 2017-08-23 19:28 ` Carrillo, Erik G
2017-08-23 21:04 ` Wiles, Keith
0 siblings, 1 reply; 30+ messages in thread
From: Carrillo, Erik G @ 2017-08-23 19:28 UTC (permalink / raw)
To: Wiles, Keith; +Cc: rsanford, dev
> -----Original Message-----
> From: Wiles, Keith
> Sent: Wednesday, August 23, 2017 11:50 AM
> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
> Cc: rsanford@akamai.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
>
>
> > On Aug 23, 2017, at 11:19 AM, Carrillo, Erik G <erik.g.carrillo@intel.com>
> wrote:
> >
> >
> >
> >> -----Original Message-----
> >> From: Wiles, Keith
> >> Sent: Wednesday, August 23, 2017 10:02 AM
> >> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
> >> Cc: rsanford@akamai.com; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements
> >> ***
> >>
> >>
> >>> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo
> >>> <erik.g.carrillo@intel.com>
> >> wrote:
> >>>
> >>> In the current implementation of the DPDK timer library, timers can
> >>> be created and set to be handled by a target lcore by adding it to a
> >>> skiplist that corresponds to that lcore. However, if an application
> >>> enables multiple lcores, and each of these lcores repeatedly
> >>> attempts to install timers on the same target lcore, overall
> >>> application throughput will be reduced as all lcores contend to
> >>> acquire the lock guarding the single skiplist of pending timers.
> >>>
> >>> This patchset addresses this scenario by adding an array of
> >>> skiplists to each lcore's priv_timer struct, such that when lcore i
> >>> installs a timer on lcore k, the timer will be added to the ith
> >>> skiplist for lcore k. If lcore j installs a timer on lcore k
> >>> simultaneously, lcores i and j can both proceed since they will be
> >>> acquiring different locks for different lists.
> >>>
> >>> When lcore k processes its pending timers, it will traverse each
> >>> skiplist in its array and acquire a skiplist's lock while a run list
> >>> is broken out; meanwhile, all other lists can continue to be modified.
> >>> Then, all run lists for lcore k are collected and traversed together
> >>> so timers are executed in their global order.
> >>
> >> What is the performance and/or latency added to the timeout now?
> >>
> >> I worry about the case when just about all of the cores are enabled,
> >> which could be as high was 128 or more now.
> >
> > There is a case in the timer_perf_autotest that runs rte_timer_manage
> with zero timers that can give a sense of the added latency. When run with
> one lcore, it completes in around 25 cycles. When run with 43 lcores (the
> highest I have access to at the moment), rte_timer_mange completes in
> around 155 cycles. So it looks like each added lcore adds around 3 cycles of
> overhead for checking empty lists in my testing.
>
> Does this mean we have only 25 cycles on the current design or is the 25
> cycles for the new design?
>
Both - when run with one lcore, the new design becomes equivalent to the original one. I tested the current design to confirm.
> If for the new design, then what is the old design cost compared to the new
> cost.
>
> I also think we need the call to a timer function in the calculation, just to
> make sure we have at least one timer in the list and we account for any short
> cuts in the code for no timers active.
>
Looking at the numbers for non-empty lists in timer_perf_autotest, the overhead appears to fall away. Here are some representative runs for timer_perf_autotest:
43 lcores enabled, installing 1M timers on an lcore and processing them with current design:
<...snipped...>
Appending 1000000 timers
Time for 1000000 timers: 424066294 (193ms), Time per timer: 424 (0us)
Time for 1000000 callbacks: 73124504 (33ms), Time per callback: 73 (0us)
Resetting 1000000 timers
Time for 1000000 timers: 1406756396 (641ms), Time per timer: 1406 (1us)
<...snipped...>
43 lcores enabled, installing 1M timers on an lcore and processing them with proposed design:
<...snipped...>
Appending 1000000 timers
Time for 1000000 timers: 382912762 (174ms), Time per timer: 382 (0us)
Time for 1000000 callbacks: 79194418 (36ms), Time per callback: 79 (0us)
Resetting 1000000 timers
Time for 1000000 timers: 1427189116 (650ms), Time per timer: 1427 (1us)
<...snipped...>
The above are not averages, so the numbers don't really indicate which is faster, but they show that the overhead of the proposed design should not be appreciable.
> >
> >>
> >> One option is to have the lcore j that wants to install a timer on
> >> lcore k to pass a message via a ring to lcore k to add that timer. We
> >> could even add that logic into setting a timer on a different lcore
> >> then the caller in the current API. The ring would be a multi-producer and
> single consumer, we still have the lock.
> >> What am I missing here?
> >>
> >
> > I did try this approach: initially I had a multi-producer single-consumer ring
> that would hold requests to add or delete a timer from lcore k's skiplist, but it
> didn't really give an appreciable increase in my test application throughput.
> In profiling this solution, the hotspot had moved from acquiring the skiplist's
> spinlock to the rte_atomic32_cmpset that the multiple-producer ring code
> uses to manipulate the head pointer.
> >
> > Then, I tried multiple single-producer single-consumer rings per target
> lcore. This removed the ring hotspot, but the performance didn't increase as
> much as with the proposed solution. These solutions also add overhead to
> rte_timer_manage, as it would have to process the rings and then process
> the skiplists.
> >
> > One other thing to note is that a solution that uses such messages changes
> the use models for the timer. One interesting example is:
> > - lcore I enqueues a message to install a timer on lcore k
> > - lcore k runs rte_timer_manage, processes its messages and adds the
> > timer to its list
> > - lcore I then enqueues a message to stop the same timer, now owned by
> > lcore k
> > - lcore k does not run rte_timer_manage again
> > - lcore I wants to free the timer but it might not be safe
>
> This case seems like a mistake to me as lcore k should continue to call
> rte_timer_manager() to process any new timers from other lcores not just
> the case where the list becomes empty and lcore k does not add timer to his
> list.
>
> >
> > Even though lcore I has successfully enqueued the request to stop the
> timer (and delete it from lcore k's pending list), it hasn't actually been
> deleted from the list yet, so freeing it could corrupt the list. This case exists
> in the existing timer stress tests.
> >
> > Another interesting scenario is:
> > - lcore I resets a timer to install it on lcore k
> > - lcore j resets the same timer to install it on lcore k
> > - then, lcore k runs timer_manage
>
> This one also seems like a mistake, more then one lcore setting the same
> timer seems like a problem and should not be done. A lcore should own a
> timer and no other lcore should be able to change that timer. If multiple
> lcores need a timer then they should not share the same timer structure.
>
Both of the above cases exist in the timer library stress tests, so a solution would presumably need to address them or it would be less flexible. The original design passed these tests, as does the proposed one.
> >
> > Lcore j's message obviates lcore i's message, and it would be wasted work
> for lcore k to process it, so we should mark it to be skipped over. Handling all
> the edge cases was more complex than the solution proposed.
>
> Hmmm, to me it seems simple here as long as the lcores follow the same
> rules and sharing a timer structure is very risky and avoidable IMO.
>
> Once you have lcores adding timers to another lcore then all accesses to that
> skip list must be serialized or you get unpredictable results. This should also
> fix most of the edge cases you are talking about.
>
> Also it seems to me the case with an lcore adding timers to another lcore
> timer list is a specific use case and could be handled by a different set of APIs
> for that specific use case. Then we do not need to change the current design
> and all of the overhead is placed on the new APIs/design. IMO we are
> turning the current timer design into a global timer design as it really is a per
> lcore design today and I beleive that is a mistake.
>
Well, the original API explicitly supports installing a timer to be executed on a different lcore, and there are no API changes in the patchset. Also, the proposed design keeps the per-lcore design intact; it only takes what used to be one large skiplist that held timers for all installing lcores, and separates it into N skiplists that correspond 1:1 to an installing lcore. When an lcore processes timers on its lists it will still only be managing timers it owns, and no others.
> >
> >>>
> >>> Gabriel Carrillo (3):
> >>> timer: add per-installer pending lists for each lcore
> >>> timer: handle timers installed from non-EAL threads
> >>> doc: update timer lib docs
> >>>
> >>> doc/guides/prog_guide/timer_lib.rst | 19 ++-
> >>> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++------
> ---
> >> ----
> >>> lib/librte_timer/rte_timer.h | 9 +-
> >>> 3 files changed, 231 insertions(+), 126 deletions(-)
> >>>
> >>> --
> >>> 2.6.4
> >>>
> >>
> >> Regards,
> >> Keith
>
> Regards,
> Keith
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
2017-08-23 19:28 ` Carrillo, Erik G
@ 2017-08-23 21:04 ` Wiles, Keith
2017-08-24 14:08 ` Carrillo, Erik G
0 siblings, 1 reply; 30+ messages in thread
From: Wiles, Keith @ 2017-08-23 21:04 UTC (permalink / raw)
To: Carrillo, Erik G; +Cc: rsanford, dev
> On Aug 23, 2017, at 2:28 PM, Carrillo, Erik G <erik.g.carrillo@intel.com> wrote:
>
>>
>> -----Original Message-----
>> From: Wiles, Keith
>> Sent: Wednesday, August 23, 2017 11:50 AM
>> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
>> Cc: rsanford@akamai.com; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
>>
>>
>>> On Aug 23, 2017, at 11:19 AM, Carrillo, Erik G <erik.g.carrillo@intel.com>
>> wrote:
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Wiles, Keith
>>>> Sent: Wednesday, August 23, 2017 10:02 AM
>>>> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
>>>> Cc: rsanford@akamai.com; dev@dpdk.org
>>>> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements
>>>> ***
>>>>
>>>>
>>>>> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo
>>>>> <erik.g.carrillo@intel.com>
>>>> wrote:
>>>>>
>>>>> In the current implementation of the DPDK timer library, timers can
>>>>> be created and set to be handled by a target lcore by adding it to a
>>>>> skiplist that corresponds to that lcore. However, if an application
>>>>> enables multiple lcores, and each of these lcores repeatedly
>>>>> attempts to install timers on the same target lcore, overall
>>>>> application throughput will be reduced as all lcores contend to
>>>>> acquire the lock guarding the single skiplist of pending timers.
>>>>>
>>>>> This patchset addresses this scenario by adding an array of
>>>>> skiplists to each lcore's priv_timer struct, such that when lcore i
>>>>> installs a timer on lcore k, the timer will be added to the ith
>>>>> skiplist for lcore k. If lcore j installs a timer on lcore k
>>>>> simultaneously, lcores i and j can both proceed since they will be
>>>>> acquiring different locks for different lists.
>>>>>
>>>>> When lcore k processes its pending timers, it will traverse each
>>>>> skiplist in its array and acquire a skiplist's lock while a run list
>>>>> is broken out; meanwhile, all other lists can continue to be modified.
>>>>> Then, all run lists for lcore k are collected and traversed together
>>>>> so timers are executed in their global order.
>>>>
>>>> What is the performance and/or latency added to the timeout now?
>>>>
>>>> I worry about the case when just about all of the cores are enabled,
>>>> which could be as high was 128 or more now.
>>>
>>> There is a case in the timer_perf_autotest that runs rte_timer_manage
>> with zero timers that can give a sense of the added latency. When run with
>> one lcore, it completes in around 25 cycles. When run with 43 lcores (the
>> highest I have access to at the moment), rte_timer_mange completes in
>> around 155 cycles. So it looks like each added lcore adds around 3 cycles of
>> overhead for checking empty lists in my testing.
>>
>> Does this mean we have only 25 cycles on the current design or is the 25
>> cycles for the new design?
>>
>
> Both - when run with one lcore, the new design becomes equivalent to the original one. I tested the current design to confirm.
Good thanks
>
>> If for the new design, then what is the old design cost compared to the new
>> cost.
>>
>> I also think we need the call to a timer function in the calculation, just to
>> make sure we have at least one timer in the list and we account for any short
>> cuts in the code for no timers active.
>>
>
> Looking at the numbers for non-empty lists in timer_perf_autotest, the overhead appears to fall away. Here are some representative runs for timer_perf_autotest:
>
> 43 lcores enabled, installing 1M timers on an lcore and processing them with current design:
>
> <...snipped...>
> Appending 1000000 timers
> Time for 1000000 timers: 424066294 (193ms), Time per timer: 424 (0us)
> Time for 1000000 callbacks: 73124504 (33ms), Time per callback: 73 (0us)
> Resetting 1000000 timers
> Time for 1000000 timers: 1406756396 (641ms), Time per timer: 1406 (1us)
> <...snipped...>
>
> 43 lcores enabled, installing 1M timers on an lcore and processing them with proposed design:
>
> <...snipped...>
> Appending 1000000 timers
> Time for 1000000 timers: 382912762 (174ms), Time per timer: 382 (0us)
> Time for 1000000 callbacks: 79194418 (36ms), Time per callback: 79 (0us)
> Resetting 1000000 timers
> Time for 1000000 timers: 1427189116 (650ms), Time per timer: 1427 (1us)
> <...snipped…>
it looks ok then. The main concern I had was the timers in Pktgen and someone telling the jitter increase or latency or performance. I guess I will just have to wait an see.
>
> The above are not averages, so the numbers don't really indicate which is faster, but they show that the overhead of the proposed design should not be appreciable.
>
>>>
>>>>
>>>> One option is to have the lcore j that wants to install a timer on
>>>> lcore k to pass a message via a ring to lcore k to add that timer. We
>>>> could even add that logic into setting a timer on a different lcore
>>>> then the caller in the current API. The ring would be a multi-producer and
>> single consumer, we still have the lock.
>>>> What am I missing here?
>>>>
>>>
>>> I did try this approach: initially I had a multi-producer single-consumer ring
>> that would hold requests to add or delete a timer from lcore k's skiplist, but it
>> didn't really give an appreciable increase in my test application throughput.
>> In profiling this solution, the hotspot had moved from acquiring the skiplist's
>> spinlock to the rte_atomic32_cmpset that the multiple-producer ring code
>> uses to manipulate the head pointer.
>>>
>>> Then, I tried multiple single-producer single-consumer rings per target
>> lcore. This removed the ring hotspot, but the performance didn't increase as
>> much as with the proposed solution. These solutions also add overhead to
>> rte_timer_manage, as it would have to process the rings and then process
>> the skiplists.
>>>
>>> One other thing to note is that a solution that uses such messages changes
>> the use models for the timer. One interesting example is:
>>> - lcore I enqueues a message to install a timer on lcore k
>>> - lcore k runs rte_timer_manage, processes its messages and adds the
>>> timer to its list
>>> - lcore I then enqueues a message to stop the same timer, now owned by
>>> lcore k
>>> - lcore k does not run rte_timer_manage again
>>> - lcore I wants to free the timer but it might not be safe
>>
>> This case seems like a mistake to me as lcore k should continue to call
>> rte_timer_manager() to process any new timers from other lcores not just
>> the case where the list becomes empty and lcore k does not add timer to his
>> list.
>>
>>>
>>> Even though lcore I has successfully enqueued the request to stop the
>> timer (and delete it from lcore k's pending list), it hasn't actually been
>> deleted from the list yet, so freeing it could corrupt the list. This case exists
>> in the existing timer stress tests.
>>>
>>> Another interesting scenario is:
>>> - lcore I resets a timer to install it on lcore k
>>> - lcore j resets the same timer to install it on lcore k
>>> - then, lcore k runs timer_manage
>>
>> This one also seems like a mistake, more then one lcore setting the same
>> timer seems like a problem and should not be done. A lcore should own a
>> timer and no other lcore should be able to change that timer. If multiple
>> lcores need a timer then they should not share the same timer structure.
>>
>
> Both of the above cases exist in the timer library stress tests, so a solution would presumably need to address them or it would be less flexible. The original design passed these tests, as does the proposed one.
I get this twitch when one lcore is adding timers to another lcore as I come from a realtime OS background, but I guess if no one else cares or finds a problem I will have to live with it. Having a test for something does not make it a good test or a reasonable reason to continue a design issue. We can make any test work, but is it right is the real question and we will just have to wait an see I guess.
>
>>>
>>> Lcore j's message obviates lcore i's message, and it would be wasted work
>> for lcore k to process it, so we should mark it to be skipped over. Handling all
>> the edge cases was more complex than the solution proposed.
>>
>> Hmmm, to me it seems simple here as long as the lcores follow the same
>> rules and sharing a timer structure is very risky and avoidable IMO.
>>
>> Once you have lcores adding timers to another lcore then all accesses to that
>> skip list must be serialized or you get unpredictable results. This should also
>> fix most of the edge cases you are talking about.
>>
>> Also it seems to me the case with an lcore adding timers to another lcore
>> timer list is a specific use case and could be handled by a different set of APIs
>> for that specific use case. Then we do not need to change the current design
>> and all of the overhead is placed on the new APIs/design. IMO we are
>> turning the current timer design into a global timer design as it really is a per
>> lcore design today and I beleive that is a mistake.
>>
>
> Well, the original API explicitly supports installing a timer to be executed on a different lcore, and there are no API changes in the patchset. Also, the proposed design keeps the per-lcore design intact; it only takes what used to be one large skiplist that held timers for all installing lcores, and separates it into N skiplists that correspond 1:1 to an installing lcore. When an lcore processes timers on its lists it will still only be managing timers it owns, and no others.
Having an API to explicitly support some feature is not a reason to keep something, but I think you have reduce my twitching some :-) so I will let it go.
Thanks for the information.
>
>
>>>
>>>>>
>>>>> Gabriel Carrillo (3):
>>>>> timer: add per-installer pending lists for each lcore
>>>>> timer: handle timers installed from non-EAL threads
>>>>> doc: update timer lib docs
>>>>>
>>>>> doc/guides/prog_guide/timer_lib.rst | 19 ++-
>>>>> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++------
>> ---
>>>> ----
>>>>> lib/librte_timer/rte_timer.h | 9 +-
>>>>> 3 files changed, 231 insertions(+), 126 deletions(-)
>>>>>
>>>>> --
>>>>> 2.6.4
>>>>>
>>>>
>>>> Regards,
>>>> Keith
>>
>> Regards,
>> Keith
Regards,
Keith
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
2017-08-23 21:04 ` Wiles, Keith
@ 2017-08-24 14:08 ` Carrillo, Erik G
0 siblings, 0 replies; 30+ messages in thread
From: Carrillo, Erik G @ 2017-08-24 14:08 UTC (permalink / raw)
To: Wiles, Keith; +Cc: rsanford, dev
> -----Original Message-----
> From: Wiles, Keith
> Sent: Wednesday, August 23, 2017 4:05 PM
> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
> Cc: rsanford@akamai.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***
>
>
> > On Aug 23, 2017, at 2:28 PM, Carrillo, Erik G <erik.g.carrillo@intel.com>
> wrote:
> >
> >>
> >> -----Original Message-----
> >> From: Wiles, Keith
> >> Sent: Wednesday, August 23, 2017 11:50 AM
> >> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
> >> Cc: rsanford@akamai.com; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements
> >> ***
> >>
> >>
> >>> On Aug 23, 2017, at 11:19 AM, Carrillo, Erik G
> >>> <erik.g.carrillo@intel.com>
> >> wrote:
> >>>
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Wiles, Keith
> >>>> Sent: Wednesday, August 23, 2017 10:02 AM
> >>>> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
> >>>> Cc: rsanford@akamai.com; dev@dpdk.org
> >>>> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements
> >>>> ***
> >>>>
> >>>>
> >>>>> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo
> >>>>> <erik.g.carrillo@intel.com>
> >>>> wrote:
> >>>>>
> >>>>> In the current implementation of the DPDK timer library, timers
> >>>>> can be created and set to be handled by a target lcore by adding
> >>>>> it to a skiplist that corresponds to that lcore. However, if an
> >>>>> application enables multiple lcores, and each of these lcores
> >>>>> repeatedly attempts to install timers on the same target lcore,
> >>>>> overall application throughput will be reduced as all lcores
> >>>>> contend to acquire the lock guarding the single skiplist of pending
> timers.
> >>>>>
> >>>>> This patchset addresses this scenario by adding an array of
> >>>>> skiplists to each lcore's priv_timer struct, such that when lcore
> >>>>> i installs a timer on lcore k, the timer will be added to the ith
> >>>>> skiplist for lcore k. If lcore j installs a timer on lcore k
> >>>>> simultaneously, lcores i and j can both proceed since they will be
> >>>>> acquiring different locks for different lists.
> >>>>>
> >>>>> When lcore k processes its pending timers, it will traverse each
> >>>>> skiplist in its array and acquire a skiplist's lock while a run
> >>>>> list is broken out; meanwhile, all other lists can continue to be
> modified.
> >>>>> Then, all run lists for lcore k are collected and traversed
> >>>>> together so timers are executed in their global order.
> >>>>
> >>>> What is the performance and/or latency added to the timeout now?
> >>>>
> >>>> I worry about the case when just about all of the cores are
> >>>> enabled, which could be as high was 128 or more now.
> >>>
> >>> There is a case in the timer_perf_autotest that runs
> >>> rte_timer_manage
> >> with zero timers that can give a sense of the added latency. When run
> with
> >> one lcore, it completes in around 25 cycles. When run with 43 lcores
> >> (the highest I have access to at the moment), rte_timer_mange
> >> completes in around 155 cycles. So it looks like each added lcore
> >> adds around 3 cycles of overhead for checking empty lists in my testing.
> >>
> >> Does this mean we have only 25 cycles on the current design or is the
> >> 25 cycles for the new design?
> >>
> >
> > Both - when run with one lcore, the new design becomes equivalent to the
> original one. I tested the current design to confirm.
>
> Good thanks
>
> >
> >> If for the new design, then what is the old design cost compared to
> >> the new cost.
> >>
> >> I also think we need the call to a timer function in the calculation,
> >> just to make sure we have at least one timer in the list and we
> >> account for any short cuts in the code for no timers active.
> >>
> >
> > Looking at the numbers for non-empty lists in timer_perf_autotest, the
> overhead appears to fall away. Here are some representative runs for
> timer_perf_autotest:
> >
> > 43 lcores enabled, installing 1M timers on an lcore and processing them
> with current design:
> >
> > <...snipped...>
> > Appending 1000000 timers
> > Time for 1000000 timers: 424066294 (193ms), Time per timer: 424 (0us)
> > Time for 1000000 callbacks: 73124504 (33ms), Time per callback: 73
> > (0us) Resetting 1000000 timers Time for 1000000 timers: 1406756396
> > (641ms), Time per timer: 1406 (1us) <...snipped...>
> >
> > 43 lcores enabled, installing 1M timers on an lcore and processing them
> with proposed design:
> >
> > <...snipped...>
> > Appending 1000000 timers
> > Time for 1000000 timers: 382912762 (174ms), Time per timer: 382 (0us)
> > Time for 1000000 callbacks: 79194418 (36ms), Time per callback: 79
> > (0us) Resetting 1000000 timers Time for 1000000 timers: 1427189116
> > (650ms), Time per timer: 1427 (1us) <...snipped…>
>
> it looks ok then. The main concern I had was the timers in Pktgen and
> someone telling the jitter increase or latency or performance. I guess I will
> just have to wait an see.
>
> >
> > The above are not averages, so the numbers don't really indicate which is
> faster, but they show that the overhead of the proposed design should not
> be appreciable.
> >
> >>>
> >>>>
> >>>> One option is to have the lcore j that wants to install a timer on
> >>>> lcore k to pass a message via a ring to lcore k to add that timer.
> >>>> We could even add that logic into setting a timer on a different
> >>>> lcore then the caller in the current API. The ring would be a
> >>>> multi-producer and
> >> single consumer, we still have the lock.
> >>>> What am I missing here?
> >>>>
> >>>
> >>> I did try this approach: initially I had a multi-producer
> >>> single-consumer ring
> >> that would hold requests to add or delete a timer from lcore k's
> >> skiplist, but it didn't really give an appreciable increase in my test
> application throughput.
> >> In profiling this solution, the hotspot had moved from acquiring the
> >> skiplist's spinlock to the rte_atomic32_cmpset that the
> >> multiple-producer ring code uses to manipulate the head pointer.
> >>>
> >>> Then, I tried multiple single-producer single-consumer rings per
> >>> target
> >> lcore. This removed the ring hotspot, but the performance didn't
> >> increase as much as with the proposed solution. These solutions also
> >> add overhead to rte_timer_manage, as it would have to process the
> >> rings and then process the skiplists.
> >>>
> >>> One other thing to note is that a solution that uses such messages
> >>> changes
> >> the use models for the timer. One interesting example is:
> >>> - lcore I enqueues a message to install a timer on lcore k
> >>> - lcore k runs rte_timer_manage, processes its messages and adds the
> >>> timer to its list
> >>> - lcore I then enqueues a message to stop the same timer, now owned
> >>> by lcore k
> >>> - lcore k does not run rte_timer_manage again
> >>> - lcore I wants to free the timer but it might not be safe
> >>
> >> This case seems like a mistake to me as lcore k should continue to
> >> call
> >> rte_timer_manager() to process any new timers from other lcores not
> >> just the case where the list becomes empty and lcore k does not add
> >> timer to his list.
> >>
> >>>
> >>> Even though lcore I has successfully enqueued the request to stop
> >>> the
> >> timer (and delete it from lcore k's pending list), it hasn't actually
> >> been deleted from the list yet, so freeing it could corrupt the
> >> list. This case exists in the existing timer stress tests.
> >>>
> >>> Another interesting scenario is:
> >>> - lcore I resets a timer to install it on lcore k
> >>> - lcore j resets the same timer to install it on lcore k
> >>> - then, lcore k runs timer_manage
> >>
> >> This one also seems like a mistake, more then one lcore setting the
> >> same timer seems like a problem and should not be done. A lcore
> >> should own a timer and no other lcore should be able to change that
> >> timer. If multiple lcores need a timer then they should not share the same
> timer structure.
> >>
> >
> > Both of the above cases exist in the timer library stress tests, so a solution
> would presumably need to address them or it would be less flexible. The
> original design passed these tests, as does the proposed one.
>
> I get this twitch when one lcore is adding timers to another lcore as I come
> from a realtime OS background, but I guess if no one else cares or finds a
> problem I will have to live with it. Having a test for something does not make
> it a good test or a reasonable reason to continue a design issue. We can make
> any test work, but is it right is the real question and we will just have to wait
> an see I guess.
>
> >
> >>>
> >>> Lcore j's message obviates lcore i's message, and it would be wasted
> >>> work
> >> for lcore k to process it, so we should mark it to be skipped over.
> Handling all
> >> the edge cases was more complex than the solution proposed.
> >>
> >> Hmmm, to me it seems simple here as long as the lcores follow the
> >> same rules and sharing a timer structure is very risky and avoidable IMO.
> >>
> >> Once you have lcores adding timers to another lcore then all accesses
> >> to that skip list must be serialized or you get unpredictable
> >> results. This should also fix most of the edge cases you are talking about.
> >>
> >> Also it seems to me the case with an lcore adding timers to another
> >> lcore timer list is a specific use case and could be handled by a
> >> different set of APIs for that specific use case. Then we do not need
> >> to change the current design and all of the overhead is placed on the
> >> new APIs/design. IMO we are turning the current timer design into a
> >> global timer design as it really is a per lcore design today and I beleive that
> is a mistake.
> >>
> >
> > Well, the original API explicitly supports installing a timer to be executed on
> a different lcore, and there are no API changes in the patchset. Also, the
> proposed design keeps the per-lcore design intact; it only takes what used
> to be one large skiplist that held timers for all installing lcores, and separates
> it into N skiplists that correspond 1:1 to an installing lcore. When an lcore
> processes timers on its lists it will still only be managing timers it owns, and no
> others.
>
>
> Having an API to explicitly support some feature is not a reason to keep
> something, but I think you have reduce my twitching some :-) so I will let it
> go.
>
> Thanks for the information.
You're welcome, and thank you for the feedback.
Regards,
Gabriel
>
> >
> >
> >>>
> >>>>>
> >>>>> Gabriel Carrillo (3):
> >>>>> timer: add per-installer pending lists for each lcore
> >>>>> timer: handle timers installed from non-EAL threads
> >>>>> doc: update timer lib docs
> >>>>>
> >>>>> doc/guides/prog_guide/timer_lib.rst | 19 ++-
> >>>>> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++---
> ---
> >> ---
> >>>> ----
> >>>>> lib/librte_timer/rte_timer.h | 9 +-
> >>>>> 3 files changed, 231 insertions(+), 126 deletions(-)
> >>>>>
> >>>>> --
> >>>>> 2.6.4
> >>>>>
> >>>>
> >>>> Regards,
> >>>> Keith
> >>
> >> Regards,
> >> Keith
>
> Regards,
> Keith
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v2 0/3] *** timer library enhancements ***
2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo
` (3 preceding siblings ...)
2017-08-23 15:02 ` [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Wiles, Keith
@ 2017-08-25 20:26 ` Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo
4 siblings, 1 reply; 30+ messages in thread
From: Gabriel Carrillo @ 2017-08-25 20:26 UTC (permalink / raw)
To: rsanford; +Cc: dev
In the current implementation of the DPDK timer library, timers can be
created and set to be handled by a target lcore by adding it to a
skiplist that corresponds to that lcore. However, if an application
enables multiple lcores, and each of these lcores repeatedly attempts
to install timers on the same target lcore, overall application
throughput will be reduced as all lcores contend to acquire the lock
guarding the single skiplist of pending timers.
This patchset addresses this scenario by adding an array of skiplists
to each lcore's priv_timer struct, such that when lcore i installs a
timer on lcore k, the timer will be added to the ith skiplist for
lcore k. If lcore j installs a timer on lcore k simultaneously,
lcores i and j can both proceed since they will be acquiring different
locks for different lists.
When lcore k processes its pending timers, it will traverse each skiplist
in its array and acquire a skiplist's lock while a run list is broken
out; meanwhile, all other lists can continue to be modified. Then, all
run lists for lcore k are collected and traversed together so timers are
executed in their relative order.
Gabriel Carrillo (3):
timer: add per-installer pending lists for each lcore
timer: handle timers installed from non-EAL threads
doc: update timer lib docs
doc/guides/prog_guide/timer_lib.rst | 19 ++-
lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++-------------
lib/librte_timer/rte_timer.h | 9 +-
3 files changed, 231 insertions(+), 126 deletions(-)
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v2 1/3] timer: add per-installer pending lists for each lcore
2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo
@ 2017-08-25 20:27 ` Gabriel Carrillo
2017-08-29 10:57 ` Ananyev, Konstantin
2017-08-29 15:11 ` [dpdk-dev] [PATCH " Stephen Hemminger
1 sibling, 1 reply; 30+ messages in thread
From: Gabriel Carrillo @ 2017-08-25 20:27 UTC (permalink / raw)
To: rsanford; +Cc: dev
Instead of each priv_timer struct containing a single skiplist, this
commit adds a skiplist for each enabled lcore to priv_timer. In the case
that multiple lcores repeatedly install timers on the same target lcore,
this change reduces lock contention for the target lcore's skiplists and
increases performance.
Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com>
---
v2:
* Address checkpatch warnings
lib/librte_timer/rte_timer.c | 309 +++++++++++++++++++++++++++----------------
lib/librte_timer/rte_timer.h | 9 +-
2 files changed, 202 insertions(+), 116 deletions(-)
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 5ee0840..da2fc1a 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -37,6 +37,7 @@
#include <inttypes.h>
#include <assert.h>
#include <sys/queue.h>
+#include <stdbool.h>
#include <rte_atomic.h>
#include <rte_common.h>
@@ -56,17 +57,19 @@
LIST_HEAD(rte_timer_list, rte_timer);
+struct skiplist {
+ struct rte_timer head; /**< dummy timer instance to head up list */
+ rte_spinlock_t lock; /**< lock to protect list access */
+ unsigned int depth; /**< track the current depth of the skiplist */
+} __rte_cache_aligned;
+
struct priv_timer {
- struct rte_timer pending_head; /**< dummy timer instance to head up list */
- rte_spinlock_t list_lock; /**< lock to protect list access */
+ /** one pending list per enabled lcore */
+ struct skiplist pending_lists[RTE_MAX_LCORE];
/** per-core variable that true if a timer was updated on this
* core since last reset of the variable */
int updated;
-
- /** track the current depth of the skiplist */
- unsigned curr_skiplist_depth;
-
unsigned prev_lcore; /**< used for lcore round robin */
/** running timer on this lcore now */
@@ -81,6 +84,10 @@ struct priv_timer {
/** per-lcore private info for timers */
static struct priv_timer priv_timer[RTE_MAX_LCORE];
+/** cache of IDs of enabled lcores */
+static unsigned int enabled_lcores[RTE_MAX_LCORE];
+static int n_enabled_lcores;
+
/* when debug is enabled, store some statistics */
#ifdef RTE_LIBRTE_TIMER_DEBUG
#define __TIMER_STAT_ADD(name, n) do { \
@@ -96,14 +103,25 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE];
void
rte_timer_subsystem_init(void)
{
- unsigned lcore_id;
+ unsigned int lcore_id1, lcore_id2;
+ struct skiplist *list;
+ int i, j;
+
+ RTE_LCORE_FOREACH(lcore_id1)
+ enabled_lcores[n_enabled_lcores++] = lcore_id1;
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
- for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
- rte_spinlock_init(&priv_timer[lcore_id].list_lock);
- priv_timer[lcore_id].prev_lcore = lcore_id;
+ for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores;
+ lcore_id1 = enabled_lcores[++i]) {
+ priv_timer[lcore_id1].prev_lcore = lcore_id1;
+
+ for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores;
+ lcore_id2 = enabled_lcores[++j]) {
+ list = &priv_timer[lcore_id1].pending_lists[lcore_id2];
+ rte_spinlock_init(&list->lock);
+ }
}
}
@@ -114,7 +132,8 @@ rte_timer_init(struct rte_timer *tim)
union rte_timer_status status;
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.installer = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
tim->status.u32 = status.u32;
}
@@ -142,7 +161,7 @@ timer_set_config_state(struct rte_timer *tim,
* or ready to run on local core, exit
*/
if (prev_status.state == RTE_TIMER_RUNNING &&
- (prev_status.owner != (uint16_t)lcore_id ||
+ (prev_status.owner != (int)lcore_id ||
tim != priv_timer[lcore_id].running_tim))
return -1;
@@ -153,7 +172,8 @@ timer_set_config_state(struct rte_timer *tim,
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_CONFIG;
- status.owner = (int16_t)lcore_id;
+ status.installer = RTE_TIMER_NO_LCORE;
+ status.owner = lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
@@ -185,7 +205,8 @@ timer_set_running_state(struct rte_timer *tim)
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_RUNNING;
- status.owner = (int16_t)lcore_id;
+ status.installer = prev_status.installer;
+ status.owner = lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
@@ -236,11 +257,11 @@ timer_get_skiplist_level(unsigned curr_depth)
* are <= that time value.
*/
static void
-timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
+timer_get_prev_entries(uint64_t time_val, struct skiplist *list,
struct rte_timer **prev)
{
- unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
- prev[lvl] = &priv_timer[tim_lcore].pending_head;
+ unsigned int lvl = list->depth;
+ prev[lvl] = &list->head;
while(lvl != 0) {
lvl--;
prev[lvl] = prev[lvl+1];
@@ -255,15 +276,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
* all skiplist levels.
*/
static void
-timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
+timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list,
struct rte_timer **prev)
{
int i;
/* to get a specific entry in the list, look for just lower than the time
* values, and then increment on each level individually if necessary
*/
- timer_get_prev_entries(tim->expire - 1, tim_lcore, prev);
- for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
+ timer_get_prev_entries(tim->expire - 1, list, prev);
+ for (i = list->depth - 1; i >= 0; i--) {
while (prev[i]->sl_next[i] != NULL &&
prev[i]->sl_next[i] != tim &&
prev[i]->sl_next[i]->expire <= tim->expire)
@@ -282,25 +303,25 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
unsigned lcore_id = rte_lcore_id();
unsigned lvl;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+ struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id];
/* if timer needs to be scheduled on another core, we need to
* lock the list; if it is on local core, we need to lock if
* we are not called from rte_timer_manage() */
if (tim_lcore != lcore_id || !local_is_locked)
- rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
+ rte_spinlock_lock(&list->lock);
/* find where exactly this element goes in the list of elements
* for each depth. */
- timer_get_prev_entries(tim->expire, tim_lcore, prev);
+ timer_get_prev_entries(tim->expire, list, prev);
/* now assign it a new level and add at that level */
- const unsigned tim_level = timer_get_skiplist_level(
- priv_timer[tim_lcore].curr_skiplist_depth);
- if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
- priv_timer[tim_lcore].curr_skiplist_depth++;
+ const unsigned int tim_level = timer_get_skiplist_level(list->depth);
+ if (tim_level == list->depth)
+ list->depth++;
lvl = tim_level;
- while (lvl > 0) {
+ while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) {
tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
prev[lvl]->sl_next[lvl] = tim;
lvl--;
@@ -310,11 +331,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
/* save the lowest list entry into the expire field of the dummy hdr
* NOTE: this is not atomic on 32-bit*/
- priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\
- pending_head.sl_next[0]->expire;
+ list->head.expire = list->head.sl_next[0]->expire;
if (tim_lcore != lcore_id || !local_is_locked)
- rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
+ rte_spinlock_unlock(&list->lock);
}
/*
@@ -330,35 +350,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
unsigned prev_owner = prev_status.owner;
int i;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+ struct skiplist *list;
+
+ list = &priv_timer[prev_owner].pending_lists[prev_status.installer];
/* if timer needs is pending another core, we need to lock the
* list; if it is on local core, we need to lock if we are not
* called from rte_timer_manage() */
if (prev_owner != lcore_id || !local_is_locked)
- rte_spinlock_lock(&priv_timer[prev_owner].list_lock);
+ rte_spinlock_lock(&list->lock);
/* save the lowest list entry into the expire field of the dummy hdr.
* NOTE: this is not atomic on 32-bit */
- if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
- priv_timer[prev_owner].pending_head.expire =
- ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);
+ if (tim == list->head.sl_next[0])
+ list->head.expire = ((tim->sl_next[0] == NULL) ?
+ 0 : tim->sl_next[0]->expire);
/* adjust pointers from previous entries to point past this */
- timer_get_prev_entries_for_node(tim, prev_owner, prev);
- for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
+ timer_get_prev_entries_for_node(tim, list, prev);
+ for (i = list->depth - 1; i >= 0; i--) {
if (prev[i]->sl_next[i] == tim)
prev[i]->sl_next[i] = tim->sl_next[i];
}
/* in case we deleted last entry at a level, adjust down max level */
- for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
- if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
- priv_timer[prev_owner].curr_skiplist_depth --;
+ for (i = list->depth - 1; i >= 0; i--)
+ if (list->head.sl_next[i] == NULL)
+ list->depth--;
else
break;
if (prev_owner != lcore_id || !local_is_locked)
- rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
+ rte_spinlock_unlock(&list->lock);
}
/* Reset and start the timer associated with the timer handle (private func) */
@@ -416,7 +439,8 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
* the state so we don't need to use cmpset() here */
rte_wmb();
status.state = RTE_TIMER_PENDING;
- status.owner = (int16_t)tim_lcore;
+ status.installer = lcore_id;
+ status.owner = tim_lcore;
tim->status.u32 = status.u32;
return 0;
@@ -484,7 +508,8 @@ rte_timer_stop(struct rte_timer *tim)
/* mark timer as stopped */
rte_wmb();
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.installer = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
tim->status.u32 = status.u32;
return 0;
@@ -506,119 +531,179 @@ rte_timer_pending(struct rte_timer *tim)
}
/* must be called periodically, run all timer that expired */
-void rte_timer_manage(void)
+void
+rte_timer_manage(void)
{
union rte_timer_status status;
- struct rte_timer *tim, *next_tim;
- struct rte_timer *run_first_tim, **pprev;
- unsigned lcore_id = rte_lcore_id();
+ struct rte_timer *tim, *next_tim, **pprev;
+ struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1];
struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
- uint64_t cur_time;
- int i, ret;
+ struct priv_timer *priv_tim;
+ unsigned int installer_lcore, lcore_id = rte_lcore_id();
+ uint64_t cur_time, min_expire;
+ int i, j, min_idx, ret;
+ int nrunlists = 0;
+ int local_is_locked = 0;
+ struct skiplist *dest_list, *list = NULL;
+ bool done;
/* timer manager only runs on EAL thread with valid lcore_id */
assert(lcore_id < RTE_MAX_LCORE);
+ priv_tim = &priv_timer[lcore_id];
+
__TIMER_STAT_ADD(manage, 1);
- /* optimize for the case where per-cpu list is empty */
- if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
- return;
- cur_time = rte_get_timer_cycles();
+ for (i = 0, installer_lcore = enabled_lcores[i]; i < n_enabled_lcores;
+ installer_lcore = enabled_lcores[++i]) {
+ list = &priv_tim->pending_lists[installer_lcore];
+
+ /* optimize for the case where list is empty */
+ if (list->head.sl_next[0] == NULL)
+ continue;
+ cur_time = rte_get_timer_cycles();
#ifdef RTE_ARCH_X86_64
- /* on 64-bit the value cached in the pending_head.expired will be
- * updated atomically, so we can consult that for a quick check here
- * outside the lock */
- if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
- return;
+ /* on 64-bit the value cached in the list->head.expired will be
+ * updated atomically, so we can consult that for a quick check
+ * here outside the lock
+ */
+ if (likely(list->head.expire > cur_time))
+ continue;
#endif
- /* browse ordered list, add expired timers in 'expired' list */
- rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
+ /* browse ordered list, add expired timers in 'expired' list */
+ rte_spinlock_lock(&list->lock);
- /* if nothing to do just unlock and return */
- if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
- priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
- return;
- }
+ /* if nothing to do just unlock and continue */
+ if (list->head.sl_next[0] == NULL ||
+ list->head.sl_next[0]->expire > cur_time) {
+ rte_spinlock_unlock(&list->lock);
+ continue;
+ }
- /* save start of list of expired timers */
- tim = priv_timer[lcore_id].pending_head.sl_next[0];
+ /* save start of list of expired timers */
+ tim = list->head.sl_next[0];
+
+ /* break the existing list at current time point */
+ timer_get_prev_entries(cur_time, list, prev);
+ for (j = list->depth - 1; j >= 0; j--) {
+ if (prev[j] == &list->head)
+ continue;
+ list->head.sl_next[j] =
+ prev[j]->sl_next[j];
+ if (prev[j]->sl_next[j] == NULL)
+ list->depth--;
+ prev[j]->sl_next[j] = NULL;
+ }
- /* break the existing list at current time point */
- timer_get_prev_entries(cur_time, lcore_id, prev);
- for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) {
- if (prev[i] == &priv_timer[lcore_id].pending_head)
- continue;
- priv_timer[lcore_id].pending_head.sl_next[i] =
- prev[i]->sl_next[i];
- if (prev[i]->sl_next[i] == NULL)
- priv_timer[lcore_id].curr_skiplist_depth--;
- prev[i] ->sl_next[i] = NULL;
- }
+ /* transition run-list from PENDING to RUNNING */
+ run_first_tims[nrunlists] = tim;
+ pprev = &run_first_tims[nrunlists];
+ nrunlists++;
+
+ for (; tim != NULL; tim = next_tim) {
+ next_tim = tim->sl_next[0];
+
+ ret = timer_set_running_state(tim);
+ if (likely(ret == 0)) {
+ pprev = &tim->sl_next[0];
+ } else {
+ /* another core is trying to re-config this one,
+ * remove it from local expired list
+ */
+ *pprev = next_tim;
+ }
+ }
- /* transition run-list from PENDING to RUNNING */
- run_first_tim = tim;
- pprev = &run_first_tim;
+ /* update the next to expire timer value */
+ list->head.expire = (list->head.sl_next[0] == NULL) ?
+ 0 : list->head.sl_next[0]->expire;
- for ( ; tim != NULL; tim = next_tim) {
- next_tim = tim->sl_next[0];
+ rte_spinlock_unlock(&list->lock);
+ }
- ret = timer_set_running_state(tim);
- if (likely(ret == 0)) {
- pprev = &tim->sl_next[0];
- } else {
- /* another core is trying to re-config this one,
- * remove it from local expired list
- */
- *pprev = next_tim;
+ /* Now process the run lists */
+ while (1) {
+ done = true;
+ min_expire = UINT64_MAX;
+ min_idx = 0;
+
+ /* Find the next oldest timer to process */
+ for (i = 0; i < nrunlists; i++) {
+ tim = run_first_tims[i];
+
+ if (tim != NULL && tim->expire < min_expire) {
+ min_expire = tim->expire;
+ min_idx = i;
+ done = false;
+ }
}
- }
- /* update the next to expire timer value */
- priv_timer[lcore_id].pending_head.expire =
- (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
- priv_timer[lcore_id].pending_head.sl_next[0]->expire;
+ if (done)
+ break;
+
+ tim = run_first_tims[min_idx];
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+ /* Move down the runlist from which we picked a timer to
+ * execute
+ */
+ run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0];
- /* now scan expired list and call callbacks */
- for (tim = run_first_tim; tim != NULL; tim = next_tim) {
- next_tim = tim->sl_next[0];
- priv_timer[lcore_id].updated = 0;
- priv_timer[lcore_id].running_tim = tim;
+ priv_tim->updated = 0;
+ priv_tim->running_tim = tim;
/* execute callback function with list unlocked */
tim->f(tim, tim->arg);
__TIMER_STAT_ADD(pending, -1);
/* the timer was stopped or reloaded by the callback
- * function, we have nothing to do here */
- if (priv_timer[lcore_id].updated == 1)
+ * function, we have nothing to do here
+ */
+ if (priv_tim->updated == 1)
continue;
if (tim->period == 0) {
/* remove from done list and mark timer as stopped */
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.installer = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
rte_wmb();
tim->status.u32 = status.u32;
}
else {
- /* keep it in list and mark timer as pending */
- rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
+ dest_list = &priv_tim->pending_lists[lcore_id];
+
+ /* If the destination list is the current list
+ * we can acquire the lock here, and hold it
+ * across removal and insertion of the timer.
+ */
+ if (list == dest_list) {
+ rte_spinlock_lock(&list->lock);
+ local_is_locked = 1;
+ }
+
+ /* set timer state back to PENDING and
+ * reinsert it in pending list
+ */
status.state = RTE_TIMER_PENDING;
__TIMER_STAT_ADD(pending, 1);
- status.owner = (int16_t)lcore_id;
+ status.installer = tim->status.installer;
+ status.owner = lcore_id;
rte_wmb();
tim->status.u32 = status.u32;
- __rte_timer_reset(tim, tim->expire + tim->period,
- tim->period, lcore_id, tim->f, tim->arg, 1);
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+
+ __rte_timer_reset(tim,
+ tim->expire + tim->period,
+ tim->period, lcore_id,
+ tim->f, tim->arg, local_is_locked);
+
+ if (local_is_locked) {
+ rte_spinlock_unlock(&list->lock);
+ local_is_locked = 0;
+ }
}
}
- priv_timer[lcore_id].running_tim = NULL;
+ priv_tim->running_tim = NULL;
}
/* dump statistics about timers */
diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h
index a276a73..4cc6baf 100644
--- a/lib/librte_timer/rte_timer.h
+++ b/lib/librte_timer/rte_timer.h
@@ -77,7 +77,7 @@ extern "C" {
#define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */
#define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */
-#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */
+#define RTE_TIMER_NO_LCORE -2
/**
* Timer type: Periodic or single (one-shot).
@@ -94,10 +94,11 @@ enum rte_timer_type {
union rte_timer_status {
RTE_STD_C11
struct {
- uint16_t state; /**< Stop, pending, running, config. */
- int16_t owner; /**< The lcore that owns the timer. */
+ unsigned state : 2;
+ int installer : 15;
+ int owner : 15;
};
- uint32_t u32; /**< To atomic-set status + owner. */
+ uint32_t u32; /**< To atomic-set status, installer, owner */
};
#ifdef RTE_LIBRTE_TIMER_DEBUG
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v2 2/3] timer: handle timers installed from non-EAL threads
2017-08-23 14:47 ` [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads Gabriel Carrillo
@ 2017-08-25 20:27 ` Gabriel Carrillo
0 siblings, 0 replies; 30+ messages in thread
From: Gabriel Carrillo @ 2017-08-25 20:27 UTC (permalink / raw)
To: rsanford; +Cc: dev
This commit adds support for timers being created from
non-EAL threads; it maps timers from all such threads to
lcore id RTE_MAX_LCORE, and puts them all in a corresponding
skiplist.
Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com>
---
v2:
* Address checkpatch warnings
lib/librte_timer/rte_timer.c | 48 ++++++++++++++++++++++++++++++--------------
1 file changed, 33 insertions(+), 15 deletions(-)
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index da2fc1a..ca7b80c 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -64,8 +64,8 @@ struct skiplist {
} __rte_cache_aligned;
struct priv_timer {
- /** one pending list per enabled lcore */
- struct skiplist pending_lists[RTE_MAX_LCORE];
+ /** one pending list per lcore, plus one for non-EAL threads */
+ struct skiplist pending_lists[RTE_MAX_LCORE + 1];
/** per-core variable that true if a timer was updated on this
* core since last reset of the variable */
@@ -85,7 +85,7 @@ struct priv_timer {
static struct priv_timer priv_timer[RTE_MAX_LCORE];
/** cache of IDs of enabled lcores */
-static unsigned int enabled_lcores[RTE_MAX_LCORE];
+static unsigned int enabled_lcores[RTE_MAX_LCORE + 1];
static int n_enabled_lcores;
/* when debug is enabled, store some statistics */
@@ -103,23 +103,33 @@ static int n_enabled_lcores;
void
rte_timer_subsystem_init(void)
{
- unsigned int lcore_id1, lcore_id2;
+ unsigned int target_lcore, installer_lcore;
struct skiplist *list;
+ struct priv_timer *priv_tim;
int i, j;
- RTE_LCORE_FOREACH(lcore_id1)
- enabled_lcores[n_enabled_lcores++] = lcore_id1;
+ RTE_LCORE_FOREACH(target_lcore)
+ enabled_lcores[n_enabled_lcores++] = target_lcore;
+
+ /* To handle timers coming from non-EAL threads */
+ enabled_lcores[n_enabled_lcores++] = RTE_MAX_LCORE;
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
- for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores;
- lcore_id1 = enabled_lcores[++i]) {
- priv_timer[lcore_id1].prev_lcore = lcore_id1;
+ for (i = 0, target_lcore = enabled_lcores[i]; i < n_enabled_lcores;
+ target_lcore = enabled_lcores[++i]) {
+ /* Don't use this value to index the priv_timer array */
+ if (target_lcore == RTE_MAX_LCORE)
+ continue;
- for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores;
- lcore_id2 = enabled_lcores[++j]) {
- list = &priv_timer[lcore_id1].pending_lists[lcore_id2];
+ priv_tim = &priv_timer[target_lcore];
+ priv_tim->prev_lcore = target_lcore;
+
+ for (j = 0, installer_lcore = enabled_lcores[j];
+ j < n_enabled_lcores;
+ installer_lcore = enabled_lcores[++j]) {
+ list = &priv_tim->pending_lists[installer_lcore];
rte_spinlock_init(&list->lock);
}
}
@@ -300,10 +310,16 @@ timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list,
static void
timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
{
- unsigned lcore_id = rte_lcore_id();
+ unsigned int installer_lcore, lcore_id = rte_lcore_id();
unsigned lvl;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
- struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id];
+ struct skiplist *list;
+
+ /* Check if timer being installed from non-EAL thread */
+ installer_lcore = (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE :
+ lcore_id;
+
+ list = &priv_timer[tim_lcore].pending_lists[installer_lcore];
/* if timer needs to be scheduled on another core, we need to
* lock the list; if it is on local core, we need to lock if
@@ -439,7 +455,9 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
* the state so we don't need to use cmpset() here */
rte_wmb();
status.state = RTE_TIMER_PENDING;
- status.installer = lcore_id;
+ /* Check if installer is non-EAL thread */
+ status.installer = (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE :
+ lcore_id;
status.owner = tim_lcore;
tim->status.u32 = status.u32;
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v2 3/3] doc: update timer lib docs
2017-08-23 14:47 ` [dpdk-dev] [PATCH 3/3] doc: update timer lib docs Gabriel Carrillo
@ 2017-08-25 20:27 ` Gabriel Carrillo
0 siblings, 0 replies; 30+ messages in thread
From: Gabriel Carrillo @ 2017-08-25 20:27 UTC (permalink / raw)
To: rsanford; +Cc: dev
This change updates the timer library documentation to
reflect a change to the organization of the skiplists
in the implementation.
Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com>
---
doc/guides/prog_guide/timer_lib.rst | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/doc/guides/prog_guide/timer_lib.rst b/doc/guides/prog_guide/timer_lib.rst
index f437417..f94ffaa 100644
--- a/doc/guides/prog_guide/timer_lib.rst
+++ b/doc/guides/prog_guide/timer_lib.rst
@@ -1,5 +1,5 @@
.. BSD LICENSE
- Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -53,16 +53,17 @@ Refer to the `callout manual <http://www.daemon-systems.org/man/callout.9.html>`
Implementation Details
----------------------
-Timers are tracked on a per-lcore basis,
-with all pending timers for a core being maintained in order of timer expiry in a skiplist data structure.
-The skiplist used has ten levels and each entry in the table appears in each level with probability ¼^level.
+Timers are tracked in a per-lcore array of skiplist data structures; each
+lcore has one skiplist corresponding to each other lcore that could load a timer on it. All pending
+timers in each skiplist are maintained in order of timer expiry.
+Each skiplist has ten levels and each entry in the table appears in each level with probability ¼^level.
This means that all entries are present in level 0, 1 in every 4 entries is present at level 1,
one in every 16 at level 2 and so on up to level 9.
This means that adding and removing entries from the timer list for a core can be done in log(n) time,
up to 4^10 entries, that is, approximately 1,000,000 timers per lcore.
A timer structure contains a special field called status,
-which is a union of a timer state (stopped, pending, running, config) and an owner (lcore id).
+which is a union of a timer state (stopped, pending, running, config), an installer (lcore id), and an owner (lcore id).
Depending on the timer state, we know if a timer is present in a list or not:
* STOPPED: no owner, not in a list
@@ -77,17 +78,17 @@ Resetting or stopping a timer while it is in a CONFIG or RUNNING state is not al
When modifying the state of a timer,
a Compare And Swap instruction should be used to guarantee that the status (state+owner) is modified atomically.
-Inside the rte_timer_manage() function,
-the skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries,
+Inside the rte_timer_manage() function, each of an lcore's skiplists is traversed in sequence.
+Each skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries,
until an entry which has not yet expired has been encountered.
-To improve performance in the case where there are entries in the timer list but none of those timers have yet expired,
+To improve performance in the case where there are entries in a skiplist but none of those timers have yet expired,
the expiry time of the first list entry is maintained within the per-core timer list structure itself.
On 64-bit platforms, this value can be checked without the need to take a lock on the overall structure.
(Since expiry times are maintained as 64-bit values,
a check on the value cannot be done on 32-bit platforms without using either a compare-and-swap (CAS) instruction or using a lock,
so this additional check is skipped in favor of checking as normal once the lock has been taken.)
On both 64-bit and 32-bit platforms,
-a call to rte_timer_manage() returns without taking a lock in the case where the timer list for the calling core is empty.
+rte_timer_manage() can continue on to an lcore's next skiplist without taking a lock in the case where a timer list is empty.
Use Cases
---------
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/3] timer: add per-installer pending lists for each lcore
2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
@ 2017-08-29 10:57 ` Ananyev, Konstantin
2017-08-29 16:13 ` Carrillo, Erik G
0 siblings, 1 reply; 30+ messages in thread
From: Ananyev, Konstantin @ 2017-08-29 10:57 UTC (permalink / raw)
To: Carrillo, Erik G, rsanford; +Cc: dev
Hi Gabriel,
>
> Instead of each priv_timer struct containing a single skiplist, this
> commit adds a skiplist for each enabled lcore to priv_timer. In the case
> that multiple lcores repeatedly install timers on the same target lcore,
> this change reduces lock contention for the target lcore's skiplists and
> increases performance.
I am not an rte_timer expert, but there is one thing that worries me:
It seems that complexity of timer_manage() has increased with that patch quite a bit:
now it has to check/process up to RTE_MAX_LCORE skiplists instead of one,
also it has to somehow to properly sort up to RTE_MAX_LCORE lists of
retrieved (ready to run) timers.
Wouldn't all that affect it's running time?
I understand your intention to reduce lock contention,
but I suppose at least it could be done in a configurable way.
Let say allow user to specify dimension of pending_lists[] at init phase or so.
Then timer from lcore_id=N will endup in pending_lists[N%RTE_DIM(pendilng_list)].
Another thought - might be better to divide pending timers list not by client (lcore) id,
but by expiration time - some analog of timer wheel or so.
That, I think might greatly decrease the probability that timer_manage() and timer_add()
will try to access the same list.
>From other side timer_manage() still would have to consume skip-lists one by one.
Though I suppose that's quite radical change from what we have right now.
Konstantin
>
> Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com>
> ---
> v2:
> * Address checkpatch warnings
>
> lib/librte_timer/rte_timer.c | 309 +++++++++++++++++++++++++++----------------
> lib/librte_timer/rte_timer.h | 9 +-
> 2 files changed, 202 insertions(+), 116 deletions(-)
>
> diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
> index 5ee0840..da2fc1a 100644
> --- a/lib/librte_timer/rte_timer.c
> +++ b/lib/librte_timer/rte_timer.c
> @@ -37,6 +37,7 @@
> #include <inttypes.h>
> #include <assert.h>
> #include <sys/queue.h>
> +#include <stdbool.h>
>
> #include <rte_atomic.h>
> #include <rte_common.h>
> @@ -56,17 +57,19 @@
>
> LIST_HEAD(rte_timer_list, rte_timer);
>
> +struct skiplist {
> + struct rte_timer head; /**< dummy timer instance to head up list */
> + rte_spinlock_t lock; /**< lock to protect list access */
> + unsigned int depth; /**< track the current depth of the skiplist */
> +} __rte_cache_aligned;
> +
> struct priv_timer {
> - struct rte_timer pending_head; /**< dummy timer instance to head up list */
> - rte_spinlock_t list_lock; /**< lock to protect list access */
> + /** one pending list per enabled lcore */
> + struct skiplist pending_lists[RTE_MAX_LCORE];
>
> /** per-core variable that true if a timer was updated on this
> * core since last reset of the variable */
> int updated;
> -
> - /** track the current depth of the skiplist */
> - unsigned curr_skiplist_depth;
> -
> unsigned prev_lcore; /**< used for lcore round robin */
>
> /** running timer on this lcore now */
> @@ -81,6 +84,10 @@ struct priv_timer {
> /** per-lcore private info for timers */
> static struct priv_timer priv_timer[RTE_MAX_LCORE];
>
> +/** cache of IDs of enabled lcores */
> +static unsigned int enabled_lcores[RTE_MAX_LCORE];
> +static int n_enabled_lcores;
> +
> /* when debug is enabled, store some statistics */
> #ifdef RTE_LIBRTE_TIMER_DEBUG
> #define __TIMER_STAT_ADD(name, n) do { \
> @@ -96,14 +103,25 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE];
> void
> rte_timer_subsystem_init(void)
> {
> - unsigned lcore_id;
> + unsigned int lcore_id1, lcore_id2;
> + struct skiplist *list;
> + int i, j;
> +
> + RTE_LCORE_FOREACH(lcore_id1)
> + enabled_lcores[n_enabled_lcores++] = lcore_id1;
>
> /* since priv_timer is static, it's zeroed by default, so only init some
> * fields.
> */
> - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
> - rte_spinlock_init(&priv_timer[lcore_id].list_lock);
> - priv_timer[lcore_id].prev_lcore = lcore_id;
> + for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores;
> + lcore_id1 = enabled_lcores[++i]) {
> + priv_timer[lcore_id1].prev_lcore = lcore_id1;
> +
> + for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores;
> + lcore_id2 = enabled_lcores[++j]) {
> + list = &priv_timer[lcore_id1].pending_lists[lcore_id2];
> + rte_spinlock_init(&list->lock);
> + }
> }
> }
>
> @@ -114,7 +132,8 @@ rte_timer_init(struct rte_timer *tim)
> union rte_timer_status status;
>
> status.state = RTE_TIMER_STOP;
> - status.owner = RTE_TIMER_NO_OWNER;
> + status.installer = RTE_TIMER_NO_LCORE;
> + status.owner = RTE_TIMER_NO_LCORE;
> tim->status.u32 = status.u32;
> }
>
> @@ -142,7 +161,7 @@ timer_set_config_state(struct rte_timer *tim,
> * or ready to run on local core, exit
> */
> if (prev_status.state == RTE_TIMER_RUNNING &&
> - (prev_status.owner != (uint16_t)lcore_id ||
> + (prev_status.owner != (int)lcore_id ||
> tim != priv_timer[lcore_id].running_tim))
> return -1;
>
> @@ -153,7 +172,8 @@ timer_set_config_state(struct rte_timer *tim,
> /* here, we know that timer is stopped or pending,
> * mark it atomically as being configured */
> status.state = RTE_TIMER_CONFIG;
> - status.owner = (int16_t)lcore_id;
> + status.installer = RTE_TIMER_NO_LCORE;
> + status.owner = lcore_id;
> success = rte_atomic32_cmpset(&tim->status.u32,
> prev_status.u32,
> status.u32);
> @@ -185,7 +205,8 @@ timer_set_running_state(struct rte_timer *tim)
> /* here, we know that timer is stopped or pending,
> * mark it atomically as being configured */
> status.state = RTE_TIMER_RUNNING;
> - status.owner = (int16_t)lcore_id;
> + status.installer = prev_status.installer;
> + status.owner = lcore_id;
> success = rte_atomic32_cmpset(&tim->status.u32,
> prev_status.u32,
> status.u32);
> @@ -236,11 +257,11 @@ timer_get_skiplist_level(unsigned curr_depth)
> * are <= that time value.
> */
> static void
> -timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
> +timer_get_prev_entries(uint64_t time_val, struct skiplist *list,
> struct rte_timer **prev)
> {
> - unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
> - prev[lvl] = &priv_timer[tim_lcore].pending_head;
> + unsigned int lvl = list->depth;
> + prev[lvl] = &list->head;
> while(lvl != 0) {
> lvl--;
> prev[lvl] = prev[lvl+1];
> @@ -255,15 +276,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
> * all skiplist levels.
> */
> static void
> -timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
> +timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list,
> struct rte_timer **prev)
> {
> int i;
> /* to get a specific entry in the list, look for just lower than the time
> * values, and then increment on each level individually if necessary
> */
> - timer_get_prev_entries(tim->expire - 1, tim_lcore, prev);
> - for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
> + timer_get_prev_entries(tim->expire - 1, list, prev);
> + for (i = list->depth - 1; i >= 0; i--) {
> while (prev[i]->sl_next[i] != NULL &&
> prev[i]->sl_next[i] != tim &&
> prev[i]->sl_next[i]->expire <= tim->expire)
> @@ -282,25 +303,25 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
> unsigned lcore_id = rte_lcore_id();
> unsigned lvl;
> struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
> + struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id];
>
> /* if timer needs to be scheduled on another core, we need to
> * lock the list; if it is on local core, we need to lock if
> * we are not called from rte_timer_manage() */
> if (tim_lcore != lcore_id || !local_is_locked)
> - rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
> + rte_spinlock_lock(&list->lock);
>
> /* find where exactly this element goes in the list of elements
> * for each depth. */
> - timer_get_prev_entries(tim->expire, tim_lcore, prev);
> + timer_get_prev_entries(tim->expire, list, prev);
>
> /* now assign it a new level and add at that level */
> - const unsigned tim_level = timer_get_skiplist_level(
> - priv_timer[tim_lcore].curr_skiplist_depth);
> - if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
> - priv_timer[tim_lcore].curr_skiplist_depth++;
> + const unsigned int tim_level = timer_get_skiplist_level(list->depth);
> + if (tim_level == list->depth)
> + list->depth++;
>
> lvl = tim_level;
> - while (lvl > 0) {
> + while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) {
> tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
> prev[lvl]->sl_next[lvl] = tim;
> lvl--;
> @@ -310,11 +331,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
>
> /* save the lowest list entry into the expire field of the dummy hdr
> * NOTE: this is not atomic on 32-bit*/
> - priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\
> - pending_head.sl_next[0]->expire;
> + list->head.expire = list->head.sl_next[0]->expire;
>
> if (tim_lcore != lcore_id || !local_is_locked)
> - rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
> + rte_spinlock_unlock(&list->lock);
> }
>
> /*
> @@ -330,35 +350,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
> unsigned prev_owner = prev_status.owner;
> int i;
> struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
> + struct skiplist *list;
> +
> + list = &priv_timer[prev_owner].pending_lists[prev_status.installer];
>
> /* if timer needs is pending another core, we need to lock the
> * list; if it is on local core, we need to lock if we are not
> * called from rte_timer_manage() */
> if (prev_owner != lcore_id || !local_is_locked)
> - rte_spinlock_lock(&priv_timer[prev_owner].list_lock);
> + rte_spinlock_lock(&list->lock);
>
> /* save the lowest list entry into the expire field of the dummy hdr.
> * NOTE: this is not atomic on 32-bit */
> - if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
> - priv_timer[prev_owner].pending_head.expire =
> - ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);
> + if (tim == list->head.sl_next[0])
> + list->head.expire = ((tim->sl_next[0] == NULL) ?
> + 0 : tim->sl_next[0]->expire);
>
> /* adjust pointers from previous entries to point past this */
> - timer_get_prev_entries_for_node(tim, prev_owner, prev);
> - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
> + timer_get_prev_entries_for_node(tim, list, prev);
> + for (i = list->depth - 1; i >= 0; i--) {
> if (prev[i]->sl_next[i] == tim)
> prev[i]->sl_next[i] = tim->sl_next[i];
> }
>
> /* in case we deleted last entry at a level, adjust down max level */
> - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
> - if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
> - priv_timer[prev_owner].curr_skiplist_depth --;
> + for (i = list->depth - 1; i >= 0; i--)
> + if (list->head.sl_next[i] == NULL)
> + list->depth--;
> else
> break;
>
> if (prev_owner != lcore_id || !local_is_locked)
> - rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
> + rte_spinlock_unlock(&list->lock);
> }
>
> /* Reset and start the timer associated with the timer handle (private func) */
> @@ -416,7 +439,8 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
> * the state so we don't need to use cmpset() here */
> rte_wmb();
> status.state = RTE_TIMER_PENDING;
> - status.owner = (int16_t)tim_lcore;
> + status.installer = lcore_id;
> + status.owner = tim_lcore;
> tim->status.u32 = status.u32;
>
> return 0;
> @@ -484,7 +508,8 @@ rte_timer_stop(struct rte_timer *tim)
> /* mark timer as stopped */
> rte_wmb();
> status.state = RTE_TIMER_STOP;
> - status.owner = RTE_TIMER_NO_OWNER;
> + status.installer = RTE_TIMER_NO_LCORE;
> + status.owner = RTE_TIMER_NO_LCORE;
> tim->status.u32 = status.u32;
>
> return 0;
> @@ -506,119 +531,179 @@ rte_timer_pending(struct rte_timer *tim)
> }
>
> /* must be called periodically, run all timer that expired */
> -void rte_timer_manage(void)
> +void
> +rte_timer_manage(void)
> {
> union rte_timer_status status;
> - struct rte_timer *tim, *next_tim;
> - struct rte_timer *run_first_tim, **pprev;
> - unsigned lcore_id = rte_lcore_id();
> + struct rte_timer *tim, *next_tim, **pprev;
> + struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1];
> struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
> - uint64_t cur_time;
> - int i, ret;
> + struct priv_timer *priv_tim;
> + unsigned int installer_lcore, lcore_id = rte_lcore_id();
> + uint64_t cur_time, min_expire;
> + int i, j, min_idx, ret;
> + int nrunlists = 0;
> + int local_is_locked = 0;
> + struct skiplist *dest_list, *list = NULL;
> + bool done;
>
> /* timer manager only runs on EAL thread with valid lcore_id */
> assert(lcore_id < RTE_MAX_LCORE);
>
> + priv_tim = &priv_timer[lcore_id];
> +
> __TIMER_STAT_ADD(manage, 1);
> - /* optimize for the case where per-cpu list is empty */
> - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
> - return;
> - cur_time = rte_get_timer_cycles();
> + for (i = 0, installer_lcore = enabled_lcores[i]; i < n_enabled_lcores;
> + installer_lcore = enabled_lcores[++i]) {
> + list = &priv_tim->pending_lists[installer_lcore];
> +
> + /* optimize for the case where list is empty */
> + if (list->head.sl_next[0] == NULL)
> + continue;
> + cur_time = rte_get_timer_cycles();
>
> #ifdef RTE_ARCH_X86_64
> - /* on 64-bit the value cached in the pending_head.expired will be
> - * updated atomically, so we can consult that for a quick check here
> - * outside the lock */
> - if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
> - return;
> + /* on 64-bit the value cached in the list->head.expired will be
> + * updated atomically, so we can consult that for a quick check
> + * here outside the lock
> + */
> + if (likely(list->head.expire > cur_time))
> + continue;
> #endif
>
> - /* browse ordered list, add expired timers in 'expired' list */
> - rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
> + /* browse ordered list, add expired timers in 'expired' list */
> + rte_spinlock_lock(&list->lock);
>
> - /* if nothing to do just unlock and return */
> - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
> - priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
> - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
> - return;
> - }
> + /* if nothing to do just unlock and continue */
> + if (list->head.sl_next[0] == NULL ||
> + list->head.sl_next[0]->expire > cur_time) {
> + rte_spinlock_unlock(&list->lock);
> + continue;
> + }
>
> - /* save start of list of expired timers */
> - tim = priv_timer[lcore_id].pending_head.sl_next[0];
> + /* save start of list of expired timers */
> + tim = list->head.sl_next[0];
> +
> + /* break the existing list at current time point */
> + timer_get_prev_entries(cur_time, list, prev);
> + for (j = list->depth - 1; j >= 0; j--) {
> + if (prev[j] == &list->head)
> + continue;
> + list->head.sl_next[j] =
> + prev[j]->sl_next[j];
> + if (prev[j]->sl_next[j] == NULL)
> + list->depth--;
> + prev[j]->sl_next[j] = NULL;
> + }
>
> - /* break the existing list at current time point */
> - timer_get_prev_entries(cur_time, lcore_id, prev);
> - for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) {
> - if (prev[i] == &priv_timer[lcore_id].pending_head)
> - continue;
> - priv_timer[lcore_id].pending_head.sl_next[i] =
> - prev[i]->sl_next[i];
> - if (prev[i]->sl_next[i] == NULL)
> - priv_timer[lcore_id].curr_skiplist_depth--;
> - prev[i] ->sl_next[i] = NULL;
> - }
> + /* transition run-list from PENDING to RUNNING */
> + run_first_tims[nrunlists] = tim;
> + pprev = &run_first_tims[nrunlists];
> + nrunlists++;
> +
> + for (; tim != NULL; tim = next_tim) {
> + next_tim = tim->sl_next[0];
> +
> + ret = timer_set_running_state(tim);
> + if (likely(ret == 0)) {
> + pprev = &tim->sl_next[0];
> + } else {
> + /* another core is trying to re-config this one,
> + * remove it from local expired list
> + */
> + *pprev = next_tim;
> + }
> + }
>
> - /* transition run-list from PENDING to RUNNING */
> - run_first_tim = tim;
> - pprev = &run_first_tim;
> + /* update the next to expire timer value */
> + list->head.expire = (list->head.sl_next[0] == NULL) ?
> + 0 : list->head.sl_next[0]->expire;
>
> - for ( ; tim != NULL; tim = next_tim) {
> - next_tim = tim->sl_next[0];
> + rte_spinlock_unlock(&list->lock);
> + }
>
> - ret = timer_set_running_state(tim);
> - if (likely(ret == 0)) {
> - pprev = &tim->sl_next[0];
> - } else {
> - /* another core is trying to re-config this one,
> - * remove it from local expired list
> - */
> - *pprev = next_tim;
> + /* Now process the run lists */
> + while (1) {
> + done = true;
> + min_expire = UINT64_MAX;
> + min_idx = 0;
> +
> + /* Find the next oldest timer to process */
> + for (i = 0; i < nrunlists; i++) {
> + tim = run_first_tims[i];
> +
> + if (tim != NULL && tim->expire < min_expire) {
> + min_expire = tim->expire;
> + min_idx = i;
> + done = false;
> + }
> }
> - }
>
> - /* update the next to expire timer value */
> - priv_timer[lcore_id].pending_head.expire =
> - (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
> - priv_timer[lcore_id].pending_head.sl_next[0]->expire;
> + if (done)
> + break;
> +
> + tim = run_first_tims[min_idx];
>
> - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
> + /* Move down the runlist from which we picked a timer to
> + * execute
> + */
> + run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0];
>
> - /* now scan expired list and call callbacks */
> - for (tim = run_first_tim; tim != NULL; tim = next_tim) {
> - next_tim = tim->sl_next[0];
> - priv_timer[lcore_id].updated = 0;
> - priv_timer[lcore_id].running_tim = tim;
> + priv_tim->updated = 0;
> + priv_tim->running_tim = tim;
>
> /* execute callback function with list unlocked */
> tim->f(tim, tim->arg);
>
> __TIMER_STAT_ADD(pending, -1);
> /* the timer was stopped or reloaded by the callback
> - * function, we have nothing to do here */
> - if (priv_timer[lcore_id].updated == 1)
> + * function, we have nothing to do here
> + */
> + if (priv_tim->updated == 1)
> continue;
>
> if (tim->period == 0) {
> /* remove from done list and mark timer as stopped */
> status.state = RTE_TIMER_STOP;
> - status.owner = RTE_TIMER_NO_OWNER;
> + status.installer = RTE_TIMER_NO_LCORE;
> + status.owner = RTE_TIMER_NO_LCORE;
> rte_wmb();
> tim->status.u32 = status.u32;
> }
> else {
> - /* keep it in list and mark timer as pending */
> - rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
> + dest_list = &priv_tim->pending_lists[lcore_id];
> +
> + /* If the destination list is the current list
> + * we can acquire the lock here, and hold it
> + * across removal and insertion of the timer.
> + */
> + if (list == dest_list) {
> + rte_spinlock_lock(&list->lock);
> + local_is_locked = 1;
> + }
> +
> + /* set timer state back to PENDING and
> + * reinsert it in pending list
> + */
> status.state = RTE_TIMER_PENDING;
> __TIMER_STAT_ADD(pending, 1);
> - status.owner = (int16_t)lcore_id;
> + status.installer = tim->status.installer;
> + status.owner = lcore_id;
> rte_wmb();
> tim->status.u32 = status.u32;
> - __rte_timer_reset(tim, tim->expire + tim->period,
> - tim->period, lcore_id, tim->f, tim->arg, 1);
> - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
> +
> + __rte_timer_reset(tim,
> + tim->expire + tim->period,
> + tim->period, lcore_id,
> + tim->f, tim->arg, local_is_locked);
> +
> + if (local_is_locked) {
> + rte_spinlock_unlock(&list->lock);
> + local_is_locked = 0;
> + }
> }
> }
> - priv_timer[lcore_id].running_tim = NULL;
> + priv_tim->running_tim = NULL;
> }
>
> /* dump statistics about timers */
> diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h
> index a276a73..4cc6baf 100644
> --- a/lib/librte_timer/rte_timer.h
> +++ b/lib/librte_timer/rte_timer.h
> @@ -77,7 +77,7 @@ extern "C" {
> #define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */
> #define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */
>
> -#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */
> +#define RTE_TIMER_NO_LCORE -2
>
> /**
> * Timer type: Periodic or single (one-shot).
> @@ -94,10 +94,11 @@ enum rte_timer_type {
> union rte_timer_status {
> RTE_STD_C11
> struct {
> - uint16_t state; /**< Stop, pending, running, config. */
> - int16_t owner; /**< The lcore that owns the timer. */
> + unsigned state : 2;
> + int installer : 15;
> + int owner : 15;
> };
> - uint32_t u32; /**< To atomic-set status + owner. */
> + uint32_t u32; /**< To atomic-set status, installer, owner */
> };
>
> #ifdef RTE_LIBRTE_TIMER_DEBUG
> --
> 2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore
2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo
2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
@ 2017-08-29 15:11 ` Stephen Hemminger
1 sibling, 0 replies; 30+ messages in thread
From: Stephen Hemminger @ 2017-08-29 15:11 UTC (permalink / raw)
To: Gabriel Carrillo; +Cc: rsanford, dev
On Wed, 23 Aug 2017 09:47:22 -0500
Gabriel Carrillo <erik.g.carrillo@intel.com> wrote:
> __TIMER_STAT_ADD(manage, 1);
> - /* optimize for the case where per-cpu list is empty */
> - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
> - return;
> - cur_time = rte_get_timer_cycles();
> + for (i = 0, installer_lcore = enabled_lcores[i]; i < n_enabled_lcores;
> + installer_lcore = enabled_lcores[++i]) {
> + list = &priv_tim->pending_lists[installer_lcore];
> +
> + /* optimize for the case where list is empty */
> + if (list->head.sl_next[0] == NULL)
> + continue;
> + cur_time = rte_get_timer_cycles();
This code is critical. We expect applications using timers to call timer_manage
very often and the case of no timers present, and no timer due must consume as
few cycles as possible
This change will add significant performance delays to these applications.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/3] timer: add per-installer pending lists for each lcore
2017-08-29 10:57 ` Ananyev, Konstantin
@ 2017-08-29 16:13 ` Carrillo, Erik G
0 siblings, 0 replies; 30+ messages in thread
From: Carrillo, Erik G @ 2017-08-29 16:13 UTC (permalink / raw)
To: Ananyev, Konstantin, rsanford; +Cc: dev, Stephen Hemminger
Hi Konstantin,
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Tuesday, August 29, 2017 5:57 AM
> To: Carrillo, Erik G <erik.g.carrillo@intel.com>; rsanford@akamai.com
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v2 1/3] timer: add per-installer pending lists
> for each lcore
>
> Hi Gabriel,
>
> >
> > Instead of each priv_timer struct containing a single skiplist, this
> > commit adds a skiplist for each enabled lcore to priv_timer. In the
> > case that multiple lcores repeatedly install timers on the same target
> > lcore, this change reduces lock contention for the target lcore's
> > skiplists and increases performance.
>
> I am not an rte_timer expert, but there is one thing that worries me:
> It seems that complexity of timer_manage() has increased with that patch
> quite a bit:
> now it has to check/process up to RTE_MAX_LCORE skiplists instead of one,
> also it has to somehow to properly sort up to RTE_MAX_LCORE lists of
> retrieved (ready to run) timers.
> Wouldn't all that affect it's running time?
>
Yes, it would indeed increase it.
> I understand your intention to reduce lock contention, but I suppose at least
> it could be done in a configurable way.
> Let say allow user to specify dimension of pending_lists[] at init phase or so.
> Then timer from lcore_id=N will endup in
> pending_lists[N%RTE_DIM(pendilng_list)].
>
This is a neat idea, and seemingly would allow the original performance to be maintained for applications where contention is not an issue. I'll look into this change, as it may address other developers' concerns as well - thanks for the suggestion.
> Another thought - might be better to divide pending timers list not by client
> (lcore) id, but by expiration time - some analog of timer wheel or so.
> That, I think might greatly decrease the probability that timer_manage() and
> timer_add() will try to access the same list.
> From other side timer_manage() still would have to consume skip-lists one
> by one.
> Though I suppose that's quite radical change from what we have right now.
> Konstantin
>
Thanks,
Gabriel
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v3 0/3] timer library enhancement
2017-08-25 20:26 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
@ 2017-09-13 22:05 ` Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo
` (3 more replies)
0 siblings, 4 replies; 30+ messages in thread
From: Erik Gabriel Carrillo @ 2017-09-13 22:05 UTC (permalink / raw)
To: rsanford; +Cc: dev, konstantin.ananyev, stephen, keith.wiles, narender.vangati
In the current implementation of the DPDK timer library, timers can be
created and set to be handled by a target lcore by adding it to a
skiplist that corresponds to that lcore. However, if an application
enables multiple lcores, and each of these lcores repeatedly attempts
to install timers on the same target lcore, overall application
throughput will be reduced as all lcores contend to acquire the lock
guarding the single skiplist of pending timers.
This patchset addresses this scenario by adding an option to enable an
array of skiplists in each lcore's priv_timer struct. Then, when lcore i
installs a timer on lcore k, the timer will be added to the ith skiplist
for lcore k. If lcore j installs a timer on lcore k simultaneously,
lcores i and j can both proceed since they will be acquiring different
locks for different lists. This functionality is off by default, and
can be enabled via a new function.
When lcore k processes its pending timers, if the multiple pending list
option is enabled, it will traverse skiplists in its array and acquire
the current skiplist's lock while a run list is broken out; meanwhile,
all other lists can continue to be modified. Then, all run lists for
lcore k are collected and traversed together so timers are executed in
their relative order. If the multiple pending list option is not enabled
(the default), only a single list will be traversed, as before.
Erik Gabriel Carrillo (3):
timer: add multiple pending lists option for each lcore
timer: handle timers installed from non-EAL threads
doc: update timer lib docs
doc/guides/prog_guide/timer_lib.rst | 27 ++-
doc/guides/rel_notes/release_17_11.rst | 7 +
lib/librte_timer/rte_timer.c | 384 ++++++++++++++++++++++-----------
lib/librte_timer/rte_timer.h | 29 ++-
lib/librte_timer/rte_timer_version.map | 6 +
5 files changed, 319 insertions(+), 134 deletions(-)
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo
@ 2017-09-13 22:05 ` Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo
` (2 subsequent siblings)
3 siblings, 0 replies; 30+ messages in thread
From: Erik Gabriel Carrillo @ 2017-09-13 22:05 UTC (permalink / raw)
To: rsanford; +Cc: dev, konstantin.ananyev, stephen, keith.wiles, narender.vangati
This commit adds support for enabling multiple pending timer lists in
each lcore's priv_timer struct with a new function; a single list is still
used by default. In the case that multiple lcores repeatedly install
timers on the same target lcore, this option reduces lock contention for
the target lcore's skiplists and increases performance.
Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
---
v3:
* Added ability to enable multiple pending lists for a specified lcore
with a new function, in response to feedback from Keith Wiles, Konstantin
Ananyev, and Stephen Hemminger. This functionality is off by default,
so that new overhead in rte_timer_manage() is minimized if the
application doesn't want this feature.
* Updated map file to reflect addition of new function.
* Made minor updates to some variable names and logic structure.
v2:
* Address checkpatch warnings
lib/librte_timer/rte_timer.c | 376 ++++++++++++++++++++++-----------
lib/librte_timer/rte_timer.h | 29 ++-
lib/librte_timer/rte_timer_version.map | 6 +
3 files changed, 287 insertions(+), 124 deletions(-)
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 5ee0840..1f4141e 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -37,6 +37,7 @@
#include <inttypes.h>
#include <assert.h>
#include <sys/queue.h>
+#include <stdbool.h>
#include <rte_atomic.h>
#include <rte_common.h>
@@ -54,19 +55,24 @@
#include "rte_timer.h"
+#define SINGLE_LIST_IDX 0
+
LIST_HEAD(rte_timer_list, rte_timer);
+struct skiplist {
+ struct rte_timer head; /**< dummy timer instance to head up list */
+ rte_spinlock_t lock; /**< lock to protect list access */
+ unsigned int depth; /**< track the current depth of the skiplist */
+} __rte_cache_aligned;
+
struct priv_timer {
- struct rte_timer pending_head; /**< dummy timer instance to head up list */
- rte_spinlock_t list_lock; /**< lock to protect list access */
+ /** one pending list per enabled lcore */
+ struct skiplist pending_lists[RTE_MAX_LCORE];
+ bool multi_pendlists;
/** per-core variable that true if a timer was updated on this
* core since last reset of the variable */
int updated;
-
- /** track the current depth of the skiplist */
- unsigned curr_skiplist_depth;
-
unsigned prev_lcore; /**< used for lcore round robin */
/** running timer on this lcore now */
@@ -81,6 +87,10 @@ struct priv_timer {
/** per-lcore private info for timers */
static struct priv_timer priv_timer[RTE_MAX_LCORE];
+/** cache of IDs of enabled lcores */
+static unsigned int enabled_lcores[RTE_MAX_LCORE];
+static int n_enabled_lcores;
+
/* when debug is enabled, store some statistics */
#ifdef RTE_LIBRTE_TIMER_DEBUG
#define __TIMER_STAT_ADD(name, n) do { \
@@ -96,17 +106,60 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE];
void
rte_timer_subsystem_init(void)
{
- unsigned lcore_id;
+ unsigned int lcore_id, pending_lists_idx;
+ struct skiplist *list;
+ struct priv_timer *priv_tim;
+ int i, j;
+
+ RTE_LCORE_FOREACH(lcore_id)
+ enabled_lcores[n_enabled_lcores++] = lcore_id;
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
- for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
- rte_spinlock_init(&priv_timer[lcore_id].list_lock);
- priv_timer[lcore_id].prev_lcore = lcore_id;
+ for (i = 0; i < n_enabled_lcores; i++) {
+ lcore_id = enabled_lcores[i];
+
+
+ priv_tim = &priv_timer[lcore_id];
+ priv_tim->prev_lcore = lcore_id;
+ priv_tim->multi_pendlists = false;
+
+ for (j = 0; j < n_enabled_lcores; j++) {
+ pending_lists_idx = enabled_lcores[j];
+ list = &priv_tim->pending_lists[pending_lists_idx];
+ rte_spinlock_init(&list->lock);
+ }
+
+ /* Init the list at SINGLE_LIST_IDX, in case we didn't get to
+ * it above. This will be the skiplist we use unless the
+ * multi-pendlists option is later enabled for this lcore.
+ */
+ list = &priv_tim->pending_lists[SINGLE_LIST_IDX];
+ rte_spinlock_init(&list->lock);
}
}
+int
+rte_timer_subsystem_set_multi_pendlists(unsigned int lcore_id)
+{
+ struct priv_timer *priv_tim = &priv_timer[lcore_id];
+ struct skiplist *list = &priv_tim->pending_lists[SINGLE_LIST_IDX];
+
+ rte_spinlock_lock(&list->lock);
+
+ if (list->head.sl_next[0] != NULL) {
+ /* Timers are already on the default pending list. */
+ rte_spinlock_unlock(&list->lock);
+ return -1;
+ }
+
+ priv_tim->multi_pendlists = true;
+ rte_spinlock_unlock(&list->lock);
+
+ return 0;
+}
+
/* Initialize the timer handle tim for use */
void
rte_timer_init(struct rte_timer *tim)
@@ -114,7 +167,8 @@ rte_timer_init(struct rte_timer *tim)
union rte_timer_status status;
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.index = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
tim->status.u32 = status.u32;
}
@@ -142,7 +196,7 @@ timer_set_config_state(struct rte_timer *tim,
* or ready to run on local core, exit
*/
if (prev_status.state == RTE_TIMER_RUNNING &&
- (prev_status.owner != (uint16_t)lcore_id ||
+ (prev_status.owner != (int)lcore_id ||
tim != priv_timer[lcore_id].running_tim))
return -1;
@@ -153,7 +207,8 @@ timer_set_config_state(struct rte_timer *tim,
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_CONFIG;
- status.owner = (int16_t)lcore_id;
+ status.index = RTE_TIMER_NO_LCORE;
+ status.owner = lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
@@ -185,7 +240,8 @@ timer_set_running_state(struct rte_timer *tim)
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_RUNNING;
- status.owner = (int16_t)lcore_id;
+ status.index = prev_status.index;
+ status.owner = lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
@@ -236,11 +292,11 @@ timer_get_skiplist_level(unsigned curr_depth)
* are <= that time value.
*/
static void
-timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
+timer_get_prev_entries(uint64_t time_val, struct skiplist *list,
struct rte_timer **prev)
{
- unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
- prev[lvl] = &priv_timer[tim_lcore].pending_head;
+ unsigned int lvl = list->depth;
+ prev[lvl] = &list->head;
while(lvl != 0) {
lvl--;
prev[lvl] = prev[lvl+1];
@@ -255,15 +311,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
* all skiplist levels.
*/
static void
-timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
+timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list,
struct rte_timer **prev)
{
int i;
/* to get a specific entry in the list, look for just lower than the time
* values, and then increment on each level individually if necessary
*/
- timer_get_prev_entries(tim->expire - 1, tim_lcore, prev);
- for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
+ timer_get_prev_entries(tim->expire - 1, list, prev);
+ for (i = list->depth - 1; i >= 0; i--) {
while (prev[i]->sl_next[i] != NULL &&
prev[i]->sl_next[i] != tim &&
prev[i]->sl_next[i]->expire <= tim->expire)
@@ -277,30 +333,39 @@ timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
* timer must not be in a list
*/
static void
-timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
+timer_add(struct rte_timer *tim, unsigned int tim_lcore, int local_is_locked,
+ int *pending_lists_idx)
{
unsigned lcore_id = rte_lcore_id();
unsigned lvl;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+ struct skiplist *list;
+ struct priv_timer *priv_tim = &priv_timer[tim_lcore];
+
+ if (priv_tim->multi_pendlists)
+ *pending_lists_idx = lcore_id;
+ else
+ *pending_lists_idx = SINGLE_LIST_IDX;
+
+ list = &priv_tim->pending_lists[*pending_lists_idx];
/* if timer needs to be scheduled on another core, we need to
* lock the list; if it is on local core, we need to lock if
* we are not called from rte_timer_manage() */
if (tim_lcore != lcore_id || !local_is_locked)
- rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
+ rte_spinlock_lock(&list->lock);
/* find where exactly this element goes in the list of elements
* for each depth. */
- timer_get_prev_entries(tim->expire, tim_lcore, prev);
+ timer_get_prev_entries(tim->expire, list, prev);
/* now assign it a new level and add at that level */
- const unsigned tim_level = timer_get_skiplist_level(
- priv_timer[tim_lcore].curr_skiplist_depth);
- if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
- priv_timer[tim_lcore].curr_skiplist_depth++;
+ const unsigned int tim_level = timer_get_skiplist_level(list->depth);
+ if (tim_level == list->depth)
+ list->depth++;
lvl = tim_level;
- while (lvl > 0) {
+ while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) {
tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
prev[lvl]->sl_next[lvl] = tim;
lvl--;
@@ -310,11 +375,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
/* save the lowest list entry into the expire field of the dummy hdr
* NOTE: this is not atomic on 32-bit*/
- priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\
- pending_head.sl_next[0]->expire;
+ list->head.expire = list->head.sl_next[0]->expire;
if (tim_lcore != lcore_id || !local_is_locked)
- rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
+ rte_spinlock_unlock(&list->lock);
}
/*
@@ -330,35 +394,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
unsigned prev_owner = prev_status.owner;
int i;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+ struct skiplist *list;
+
+ list = &priv_timer[prev_owner].pending_lists[prev_status.index];
/* if timer needs is pending another core, we need to lock the
* list; if it is on local core, we need to lock if we are not
* called from rte_timer_manage() */
if (prev_owner != lcore_id || !local_is_locked)
- rte_spinlock_lock(&priv_timer[prev_owner].list_lock);
+ rte_spinlock_lock(&list->lock);
/* save the lowest list entry into the expire field of the dummy hdr.
* NOTE: this is not atomic on 32-bit */
- if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
- priv_timer[prev_owner].pending_head.expire =
- ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);
+ if (tim == list->head.sl_next[0])
+ list->head.expire = ((tim->sl_next[0] == NULL) ?
+ 0 : tim->sl_next[0]->expire);
/* adjust pointers from previous entries to point past this */
- timer_get_prev_entries_for_node(tim, prev_owner, prev);
- for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
+ timer_get_prev_entries_for_node(tim, list, prev);
+ for (i = list->depth - 1; i >= 0; i--) {
if (prev[i]->sl_next[i] == tim)
prev[i]->sl_next[i] = tim->sl_next[i];
}
/* in case we deleted last entry at a level, adjust down max level */
- for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
- if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
- priv_timer[prev_owner].curr_skiplist_depth --;
+ for (i = list->depth - 1; i >= 0; i--)
+ if (list->head.sl_next[i] == NULL)
+ list->depth--;
else
break;
if (prev_owner != lcore_id || !local_is_locked)
- rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
+ rte_spinlock_unlock(&list->lock);
}
/* Reset and start the timer associated with the timer handle (private func) */
@@ -371,15 +438,16 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
union rte_timer_status prev_status, status;
int ret;
unsigned lcore_id = rte_lcore_id();
+ struct priv_timer *priv_tim = &priv_timer[lcore_id];
+ int pending_lists_index;
/* round robin for tim_lcore */
if (tim_lcore == (unsigned)LCORE_ID_ANY) {
if (lcore_id < RTE_MAX_LCORE) {
/* EAL thread with valid lcore_id */
- tim_lcore = rte_get_next_lcore(
- priv_timer[lcore_id].prev_lcore,
- 0, 1);
- priv_timer[lcore_id].prev_lcore = tim_lcore;
+ tim_lcore = rte_get_next_lcore(priv_tim->prev_lcore, 0,
+ 1);
+ priv_tim->prev_lcore = tim_lcore;
} else
/* non-EAL thread do not run rte_timer_manage(),
* so schedule the timer on the first enabled lcore. */
@@ -395,7 +463,7 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
__TIMER_STAT_ADD(reset, 1);
if (prev_status.state == RTE_TIMER_RUNNING &&
lcore_id < RTE_MAX_LCORE) {
- priv_timer[lcore_id].updated = 1;
+ priv_tim->updated = 1;
}
/* remove it from list */
@@ -410,13 +478,14 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
tim->arg = arg;
__TIMER_STAT_ADD(pending, 1);
- timer_add(tim, tim_lcore, local_is_locked);
+ timer_add(tim, tim_lcore, local_is_locked, &pending_lists_index);
- /* update state: as we are in CONFIG state, only us can modify
+ /* update state: as we are in CONFIG state, only we can modify
* the state so we don't need to use cmpset() here */
rte_wmb();
status.state = RTE_TIMER_PENDING;
- status.owner = (int16_t)tim_lcore;
+ status.index = pending_lists_index;
+ status.owner = tim_lcore;
tim->status.u32 = status.u32;
return 0;
@@ -484,7 +553,8 @@ rte_timer_stop(struct rte_timer *tim)
/* mark timer as stopped */
rte_wmb();
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.index = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
tim->status.u32 = status.u32;
return 0;
@@ -506,119 +576,185 @@ rte_timer_pending(struct rte_timer *tim)
}
/* must be called periodically, run all timer that expired */
-void rte_timer_manage(void)
+void
+rte_timer_manage(void)
{
union rte_timer_status status;
- struct rte_timer *tim, *next_tim;
- struct rte_timer *run_first_tim, **pprev;
- unsigned lcore_id = rte_lcore_id();
+ struct rte_timer *tim, *next_tim, **pprev;
+ struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1];
struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
- uint64_t cur_time;
- int i, ret;
+ struct priv_timer *priv_tim;
+ unsigned int lcore_id = rte_lcore_id();
+ uint64_t cur_time, min_expire;
+ int i, j, min_idx, ret;
+ int npendlists, nrunlists = 0;
+ int local_is_locked = 0;
+ struct skiplist *dest_list, *list = NULL;
+ bool done;
+ int idx;
/* timer manager only runs on EAL thread with valid lcore_id */
assert(lcore_id < RTE_MAX_LCORE);
+ priv_tim = &priv_timer[lcore_id];
+
__TIMER_STAT_ADD(manage, 1);
- /* optimize for the case where per-cpu list is empty */
- if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
- return;
- cur_time = rte_get_timer_cycles();
+ npendlists = (priv_tim->multi_pendlists) ? n_enabled_lcores : 1;
+
+ for (i = 0; i < npendlists; i++) {
+ idx = (priv_tim->multi_pendlists) ? enabled_lcores[i] :
+ SINGLE_LIST_IDX;
+ list = &priv_tim->pending_lists[idx];
+
+ /* optimize for the case where list is empty */
+ if (list->head.sl_next[0] == NULL)
+ continue;
+ cur_time = rte_get_timer_cycles();
#ifdef RTE_ARCH_X86_64
- /* on 64-bit the value cached in the pending_head.expired will be
- * updated atomically, so we can consult that for a quick check here
- * outside the lock */
- if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
- return;
+ /* on 64-bit the value cached in the list->head.expired will be
+ * updated atomically, so we can consult that for a quick check
+ * here outside the lock
+ */
+ if (likely(list->head.expire > cur_time))
+ continue;
#endif
- /* browse ordered list, add expired timers in 'expired' list */
- rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
+ /* browse ordered list, add expired timers in 'expired' list */
+ rte_spinlock_lock(&list->lock);
- /* if nothing to do just unlock and return */
- if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
- priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
- return;
- }
+ /* if nothing to do just unlock and continue */
+ if (list->head.sl_next[0] == NULL ||
+ list->head.sl_next[0]->expire > cur_time) {
+ rte_spinlock_unlock(&list->lock);
+ continue;
+ }
- /* save start of list of expired timers */
- tim = priv_timer[lcore_id].pending_head.sl_next[0];
+ /* save start of list of expired timers */
+ tim = list->head.sl_next[0];
+
+ /* break the existing list at current time point */
+ timer_get_prev_entries(cur_time, list, prev);
+ for (j = list->depth - 1; j >= 0; j--) {
+ if (prev[j] == &list->head)
+ continue;
+ list->head.sl_next[j] =
+ prev[j]->sl_next[j];
+ if (prev[j]->sl_next[j] == NULL)
+ list->depth--;
+ prev[j]->sl_next[j] = NULL;
+ }
- /* break the existing list at current time point */
- timer_get_prev_entries(cur_time, lcore_id, prev);
- for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) {
- if (prev[i] == &priv_timer[lcore_id].pending_head)
- continue;
- priv_timer[lcore_id].pending_head.sl_next[i] =
- prev[i]->sl_next[i];
- if (prev[i]->sl_next[i] == NULL)
- priv_timer[lcore_id].curr_skiplist_depth--;
- prev[i] ->sl_next[i] = NULL;
- }
+ /* transition run-list from PENDING to RUNNING */
+ run_first_tims[nrunlists] = tim;
+ pprev = &run_first_tims[nrunlists];
+ nrunlists++;
+
+ for (; tim != NULL; tim = next_tim) {
+ next_tim = tim->sl_next[0];
+
+ ret = timer_set_running_state(tim);
+ if (likely(ret == 0)) {
+ pprev = &tim->sl_next[0];
+ } else {
+ /* another core is trying to re-config this one,
+ * remove it from local expired list
+ */
+ *pprev = next_tim;
+ }
+ }
- /* transition run-list from PENDING to RUNNING */
- run_first_tim = tim;
- pprev = &run_first_tim;
+ /* update the next to expire timer value */
+ list->head.expire = (list->head.sl_next[0] == NULL) ?
+ 0 : list->head.sl_next[0]->expire;
- for ( ; tim != NULL; tim = next_tim) {
- next_tim = tim->sl_next[0];
+ rte_spinlock_unlock(&list->lock);
+ }
- ret = timer_set_running_state(tim);
- if (likely(ret == 0)) {
- pprev = &tim->sl_next[0];
- } else {
- /* another core is trying to re-config this one,
- * remove it from local expired list
- */
- *pprev = next_tim;
+ /* Now process the run lists */
+ while (1) {
+ done = true;
+ min_expire = UINT64_MAX;
+ min_idx = 0;
+
+ /* Find the next oldest timer to process */
+ for (i = 0; i < nrunlists; i++) {
+ tim = run_first_tims[i];
+
+ if (tim != NULL && tim->expire < min_expire) {
+ min_expire = tim->expire;
+ min_idx = i;
+ done = false;
+ }
}
- }
- /* update the next to expire timer value */
- priv_timer[lcore_id].pending_head.expire =
- (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
- priv_timer[lcore_id].pending_head.sl_next[0]->expire;
+ if (done)
+ break;
+
+ tim = run_first_tims[min_idx];
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+ /* Move down the runlist from which we picked a timer to
+ * execute
+ */
+ run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0];
- /* now scan expired list and call callbacks */
- for (tim = run_first_tim; tim != NULL; tim = next_tim) {
- next_tim = tim->sl_next[0];
- priv_timer[lcore_id].updated = 0;
- priv_timer[lcore_id].running_tim = tim;
+ priv_tim->updated = 0;
+ priv_tim->running_tim = tim;
/* execute callback function with list unlocked */
tim->f(tim, tim->arg);
__TIMER_STAT_ADD(pending, -1);
/* the timer was stopped or reloaded by the callback
- * function, we have nothing to do here */
- if (priv_timer[lcore_id].updated == 1)
+ * function, we have nothing to do here
+ */
+ if (priv_tim->updated == 1)
continue;
if (tim->period == 0) {
/* remove from done list and mark timer as stopped */
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.index = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
rte_wmb();
tim->status.u32 = status.u32;
}
else {
- /* keep it in list and mark timer as pending */
- rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
+ idx = (priv_tim->multi_pendlists) ? lcore_id :
+ SINGLE_LIST_IDX;
+ dest_list = &priv_tim->pending_lists[idx];
+
+ /* If the destination list is the current list
+ * we can acquire the lock here, and hold it
+ * across removal and insertion of the timer.
+ */
+ if (list == dest_list) {
+ rte_spinlock_lock(&list->lock);
+ local_is_locked = 1;
+ }
+
+ /* set timer state back to PENDING and
+ * reinsert it in pending list
+ */
status.state = RTE_TIMER_PENDING;
__TIMER_STAT_ADD(pending, 1);
- status.owner = (int16_t)lcore_id;
+ status.index = tim->status.index;
+ status.owner = lcore_id;
rte_wmb();
tim->status.u32 = status.u32;
- __rte_timer_reset(tim, tim->expire + tim->period,
- tim->period, lcore_id, tim->f, tim->arg, 1);
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+
+ __rte_timer_reset(tim,
+ tim->expire + tim->period,
+ tim->period, lcore_id,
+ tim->f, tim->arg, local_is_locked);
+
+ if (local_is_locked) {
+ rte_spinlock_unlock(&list->lock);
+ local_is_locked = 0;
+ }
}
}
- priv_timer[lcore_id].running_tim = NULL;
+ priv_tim->running_tim = NULL;
}
/* dump statistics about timers */
diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h
index a276a73..5870b4a 100644
--- a/lib/librte_timer/rte_timer.h
+++ b/lib/librte_timer/rte_timer.h
@@ -77,7 +77,7 @@ extern "C" {
#define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */
#define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */
-#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */
+#define RTE_TIMER_NO_LCORE -2
/**
* Timer type: Periodic or single (one-shot).
@@ -94,10 +94,11 @@ enum rte_timer_type {
union rte_timer_status {
RTE_STD_C11
struct {
- uint16_t state; /**< Stop, pending, running, config. */
- int16_t owner; /**< The lcore that owns the timer. */
+ unsigned state : 2;
+ int index : 15;
+ int owner : 15;
};
- uint32_t u32; /**< To atomic-set status + owner. */
+ uint32_t u32; /**< To atomic-set status, index, owner */
};
#ifdef RTE_LIBRTE_TIMER_DEBUG
@@ -168,6 +169,26 @@ struct rte_timer
void rte_timer_subsystem_init(void);
/**
+ * Enable multiple pending lists for specified lcore ID.
+ *
+ * This function enables the use of per-installer pending lists for a
+ * target lcore. When timers are being installed from multiple
+ * lcores simultaneously, this option can increase performance by
+ * reducing contention for the same pending list.
+ *
+ * This function should be called after the timer subsystem has been
+ * initialized, and before any timers have been started on the desired
+ * lcore.
+ *
+ * @param lcore_id
+ * The lcore on which to enable multiple pending lists
+ * @return
+ * - 0: Success
+ * - (-1): Failure; pending timers already exist on the default list.
+ */
+int rte_timer_subsystem_set_multi_pendlists(unsigned int lcore_id);
+
+/**
* Initialize a timer handle.
*
* The rte_timer_init() function initializes the timer handle *tim*
diff --git a/lib/librte_timer/rte_timer_version.map b/lib/librte_timer/rte_timer_version.map
index 9b2e4b8..7d4cbe8 100644
--- a/lib/librte_timer/rte_timer_version.map
+++ b/lib/librte_timer/rte_timer_version.map
@@ -13,3 +13,9 @@ DPDK_2.0 {
local: *;
};
+
+DPDK_17.11 {
+ global:
+
+ rte_timer_subsystem_set_multi_pendlists;
+} DPDK_2.0;
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v3 2/3] timer: handle timers installed from non-EAL threads
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo
@ 2017-09-13 22:05 ` Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo
3 siblings, 0 replies; 30+ messages in thread
From: Erik Gabriel Carrillo @ 2017-09-13 22:05 UTC (permalink / raw)
To: rsanford; +Cc: dev, konstantin.ananyev, stephen, keith.wiles, narender.vangati
This commit adds support for timers being created from
non-EAL threads; it maps timers from all such threads to
lcore id RTE_MAX_LCORE, and puts them all in a corresponding
skiplist.
Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
---
V3:
* Rebased patch on reworked parent commit
v2:
* Address checkpatch warnings
lib/librte_timer/rte_timer.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 1f4141e..c489c5f 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -66,8 +66,8 @@ struct skiplist {
} __rte_cache_aligned;
struct priv_timer {
- /** one pending list per enabled lcore */
- struct skiplist pending_lists[RTE_MAX_LCORE];
+ /** one pending list per lcore, plus one for non-EAL threads */
+ struct skiplist pending_lists[RTE_MAX_LCORE + 1];
bool multi_pendlists;
/** per-core variable that true if a timer was updated on this
@@ -88,7 +88,7 @@ struct priv_timer {
static struct priv_timer priv_timer[RTE_MAX_LCORE];
/** cache of IDs of enabled lcores */
-static unsigned int enabled_lcores[RTE_MAX_LCORE];
+static unsigned int enabled_lcores[RTE_MAX_LCORE + 1];
static int n_enabled_lcores;
/* when debug is enabled, store some statistics */
@@ -114,12 +114,18 @@ rte_timer_subsystem_init(void)
RTE_LCORE_FOREACH(lcore_id)
enabled_lcores[n_enabled_lcores++] = lcore_id;
+ /* To handle timers coming from non-EAL threads */
+ enabled_lcores[n_enabled_lcores++] = RTE_MAX_LCORE;
+
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
for (i = 0; i < n_enabled_lcores; i++) {
lcore_id = enabled_lcores[i];
+ /* Don't use this value to index the priv_timer array */
+ if (lcore_id == RTE_MAX_LCORE)
+ continue;
priv_tim = &priv_timer[lcore_id];
priv_tim->prev_lcore = lcore_id;
@@ -343,7 +349,9 @@ timer_add(struct rte_timer *tim, unsigned int tim_lcore, int local_is_locked,
struct priv_timer *priv_tim = &priv_timer[tim_lcore];
if (priv_tim->multi_pendlists)
- *pending_lists_idx = lcore_id;
+ /* Check if timer being installed from non-EAL thread */
+ *pending_lists_idx = (lcore_id == LCORE_ID_ANY) ?
+ RTE_MAX_LCORE : lcore_id;
else
*pending_lists_idx = SINGLE_LIST_IDX;
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo
@ 2017-09-13 22:05 ` Erik Gabriel Carrillo
2017-09-18 16:19 ` Mcnamara, John
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo
3 siblings, 1 reply; 30+ messages in thread
From: Erik Gabriel Carrillo @ 2017-09-13 22:05 UTC (permalink / raw)
To: rsanford; +Cc: dev, konstantin.ananyev, stephen, keith.wiles, narender.vangati
This change updates the timer library documentation to
reflect a change to the organization of the skiplists
in the implementation.
Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
---
v3
* Updated implementation details section of timer_lib.rst to reflect the
addition of the option to use multiple pending timer lists per lcore.
* Updated release notes to reflect the addition of new function in timer
lib API.
doc/guides/prog_guide/timer_lib.rst | 27 +++++++++++++++++----------
doc/guides/rel_notes/release_17_11.rst | 7 +++++++
2 files changed, 24 insertions(+), 10 deletions(-)
diff --git a/doc/guides/prog_guide/timer_lib.rst b/doc/guides/prog_guide/timer_lib.rst
index f437417..dfabf24 100644
--- a/doc/guides/prog_guide/timer_lib.rst
+++ b/doc/guides/prog_guide/timer_lib.rst
@@ -1,5 +1,5 @@
.. BSD LICENSE
- Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -53,16 +53,19 @@ Refer to the `callout manual <http://www.daemon-systems.org/man/callout.9.html>`
Implementation Details
----------------------
-Timers are tracked on a per-lcore basis,
-with all pending timers for a core being maintained in order of timer expiry in a skiplist data structure.
-The skiplist used has ten levels and each entry in the table appears in each level with probability ¼^level.
+Timers are tracked on a per-lcore basis, with all pending timers for a core being maintained in order of timer
+expiry in either a single skiplist data structure or an array of skiplists, depending on whether
+the lcore has been configured for multiple pending lists. Multiple pending lists can be enabled when an
+application experiences contention for a single list for that lcore; skiplists corresponding to every other
+enabled lcore will be created.
+Each skiplist data structure has ten levels and each entry in the table appears in each level with probability ¼^level.
This means that all entries are present in level 0, 1 in every 4 entries is present at level 1,
one in every 16 at level 2 and so on up to level 9.
This means that adding and removing entries from the timer list for a core can be done in log(n) time,
up to 4^10 entries, that is, approximately 1,000,000 timers per lcore.
A timer structure contains a special field called status,
-which is a union of a timer state (stopped, pending, running, config) and an owner (lcore id).
+which is a union of a timer state (stopped, pending, running, config), an installer (lcore id), and an owner (lcore id).
Depending on the timer state, we know if a timer is present in a list or not:
* STOPPED: no owner, not in a list
@@ -77,17 +80,21 @@ Resetting or stopping a timer while it is in a CONFIG or RUNNING state is not al
When modifying the state of a timer,
a Compare And Swap instruction should be used to guarantee that the status (state+owner) is modified atomically.
-Inside the rte_timer_manage() function,
-the skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries,
-until an entry which has not yet expired has been encountered.
-To improve performance in the case where there are entries in the timer list but none of those timers have yet expired,
+Inside the rte_timer_manage() function, the timer lists are processed.
+If multiple pending lists have been enabled for an lcore, then each skiplist will
+be traversed sequentially, and run lists will be broken out and then processed.
+If multiple pending lists are not enabled for an lcore, then only a single skiplist will be traversed.
+A skiplist is used as a regular list by iterating along the level
+0 list, which contains all timer entries, until an entry which has not yet expired has been encountered.
+To improve performance in the case where there are entries in a skiplist but none of those timers have yet expired,
the expiry time of the first list entry is maintained within the per-core timer list structure itself.
On 64-bit platforms, this value can be checked without the need to take a lock on the overall structure.
(Since expiry times are maintained as 64-bit values,
a check on the value cannot be done on 32-bit platforms without using either a compare-and-swap (CAS) instruction or using a lock,
so this additional check is skipped in favor of checking as normal once the lock has been taken.)
On both 64-bit and 32-bit platforms,
-a call to rte_timer_manage() returns without taking a lock in the case where the timer list for the calling core is empty.
+rte_timer_manage() can either return or continue on to an lcore's next skiplist without taking a lock in the case where a timer list is empty,
+depending on whether or not the lcore has multiple pending lists.
Use Cases
---------
diff --git a/doc/guides/rel_notes/release_17_11.rst b/doc/guides/rel_notes/release_17_11.rst
index 170f4f9..4683cbe 100644
--- a/doc/guides/rel_notes/release_17_11.rst
+++ b/doc/guides/rel_notes/release_17_11.rst
@@ -110,6 +110,13 @@ API Changes
Also, make sure to start the actual text at the margin.
=========================================================
+* **Updated timer library.**
+
+ The timer library has been updated; it can now support multiple timer lists
+ per lcore where it previously only had one. This functionality is off by
+ default but can be enabled in cases where contention for a single list is
+ an issue with the new function ``rte_timer_subsystem_set_multi_pendlists()``.
+
ABI Changes
-----------
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs Erik Gabriel Carrillo
@ 2017-09-18 16:19 ` Mcnamara, John
2017-09-19 17:04 ` Carrillo, Erik G
0 siblings, 1 reply; 30+ messages in thread
From: Mcnamara, John @ 2017-09-18 16:19 UTC (permalink / raw)
To: Carrillo, Erik G, rsanford
Cc: dev, Ananyev, Konstantin, stephen, Wiles, Keith, Vangati, Narender
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Erik Gabriel Carrillo
> Sent: Wednesday, September 13, 2017 11:05 PM
> To: rsanford@akamai.com
> Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> stephen@networkplumber.org; Wiles, Keith <keith.wiles@intel.com>; Vangati,
> Narender <narender.vangati@intel.com>
> Subject: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs
>
> This change updates the timer library documentation to reflect a change to
> the organization of the skiplists in the implementation.
>
> Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
> +Each skiplist data structure has ten levels and each entry in the table
> appears in each level with probability ¼^level.
Probably best not to use that Unicode, or otherwise, 1/4 character since
it can mess up the PDF output. Use "0.25^level" instead.
Apart from that:
Acked-by: John McNamara <john.mcnamara@intel.com>
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v4 0/3] timer library enhancement
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo
` (2 preceding siblings ...)
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs Erik Gabriel Carrillo
@ 2017-09-19 17:02 ` Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo
` (3 more replies)
3 siblings, 4 replies; 30+ messages in thread
From: Erik Gabriel Carrillo @ 2017-09-19 17:02 UTC (permalink / raw)
To: rsanford; +Cc: dev, john.mcnamara
In the current implementation of the DPDK timer library, timers can be
created and set to be handled by a target lcore by adding it to a
skiplist that corresponds to that lcore. However, if an application
enables multiple lcores, and each of these lcores repeatedly attempts
to install timers on the same target lcore, overall application
throughput will be reduced as all lcores contend to acquire the lock
guarding the single skiplist of pending timers.
This patchset addresses this scenario by adding an option to enable an
array of skiplists in each lcore's priv_timer struct. Then, when lcore i
installs a timer on lcore k, the timer will be added to the ith skiplist
for lcore k. If lcore j installs a timer on lcore k simultaneously,
lcores i and j can both proceed since they will be acquiring different
locks for different lists. This functionality is off by default, and
can be enabled via a new function.
When lcore k processes its pending timers, if the multiple pending list
option is enabled, it will traverse skiplists in its array and acquire
the current skiplist's lock while a run list is broken out; meanwhile,
all other lists can continue to be modified. Then, all run lists for
lcore k are collected and traversed together so timers are executed in
their relative order. If the multiple pending list option is not enabled
(the default), only a single list will be traversed, as before.
Erik Gabriel Carrillo (3):
timer: add multiple pending lists option for each lcore
timer: handle timers installed from non-EAL threads
doc: update timer lib docs
doc/guides/prog_guide/timer_lib.rst | 31 +--
doc/guides/rel_notes/release_17_11.rst | 7 +
lib/librte_timer/rte_timer.c | 384 ++++++++++++++++++++++-----------
lib/librte_timer/rte_timer.h | 29 ++-
lib/librte_timer/rte_timer_version.map | 6 +
5 files changed, 321 insertions(+), 136 deletions(-)
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo
@ 2017-09-19 17:02 ` Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo
` (2 subsequent siblings)
3 siblings, 0 replies; 30+ messages in thread
From: Erik Gabriel Carrillo @ 2017-09-19 17:02 UTC (permalink / raw)
To: rsanford; +Cc: dev, john.mcnamara, Erik Carrillo
From: Erik Carrillo <erik.g.carrillo@intel.com>
This commit adds support for enabling multiple pending timer lists in
each lcore's priv_timer struct with a new function; a single list is still
used by default. In the case that multiple lcores repeatedly install
timers on the same target lcore, this option reduces lock contention for
the target lcore's skiplists and increases performance.
Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
---
v3:
* Added ability to enable multiple pending lists for a specified lcore
with a new function, in response to feedback from Keith Wiles, Konstantin
Ananyev, and Stephen Hemminger. This functionality is off by default,
so that new overhead in rte_timer_manage() is minimized if the
application doesn't want this feature.
* Updated map file to reflect addition of new function.
* Made minor updates to some variable names and logic structure.
v2:
* Address checkpatch warnings
lib/librte_timer/rte_timer.c | 376 ++++++++++++++++++++++-----------
lib/librte_timer/rte_timer.h | 29 ++-
lib/librte_timer/rte_timer_version.map | 6 +
3 files changed, 287 insertions(+), 124 deletions(-)
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 5ee0840..1f4141e 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -37,6 +37,7 @@
#include <inttypes.h>
#include <assert.h>
#include <sys/queue.h>
+#include <stdbool.h>
#include <rte_atomic.h>
#include <rte_common.h>
@@ -54,19 +55,24 @@
#include "rte_timer.h"
+#define SINGLE_LIST_IDX 0
+
LIST_HEAD(rte_timer_list, rte_timer);
+struct skiplist {
+ struct rte_timer head; /**< dummy timer instance to head up list */
+ rte_spinlock_t lock; /**< lock to protect list access */
+ unsigned int depth; /**< track the current depth of the skiplist */
+} __rte_cache_aligned;
+
struct priv_timer {
- struct rte_timer pending_head; /**< dummy timer instance to head up list */
- rte_spinlock_t list_lock; /**< lock to protect list access */
+ /** one pending list per enabled lcore */
+ struct skiplist pending_lists[RTE_MAX_LCORE];
+ bool multi_pendlists;
/** per-core variable that true if a timer was updated on this
* core since last reset of the variable */
int updated;
-
- /** track the current depth of the skiplist */
- unsigned curr_skiplist_depth;
-
unsigned prev_lcore; /**< used for lcore round robin */
/** running timer on this lcore now */
@@ -81,6 +87,10 @@ struct priv_timer {
/** per-lcore private info for timers */
static struct priv_timer priv_timer[RTE_MAX_LCORE];
+/** cache of IDs of enabled lcores */
+static unsigned int enabled_lcores[RTE_MAX_LCORE];
+static int n_enabled_lcores;
+
/* when debug is enabled, store some statistics */
#ifdef RTE_LIBRTE_TIMER_DEBUG
#define __TIMER_STAT_ADD(name, n) do { \
@@ -96,17 +106,60 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE];
void
rte_timer_subsystem_init(void)
{
- unsigned lcore_id;
+ unsigned int lcore_id, pending_lists_idx;
+ struct skiplist *list;
+ struct priv_timer *priv_tim;
+ int i, j;
+
+ RTE_LCORE_FOREACH(lcore_id)
+ enabled_lcores[n_enabled_lcores++] = lcore_id;
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
- for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
- rte_spinlock_init(&priv_timer[lcore_id].list_lock);
- priv_timer[lcore_id].prev_lcore = lcore_id;
+ for (i = 0; i < n_enabled_lcores; i++) {
+ lcore_id = enabled_lcores[i];
+
+
+ priv_tim = &priv_timer[lcore_id];
+ priv_tim->prev_lcore = lcore_id;
+ priv_tim->multi_pendlists = false;
+
+ for (j = 0; j < n_enabled_lcores; j++) {
+ pending_lists_idx = enabled_lcores[j];
+ list = &priv_tim->pending_lists[pending_lists_idx];
+ rte_spinlock_init(&list->lock);
+ }
+
+ /* Init the list at SINGLE_LIST_IDX, in case we didn't get to
+ * it above. This will be the skiplist we use unless the
+ * multi-pendlists option is later enabled for this lcore.
+ */
+ list = &priv_tim->pending_lists[SINGLE_LIST_IDX];
+ rte_spinlock_init(&list->lock);
}
}
+int
+rte_timer_subsystem_set_multi_pendlists(unsigned int lcore_id)
+{
+ struct priv_timer *priv_tim = &priv_timer[lcore_id];
+ struct skiplist *list = &priv_tim->pending_lists[SINGLE_LIST_IDX];
+
+ rte_spinlock_lock(&list->lock);
+
+ if (list->head.sl_next[0] != NULL) {
+ /* Timers are already on the default pending list. */
+ rte_spinlock_unlock(&list->lock);
+ return -1;
+ }
+
+ priv_tim->multi_pendlists = true;
+ rte_spinlock_unlock(&list->lock);
+
+ return 0;
+}
+
/* Initialize the timer handle tim for use */
void
rte_timer_init(struct rte_timer *tim)
@@ -114,7 +167,8 @@ rte_timer_init(struct rte_timer *tim)
union rte_timer_status status;
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.index = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
tim->status.u32 = status.u32;
}
@@ -142,7 +196,7 @@ timer_set_config_state(struct rte_timer *tim,
* or ready to run on local core, exit
*/
if (prev_status.state == RTE_TIMER_RUNNING &&
- (prev_status.owner != (uint16_t)lcore_id ||
+ (prev_status.owner != (int)lcore_id ||
tim != priv_timer[lcore_id].running_tim))
return -1;
@@ -153,7 +207,8 @@ timer_set_config_state(struct rte_timer *tim,
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_CONFIG;
- status.owner = (int16_t)lcore_id;
+ status.index = RTE_TIMER_NO_LCORE;
+ status.owner = lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
@@ -185,7 +240,8 @@ timer_set_running_state(struct rte_timer *tim)
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_RUNNING;
- status.owner = (int16_t)lcore_id;
+ status.index = prev_status.index;
+ status.owner = lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
@@ -236,11 +292,11 @@ timer_get_skiplist_level(unsigned curr_depth)
* are <= that time value.
*/
static void
-timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
+timer_get_prev_entries(uint64_t time_val, struct skiplist *list,
struct rte_timer **prev)
{
- unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
- prev[lvl] = &priv_timer[tim_lcore].pending_head;
+ unsigned int lvl = list->depth;
+ prev[lvl] = &list->head;
while(lvl != 0) {
lvl--;
prev[lvl] = prev[lvl+1];
@@ -255,15 +311,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
* all skiplist levels.
*/
static void
-timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
+timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list,
struct rte_timer **prev)
{
int i;
/* to get a specific entry in the list, look for just lower than the time
* values, and then increment on each level individually if necessary
*/
- timer_get_prev_entries(tim->expire - 1, tim_lcore, prev);
- for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
+ timer_get_prev_entries(tim->expire - 1, list, prev);
+ for (i = list->depth - 1; i >= 0; i--) {
while (prev[i]->sl_next[i] != NULL &&
prev[i]->sl_next[i] != tim &&
prev[i]->sl_next[i]->expire <= tim->expire)
@@ -277,30 +333,39 @@ timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
* timer must not be in a list
*/
static void
-timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
+timer_add(struct rte_timer *tim, unsigned int tim_lcore, int local_is_locked,
+ int *pending_lists_idx)
{
unsigned lcore_id = rte_lcore_id();
unsigned lvl;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+ struct skiplist *list;
+ struct priv_timer *priv_tim = &priv_timer[tim_lcore];
+
+ if (priv_tim->multi_pendlists)
+ *pending_lists_idx = lcore_id;
+ else
+ *pending_lists_idx = SINGLE_LIST_IDX;
+
+ list = &priv_tim->pending_lists[*pending_lists_idx];
/* if timer needs to be scheduled on another core, we need to
* lock the list; if it is on local core, we need to lock if
* we are not called from rte_timer_manage() */
if (tim_lcore != lcore_id || !local_is_locked)
- rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
+ rte_spinlock_lock(&list->lock);
/* find where exactly this element goes in the list of elements
* for each depth. */
- timer_get_prev_entries(tim->expire, tim_lcore, prev);
+ timer_get_prev_entries(tim->expire, list, prev);
/* now assign it a new level and add at that level */
- const unsigned tim_level = timer_get_skiplist_level(
- priv_timer[tim_lcore].curr_skiplist_depth);
- if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
- priv_timer[tim_lcore].curr_skiplist_depth++;
+ const unsigned int tim_level = timer_get_skiplist_level(list->depth);
+ if (tim_level == list->depth)
+ list->depth++;
lvl = tim_level;
- while (lvl > 0) {
+ while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) {
tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
prev[lvl]->sl_next[lvl] = tim;
lvl--;
@@ -310,11 +375,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
/* save the lowest list entry into the expire field of the dummy hdr
* NOTE: this is not atomic on 32-bit*/
- priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\
- pending_head.sl_next[0]->expire;
+ list->head.expire = list->head.sl_next[0]->expire;
if (tim_lcore != lcore_id || !local_is_locked)
- rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
+ rte_spinlock_unlock(&list->lock);
}
/*
@@ -330,35 +394,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
unsigned prev_owner = prev_status.owner;
int i;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+ struct skiplist *list;
+
+ list = &priv_timer[prev_owner].pending_lists[prev_status.index];
/* if timer needs is pending another core, we need to lock the
* list; if it is on local core, we need to lock if we are not
* called from rte_timer_manage() */
if (prev_owner != lcore_id || !local_is_locked)
- rte_spinlock_lock(&priv_timer[prev_owner].list_lock);
+ rte_spinlock_lock(&list->lock);
/* save the lowest list entry into the expire field of the dummy hdr.
* NOTE: this is not atomic on 32-bit */
- if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
- priv_timer[prev_owner].pending_head.expire =
- ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);
+ if (tim == list->head.sl_next[0])
+ list->head.expire = ((tim->sl_next[0] == NULL) ?
+ 0 : tim->sl_next[0]->expire);
/* adjust pointers from previous entries to point past this */
- timer_get_prev_entries_for_node(tim, prev_owner, prev);
- for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
+ timer_get_prev_entries_for_node(tim, list, prev);
+ for (i = list->depth - 1; i >= 0; i--) {
if (prev[i]->sl_next[i] == tim)
prev[i]->sl_next[i] = tim->sl_next[i];
}
/* in case we deleted last entry at a level, adjust down max level */
- for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
- if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
- priv_timer[prev_owner].curr_skiplist_depth --;
+ for (i = list->depth - 1; i >= 0; i--)
+ if (list->head.sl_next[i] == NULL)
+ list->depth--;
else
break;
if (prev_owner != lcore_id || !local_is_locked)
- rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
+ rte_spinlock_unlock(&list->lock);
}
/* Reset and start the timer associated with the timer handle (private func) */
@@ -371,15 +438,16 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
union rte_timer_status prev_status, status;
int ret;
unsigned lcore_id = rte_lcore_id();
+ struct priv_timer *priv_tim = &priv_timer[lcore_id];
+ int pending_lists_index;
/* round robin for tim_lcore */
if (tim_lcore == (unsigned)LCORE_ID_ANY) {
if (lcore_id < RTE_MAX_LCORE) {
/* EAL thread with valid lcore_id */
- tim_lcore = rte_get_next_lcore(
- priv_timer[lcore_id].prev_lcore,
- 0, 1);
- priv_timer[lcore_id].prev_lcore = tim_lcore;
+ tim_lcore = rte_get_next_lcore(priv_tim->prev_lcore, 0,
+ 1);
+ priv_tim->prev_lcore = tim_lcore;
} else
/* non-EAL thread do not run rte_timer_manage(),
* so schedule the timer on the first enabled lcore. */
@@ -395,7 +463,7 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
__TIMER_STAT_ADD(reset, 1);
if (prev_status.state == RTE_TIMER_RUNNING &&
lcore_id < RTE_MAX_LCORE) {
- priv_timer[lcore_id].updated = 1;
+ priv_tim->updated = 1;
}
/* remove it from list */
@@ -410,13 +478,14 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
tim->arg = arg;
__TIMER_STAT_ADD(pending, 1);
- timer_add(tim, tim_lcore, local_is_locked);
+ timer_add(tim, tim_lcore, local_is_locked, &pending_lists_index);
- /* update state: as we are in CONFIG state, only us can modify
+ /* update state: as we are in CONFIG state, only we can modify
* the state so we don't need to use cmpset() here */
rte_wmb();
status.state = RTE_TIMER_PENDING;
- status.owner = (int16_t)tim_lcore;
+ status.index = pending_lists_index;
+ status.owner = tim_lcore;
tim->status.u32 = status.u32;
return 0;
@@ -484,7 +553,8 @@ rte_timer_stop(struct rte_timer *tim)
/* mark timer as stopped */
rte_wmb();
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.index = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
tim->status.u32 = status.u32;
return 0;
@@ -506,119 +576,185 @@ rte_timer_pending(struct rte_timer *tim)
}
/* must be called periodically, run all timer that expired */
-void rte_timer_manage(void)
+void
+rte_timer_manage(void)
{
union rte_timer_status status;
- struct rte_timer *tim, *next_tim;
- struct rte_timer *run_first_tim, **pprev;
- unsigned lcore_id = rte_lcore_id();
+ struct rte_timer *tim, *next_tim, **pprev;
+ struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1];
struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
- uint64_t cur_time;
- int i, ret;
+ struct priv_timer *priv_tim;
+ unsigned int lcore_id = rte_lcore_id();
+ uint64_t cur_time, min_expire;
+ int i, j, min_idx, ret;
+ int npendlists, nrunlists = 0;
+ int local_is_locked = 0;
+ struct skiplist *dest_list, *list = NULL;
+ bool done;
+ int idx;
/* timer manager only runs on EAL thread with valid lcore_id */
assert(lcore_id < RTE_MAX_LCORE);
+ priv_tim = &priv_timer[lcore_id];
+
__TIMER_STAT_ADD(manage, 1);
- /* optimize for the case where per-cpu list is empty */
- if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
- return;
- cur_time = rte_get_timer_cycles();
+ npendlists = (priv_tim->multi_pendlists) ? n_enabled_lcores : 1;
+
+ for (i = 0; i < npendlists; i++) {
+ idx = (priv_tim->multi_pendlists) ? enabled_lcores[i] :
+ SINGLE_LIST_IDX;
+ list = &priv_tim->pending_lists[idx];
+
+ /* optimize for the case where list is empty */
+ if (list->head.sl_next[0] == NULL)
+ continue;
+ cur_time = rte_get_timer_cycles();
#ifdef RTE_ARCH_X86_64
- /* on 64-bit the value cached in the pending_head.expired will be
- * updated atomically, so we can consult that for a quick check here
- * outside the lock */
- if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
- return;
+ /* on 64-bit the value cached in the list->head.expired will be
+ * updated atomically, so we can consult that for a quick check
+ * here outside the lock
+ */
+ if (likely(list->head.expire > cur_time))
+ continue;
#endif
- /* browse ordered list, add expired timers in 'expired' list */
- rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
+ /* browse ordered list, add expired timers in 'expired' list */
+ rte_spinlock_lock(&list->lock);
- /* if nothing to do just unlock and return */
- if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
- priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
- return;
- }
+ /* if nothing to do just unlock and continue */
+ if (list->head.sl_next[0] == NULL ||
+ list->head.sl_next[0]->expire > cur_time) {
+ rte_spinlock_unlock(&list->lock);
+ continue;
+ }
- /* save start of list of expired timers */
- tim = priv_timer[lcore_id].pending_head.sl_next[0];
+ /* save start of list of expired timers */
+ tim = list->head.sl_next[0];
+
+ /* break the existing list at current time point */
+ timer_get_prev_entries(cur_time, list, prev);
+ for (j = list->depth - 1; j >= 0; j--) {
+ if (prev[j] == &list->head)
+ continue;
+ list->head.sl_next[j] =
+ prev[j]->sl_next[j];
+ if (prev[j]->sl_next[j] == NULL)
+ list->depth--;
+ prev[j]->sl_next[j] = NULL;
+ }
- /* break the existing list at current time point */
- timer_get_prev_entries(cur_time, lcore_id, prev);
- for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) {
- if (prev[i] == &priv_timer[lcore_id].pending_head)
- continue;
- priv_timer[lcore_id].pending_head.sl_next[i] =
- prev[i]->sl_next[i];
- if (prev[i]->sl_next[i] == NULL)
- priv_timer[lcore_id].curr_skiplist_depth--;
- prev[i] ->sl_next[i] = NULL;
- }
+ /* transition run-list from PENDING to RUNNING */
+ run_first_tims[nrunlists] = tim;
+ pprev = &run_first_tims[nrunlists];
+ nrunlists++;
+
+ for (; tim != NULL; tim = next_tim) {
+ next_tim = tim->sl_next[0];
+
+ ret = timer_set_running_state(tim);
+ if (likely(ret == 0)) {
+ pprev = &tim->sl_next[0];
+ } else {
+ /* another core is trying to re-config this one,
+ * remove it from local expired list
+ */
+ *pprev = next_tim;
+ }
+ }
- /* transition run-list from PENDING to RUNNING */
- run_first_tim = tim;
- pprev = &run_first_tim;
+ /* update the next to expire timer value */
+ list->head.expire = (list->head.sl_next[0] == NULL) ?
+ 0 : list->head.sl_next[0]->expire;
- for ( ; tim != NULL; tim = next_tim) {
- next_tim = tim->sl_next[0];
+ rte_spinlock_unlock(&list->lock);
+ }
- ret = timer_set_running_state(tim);
- if (likely(ret == 0)) {
- pprev = &tim->sl_next[0];
- } else {
- /* another core is trying to re-config this one,
- * remove it from local expired list
- */
- *pprev = next_tim;
+ /* Now process the run lists */
+ while (1) {
+ done = true;
+ min_expire = UINT64_MAX;
+ min_idx = 0;
+
+ /* Find the next oldest timer to process */
+ for (i = 0; i < nrunlists; i++) {
+ tim = run_first_tims[i];
+
+ if (tim != NULL && tim->expire < min_expire) {
+ min_expire = tim->expire;
+ min_idx = i;
+ done = false;
+ }
}
- }
- /* update the next to expire timer value */
- priv_timer[lcore_id].pending_head.expire =
- (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
- priv_timer[lcore_id].pending_head.sl_next[0]->expire;
+ if (done)
+ break;
+
+ tim = run_first_tims[min_idx];
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+ /* Move down the runlist from which we picked a timer to
+ * execute
+ */
+ run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0];
- /* now scan expired list and call callbacks */
- for (tim = run_first_tim; tim != NULL; tim = next_tim) {
- next_tim = tim->sl_next[0];
- priv_timer[lcore_id].updated = 0;
- priv_timer[lcore_id].running_tim = tim;
+ priv_tim->updated = 0;
+ priv_tim->running_tim = tim;
/* execute callback function with list unlocked */
tim->f(tim, tim->arg);
__TIMER_STAT_ADD(pending, -1);
/* the timer was stopped or reloaded by the callback
- * function, we have nothing to do here */
- if (priv_timer[lcore_id].updated == 1)
+ * function, we have nothing to do here
+ */
+ if (priv_tim->updated == 1)
continue;
if (tim->period == 0) {
/* remove from done list and mark timer as stopped */
status.state = RTE_TIMER_STOP;
- status.owner = RTE_TIMER_NO_OWNER;
+ status.index = RTE_TIMER_NO_LCORE;
+ status.owner = RTE_TIMER_NO_LCORE;
rte_wmb();
tim->status.u32 = status.u32;
}
else {
- /* keep it in list and mark timer as pending */
- rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
+ idx = (priv_tim->multi_pendlists) ? lcore_id :
+ SINGLE_LIST_IDX;
+ dest_list = &priv_tim->pending_lists[idx];
+
+ /* If the destination list is the current list
+ * we can acquire the lock here, and hold it
+ * across removal and insertion of the timer.
+ */
+ if (list == dest_list) {
+ rte_spinlock_lock(&list->lock);
+ local_is_locked = 1;
+ }
+
+ /* set timer state back to PENDING and
+ * reinsert it in pending list
+ */
status.state = RTE_TIMER_PENDING;
__TIMER_STAT_ADD(pending, 1);
- status.owner = (int16_t)lcore_id;
+ status.index = tim->status.index;
+ status.owner = lcore_id;
rte_wmb();
tim->status.u32 = status.u32;
- __rte_timer_reset(tim, tim->expire + tim->period,
- tim->period, lcore_id, tim->f, tim->arg, 1);
- rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+
+ __rte_timer_reset(tim,
+ tim->expire + tim->period,
+ tim->period, lcore_id,
+ tim->f, tim->arg, local_is_locked);
+
+ if (local_is_locked) {
+ rte_spinlock_unlock(&list->lock);
+ local_is_locked = 0;
+ }
}
}
- priv_timer[lcore_id].running_tim = NULL;
+ priv_tim->running_tim = NULL;
}
/* dump statistics about timers */
diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h
index a276a73..5870b4a 100644
--- a/lib/librte_timer/rte_timer.h
+++ b/lib/librte_timer/rte_timer.h
@@ -77,7 +77,7 @@ extern "C" {
#define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */
#define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */
-#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */
+#define RTE_TIMER_NO_LCORE -2
/**
* Timer type: Periodic or single (one-shot).
@@ -94,10 +94,11 @@ enum rte_timer_type {
union rte_timer_status {
RTE_STD_C11
struct {
- uint16_t state; /**< Stop, pending, running, config. */
- int16_t owner; /**< The lcore that owns the timer. */
+ unsigned state : 2;
+ int index : 15;
+ int owner : 15;
};
- uint32_t u32; /**< To atomic-set status + owner. */
+ uint32_t u32; /**< To atomic-set status, index, owner */
};
#ifdef RTE_LIBRTE_TIMER_DEBUG
@@ -168,6 +169,26 @@ struct rte_timer
void rte_timer_subsystem_init(void);
/**
+ * Enable multiple pending lists for specified lcore ID.
+ *
+ * This function enables the use of per-installer pending lists for a
+ * target lcore. When timers are being installed from multiple
+ * lcores simultaneously, this option can increase performance by
+ * reducing contention for the same pending list.
+ *
+ * This function should be called after the timer subsystem has been
+ * initialized, and before any timers have been started on the desired
+ * lcore.
+ *
+ * @param lcore_id
+ * The lcore on which to enable multiple pending lists
+ * @return
+ * - 0: Success
+ * - (-1): Failure; pending timers already exist on the default list.
+ */
+int rte_timer_subsystem_set_multi_pendlists(unsigned int lcore_id);
+
+/**
* Initialize a timer handle.
*
* The rte_timer_init() function initializes the timer handle *tim*
diff --git a/lib/librte_timer/rte_timer_version.map b/lib/librte_timer/rte_timer_version.map
index 9b2e4b8..7d4cbe8 100644
--- a/lib/librte_timer/rte_timer_version.map
+++ b/lib/librte_timer/rte_timer_version.map
@@ -13,3 +13,9 @@ DPDK_2.0 {
local: *;
};
+
+DPDK_17.11 {
+ global:
+
+ rte_timer_subsystem_set_multi_pendlists;
+} DPDK_2.0;
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v4 2/3] timer: handle timers installed from non-EAL threads
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo
@ 2017-09-19 17:02 ` Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 3/3] doc: update timer lib docs Erik Gabriel Carrillo
2017-10-11 20:42 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Thomas Monjalon
3 siblings, 0 replies; 30+ messages in thread
From: Erik Gabriel Carrillo @ 2017-09-19 17:02 UTC (permalink / raw)
To: rsanford; +Cc: dev, john.mcnamara, Erik Carrillo
From: Erik Carrillo <erik.g.carrillo@intel.com>
This commit adds support for timers being created from
non-EAL threads; it maps timers from all such threads to
lcore id RTE_MAX_LCORE, and puts them all in a corresponding
skiplist.
Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
---
v3:
* Rebased patch on reworked parent commit
v2:
* Address checkpatch warnings
lib/librte_timer/rte_timer.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 1f4141e..c489c5f 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -66,8 +66,8 @@ struct skiplist {
} __rte_cache_aligned;
struct priv_timer {
- /** one pending list per enabled lcore */
- struct skiplist pending_lists[RTE_MAX_LCORE];
+ /** one pending list per lcore, plus one for non-EAL threads */
+ struct skiplist pending_lists[RTE_MAX_LCORE + 1];
bool multi_pendlists;
/** per-core variable that true if a timer was updated on this
@@ -88,7 +88,7 @@ struct priv_timer {
static struct priv_timer priv_timer[RTE_MAX_LCORE];
/** cache of IDs of enabled lcores */
-static unsigned int enabled_lcores[RTE_MAX_LCORE];
+static unsigned int enabled_lcores[RTE_MAX_LCORE + 1];
static int n_enabled_lcores;
/* when debug is enabled, store some statistics */
@@ -114,12 +114,18 @@ rte_timer_subsystem_init(void)
RTE_LCORE_FOREACH(lcore_id)
enabled_lcores[n_enabled_lcores++] = lcore_id;
+ /* To handle timers coming from non-EAL threads */
+ enabled_lcores[n_enabled_lcores++] = RTE_MAX_LCORE;
+
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
for (i = 0; i < n_enabled_lcores; i++) {
lcore_id = enabled_lcores[i];
+ /* Don't use this value to index the priv_timer array */
+ if (lcore_id == RTE_MAX_LCORE)
+ continue;
priv_tim = &priv_timer[lcore_id];
priv_tim->prev_lcore = lcore_id;
@@ -343,7 +349,9 @@ timer_add(struct rte_timer *tim, unsigned int tim_lcore, int local_is_locked,
struct priv_timer *priv_tim = &priv_timer[tim_lcore];
if (priv_tim->multi_pendlists)
- *pending_lists_idx = lcore_id;
+ /* Check if timer being installed from non-EAL thread */
+ *pending_lists_idx = (lcore_id == LCORE_ID_ANY) ?
+ RTE_MAX_LCORE : lcore_id;
else
*pending_lists_idx = SINGLE_LIST_IDX;
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v4 3/3] doc: update timer lib docs
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo
@ 2017-09-19 17:02 ` Erik Gabriel Carrillo
2017-10-11 20:42 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Thomas Monjalon
3 siblings, 0 replies; 30+ messages in thread
From: Erik Gabriel Carrillo @ 2017-09-19 17:02 UTC (permalink / raw)
To: rsanford; +Cc: dev, john.mcnamara, Erik Carrillo
From: Erik Carrillo <erik.g.carrillo@intel.com>
This change updates the timer library documentation to
reflect a change to the organization of the skiplists
in the implementation.
Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
Acked-by: John McNamara <john.mcnamara@intel.com>
---
v4:
* Made changes suggested by John Mcnamara[1].
[1] http://dpdk.org/ml/archives/dev/2017-September/075819.html
doc/guides/prog_guide/timer_lib.rst | 31 +++++++++++++++++++------------
doc/guides/rel_notes/release_17_11.rst | 7 +++++++
2 files changed, 26 insertions(+), 12 deletions(-)
diff --git a/doc/guides/prog_guide/timer_lib.rst b/doc/guides/prog_guide/timer_lib.rst
index f437417..e1f64ac 100644
--- a/doc/guides/prog_guide/timer_lib.rst
+++ b/doc/guides/prog_guide/timer_lib.rst
@@ -1,5 +1,5 @@
.. BSD LICENSE
- Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -53,16 +53,19 @@ Refer to the `callout manual <http://www.daemon-systems.org/man/callout.9.html>`
Implementation Details
----------------------
-Timers are tracked on a per-lcore basis,
-with all pending timers for a core being maintained in order of timer expiry in a skiplist data structure.
-The skiplist used has ten levels and each entry in the table appears in each level with probability ¼^level.
+Timers are tracked on a per-lcore basis, with all pending timers for a core being maintained in order of timer
+expiry in either a single skiplist data structure or an array of skiplists, depending on whether
+the lcore has been configured for multiple pending lists. Multiple pending lists can be enabled when an
+application experiences contention for a single list for that lcore; skiplists corresponding to every other
+enabled lcore will be created.
+Each skiplist data structure has ten levels and each entry in the table appears in each level with probability 0.25^level.
This means that all entries are present in level 0, 1 in every 4 entries is present at level 1,
one in every 16 at level 2 and so on up to level 9.
This means that adding and removing entries from the timer list for a core can be done in log(n) time,
up to 4^10 entries, that is, approximately 1,000,000 timers per lcore.
A timer structure contains a special field called status,
-which is a union of a timer state (stopped, pending, running, config) and an owner (lcore id).
+which is a union of a timer state (stopped, pending, running, config), an index (lcore id), and an owner (lcore id).
Depending on the timer state, we know if a timer is present in a list or not:
* STOPPED: no owner, not in a list
@@ -75,19 +78,23 @@ Depending on the timer state, we know if a timer is present in a list or not:
Resetting or stopping a timer while it is in a CONFIG or RUNNING state is not allowed.
When modifying the state of a timer,
-a Compare And Swap instruction should be used to guarantee that the status (state+owner) is modified atomically.
-
-Inside the rte_timer_manage() function,
-the skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries,
-until an entry which has not yet expired has been encountered.
-To improve performance in the case where there are entries in the timer list but none of those timers have yet expired,
+a Compare And Swap instruction should be used to guarantee that the status (state+index+owner) is modified atomically.
+
+Inside the rte_timer_manage() function, the timer lists are processed.
+If multiple pending lists have been enabled for an lcore, then each skiplist will
+be traversed sequentially, and run lists will be broken out and then processed.
+If multiple pending lists are not enabled for an lcore, then only a single skiplist will be traversed.
+A skiplist is used as a regular list by iterating along the level
+0 list, which contains all timer entries, until an entry which has not yet expired has been encountered.
+To improve performance in the case where there are entries in a skiplist but none of those timers have yet expired,
the expiry time of the first list entry is maintained within the per-core timer list structure itself.
On 64-bit platforms, this value can be checked without the need to take a lock on the overall structure.
(Since expiry times are maintained as 64-bit values,
a check on the value cannot be done on 32-bit platforms without using either a compare-and-swap (CAS) instruction or using a lock,
so this additional check is skipped in favor of checking as normal once the lock has been taken.)
On both 64-bit and 32-bit platforms,
-a call to rte_timer_manage() returns without taking a lock in the case where the timer list for the calling core is empty.
+rte_timer_manage() can either return or continue on to an lcore's next skiplist without taking a lock in the case where a timer list is empty,
+depending on whether or not the lcore has multiple pending lists.
Use Cases
---------
diff --git a/doc/guides/rel_notes/release_17_11.rst b/doc/guides/rel_notes/release_17_11.rst
index 170f4f9..4683cbe 100644
--- a/doc/guides/rel_notes/release_17_11.rst
+++ b/doc/guides/rel_notes/release_17_11.rst
@@ -110,6 +110,13 @@ API Changes
Also, make sure to start the actual text at the margin.
=========================================================
+* **Updated timer library.**
+
+ The timer library has been updated; it can now support multiple timer lists
+ per lcore where it previously only had one. This functionality is off by
+ default but can be enabled in cases where contention for a single list is
+ an issue with the new function ``rte_timer_subsystem_set_multi_pendlists()``.
+
ABI Changes
-----------
--
2.6.4
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs
2017-09-18 16:19 ` Mcnamara, John
@ 2017-09-19 17:04 ` Carrillo, Erik G
0 siblings, 0 replies; 30+ messages in thread
From: Carrillo, Erik G @ 2017-09-19 17:04 UTC (permalink / raw)
To: Mcnamara, John, rsanford
Cc: dev, Ananyev, Konstantin, stephen, Wiles, Keith, Vangati, Narender
Thanks for the review, John. I've submitted the change you suggest in a new patch series.
Regards,
Gabriel
> -----Original Message-----
> From: Mcnamara, John
> Sent: Monday, September 18, 2017 11:20 AM
> To: Carrillo, Erik G <erik.g.carrillo@intel.com>; rsanford@akamai.com
> Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> stephen@networkplumber.org; Wiles, Keith <keith.wiles@intel.com>;
> Vangati, Narender <narender.vangati@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs
>
>
>
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Erik Gabriel
> > Carrillo
> > Sent: Wednesday, September 13, 2017 11:05 PM
> > To: rsanford@akamai.com
> > Cc: dev@dpdk.org; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>;
> > stephen@networkplumber.org; Wiles, Keith <keith.wiles@intel.com>;
> > Vangati, Narender <narender.vangati@intel.com>
> > Subject: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs
> >
> > This change updates the timer library documentation to reflect a
> > change to the organization of the skiplists in the implementation.
> >
> > Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
>
>
> > +Each skiplist data structure has ten levels and each entry in the
> > +table
> > appears in each level with probability ¼^level.
>
> Probably best not to use that Unicode, or otherwise, 1/4 character since it
> can mess up the PDF output. Use "0.25^level" instead.
>
>
> Apart from that:
>
> Acked-by: John McNamara <john.mcnamara@intel.com>
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v4 0/3] timer library enhancement
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo
` (2 preceding siblings ...)
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 3/3] doc: update timer lib docs Erik Gabriel Carrillo
@ 2017-10-11 20:42 ` Thomas Monjalon
2018-04-04 22:42 ` Thomas Monjalon
3 siblings, 1 reply; 30+ messages in thread
From: Thomas Monjalon @ 2017-10-11 20:42 UTC (permalink / raw)
To: rsanford; +Cc: dev, Erik Gabriel Carrillo, john.mcnamara
This patchset is waiting for review.
Robert, are you available?
19/09/2017 19:02, Erik Gabriel Carrillo:
> In the current implementation of the DPDK timer library, timers can be
> created and set to be handled by a target lcore by adding it to a
> skiplist that corresponds to that lcore. However, if an application
> enables multiple lcores, and each of these lcores repeatedly attempts
> to install timers on the same target lcore, overall application
> throughput will be reduced as all lcores contend to acquire the lock
> guarding the single skiplist of pending timers.
>
> This patchset addresses this scenario by adding an option to enable an
> array of skiplists in each lcore's priv_timer struct. Then, when lcore i
> installs a timer on lcore k, the timer will be added to the ith skiplist
> for lcore k. If lcore j installs a timer on lcore k simultaneously,
> lcores i and j can both proceed since they will be acquiring different
> locks for different lists. This functionality is off by default, and
> can be enabled via a new function.
>
> When lcore k processes its pending timers, if the multiple pending list
> option is enabled, it will traverse skiplists in its array and acquire
> the current skiplist's lock while a run list is broken out; meanwhile,
> all other lists can continue to be modified. Then, all run lists for
> lcore k are collected and traversed together so timers are executed in
> their relative order. If the multiple pending list option is not enabled
> (the default), only a single list will be traversed, as before.
>
> Erik Gabriel Carrillo (3):
> timer: add multiple pending lists option for each lcore
> timer: handle timers installed from non-EAL threads
> doc: update timer lib docs
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v4 0/3] timer library enhancement
2017-10-11 20:42 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Thomas Monjalon
@ 2018-04-04 22:42 ` Thomas Monjalon
2018-04-05 21:53 ` Carrillo, Erik G
0 siblings, 1 reply; 30+ messages in thread
From: Thomas Monjalon @ 2018-04-04 22:42 UTC (permalink / raw)
To: Erik Gabriel Carrillo; +Cc: dev, rsanford, john.mcnamara
Let's revive these patches.
Erik, could you rebase them with experimental tag, please?
Someone for a review?
11/10/2017 22:42, Thomas Monjalon:
> This patchset is waiting for review.
>
> Robert, are you available?
>
> 19/09/2017 19:02, Erik Gabriel Carrillo:
> > In the current implementation of the DPDK timer library, timers can be
> > created and set to be handled by a target lcore by adding it to a
> > skiplist that corresponds to that lcore. However, if an application
> > enables multiple lcores, and each of these lcores repeatedly attempts
> > to install timers on the same target lcore, overall application
> > throughput will be reduced as all lcores contend to acquire the lock
> > guarding the single skiplist of pending timers.
> >
> > This patchset addresses this scenario by adding an option to enable an
> > array of skiplists in each lcore's priv_timer struct. Then, when lcore i
> > installs a timer on lcore k, the timer will be added to the ith skiplist
> > for lcore k. If lcore j installs a timer on lcore k simultaneously,
> > lcores i and j can both proceed since they will be acquiring different
> > locks for different lists. This functionality is off by default, and
> > can be enabled via a new function.
> >
> > When lcore k processes its pending timers, if the multiple pending list
> > option is enabled, it will traverse skiplists in its array and acquire
> > the current skiplist's lock while a run list is broken out; meanwhile,
> > all other lists can continue to be modified. Then, all run lists for
> > lcore k are collected and traversed together so timers are executed in
> > their relative order. If the multiple pending list option is not enabled
> > (the default), only a single list will be traversed, as before.
> >
> > Erik Gabriel Carrillo (3):
> > timer: add multiple pending lists option for each lcore
> > timer: handle timers installed from non-EAL threads
> > doc: update timer lib docs
>
>
>
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v4 0/3] timer library enhancement
2018-04-04 22:42 ` Thomas Monjalon
@ 2018-04-05 21:53 ` Carrillo, Erik G
0 siblings, 0 replies; 30+ messages in thread
From: Carrillo, Erik G @ 2018-04-05 21:53 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: dev, rsanford, Mcnamara, John
Hi Thomas,
OK, I'll take a fresh look at these.
Thanks,
Erik
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Wednesday, April 4, 2018 5:43 PM
> To: Carrillo, Erik G <erik.g.carrillo@intel.com>
> Cc: dev@dpdk.org; rsanford@akamai.com; Mcnamara, John
> <john.mcnamara@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v4 0/3] timer library enhancement
>
> Let's revive these patches.
> Erik, could you rebase them with experimental tag, please?
>
> Someone for a review?
>
>
> 11/10/2017 22:42, Thomas Monjalon:
> > This patchset is waiting for review.
> >
> > Robert, are you available?
> >
> > 19/09/2017 19:02, Erik Gabriel Carrillo:
> > > In the current implementation of the DPDK timer library, timers can
> > > be created and set to be handled by a target lcore by adding it to a
> > > skiplist that corresponds to that lcore. However, if an application
> > > enables multiple lcores, and each of these lcores repeatedly
> > > attempts to install timers on the same target lcore, overall
> > > application throughput will be reduced as all lcores contend to
> > > acquire the lock guarding the single skiplist of pending timers.
> > >
> > > This patchset addresses this scenario by adding an option to enable
> > > an array of skiplists in each lcore's priv_timer struct. Then, when
> > > lcore i installs a timer on lcore k, the timer will be added to the
> > > ith skiplist for lcore k. If lcore j installs a timer on lcore k
> > > simultaneously, lcores i and j can both proceed since they will be
> > > acquiring different locks for different lists. This functionality is
> > > off by default, and can be enabled via a new function.
> > >
> > > When lcore k processes its pending timers, if the multiple pending
> > > list option is enabled, it will traverse skiplists in its array and
> > > acquire the current skiplist's lock while a run list is broken out;
> > > meanwhile, all other lists can continue to be modified. Then, all
> > > run lists for lcore k are collected and traversed together so timers
> > > are executed in their relative order. If the multiple pending list
> > > option is not enabled (the default), only a single list will be traversed, as
> before.
> > >
> > > Erik Gabriel Carrillo (3):
> > > timer: add multiple pending lists option for each lcore
> > > timer: handle timers installed from non-EAL threads
> > > doc: update timer lib docs
> >
> >
> >
>
>
>
>
^ permalink raw reply [flat|nested] 30+ messages in thread
end of thread, other threads:[~2018-04-05 21:53 UTC | newest]
Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo
2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo
2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
2017-08-29 10:57 ` Ananyev, Konstantin
2017-08-29 16:13 ` Carrillo, Erik G
2017-08-29 15:11 ` [dpdk-dev] [PATCH " Stephen Hemminger
2017-08-23 14:47 ` [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads Gabriel Carrillo
2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
2017-08-23 14:47 ` [dpdk-dev] [PATCH 3/3] doc: update timer lib docs Gabriel Carrillo
2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
2017-08-23 15:02 ` [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Wiles, Keith
2017-08-23 16:19 ` Carrillo, Erik G
2017-08-23 16:50 ` Wiles, Keith
2017-08-23 19:28 ` Carrillo, Erik G
2017-08-23 21:04 ` Wiles, Keith
2017-08-24 14:08 ` Carrillo, Erik G
2017-08-25 20:26 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo
2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs Erik Gabriel Carrillo
2017-09-18 16:19 ` Mcnamara, John
2017-09-19 17:04 ` Carrillo, Erik G
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo
2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 3/3] doc: update timer lib docs Erik Gabriel Carrillo
2017-10-11 20:42 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Thomas Monjalon
2018-04-04 22:42 ` Thomas Monjalon
2018-04-05 21:53 ` Carrillo, Erik G
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).