* [dpdk-dev] [PATCH 0/3] *** timer library enhancements ***

From: Gabriel Carrillo
Date: 2017-08-23 14:47 UTC
To: rsanford; +Cc: dev

In the current implementation of the DPDK timer library, timers can be
created and set to be handled by a target lcore by adding them to a
skiplist that corresponds to that lcore.  However, if an application
enables multiple lcores, and each of these lcores repeatedly attempts
to install timers on the same target lcore, overall application
throughput will be reduced as all lcores contend to acquire the lock
guarding the single skiplist of pending timers.

This patchset addresses that scenario by adding an array of skiplists
to each lcore's priv_timer struct, such that when lcore i installs a
timer on lcore k, the timer will be added to the ith skiplist for
lcore k.  If lcore j installs a timer on lcore k simultaneously,
lcores i and j can both proceed, since they will be acquiring
different locks for different lists.

When lcore k processes its pending timers, it traverses each skiplist
in its array, acquiring a skiplist's lock only while a run list is
broken out; meanwhile, all other lists can continue to be modified.
Then, all run lists for lcore k are collected and traversed together
so that timers are executed in their global order.

Gabriel Carrillo (3):
  timer: add per-installer pending lists for each lcore
  timer: handle timers installed from non-EAL threads
  doc: update timer lib docs

 doc/guides/prog_guide/timer_lib.rst |  19 ++-
 lib/librte_timer/rte_timer.c        | 329 +++++++++++++++++++++++-------------
 lib/librte_timer/rte_timer.h        |   9 +-
 3 files changed, 231 insertions(+), 126 deletions(-)

--
2.6.4
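In outline, the mechanism the cover letter describes reduces to the
following layout and lock choice (condensed from patch 1/3 below; DPDK
types are assumed from <rte_timer.h>, and everything not shown is
elided -- this is a sketch, not the full implementation):

	struct skiplist {
		struct rte_timer head;  /* dummy timer instance heading the list */
		rte_spinlock_t lock;    /* protects only this installer's list   */
		unsigned int depth;     /* current depth of this skiplist        */
	} __rte_cache_aligned;

	struct priv_timer {
		/* pending_lists[i] holds the timers that lcore i installed here */
		struct skiplist pending_lists[RTE_MAX_LCORE];
		/* ... other per-lcore fields elided ... */
	};

	/* In timer_add(): lcore i (lcore_id) arming a timer on lcore k
	 * (tim_lcore) locks only the (k, i) list, so a simultaneous
	 * install from lcore j proceeds on the independent (k, j) list.
	 */
	struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id];
	rte_spinlock_lock(&list->lock);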
* [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore 2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo @ 2017-08-23 14:47 ` Gabriel Carrillo 2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo 2017-08-29 15:11 ` [dpdk-dev] [PATCH " Stephen Hemminger 2017-08-23 14:47 ` [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads Gabriel Carrillo ` (3 subsequent siblings) 4 siblings, 2 replies; 30+ messages in thread From: Gabriel Carrillo @ 2017-08-23 14:47 UTC (permalink / raw) To: rsanford; +Cc: dev Instead of each priv_timer struct containing a single skiplist, this commit adds a skiplist for each enabled lcore to priv_timer. In the case that multiple lcores repeatedly install timers on the same target lcore, this change reduces lock contention for the target lcore's skiplists and increases performance. Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com> --- lib/librte_timer/rte_timer.c | 309 +++++++++++++++++++++++++++---------------- lib/librte_timer/rte_timer.h | 9 +- 2 files changed, 202 insertions(+), 116 deletions(-) diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c index 5ee0840..2db74c1 100644 --- a/lib/librte_timer/rte_timer.c +++ b/lib/librte_timer/rte_timer.c @@ -37,6 +37,7 @@ #include <inttypes.h> #include <assert.h> #include <sys/queue.h> +#include <stdbool.h> #include <rte_atomic.h> #include <rte_common.h> @@ -56,17 +57,19 @@ LIST_HEAD(rte_timer_list, rte_timer); +struct skiplist { + struct rte_timer head; /**< dummy timer instance to head up list */ + rte_spinlock_t lock; /**< lock to protect list access */ + unsigned depth; /**< track the current depth of the skiplist */ +} __rte_cache_aligned; + struct priv_timer { - struct rte_timer pending_head; /**< dummy timer instance to head up list */ - rte_spinlock_t list_lock; /**< lock to protect list access */ + /** one pending list per enabled lcore */ + struct skiplist pending_lists[RTE_MAX_LCORE]; /** per-core variable that true if a timer was updated on this * core since last reset of the variable */ int updated; - - /** track the current depth of the skiplist */ - unsigned curr_skiplist_depth; - unsigned prev_lcore; /**< used for lcore round robin */ /** running timer on this lcore now */ @@ -81,6 +84,10 @@ struct priv_timer { /** per-lcore private info for timers */ static struct priv_timer priv_timer[RTE_MAX_LCORE]; +/** cache of IDs of enabled lcores */ +static unsigned enabled_lcores[RTE_MAX_LCORE]; +static int n_enabled_lcores; + /* when debug is enabled, store some statistics */ #ifdef RTE_LIBRTE_TIMER_DEBUG #define __TIMER_STAT_ADD(name, n) do { \ @@ -96,14 +103,25 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE]; void rte_timer_subsystem_init(void) { - unsigned lcore_id; + unsigned lcore_id1, lcore_id2; + struct skiplist *list; + int i, j; + + RTE_LCORE_FOREACH(lcore_id1) + enabled_lcores[n_enabled_lcores++] = lcore_id1; /* since priv_timer is static, it's zeroed by default, so only init some * fields. 
*/ - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) { - rte_spinlock_init(&priv_timer[lcore_id].list_lock); - priv_timer[lcore_id].prev_lcore = lcore_id; + for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores; + lcore_id1 = enabled_lcores[++i]) { + priv_timer[lcore_id1].prev_lcore = lcore_id1; + + for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores; + lcore_id2 = enabled_lcores[++j]) { + list = &priv_timer[lcore_id1].pending_lists[lcore_id2]; + rte_spinlock_init(&list->lock); + } } } @@ -114,7 +132,8 @@ rte_timer_init(struct rte_timer *tim) union rte_timer_status status; status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.installer = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; tim->status.u32 = status.u32; } @@ -142,7 +161,7 @@ timer_set_config_state(struct rte_timer *tim, * or ready to run on local core, exit */ if (prev_status.state == RTE_TIMER_RUNNING && - (prev_status.owner != (uint16_t)lcore_id || + (prev_status.owner != (int)lcore_id || tim != priv_timer[lcore_id].running_tim)) return -1; @@ -153,7 +172,8 @@ timer_set_config_state(struct rte_timer *tim, /* here, we know that timer is stopped or pending, * mark it atomically as being configured */ status.state = RTE_TIMER_CONFIG; - status.owner = (int16_t)lcore_id; + status.installer = RTE_TIMER_NO_LCORE; + status.owner = lcore_id; success = rte_atomic32_cmpset(&tim->status.u32, prev_status.u32, status.u32); @@ -185,7 +205,8 @@ timer_set_running_state(struct rte_timer *tim) /* here, we know that timer is stopped or pending, * mark it atomically as being configured */ status.state = RTE_TIMER_RUNNING; - status.owner = (int16_t)lcore_id; + status.installer = prev_status.installer; + status.owner = lcore_id; success = rte_atomic32_cmpset(&tim->status.u32, prev_status.u32, status.u32); @@ -236,11 +257,11 @@ timer_get_skiplist_level(unsigned curr_depth) * are <= that time value. */ static void -timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, +timer_get_prev_entries(uint64_t time_val, struct skiplist *list, struct rte_timer **prev) { - unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth; - prev[lvl] = &priv_timer[tim_lcore].pending_head; + unsigned lvl = list->depth; + prev[lvl] = &list->head; while(lvl != 0) { lvl--; prev[lvl] = prev[lvl+1]; @@ -255,15 +276,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, * all skiplist levels. 
*/ static void -timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore, +timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list, struct rte_timer **prev) { int i; /* to get a specific entry in the list, look for just lower than the time * values, and then increment on each level individually if necessary */ - timer_get_prev_entries(tim->expire - 1, tim_lcore, prev); - for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) { + timer_get_prev_entries(tim->expire - 1, list, prev); + for (i = list->depth - 1; i >= 0; i--) { while (prev[i]->sl_next[i] != NULL && prev[i]->sl_next[i] != tim && prev[i]->sl_next[i]->expire <= tim->expire) @@ -282,25 +303,25 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) unsigned lcore_id = rte_lcore_id(); unsigned lvl; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id]; /* if timer needs to be scheduled on another core, we need to * lock the list; if it is on local core, we need to lock if * we are not called from rte_timer_manage() */ if (tim_lcore != lcore_id || !local_is_locked) - rte_spinlock_lock(&priv_timer[tim_lcore].list_lock); + rte_spinlock_lock(&list->lock); /* find where exactly this element goes in the list of elements * for each depth. */ - timer_get_prev_entries(tim->expire, tim_lcore, prev); + timer_get_prev_entries(tim->expire, list, prev); /* now assign it a new level and add at that level */ - const unsigned tim_level = timer_get_skiplist_level( - priv_timer[tim_lcore].curr_skiplist_depth); - if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth) - priv_timer[tim_lcore].curr_skiplist_depth++; + const unsigned tim_level = timer_get_skiplist_level(list->depth); + if (tim_level == list->depth) + list->depth++; lvl = tim_level; - while (lvl > 0) { + while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) { tim->sl_next[lvl] = prev[lvl]->sl_next[lvl]; prev[lvl]->sl_next[lvl] = tim; lvl--; @@ -310,11 +331,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) /* save the lowest list entry into the expire field of the dummy hdr * NOTE: this is not atomic on 32-bit*/ - priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\ - pending_head.sl_next[0]->expire; + list->head.expire = list->head.sl_next[0]->expire; if (tim_lcore != lcore_id || !local_is_locked) - rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock); + rte_spinlock_unlock(&list->lock); } /* @@ -330,35 +350,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status, unsigned prev_owner = prev_status.owner; int i; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + struct skiplist *list; + + list = &priv_timer[prev_owner].pending_lists[prev_status.installer]; /* if timer needs is pending another core, we need to lock the * list; if it is on local core, we need to lock if we are not * called from rte_timer_manage() */ if (prev_owner != lcore_id || !local_is_locked) - rte_spinlock_lock(&priv_timer[prev_owner].list_lock); + rte_spinlock_lock(&list->lock); /* save the lowest list entry into the expire field of the dummy hdr. * NOTE: this is not atomic on 32-bit */ - if (tim == priv_timer[prev_owner].pending_head.sl_next[0]) - priv_timer[prev_owner].pending_head.expire = - ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire); + if (tim == list->head.sl_next[0]) + list->head.expire = ((tim->sl_next[0] == NULL) ? 
+ 0 : tim->sl_next[0]->expire); /* adjust pointers from previous entries to point past this */ - timer_get_prev_entries_for_node(tim, prev_owner, prev); - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) { + timer_get_prev_entries_for_node(tim, list, prev); + for (i = list->depth - 1; i >= 0; i--) { if (prev[i]->sl_next[i] == tim) prev[i]->sl_next[i] = tim->sl_next[i]; } /* in case we deleted last entry at a level, adjust down max level */ - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) - if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL) - priv_timer[prev_owner].curr_skiplist_depth --; + for (i = list->depth - 1; i >= 0; i--) + if (list->head.sl_next[i] == NULL) + list->depth--; else break; if (prev_owner != lcore_id || !local_is_locked) - rte_spinlock_unlock(&priv_timer[prev_owner].list_lock); + rte_spinlock_unlock(&list->lock); } /* Reset and start the timer associated with the timer handle (private func) */ @@ -416,7 +439,8 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, * the state so we don't need to use cmpset() here */ rte_wmb(); status.state = RTE_TIMER_PENDING; - status.owner = (int16_t)tim_lcore; + status.installer = lcore_id; + status.owner = tim_lcore; tim->status.u32 = status.u32; return 0; @@ -484,7 +508,8 @@ rte_timer_stop(struct rte_timer *tim) /* mark timer as stopped */ rte_wmb(); status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.installer = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; tim->status.u32 = status.u32; return 0; @@ -506,119 +531,179 @@ rte_timer_pending(struct rte_timer *tim) } /* must be called periodically, run all timer that expired */ -void rte_timer_manage(void) +void +rte_timer_manage(void) { union rte_timer_status status; - struct rte_timer *tim, *next_tim; - struct rte_timer *run_first_tim, **pprev; - unsigned lcore_id = rte_lcore_id(); + struct rte_timer *tim, *next_tim, **pprev; + struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1]; struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1]; - uint64_t cur_time; - int i, ret; + struct priv_timer *priv_tim; + unsigned installer_lcore, lcore_id = rte_lcore_id(); + uint64_t cur_time, min_expire; + int i, j, min_idx, ret; + int nrunlists = 0; + int local_is_locked = 0; + struct skiplist *dest_list, *list = NULL; + bool done; /* timer manager only runs on EAL thread with valid lcore_id */ assert(lcore_id < RTE_MAX_LCORE); + priv_tim = &priv_timer[lcore_id]; + __TIMER_STAT_ADD(manage, 1); - /* optimize for the case where per-cpu list is empty */ - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) - return; - cur_time = rte_get_timer_cycles(); + for (i = 0, installer_lcore = enabled_lcores[i]; i < n_enabled_lcores; + installer_lcore = enabled_lcores[++i]) { + list = &priv_tim->pending_lists[installer_lcore]; + + /* optimize for the case where list is empty */ + if (list->head.sl_next[0] == NULL) + continue; + cur_time = rte_get_timer_cycles(); #ifdef RTE_ARCH_X86_64 - /* on 64-bit the value cached in the pending_head.expired will be - * updated atomically, so we can consult that for a quick check here - * outside the lock */ - if (likely(priv_timer[lcore_id].pending_head.expire > cur_time)) - return; + /* on 64-bit the value cached in the list->head.expired will be + * updated atomically, so we can consult that for a quick check + * here outside the lock + */ + if (likely(list->head.expire > cur_time)) + continue; #endif - /* browse ordered list, add expired timers in 'expired' list */ - 
rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + /* browse ordered list, add expired timers in 'expired' list */ + rte_spinlock_lock(&list->lock); - /* if nothing to do just unlock and return */ - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL || - priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) { - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); - return; - } + /* if nothing to do just unlock and continue */ + if (list->head.sl_next[0] == NULL || + list->head.sl_next[0]->expire > cur_time) { + rte_spinlock_unlock(&list->lock); + continue; + } - /* save start of list of expired timers */ - tim = priv_timer[lcore_id].pending_head.sl_next[0]; + /* save start of list of expired timers */ + tim = list->head.sl_next[0]; + + /* break the existing list at current time point */ + timer_get_prev_entries(cur_time, list, prev); + for (j = list->depth - 1; j >= 0; j--) { + if (prev[j] == &list->head) + continue; + list->head.sl_next[j] = + prev[j]->sl_next[j]; + if (prev[j]->sl_next[j] == NULL) + list->depth--; + prev[j]->sl_next[j] = NULL; + } - /* break the existing list at current time point */ - timer_get_prev_entries(cur_time, lcore_id, prev); - for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) { - if (prev[i] == &priv_timer[lcore_id].pending_head) - continue; - priv_timer[lcore_id].pending_head.sl_next[i] = - prev[i]->sl_next[i]; - if (prev[i]->sl_next[i] == NULL) - priv_timer[lcore_id].curr_skiplist_depth--; - prev[i] ->sl_next[i] = NULL; - } + /* transition run-list from PENDING to RUNNING */ + run_first_tims[nrunlists] = tim; + pprev = &run_first_tims[nrunlists]; + nrunlists++; + + for (; tim != NULL; tim = next_tim) { + next_tim = tim->sl_next[0]; + + ret = timer_set_running_state(tim); + if (likely(ret == 0)) { + pprev = &tim->sl_next[0]; + } else { + /* another core is trying to re-config this one, + * remove it from local expired list + */ + *pprev = next_tim; + } + } - /* transition run-list from PENDING to RUNNING */ - run_first_tim = tim; - pprev = &run_first_tim; + /* update the next to expire timer value */ + list->head.expire = (list->head.sl_next[0] == NULL) ? + 0 : list->head.sl_next[0]->expire; - for ( ; tim != NULL; tim = next_tim) { - next_tim = tim->sl_next[0]; + rte_spinlock_unlock(&list->lock); + } - ret = timer_set_running_state(tim); - if (likely(ret == 0)) { - pprev = &tim->sl_next[0]; - } else { - /* another core is trying to re-config this one, - * remove it from local expired list - */ - *pprev = next_tim; + /* Now process the run lists */ + while (1) { + done = true; + min_expire = UINT64_MAX; + min_idx = 0; + + /* Find the next oldest timer to process */ + for (i = 0; i < nrunlists; i++) { + tim = run_first_tims[i]; + + if (tim != NULL && tim->expire < min_expire) { + min_expire = tim->expire; + min_idx = i; + done = false; + } } - } - /* update the next to expire timer value */ - priv_timer[lcore_id].pending_head.expire = - (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 
0 : - priv_timer[lcore_id].pending_head.sl_next[0]->expire; + if (done) + break; + + tim = run_first_tims[min_idx]; - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + /* Move down the runlist from which we picked a timer to + * execute + */ + run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0]; - /* now scan expired list and call callbacks */ - for (tim = run_first_tim; tim != NULL; tim = next_tim) { - next_tim = tim->sl_next[0]; - priv_timer[lcore_id].updated = 0; - priv_timer[lcore_id].running_tim = tim; + priv_tim->updated = 0; + priv_tim->running_tim = tim; /* execute callback function with list unlocked */ tim->f(tim, tim->arg); __TIMER_STAT_ADD(pending, -1); /* the timer was stopped or reloaded by the callback - * function, we have nothing to do here */ - if (priv_timer[lcore_id].updated == 1) + * function, we have nothing to do here + */ + if (priv_tim->updated == 1) continue; if (tim->period == 0) { /* remove from done list and mark timer as stopped */ status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.installer = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; rte_wmb(); tim->status.u32 = status.u32; } else { - /* keep it in list and mark timer as pending */ - rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + dest_list = &priv_tim->pending_lists[lcore_id]; + + /* If the destination list is the current list + * we can acquire the lock here, and hold it + * across removal and insertion of the timer. + */ + if (list == dest_list) { + rte_spinlock_lock(&list->lock); + local_is_locked = 1; + } + + /* set timer state back to PENDING and + * reinsert it in pending list + */ status.state = RTE_TIMER_PENDING; __TIMER_STAT_ADD(pending, 1); - status.owner = (int16_t)lcore_id; + status.installer = tim->status.installer; + status.owner = lcore_id; rte_wmb(); tim->status.u32 = status.u32; - __rte_timer_reset(tim, tim->expire + tim->period, - tim->period, lcore_id, tim->f, tim->arg, 1); - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + + __rte_timer_reset(tim, + tim->expire + tim->period, + tim->period, lcore_id, + tim->f, tim->arg, local_is_locked); + + if (local_is_locked) { + rte_spinlock_unlock(&list->lock); + local_is_locked = 0; + } } } - priv_timer[lcore_id].running_tim = NULL; + priv_tim->running_tim = NULL; } /* dump statistics about timers */ diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h index a276a73..4cc6baf 100644 --- a/lib/librte_timer/rte_timer.h +++ b/lib/librte_timer/rte_timer.h @@ -77,7 +77,7 @@ extern "C" { #define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */ #define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */ -#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */ +#define RTE_TIMER_NO_LCORE -2 /** * Timer type: Periodic or single (one-shot). @@ -94,10 +94,11 @@ enum rte_timer_type { union rte_timer_status { RTE_STD_C11 struct { - uint16_t state; /**< Stop, pending, running, config. */ - int16_t owner; /**< The lcore that owns the timer. */ + unsigned state : 2; + int installer : 15; + int owner : 15; }; - uint32_t u32; /**< To atomic-set status + owner. */ + uint32_t u32; /**< To atomic-set status, installer, owner */ }; #ifdef RTE_LIBRTE_TIMER_DEBUG -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
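A side effect of tracking the installer is visible in the rte_timer.h
hunk above: the former 16-bit state and owner fields are repacked as
2/15/15-bit fields so that state, installer, and owner still fit in one
32-bit word and can be swapped with a single rte_atomic32_cmpset().  A
minimal standalone illustration of that packing (C11 for the anonymous
struct; field values arbitrary):

	#include <assert.h>
	#include <stdint.h>

	/* Mirrors the patched union rte_timer_status; illustrative only. */
	union timer_status {
		struct {
			unsigned int state     : 2;  /* STOP/PENDING/RUNNING/CONFIG */
			int          installer : 15; /* lcore that armed the timer  */
			int          owner     : 15; /* lcore managing the timer    */
		};
		uint32_t u32;  /* one word, so cmpset updates all fields at once */
	};

	int main(void)
	{
		union timer_status s = { .state = 1, .installer = 3, .owner = 5 };
		union timer_status t = { .u32 = s.u32 };

		/* all three fields round-trip through the single 32-bit word */
		assert(t.state == 1 && t.installer == 3 && t.owner == 5);
		/* 2 + 15 + 15 bits pack into 4 bytes on common ABIs */
		assert(sizeof(union timer_status) == sizeof(uint32_t));
		return 0;
	}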
* [dpdk-dev] [PATCH v2 1/3] timer: add per-installer pending lists for each lcore 2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo @ 2017-08-25 20:27 ` Gabriel Carrillo 2017-08-29 10:57 ` Ananyev, Konstantin 2017-08-29 15:11 ` [dpdk-dev] [PATCH " Stephen Hemminger 1 sibling, 1 reply; 30+ messages in thread From: Gabriel Carrillo @ 2017-08-25 20:27 UTC (permalink / raw) To: rsanford; +Cc: dev Instead of each priv_timer struct containing a single skiplist, this commit adds a skiplist for each enabled lcore to priv_timer. In the case that multiple lcores repeatedly install timers on the same target lcore, this change reduces lock contention for the target lcore's skiplists and increases performance. Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com> --- v2: * Address checkpatch warnings lib/librte_timer/rte_timer.c | 309 +++++++++++++++++++++++++++---------------- lib/librte_timer/rte_timer.h | 9 +- 2 files changed, 202 insertions(+), 116 deletions(-) diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c index 5ee0840..da2fc1a 100644 --- a/lib/librte_timer/rte_timer.c +++ b/lib/librte_timer/rte_timer.c @@ -37,6 +37,7 @@ #include <inttypes.h> #include <assert.h> #include <sys/queue.h> +#include <stdbool.h> #include <rte_atomic.h> #include <rte_common.h> @@ -56,17 +57,19 @@ LIST_HEAD(rte_timer_list, rte_timer); +struct skiplist { + struct rte_timer head; /**< dummy timer instance to head up list */ + rte_spinlock_t lock; /**< lock to protect list access */ + unsigned int depth; /**< track the current depth of the skiplist */ +} __rte_cache_aligned; + struct priv_timer { - struct rte_timer pending_head; /**< dummy timer instance to head up list */ - rte_spinlock_t list_lock; /**< lock to protect list access */ + /** one pending list per enabled lcore */ + struct skiplist pending_lists[RTE_MAX_LCORE]; /** per-core variable that true if a timer was updated on this * core since last reset of the variable */ int updated; - - /** track the current depth of the skiplist */ - unsigned curr_skiplist_depth; - unsigned prev_lcore; /**< used for lcore round robin */ /** running timer on this lcore now */ @@ -81,6 +84,10 @@ struct priv_timer { /** per-lcore private info for timers */ static struct priv_timer priv_timer[RTE_MAX_LCORE]; +/** cache of IDs of enabled lcores */ +static unsigned int enabled_lcores[RTE_MAX_LCORE]; +static int n_enabled_lcores; + /* when debug is enabled, store some statistics */ #ifdef RTE_LIBRTE_TIMER_DEBUG #define __TIMER_STAT_ADD(name, n) do { \ @@ -96,14 +103,25 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE]; void rte_timer_subsystem_init(void) { - unsigned lcore_id; + unsigned int lcore_id1, lcore_id2; + struct skiplist *list; + int i, j; + + RTE_LCORE_FOREACH(lcore_id1) + enabled_lcores[n_enabled_lcores++] = lcore_id1; /* since priv_timer is static, it's zeroed by default, so only init some * fields. 
*/ - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) { - rte_spinlock_init(&priv_timer[lcore_id].list_lock); - priv_timer[lcore_id].prev_lcore = lcore_id; + for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores; + lcore_id1 = enabled_lcores[++i]) { + priv_timer[lcore_id1].prev_lcore = lcore_id1; + + for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores; + lcore_id2 = enabled_lcores[++j]) { + list = &priv_timer[lcore_id1].pending_lists[lcore_id2]; + rte_spinlock_init(&list->lock); + } } } @@ -114,7 +132,8 @@ rte_timer_init(struct rte_timer *tim) union rte_timer_status status; status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.installer = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; tim->status.u32 = status.u32; } @@ -142,7 +161,7 @@ timer_set_config_state(struct rte_timer *tim, * or ready to run on local core, exit */ if (prev_status.state == RTE_TIMER_RUNNING && - (prev_status.owner != (uint16_t)lcore_id || + (prev_status.owner != (int)lcore_id || tim != priv_timer[lcore_id].running_tim)) return -1; @@ -153,7 +172,8 @@ timer_set_config_state(struct rte_timer *tim, /* here, we know that timer is stopped or pending, * mark it atomically as being configured */ status.state = RTE_TIMER_CONFIG; - status.owner = (int16_t)lcore_id; + status.installer = RTE_TIMER_NO_LCORE; + status.owner = lcore_id; success = rte_atomic32_cmpset(&tim->status.u32, prev_status.u32, status.u32); @@ -185,7 +205,8 @@ timer_set_running_state(struct rte_timer *tim) /* here, we know that timer is stopped or pending, * mark it atomically as being configured */ status.state = RTE_TIMER_RUNNING; - status.owner = (int16_t)lcore_id; + status.installer = prev_status.installer; + status.owner = lcore_id; success = rte_atomic32_cmpset(&tim->status.u32, prev_status.u32, status.u32); @@ -236,11 +257,11 @@ timer_get_skiplist_level(unsigned curr_depth) * are <= that time value. */ static void -timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, +timer_get_prev_entries(uint64_t time_val, struct skiplist *list, struct rte_timer **prev) { - unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth; - prev[lvl] = &priv_timer[tim_lcore].pending_head; + unsigned int lvl = list->depth; + prev[lvl] = &list->head; while(lvl != 0) { lvl--; prev[lvl] = prev[lvl+1]; @@ -255,15 +276,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, * all skiplist levels. 
*/ static void -timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore, +timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list, struct rte_timer **prev) { int i; /* to get a specific entry in the list, look for just lower than the time * values, and then increment on each level individually if necessary */ - timer_get_prev_entries(tim->expire - 1, tim_lcore, prev); - for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) { + timer_get_prev_entries(tim->expire - 1, list, prev); + for (i = list->depth - 1; i >= 0; i--) { while (prev[i]->sl_next[i] != NULL && prev[i]->sl_next[i] != tim && prev[i]->sl_next[i]->expire <= tim->expire) @@ -282,25 +303,25 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) unsigned lcore_id = rte_lcore_id(); unsigned lvl; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id]; /* if timer needs to be scheduled on another core, we need to * lock the list; if it is on local core, we need to lock if * we are not called from rte_timer_manage() */ if (tim_lcore != lcore_id || !local_is_locked) - rte_spinlock_lock(&priv_timer[tim_lcore].list_lock); + rte_spinlock_lock(&list->lock); /* find where exactly this element goes in the list of elements * for each depth. */ - timer_get_prev_entries(tim->expire, tim_lcore, prev); + timer_get_prev_entries(tim->expire, list, prev); /* now assign it a new level and add at that level */ - const unsigned tim_level = timer_get_skiplist_level( - priv_timer[tim_lcore].curr_skiplist_depth); - if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth) - priv_timer[tim_lcore].curr_skiplist_depth++; + const unsigned int tim_level = timer_get_skiplist_level(list->depth); + if (tim_level == list->depth) + list->depth++; lvl = tim_level; - while (lvl > 0) { + while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) { tim->sl_next[lvl] = prev[lvl]->sl_next[lvl]; prev[lvl]->sl_next[lvl] = tim; lvl--; @@ -310,11 +331,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) /* save the lowest list entry into the expire field of the dummy hdr * NOTE: this is not atomic on 32-bit*/ - priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\ - pending_head.sl_next[0]->expire; + list->head.expire = list->head.sl_next[0]->expire; if (tim_lcore != lcore_id || !local_is_locked) - rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock); + rte_spinlock_unlock(&list->lock); } /* @@ -330,35 +350,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status, unsigned prev_owner = prev_status.owner; int i; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + struct skiplist *list; + + list = &priv_timer[prev_owner].pending_lists[prev_status.installer]; /* if timer needs is pending another core, we need to lock the * list; if it is on local core, we need to lock if we are not * called from rte_timer_manage() */ if (prev_owner != lcore_id || !local_is_locked) - rte_spinlock_lock(&priv_timer[prev_owner].list_lock); + rte_spinlock_lock(&list->lock); /* save the lowest list entry into the expire field of the dummy hdr. * NOTE: this is not atomic on 32-bit */ - if (tim == priv_timer[prev_owner].pending_head.sl_next[0]) - priv_timer[prev_owner].pending_head.expire = - ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire); + if (tim == list->head.sl_next[0]) + list->head.expire = ((tim->sl_next[0] == NULL) ? 
+ 0 : tim->sl_next[0]->expire); /* adjust pointers from previous entries to point past this */ - timer_get_prev_entries_for_node(tim, prev_owner, prev); - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) { + timer_get_prev_entries_for_node(tim, list, prev); + for (i = list->depth - 1; i >= 0; i--) { if (prev[i]->sl_next[i] == tim) prev[i]->sl_next[i] = tim->sl_next[i]; } /* in case we deleted last entry at a level, adjust down max level */ - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) - if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL) - priv_timer[prev_owner].curr_skiplist_depth --; + for (i = list->depth - 1; i >= 0; i--) + if (list->head.sl_next[i] == NULL) + list->depth--; else break; if (prev_owner != lcore_id || !local_is_locked) - rte_spinlock_unlock(&priv_timer[prev_owner].list_lock); + rte_spinlock_unlock(&list->lock); } /* Reset and start the timer associated with the timer handle (private func) */ @@ -416,7 +439,8 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, * the state so we don't need to use cmpset() here */ rte_wmb(); status.state = RTE_TIMER_PENDING; - status.owner = (int16_t)tim_lcore; + status.installer = lcore_id; + status.owner = tim_lcore; tim->status.u32 = status.u32; return 0; @@ -484,7 +508,8 @@ rte_timer_stop(struct rte_timer *tim) /* mark timer as stopped */ rte_wmb(); status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.installer = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; tim->status.u32 = status.u32; return 0; @@ -506,119 +531,179 @@ rte_timer_pending(struct rte_timer *tim) } /* must be called periodically, run all timer that expired */ -void rte_timer_manage(void) +void +rte_timer_manage(void) { union rte_timer_status status; - struct rte_timer *tim, *next_tim; - struct rte_timer *run_first_tim, **pprev; - unsigned lcore_id = rte_lcore_id(); + struct rte_timer *tim, *next_tim, **pprev; + struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1]; struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1]; - uint64_t cur_time; - int i, ret; + struct priv_timer *priv_tim; + unsigned int installer_lcore, lcore_id = rte_lcore_id(); + uint64_t cur_time, min_expire; + int i, j, min_idx, ret; + int nrunlists = 0; + int local_is_locked = 0; + struct skiplist *dest_list, *list = NULL; + bool done; /* timer manager only runs on EAL thread with valid lcore_id */ assert(lcore_id < RTE_MAX_LCORE); + priv_tim = &priv_timer[lcore_id]; + __TIMER_STAT_ADD(manage, 1); - /* optimize for the case where per-cpu list is empty */ - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) - return; - cur_time = rte_get_timer_cycles(); + for (i = 0, installer_lcore = enabled_lcores[i]; i < n_enabled_lcores; + installer_lcore = enabled_lcores[++i]) { + list = &priv_tim->pending_lists[installer_lcore]; + + /* optimize for the case where list is empty */ + if (list->head.sl_next[0] == NULL) + continue; + cur_time = rte_get_timer_cycles(); #ifdef RTE_ARCH_X86_64 - /* on 64-bit the value cached in the pending_head.expired will be - * updated atomically, so we can consult that for a quick check here - * outside the lock */ - if (likely(priv_timer[lcore_id].pending_head.expire > cur_time)) - return; + /* on 64-bit the value cached in the list->head.expired will be + * updated atomically, so we can consult that for a quick check + * here outside the lock + */ + if (likely(list->head.expire > cur_time)) + continue; #endif - /* browse ordered list, add expired timers in 'expired' list */ - 
rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + /* browse ordered list, add expired timers in 'expired' list */ + rte_spinlock_lock(&list->lock); - /* if nothing to do just unlock and return */ - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL || - priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) { - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); - return; - } + /* if nothing to do just unlock and continue */ + if (list->head.sl_next[0] == NULL || + list->head.sl_next[0]->expire > cur_time) { + rte_spinlock_unlock(&list->lock); + continue; + } - /* save start of list of expired timers */ - tim = priv_timer[lcore_id].pending_head.sl_next[0]; + /* save start of list of expired timers */ + tim = list->head.sl_next[0]; + + /* break the existing list at current time point */ + timer_get_prev_entries(cur_time, list, prev); + for (j = list->depth - 1; j >= 0; j--) { + if (prev[j] == &list->head) + continue; + list->head.sl_next[j] = + prev[j]->sl_next[j]; + if (prev[j]->sl_next[j] == NULL) + list->depth--; + prev[j]->sl_next[j] = NULL; + } - /* break the existing list at current time point */ - timer_get_prev_entries(cur_time, lcore_id, prev); - for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) { - if (prev[i] == &priv_timer[lcore_id].pending_head) - continue; - priv_timer[lcore_id].pending_head.sl_next[i] = - prev[i]->sl_next[i]; - if (prev[i]->sl_next[i] == NULL) - priv_timer[lcore_id].curr_skiplist_depth--; - prev[i] ->sl_next[i] = NULL; - } + /* transition run-list from PENDING to RUNNING */ + run_first_tims[nrunlists] = tim; + pprev = &run_first_tims[nrunlists]; + nrunlists++; + + for (; tim != NULL; tim = next_tim) { + next_tim = tim->sl_next[0]; + + ret = timer_set_running_state(tim); + if (likely(ret == 0)) { + pprev = &tim->sl_next[0]; + } else { + /* another core is trying to re-config this one, + * remove it from local expired list + */ + *pprev = next_tim; + } + } - /* transition run-list from PENDING to RUNNING */ - run_first_tim = tim; - pprev = &run_first_tim; + /* update the next to expire timer value */ + list->head.expire = (list->head.sl_next[0] == NULL) ? + 0 : list->head.sl_next[0]->expire; - for ( ; tim != NULL; tim = next_tim) { - next_tim = tim->sl_next[0]; + rte_spinlock_unlock(&list->lock); + } - ret = timer_set_running_state(tim); - if (likely(ret == 0)) { - pprev = &tim->sl_next[0]; - } else { - /* another core is trying to re-config this one, - * remove it from local expired list - */ - *pprev = next_tim; + /* Now process the run lists */ + while (1) { + done = true; + min_expire = UINT64_MAX; + min_idx = 0; + + /* Find the next oldest timer to process */ + for (i = 0; i < nrunlists; i++) { + tim = run_first_tims[i]; + + if (tim != NULL && tim->expire < min_expire) { + min_expire = tim->expire; + min_idx = i; + done = false; + } } - } - /* update the next to expire timer value */ - priv_timer[lcore_id].pending_head.expire = - (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 
0 : - priv_timer[lcore_id].pending_head.sl_next[0]->expire; + if (done) + break; + + tim = run_first_tims[min_idx]; - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + /* Move down the runlist from which we picked a timer to + * execute + */ + run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0]; - /* now scan expired list and call callbacks */ - for (tim = run_first_tim; tim != NULL; tim = next_tim) { - next_tim = tim->sl_next[0]; - priv_timer[lcore_id].updated = 0; - priv_timer[lcore_id].running_tim = tim; + priv_tim->updated = 0; + priv_tim->running_tim = tim; /* execute callback function with list unlocked */ tim->f(tim, tim->arg); __TIMER_STAT_ADD(pending, -1); /* the timer was stopped or reloaded by the callback - * function, we have nothing to do here */ - if (priv_timer[lcore_id].updated == 1) + * function, we have nothing to do here + */ + if (priv_tim->updated == 1) continue; if (tim->period == 0) { /* remove from done list and mark timer as stopped */ status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.installer = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; rte_wmb(); tim->status.u32 = status.u32; } else { - /* keep it in list and mark timer as pending */ - rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + dest_list = &priv_tim->pending_lists[lcore_id]; + + /* If the destination list is the current list + * we can acquire the lock here, and hold it + * across removal and insertion of the timer. + */ + if (list == dest_list) { + rte_spinlock_lock(&list->lock); + local_is_locked = 1; + } + + /* set timer state back to PENDING and + * reinsert it in pending list + */ status.state = RTE_TIMER_PENDING; __TIMER_STAT_ADD(pending, 1); - status.owner = (int16_t)lcore_id; + status.installer = tim->status.installer; + status.owner = lcore_id; rte_wmb(); tim->status.u32 = status.u32; - __rte_timer_reset(tim, tim->expire + tim->period, - tim->period, lcore_id, tim->f, tim->arg, 1); - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + + __rte_timer_reset(tim, + tim->expire + tim->period, + tim->period, lcore_id, + tim->f, tim->arg, local_is_locked); + + if (local_is_locked) { + rte_spinlock_unlock(&list->lock); + local_is_locked = 0; + } } } - priv_timer[lcore_id].running_tim = NULL; + priv_tim->running_tim = NULL; } /* dump statistics about timers */ diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h index a276a73..4cc6baf 100644 --- a/lib/librte_timer/rte_timer.h +++ b/lib/librte_timer/rte_timer.h @@ -77,7 +77,7 @@ extern "C" { #define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */ #define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */ -#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */ +#define RTE_TIMER_NO_LCORE -2 /** * Timer type: Periodic or single (one-shot). @@ -94,10 +94,11 @@ enum rte_timer_type { union rte_timer_status { RTE_STD_C11 struct { - uint16_t state; /**< Stop, pending, running, config. */ - int16_t owner; /**< The lcore that owns the timer. */ + unsigned state : 2; + int installer : 15; + int owner : 15; }; - uint32_t u32; /**< To atomic-set status + owner. */ + uint32_t u32; /**< To atomic-set status, installer, owner */ }; #ifdef RTE_LIBRTE_TIMER_DEBUG -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
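Worth noting for reviewers: with one run list broken out per installer
lcore, rte_timer_manage() above keeps global expiry order by repeatedly
running the earliest-expiring head among the lists, i.e. a k-way merge.
A standalone sketch of that selection loop (list contents hypothetical):

	#include <inttypes.h>
	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	struct tim {
		uint64_t expire;
		struct tim *next;
	};

	/* Merge n singly linked run lists, each already sorted by expire,
	 * by always consuming the smallest head -- the same scheme as the
	 * run_first_tims[] loop in rte_timer_manage() above.
	 */
	static void run_in_expiry_order(struct tim *heads[], int n)
	{
		for (;;) {
			uint64_t min_expire = UINT64_MAX;
			int min_idx = 0;
			bool done = true;

			for (int i = 0; i < n; i++) {
				if (heads[i] != NULL && heads[i]->expire < min_expire) {
					min_expire = heads[i]->expire;
					min_idx = i;
					done = false;
				}
			}
			if (done)
				break;

			struct tim *t = heads[min_idx];
			heads[min_idx] = t->next;  /* advance the chosen run list */
			printf("run timer expiring at %" PRIu64 "\n", t->expire);
		}
	}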
* Re: [dpdk-dev] [PATCH v2 1/3] timer: add per-installer pending lists for each lcore

From: Ananyev, Konstantin
Date: 2017-08-29 10:57 UTC
To: Carrillo, Erik G, rsanford; +Cc: dev

Hi Gabriel,

> Instead of each priv_timer struct containing a single skiplist, this
> commit adds a skiplist for each enabled lcore to priv_timer. In the case
> that multiple lcores repeatedly install timers on the same target lcore,
> this change reduces lock contention for the target lcore's skiplists and
> increases performance.

I am not an rte_timer expert, but there is one thing that worries me:
it seems that the complexity of timer_manage() has increased quite a bit
with this patch: now it has to check/process up to RTE_MAX_LCORE
skiplists instead of one, and it also has to somehow properly sort up to
RTE_MAX_LCORE lists of retrieved (ready-to-run) timers.
Wouldn't all that affect its running time?

I understand your intention to reduce lock contention, but I suppose it
could at least be done in a configurable way.
Let's say, allow the user to specify the dimension of pending_lists[] at
init phase or so. Then a timer from lcore_id=N will end up in
pending_lists[N % RTE_DIM(pending_lists)].

Another thought - it might be better to divide the pending timers not by
client (lcore) id, but by expiration time - some analog of a timer wheel
or so. That, I think, might greatly decrease the probability that
timer_manage() and timer_add() will try to access the same list.
On the other hand, timer_manage() would still have to consume the
skip-lists one by one. Though I suppose that's quite a radical change
from what we have right now.
Konstantin > > Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com> > --- > v2: > * Address checkpatch warnings > > lib/librte_timer/rte_timer.c | 309 +++++++++++++++++++++++++++---------------- > lib/librte_timer/rte_timer.h | 9 +- > 2 files changed, 202 insertions(+), 116 deletions(-) > > diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c > index 5ee0840..da2fc1a 100644 > --- a/lib/librte_timer/rte_timer.c > +++ b/lib/librte_timer/rte_timer.c > @@ -37,6 +37,7 @@ > #include <inttypes.h> > #include <assert.h> > #include <sys/queue.h> > +#include <stdbool.h> > > #include <rte_atomic.h> > #include <rte_common.h> > @@ -56,17 +57,19 @@ > > LIST_HEAD(rte_timer_list, rte_timer); > > +struct skiplist { > + struct rte_timer head; /**< dummy timer instance to head up list */ > + rte_spinlock_t lock; /**< lock to protect list access */ > + unsigned int depth; /**< track the current depth of the skiplist */ > +} __rte_cache_aligned; > + > struct priv_timer { > - struct rte_timer pending_head; /**< dummy timer instance to head up list */ > - rte_spinlock_t list_lock; /**< lock to protect list access */ > + /** one pending list per enabled lcore */ > + struct skiplist pending_lists[RTE_MAX_LCORE]; > > /** per-core variable that true if a timer was updated on this > * core since last reset of the variable */ > int updated; > - > - /** track the current depth of the skiplist */ > - unsigned curr_skiplist_depth; > - > unsigned prev_lcore; /**< used for lcore round robin */ > > /** running timer on this lcore now */ > @@ -81,6 +84,10 @@ struct priv_timer { > /** per-lcore private info for timers */ > static struct priv_timer priv_timer[RTE_MAX_LCORE]; > > +/** cache of IDs of enabled lcores */ > +static unsigned int enabled_lcores[RTE_MAX_LCORE]; > +static int n_enabled_lcores; > + > /* when debug is enabled, store some statistics */ > #ifdef RTE_LIBRTE_TIMER_DEBUG > #define __TIMER_STAT_ADD(name, n) do { \ > @@ -96,14 +103,25 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE]; > void > rte_timer_subsystem_init(void) > { > - unsigned lcore_id; > + unsigned int lcore_id1, lcore_id2; > + struct skiplist *list; > + int i, j; > + > + RTE_LCORE_FOREACH(lcore_id1) > + enabled_lcores[n_enabled_lcores++] = lcore_id1; > > /* since priv_timer is static, it's zeroed by default, so only init some > * fields. 
> */ > - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) { > - rte_spinlock_init(&priv_timer[lcore_id].list_lock); > - priv_timer[lcore_id].prev_lcore = lcore_id; > + for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores; > + lcore_id1 = enabled_lcores[++i]) { > + priv_timer[lcore_id1].prev_lcore = lcore_id1; > + > + for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores; > + lcore_id2 = enabled_lcores[++j]) { > + list = &priv_timer[lcore_id1].pending_lists[lcore_id2]; > + rte_spinlock_init(&list->lock); > + } > } > } > > @@ -114,7 +132,8 @@ rte_timer_init(struct rte_timer *tim) > union rte_timer_status status; > > status.state = RTE_TIMER_STOP; > - status.owner = RTE_TIMER_NO_OWNER; > + status.installer = RTE_TIMER_NO_LCORE; > + status.owner = RTE_TIMER_NO_LCORE; > tim->status.u32 = status.u32; > } > > @@ -142,7 +161,7 @@ timer_set_config_state(struct rte_timer *tim, > * or ready to run on local core, exit > */ > if (prev_status.state == RTE_TIMER_RUNNING && > - (prev_status.owner != (uint16_t)lcore_id || > + (prev_status.owner != (int)lcore_id || > tim != priv_timer[lcore_id].running_tim)) > return -1; > > @@ -153,7 +172,8 @@ timer_set_config_state(struct rte_timer *tim, > /* here, we know that timer is stopped or pending, > * mark it atomically as being configured */ > status.state = RTE_TIMER_CONFIG; > - status.owner = (int16_t)lcore_id; > + status.installer = RTE_TIMER_NO_LCORE; > + status.owner = lcore_id; > success = rte_atomic32_cmpset(&tim->status.u32, > prev_status.u32, > status.u32); > @@ -185,7 +205,8 @@ timer_set_running_state(struct rte_timer *tim) > /* here, we know that timer is stopped or pending, > * mark it atomically as being configured */ > status.state = RTE_TIMER_RUNNING; > - status.owner = (int16_t)lcore_id; > + status.installer = prev_status.installer; > + status.owner = lcore_id; > success = rte_atomic32_cmpset(&tim->status.u32, > prev_status.u32, > status.u32); > @@ -236,11 +257,11 @@ timer_get_skiplist_level(unsigned curr_depth) > * are <= that time value. > */ > static void > -timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, > +timer_get_prev_entries(uint64_t time_val, struct skiplist *list, > struct rte_timer **prev) > { > - unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth; > - prev[lvl] = &priv_timer[tim_lcore].pending_head; > + unsigned int lvl = list->depth; > + prev[lvl] = &list->head; > while(lvl != 0) { > lvl--; > prev[lvl] = prev[lvl+1]; > @@ -255,15 +276,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, > * all skiplist levels. 
> */ > static void > -timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore, > +timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list, > struct rte_timer **prev) > { > int i; > /* to get a specific entry in the list, look for just lower than the time > * values, and then increment on each level individually if necessary > */ > - timer_get_prev_entries(tim->expire - 1, tim_lcore, prev); > - for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) { > + timer_get_prev_entries(tim->expire - 1, list, prev); > + for (i = list->depth - 1; i >= 0; i--) { > while (prev[i]->sl_next[i] != NULL && > prev[i]->sl_next[i] != tim && > prev[i]->sl_next[i]->expire <= tim->expire) > @@ -282,25 +303,25 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) > unsigned lcore_id = rte_lcore_id(); > unsigned lvl; > struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; > + struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id]; > > /* if timer needs to be scheduled on another core, we need to > * lock the list; if it is on local core, we need to lock if > * we are not called from rte_timer_manage() */ > if (tim_lcore != lcore_id || !local_is_locked) > - rte_spinlock_lock(&priv_timer[tim_lcore].list_lock); > + rte_spinlock_lock(&list->lock); > > /* find where exactly this element goes in the list of elements > * for each depth. */ > - timer_get_prev_entries(tim->expire, tim_lcore, prev); > + timer_get_prev_entries(tim->expire, list, prev); > > /* now assign it a new level and add at that level */ > - const unsigned tim_level = timer_get_skiplist_level( > - priv_timer[tim_lcore].curr_skiplist_depth); > - if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth) > - priv_timer[tim_lcore].curr_skiplist_depth++; > + const unsigned int tim_level = timer_get_skiplist_level(list->depth); > + if (tim_level == list->depth) > + list->depth++; > > lvl = tim_level; > - while (lvl > 0) { > + while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) { > tim->sl_next[lvl] = prev[lvl]->sl_next[lvl]; > prev[lvl]->sl_next[lvl] = tim; > lvl--; > @@ -310,11 +331,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) > > /* save the lowest list entry into the expire field of the dummy hdr > * NOTE: this is not atomic on 32-bit*/ > - priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\ > - pending_head.sl_next[0]->expire; > + list->head.expire = list->head.sl_next[0]->expire; > > if (tim_lcore != lcore_id || !local_is_locked) > - rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock); > + rte_spinlock_unlock(&list->lock); > } > > /* > @@ -330,35 +350,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status, > unsigned prev_owner = prev_status.owner; > int i; > struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; > + struct skiplist *list; > + > + list = &priv_timer[prev_owner].pending_lists[prev_status.installer]; > > /* if timer needs is pending another core, we need to lock the > * list; if it is on local core, we need to lock if we are not > * called from rte_timer_manage() */ > if (prev_owner != lcore_id || !local_is_locked) > - rte_spinlock_lock(&priv_timer[prev_owner].list_lock); > + rte_spinlock_lock(&list->lock); > > /* save the lowest list entry into the expire field of the dummy hdr. > * NOTE: this is not atomic on 32-bit */ > - if (tim == priv_timer[prev_owner].pending_head.sl_next[0]) > - priv_timer[prev_owner].pending_head.expire = > - ((tim->sl_next[0] == NULL) ? 
0 : tim->sl_next[0]->expire); > + if (tim == list->head.sl_next[0]) > + list->head.expire = ((tim->sl_next[0] == NULL) ? > + 0 : tim->sl_next[0]->expire); > > /* adjust pointers from previous entries to point past this */ > - timer_get_prev_entries_for_node(tim, prev_owner, prev); > - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) { > + timer_get_prev_entries_for_node(tim, list, prev); > + for (i = list->depth - 1; i >= 0; i--) { > if (prev[i]->sl_next[i] == tim) > prev[i]->sl_next[i] = tim->sl_next[i]; > } > > /* in case we deleted last entry at a level, adjust down max level */ > - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) > - if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL) > - priv_timer[prev_owner].curr_skiplist_depth --; > + for (i = list->depth - 1; i >= 0; i--) > + if (list->head.sl_next[i] == NULL) > + list->depth--; > else > break; > > if (prev_owner != lcore_id || !local_is_locked) > - rte_spinlock_unlock(&priv_timer[prev_owner].list_lock); > + rte_spinlock_unlock(&list->lock); > } > > /* Reset and start the timer associated with the timer handle (private func) */ > @@ -416,7 +439,8 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, > * the state so we don't need to use cmpset() here */ > rte_wmb(); > status.state = RTE_TIMER_PENDING; > - status.owner = (int16_t)tim_lcore; > + status.installer = lcore_id; > + status.owner = tim_lcore; > tim->status.u32 = status.u32; > > return 0; > @@ -484,7 +508,8 @@ rte_timer_stop(struct rte_timer *tim) > /* mark timer as stopped */ > rte_wmb(); > status.state = RTE_TIMER_STOP; > - status.owner = RTE_TIMER_NO_OWNER; > + status.installer = RTE_TIMER_NO_LCORE; > + status.owner = RTE_TIMER_NO_LCORE; > tim->status.u32 = status.u32; > > return 0; > @@ -506,119 +531,179 @@ rte_timer_pending(struct rte_timer *tim) > } > > /* must be called periodically, run all timer that expired */ > -void rte_timer_manage(void) > +void > +rte_timer_manage(void) > { > union rte_timer_status status; > - struct rte_timer *tim, *next_tim; > - struct rte_timer *run_first_tim, **pprev; > - unsigned lcore_id = rte_lcore_id(); > + struct rte_timer *tim, *next_tim, **pprev; > + struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1]; > struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1]; > - uint64_t cur_time; > - int i, ret; > + struct priv_timer *priv_tim; > + unsigned int installer_lcore, lcore_id = rte_lcore_id(); > + uint64_t cur_time, min_expire; > + int i, j, min_idx, ret; > + int nrunlists = 0; > + int local_is_locked = 0; > + struct skiplist *dest_list, *list = NULL; > + bool done; > > /* timer manager only runs on EAL thread with valid lcore_id */ > assert(lcore_id < RTE_MAX_LCORE); > > + priv_tim = &priv_timer[lcore_id]; > + > __TIMER_STAT_ADD(manage, 1); > - /* optimize for the case where per-cpu list is empty */ > - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) > - return; > - cur_time = rte_get_timer_cycles(); > + for (i = 0, installer_lcore = enabled_lcores[i]; i < n_enabled_lcores; > + installer_lcore = enabled_lcores[++i]) { > + list = &priv_tim->pending_lists[installer_lcore]; > + > + /* optimize for the case where list is empty */ > + if (list->head.sl_next[0] == NULL) > + continue; > + cur_time = rte_get_timer_cycles(); > > #ifdef RTE_ARCH_X86_64 > - /* on 64-bit the value cached in the pending_head.expired will be > - * updated atomically, so we can consult that for a quick check here > - * outside the lock */ > - if (likely(priv_timer[lcore_id].pending_head.expire > cur_time)) > 
- return; > + /* on 64-bit the value cached in the list->head.expired will be > + * updated atomically, so we can consult that for a quick check > + * here outside the lock > + */ > + if (likely(list->head.expire > cur_time)) > + continue; > #endif > > - /* browse ordered list, add expired timers in 'expired' list */ > - rte_spinlock_lock(&priv_timer[lcore_id].list_lock); > + /* browse ordered list, add expired timers in 'expired' list */ > + rte_spinlock_lock(&list->lock); > > - /* if nothing to do just unlock and return */ > - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL || > - priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) { > - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); > - return; > - } > + /* if nothing to do just unlock and continue */ > + if (list->head.sl_next[0] == NULL || > + list->head.sl_next[0]->expire > cur_time) { > + rte_spinlock_unlock(&list->lock); > + continue; > + } > > - /* save start of list of expired timers */ > - tim = priv_timer[lcore_id].pending_head.sl_next[0]; > + /* save start of list of expired timers */ > + tim = list->head.sl_next[0]; > + > + /* break the existing list at current time point */ > + timer_get_prev_entries(cur_time, list, prev); > + for (j = list->depth - 1; j >= 0; j--) { > + if (prev[j] == &list->head) > + continue; > + list->head.sl_next[j] = > + prev[j]->sl_next[j]; > + if (prev[j]->sl_next[j] == NULL) > + list->depth--; > + prev[j]->sl_next[j] = NULL; > + } > > - /* break the existing list at current time point */ > - timer_get_prev_entries(cur_time, lcore_id, prev); > - for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) { > - if (prev[i] == &priv_timer[lcore_id].pending_head) > - continue; > - priv_timer[lcore_id].pending_head.sl_next[i] = > - prev[i]->sl_next[i]; > - if (prev[i]->sl_next[i] == NULL) > - priv_timer[lcore_id].curr_skiplist_depth--; > - prev[i] ->sl_next[i] = NULL; > - } > + /* transition run-list from PENDING to RUNNING */ > + run_first_tims[nrunlists] = tim; > + pprev = &run_first_tims[nrunlists]; > + nrunlists++; > + > + for (; tim != NULL; tim = next_tim) { > + next_tim = tim->sl_next[0]; > + > + ret = timer_set_running_state(tim); > + if (likely(ret == 0)) { > + pprev = &tim->sl_next[0]; > + } else { > + /* another core is trying to re-config this one, > + * remove it from local expired list > + */ > + *pprev = next_tim; > + } > + } > > - /* transition run-list from PENDING to RUNNING */ > - run_first_tim = tim; > - pprev = &run_first_tim; > + /* update the next to expire timer value */ > + list->head.expire = (list->head.sl_next[0] == NULL) ? > + 0 : list->head.sl_next[0]->expire; > > - for ( ; tim != NULL; tim = next_tim) { > - next_tim = tim->sl_next[0]; > + rte_spinlock_unlock(&list->lock); > + } > > - ret = timer_set_running_state(tim); > - if (likely(ret == 0)) { > - pprev = &tim->sl_next[0]; > - } else { > - /* another core is trying to re-config this one, > - * remove it from local expired list > - */ > - *pprev = next_tim; > + /* Now process the run lists */ > + while (1) { > + done = true; > + min_expire = UINT64_MAX; > + min_idx = 0; > + > + /* Find the next oldest timer to process */ > + for (i = 0; i < nrunlists; i++) { > + tim = run_first_tims[i]; > + > + if (tim != NULL && tim->expire < min_expire) { > + min_expire = tim->expire; > + min_idx = i; > + done = false; > + } > } > - } > > - /* update the next to expire timer value */ > - priv_timer[lcore_id].pending_head.expire = > - (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 
0 : > - priv_timer[lcore_id].pending_head.sl_next[0]->expire; > + if (done) > + break; > + > + tim = run_first_tims[min_idx]; > > - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); > + /* Move down the runlist from which we picked a timer to > + * execute > + */ > + run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0]; > > - /* now scan expired list and call callbacks */ > - for (tim = run_first_tim; tim != NULL; tim = next_tim) { > - next_tim = tim->sl_next[0]; > - priv_timer[lcore_id].updated = 0; > - priv_timer[lcore_id].running_tim = tim; > + priv_tim->updated = 0; > + priv_tim->running_tim = tim; > > /* execute callback function with list unlocked */ > tim->f(tim, tim->arg); > > __TIMER_STAT_ADD(pending, -1); > /* the timer was stopped or reloaded by the callback > - * function, we have nothing to do here */ > - if (priv_timer[lcore_id].updated == 1) > + * function, we have nothing to do here > + */ > + if (priv_tim->updated == 1) > continue; > > if (tim->period == 0) { > /* remove from done list and mark timer as stopped */ > status.state = RTE_TIMER_STOP; > - status.owner = RTE_TIMER_NO_OWNER; > + status.installer = RTE_TIMER_NO_LCORE; > + status.owner = RTE_TIMER_NO_LCORE; > rte_wmb(); > tim->status.u32 = status.u32; > } > else { > - /* keep it in list and mark timer as pending */ > - rte_spinlock_lock(&priv_timer[lcore_id].list_lock); > + dest_list = &priv_tim->pending_lists[lcore_id]; > + > + /* If the destination list is the current list > + * we can acquire the lock here, and hold it > + * across removal and insertion of the timer. > + */ > + if (list == dest_list) { > + rte_spinlock_lock(&list->lock); > + local_is_locked = 1; > + } > + > + /* set timer state back to PENDING and > + * reinsert it in pending list > + */ > status.state = RTE_TIMER_PENDING; > __TIMER_STAT_ADD(pending, 1); > - status.owner = (int16_t)lcore_id; > + status.installer = tim->status.installer; > + status.owner = lcore_id; > rte_wmb(); > tim->status.u32 = status.u32; > - __rte_timer_reset(tim, tim->expire + tim->period, > - tim->period, lcore_id, tim->f, tim->arg, 1); > - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); > + > + __rte_timer_reset(tim, > + tim->expire + tim->period, > + tim->period, lcore_id, > + tim->f, tim->arg, local_is_locked); > + > + if (local_is_locked) { > + rte_spinlock_unlock(&list->lock); > + local_is_locked = 0; > + } > } > } > - priv_timer[lcore_id].running_tim = NULL; > + priv_tim->running_tim = NULL; > } > > /* dump statistics about timers */ > diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h > index a276a73..4cc6baf 100644 > --- a/lib/librte_timer/rte_timer.h > +++ b/lib/librte_timer/rte_timer.h > @@ -77,7 +77,7 @@ extern "C" { > #define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */ > #define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */ > > -#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */ > +#define RTE_TIMER_NO_LCORE -2 > > /** > * Timer type: Periodic or single (one-shot). > @@ -94,10 +94,11 @@ enum rte_timer_type { > union rte_timer_status { > RTE_STD_C11 > struct { > - uint16_t state; /**< Stop, pending, running, config. */ > - int16_t owner; /**< The lcore that owns the timer. */ > + unsigned state : 2; > + int installer : 15; > + int owner : 15; > }; > - uint32_t u32; /**< To atomic-set status + owner. */ > + uint32_t u32; /**< To atomic-set status, installer, owner */ > }; > > #ifdef RTE_LIBRTE_TIMER_DEBUG > -- > 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/3] timer: add per-installer pending lists for each lcore 2017-08-29 10:57 ` Ananyev, Konstantin @ 2017-08-29 16:13 ` Carrillo, Erik G 0 siblings, 0 replies; 30+ messages in thread From: Carrillo, Erik G @ 2017-08-29 16:13 UTC (permalink / raw) To: Ananyev, Konstantin, rsanford; +Cc: dev, Stephen Hemminger Hi Konstantin, > -----Original Message----- > From: Ananyev, Konstantin > Sent: Tuesday, August 29, 2017 5:57 AM > To: Carrillo, Erik G <erik.g.carrillo@intel.com>; rsanford@akamai.com > Cc: dev@dpdk.org > Subject: RE: [dpdk-dev] [PATCH v2 1/3] timer: add per-installer pending lists > for each lcore > > Hi Gabriel, > > > > > Instead of each priv_timer struct containing a single skiplist, this > > commit adds a skiplist for each enabled lcore to priv_timer. In the > > case that multiple lcores repeatedly install timers on the same target > > lcore, this change reduces lock contention for the target lcore's > > skiplists and increases performance. > > I am not an rte_timer expert, but there is one thing that worries me: > It seems that the complexity of timer_manage() has increased quite a bit with that patch: > now it has to check/process up to RTE_MAX_LCORE skiplists instead of one, > and it also somehow has to properly sort up to RTE_MAX_LCORE lists of > retrieved (ready-to-run) timers. > Wouldn't all that affect its running time? > Yes, it would indeed increase it. > I understand your intention to reduce lock contention, but I suppose at least > it could be done in a configurable way. > Say we allow the user to specify the dimension of pending_lists[] at init phase or so. > Then a timer from lcore_id=N will end up in > pending_lists[N%RTE_DIM(pending_lists)]. > This is a neat idea, and seemingly would allow the original performance to be maintained for applications where contention is not an issue. I'll look into this change, as it may address other developers' concerns as well - thanks for the suggestion. > Another thought - it might be better to divide the pending timers list not by client > (lcore) id, but by expiration time - some analog of a timer wheel or so. > That, I think, might greatly decrease the probability that timer_manage() and > timer_add() will try to access the same list. > On the other hand, timer_manage() would still have to consume the skip-lists one > by one. > Though I suppose that's quite a radical change from what we have right now. > Konstantin > Thanks, Gabriel ^ permalink raw reply [flat|nested] 30+ messages in thread
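For concreteness, Konstantin's suggestion could be sketched as follows (hypothetical: neither rte_timer_subsystem_init_ext() nor n_pending_lists exists in the patchset or the library). The number of per-target lists becomes an init-time knob, and installer lcore N hashes into list N % n_pending_lists, trading lock contention against the number of lists timer_manage() must walk:

#include <rte_lcore.h>

static unsigned int n_pending_lists = 1;	/* default: single-list behavior */

/* hypothetical init variant letting the application size the array */
void
rte_timer_subsystem_init_ext(unsigned int n_lists)
{
	if (n_lists == 0)
		n_lists = 1;
	if (n_lists > RTE_MAX_LCORE)
		n_lists = RTE_MAX_LCORE;
	n_pending_lists = n_lists;
	/* ... allocate and init n_pending_lists skiplists per target lcore ... */
}

/* installer lcore N lands in pending list N % n_pending_lists on the target */
static inline unsigned int
pending_list_idx(unsigned int installer_lcore)
{
	return installer_lcore % n_pending_lists;
}

With n_pending_lists = 1 this degenerates to the original single-list design; with n_pending_lists = RTE_MAX_LCORE it matches the patchset.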
* Re: [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore 2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo 2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo @ 2017-08-29 15:11 ` Stephen Hemminger 1 sibling, 0 replies; 30+ messages in thread From: Stephen Hemminger @ 2017-08-29 15:11 UTC (permalink / raw) To: Gabriel Carrillo; +Cc: rsanford, dev On Wed, 23 Aug 2017 09:47:22 -0500 Gabriel Carrillo <erik.g.carrillo@intel.com> wrote: > __TIMER_STAT_ADD(manage, 1); > - /* optimize for the case where per-cpu list is empty */ > - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) > - return; > - cur_time = rte_get_timer_cycles(); > + for (i = 0, installer_lcore = enabled_lcores[i]; i < n_enabled_lcores; > + installer_lcore = enabled_lcores[++i]) { > + list = &priv_tim->pending_lists[installer_lcore]; > + > + /* optimize for the case where list is empty */ > + if (list->head.sl_next[0] == NULL) > + continue; > + cur_time = rte_get_timer_cycles(); This code is critical. We expect applications using timers to call timer_manage very often, and the cases where no timers are present or no timer is due must consume as few cycles as possible. This change will add significant performance delays to these applications. ^ permalink raw reply [flat|nested] 30+ messages in thread
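One way to reconcile the patch with Stephen's fast-path requirement (a sketch only, not part of any posted patch) is to cache, per target lcore, the soonest expiry across all of its per-installer lists, so the no-timer and none-due cases cost one load and one branch regardless of how many lists exist:

#include <stdint.h>

/* hypothetical per-lcore cache; each installer updates it under its own
 * list lock when it changes the head of its skiplist */
struct manage_fastpath {
	volatile uint64_t soonest_expire;	/* UINT64_MAX when nothing pending */
};

/* returns nonzero when rte_timer_manage() can return immediately */
static inline int
nothing_due(const struct manage_fastpath *fp, uint64_t cur_time)
{
	return fp->soonest_expire > cur_time;
}

Keeping the cache coherent as installers add earlier-expiring timers is the hard part of such a scheme; on 64-bit targets the single-word value can at least be read without a lock, as the existing code already does for list->head.expire.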
* [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads 2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo 2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo @ 2017-08-23 14:47 ` Gabriel Carrillo 2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo 2017-08-23 14:47 ` [dpdk-dev] [PATCH 3/3] doc: update timer lib docs Gabriel Carrillo ` (2 subsequent siblings) 4 siblings, 1 reply; 30+ messages in thread From: Gabriel Carrillo @ 2017-08-23 14:47 UTC (permalink / raw) To: rsanford; +Cc: dev This commit adds support for timers being created from non-EAL threads; it maps timers from all such threads to lcore id RTE_MAX_LCORE, and puts them all in a corresponding skiplist. Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com> --- lib/librte_timer/rte_timer.c | 48 ++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c index 2db74c1..a493874 100644 --- a/lib/librte_timer/rte_timer.c +++ b/lib/librte_timer/rte_timer.c @@ -64,8 +64,8 @@ struct skiplist { } __rte_cache_aligned; struct priv_timer { - /** one pending list per enabled lcore */ - struct skiplist pending_lists[RTE_MAX_LCORE]; + /** one pending list per lcore, plus one for non-EAL threads */ + struct skiplist pending_lists[RTE_MAX_LCORE + 1]; /** per-core variable that true if a timer was updated on this * core since last reset of the variable */ @@ -85,7 +85,7 @@ struct priv_timer { static struct priv_timer priv_timer[RTE_MAX_LCORE]; /** cache of IDs of enabled lcores */ -static unsigned enabled_lcores[RTE_MAX_LCORE]; +static unsigned enabled_lcores[RTE_MAX_LCORE + 1]; static int n_enabled_lcores; /* when debug is enabled, store some statistics */ @@ -103,23 +103,33 @@ static int n_enabled_lcores; void rte_timer_subsystem_init(void) { - unsigned lcore_id1, lcore_id2; + unsigned target_lcore, installer_lcore; struct skiplist *list; + struct priv_timer *priv_tim; int i, j; - RTE_LCORE_FOREACH(lcore_id1) - enabled_lcores[n_enabled_lcores++] = lcore_id1; + RTE_LCORE_FOREACH(target_lcore) + enabled_lcores[n_enabled_lcores++] = target_lcore; + + /* To handle timers coming from non-EAL threads */ + enabled_lcores[n_enabled_lcores++] = RTE_MAX_LCORE; /* since priv_timer is static, it's zeroed by default, so only init some * fields. 
*/ - for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores; - lcore_id1 = enabled_lcores[++i]) { - priv_timer[lcore_id1].prev_lcore = lcore_id1; + for (i = 0, target_lcore = enabled_lcores[i]; i < n_enabled_lcores; + target_lcore = enabled_lcores[++i]) { + /* Don't use this value to index the priv_timer array */ + if (target_lcore == RTE_MAX_LCORE) + continue; - for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores; - lcore_id2 = enabled_lcores[++j]) { - list = &priv_timer[lcore_id1].pending_lists[lcore_id2]; + priv_tim = &priv_timer[target_lcore]; + priv_tim->prev_lcore = target_lcore; + + for (j = 0, installer_lcore = enabled_lcores[j]; + j < n_enabled_lcores; + installer_lcore = enabled_lcores[++j]) { + list = &priv_tim->pending_lists[installer_lcore]; rte_spinlock_init(&list->lock); } } @@ -300,10 +310,16 @@ timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list, static void timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) { - unsigned lcore_id = rte_lcore_id(); + unsigned installer_lcore, lcore_id = rte_lcore_id(); unsigned lvl; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; - struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id]; + struct skiplist *list; + + /* Check if timer being installed from non-EAL thread */ + installer_lcore = (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE : + lcore_id; + + list = &priv_timer[tim_lcore].pending_lists[installer_lcore]; /* if timer needs to be scheduled on another core, we need to * lock the list; if it is on local core, we need to lock if @@ -439,7 +455,9 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, * the state so we don't need to use cmpset() here */ rte_wmb(); status.state = RTE_TIMER_PENDING; - status.installer = lcore_id; + /* Check if installer is non-EAL thread */ + status.installer = (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE : + lcore_id; status.owner = tim_lcore; tim->status.u32 = status.u32; -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
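The installer-index mapping that the patch applies in both timer_add() and __rte_timer_reset() can be restated as a single helper (a restatement for clarity; the patch open-codes the ternary instead):

#include <rte_lcore.h>

/* Map the calling thread to a pending-list slot on the target lcore:
 * EAL threads use their own lcore id, while non-EAL threads, which see
 * rte_lcore_id() == LCORE_ID_ANY, share the extra slot at RTE_MAX_LCORE. */
static inline unsigned int
installer_index(void)
{
	unsigned int lcore_id = rte_lcore_id();

	return (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE : lcore_id;
}

All non-EAL threads therefore share one skiplist and one lock, which is the trade-off this patch makes to keep the array bounded at RTE_MAX_LCORE + 1 entries.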
* [dpdk-dev] [PATCH v2 2/3] timer: handle timers installed from non-EAL threads 2017-08-23 14:47 ` [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads Gabriel Carrillo @ 2017-08-25 20:27 ` Gabriel Carrillo 0 siblings, 0 replies; 30+ messages in thread From: Gabriel Carrillo @ 2017-08-25 20:27 UTC (permalink / raw) To: rsanford; +Cc: dev This commit adds support for timers being created from non-EAL threads; it maps timers from all such threads to lcore id RTE_MAX_LCORE, and puts them all in a corresponding skiplist. Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com> --- v2: * Address checkpatch warnings lib/librte_timer/rte_timer.c | 48 ++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c index da2fc1a..ca7b80c 100644 --- a/lib/librte_timer/rte_timer.c +++ b/lib/librte_timer/rte_timer.c @@ -64,8 +64,8 @@ struct skiplist { } __rte_cache_aligned; struct priv_timer { - /** one pending list per enabled lcore */ - struct skiplist pending_lists[RTE_MAX_LCORE]; + /** one pending list per lcore, plus one for non-EAL threads */ + struct skiplist pending_lists[RTE_MAX_LCORE + 1]; /** per-core variable that true if a timer was updated on this * core since last reset of the variable */ @@ -85,7 +85,7 @@ struct priv_timer { static struct priv_timer priv_timer[RTE_MAX_LCORE]; /** cache of IDs of enabled lcores */ -static unsigned int enabled_lcores[RTE_MAX_LCORE]; +static unsigned int enabled_lcores[RTE_MAX_LCORE + 1]; static int n_enabled_lcores; /* when debug is enabled, store some statistics */ @@ -103,23 +103,33 @@ static int n_enabled_lcores; void rte_timer_subsystem_init(void) { - unsigned int lcore_id1, lcore_id2; + unsigned int target_lcore, installer_lcore; struct skiplist *list; + struct priv_timer *priv_tim; int i, j; - RTE_LCORE_FOREACH(lcore_id1) - enabled_lcores[n_enabled_lcores++] = lcore_id1; + RTE_LCORE_FOREACH(target_lcore) + enabled_lcores[n_enabled_lcores++] = target_lcore; + + /* To handle timers coming from non-EAL threads */ + enabled_lcores[n_enabled_lcores++] = RTE_MAX_LCORE; /* since priv_timer is static, it's zeroed by default, so only init some * fields. 
*/ - for (i = 0, lcore_id1 = enabled_lcores[i]; i < n_enabled_lcores; - lcore_id1 = enabled_lcores[++i]) { - priv_timer[lcore_id1].prev_lcore = lcore_id1; + for (i = 0, target_lcore = enabled_lcores[i]; i < n_enabled_lcores; + target_lcore = enabled_lcores[++i]) { + /* Don't use this value to index the priv_timer array */ + if (target_lcore == RTE_MAX_LCORE) + continue; - for (j = 0, lcore_id2 = enabled_lcores[j]; j < n_enabled_lcores; - lcore_id2 = enabled_lcores[++j]) { - list = &priv_timer[lcore_id1].pending_lists[lcore_id2]; + priv_tim = &priv_timer[target_lcore]; + priv_tim->prev_lcore = target_lcore; + + for (j = 0, installer_lcore = enabled_lcores[j]; + j < n_enabled_lcores; + installer_lcore = enabled_lcores[++j]) { + list = &priv_tim->pending_lists[installer_lcore]; rte_spinlock_init(&list->lock); } } @@ -300,10 +310,16 @@ timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list, static void timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) { - unsigned lcore_id = rte_lcore_id(); + unsigned int installer_lcore, lcore_id = rte_lcore_id(); unsigned lvl; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; - struct skiplist *list = &priv_timer[tim_lcore].pending_lists[lcore_id]; + struct skiplist *list; + + /* Check if timer being installed from non-EAL thread */ + installer_lcore = (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE : + lcore_id; + + list = &priv_timer[tim_lcore].pending_lists[installer_lcore]; /* if timer needs to be scheduled on another core, we need to * lock the list; if it is on local core, we need to lock if @@ -439,7 +455,9 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, * the state so we don't need to use cmpset() here */ rte_wmb(); status.state = RTE_TIMER_PENDING; - status.installer = lcore_id; + /* Check if installer is non-EAL thread */ + status.installer = (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE : + lcore_id; status.owner = tim_lcore; tim->status.u32 = status.u32; -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH 3/3] doc: update timer lib docs 2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo 2017-08-23 14:47 ` [dpdk-dev] [PATCH 1/3] timer: add per-installer pending lists for each lcore Gabriel Carrillo 2017-08-23 14:47 ` [dpdk-dev] [PATCH 2/3] timer: handle timers installed from non-EAL threads Gabriel Carrillo @ 2017-08-23 14:47 ` Gabriel Carrillo 2017-08-25 20:27 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo 2017-08-23 15:02 ` [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Wiles, Keith 2017-08-25 20:26 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo 4 siblings, 1 reply; 30+ messages in thread From: Gabriel Carrillo @ 2017-08-23 14:47 UTC (permalink / raw) To: rsanford; +Cc: dev This change updates the timer library documentation to reflect a change to the organization of the skiplists in the implementation. Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com> --- doc/guides/prog_guide/timer_lib.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/guides/prog_guide/timer_lib.rst b/doc/guides/prog_guide/timer_lib.rst index f437417..f94ffaa 100644 --- a/doc/guides/prog_guide/timer_lib.rst +++ b/doc/guides/prog_guide/timer_lib.rst @@ -1,5 +1,5 @@ .. BSD LICENSE - Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + Copyright(c) 2010-2017 Intel Corporation. All rights reserved. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -53,16 +53,17 @@ Refer to the `callout manual <http://www.daemon-systems.org/man/callout.9.html>` Implementation Details ---------------------- -Timers are tracked on a per-lcore basis, -with all pending timers for a core being maintained in order of timer expiry in a skiplist data structure. -The skiplist used has ten levels and each entry in the table appears in each level with probability ¼^level. +Timers are tracked in a per-lcore array of skiplist data structures; each +lcore has one skiplist corresponding to each other lcore that could load a timer on it. All pending +timers in each skiplist are maintained in order of timer expiry. +Each skiplist has ten levels and each entry in the table appears in each level with probability ¼^level. This means that all entries are present in level 0, 1 in every 4 entries is present at level 1, one in every 16 at level 2 and so on up to level 9. This means that adding and removing entries from the timer list for a core can be done in log(n) time, up to 4^10 entries, that is, approximately 1,000,000 timers per lcore. A timer structure contains a special field called status, -which is a union of a timer state (stopped, pending, running, config) and an owner (lcore id). +which is a union of a timer state (stopped, pending, running, config), an installer (lcore id), and an owner (lcore id). Depending on the timer state, we know if a timer is present in a list or not: * STOPPED: no owner, not in a list @@ -77,17 +78,17 @@ Resetting or stopping a timer while it is in a CONFIG or RUNNING state is not al When modifying the state of a timer, a Compare And Swap instruction should be used to guarantee that the status (state+owner) is modified atomically. -Inside the rte_timer_manage() function, -the skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries, +Inside the rte_timer_manage() function, each of an lcore's skiplists is traversed in sequence. 
+Each skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries, until an entry which has not yet expired has been encountered. -To improve performance in the case where there are entries in the timer list but none of those timers have yet expired, +To improve performance in the case where there are entries in a skiplist but none of those timers have yet expired, the expiry time of the first list entry is maintained within the per-core timer list structure itself. On 64-bit platforms, this value can be checked without the need to take a lock on the overall structure. (Since expiry times are maintained as 64-bit values, a check on the value cannot be done on 32-bit platforms without using either a compare-and-swap (CAS) instruction or using a lock, so this additional check is skipped in favor of checking as normal once the lock has been taken.) On both 64-bit and 32-bit platforms, -a call to rte_timer_manage() returns without taking a lock in the case where the timer list for the calling core is empty. +rte_timer_manage() can continue on to an lcore's next skiplist without taking a lock in the case where a timer list is empty. Use Cases --------- -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
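For readers unfamiliar with the ¼^level property mentioned in the guide, skiplist insertion levels are typically chosen by repeated 1-in-4 coin flips, e.g. (an illustrative sketch, not the library's exact code):

#include <rte_random.h>

#define MAX_SKIPLIST_DEPTH 10

/* keep climbing with probability 1/4 per level, so a fraction (1/4)^level
 * of entries reaches any given level and level 0 holds every entry */
static unsigned int
skiplist_random_level(void)
{
	unsigned int level = 0;

	while (level < MAX_SKIPLIST_DEPTH - 1 && (rte_rand() & 3) == 0)
		level++;
	return level;
}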
* [dpdk-dev] [PATCH v2 3/3] doc: update timer lib docs 2017-08-23 14:47 ` [dpdk-dev] [PATCH 3/3] doc: update timer lib docs Gabriel Carrillo @ 2017-08-25 20:27 ` Gabriel Carrillo 0 siblings, 0 replies; 30+ messages in thread From: Gabriel Carrillo @ 2017-08-25 20:27 UTC (permalink / raw) To: rsanford; +Cc: dev This change updates the timer library documentation to reflect a change to the organization of the skiplists in the implementation. Signed-off-by: Gabriel Carrillo <erik.g.carrillo@intel.com> --- doc/guides/prog_guide/timer_lib.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/guides/prog_guide/timer_lib.rst b/doc/guides/prog_guide/timer_lib.rst index f437417..f94ffaa 100644 --- a/doc/guides/prog_guide/timer_lib.rst +++ b/doc/guides/prog_guide/timer_lib.rst @@ -1,5 +1,5 @@ .. BSD LICENSE - Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + Copyright(c) 2010-2017 Intel Corporation. All rights reserved. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -53,16 +53,17 @@ Refer to the `callout manual <http://www.daemon-systems.org/man/callout.9.html>` Implementation Details ---------------------- -Timers are tracked on a per-lcore basis, -with all pending timers for a core being maintained in order of timer expiry in a skiplist data structure. -The skiplist used has ten levels and each entry in the table appears in each level with probability ¼^level. +Timers are tracked in a per-lcore array of skiplist data structures; each +lcore has one skiplist corresponding to each other lcore that could load a timer on it. All pending +timers in each skiplist are maintained in order of timer expiry. +Each skiplist has ten levels and each entry in the table appears in each level with probability ¼^level. This means that all entries are present in level 0, 1 in every 4 entries is present at level 1, one in every 16 at level 2 and so on up to level 9. This means that adding and removing entries from the timer list for a core can be done in log(n) time, up to 4^10 entries, that is, approximately 1,000,000 timers per lcore. A timer structure contains a special field called status, -which is a union of a timer state (stopped, pending, running, config) and an owner (lcore id). +which is a union of a timer state (stopped, pending, running, config), an installer (lcore id), and an owner (lcore id). Depending on the timer state, we know if a timer is present in a list or not: * STOPPED: no owner, not in a list @@ -77,17 +78,17 @@ Resetting or stopping a timer while it is in a CONFIG or RUNNING state is not al When modifying the state of a timer, a Compare And Swap instruction should be used to guarantee that the status (state+owner) is modified atomically. -Inside the rte_timer_manage() function, -the skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries, +Inside the rte_timer_manage() function, each of an lcore's skiplists is traversed in sequence. +Each skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries, until an entry which has not yet expired has been encountered. -To improve performance in the case where there are entries in the timer list but none of those timers have yet expired, +To improve performance in the case where there are entries in a skiplist but none of those timers have yet expired, the expiry time of the first list entry is maintained within the per-core timer list structure itself. 
On 64-bit platforms, this value can be checked without the need to take a lock on the overall structure. (Since expiry times are maintained as 64-bit values, a check on the value cannot be done on 32-bit platforms without using either a compare-and-swap (CAS) instruction or using a lock, so this additional check is skipped in favor of checking as normal once the lock has been taken.) On both 64-bit and 32-bit platforms, -a call to rte_timer_manage() returns without taking a lock in the case where the timer list for the calling core is empty. +rte_timer_manage() can continue on to an lcore's next skiplist without taking a lock in the case where a timer list is empty. Use Cases --------- -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** 2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo ` (2 preceding siblings ...) 2017-08-23 14:47 ` [dpdk-dev] [PATCH 3/3] doc: update timer lib docs Gabriel Carrillo @ 2017-08-23 15:02 ` Wiles, Keith 2017-08-23 16:19 ` Carrillo, Erik G 2017-08-25 20:26 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo 4 siblings, 1 reply; 30+ messages in thread From: Wiles, Keith @ 2017-08-23 15:02 UTC (permalink / raw) To: Carrillo, Erik G; +Cc: rsanford, dev > On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo <erik.g.carrillo@intel.com> wrote: > > In the current implementation of the DPDK timer library, timers can be > created and set to be handled by a target lcore by adding it to a > skiplist that corresponds to that lcore. However, if an application > enables multiple lcores, and each of these lcores repeatedly attempts > to install timers on the same target lcore, overall application > throughput will be reduced as all lcores contend to acquire the lock > guarding the single skiplist of pending timers. > > This patchset addresses this scenario by adding an array of skiplists > to each lcore's priv_timer struct, such that when lcore i installs a > timer on lcore k, the timer will be added to the ith skiplist for > lcore k. If lcore j installs a timer on lcore k simultaneously, > lcores i and j can both proceed since they will be acquiring different > locks for different lists. > > When lcore k processes its pending timers, it will traverse each skiplist > in its array and acquire a skiplist's lock while a run list is broken > out; meanwhile, all other lists can continue to be modified. Then, all > run lists for lcore k are collected and traversed together so timers are > executed in their global order. What is the performance and/or latency added to the timeout now? I worry about the case when just about all of the cores are enabled, which could be as high as 128 or more now. One option is to have the lcore j that wants to install a timer on lcore k pass a message via a ring to lcore k to add that timer. We could even add that logic into setting a timer on a different lcore than the caller in the current API. The ring would be multi-producer and single-consumer; we still have the lock. What am I missing here? > > Gabriel Carrillo (3): > timer: add per-installer pending lists for each lcore > timer: handle timers installed from non-EAL threads > doc: update timer lib docs > > doc/guides/prog_guide/timer_lib.rst | 19 ++- > lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++------------- > lib/librte_timer/rte_timer.h | 9 +- > 3 files changed, 231 insertions(+), 126 deletions(-) > > -- > 2.6.4 > Regards, Keith ^ permalink raw reply [flat|nested] 30+ messages in thread
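For concreteness, the ring-based alternative Keith sketches might look like the following (hypothetical names; nothing like this is in librte_timer): each target lcore owns one multi-producer/single-consumer rte_ring of install requests and drains it at the top of its manage call:

#include <stdio.h>
#include <rte_lcore.h>
#include <rte_memory.h>
#include <rte_ring.h>
#include <rte_timer.h>

/* one MP/SC request ring per target lcore (hypothetical design) */
static struct rte_ring *timer_req_ring[RTE_MAX_LCORE];

static int
timer_req_ring_setup(unsigned int target_lcore)
{
	char name[32];

	snprintf(name, sizeof(name), "timer_req_%u", target_lcore);
	timer_req_ring[target_lcore] = rte_ring_create(name, 1024,
			SOCKET_ID_ANY, RING_F_SC_DEQ);	/* MP enqueue, SC dequeue */
	return timer_req_ring[target_lcore] == NULL ? -1 : 0;
}

/* installer side: lcore j hands a timer to target lcore k via the ring */
static inline int
timer_install_remote(struct rte_timer *tim, unsigned int target_lcore)
{
	return rte_ring_enqueue(timer_req_ring[target_lcore], tim);
}

/* target side: drain requests before walking the local skiplist */
static inline void
timer_drain_requests(unsigned int lcore_id)
{
	void *obj;

	while (rte_ring_dequeue(timer_req_ring[lcore_id], &obj) == 0)
		; /* insert (struct rte_timer *)obj into the local pending list */
}

As the follow-up replies note, the multi-producer enqueue still serializes installers on a compare-and-set of the ring's head pointer, which is why this shape did not win in the author's profiling.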
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** 2017-08-23 15:02 ` [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Wiles, Keith @ 2017-08-23 16:19 ` Carrillo, Erik G 2017-08-23 16:50 ` Wiles, Keith 0 siblings, 1 reply; 30+ messages in thread From: Carrillo, Erik G @ 2017-08-23 16:19 UTC (permalink / raw) To: Wiles, Keith; +Cc: rsanford, dev > -----Original Message----- > From: Wiles, Keith > Sent: Wednesday, August 23, 2017 10:02 AM > To: Carrillo, Erik G <erik.g.carrillo@intel.com> > Cc: rsanford@akamai.com; dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** > > > > On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo <erik.g.carrillo@intel.com> > wrote: > > > > In the current implementation of the DPDK timer library, timers can be > > created and set to be handled by a target lcore by adding it to a > > skiplist that corresponds to that lcore. However, if an application > > enables multiple lcores, and each of these lcores repeatedly attempts > > to install timers on the same target lcore, overall application > > throughput will be reduced as all lcores contend to acquire the lock > > guarding the single skiplist of pending timers. > > > > This patchset addresses this scenario by adding an array of skiplists > > to each lcore's priv_timer struct, such that when lcore i installs a > > timer on lcore k, the timer will be added to the ith skiplist for > > lcore k. If lcore j installs a timer on lcore k simultaneously, > > lcores i and j can both proceed since they will be acquiring different > > locks for different lists. > > > > When lcore k processes its pending timers, it will traverse each > > skiplist in its array and acquire a skiplist's lock while a run list > > is broken out; meanwhile, all other lists can continue to be modified. > > Then, all run lists for lcore k are collected and traversed together > > so timers are executed in their global order. > > What is the performance and/or latency added to the timeout now? > > I worry about the case when just about all of the cores are enabled, which > could be as high as 128 or more now. There is a case in the timer_perf_autotest that runs rte_timer_manage with zero timers that can give a sense of the added latency. When run with one lcore, it completes in around 25 cycles. When run with 43 lcores (the highest I have access to at the moment), rte_timer_manage completes in around 155 cycles. So it looks like each added lcore adds around 3 cycles of overhead for checking empty lists in my testing. > > One option is to have the lcore j that wants to install a timer on lcore k to pass > a message via a ring to lcore k to add that timer. We could even add that logic > into setting a timer on a different lcore than the caller in the current API. The > ring would be multi-producer and single-consumer; we still have the lock. > What am I missing here? > I did try this approach: initially I had a multi-producer single-consumer ring that would hold requests to add or delete a timer from lcore k's skiplist, but it didn't really give an appreciable increase in my test application throughput. In profiling this solution, the hotspot had moved from acquiring the skiplist's spinlock to the rte_atomic32_cmpset that the multiple-producer ring code uses to manipulate the head pointer. Then, I tried multiple single-producer single-consumer rings per target lcore. This removed the ring hotspot, but the performance didn't increase as much as with the proposed solution.
These solutions also add overhead to rte_timer_manage, as it would have to process the rings and then process the skiplists. One other thing to note is that a solution that uses such messages changes the use models for the timer. One interesting example is: - lcore i enqueues a message to install a timer on lcore k - lcore k runs rte_timer_manage, processes its messages and adds the timer to its list - lcore i then enqueues a message to stop the same timer, now owned by lcore k - lcore k does not run rte_timer_manage again - lcore i wants to free the timer but it might not be safe Even though lcore i has successfully enqueued the request to stop the timer (and delete it from lcore k's pending list), it hasn't actually been deleted from the list yet, so freeing it could corrupt the list. This case exists in the existing timer stress tests. Another interesting scenario is: - lcore i resets a timer to install it on lcore k - lcore j resets the same timer to install it on lcore k - then, lcore k runs timer_manage Lcore j's message obviates lcore i's message, and it would be wasted work for lcore k to process it, so we should mark it to be skipped over. Handling all the edge cases was more complex than the solution proposed. > > > > Gabriel Carrillo (3): > > timer: add per-installer pending lists for each lcore > > timer: handle timers installed from non-EAL threads > > doc: update timer lib docs > > > > doc/guides/prog_guide/timer_lib.rst | 19 ++- > > lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++--------- ---- > > lib/librte_timer/rte_timer.h | 9 +- > > 3 files changed, 231 insertions(+), 126 deletions(-) > > > > -- > > 2.6.4 > > > > Regards, > Keith ^ permalink raw reply [flat|nested] 30+ messages in thread
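The second variant described above (per-installer single-producer/single-consumer rings) could be sketched like this (hypothetical names, not code from any posted patch). SP/SC rings remove the compare-and-set on a shared head pointer, but the target must now poll one ring per possible installer on every pass, which is the extra rte_timer_manage overhead mentioned:

#include <stdio.h>
#include <rte_lcore.h>
#include <rte_memory.h>
#include <rte_ring.h>

/* req_ring[target][installer]: SP/SC, so neither side needs a CAS */
static struct rte_ring *req_ring[RTE_MAX_LCORE][RTE_MAX_LCORE];

static int
spsc_rings_setup(unsigned int target)
{
	unsigned int installer;
	char name[40];

	RTE_LCORE_FOREACH(installer) {
		snprintf(name, sizeof(name), "treq_%u_%u", target, installer);
		req_ring[target][installer] = rte_ring_create(name, 512,
				SOCKET_ID_ANY, RING_F_SP_ENQ | RING_F_SC_DEQ);
		if (req_ring[target][installer] == NULL)
			return -1;
	}
	return 0;
}

/* target side: every installer's ring must be polled on every pass */
static void
drain_all_installers(unsigned int target)
{
	unsigned int installer;
	void *obj;

	RTE_LCORE_FOREACH(installer)
		while (rte_ring_dequeue(req_ring[target][installer], &obj) == 0)
			; /* reinsert (struct rte_timer *)obj into the skiplist */
}

The stop-then-free and double-reset scenarios above are what make such request queues tricky: a request sitting in a ring is invisible to rte_timer_stop() and rte_timer_reset() state checks until the target drains it.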
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** 2017-08-23 16:19 ` Carrillo, Erik G @ 2017-08-23 16:50 ` Wiles, Keith 2017-08-23 19:28 ` Carrillo, Erik G 0 siblings, 1 reply; 30+ messages in thread From: Wiles, Keith @ 2017-08-23 16:50 UTC (permalink / raw) To: Carrillo, Erik G; +Cc: rsanford, dev > On Aug 23, 2017, at 11:19 AM, Carrillo, Erik G <erik.g.carrillo@intel.com> wrote: > > > >> -----Original Message----- >> From: Wiles, Keith >> Sent: Wednesday, August 23, 2017 10:02 AM >> To: Carrillo, Erik G <erik.g.carrillo@intel.com> >> Cc: rsanford@akamai.com; dev@dpdk.org >> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** >> >> >>> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo <erik.g.carrillo@intel.com> >> wrote: >>> >>> In the current implementation of the DPDK timer library, timers can be >>> created and set to be handled by a target lcore by adding it to a >>> skiplist that corresponds to that lcore. However, if an application >>> enables multiple lcores, and each of these lcores repeatedly attempts >>> to install timers on the same target lcore, overall application >>> throughput will be reduced as all lcores contend to acquire the lock >>> guarding the single skiplist of pending timers. >>> >>> This patchset addresses this scenario by adding an array of skiplists >>> to each lcore's priv_timer struct, such that when lcore i installs a >>> timer on lcore k, the timer will be added to the ith skiplist for >>> lcore k. If lcore j installs a timer on lcore k simultaneously, >>> lcores i and j can both proceed since they will be acquiring different >>> locks for different lists. >>> >>> When lcore k processes its pending timers, it will traverse each >>> skiplist in its array and acquire a skiplist's lock while a run list >>> is broken out; meanwhile, all other lists can continue to be modified. >>> Then, all run lists for lcore k are collected and traversed together >>> so timers are executed in their global order. >> >> What is the performance and/or latency added to the timeout now? >> >> I worry about the case when just about all of the cores are enabled, which >> could be as high was 128 or more now. > > There is a case in the timer_perf_autotest that runs rte_timer_manage with zero timers that can give a sense of the added latency. When run with one lcore, it completes in around 25 cycles. When run with 43 lcores (the highest I have access to at the moment), rte_timer_mange completes in around 155 cycles. So it looks like each added lcore adds around 3 cycles of overhead for checking empty lists in my testing. Does this mean we have only 25 cycles on the current design or is the 25 cycles for the new design? If for the new design, then what is the old design cost compared to the new cost. I also think we need the call to a timer function in the calculation, just to make sure we have at least one timer in the list and we account for any short cuts in the code for no timers active. > >> >> One option is to have the lcore j that wants to install a timer on lcore k to pass >> a message via a ring to lcore k to add that timer. We could even add that logic >> into setting a timer on a different lcore then the caller in the current API. The >> ring would be a multi-producer and single consumer, we still have the lock. >> What am I missing here? 
>> > > I did try this approach: initially I had a multi-producer single-consumer ring >> that would hold requests to add or delete a timer from lcore k's skiplist, but it >> didn't really give an appreciable increase in my test application throughput. >> In profiling this solution, the hotspot had moved from acquiring the skiplist's >> spinlock to the rte_atomic32_cmpset that the multiple-producer ring code >> uses to manipulate the head pointer. >>> >>> Then, I tried multiple single-producer single-consumer rings per target >> lcore. This removed the ring hotspot, but the performance didn't increase as >> much as with the proposed solution. These solutions also add overhead to >> rte_timer_manage, as it would have to process the rings and then process >> the skiplists. >>> >>> One other thing to note is that a solution that uses such messages changes >> the use models for the timer. One interesting example is: >>> - lcore i enqueues a message to install a timer on lcore k >>> - lcore k runs rte_timer_manage, processes its messages and adds the >>> timer to its list >>> - lcore i then enqueues a message to stop the same timer, now owned by >>> lcore k >>> - lcore k does not run rte_timer_manage again >>> - lcore i wants to free the timer but it might not be safe This case seems like a mistake to me, as lcore k should continue to call rte_timer_manage() to process any new timers from other lcores, not just in the case where the list becomes empty and lcore k does not add a timer to its list. > > Even though lcore i has successfully enqueued the request to stop the timer (and delete it from lcore k's pending list), it hasn't actually been deleted from the list yet, so freeing it could corrupt the list. This case exists in the existing timer stress tests. > > Another interesting scenario is: > - lcore i resets a timer to install it on lcore k > - lcore j resets the same timer to install it on lcore k > - then, lcore k runs timer_manage This one also seems like a mistake; more than one lcore setting the same timer seems like a problem and should not be done. An lcore should own a timer and no other lcore should be able to change that timer. If multiple lcores need a timer then they should not share the same timer structure. > > Lcore j's message obviates lcore i's message, and it would be wasted work for lcore k to process it, so we should mark it to be skipped over. Handling all the edge cases was more complex than the solution proposed. Hmmm, to me it seems simple here as long as the lcores follow the same rules, and sharing a timer structure is very risky and avoidable IMO. Once you have lcores adding timers to another lcore, all accesses to that skip list must be serialized or you get unpredictable results. This should also fix most of the edge cases you are talking about. Also it seems to me the case with an lcore adding timers to another lcore timer list is a specific use case and could be handled by a different set of APIs for that specific use case. Then we do not need to change the current design and all of the overhead is placed on the new APIs/design. IMO we are turning the current timer design into a global timer design as it really is a per lcore design today, and I believe that is a mistake.
> >>> >>> Gabriel Carrillo (3): >>> timer: add per-installer pending lists for each lcore >>> timer: handle timers installed from non-EAL threads >>> doc: update timer lib docs >>> >>> doc/guides/prog_guide/timer_lib.rst | 19 ++- >>> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++--------- >> ---- >>> lib/librte_timer/rte_timer.h | 9 +- >>> 3 files changed, 231 insertions(+), 126 deletions(-) >>> >>> -- >>> 2.6.4 >>> >> >> Regards, >> Keith Regards, Keith ^ permalink raw reply [flat|nested] 30+ messages in thread
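Keith's "different set of APIs" idea could, purely for illustration, take a shape like the declarations below (hypothetical signatures; nothing like this exists in librte_timer): the local reset keeps today's lock-light path, while cross-lcore installs go through an explicit remote call that is allowed to pay for whatever extra serialization it needs:

#include <stdint.h>
#include <rte_timer.h>

/* hypothetical split of rte_timer_reset(): local installs stay cheap */
int rte_timer_reset_local(struct rte_timer *tim, uint64_t ticks,
		enum rte_timer_type type, rte_timer_cb_t fct, void *arg);

/* remote installs carry the cost of cross-lcore coordination */
int rte_timer_reset_remote(struct rte_timer *tim, uint64_t ticks,
		enum rte_timer_type type, unsigned int tim_lcore,
		rte_timer_cb_t fct, void *arg);

Under such a split, only applications that actually install timers across lcores would pay for per-installer lists, rings, or any other mechanism.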
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** 2017-08-23 16:50 ` Wiles, Keith @ 2017-08-23 19:28 ` Carrillo, Erik G 2017-08-23 21:04 ` Wiles, Keith 0 siblings, 1 reply; 30+ messages in thread From: Carrillo, Erik G @ 2017-08-23 19:28 UTC (permalink / raw) To: Wiles, Keith; +Cc: rsanford, dev > -----Original Message----- > From: Wiles, Keith > Sent: Wednesday, August 23, 2017 11:50 AM > To: Carrillo, Erik G <erik.g.carrillo@intel.com> > Cc: rsanford@akamai.com; dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** > > > > On Aug 23, 2017, at 11:19 AM, Carrillo, Erik G <erik.g.carrillo@intel.com> > wrote: > > > > > > > >> -----Original Message----- > >> From: Wiles, Keith > >> Sent: Wednesday, August 23, 2017 10:02 AM > >> To: Carrillo, Erik G <erik.g.carrillo@intel.com> > >> Cc: rsanford@akamai.com; dev@dpdk.org > >> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements > >> *** > >> > >> > >>> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo > >>> <erik.g.carrillo@intel.com> > >> wrote: > >>> > >>> In the current implementation of the DPDK timer library, timers can > >>> be created and set to be handled by a target lcore by adding it to a > >>> skiplist that corresponds to that lcore. However, if an application > >>> enables multiple lcores, and each of these lcores repeatedly > >>> attempts to install timers on the same target lcore, overall > >>> application throughput will be reduced as all lcores contend to > >>> acquire the lock guarding the single skiplist of pending timers. > >>> > >>> This patchset addresses this scenario by adding an array of > >>> skiplists to each lcore's priv_timer struct, such that when lcore i > >>> installs a timer on lcore k, the timer will be added to the ith > >>> skiplist for lcore k. If lcore j installs a timer on lcore k > >>> simultaneously, lcores i and j can both proceed since they will be > >>> acquiring different locks for different lists. > >>> > >>> When lcore k processes its pending timers, it will traverse each > >>> skiplist in its array and acquire a skiplist's lock while a run list > >>> is broken out; meanwhile, all other lists can continue to be modified. > >>> Then, all run lists for lcore k are collected and traversed together > >>> so timers are executed in their global order. > >> > >> What is the performance and/or latency added to the timeout now? > >> > >> I worry about the case when just about all of the cores are enabled, > >> which could be as high was 128 or more now. > > > > There is a case in the timer_perf_autotest that runs rte_timer_manage > with zero timers that can give a sense of the added latency. When run with > one lcore, it completes in around 25 cycles. When run with 43 lcores (the > highest I have access to at the moment), rte_timer_mange completes in > around 155 cycles. So it looks like each added lcore adds around 3 cycles of > overhead for checking empty lists in my testing. > > Does this mean we have only 25 cycles on the current design or is the 25 > cycles for the new design? > Both - when run with one lcore, the new design becomes equivalent to the original one. I tested the current design to confirm. > If for the new design, then what is the old design cost compared to the new > cost. > > I also think we need the call to a timer function in the calculation, just to > make sure we have at least one timer in the list and we account for any short > cuts in the code for no timers active. 
> Looking at the numbers for non-empty lists in timer_perf_autotest, the overhead appears to fall away. Here are some representative runs for timer_perf_autotest: 43 lcores enabled, installing 1M timers on an lcore and processing them with current design: <...snipped...> Appending 1000000 timers Time for 1000000 timers: 424066294 (193ms), Time per timer: 424 (0us) Time for 1000000 callbacks: 73124504 (33ms), Time per callback: 73 (0us) Resetting 1000000 timers Time for 1000000 timers: 1406756396 (641ms), Time per timer: 1406 (1us) <...snipped...> 43 lcores enabled, installing 1M timers on an lcore and processing them with proposed design: <...snipped...> Appending 1000000 timers Time for 1000000 timers: 382912762 (174ms), Time per timer: 382 (0us) Time for 1000000 callbacks: 79194418 (36ms), Time per callback: 79 (0us) Resetting 1000000 timers Time for 1000000 timers: 1427189116 (650ms), Time per timer: 1427 (1us) <...snipped...> The above are not averages, so the numbers don't really indicate which is faster, but they show that the overhead of the proposed design should not be appreciable. > > > >> > >> One option is to have the lcore j that wants to install a timer on > >> lcore k to pass a message via a ring to lcore k to add that timer. We > >> could even add that logic into setting a timer on a different lcore > >> then the caller in the current API. The ring would be a multi-producer and > single consumer, we still have the lock. > >> What am I missing here? > >> > > > > I did try this approach: initially I had a multi-producer single-consumer ring > that would hold requests to add or delete a timer from lcore k's skiplist, but it > didn't really give an appreciable increase in my test application throughput. > In profiling this solution, the hotspot had moved from acquiring the skiplist's > spinlock to the rte_atomic32_cmpset that the multiple-producer ring code > uses to manipulate the head pointer. > > > > Then, I tried multiple single-producer single-consumer rings per target > lcore. This removed the ring hotspot, but the performance didn't increase as > much as with the proposed solution. These solutions also add overhead to > rte_timer_manage, as it would have to process the rings and then process > the skiplists. > > > > One other thing to note is that a solution that uses such messages changes > the use models for the timer. One interesting example is: > > - lcore I enqueues a message to install a timer on lcore k > > - lcore k runs rte_timer_manage, processes its messages and adds the > > timer to its list > > - lcore I then enqueues a message to stop the same timer, now owned by > > lcore k > > - lcore k does not run rte_timer_manage again > > - lcore I wants to free the timer but it might not be safe > > This case seems like a mistake to me as lcore k should continue to call > rte_timer_manager() to process any new timers from other lcores not just > the case where the list becomes empty and lcore k does not add timer to his > list. > > > > > Even though lcore I has successfully enqueued the request to stop the > timer (and delete it from lcore k's pending list), it hasn't actually been > deleted from the list yet, so freeing it could corrupt the list. This case exists > in the existing timer stress tests. 
> > > > Another interesting scenario is: > > - lcore I resets a timer to install it on lcore k > > - lcore j resets the same timer to install it on lcore k > > - then, lcore k runs timer_manage > > This one also seems like a mistake, more then one lcore setting the same > timer seems like a problem and should not be done. A lcore should own a > timer and no other lcore should be able to change that timer. If multiple > lcores need a timer then they should not share the same timer structure. > Both of the above cases exist in the timer library stress tests, so a solution would presumably need to address them or it would be less flexible. The original design passed these tests, as does the proposed one. > > > > Lcore j's message obviates lcore i's message, and it would be wasted work > for lcore k to process it, so we should mark it to be skipped over. Handling all > the edge cases was more complex than the solution proposed. > > Hmmm, to me it seems simple here as long as the lcores follow the same > rules and sharing a timer structure is very risky and avoidable IMO. > > Once you have lcores adding timers to another lcore then all accesses to that > skip list must be serialized or you get unpredictable results. This should also > fix most of the edge cases you are talking about. > > Also it seems to me the case with an lcore adding timers to another lcore > timer list is a specific use case and could be handled by a different set of APIs > for that specific use case. Then we do not need to change the current design > and all of the overhead is placed on the new APIs/design. IMO we are > turning the current timer design into a global timer design as it really is a per > lcore design today and I beleive that is a mistake. > Well, the original API explicitly supports installing a timer to be executed on a different lcore, and there are no API changes in the patchset. Also, the proposed design keeps the per-lcore design intact; it only takes what used to be one large skiplist that held timers for all installing lcores, and separates it into N skiplists that correspond 1:1 to an installing lcore. When an lcore processes timers on its lists it will still only be managing timers it owns, and no others. > > > >>> > >>> Gabriel Carrillo (3): > >>> timer: add per-installer pending lists for each lcore > >>> timer: handle timers installed from non-EAL threads > >>> doc: update timer lib docs > >>> > >>> doc/guides/prog_guide/timer_lib.rst | 19 ++- > >>> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++------ > --- > >> ---- > >>> lib/librte_timer/rte_timer.h | 9 +- > >>> 3 files changed, 231 insertions(+), 126 deletions(-) > >>> > >>> -- > >>> 2.6.4 > >>> > >> > >> Regards, > >> Keith > > Regards, > Keith ^ permalink raw reply [flat|nested] 30+ messages in thread
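The cycle figures traded back and forth in this exchange come from timer_perf_autotest; the empty-manage measurement reduces to a small rdtsc loop like this sketch (illustrative, not the test's exact code):

#include <inttypes.h>
#include <stdio.h>
#include <rte_cycles.h>
#include <rte_timer.h>

/* time rte_timer_manage() with no timers installed */
static void
measure_empty_manage(void)
{
	const unsigned int iters = 1000000;
	uint64_t start, end;
	unsigned int i;

	start = rte_rdtsc();
	for (i = 0; i < iters; i++)
		rte_timer_manage();
	end = rte_rdtsc();

	printf("empty rte_timer_manage: %" PRIu64 " cycles/call\n",
			(end - start) / iters);
}

Run once per lcore count of interest, this is the measurement behind the 25-cycle (one lcore) and 155-cycle (43 lcores) numbers quoted above.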
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** 2017-08-23 19:28 ` Carrillo, Erik G @ 2017-08-23 21:04 ` Wiles, Keith 2017-08-24 14:08 ` Carrillo, Erik G 0 siblings, 1 reply; 30+ messages in thread From: Wiles, Keith @ 2017-08-23 21:04 UTC (permalink / raw) To: Carrillo, Erik G; +Cc: rsanford, dev > On Aug 23, 2017, at 2:28 PM, Carrillo, Erik G <erik.g.carrillo@intel.com> wrote: > >> >> -----Original Message----- >> From: Wiles, Keith >> Sent: Wednesday, August 23, 2017 11:50 AM >> To: Carrillo, Erik G <erik.g.carrillo@intel.com> >> Cc: rsanford@akamai.com; dev@dpdk.org >> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** >> >> >>> On Aug 23, 2017, at 11:19 AM, Carrillo, Erik G <erik.g.carrillo@intel.com> >> wrote: >>> >>> >>> >>>> -----Original Message----- >>>> From: Wiles, Keith >>>> Sent: Wednesday, August 23, 2017 10:02 AM >>>> To: Carrillo, Erik G <erik.g.carrillo@intel.com> >>>> Cc: rsanford@akamai.com; dev@dpdk.org >>>> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements >>>> *** >>>> >>>> >>>>> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo >>>>> <erik.g.carrillo@intel.com> >>>> wrote: >>>>> >>>>> In the current implementation of the DPDK timer library, timers can >>>>> be created and set to be handled by a target lcore by adding it to a >>>>> skiplist that corresponds to that lcore. However, if an application >>>>> enables multiple lcores, and each of these lcores repeatedly >>>>> attempts to install timers on the same target lcore, overall >>>>> application throughput will be reduced as all lcores contend to >>>>> acquire the lock guarding the single skiplist of pending timers. >>>>> >>>>> This patchset addresses this scenario by adding an array of >>>>> skiplists to each lcore's priv_timer struct, such that when lcore i >>>>> installs a timer on lcore k, the timer will be added to the ith >>>>> skiplist for lcore k. If lcore j installs a timer on lcore k >>>>> simultaneously, lcores i and j can both proceed since they will be >>>>> acquiring different locks for different lists. >>>>> >>>>> When lcore k processes its pending timers, it will traverse each >>>>> skiplist in its array and acquire a skiplist's lock while a run list >>>>> is broken out; meanwhile, all other lists can continue to be modified. >>>>> Then, all run lists for lcore k are collected and traversed together >>>>> so timers are executed in their global order. >>>> >>>> What is the performance and/or latency added to the timeout now? >>>> >>>> I worry about the case when just about all of the cores are enabled, >>>> which could be as high was 128 or more now. >>> >>> There is a case in the timer_perf_autotest that runs rte_timer_manage >> with zero timers that can give a sense of the added latency. When run with >> one lcore, it completes in around 25 cycles. When run with 43 lcores (the >> highest I have access to at the moment), rte_timer_mange completes in >> around 155 cycles. So it looks like each added lcore adds around 3 cycles of >> overhead for checking empty lists in my testing. >> >> Does this mean we have only 25 cycles on the current design or is the 25 >> cycles for the new design? >> > > Both - when run with one lcore, the new design becomes equivalent to the original one. I tested the current design to confirm. Good thanks > >> If for the new design, then what is the old design cost compared to the new >> cost. 
>> >> I also think we need the call to a timer function in the calculation, just to >> make sure we have at least one timer in the list and we account for any short >> cuts in the code for no timers active. >> > > Looking at the numbers for non-empty lists in timer_perf_autotest, the overhead appears to fall away. Here are some representative runs for timer_perf_autotest: > > 43 lcores enabled, installing 1M timers on an lcore and processing them with current design: > > <...snipped...> > Appending 1000000 timers > Time for 1000000 timers: 424066294 (193ms), Time per timer: 424 (0us) > Time for 1000000 callbacks: 73124504 (33ms), Time per callback: 73 (0us) > Resetting 1000000 timers > Time for 1000000 timers: 1406756396 (641ms), Time per timer: 1406 (1us) > <...snipped...> > > 43 lcores enabled, installing 1M timers on an lcore and processing them with proposed design: > > <...snipped...> > Appending 1000000 timers > Time for 1000000 timers: 382912762 (174ms), Time per timer: 382 (0us) > Time for 1000000 callbacks: 79194418 (36ms), Time per callback: 79 (0us) > Resetting 1000000 timers > Time for 1000000 timers: 1427189116 (650ms), Time per timer: 1427 (1us) > <...snipped...> It looks OK then. The main concern I had was the timers in Pktgen and someone reporting increased jitter or latency or degraded performance. I guess I will just have to wait and see. > > The above are not averages, so the numbers don't really indicate which is faster, but they show that the overhead of the proposed design should not be appreciable. > >>> >>>> >>>> One option is to have the lcore j that wants to install a timer on >>>> lcore k to pass a message via a ring to lcore k to add that timer. We >>>> could even add that logic into setting a timer on a different lcore >>>> than the caller in the current API. The ring would be a multi-producer and >> single consumer, we still have the lock. >>>> What am I missing here? >>>> >>> >>> I did try this approach: initially I had a multi-producer single-consumer ring >> that would hold requests to add or delete a timer from lcore k's skiplist, but it >> didn't really give an appreciable increase in my test application throughput. >> In profiling this solution, the hotspot had moved from acquiring the skiplist's >> spinlock to the rte_atomic32_cmpset that the multiple-producer ring code >> uses to manipulate the head pointer. >>> >>> Then, I tried multiple single-producer single-consumer rings per target >> lcore. This removed the ring hotspot, but the performance didn't increase as >> much as with the proposed solution. These solutions also add overhead to >> rte_timer_manage, as it would have to process the rings and then process >> the skiplists. >>> >>> One other thing to note is that a solution that uses such messages changes >> the use models for the timer. One interesting example is: >>> - lcore i enqueues a message to install a timer on lcore k >>> - lcore k runs rte_timer_manage, processes its messages and adds the >>> timer to its list >>> - lcore i then enqueues a message to stop the same timer, now owned by >>> lcore k >>> - lcore k does not run rte_timer_manage again >>> - lcore i wants to free the timer but it might not be safe >> >> This case seems like a mistake to me as lcore k should continue to call >> rte_timer_manage() to process any new timers from other lcores not just >> the case where the list becomes empty and lcore k does not add a timer to its >> list.
>> >>> >>> Even though lcore i has successfully enqueued the request to stop the >> timer (and delete it from lcore k's pending list), it hasn't actually been >> deleted from the list yet, so freeing it could corrupt the list. This case exists >> in the existing timer stress tests. >>> >>> Another interesting scenario is: >>> - lcore i resets a timer to install it on lcore k >>> - lcore j resets the same timer to install it on lcore k >>> - then, lcore k runs timer_manage >> >> This one also seems like a mistake; more than one lcore setting the same >> timer seems like a problem and should not be done. An lcore should own a >> timer and no other lcore should be able to change that timer. If multiple >> lcores need a timer then they should not share the same timer structure. >> > > Both of the above cases exist in the timer library stress tests, so a solution would presumably need to address them or it would be less flexible. The original design passed these tests, as does the proposed one. I get this twitch when one lcore is adding timers to another lcore as I come from a realtime OS background, but I guess if no one else cares or finds a problem I will have to live with it. Having a test for something does not make it a good test or a reasonable reason to continue a design issue. We can make any test work, but whether it is right is the real question, and we will just have to wait and see, I guess. > >>> >>> Lcore j's message obviates lcore i's message, and it would be wasted work >> for lcore k to process it, so we should mark it to be skipped over. Handling all >> the edge cases was more complex than the solution proposed. >> >> Hmmm, to me it seems simple here as long as the lcores follow the same >> rules and sharing a timer structure is very risky and avoidable IMO. >> >> Once you have lcores adding timers to another lcore then all accesses to that >> skip list must be serialized or you get unpredictable results. This should also >> fix most of the edge cases you are talking about. >> >> Also it seems to me the case with an lcore adding timers to another lcore >> timer list is a specific use case and could be handled by a different set of APIs >> for that specific use case. Then we do not need to change the current design >> and all of the overhead is placed on the new APIs/design. IMO we are >> turning the current timer design into a global timer design as it really is a per >> lcore design today and I believe that is a mistake. >> > > Well, the original API explicitly supports installing a timer to be executed on a different lcore, and there are no API changes in the patchset. Also, the proposed design keeps the per-lcore design intact; it only takes what used to be one large skiplist that held timers for all installing lcores, and separates it into N skiplists that correspond 1:1 to an installing lcore. When an lcore processes timers on its lists it will still only be managing timers it owns, and no others. Having an API to explicitly support some feature is not a reason to keep something, but I think you have reduced my twitching some :-) so I will let it go. Thanks for the information.
> > >>> >>>>> >>>>> Gabriel Carrillo (3): >>>>> timer: add per-installer pending lists for each lcore >>>>> timer: handle timers installed from non-EAL threads >>>>> doc: update timer lib docs >>>>> >>>>> doc/guides/prog_guide/timer_lib.rst | 19 ++- >>>>> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++------ >> --- >>>> ---- >>>>> lib/librte_timer/rte_timer.h | 9 +- >>>>> 3 files changed, 231 insertions(+), 126 deletions(-) >>>>> >>>>> -- >>>>> 2.6.4 >>>>> >>>> >>>> Regards, >>>> Keith >> >> Regards, >> Keith Regards, Keith ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** 2017-08-23 21:04 ` Wiles, Keith @ 2017-08-24 14:08 ` Carrillo, Erik G 0 siblings, 0 replies; 30+ messages in thread From: Carrillo, Erik G @ 2017-08-24 14:08 UTC (permalink / raw) To: Wiles, Keith; +Cc: rsanford, dev > -----Original Message----- > From: Wiles, Keith > Sent: Wednesday, August 23, 2017 4:05 PM > To: Carrillo, Erik G <erik.g.carrillo@intel.com> > Cc: rsanford@akamai.com; dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** > > > > On Aug 23, 2017, at 2:28 PM, Carrillo, Erik G <erik.g.carrillo@intel.com> > wrote: > > > >> > >> -----Original Message----- > >> From: Wiles, Keith > >> Sent: Wednesday, August 23, 2017 11:50 AM > >> To: Carrillo, Erik G <erik.g.carrillo@intel.com> > >> Cc: rsanford@akamai.com; dev@dpdk.org > >> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements > >> *** > >> > >> > >>> On Aug 23, 2017, at 11:19 AM, Carrillo, Erik G > >>> <erik.g.carrillo@intel.com> > >> wrote: > >>> > >>> > >>> > >>>> -----Original Message----- > >>>> From: Wiles, Keith > >>>> Sent: Wednesday, August 23, 2017 10:02 AM > >>>> To: Carrillo, Erik G <erik.g.carrillo@intel.com> > >>>> Cc: rsanford@akamai.com; dev@dpdk.org > >>>> Subject: Re: [dpdk-dev] [PATCH 0/3] *** timer library enhancements > >>>> *** > >>>> > >>>> > >>>>> On Aug 23, 2017, at 9:47 AM, Gabriel Carrillo > >>>>> <erik.g.carrillo@intel.com> > >>>> wrote: > >>>>> > >>>>> In the current implementation of the DPDK timer library, timers > >>>>> can be created and set to be handled by a target lcore by adding > >>>>> it to a skiplist that corresponds to that lcore. However, if an > >>>>> application enables multiple lcores, and each of these lcores > >>>>> repeatedly attempts to install timers on the same target lcore, > >>>>> overall application throughput will be reduced as all lcores > >>>>> contend to acquire the lock guarding the single skiplist of pending > timers. > >>>>> > >>>>> This patchset addresses this scenario by adding an array of > >>>>> skiplists to each lcore's priv_timer struct, such that when lcore > >>>>> i installs a timer on lcore k, the timer will be added to the ith > >>>>> skiplist for lcore k. If lcore j installs a timer on lcore k > >>>>> simultaneously, lcores i and j can both proceed since they will be > >>>>> acquiring different locks for different lists. > >>>>> > >>>>> When lcore k processes its pending timers, it will traverse each > >>>>> skiplist in its array and acquire a skiplist's lock while a run > >>>>> list is broken out; meanwhile, all other lists can continue to be > modified. > >>>>> Then, all run lists for lcore k are collected and traversed > >>>>> together so timers are executed in their global order. > >>>> > >>>> What is the performance and/or latency added to the timeout now? > >>>> > >>>> I worry about the case when just about all of the cores are > >>>> enabled, which could be as high as 128 or more now. > >>> > >>> There is a case in the timer_perf_autotest that runs > >>> rte_timer_manage > >> with zero timers that can give a sense of the added latency. When run > with > >> one lcore, it completes in around 25 cycles. When run with 43 lcores > >> (the highest I have access to at the moment), rte_timer_manage > >> completes in around 155 cycles. So it looks like each added lcore > >> adds around 3 cycles of overhead for checking empty lists in my testing. 
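For context on how such a per-call cycle count can be taken, here is a rough sketch using rte_rdtsc(); this approximates what timer_perf_autotest measures but is not the test's exact code, and it assumes the EAL and the timer subsystem are already initialized.

    #include <stdio.h>
    #include <inttypes.h>
    #include <rte_cycles.h>
    #include <rte_timer.h>

    /* Rough per-call cost of rte_timer_manage() with empty pending lists. */
    static void
    measure_manage_overhead(void)
    {
        const uint64_t iters = 1000000;
        uint64_t start, end, i;

        start = rte_rdtsc();
        for (i = 0; i < iters; i++)
            rte_timer_manage();
        end = rte_rdtsc();

        printf("cycles per rte_timer_manage() call: %" PRIu64 "\n",
               (end - start) / iters);
    }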
> >> Does this mean we have only 25 cycles on the current design or is the > >> 25 cycles for the new design? > >> > > > > Both - when run with one lcore, the new design becomes equivalent to the > original one. I tested the current design to confirm. > > Good, thanks > > > > >> If that is for the new design, then what is the old design's cost compared to > >> the new cost? > >> > >> I also think we need the call to a timer function in the calculation, > >> just to make sure we have at least one timer in the list and we > >> account for any shortcuts in the code for no timers active. > >> > > > > Looking at the numbers for non-empty lists in timer_perf_autotest, the > overhead appears to fall away. Here are some representative runs for > timer_perf_autotest: > > > > 43 lcores enabled, installing 1M timers on an lcore and processing them > with current design: > > > > <...snipped...> > > Appending 1000000 timers > > Time for 1000000 timers: 424066294 (193ms), Time per timer: 424 (0us) > > Time for 1000000 callbacks: 73124504 (33ms), Time per callback: 73 > > (0us) Resetting 1000000 timers Time for 1000000 timers: 1406756396 > > (641ms), Time per timer: 1406 (1us) <...snipped...> > > > > 43 lcores enabled, installing 1M timers on an lcore and processing them > with proposed design: > > > > <...snipped...> > > Appending 1000000 timers > > Time for 1000000 timers: 382912762 (174ms), Time per timer: 382 (0us) > > Time for 1000000 callbacks: 79194418 (36ms), Time per callback: 79 > > (0us) Resetting 1000000 timers Time for 1000000 timers: 1427189116 > > (650ms), Time per timer: 1427 (1us) <...snipped...> > > It looks OK then. The main concern I had was the timers in Pktgen and > someone telling me that jitter, latency, or performance had gotten worse. I guess I will > just have to wait and see. > > > > > The above are not averages, so the numbers don't really indicate which is > faster, but they show that the overhead of the proposed design should not > be appreciable. > > > >>> > >>>> > >>>> One option is to have the lcore j that wants to install a timer on > >>>> lcore k pass a message via a ring to lcore k to add that timer. > >>>> We could even add that logic into setting a timer on a different > >>>> lcore than the caller in the current API. The ring would be a > >>>> multi-producer and > >> single consumer; we still have the lock. > >>>> What am I missing here? > >>>> > >>> > >>> I did try this approach: initially I had a multi-producer > >>> single-consumer ring > >> that would hold requests to add or delete a timer from lcore k's > >> skiplist, but it didn't really give an appreciable increase in my test > application throughput. > >> In profiling this solution, the hotspot had moved from acquiring the > >> skiplist's spinlock to the rte_atomic32_cmpset that the > >> multiple-producer ring code uses to manipulate the head pointer. > >>> > >>> Then, I tried multiple single-producer single-consumer rings per > >>> target > >> lcore. This removed the ring hotspot, but the performance didn't > >> increase as much as with the proposed solution. These solutions also > >> add overhead to rte_timer_manage, as it would have to process the > >> rings and then process the skiplists. > >>> > >>> One other thing to note is that a solution that uses such messages > >>> changes > >> the use models for the timer. 
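For concreteness before the use-model example that follows, the message-ring scheme being discussed might look roughly like this sketch. It assumes DPDK's rte_ring API; timer_msg, the op codes, and timer_req_ring are hypothetical names, and the ring would be created elsewhere with RING_F_SC_DEQ so it is multi-producer, single-consumer.

    #include <rte_ring.h>
    #include <rte_timer.h>

    enum timer_op { TIMER_OP_RESET, TIMER_OP_STOP };

    struct timer_msg {
        enum timer_op op;
        struct rte_timer *tim;
    };

    /* One request ring owned by each target lcore (created elsewhere). */
    extern struct rte_ring *timer_req_ring;

    /* Installer side: any lcore can enqueue a request for the target.
     * The rte_atomic32_cmpset hotspot mentioned above lives inside this
     * multi-producer enqueue. */
    static inline int
    timer_request(struct timer_msg *msg)
    {
        return rte_ring_mp_enqueue(timer_req_ring, msg);
    }

    /* Target side: drain requests at the top of rte_timer_manage(). */
    static inline void
    timer_drain_requests(void)
    {
        void *obj;

        while (rte_ring_sc_dequeue(timer_req_ring, &obj) == 0) {
            struct timer_msg *msg = obj;
            /* ...apply msg->op to msg->tim on the local skiplist... */
            (void)msg;
        }
    }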
One interesting example is: > >>> - lcore i enqueues a message to install a timer on lcore k > >>> - lcore k runs rte_timer_manage, processes its messages and adds the > >>> timer to its list > >>> - lcore i then enqueues a message to stop the same timer, now owned > >>> by lcore k > >>> - lcore k does not run rte_timer_manage again > >>> - lcore i wants to free the timer but it might not be safe > >> > >> This case seems like a mistake to me, as lcore k should continue to > >> call > >> rte_timer_manage() to process any new timers from other lcores, not > >> just in the case where the list becomes empty and lcore k does not add > >> timers to its list. > >> > >>> > >>> Even though lcore i has successfully enqueued the request to stop > >>> the > >> timer (and delete it from lcore k's pending list), it hasn't actually > >> been deleted from the list yet, so freeing it could corrupt the > >> list. This case exists in the existing timer stress tests. > >>> > >>> Another interesting scenario is: > >>> - lcore i resets a timer to install it on lcore k > >>> - lcore j resets the same timer to install it on lcore k > >>> - then, lcore k runs timer_manage > >> > >> This one also seems like a mistake; more than one lcore setting the > >> same timer seems like a problem and should not be done. An lcore > >> should own a timer and no other lcore should be able to change that > >> timer. If multiple lcores need a timer then they should not share the same > timer structure. > >> > > > > Both of the above cases exist in the timer library stress tests, so a solution > would presumably need to address them or it would be less flexible. The > original design passed these tests, as does the proposed one. > > I get this twitch when one lcore is adding timers to another lcore as I come > from a realtime OS background, but I guess if no one else cares or finds a > problem I will have to live with it. Having a test for something does not make > it a good test or a reasonable reason to continue a design issue. We can make > any test work, but whether it is right is the real question, and we will just have to wait > and see, I guess. > > > > >>> > >>> Lcore j's message obviates lcore i's message, and it would be wasted > >>> work > >> for lcore k to process it, so we should mark it to be skipped over. > Handling all > >> the edge cases was more complex than the solution proposed. > >> > >> Hmmm, to me it seems simple here as long as the lcores follow the > >> same rules, and sharing a timer structure is very risky and avoidable IMO. > >> > >> Once you have lcores adding timers to another lcore then all accesses > >> to that skip list must be serialized or you get unpredictable > >> results. This should also fix most of the edge cases you are talking about. > >> > >> Also it seems to me the case with an lcore adding timers to another > >> lcore's timer list is a specific use case and could be handled by a > >> different set of APIs for that specific use case. Then we do not need > >> to change the current design, and all of the overhead is placed on the > >> new APIs/design. IMO we are turning the current timer design into a > >> global timer design as it really is a per-lcore design today, and I believe that > is a mistake. > >> > > > > Well, the original API explicitly supports installing a timer to be executed on > a different lcore, and there are no API changes in the patchset. 
Also, the > proposed design keeps the per-lcore design intact; it only takes what used > to be one large skiplist that held timers for all installing lcores, and separates > it into N skiplists that correspond 1:1 to an installing lcore. When an lcore > processes timers on its lists it will still only be managing timers it owns, and no > others. > > Having an API to explicitly support some feature is not a reason to keep > something, but I think you have reduced my twitching some :-) so I will let it > go. > > Thanks for the information. You're welcome, and thank you for the feedback. Regards, Gabriel > > > > > > >>> > >>>>> > >>>>> Gabriel Carrillo (3): > >>>>> timer: add per-installer pending lists for each lcore > >>>>> timer: handle timers installed from non-EAL threads > >>>>> doc: update timer lib docs > >>>>> > >>>>> doc/guides/prog_guide/timer_lib.rst | 19 ++- > >>>>> lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++--- --- > >> --- > >>>> ---- > >>>>> lib/librte_timer/rte_timer.h | 9 +- > >>>>> 3 files changed, 231 insertions(+), 126 deletions(-) > >>>>> > >>>>> -- > >>>>> 2.6.4 > >>>>> > >>>> > >>>> Regards, > >>>> Keith > >> > >> Regards, > >> Keith > > Regards, > Keith ^ permalink raw reply [flat|nested] 30+ messages in thread
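For readers unfamiliar with the underlying structure: each of those N per-installer skiplists still supports the same O(log n) search the single list did. Below is a condensed, generic sketch of the descending search for insertion points, modeled on the library's timer_get_prev_entries() with simplified types.

    #include <stdint.h>
    #include <stddef.h>

    #define MAX_DEPTH 10

    struct sl_node {
        uint64_t expire;
        struct sl_node *next[MAX_DEPTH];
    };

    struct sl_list {
        struct sl_node head;     /* dummy header node */
        unsigned int depth;      /* current depth, <= MAX_DEPTH */
    };

    /* For every level, find the last node whose expiry is <= time_val.
     * Starting at the top level and dropping down reuses the progress
     * made at higher levels, giving O(log n) expected time. */
    static void
    sl_prev_entries(struct sl_list *l, uint64_t time_val,
                    struct sl_node *prev[MAX_DEPTH + 1])
    {
        unsigned int lvl = l->depth;

        prev[lvl] = &l->head;
        while (lvl != 0) {
            lvl--;
            prev[lvl] = prev[lvl + 1];
            while (prev[lvl]->next[lvl] != NULL &&
                   prev[lvl]->next[lvl]->expire <= time_val)
                prev[lvl] = prev[lvl]->next[lvl];
        }
    }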
* [dpdk-dev] [PATCH v2 0/3] *** timer library enhancements *** 2017-08-23 14:47 [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Gabriel Carrillo ` (3 preceding siblings ...) 2017-08-23 15:02 ` [dpdk-dev] [PATCH 0/3] *** timer library enhancements *** Wiles, Keith @ 2017-08-25 20:26 ` Gabriel Carrillo 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo 4 siblings, 1 reply; 30+ messages in thread From: Gabriel Carrillo @ 2017-08-25 20:26 UTC (permalink / raw) To: rsanford; +Cc: dev In the current implementation of the DPDK timer library, timers can be created and set to be handled by a target lcore by adding it to a skiplist that corresponds to that lcore. However, if an application enables multiple lcores, and each of these lcores repeatedly attempts to install timers on the same target lcore, overall application throughput will be reduced as all lcores contend to acquire the lock guarding the single skiplist of pending timers. This patchset addresses this scenario by adding an array of skiplists to each lcore's priv_timer struct, such that when lcore i installs a timer on lcore k, the timer will be added to the ith skiplist for lcore k. If lcore j installs a timer on lcore k simultaneously, lcores i and j can both proceed since they will be acquiring different locks for different lists. When lcore k processes its pending timers, it will traverse each skiplist in its array and acquire a skiplist's lock while a run list is broken out; meanwhile, all other lists can continue to be modified. Then, all run lists for lcore k are collected and traversed together so timers are executed in their relative order. Gabriel Carrillo (3): timer: add per-installer pending lists for each lcore timer: handle timers installed from non-EAL threads doc: update timer lib docs doc/guides/prog_guide/timer_lib.rst | 19 ++- lib/librte_timer/rte_timer.c | 329 +++++++++++++++++++++++------------- lib/librte_timer/rte_timer.h | 9 +- 3 files changed, 231 insertions(+), 126 deletions(-) -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
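The "relative order" guarantee in the last paragraph of the cover letter comes from a simple k-way merge: each broken-out run list is already sorted, so the manage loop repeatedly executes the smallest-expiry head among them. A sketch of that selection step follows, with a simplified node type; it is not the library's exact code.

    #include <stdint.h>
    #include <stddef.h>

    struct run_node {
        uint64_t expire;
        struct run_node *next;
    };

    /* Pop the timer with the smallest expiry across n sorted run lists,
     * or return NULL once every list is drained. */
    static struct run_node *
    pick_next_expired(struct run_node *runs[], int n)
    {
        uint64_t min_expire = UINT64_MAX;
        int i, min_idx = -1;

        for (i = 0; i < n; i++) {
            if (runs[i] != NULL && runs[i]->expire < min_expire) {
                min_expire = runs[i]->expire;
                min_idx = i;
            }
        }
        if (min_idx < 0)
            return NULL;

        struct run_node *tim = runs[min_idx];
        runs[min_idx] = tim->next;   /* advance that run list */
        return tim;
    }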
* [dpdk-dev] [PATCH v3 0/3] timer library enhancement 2017-08-25 20:26 ` [dpdk-dev] [PATCH v2 " Gabriel Carrillo @ 2017-09-13 22:05 ` Erik Gabriel Carrillo 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo ` (3 more replies) 0 siblings, 4 replies; 30+ messages in thread From: Erik Gabriel Carrillo @ 2017-09-13 22:05 UTC (permalink / raw) To: rsanford; +Cc: dev, konstantin.ananyev, stephen, keith.wiles, narender.vangati In the current implementation of the DPDK timer library, timers can be created and set to be handled by a target lcore by adding it to a skiplist that corresponds to that lcore. However, if an application enables multiple lcores, and each of these lcores repeatedly attempts to install timers on the same target lcore, overall application throughput will be reduced as all lcores contend to acquire the lock guarding the single skiplist of pending timers. This patchset addresses this scenario by adding an option to enable an array of skiplists in each lcore's priv_timer struct. Then, when lcore i installs a timer on lcore k, the timer will be added to the ith skiplist for lcore k. If lcore j installs a timer on lcore k simultaneously, lcores i and j can both proceed since they will be acquiring different locks for different lists. This functionality is off by default, and can be enabled via a new function. When lcore k processes its pending timers, if the multiple pending list option is enabled, it will traverse skiplists in its array and acquire the current skiplist's lock while a run list is broken out; meanwhile, all other lists can continue to be modified. Then, all run lists for lcore k are collected and traversed together so timers are executed in their relative order. If the multiple pending list option is not enabled (the default), only a single list will be traversed, as before. Erik Gabriel Carrillo (3): timer: add multiple pending lists option for each lcore timer: handle timers installed from non-EAL threads doc: update timer lib docs doc/guides/prog_guide/timer_lib.rst | 27 ++- doc/guides/rel_notes/release_17_11.rst | 7 + lib/librte_timer/rte_timer.c | 384 ++++++++++++++++++++++----------- lib/librte_timer/rte_timer.h | 29 ++- lib/librte_timer/rte_timer_version.map | 6 + 5 files changed, 319 insertions(+), 134 deletions(-) -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
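In application terms, the opt-in described above might be used roughly as follows. This is a sketch against the API proposed in this series; it assumes the EAL is initialized and that no timers have yet been started on the lcores in question.

    #include <rte_lcore.h>
    #include <rte_timer.h>

    static void
    app_timer_setup(void)
    {
        unsigned int lcore_id;

        rte_timer_subsystem_init();

        /* Opt every enabled lcore into per-installer pending lists.
         * This must happen before any timer lands on an lcore's
         * default list, or the call returns -1. */
        RTE_LCORE_FOREACH(lcore_id) {
            if (rte_timer_subsystem_set_multi_pendlists(lcore_id) != 0) {
                /* timers already pending on the default list */
            }
        }
    }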
* [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo @ 2017-09-13 22:05 ` Erik Gabriel Carrillo 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo ` (2 subsequent siblings) 3 siblings, 0 replies; 30+ messages in thread From: Erik Gabriel Carrillo @ 2017-09-13 22:05 UTC (permalink / raw) To: rsanford; +Cc: dev, konstantin.ananyev, stephen, keith.wiles, narender.vangati This commit adds support for enabling multiple pending timer lists in each lcore's priv_timer struct with a new function; a single list is still used by default. In the case that multiple lcores repeatedly install timers on the same target lcore, this option reduces lock contention for the target lcore's skiplists and increases performance. Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com> --- v3: * Added ability to enable multiple pending lists for a specified lcore with a new function, in response to feedback from Keith Wiles, Konstantin Ananyev, and Stephen Hemminger. This functionality is off by default, so that new overhead in rte_timer_manage() is minimized if the application doesn't want this feature. * Updated map file to reflect addition of new function. * Made minor updates to some variable names and logic structure. v2: * Address checkpatch warnings lib/librte_timer/rte_timer.c | 376 ++++++++++++++++++++++----------- lib/librte_timer/rte_timer.h | 29 ++- lib/librte_timer/rte_timer_version.map | 6 + 3 files changed, 287 insertions(+), 124 deletions(-) diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c index 5ee0840..1f4141e 100644 --- a/lib/librte_timer/rte_timer.c +++ b/lib/librte_timer/rte_timer.c @@ -37,6 +37,7 @@ #include <inttypes.h> #include <assert.h> #include <sys/queue.h> +#include <stdbool.h> #include <rte_atomic.h> #include <rte_common.h> @@ -54,19 +55,24 @@ #include "rte_timer.h" +#define SINGLE_LIST_IDX 0 + LIST_HEAD(rte_timer_list, rte_timer); +struct skiplist { + struct rte_timer head; /**< dummy timer instance to head up list */ + rte_spinlock_t lock; /**< lock to protect list access */ + unsigned int depth; /**< track the current depth of the skiplist */ +} __rte_cache_aligned; + struct priv_timer { - struct rte_timer pending_head; /**< dummy timer instance to head up list */ - rte_spinlock_t list_lock; /**< lock to protect list access */ + /** one pending list per enabled lcore */ + struct skiplist pending_lists[RTE_MAX_LCORE]; + bool multi_pendlists; /** per-core variable that true if a timer was updated on this * core since last reset of the variable */ int updated; - - /** track the current depth of the skiplist */ - unsigned curr_skiplist_depth; - unsigned prev_lcore; /**< used for lcore round robin */ /** running timer on this lcore now */ @@ -81,6 +87,10 @@ struct priv_timer { /** per-lcore private info for timers */ static struct priv_timer priv_timer[RTE_MAX_LCORE]; +/** cache of IDs of enabled lcores */ +static unsigned int enabled_lcores[RTE_MAX_LCORE]; +static int n_enabled_lcores; + /* when debug is enabled, store some statistics */ #ifdef RTE_LIBRTE_TIMER_DEBUG #define __TIMER_STAT_ADD(name, n) do { \ @@ -96,17 +106,60 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE]; void rte_timer_subsystem_init(void) { - unsigned lcore_id; + unsigned int lcore_id, pending_lists_idx; + struct skiplist *list; + struct priv_timer *priv_tim; + int i, j; + + 
RTE_LCORE_FOREACH(lcore_id) + enabled_lcores[n_enabled_lcores++] = lcore_id; /* since priv_timer is static, it's zeroed by default, so only init some * fields. */ - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) { - rte_spinlock_init(&priv_timer[lcore_id].list_lock); - priv_timer[lcore_id].prev_lcore = lcore_id; + for (i = 0; i < n_enabled_lcores; i++) { + lcore_id = enabled_lcores[i]; + + + priv_tim = &priv_timer[lcore_id]; + priv_tim->prev_lcore = lcore_id; + priv_tim->multi_pendlists = false; + + for (j = 0; j < n_enabled_lcores; j++) { + pending_lists_idx = enabled_lcores[j]; + list = &priv_tim->pending_lists[pending_lists_idx]; + rte_spinlock_init(&list->lock); + } + + /* Init the list at SINGLE_LIST_IDX, in case we didn't get to + * it above. This will be the skiplist we use unless the + * multi-pendlists option is later enabled for this lcore. + */ + list = &priv_tim->pending_lists[SINGLE_LIST_IDX]; + rte_spinlock_init(&list->lock); } } +int +rte_timer_subsystem_set_multi_pendlists(unsigned int lcore_id) +{ + struct priv_timer *priv_tim = &priv_timer[lcore_id]; + struct skiplist *list = &priv_tim->pending_lists[SINGLE_LIST_IDX]; + + rte_spinlock_lock(&list->lock); + + if (list->head.sl_next[0] != NULL) { + /* Timers are already on the default pending list. */ + rte_spinlock_unlock(&list->lock); + return -1; + } + + priv_tim->multi_pendlists = true; + rte_spinlock_unlock(&list->lock); + + return 0; +} + /* Initialize the timer handle tim for use */ void rte_timer_init(struct rte_timer *tim) @@ -114,7 +167,8 @@ rte_timer_init(struct rte_timer *tim) union rte_timer_status status; status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.index = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; tim->status.u32 = status.u32; } @@ -142,7 +196,7 @@ timer_set_config_state(struct rte_timer *tim, * or ready to run on local core, exit */ if (prev_status.state == RTE_TIMER_RUNNING && - (prev_status.owner != (uint16_t)lcore_id || + (prev_status.owner != (int)lcore_id || tim != priv_timer[lcore_id].running_tim)) return -1; @@ -153,7 +207,8 @@ timer_set_config_state(struct rte_timer *tim, /* here, we know that timer is stopped or pending, * mark it atomically as being configured */ status.state = RTE_TIMER_CONFIG; - status.owner = (int16_t)lcore_id; + status.index = RTE_TIMER_NO_LCORE; + status.owner = lcore_id; success = rte_atomic32_cmpset(&tim->status.u32, prev_status.u32, status.u32); @@ -185,7 +240,8 @@ timer_set_running_state(struct rte_timer *tim) /* here, we know that timer is stopped or pending, * mark it atomically as being configured */ status.state = RTE_TIMER_RUNNING; - status.owner = (int16_t)lcore_id; + status.index = prev_status.index; + status.owner = lcore_id; success = rte_atomic32_cmpset(&tim->status.u32, prev_status.u32, status.u32); @@ -236,11 +292,11 @@ timer_get_skiplist_level(unsigned curr_depth) * are <= that time value. */ static void -timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, +timer_get_prev_entries(uint64_t time_val, struct skiplist *list, struct rte_timer **prev) { - unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth; - prev[lvl] = &priv_timer[tim_lcore].pending_head; + unsigned int lvl = list->depth; + prev[lvl] = &list->head; while(lvl != 0) { lvl--; prev[lvl] = prev[lvl+1]; @@ -255,15 +311,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, * all skiplist levels. 
*/ static void -timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore, +timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list, struct rte_timer **prev) { int i; /* to get a specific entry in the list, look for just lower than the time * values, and then increment on each level individually if necessary */ - timer_get_prev_entries(tim->expire - 1, tim_lcore, prev); - for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) { + timer_get_prev_entries(tim->expire - 1, list, prev); + for (i = list->depth - 1; i >= 0; i--) { while (prev[i]->sl_next[i] != NULL && prev[i]->sl_next[i] != tim && prev[i]->sl_next[i]->expire <= tim->expire) @@ -277,30 +333,39 @@ timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore, * timer must not be in a list */ static void -timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) +timer_add(struct rte_timer *tim, unsigned int tim_lcore, int local_is_locked, + int *pending_lists_idx) { unsigned lcore_id = rte_lcore_id(); unsigned lvl; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + struct skiplist *list; + struct priv_timer *priv_tim = &priv_timer[tim_lcore]; + + if (priv_tim->multi_pendlists) + *pending_lists_idx = lcore_id; + else + *pending_lists_idx = SINGLE_LIST_IDX; + + list = &priv_tim->pending_lists[*pending_lists_idx]; /* if timer needs to be scheduled on another core, we need to * lock the list; if it is on local core, we need to lock if * we are not called from rte_timer_manage() */ if (tim_lcore != lcore_id || !local_is_locked) - rte_spinlock_lock(&priv_timer[tim_lcore].list_lock); + rte_spinlock_lock(&list->lock); /* find where exactly this element goes in the list of elements * for each depth. */ - timer_get_prev_entries(tim->expire, tim_lcore, prev); + timer_get_prev_entries(tim->expire, list, prev); /* now assign it a new level and add at that level */ - const unsigned tim_level = timer_get_skiplist_level( - priv_timer[tim_lcore].curr_skiplist_depth); - if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth) - priv_timer[tim_lcore].curr_skiplist_depth++; + const unsigned int tim_level = timer_get_skiplist_level(list->depth); + if (tim_level == list->depth) + list->depth++; lvl = tim_level; - while (lvl > 0) { + while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) { tim->sl_next[lvl] = prev[lvl]->sl_next[lvl]; prev[lvl]->sl_next[lvl] = tim; lvl--; @@ -310,11 +375,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) /* save the lowest list entry into the expire field of the dummy hdr * NOTE: this is not atomic on 32-bit*/ - priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\ - pending_head.sl_next[0]->expire; + list->head.expire = list->head.sl_next[0]->expire; if (tim_lcore != lcore_id || !local_is_locked) - rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock); + rte_spinlock_unlock(&list->lock); } /* @@ -330,35 +394,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status, unsigned prev_owner = prev_status.owner; int i; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + struct skiplist *list; + + list = &priv_timer[prev_owner].pending_lists[prev_status.index]; /* if timer needs is pending another core, we need to lock the * list; if it is on local core, we need to lock if we are not * called from rte_timer_manage() */ if (prev_owner != lcore_id || !local_is_locked) - rte_spinlock_lock(&priv_timer[prev_owner].list_lock); + rte_spinlock_lock(&list->lock); /* save the lowest list entry into the expire 
field of the dummy hdr. * NOTE: this is not atomic on 32-bit */ - if (tim == priv_timer[prev_owner].pending_head.sl_next[0]) - priv_timer[prev_owner].pending_head.expire = - ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire); + if (tim == list->head.sl_next[0]) + list->head.expire = ((tim->sl_next[0] == NULL) ? + 0 : tim->sl_next[0]->expire); /* adjust pointers from previous entries to point past this */ - timer_get_prev_entries_for_node(tim, prev_owner, prev); - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) { + timer_get_prev_entries_for_node(tim, list, prev); + for (i = list->depth - 1; i >= 0; i--) { if (prev[i]->sl_next[i] == tim) prev[i]->sl_next[i] = tim->sl_next[i]; } /* in case we deleted last entry at a level, adjust down max level */ - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) - if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL) - priv_timer[prev_owner].curr_skiplist_depth --; + for (i = list->depth - 1; i >= 0; i--) + if (list->head.sl_next[i] == NULL) + list->depth--; else break; if (prev_owner != lcore_id || !local_is_locked) - rte_spinlock_unlock(&priv_timer[prev_owner].list_lock); + rte_spinlock_unlock(&list->lock); } /* Reset and start the timer associated with the timer handle (private func) */ @@ -371,15 +438,16 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, union rte_timer_status prev_status, status; int ret; unsigned lcore_id = rte_lcore_id(); + struct priv_timer *priv_tim = &priv_timer[lcore_id]; + int pending_lists_index; /* round robin for tim_lcore */ if (tim_lcore == (unsigned)LCORE_ID_ANY) { if (lcore_id < RTE_MAX_LCORE) { /* EAL thread with valid lcore_id */ - tim_lcore = rte_get_next_lcore( - priv_timer[lcore_id].prev_lcore, - 0, 1); - priv_timer[lcore_id].prev_lcore = tim_lcore; + tim_lcore = rte_get_next_lcore(priv_tim->prev_lcore, 0, + 1); + priv_tim->prev_lcore = tim_lcore; } else /* non-EAL thread do not run rte_timer_manage(), * so schedule the timer on the first enabled lcore. 
*/ @@ -395,7 +463,7 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, __TIMER_STAT_ADD(reset, 1); if (prev_status.state == RTE_TIMER_RUNNING && lcore_id < RTE_MAX_LCORE) { - priv_timer[lcore_id].updated = 1; + priv_tim->updated = 1; } /* remove it from list */ @@ -410,13 +478,14 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, tim->arg = arg; __TIMER_STAT_ADD(pending, 1); - timer_add(tim, tim_lcore, local_is_locked); + timer_add(tim, tim_lcore, local_is_locked, &pending_lists_index); - /* update state: as we are in CONFIG state, only us can modify + /* update state: as we are in CONFIG state, only we can modify * the state so we don't need to use cmpset() here */ rte_wmb(); status.state = RTE_TIMER_PENDING; - status.owner = (int16_t)tim_lcore; + status.index = pending_lists_index; + status.owner = tim_lcore; tim->status.u32 = status.u32; return 0; @@ -484,7 +553,8 @@ rte_timer_stop(struct rte_timer *tim) /* mark timer as stopped */ rte_wmb(); status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.index = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; tim->status.u32 = status.u32; return 0; @@ -506,119 +576,185 @@ rte_timer_pending(struct rte_timer *tim) } /* must be called periodically, run all timer that expired */ -void rte_timer_manage(void) +void +rte_timer_manage(void) { union rte_timer_status status; - struct rte_timer *tim, *next_tim; - struct rte_timer *run_first_tim, **pprev; - unsigned lcore_id = rte_lcore_id(); + struct rte_timer *tim, *next_tim, **pprev; + struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1]; struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1]; - uint64_t cur_time; - int i, ret; + struct priv_timer *priv_tim; + unsigned int lcore_id = rte_lcore_id(); + uint64_t cur_time, min_expire; + int i, j, min_idx, ret; + int npendlists, nrunlists = 0; + int local_is_locked = 0; + struct skiplist *dest_list, *list = NULL; + bool done; + int idx; /* timer manager only runs on EAL thread with valid lcore_id */ assert(lcore_id < RTE_MAX_LCORE); + priv_tim = &priv_timer[lcore_id]; + __TIMER_STAT_ADD(manage, 1); - /* optimize for the case where per-cpu list is empty */ - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) - return; - cur_time = rte_get_timer_cycles(); + npendlists = (priv_tim->multi_pendlists) ? n_enabled_lcores : 1; + + for (i = 0; i < npendlists; i++) { + idx = (priv_tim->multi_pendlists) ? 
enabled_lcores[i] : + SINGLE_LIST_IDX; + list = &priv_tim->pending_lists[idx]; + + /* optimize for the case where list is empty */ + if (list->head.sl_next[0] == NULL) + continue; + cur_time = rte_get_timer_cycles(); #ifdef RTE_ARCH_X86_64 - /* on 64-bit the value cached in the pending_head.expired will be - * updated atomically, so we can consult that for a quick check here - * outside the lock */ - if (likely(priv_timer[lcore_id].pending_head.expire > cur_time)) - return; + /* on 64-bit the value cached in the list->head.expired will be + * updated atomically, so we can consult that for a quick check + * here outside the lock + */ + if (likely(list->head.expire > cur_time)) + continue; #endif - /* browse ordered list, add expired timers in 'expired' list */ - rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + /* browse ordered list, add expired timers in 'expired' list */ + rte_spinlock_lock(&list->lock); - /* if nothing to do just unlock and return */ - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL || - priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) { - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); - return; - } + /* if nothing to do just unlock and continue */ + if (list->head.sl_next[0] == NULL || + list->head.sl_next[0]->expire > cur_time) { + rte_spinlock_unlock(&list->lock); + continue; + } - /* save start of list of expired timers */ - tim = priv_timer[lcore_id].pending_head.sl_next[0]; + /* save start of list of expired timers */ + tim = list->head.sl_next[0]; + + /* break the existing list at current time point */ + timer_get_prev_entries(cur_time, list, prev); + for (j = list->depth - 1; j >= 0; j--) { + if (prev[j] == &list->head) + continue; + list->head.sl_next[j] = + prev[j]->sl_next[j]; + if (prev[j]->sl_next[j] == NULL) + list->depth--; + prev[j]->sl_next[j] = NULL; + } - /* break the existing list at current time point */ - timer_get_prev_entries(cur_time, lcore_id, prev); - for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) { - if (prev[i] == &priv_timer[lcore_id].pending_head) - continue; - priv_timer[lcore_id].pending_head.sl_next[i] = - prev[i]->sl_next[i]; - if (prev[i]->sl_next[i] == NULL) - priv_timer[lcore_id].curr_skiplist_depth--; - prev[i] ->sl_next[i] = NULL; - } + /* transition run-list from PENDING to RUNNING */ + run_first_tims[nrunlists] = tim; + pprev = &run_first_tims[nrunlists]; + nrunlists++; + + for (; tim != NULL; tim = next_tim) { + next_tim = tim->sl_next[0]; + + ret = timer_set_running_state(tim); + if (likely(ret == 0)) { + pprev = &tim->sl_next[0]; + } else { + /* another core is trying to re-config this one, + * remove it from local expired list + */ + *pprev = next_tim; + } + } - /* transition run-list from PENDING to RUNNING */ - run_first_tim = tim; - pprev = &run_first_tim; + /* update the next to expire timer value */ + list->head.expire = (list->head.sl_next[0] == NULL) ? 
+ 0 : list->head.sl_next[0]->expire; - for ( ; tim != NULL; tim = next_tim) { - next_tim = tim->sl_next[0]; + rte_spinlock_unlock(&list->lock); + } - ret = timer_set_running_state(tim); - if (likely(ret == 0)) { - pprev = &tim->sl_next[0]; - } else { - /* another core is trying to re-config this one, - * remove it from local expired list - */ - *pprev = next_tim; + /* Now process the run lists */ + while (1) { + done = true; + min_expire = UINT64_MAX; + min_idx = 0; + + /* Find the next oldest timer to process */ + for (i = 0; i < nrunlists; i++) { + tim = run_first_tims[i]; + + if (tim != NULL && tim->expire < min_expire) { + min_expire = tim->expire; + min_idx = i; + done = false; + } } - } - /* update the next to expire timer value */ - priv_timer[lcore_id].pending_head.expire = - (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 : - priv_timer[lcore_id].pending_head.sl_next[0]->expire; + if (done) + break; + + tim = run_first_tims[min_idx]; - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + /* Move down the runlist from which we picked a timer to + * execute + */ + run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0]; - /* now scan expired list and call callbacks */ - for (tim = run_first_tim; tim != NULL; tim = next_tim) { - next_tim = tim->sl_next[0]; - priv_timer[lcore_id].updated = 0; - priv_timer[lcore_id].running_tim = tim; + priv_tim->updated = 0; + priv_tim->running_tim = tim; /* execute callback function with list unlocked */ tim->f(tim, tim->arg); __TIMER_STAT_ADD(pending, -1); /* the timer was stopped or reloaded by the callback - * function, we have nothing to do here */ - if (priv_timer[lcore_id].updated == 1) + * function, we have nothing to do here + */ + if (priv_tim->updated == 1) continue; if (tim->period == 0) { /* remove from done list and mark timer as stopped */ status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.index = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; rte_wmb(); tim->status.u32 = status.u32; } else { - /* keep it in list and mark timer as pending */ - rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + idx = (priv_tim->multi_pendlists) ? lcore_id : + SINGLE_LIST_IDX; + dest_list = &priv_tim->pending_lists[idx]; + + /* If the destination list is the current list + * we can acquire the lock here, and hold it + * across removal and insertion of the timer. + */ + if (list == dest_list) { + rte_spinlock_lock(&list->lock); + local_is_locked = 1; + } + + /* set timer state back to PENDING and + * reinsert it in pending list + */ status.state = RTE_TIMER_PENDING; __TIMER_STAT_ADD(pending, 1); - status.owner = (int16_t)lcore_id; + status.index = tim->status.index; + status.owner = lcore_id; rte_wmb(); tim->status.u32 = status.u32; - __rte_timer_reset(tim, tim->expire + tim->period, - tim->period, lcore_id, tim->f, tim->arg, 1); - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + + __rte_timer_reset(tim, + tim->expire + tim->period, + tim->period, lcore_id, + tim->f, tim->arg, local_is_locked); + + if (local_is_locked) { + rte_spinlock_unlock(&list->lock); + local_is_locked = 0; + } } } - priv_timer[lcore_id].running_tim = NULL; + priv_tim->running_tim = NULL; } /* dump statistics about timers */ diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h index a276a73..5870b4a 100644 --- a/lib/librte_timer/rte_timer.h +++ b/lib/librte_timer/rte_timer.h @@ -77,7 +77,7 @@ extern "C" { #define RTE_TIMER_RUNNING 2 /**< State: timer function is running. 
*/ #define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */ -#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */ +#define RTE_TIMER_NO_LCORE -2 /** * Timer type: Periodic or single (one-shot). @@ -94,10 +94,11 @@ enum rte_timer_type { union rte_timer_status { RTE_STD_C11 struct { - uint16_t state; /**< Stop, pending, running, config. */ - int16_t owner; /**< The lcore that owns the timer. */ + unsigned state : 2; + int index : 15; + int owner : 15; }; - uint32_t u32; /**< To atomic-set status + owner. */ + uint32_t u32; /**< To atomic-set status, index, owner */ }; #ifdef RTE_LIBRTE_TIMER_DEBUG @@ -168,6 +169,26 @@ struct rte_timer void rte_timer_subsystem_init(void); /** + * Enable multiple pending lists for specified lcore ID. + * + * This function enables the use of per-installer pending lists for a + * target lcore. When timers are being installed from multiple + * lcores simultaneously, this option can increase performance by + * reducing contention for the same pending list. + * + * This function should be called after the timer subsystem has been + * initialized, and before any timers have been started on the desired + * lcore. + * + * @param lcore_id + * The lcore on which to enable multiple pending lists + * @return + * - 0: Success + * - (-1): Failure; pending timers already exist on the default list. + */ +int rte_timer_subsystem_set_multi_pendlists(unsigned int lcore_id); + +/** * Initialize a timer handle. * * The rte_timer_init() function initializes the timer handle *tim* diff --git a/lib/librte_timer/rte_timer_version.map b/lib/librte_timer/rte_timer_version.map index 9b2e4b8..7d4cbe8 100644 --- a/lib/librte_timer/rte_timer_version.map +++ b/lib/librte_timer/rte_timer_version.map @@ -13,3 +13,9 @@ DPDK_2.0 { local: *; }; + +DPDK_17.11 { + global: + + rte_timer_subsystem_set_multi_pendlists; +} DPDK_2.0; -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
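A note on the widened status word in the header change above: packing state, pending-list index, and owner into one 32-bit value is what lets a single compare-and-set transition all three consistently. The following standalone sketch shows that pattern with C11 atomics; the library itself uses rte_atomic32_cmpset, and the constants mirror RTE_TIMER_STOP/RTE_TIMER_CONFIG and RTE_TIMER_NO_LCORE.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    union status {
        struct {
            unsigned int state : 2;  /* stop/pending/running/config */
            int index : 15;          /* pending-list slot, or -2 */
            int owner : 15;          /* owning lcore, or -2 */
        };
        uint32_t u32;                /* the unit of atomic update */
    };

    /* Atomically claim a stopped timer for configuration. */
    static bool
    claim_for_config(_Atomic uint32_t *status_word, unsigned int lcore)
    {
        union status prev, next;

        prev.u32 = atomic_load(status_word);
        if (prev.state != 0)         /* not in the STOP state */
            return false;

        next = prev;
        next.state = 3;              /* CONFIG */
        next.index = -2;             /* no list yet */
        next.owner = (int)lcore;

        /* Fails if another core changed the word in the meantime. */
        return atomic_compare_exchange_strong(status_word,
                                              &prev.u32, next.u32);
    }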
* [dpdk-dev] [PATCH v3 2/3] timer: handle timers installed from non-EAL threads 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo @ 2017-09-13 22:05 ` Erik Gabriel Carrillo 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs Erik Gabriel Carrillo 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo 3 siblings, 0 replies; 30+ messages in thread From: Erik Gabriel Carrillo @ 2017-09-13 22:05 UTC (permalink / raw) To: rsanford; +Cc: dev, konstantin.ananyev, stephen, keith.wiles, narender.vangati This commit adds support for timers being created from non-EAL threads; it maps timers from all such threads to lcore id RTE_MAX_LCORE, and puts them all in a corresponding skiplist. Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com> --- V3: * Rebased patch on reworked parent commit v2: * Address checkpatch warnings lib/librte_timer/rte_timer.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c index 1f4141e..c489c5f 100644 --- a/lib/librte_timer/rte_timer.c +++ b/lib/librte_timer/rte_timer.c @@ -66,8 +66,8 @@ struct skiplist { } __rte_cache_aligned; struct priv_timer { - /** one pending list per enabled lcore */ - struct skiplist pending_lists[RTE_MAX_LCORE]; + /** one pending list per lcore, plus one for non-EAL threads */ + struct skiplist pending_lists[RTE_MAX_LCORE + 1]; bool multi_pendlists; /** per-core variable that true if a timer was updated on this @@ -88,7 +88,7 @@ struct priv_timer { static struct priv_timer priv_timer[RTE_MAX_LCORE]; /** cache of IDs of enabled lcores */ -static unsigned int enabled_lcores[RTE_MAX_LCORE]; +static unsigned int enabled_lcores[RTE_MAX_LCORE + 1]; static int n_enabled_lcores; /* when debug is enabled, store some statistics */ @@ -114,12 +114,18 @@ rte_timer_subsystem_init(void) RTE_LCORE_FOREACH(lcore_id) enabled_lcores[n_enabled_lcores++] = lcore_id; + /* To handle timers coming from non-EAL threads */ + enabled_lcores[n_enabled_lcores++] = RTE_MAX_LCORE; + /* since priv_timer is static, it's zeroed by default, so only init some * fields. */ for (i = 0; i < n_enabled_lcores; i++) { lcore_id = enabled_lcores[i]; + /* Don't use this value to index the priv_timer array */ + if (lcore_id == RTE_MAX_LCORE) + continue; priv_tim = &priv_timer[lcore_id]; priv_tim->prev_lcore = lcore_id; @@ -343,7 +349,9 @@ timer_add(struct rte_timer *tim, unsigned int tim_lcore, int local_is_locked, struct priv_timer *priv_tim = &priv_timer[tim_lcore]; if (priv_tim->multi_pendlists) - *pending_lists_idx = lcore_id; + /* Check if timer being installed from non-EAL thread */ + *pending_lists_idx = (lcore_id == LCORE_ID_ANY) ? + RTE_MAX_LCORE : lcore_id; else *pending_lists_idx = SINGLE_LIST_IDX; -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
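The index selection this patch changes can be summarized in a few lines. A condensed sketch follows; it assumes DPDK headers, and relies on the fact that rte_lcore_id() returns LCORE_ID_ANY on non-EAL threads, which is what routes them to the extra slot.

    #include <stdbool.h>
    #include <rte_lcore.h>

    #define SINGLE_LIST_IDX 0

    /* Choose which of the target lcore's pending lists this installer
     * uses. EAL threads get their own slot; all non-EAL threads share
     * the extra slot at index RTE_MAX_LCORE. */
    static unsigned int
    pending_list_index(bool multi_pendlists)
    {
        unsigned int lcore_id = rte_lcore_id();

        if (!multi_pendlists)
            return SINGLE_LIST_IDX;

        return (lcore_id == LCORE_ID_ANY) ? RTE_MAX_LCORE : lcore_id;
    }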
* [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo @ 2017-09-13 22:05 ` Erik Gabriel Carrillo 2017-09-18 16:19 ` Mcnamara, John 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo 3 siblings, 1 reply; 30+ messages in thread From: Erik Gabriel Carrillo @ 2017-09-13 22:05 UTC (permalink / raw) To: rsanford; +Cc: dev, konstantin.ananyev, stephen, keith.wiles, narender.vangati This change updates the timer library documentation to reflect a change to the organization of the skiplists in the implementation. Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com> --- v3 * Updated implementation details section of timer_lib.rst to reflect the addition of the option to use multiple pending timer lists per lcore. * Updated release notes to reflect the addition of new function in timer lib API. doc/guides/prog_guide/timer_lib.rst | 27 +++++++++++++++++---------- doc/guides/rel_notes/release_17_11.rst | 7 +++++++ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/doc/guides/prog_guide/timer_lib.rst b/doc/guides/prog_guide/timer_lib.rst index f437417..dfabf24 100644 --- a/doc/guides/prog_guide/timer_lib.rst +++ b/doc/guides/prog_guide/timer_lib.rst @@ -1,5 +1,5 @@ .. BSD LICENSE - Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + Copyright(c) 2010-2017 Intel Corporation. All rights reserved. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -53,16 +53,19 @@ Refer to the `callout manual <http://www.daemon-systems.org/man/callout.9.html>` Implementation Details ---------------------- -Timers are tracked on a per-lcore basis, -with all pending timers for a core being maintained in order of timer expiry in a skiplist data structure. -The skiplist used has ten levels and each entry in the table appears in each level with probability ¼^level. +Timers are tracked on a per-lcore basis, with all pending timers for a core being maintained in order of timer +expiry in either a single skiplist data structure or an array of skiplists, depending on whether +the lcore has been configured for multiple pending lists. Multiple pending lists can be enabled when an +application experiences contention for a single list for that lcore; skiplists corresponding to every other +enabled lcore will be created. +Each skiplist data structure has ten levels and each entry in the table appears in each level with probability ¼^level. This means that all entries are present in level 0, 1 in every 4 entries is present at level 1, one in every 16 at level 2 and so on up to level 9. This means that adding and removing entries from the timer list for a core can be done in log(n) time, up to 4^10 entries, that is, approximately 1,000,000 timers per lcore. A timer structure contains a special field called status, -which is a union of a timer state (stopped, pending, running, config) and an owner (lcore id). +which is a union of a timer state (stopped, pending, running, config), an installer (lcore id), and an owner (lcore id). 
Depending on the timer state, we know if a timer is present in a list or not: * STOPPED: no owner, not in a list @@ -77,17 +80,21 @@ Resetting or stopping a timer while it is in a CONFIG or RUNNING state is not al When modifying the state of a timer, a Compare And Swap instruction should be used to guarantee that the status (state+owner) is modified atomically. -Inside the rte_timer_manage() function, -the skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries, -until an entry which has not yet expired has been encountered. -To improve performance in the case where there are entries in the timer list but none of those timers have yet expired, +Inside the rte_timer_manage() function, the timer lists are processed. +If multiple pending lists have been enabled for an lcore, then each skiplist will +be traversed sequentially, and run lists will be broken out and then processed. +If multiple pending lists are not enabled for an lcore, then only a single skiplist will be traversed. +A skiplist is used as a regular list by iterating along the level +0 list, which contains all timer entries, until an entry which has not yet expired has been encountered. +To improve performance in the case where there are entries in a skiplist but none of those timers have yet expired, the expiry time of the first list entry is maintained within the per-core timer list structure itself. On 64-bit platforms, this value can be checked without the need to take a lock on the overall structure. (Since expiry times are maintained as 64-bit values, a check on the value cannot be done on 32-bit platforms without using either a compare-and-swap (CAS) instruction or using a lock, so this additional check is skipped in favor of checking as normal once the lock has been taken.) On both 64-bit and 32-bit platforms, -a call to rte_timer_manage() returns without taking a lock in the case where the timer list for the calling core is empty. +rte_timer_manage() can either return or continue on to an lcore's next skiplist without taking a lock in the case where a timer list is empty, +depending on whether or not the lcore has multiple pending lists. Use Cases --------- diff --git a/doc/guides/rel_notes/release_17_11.rst b/doc/guides/rel_notes/release_17_11.rst index 170f4f9..4683cbe 100644 --- a/doc/guides/rel_notes/release_17_11.rst +++ b/doc/guides/rel_notes/release_17_11.rst @@ -110,6 +110,13 @@ API Changes Also, make sure to start the actual text at the margin. ========================================================= +* **Updated timer library.** + + The timer library has been updated; it can now support multiple timer lists + per lcore where it previously only had one. This functionality is off by + default but can be enabled in cases where contention for a single list is + an issue with the new function ``rte_timer_subsystem_set_multi_pendlists()``. + ABI Changes ----------- -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
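On the ¼^level occupancy the documentation above describes: one common way to draw a level with that distribution is to keep promoting an entry while a pair of random bits comes up zero, i.e. with probability 1/4 per step. The sketch below shows such a draw; it is not the library's exact timer_get_skiplist_level(), and the cap against curr_depth is an assumption mirroring the constraint that the list grows at most one level at a time.

    #include <stdlib.h>

    #define MAX_SKIPLIST_DEPTH 10

    /* P(level >= l) = 0.25^l: keep promoting while two random bits
     * are both zero; never jump more than one level above the current
     * top of the list. */
    static unsigned int
    skiplist_level(unsigned int curr_depth)
    {
        unsigned int level = 0;

        while ((rand() & 3) == 0 &&
               level < curr_depth &&
               level < MAX_SKIPLIST_DEPTH - 1)
            level++;

        return level;
    }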
* Re: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs Erik Gabriel Carrillo @ 2017-09-18 16:19 ` Mcnamara, John 2017-09-19 17:04 ` Carrillo, Erik G 0 siblings, 1 reply; 30+ messages in thread From: Mcnamara, John @ 2017-09-18 16:19 UTC (permalink / raw) To: Carrillo, Erik G, rsanford Cc: dev, Ananyev, Konstantin, stephen, Wiles, Keith, Vangati, Narender > -----Original Message----- > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Erik Gabriel Carrillo > Sent: Wednesday, September 13, 2017 11:05 PM > To: rsanford@akamai.com > Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>; > stephen@networkplumber.org; Wiles, Keith <keith.wiles@intel.com>; Vangati, > Narender <narender.vangati@intel.com> > Subject: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs > > This change updates the timer library documentation to reflect a change to > the organization of the skiplists in the implementation. > > Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com> > +Each skiplist data structure has ten levels and each entry in the table > appears in each level with probability ¼^level. Probably best not to use that Unicode (or other) 1/4 character, since it can mess up the PDF output. Use "0.25^level" instead. Apart from that: Acked-by: John McNamara <john.mcnamara@intel.com> ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs 2017-09-18 16:19 ` Mcnamara, John @ 2017-09-19 17:04 ` Carrillo, Erik G 0 siblings, 0 replies; 30+ messages in thread From: Carrillo, Erik G @ 2017-09-19 17:04 UTC (permalink / raw) To: Mcnamara, John, rsanford Cc: dev, Ananyev, Konstantin, stephen, Wiles, Keith, Vangati, Narender Thanks for the review, John. I've submitted the change you suggest in a new patch series. Regards, Gabriel > -----Original Message----- > From: Mcnamara, John > Sent: Monday, September 18, 2017 11:20 AM > To: Carrillo, Erik G <erik.g.carrillo@intel.com>; rsanford@akamai.com > Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>; > stephen@networkplumber.org; Wiles, Keith <keith.wiles@intel.com>; > Vangati, Narender <narender.vangati@intel.com> > Subject: RE: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs > > > > > -----Original Message----- > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Erik Gabriel > > Carrillo > > Sent: Wednesday, September 13, 2017 11:05 PM > > To: rsanford@akamai.com > > Cc: dev@dpdk.org; Ananyev, Konstantin > <konstantin.ananyev@intel.com>; > > stephen@networkplumber.org; Wiles, Keith <keith.wiles@intel.com>; > > Vangati, Narender <narender.vangati@intel.com> > > Subject: [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs > > > > This change updates the timer library documentation to reflect a > > change to the organization of the skiplists in the implementation. > > > > Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com> > > > > +Each skiplist data structure has ten levels and each entry in the > > +table > > appears in each level with probability ¼^level. > > Probably best not to use that Unicode, or otherwise, 1/4 character since it > can mess up the PDF output. Use "0.25^level" instead. > > > Apart from that: > > Acked-by: John McNamara <john.mcnamara@intel.com> ^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v4 0/3] timer library enhancement 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 0/3] timer library enhancement Erik Gabriel Carrillo ` (2 preceding siblings ...) 2017-09-13 22:05 ` [dpdk-dev] [PATCH v3 3/3] doc: update timer lib docs Erik Gabriel Carrillo @ 2017-09-19 17:02 ` Erik Gabriel Carrillo 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo ` (3 more replies) 3 siblings, 4 replies; 30+ messages in thread From: Erik Gabriel Carrillo @ 2017-09-19 17:02 UTC (permalink / raw) To: rsanford; +Cc: dev, john.mcnamara In the current implementation of the DPDK timer library, timers can be created and set to be handled by a target lcore by adding it to a skiplist that corresponds to that lcore. However, if an application enables multiple lcores, and each of these lcores repeatedly attempts to install timers on the same target lcore, overall application throughput will be reduced as all lcores contend to acquire the lock guarding the single skiplist of pending timers. This patchset addresses this scenario by adding an option to enable an array of skiplists in each lcore's priv_timer struct. Then, when lcore i installs a timer on lcore k, the timer will be added to the ith skiplist for lcore k. If lcore j installs a timer on lcore k simultaneously, lcores i and j can both proceed since they will be acquiring different locks for different lists. This functionality is off by default, and can be enabled via a new function. When lcore k processes its pending timers, if the multiple pending list option is enabled, it will traverse skiplists in its array and acquire the current skiplist's lock while a run list is broken out; meanwhile, all other lists can continue to be modified. Then, all run lists for lcore k are collected and traversed together so timers are executed in their relative order. If the multiple pending list option is not enabled (the default), only a single list will be traversed, as before. Erik Gabriel Carrillo (3): timer: add multiple pending lists option for each lcore timer: handle timers installed from non-EAL threads doc: update timer lib docs doc/guides/prog_guide/timer_lib.rst | 31 +-- doc/guides/rel_notes/release_17_11.rst | 7 + lib/librte_timer/rte_timer.c | 384 ++++++++++++++++++++++----------- lib/librte_timer/rte_timer.h | 29 ++- lib/librte_timer/rte_timer_version.map | 6 + 5 files changed, 321 insertions(+), 136 deletions(-) -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
* [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo @ 2017-09-19 17:02 ` Erik Gabriel Carrillo 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo ` (2 subsequent siblings) 3 siblings, 0 replies; 30+ messages in thread From: Erik Gabriel Carrillo @ 2017-09-19 17:02 UTC (permalink / raw) To: rsanford; +Cc: dev, john.mcnamara, Erik Carrillo From: Erik Carrillo <erik.g.carrillo@intel.com> This commit adds support for enabling multiple pending timer lists in each lcore's priv_timer struct with a new function; a single list is still used by default. In the case that multiple lcores repeatedly install timers on the same target lcore, this option reduces lock contention for the target lcore's skiplists and increases performance. Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com> --- v3: * Added ability to enable multiple pending lists for a specified lcore with a new function, in response to feedback from Keith Wiles, Konstantin Ananyev, and Stephen Hemminger. This functionality is off by default, so that new overhead in rte_timer_manage() is minimized if the application doesn't want this feature. * Updated map file to reflect addition of new function. * Made minor updates to some variable names and logic structure. v2: * Address checkpatch warnings lib/librte_timer/rte_timer.c | 376 ++++++++++++++++++++++----------- lib/librte_timer/rte_timer.h | 29 ++- lib/librte_timer/rte_timer_version.map | 6 + 3 files changed, 287 insertions(+), 124 deletions(-) diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c index 5ee0840..1f4141e 100644 --- a/lib/librte_timer/rte_timer.c +++ b/lib/librte_timer/rte_timer.c @@ -37,6 +37,7 @@ #include <inttypes.h> #include <assert.h> #include <sys/queue.h> +#include <stdbool.h> #include <rte_atomic.h> #include <rte_common.h> @@ -54,19 +55,24 @@ #include "rte_timer.h" +#define SINGLE_LIST_IDX 0 + LIST_HEAD(rte_timer_list, rte_timer); +struct skiplist { + struct rte_timer head; /**< dummy timer instance to head up list */ + rte_spinlock_t lock; /**< lock to protect list access */ + unsigned int depth; /**< track the current depth of the skiplist */ +} __rte_cache_aligned; + struct priv_timer { - struct rte_timer pending_head; /**< dummy timer instance to head up list */ - rte_spinlock_t list_lock; /**< lock to protect list access */ + /** one pending list per enabled lcore */ + struct skiplist pending_lists[RTE_MAX_LCORE]; + bool multi_pendlists; /** per-core variable that true if a timer was updated on this * core since last reset of the variable */ int updated; - - /** track the current depth of the skiplist */ - unsigned curr_skiplist_depth; - unsigned prev_lcore; /**< used for lcore round robin */ /** running timer on this lcore now */ @@ -81,6 +87,10 @@ struct priv_timer { /** per-lcore private info for timers */ static struct priv_timer priv_timer[RTE_MAX_LCORE]; +/** cache of IDs of enabled lcores */ +static unsigned int enabled_lcores[RTE_MAX_LCORE]; +static int n_enabled_lcores; + /* when debug is enabled, store some statistics */ #ifdef RTE_LIBRTE_TIMER_DEBUG #define __TIMER_STAT_ADD(name, n) do { \ @@ -96,17 +106,60 @@ static struct priv_timer priv_timer[RTE_MAX_LCORE]; void rte_timer_subsystem_init(void) { - unsigned lcore_id; + unsigned int lcore_id, pending_lists_idx; + struct skiplist *list; + struct priv_timer *priv_tim; + int i, j; + 
+ RTE_LCORE_FOREACH(lcore_id) + enabled_lcores[n_enabled_lcores++] = lcore_id; /* since priv_timer is static, it's zeroed by default, so only init some * fields. */ - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) { - rte_spinlock_init(&priv_timer[lcore_id].list_lock); - priv_timer[lcore_id].prev_lcore = lcore_id; + for (i = 0; i < n_enabled_lcores; i++) { + lcore_id = enabled_lcores[i]; + + + priv_tim = &priv_timer[lcore_id]; + priv_tim->prev_lcore = lcore_id; + priv_tim->multi_pendlists = false; + + for (j = 0; j < n_enabled_lcores; j++) { + pending_lists_idx = enabled_lcores[j]; + list = &priv_tim->pending_lists[pending_lists_idx]; + rte_spinlock_init(&list->lock); + } + + /* Init the list at SINGLE_LIST_IDX, in case we didn't get to + * it above. This will be the skiplist we use unless the + * multi-pendlists option is later enabled for this lcore. + */ + list = &priv_tim->pending_lists[SINGLE_LIST_IDX]; + rte_spinlock_init(&list->lock); } } +int +rte_timer_subsystem_set_multi_pendlists(unsigned int lcore_id) +{ + struct priv_timer *priv_tim = &priv_timer[lcore_id]; + struct skiplist *list = &priv_tim->pending_lists[SINGLE_LIST_IDX]; + + rte_spinlock_lock(&list->lock); + + if (list->head.sl_next[0] != NULL) { + /* Timers are already on the default pending list. */ + rte_spinlock_unlock(&list->lock); + return -1; + } + + priv_tim->multi_pendlists = true; + rte_spinlock_unlock(&list->lock); + + return 0; +} + /* Initialize the timer handle tim for use */ void rte_timer_init(struct rte_timer *tim) @@ -114,7 +167,8 @@ rte_timer_init(struct rte_timer *tim) union rte_timer_status status; status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.index = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; tim->status.u32 = status.u32; } @@ -142,7 +196,7 @@ timer_set_config_state(struct rte_timer *tim, * or ready to run on local core, exit */ if (prev_status.state == RTE_TIMER_RUNNING && - (prev_status.owner != (uint16_t)lcore_id || + (prev_status.owner != (int)lcore_id || tim != priv_timer[lcore_id].running_tim)) return -1; @@ -153,7 +207,8 @@ timer_set_config_state(struct rte_timer *tim, /* here, we know that timer is stopped or pending, * mark it atomically as being configured */ status.state = RTE_TIMER_CONFIG; - status.owner = (int16_t)lcore_id; + status.index = RTE_TIMER_NO_LCORE; + status.owner = lcore_id; success = rte_atomic32_cmpset(&tim->status.u32, prev_status.u32, status.u32); @@ -185,7 +240,8 @@ timer_set_running_state(struct rte_timer *tim) /* here, we know that timer is stopped or pending, * mark it atomically as being configured */ status.state = RTE_TIMER_RUNNING; - status.owner = (int16_t)lcore_id; + status.index = prev_status.index; + status.owner = lcore_id; success = rte_atomic32_cmpset(&tim->status.u32, prev_status.u32, status.u32); @@ -236,11 +292,11 @@ timer_get_skiplist_level(unsigned curr_depth) * are <= that time value. */ static void -timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, +timer_get_prev_entries(uint64_t time_val, struct skiplist *list, struct rte_timer **prev) { - unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth; - prev[lvl] = &priv_timer[tim_lcore].pending_head; + unsigned int lvl = list->depth; + prev[lvl] = &list->head; while(lvl != 0) { lvl--; prev[lvl] = prev[lvl+1]; @@ -255,15 +311,15 @@ timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, * all skiplist levels. 
*/ static void -timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore, +timer_get_prev_entries_for_node(struct rte_timer *tim, struct skiplist *list, struct rte_timer **prev) { int i; /* to get a specific entry in the list, look for just lower than the time * values, and then increment on each level individually if necessary */ - timer_get_prev_entries(tim->expire - 1, tim_lcore, prev); - for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) { + timer_get_prev_entries(tim->expire - 1, list, prev); + for (i = list->depth - 1; i >= 0; i--) { while (prev[i]->sl_next[i] != NULL && prev[i]->sl_next[i] != tim && prev[i]->sl_next[i]->expire <= tim->expire) @@ -277,30 +333,39 @@ timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore, * timer must not be in a list */ static void -timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) +timer_add(struct rte_timer *tim, unsigned int tim_lcore, int local_is_locked, + int *pending_lists_idx) { unsigned lcore_id = rte_lcore_id(); unsigned lvl; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + struct skiplist *list; + struct priv_timer *priv_tim = &priv_timer[tim_lcore]; + + if (priv_tim->multi_pendlists) + *pending_lists_idx = lcore_id; + else + *pending_lists_idx = SINGLE_LIST_IDX; + + list = &priv_tim->pending_lists[*pending_lists_idx]; /* if timer needs to be scheduled on another core, we need to * lock the list; if it is on local core, we need to lock if * we are not called from rte_timer_manage() */ if (tim_lcore != lcore_id || !local_is_locked) - rte_spinlock_lock(&priv_timer[tim_lcore].list_lock); + rte_spinlock_lock(&list->lock); /* find where exactly this element goes in the list of elements * for each depth. */ - timer_get_prev_entries(tim->expire, tim_lcore, prev); + timer_get_prev_entries(tim->expire, list, prev); /* now assign it a new level and add at that level */ - const unsigned tim_level = timer_get_skiplist_level( - priv_timer[tim_lcore].curr_skiplist_depth); - if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth) - priv_timer[tim_lcore].curr_skiplist_depth++; + const unsigned int tim_level = timer_get_skiplist_level(list->depth); + if (tim_level == list->depth) + list->depth++; lvl = tim_level; - while (lvl > 0) { + while (lvl > 0 && lvl < MAX_SKIPLIST_DEPTH + 1) { tim->sl_next[lvl] = prev[lvl]->sl_next[lvl]; prev[lvl]->sl_next[lvl] = tim; lvl--; @@ -310,11 +375,10 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) /* save the lowest list entry into the expire field of the dummy hdr * NOTE: this is not atomic on 32-bit*/ - priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\ - pending_head.sl_next[0]->expire; + list->head.expire = list->head.sl_next[0]->expire; if (tim_lcore != lcore_id || !local_is_locked) - rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock); + rte_spinlock_unlock(&list->lock); } /* @@ -330,35 +394,38 @@ timer_del(struct rte_timer *tim, union rte_timer_status prev_status, unsigned prev_owner = prev_status.owner; int i; struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + struct skiplist *list; + + list = &priv_timer[prev_owner].pending_lists[prev_status.index]; /* if timer needs is pending another core, we need to lock the * list; if it is on local core, we need to lock if we are not * called from rte_timer_manage() */ if (prev_owner != lcore_id || !local_is_locked) - rte_spinlock_lock(&priv_timer[prev_owner].list_lock); + rte_spinlock_lock(&list->lock); /* save the lowest list entry into the expire 
field of the dummy hdr. * NOTE: this is not atomic on 32-bit */ - if (tim == priv_timer[prev_owner].pending_head.sl_next[0]) - priv_timer[prev_owner].pending_head.expire = - ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire); + if (tim == list->head.sl_next[0]) + list->head.expire = ((tim->sl_next[0] == NULL) ? + 0 : tim->sl_next[0]->expire); /* adjust pointers from previous entries to point past this */ - timer_get_prev_entries_for_node(tim, prev_owner, prev); - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) { + timer_get_prev_entries_for_node(tim, list, prev); + for (i = list->depth - 1; i >= 0; i--) { if (prev[i]->sl_next[i] == tim) prev[i]->sl_next[i] = tim->sl_next[i]; } /* in case we deleted last entry at a level, adjust down max level */ - for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) - if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL) - priv_timer[prev_owner].curr_skiplist_depth --; + for (i = list->depth - 1; i >= 0; i--) + if (list->head.sl_next[i] == NULL) + list->depth--; else break; if (prev_owner != lcore_id || !local_is_locked) - rte_spinlock_unlock(&priv_timer[prev_owner].list_lock); + rte_spinlock_unlock(&list->lock); } /* Reset and start the timer associated with the timer handle (private func) */ @@ -371,15 +438,16 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, union rte_timer_status prev_status, status; int ret; unsigned lcore_id = rte_lcore_id(); + struct priv_timer *priv_tim = &priv_timer[lcore_id]; + int pending_lists_index; /* round robin for tim_lcore */ if (tim_lcore == (unsigned)LCORE_ID_ANY) { if (lcore_id < RTE_MAX_LCORE) { /* EAL thread with valid lcore_id */ - tim_lcore = rte_get_next_lcore( - priv_timer[lcore_id].prev_lcore, - 0, 1); - priv_timer[lcore_id].prev_lcore = tim_lcore; + tim_lcore = rte_get_next_lcore(priv_tim->prev_lcore, 0, + 1); + priv_tim->prev_lcore = tim_lcore; } else /* non-EAL thread do not run rte_timer_manage(), * so schedule the timer on the first enabled lcore. 
*/ @@ -395,7 +463,7 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, __TIMER_STAT_ADD(reset, 1); if (prev_status.state == RTE_TIMER_RUNNING && lcore_id < RTE_MAX_LCORE) { - priv_timer[lcore_id].updated = 1; + priv_tim->updated = 1; } /* remove it from list */ @@ -410,13 +478,14 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire, tim->arg = arg; __TIMER_STAT_ADD(pending, 1); - timer_add(tim, tim_lcore, local_is_locked); + timer_add(tim, tim_lcore, local_is_locked, &pending_lists_index); - /* update state: as we are in CONFIG state, only us can modify + /* update state: as we are in CONFIG state, only we can modify * the state so we don't need to use cmpset() here */ rte_wmb(); status.state = RTE_TIMER_PENDING; - status.owner = (int16_t)tim_lcore; + status.index = pending_lists_index; + status.owner = tim_lcore; tim->status.u32 = status.u32; return 0; @@ -484,7 +553,8 @@ rte_timer_stop(struct rte_timer *tim) /* mark timer as stopped */ rte_wmb(); status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.index = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; tim->status.u32 = status.u32; return 0; @@ -506,119 +576,185 @@ rte_timer_pending(struct rte_timer *tim) } /* must be called periodically, run all timer that expired */ -void rte_timer_manage(void) +void +rte_timer_manage(void) { union rte_timer_status status; - struct rte_timer *tim, *next_tim; - struct rte_timer *run_first_tim, **pprev; - unsigned lcore_id = rte_lcore_id(); + struct rte_timer *tim, *next_tim, **pprev; + struct rte_timer *run_first_tims[RTE_MAX_LCORE + 1]; struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1]; - uint64_t cur_time; - int i, ret; + struct priv_timer *priv_tim; + unsigned int lcore_id = rte_lcore_id(); + uint64_t cur_time, min_expire; + int i, j, min_idx, ret; + int npendlists, nrunlists = 0; + int local_is_locked = 0; + struct skiplist *dest_list, *list = NULL; + bool done; + int idx; /* timer manager only runs on EAL thread with valid lcore_id */ assert(lcore_id < RTE_MAX_LCORE); + priv_tim = &priv_timer[lcore_id]; + __TIMER_STAT_ADD(manage, 1); - /* optimize for the case where per-cpu list is empty */ - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) - return; - cur_time = rte_get_timer_cycles(); + npendlists = (priv_tim->multi_pendlists) ? n_enabled_lcores : 1; + + for (i = 0; i < npendlists; i++) { + idx = (priv_tim->multi_pendlists) ? 
enabled_lcores[i] : + SINGLE_LIST_IDX; + list = &priv_tim->pending_lists[idx]; + + /* optimize for the case where list is empty */ + if (list->head.sl_next[0] == NULL) + continue; + cur_time = rte_get_timer_cycles(); #ifdef RTE_ARCH_X86_64 - /* on 64-bit the value cached in the pending_head.expired will be - * updated atomically, so we can consult that for a quick check here - * outside the lock */ - if (likely(priv_timer[lcore_id].pending_head.expire > cur_time)) - return; + /* on 64-bit the value cached in the list->head.expired will be + * updated atomically, so we can consult that for a quick check + * here outside the lock + */ + if (likely(list->head.expire > cur_time)) + continue; #endif - /* browse ordered list, add expired timers in 'expired' list */ - rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + /* browse ordered list, add expired timers in 'expired' list */ + rte_spinlock_lock(&list->lock); - /* if nothing to do just unlock and return */ - if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL || - priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) { - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); - return; - } + /* if nothing to do just unlock and continue */ + if (list->head.sl_next[0] == NULL || + list->head.sl_next[0]->expire > cur_time) { + rte_spinlock_unlock(&list->lock); + continue; + } - /* save start of list of expired timers */ - tim = priv_timer[lcore_id].pending_head.sl_next[0]; + /* save start of list of expired timers */ + tim = list->head.sl_next[0]; + + /* break the existing list at current time point */ + timer_get_prev_entries(cur_time, list, prev); + for (j = list->depth - 1; j >= 0; j--) { + if (prev[j] == &list->head) + continue; + list->head.sl_next[j] = + prev[j]->sl_next[j]; + if (prev[j]->sl_next[j] == NULL) + list->depth--; + prev[j]->sl_next[j] = NULL; + } - /* break the existing list at current time point */ - timer_get_prev_entries(cur_time, lcore_id, prev); - for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) { - if (prev[i] == &priv_timer[lcore_id].pending_head) - continue; - priv_timer[lcore_id].pending_head.sl_next[i] = - prev[i]->sl_next[i]; - if (prev[i]->sl_next[i] == NULL) - priv_timer[lcore_id].curr_skiplist_depth--; - prev[i] ->sl_next[i] = NULL; - } + /* transition run-list from PENDING to RUNNING */ + run_first_tims[nrunlists] = tim; + pprev = &run_first_tims[nrunlists]; + nrunlists++; + + for (; tim != NULL; tim = next_tim) { + next_tim = tim->sl_next[0]; + + ret = timer_set_running_state(tim); + if (likely(ret == 0)) { + pprev = &tim->sl_next[0]; + } else { + /* another core is trying to re-config this one, + * remove it from local expired list + */ + *pprev = next_tim; + } + } - /* transition run-list from PENDING to RUNNING */ - run_first_tim = tim; - pprev = &run_first_tim; + /* update the next to expire timer value */ + list->head.expire = (list->head.sl_next[0] == NULL) ? 
+ 0 : list->head.sl_next[0]->expire; - for ( ; tim != NULL; tim = next_tim) { - next_tim = tim->sl_next[0]; + rte_spinlock_unlock(&list->lock); + } - ret = timer_set_running_state(tim); - if (likely(ret == 0)) { - pprev = &tim->sl_next[0]; - } else { - /* another core is trying to re-config this one, - * remove it from local expired list - */ - *pprev = next_tim; + /* Now process the run lists */ + while (1) { + done = true; + min_expire = UINT64_MAX; + min_idx = 0; + + /* Find the next oldest timer to process */ + for (i = 0; i < nrunlists; i++) { + tim = run_first_tims[i]; + + if (tim != NULL && tim->expire < min_expire) { + min_expire = tim->expire; + min_idx = i; + done = false; + } } - } - /* update the next to expire timer value */ - priv_timer[lcore_id].pending_head.expire = - (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 : - priv_timer[lcore_id].pending_head.sl_next[0]->expire; + if (done) + break; + + tim = run_first_tims[min_idx]; - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + /* Move down the runlist from which we picked a timer to + * execute + */ + run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0]; - /* now scan expired list and call callbacks */ - for (tim = run_first_tim; tim != NULL; tim = next_tim) { - next_tim = tim->sl_next[0]; - priv_timer[lcore_id].updated = 0; - priv_timer[lcore_id].running_tim = tim; + priv_tim->updated = 0; + priv_tim->running_tim = tim; /* execute callback function with list unlocked */ tim->f(tim, tim->arg); __TIMER_STAT_ADD(pending, -1); /* the timer was stopped or reloaded by the callback - * function, we have nothing to do here */ - if (priv_timer[lcore_id].updated == 1) + * function, we have nothing to do here + */ + if (priv_tim->updated == 1) continue; if (tim->period == 0) { /* remove from done list and mark timer as stopped */ status.state = RTE_TIMER_STOP; - status.owner = RTE_TIMER_NO_OWNER; + status.index = RTE_TIMER_NO_LCORE; + status.owner = RTE_TIMER_NO_LCORE; rte_wmb(); tim->status.u32 = status.u32; } else { - /* keep it in list and mark timer as pending */ - rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + idx = (priv_tim->multi_pendlists) ? lcore_id : + SINGLE_LIST_IDX; + dest_list = &priv_tim->pending_lists[idx]; + + /* If the destination list is the current list + * we can acquire the lock here, and hold it + * across removal and insertion of the timer. + */ + if (list == dest_list) { + rte_spinlock_lock(&list->lock); + local_is_locked = 1; + } + + /* set timer state back to PENDING and + * reinsert it in pending list + */ status.state = RTE_TIMER_PENDING; __TIMER_STAT_ADD(pending, 1); - status.owner = (int16_t)lcore_id; + status.index = tim->status.index; + status.owner = lcore_id; rte_wmb(); tim->status.u32 = status.u32; - __rte_timer_reset(tim, tim->expire + tim->period, - tim->period, lcore_id, tim->f, tim->arg, 1); - rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + + __rte_timer_reset(tim, + tim->expire + tim->period, + tim->period, lcore_id, + tim->f, tim->arg, local_is_locked); + + if (local_is_locked) { + rte_spinlock_unlock(&list->lock); + local_is_locked = 0; + } } } - priv_timer[lcore_id].running_tim = NULL; + priv_tim->running_tim = NULL; } /* dump statistics about timers */ diff --git a/lib/librte_timer/rte_timer.h b/lib/librte_timer/rte_timer.h index a276a73..5870b4a 100644 --- a/lib/librte_timer/rte_timer.h +++ b/lib/librte_timer/rte_timer.h @@ -77,7 +77,7 @@ extern "C" { #define RTE_TIMER_RUNNING 2 /**< State: timer function is running. 
*/ #define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */ -#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */ +#define RTE_TIMER_NO_LCORE -2 /** * Timer type: Periodic or single (one-shot). @@ -94,10 +94,11 @@ enum rte_timer_type { union rte_timer_status { RTE_STD_C11 struct { - uint16_t state; /**< Stop, pending, running, config. */ - int16_t owner; /**< The lcore that owns the timer. */ + unsigned state : 2; + int index : 15; + int owner : 15; }; - uint32_t u32; /**< To atomic-set status + owner. */ + uint32_t u32; /**< To atomic-set status, index, owner */ }; #ifdef RTE_LIBRTE_TIMER_DEBUG @@ -168,6 +169,26 @@ struct rte_timer void rte_timer_subsystem_init(void); /** + * Enable multiple pending lists for specified lcore ID. + * + * This function enables the use of per-installer pending lists for a + * target lcore. When timers are being installed from multiple + * lcores simultaneously, this option can increase performance by + * reducing contention for the same pending list. + * + * This function should be called after the timer subsystem has been + * initialized, and before any timers have been started on the desired + * lcore. + * + * @param lcore_id + * The lcore on which to enable multiple pending lists + * @return + * - 0: Success + * - (-1): Failure; pending timers already exist on the default list. + */ +int rte_timer_subsystem_set_multi_pendlists(unsigned int lcore_id); + +/** * Initialize a timer handle. * * The rte_timer_init() function initializes the timer handle *tim* diff --git a/lib/librte_timer/rte_timer_version.map b/lib/librte_timer/rte_timer_version.map index 9b2e4b8..7d4cbe8 100644 --- a/lib/librte_timer/rte_timer_version.map +++ b/lib/librte_timer/rte_timer_version.map @@ -13,3 +13,9 @@ DPDK_2.0 { local: *; }; + +DPDK_17.11 { + global: + + rte_timer_subsystem_set_multi_pendlists; +} DPDK_2.0; -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
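To make the intended usage concrete, the following is a minimal sketch, not part of the patch, of how an application might opt in to multiple pending lists. The target lcore number, callback, and timeout are assumptions for illustration; only rte_timer_subsystem_set_multi_pendlists() is introduced by this patch, and the rest is the existing rte_timer API.

#include <rte_timer.h>
#include <rte_cycles.h>

static struct rte_timer my_tim;

static void
my_cb(struct rte_timer *tim, void *arg)
{
        (void)tim;
        (void)arg;
        /* expiry work would go here */
}

static int
setup_timers(void)
{
        unsigned int target_lcore = 1;  /* assumption: lcore 1 is enabled */

        rte_timer_subsystem_init();

        /* Must run before any timer is started on target_lcore; returns
         * -1 if timers are already pending on its default list.
         */
        if (rte_timer_subsystem_set_multi_pendlists(target_lcore) != 0)
                return -1;

        rte_timer_init(&my_tim);

        /* One-shot timer, roughly one second out, owned by target_lcore */
        return rte_timer_reset(&my_tim, rte_get_timer_hz(), SINGLE,
                               target_lcore, my_cb, NULL);
}

Note that target_lcore must still call rte_timer_manage() in its main loop; enabling multiple pending lists only changes which skiplist an installing lcore has to lock, not how expiry is driven.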
* [dpdk-dev] [PATCH v4 2/3] timer: handle timers installed from non-EAL threads 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo @ 2017-09-19 17:02 ` Erik Gabriel Carrillo 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 3/3] doc: update timer lib docs Erik Gabriel Carrillo 2017-10-11 20:42 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Thomas Monjalon 3 siblings, 0 replies; 30+ messages in thread From: Erik Gabriel Carrillo @ 2017-09-19 17:02 UTC (permalink / raw) To: rsanford; +Cc: dev, john.mcnamara, Erik Carrillo From: Erik Carrillo <erik.g.carrillo@intel.com> This commit adds support for timers being created from non-EAL threads; it maps timers from all such threads to lcore id RTE_MAX_LCORE, and puts them all in a corresponding skiplist. Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com> --- v3: * Rebased patch on reworked parent commit v2: * Address checkpatch warnings lib/librte_timer/rte_timer.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c index 1f4141e..c489c5f 100644 --- a/lib/librte_timer/rte_timer.c +++ b/lib/librte_timer/rte_timer.c @@ -66,8 +66,8 @@ struct skiplist { } __rte_cache_aligned; struct priv_timer { - /** one pending list per enabled lcore */ - struct skiplist pending_lists[RTE_MAX_LCORE]; + /** one pending list per lcore, plus one for non-EAL threads */ + struct skiplist pending_lists[RTE_MAX_LCORE + 1]; bool multi_pendlists; /** per-core variable that true if a timer was updated on this @@ -88,7 +88,7 @@ struct priv_timer { static struct priv_timer priv_timer[RTE_MAX_LCORE]; /** cache of IDs of enabled lcores */ -static unsigned int enabled_lcores[RTE_MAX_LCORE]; +static unsigned int enabled_lcores[RTE_MAX_LCORE + 1]; static int n_enabled_lcores; /* when debug is enabled, store some statistics */ @@ -114,12 +114,18 @@ rte_timer_subsystem_init(void) RTE_LCORE_FOREACH(lcore_id) enabled_lcores[n_enabled_lcores++] = lcore_id; + /* To handle timers coming from non-EAL threads */ + enabled_lcores[n_enabled_lcores++] = RTE_MAX_LCORE; + /* since priv_timer is static, it's zeroed by default, so only init some * fields. */ for (i = 0; i < n_enabled_lcores; i++) { lcore_id = enabled_lcores[i]; + /* Don't use this value to index the priv_timer array */ + if (lcore_id == RTE_MAX_LCORE) + continue; priv_tim = &priv_timer[lcore_id]; priv_tim->prev_lcore = lcore_id; @@ -343,7 +349,9 @@ timer_add(struct rte_timer *tim, unsigned int tim_lcore, int local_is_locked, struct priv_timer *priv_tim = &priv_timer[tim_lcore]; if (priv_tim->multi_pendlists) - *pending_lists_idx = lcore_id; + /* Check if timer being installed from non-EAL thread */ + *pending_lists_idx = (lcore_id == LCORE_ID_ANY) ? + RTE_MAX_LCORE : lcore_id; else *pending_lists_idx = SINGLE_LIST_IDX; -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
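For reference, a hedged sketch of the scenario this patch enables: a plain pthread (non-EAL) control thread arming a timer on an EAL lcore. In such a thread rte_lcore_id() returns LCORE_ID_ANY, so the timer is filed in the target lcore's pending-list slot at index RTE_MAX_LCORE when multi-pendlists is enabled there. The thread function and argument plumbing below are hypothetical.

#include <pthread.h>
#include <rte_lcore.h>
#include <rte_timer.h>
#include <rte_cycles.h>

static struct rte_timer ctrl_tim;

static void
ctrl_cb(struct rte_timer *tim, void *arg)
{
        (void)tim;
        (void)arg;
}

/* Entry point for a thread created with pthread_create(), i.e. not an
 * EAL worker; arg is assumed to point at a valid, enabled lcore id.
 */
static void *
control_thread(void *arg)
{
        unsigned int target_lcore = *(unsigned int *)arg;

        rte_timer_init(&ctrl_tim);

        /* rte_lcore_id() is LCORE_ID_ANY here, so with this patch the
         * timer lands in target_lcore's slot for non-EAL installers.
         * Return value ignored in this sketch; a real caller would
         * retry if the timer was in the RUNNING or CONFIG state.
         */
        (void)rte_timer_reset(&ctrl_tim, rte_get_timer_hz(), PERIODICAL,
                              target_lcore, ctrl_cb, NULL);
        return NULL;
}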
* [dpdk-dev] [PATCH v4 3/3] doc: update timer lib docs 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 1/3] timer: add multiple pending lists option for each lcore Erik Gabriel Carrillo 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 2/3] timer: handle timers installed from non-EAL threads Erik Gabriel Carrillo @ 2017-09-19 17:02 ` Erik Gabriel Carrillo 2017-10-11 20:42 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Thomas Monjalon 3 siblings, 0 replies; 30+ messages in thread From: Erik Gabriel Carrillo @ 2017-09-19 17:02 UTC (permalink / raw) To: rsanford; +Cc: dev, john.mcnamara, Erik Carrillo From: Erik Carrillo <erik.g.carrillo@intel.com> This change updates the timer library documentation to reflect a change to the organization of the skiplists in the implementation. Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com> Acked-by: John McNamara <john.mcnamara@intel.com> --- v4: * Made changes suggested by John Mcnamara[1]. [1] http://dpdk.org/ml/archives/dev/2017-September/075819.html doc/guides/prog_guide/timer_lib.rst | 31 +++++++++++++++++++------------ doc/guides/rel_notes/release_17_11.rst | 7 +++++++ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/doc/guides/prog_guide/timer_lib.rst b/doc/guides/prog_guide/timer_lib.rst index f437417..e1f64ac 100644 --- a/doc/guides/prog_guide/timer_lib.rst +++ b/doc/guides/prog_guide/timer_lib.rst @@ -1,5 +1,5 @@ .. BSD LICENSE - Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + Copyright(c) 2010-2017 Intel Corporation. All rights reserved. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -53,16 +53,19 @@ Refer to the `callout manual <http://www.daemon-systems.org/man/callout.9.html>` Implementation Details ---------------------- -Timers are tracked on a per-lcore basis, -with all pending timers for a core being maintained in order of timer expiry in a skiplist data structure. -The skiplist used has ten levels and each entry in the table appears in each level with probability ¼^level. +Timers are tracked on a per-lcore basis, with all pending timers for a core being maintained in order of timer +expiry in either a single skiplist data structure or an array of skiplists, depending on whether +the lcore has been configured for multiple pending lists. Multiple pending lists can be enabled when an +application experiences contention for a single list for that lcore; skiplists corresponding to every other +enabled lcore will be created. +Each skiplist data structure has ten levels and each entry in the table appears in each level with probability 0.25^level. This means that all entries are present in level 0, 1 in every 4 entries is present at level 1, one in every 16 at level 2 and so on up to level 9. This means that adding and removing entries from the timer list for a core can be done in log(n) time, up to 4^10 entries, that is, approximately 1,000,000 timers per lcore. A timer structure contains a special field called status, -which is a union of a timer state (stopped, pending, running, config) and an owner (lcore id). +which is a union of a timer state (stopped, pending, running, config), an index (lcore id), and an owner (lcore id). 
Depending on the timer state, we know if a timer is present in a list or not: * STOPPED: no owner, not in a list @@ -75,19 +78,23 @@ Depending on the timer state, we know if a timer is present in a list or not: Resetting or stopping a timer while it is in a CONFIG or RUNNING state is not allowed. When modifying the state of a timer, -a Compare And Swap instruction should be used to guarantee that the status (state+owner) is modified atomically. - -Inside the rte_timer_manage() function, -the skiplist is used as a regular list by iterating along the level 0 list, which contains all timer entries, -until an entry which has not yet expired has been encountered. -To improve performance in the case where there are entries in the timer list but none of those timers have yet expired, +a Compare And Swap instruction should be used to guarantee that the status (state+index+owner) is modified atomically. + +Inside the rte_timer_manage() function, the timer lists are processed. +If multiple pending lists have been enabled for an lcore, then each skiplist will +be traversed sequentially, and run lists will be broken out and then processed. +If multiple pending lists are not enabled for an lcore, then only a single skiplist will be traversed. +A skiplist is used as a regular list by iterating along the level +0 list, which contains all timer entries, until an entry which has not yet expired has been encountered. +To improve performance in the case where there are entries in a skiplist but none of those timers have yet expired, the expiry time of the first list entry is maintained within the per-core timer list structure itself. On 64-bit platforms, this value can be checked without the need to take a lock on the overall structure. (Since expiry times are maintained as 64-bit values, a check on the value cannot be done on 32-bit platforms without using either a compare-and-swap (CAS) instruction or using a lock, so this additional check is skipped in favor of checking as normal once the lock has been taken.) On both 64-bit and 32-bit platforms, -a call to rte_timer_manage() returns without taking a lock in the case where the timer list for the calling core is empty. +rte_timer_manage() can either return or continue on to an lcore's next skiplist without taking a lock in the case where a timer list is empty, +depending on whether or not the lcore has multiple pending lists. Use Cases --------- diff --git a/doc/guides/rel_notes/release_17_11.rst b/doc/guides/rel_notes/release_17_11.rst index 170f4f9..4683cbe 100644 --- a/doc/guides/rel_notes/release_17_11.rst +++ b/doc/guides/rel_notes/release_17_11.rst @@ -110,6 +110,13 @@ API Changes Also, make sure to start the actual text at the margin. ========================================================= +* **Updated timer library.** + + The timer library has been updated; it can now support multiple timer lists + per lcore where it previously only had one. This functionality is off by + default but can be enabled in cases where contention for a single list is + an issue with the new function ``rte_timer_subsystem_set_multi_pendlists()``. + ABI Changes ----------- -- 2.6.4 ^ permalink raw reply [flat|nested] 30+ messages in thread
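The 0.25^level distribution described in the updated docs can be illustrated with a short sketch. This is not the library's internal timer_get_skiplist_level(); it is just the same rule in its simplest form, assuming the documented ten levels.

#include <stdlib.h>

#define SKIPLIST_LEVELS 10

/* Illustrative only: promote an entry from level l to level l + 1 with
 * probability 1/4, so a fraction 0.25^l of all entries reaches level l.
 * Capping at curr_depth lets the list grow one level at a time, much as
 * timer_add() does with list->depth.
 */
static unsigned int
pick_level(unsigned int curr_depth)
{
        unsigned int lvl = 0;

        while (lvl < curr_depth && lvl < SKIPLIST_LEVELS - 1 &&
               (rand() & 3) == 0)
                lvl++;

        return lvl;
}

Under this rule roughly one entry in four is promoted to each successive level, which is what gives the documented log(n) insertion cost up to about 4^10 entries, i.e. roughly 1,000,000 timers per list.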
* Re: [dpdk-dev] [PATCH v4 0/3] timer library enhancement 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Erik Gabriel Carrillo ` (2 preceding siblings ...) 2017-09-19 17:02 ` [dpdk-dev] [PATCH v4 3/3] doc: update timer lib docs Erik Gabriel Carrillo @ 2017-10-11 20:42 ` Thomas Monjalon 2018-04-04 22:42 ` Thomas Monjalon 3 siblings, 1 reply; 30+ messages in thread From: Thomas Monjalon @ 2017-10-11 20:42 UTC (permalink / raw) To: rsanford; +Cc: dev, Erik Gabriel Carrillo, john.mcnamara This patchset is waiting for review. Robert, are you available? 19/09/2017 19:02, Erik Gabriel Carrillo: > In the current implementation of the DPDK timer library, timers can be > created and set to be handled by a target lcore by adding it to a > skiplist that corresponds to that lcore. However, if an application > enables multiple lcores, and each of these lcores repeatedly attempts > to install timers on the same target lcore, overall application > throughput will be reduced as all lcores contend to acquire the lock > guarding the single skiplist of pending timers. > > This patchset addresses this scenario by adding an option to enable an > array of skiplists in each lcore's priv_timer struct. Then, when lcore i > installs a timer on lcore k, the timer will be added to the ith skiplist > for lcore k. If lcore j installs a timer on lcore k simultaneously, > lcores i and j can both proceed since they will be acquiring different > locks for different lists. This functionality is off by default, and > can be enabled via a new function. > > When lcore k processes its pending timers, if the multiple pending list > option is enabled, it will traverse skiplists in its array and acquire > the current skiplist's lock while a run list is broken out; meanwhile, > all other lists can continue to be modified. Then, all run lists for > lcore k are collected and traversed together so timers are executed in > their relative order. If the multiple pending list option is not enabled > (the default), only a single list will be traversed, as before. > > Erik Gabriel Carrillo (3): > timer: add multiple pending lists option for each lcore > timer: handle timers installed from non-EAL threads > doc: update timer lib docs ^ permalink raw reply [flat|nested] 30+ messages in thread
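The "collected and traversed together" step described in the cover letter above amounts to a k-way merge over the per-installer run lists. The following is a distilled sketch with hypothetical types; the actual implementation is the min-expire loop in rte_timer_manage() in patch 1/3.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical stand-in for the two fields the merge step needs. */
struct tim {
        uint64_t expire;
        struct tim *next;       /* the level-0 link, i.e. sl_next[0] */
};

/* Run every timer from nrunlists already-broken-out run lists in global
 * expiry order: repeatedly find the list whose head expires soonest,
 * pop that head, and invoke the callback on it.
 */
static void
run_expired(struct tim *run_heads[], int nrunlists,
            void (*run)(struct tim *))
{
        for (;;) {
                uint64_t min_expire = UINT64_MAX;
                int i, min_idx = -1;

                for (i = 0; i < nrunlists; i++) {
                        if (run_heads[i] != NULL &&
                            run_heads[i]->expire < min_expire) {
                                min_expire = run_heads[i]->expire;
                                min_idx = i;
                        }
                }

                if (min_idx < 0)
                        break;  /* all run lists drained */

                struct tim *t = run_heads[min_idx];
                run_heads[min_idx] = t->next;
                run(t);
        }
}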
* Re: [dpdk-dev] [PATCH v4 0/3] timer library enhancement 2017-10-11 20:42 ` [dpdk-dev] [PATCH v4 0/3] timer library enhancement Thomas Monjalon @ 2018-04-04 22:42 ` Thomas Monjalon 2018-04-05 21:53 ` Carrillo, Erik G 0 siblings, 1 reply; 30+ messages in thread From: Thomas Monjalon @ 2018-04-04 22:42 UTC (permalink / raw) To: Erik Gabriel Carrillo; +Cc: dev, rsanford, john.mcnamara Let's revive these patches. Erik, could you rebase them with experimental tag, please? Someone for a review? 11/10/2017 22:42, Thomas Monjalon: > This patchset is waiting for review. > > Robert, are you available? > > 19/09/2017 19:02, Erik Gabriel Carrillo: > > In the current implementation of the DPDK timer library, timers can be > > created and set to be handled by a target lcore by adding it to a > > skiplist that corresponds to that lcore. However, if an application > > enables multiple lcores, and each of these lcores repeatedly attempts > > to install timers on the same target lcore, overall application > > throughput will be reduced as all lcores contend to acquire the lock > > guarding the single skiplist of pending timers. > > > > This patchset addresses this scenario by adding an option to enable an > > array of skiplists in each lcore's priv_timer struct. Then, when lcore i > > installs a timer on lcore k, the timer will be added to the ith skiplist > > for lcore k. If lcore j installs a timer on lcore k simultaneously, > > lcores i and j can both proceed since they will be acquiring different > > locks for different lists. This functionality is off by default, and > > can be enabled via a new function. > > > > When lcore k processes its pending timers, if the multiple pending list > > option is enabled, it will traverse skiplists in its array and acquire > > the current skiplist's lock while a run list is broken out; meanwhile, > > all other lists can continue to be modified. Then, all run lists for > > lcore k are collected and traversed together so timers are executed in > > their relative order. If the multiple pending list option is not enabled > > (the default), only a single list will be traversed, as before. > > > > Erik Gabriel Carrillo (3): > > timer: add multiple pending lists option for each lcore > > timer: handle timers installed from non-EAL threads > > doc: update timer lib docs > > > ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [dpdk-dev] [PATCH v4 0/3] timer library enhancement 2018-04-04 22:42 ` Thomas Monjalon @ 2018-04-05 21:53 ` Carrillo, Erik G 0 siblings, 0 replies; 30+ messages in thread From: Carrillo, Erik G @ 2018-04-05 21:53 UTC (permalink / raw) To: Thomas Monjalon; +Cc: dev, rsanford, Mcnamara, John Hi Thomas, OK, I'll take a fresh look at these. Thanks, Erik > -----Original Message----- > From: Thomas Monjalon [mailto:thomas@monjalon.net] > Sent: Wednesday, April 4, 2018 5:43 PM > To: Carrillo, Erik G <erik.g.carrillo@intel.com> > Cc: dev@dpdk.org; rsanford@akamai.com; Mcnamara, John > <john.mcnamara@intel.com> > Subject: Re: [dpdk-dev] [PATCH v4 0/3] timer library enhancement > > Let's revive these patches. > Erik, could you rebase them with experimental tag, please? > > Someone for a review? > > > 11/10/2017 22:42, Thomas Monjalon: > > This patchset is waiting for review. > > > > Robert, are you available? > > > > 19/09/2017 19:02, Erik Gabriel Carrillo: > > > In the current implementation of the DPDK timer library, timers can > > > be created and set to be handled by a target lcore by adding it to a > > > skiplist that corresponds to that lcore. However, if an application > > > enables multiple lcores, and each of these lcores repeatedly > > > attempts to install timers on the same target lcore, overall > > > application throughput will be reduced as all lcores contend to > > > acquire the lock guarding the single skiplist of pending timers. > > > > > > This patchset addresses this scenario by adding an option to enable > > > an array of skiplists in each lcore's priv_timer struct. Then, when > > > lcore i installs a timer on lcore k, the timer will be added to the > > > ith skiplist for lcore k. If lcore j installs a timer on lcore k > > > simultaneously, lcores i and j can both proceed since they will be > > > acquiring different locks for different lists. This functionality is > > > off by default, and can be enabled via a new function. > > > > > > When lcore k processes its pending timers, if the multiple pending > > > list option is enabled, it will traverse skiplists in its array and > > > acquire the current skiplist's lock while a run list is broken out; > > > meanwhile, all other lists can continue to be modified. Then, all > > > run lists for lcore k are collected and traversed together so timers > > > are executed in their relative order. If the multiple pending list > > > option is not enabled (the default), only a single list will be traversed, as > before. > > > > > > Erik Gabriel Carrillo (3): > > > timer: add multiple pending lists option for each lcore > > > timer: handle timers installed from non-EAL threads > > > doc: update timer lib docs > > > > > > > > > > ^ permalink raw reply [flat|nested] 30+ messages in thread