* [dpdk-dev] [PATCH v2] sched: fix overflow errors in WRR weighting code @ 2017-11-30 9:04 alangordondewar 2017-11-30 10:47 ` Luca Boccassi 2018-01-02 16:13 ` Dumitrescu, Cristian 0 siblings, 2 replies; 4+ messages in thread From: alangordondewar @ 2017-11-30 9:04 UTC (permalink / raw) To: cristian.dumitrescu; +Cc: dev, Alan Dewar From: Alan Dewar <alan.dewar@att.com> Revised patch - this version fixes an issue when a small wrr_cost is shifted so far right that its value becomes zero. The WRR code calculates the lowest common denominator between the four WRR weights as a uint32_t value and divides the LCD by each of the WRR weights and casts the results as a uint8_t. This casting can cause the ratios of the computed wrr costs to be wrong. For example with WRR weights of 3, 5, 7 and 11, the LCD is computed to be 1155. The WRR costs get computed as: 1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105. When the value 385 is cast into an uint8_t it ends up as 129. Rather than casting straight into a uint8_t, this patch shifts the computed WRR costs right so that the largest value is only eight bits wide. In grinder_schedule, the packet length is multiplied by the WRR cost and added to the grinder's wrr_tokens value. The grinder's wrr_tokens field is a uint16_t, so a combination of a packet length of 1500 bytes and a wrr cost of 44 will overflow this field on the first packet. This patch increases the width of the grinder's wrr_tokens and wrr_mask fields from uint16_t to uint32_t. In grinder_wrr_store, the remaining tokens in the grinder's wrr_tokens array are copied to the appropriate pipe's wrr_tokens array. However the pipe's wrr_tokens array is only a uint8_t array so unused tokens were quite frequently lost which upsets the balance of traffic across the four WRR queues. This patch increases the width of the pipe's wrr_tokens array from a uint8_t to uint32_t. 
Signed-off-by: Alan Dewar <alan.dewar@att.com> --- v2 - fixed bug in the wrr_cost calculation code that could result in a zero wrr_cost lib/librte_sched/rte_sched.c | 59 +++++++++++++++++++++++++++++-------- lib/librte_sched/rte_sched_common.h | 15 ++++++++++ 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c index 7252f85..324743d 100644 --- a/lib/librte_sched/rte_sched.c +++ b/lib/librte_sched/rte_sched.c @@ -130,7 +130,7 @@ struct rte_sched_pipe { uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; /* Weighted Round Robin (WRR) */ - uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; + uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; /* TC oversubscription */ uint32_t tc_ov_credits; @@ -205,8 +205,8 @@ struct rte_sched_grinder { struct rte_mbuf *pkt; /* WRR */ - uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; - uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; + uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; + uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; }; @@ -542,6 +542,17 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms, uint32_t rate) return time; } +static uint32_t rte_sched_reduce_to_byte(uint32_t value) +{ + uint32_t shift = 0; + + while (value & 0xFFFFFF00) { + value >>= 1; + shift++; + } + return shift; +} + static void rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, struct rte_sched_port_params *params) { @@ -583,6 +594,8 @@ rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, struct rte uint32_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; uint32_t lcd, lcd1, lcd2; uint32_t qindex; + uint32_t low_pos; + uint32_t shift; qindex = j * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS; @@ -594,12 +607,28 @@ rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, struct rte lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]); lcd2 = rte_get_lcd(wrr_cost[2], 
wrr_cost[3]); lcd = rte_get_lcd(lcd1, lcd2); + low_pos = rte_min_pos_4_u32(wrr_cost); wrr_cost[0] = lcd / wrr_cost[0]; wrr_cost[1] = lcd / wrr_cost[1]; wrr_cost[2] = lcd / wrr_cost[2]; wrr_cost[3] = lcd / wrr_cost[3]; + shift = rte_sched_reduce_to_byte(wrr_cost[low_pos]); + wrr_cost[0] >>= shift; + wrr_cost[1] >>= shift; + wrr_cost[2] >>= shift; + wrr_cost[3] >>= shift; + + if (wrr_cost[0] == 0) + wrr_cost[0]++; + if (wrr_cost[1] == 0) + wrr_cost[1]++; + if (wrr_cost[2] == 0) + wrr_cost[2]++; + if (wrr_cost[3] == 0) + wrr_cost[3]++; + dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0]; dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1]; dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2]; @@ -1941,15 +1970,19 @@ grinder_wrr_load(struct rte_sched_port *port, uint32_t pos) qindex = tc_index * 4; - grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) << RTE_SCHED_WRR_SHIFT; - grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1]) << RTE_SCHED_WRR_SHIFT; - grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2]) << RTE_SCHED_WRR_SHIFT; - grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3]) << RTE_SCHED_WRR_SHIFT; + grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] << + RTE_SCHED_WRR_SHIFT; + grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] << + RTE_SCHED_WRR_SHIFT; + grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] << + RTE_SCHED_WRR_SHIFT; + grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] << + RTE_SCHED_WRR_SHIFT; - grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF; - grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF; - grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF; - grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF; + grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF; + grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF; + grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF; + grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF; grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex]; 
grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1]; @@ -1981,14 +2014,14 @@ static inline void grinder_wrr(struct rte_sched_port *port, uint32_t pos) { struct rte_sched_grinder *grinder = port->grinder + pos; - uint16_t wrr_tokens_min; + uint32_t wrr_tokens_min; grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0]; grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1]; grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2]; grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3]; - grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens); + grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens); wrr_tokens_min = grinder->wrr_tokens[grinder->qpos]; grinder->wrr_tokens[0] -= wrr_tokens_min; diff --git a/lib/librte_sched/rte_sched_common.h b/lib/librte_sched/rte_sched_common.h index aed144b..a3c6bc2 100644 --- a/lib/librte_sched/rte_sched_common.h +++ b/lib/librte_sched/rte_sched_common.h @@ -77,6 +77,21 @@ rte_min_pos_4_u16(uint16_t *x) return pos0; } +static inline uint32_t +rte_min_pos_4_u32(uint32_t *x) +{ + uint32_t pos0 = 0; + uint32_t pos1 = 2; + + if (x[1] <= x[0]) + pos0 = 1; + if (x[3] <= x[2]) + pos1 = 3; + if (x[pos1] <= x[pos0]) + pos0 = pos1; + + return pos0; +} #endif /* -- 2.1.4 ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [dpdk-dev] [PATCH v2] sched: fix overflow errors in WRR weighting code 2017-11-30 9:04 [dpdk-dev] [PATCH v2] sched: fix overflow errors in WRR weighting code alangordondewar @ 2017-11-30 10:47 ` Luca Boccassi 2018-01-02 16:13 ` Dumitrescu, Cristian 1 sibling, 0 replies; 4+ messages in thread From: Luca Boccassi @ 2017-11-30 10:47 UTC (permalink / raw) To: alangordondewar, cristian.dumitrescu; +Cc: dev, Alan Dewar On Thu, 2017-11-30 at 09:04 +0000, alangordondewar@gmail.com wrote: > From: Alan Dewar <alan.dewar@att.com> > > Revised patch - this version fixes an issue when a small wrr_cost is > shifted so far right that its value becomes zero. > > The WRR code calculates the lowest common denominator between the > four > WRR weights as a uint32_t value and divides the LCD by each of the > WRR > weights and casts the results as a uint8_t. This casting can cause > the ratios of the computed wrr costs to be wrong. For example with > WRR weights of 3, 5, 7 and 11, the LCD is computed to be > 1155. The WRR costs get computed as: > > 1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105. > > When the value 385 is cast into an uint8_t it ends up as 129. > Rather than casting straight into a uint8_t, this patch shifts the > computed WRR costs right so that the largest value is only eight bits > wide. > > In grinder_schedule, the packet length is multiplied by the WRR cost > and added to the grinder's wrr_tokens value. The grinder's > wrr_tokens > field is a uint16_t, so combination of a packet length of 1500 bytes > and a wrr cost of 44 will overflow this field on the first packet. > > This patch increases the width of the grinder's wrr_tokens and > wrr_mask fields from uint16_t to uint32_t. > > In grinder_wrr_store, the remaining tokens in the grinder's > wrr_tokens > array are copied to the appropriate pipe's wrr_tokens array. 
However > the pipe's wrr_tokens array is only a uint8_t array so unused tokens > were quite frequently lost which upsets the balance of traffic across > the four WRR queues. > > This patch increases the width of the pipe's wrr_tokens array from > a uint8_t to uint32_t. > > Signed-off-by: Alan Dewar <alan.dewar@att.com> > --- > v2 - fixed bug in the wrr_cost calculation code that could result > in a zero wrr_cost > > lib/librte_sched/rte_sched.c | 59 > +++++++++++++++++++++++++++++-------- > lib/librte_sched/rte_sched_common.h | 15 ++++++++++ > 2 files changed, 61 insertions(+), 13 deletions(-) > > diff --git a/lib/librte_sched/rte_sched.c > b/lib/librte_sched/rte_sched.c > index 7252f85..324743d 100644 > --- a/lib/librte_sched/rte_sched.c > +++ b/lib/librte_sched/rte_sched.c > @@ -130,7 +130,7 @@ struct rte_sched_pipe { > uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; > > /* Weighted Round Robin (WRR) */ > - uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; > + uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; > > /* TC oversubscription */ > uint32_t tc_ov_credits; > @@ -205,8 +205,8 @@ struct rte_sched_grinder { > struct rte_mbuf *pkt; > > /* WRR */ > - uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > - uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > + uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > + uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > }; > > @@ -542,6 +542,17 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms, > uint32_t rate) > return time; > } > > +static uint32_t rte_sched_reduce_to_byte(uint32_t value) > +{ > + uint32_t shift = 0; > + > + while (value & 0xFFFFFF00) { > + value >>= 1; > + shift++; > + } > + return shift; > +} > + > static void > rte_sched_port_config_pipe_profile_table(struct rte_sched_port > *port, struct rte_sched_port_params *params) > { > @@ -583,6 +594,8 @@ rte_sched_port_config_pipe_profile_table(struct > rte_sched_port *port, 
struct rte > uint32_t > wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > uint32_t lcd, lcd1, lcd2; > uint32_t qindex; > + uint32_t low_pos; > + uint32_t shift; > > qindex = j * > RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS; > > @@ -594,12 +607,28 @@ rte_sched_port_config_pipe_profile_table(struct > rte_sched_port *port, struct rte > lcd1 = rte_get_lcd(wrr_cost[0], > wrr_cost[1]); > lcd2 = rte_get_lcd(wrr_cost[2], > wrr_cost[3]); > lcd = rte_get_lcd(lcd1, lcd2); > + low_pos = rte_min_pos_4_u32(wrr_cost); > > wrr_cost[0] = lcd / wrr_cost[0]; > wrr_cost[1] = lcd / wrr_cost[1]; > wrr_cost[2] = lcd / wrr_cost[2]; > wrr_cost[3] = lcd / wrr_cost[3]; > > + shift = > rte_sched_reduce_to_byte(wrr_cost[low_pos]); > + wrr_cost[0] >>= shift; > + wrr_cost[1] >>= shift; > + wrr_cost[2] >>= shift; > + wrr_cost[3] >>= shift; > + > + if (wrr_cost[0] == 0) > + wrr_cost[0]++; > + if (wrr_cost[1] == 0) > + wrr_cost[1]++; > + if (wrr_cost[2] == 0) > + wrr_cost[2]++; > + if (wrr_cost[3] == 0) > + wrr_cost[3]++; > + > dst->wrr_cost[qindex] = (uint8_t) > wrr_cost[0]; > dst->wrr_cost[qindex + 1] = (uint8_t) > wrr_cost[1]; > dst->wrr_cost[qindex + 2] = (uint8_t) > wrr_cost[2]; > @@ -1941,15 +1970,19 @@ grinder_wrr_load(struct rte_sched_port *port, > uint32_t pos) > > qindex = tc_index * 4; > > - grinder->wrr_tokens[0] = ((uint16_t) pipe- > >wrr_tokens[qindex]) << RTE_SCHED_WRR_SHIFT; > - grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex > + 1]) << RTE_SCHED_WRR_SHIFT; > - grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex > + 2]) << RTE_SCHED_WRR_SHIFT; > - grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex > + 3]) << RTE_SCHED_WRR_SHIFT; > + grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] << > + RTE_SCHED_WRR_SHIFT; > + grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] << > + RTE_SCHED_WRR_SHIFT; > + grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] << > + RTE_SCHED_WRR_SHIFT; > + grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] << > + 
RTE_SCHED_WRR_SHIFT; > > - grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF; > - grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF; > - grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF; > - grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF; > + grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF; > + grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF; > + grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF; > + grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF; > > grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex]; > grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1]; > @@ -1981,14 +2014,14 @@ static inline void > grinder_wrr(struct rte_sched_port *port, uint32_t pos) > { > struct rte_sched_grinder *grinder = port->grinder + pos; > - uint16_t wrr_tokens_min; > + uint32_t wrr_tokens_min; > > grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0]; > grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1]; > grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2]; > grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3]; > > - grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens); > + grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens); > wrr_tokens_min = grinder->wrr_tokens[grinder->qpos]; > > grinder->wrr_tokens[0] -= wrr_tokens_min; > diff --git a/lib/librte_sched/rte_sched_common.h > b/lib/librte_sched/rte_sched_common.h > index aed144b..a3c6bc2 100644 > --- a/lib/librte_sched/rte_sched_common.h > +++ b/lib/librte_sched/rte_sched_common.h > @@ -77,6 +77,21 @@ rte_min_pos_4_u16(uint16_t *x) > return pos0; > } > > +static inline uint32_t > +rte_min_pos_4_u32(uint32_t *x) > +{ > + uint32_t pos0 = 0; > + uint32_t pos1 = 2; > + > + if (x[1] <= x[0]) > + pos0 = 1; > + if (x[3] <= x[2]) > + pos1 = 3; > + if (x[pos1] <= x[pos0]) > + pos0 = pos1; > + > + return pos0; > +} > #endif > > /* Reviewed-by: Luca Boccassi <bluca@debian.org> LGTM -- Kind regards, Luca Boccassi ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [dpdk-dev] [PATCH v2] sched: fix overflow errors in WRR weighting code 2017-11-30 9:04 [dpdk-dev] [PATCH v2] sched: fix overflow errors in WRR weighting code alangordondewar 2017-11-30 10:47 ` Luca Boccassi @ 2018-01-02 16:13 ` Dumitrescu, Cristian 2018-01-03 13:45 ` Dewar, Alan 1 sibling, 1 reply; 4+ messages in thread From: Dumitrescu, Cristian @ 2018-01-02 16:13 UTC (permalink / raw) To: alangordondewar; +Cc: dev, Alan Dewar Hi Alan, NAK for now. There is a good reason for truncating the WRR cost to 8-bit value, which is keeping the size of the rte_sched_pipe structure to single cache line (64 bytes). This is done for performance reasons at the expense of some accuracy loss for the scheduling of the 4x queues per traffic class. Is there a way to make the improvement while working with 8-bit WRR cost values in the pipe structure? > -----Original Message----- > From: alangordondewar@gmail.com [mailto:alangordondewar@gmail.com] > Sent: Thursday, November 30, 2017 9:05 AM > To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com> > Cc: dev@dpdk.org; Alan Dewar <alan.dewar@att.com> > Subject: [PATCH v2] sched: fix overflow errors in WRR weighting code > > From: Alan Dewar <alan.dewar@att.com> > > Revised patch - this version fixes an issue when a small wrr_cost is > shifted so far right that its value becomes zero. > > The WRR code calculates the lowest common denominator between the > four > WRR weights as a uint32_t value and divides the LCD by each of the WRR > weights and casts the results as a uint8_t. This casting can cause > the ratios of the computed wrr costs to be wrong. For example with > WRR weights of 3, 5, 7 and 11, the LCD is computed to be > 1155. The WRR costs get computed as: Picking prime numbers for the weights is generally a bad idea. If you would pick e.g. 4, 6, 8 and 12 rather than 3, 5, 7 and 11 you would avoid any issues due to the 8-bit truncation. > > 1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105. 
> > When the value 385 is cast into an uint8_t it ends up as 129. > Rather than casting straight into a uint8_t, this patch shifts the > computed WRR costs right so that the largest value is only eight bits > wide. > > In grinder_schedule, the packet length is multiplied by the WRR cost > and added to the grinder's wrr_tokens value. The grinder's wrr_tokens > field is a uint16_t, so combination of a packet length of 1500 bytes > and a wrr cost of 44 will overflow this field on the first packet. > > This patch increases the width of the grinder's wrr_tokens and > wrr_mask fields from uint16_t to uint32_t. > Increasing the size of the grinder fields is OK, but I am not sure whether it is really helpful, as the values saved in the pipe structure are 8-bit. > In grinder_wrr_store, the remaining tokens in the grinder's wrr_tokens > array are copied to the appropriate pipe's wrr_tokens array. However > the pipe's wrr_tokens array is only a uint8_t array so unused tokens > were quite frequently lost which upsets the balance of traffic across > the four WRR queues. > > This patch increases the width of the pipe's wrr_tokens array from > a uint8_t to uint32_t. This is not allowed for performance reasons, as having 16-bit or 32-bit WRR cost values in pipe structure would increase the size of the pipe structure from one cache line to two cache lines. 
> > Signed-off-by: Alan Dewar <alan.dewar@att.com> > --- > v2 - fixed bug in the wrr_cost calculation code that could result > in a zero wrr_cost > > lib/librte_sched/rte_sched.c | 59 +++++++++++++++++++++++++++++--- > ----- > lib/librte_sched/rte_sched_common.h | 15 ++++++++++ > 2 files changed, 61 insertions(+), 13 deletions(-) > > diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c > index 7252f85..324743d 100644 > --- a/lib/librte_sched/rte_sched.c > +++ b/lib/librte_sched/rte_sched.c > @@ -130,7 +130,7 @@ struct rte_sched_pipe { > uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; > > /* Weighted Round Robin (WRR) */ > - uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; > + uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; > > /* TC oversubscription */ > uint32_t tc_ov_credits; > @@ -205,8 +205,8 @@ struct rte_sched_grinder { > struct rte_mbuf *pkt; > > /* WRR */ > - uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > - uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > + uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > + uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > }; > > @@ -542,6 +542,17 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms, > uint32_t rate) > return time; > } > > +static uint32_t rte_sched_reduce_to_byte(uint32_t value) > +{ > + uint32_t shift = 0; > + > + while (value & 0xFFFFFF00) { > + value >>= 1; > + shift++; > + } > + return shift; > +} > + > static void > rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, > struct rte_sched_port_params *params) > { > @@ -583,6 +594,8 @@ rte_sched_port_config_pipe_profile_table(struct > rte_sched_port *port, struct rte > uint32_t > wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > uint32_t lcd, lcd1, lcd2; > uint32_t qindex; > + uint32_t low_pos; > + uint32_t shift; > > qindex = j * > RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS; > > @@ -594,12 +607,28 @@ 
rte_sched_port_config_pipe_profile_table(struct > rte_sched_port *port, struct rte > lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]); > lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]); > lcd = rte_get_lcd(lcd1, lcd2); > + low_pos = rte_min_pos_4_u32(wrr_cost); > > wrr_cost[0] = lcd / wrr_cost[0]; > wrr_cost[1] = lcd / wrr_cost[1]; > wrr_cost[2] = lcd / wrr_cost[2]; > wrr_cost[3] = lcd / wrr_cost[3]; > > + shift = > rte_sched_reduce_to_byte(wrr_cost[low_pos]); > + wrr_cost[0] >>= shift; > + wrr_cost[1] >>= shift; > + wrr_cost[2] >>= shift; > + wrr_cost[3] >>= shift; > + > + if (wrr_cost[0] == 0) > + wrr_cost[0]++; > + if (wrr_cost[1] == 0) > + wrr_cost[1]++; > + if (wrr_cost[2] == 0) > + wrr_cost[2]++; > + if (wrr_cost[3] == 0) > + wrr_cost[3]++; > + > dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0]; > dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1]; > dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2]; > @@ -1941,15 +1970,19 @@ grinder_wrr_load(struct rte_sched_port *port, > uint32_t pos) > > qindex = tc_index * 4; > > - grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) << > RTE_SCHED_WRR_SHIFT; > - grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1]) > << RTE_SCHED_WRR_SHIFT; > - grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2]) > << RTE_SCHED_WRR_SHIFT; > - grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3]) > << RTE_SCHED_WRR_SHIFT; > + grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] << > + RTE_SCHED_WRR_SHIFT; > + grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] << > + RTE_SCHED_WRR_SHIFT; > + grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] << > + RTE_SCHED_WRR_SHIFT; > + grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] << > + RTE_SCHED_WRR_SHIFT; > > - grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF; > - grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF; > - grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF; > - grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF; > + 
grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF; > + grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF; > + grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF; > + grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF; > > grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex]; > grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1]; > @@ -1981,14 +2014,14 @@ static inline void > grinder_wrr(struct rte_sched_port *port, uint32_t pos) > { > struct rte_sched_grinder *grinder = port->grinder + pos; > - uint16_t wrr_tokens_min; > + uint32_t wrr_tokens_min; > > grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0]; > grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1]; > grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2]; > grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3]; > > - grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens); > + grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens); > wrr_tokens_min = grinder->wrr_tokens[grinder->qpos]; > > grinder->wrr_tokens[0] -= wrr_tokens_min; > diff --git a/lib/librte_sched/rte_sched_common.h > b/lib/librte_sched/rte_sched_common.h > index aed144b..a3c6bc2 100644 > --- a/lib/librte_sched/rte_sched_common.h > +++ b/lib/librte_sched/rte_sched_common.h > @@ -77,6 +77,21 @@ rte_min_pos_4_u16(uint16_t *x) > return pos0; > } > > +static inline uint32_t > +rte_min_pos_4_u32(uint32_t *x) > +{ > + uint32_t pos0 = 0; > + uint32_t pos1 = 2; > + > + if (x[1] <= x[0]) > + pos0 = 1; > + if (x[3] <= x[2]) > + pos1 = 3; > + if (x[pos1] <= x[pos0]) > + pos0 = pos1; > + > + return pos0; > +} > #endif > > /* > -- > 2.1.4 Regards, Cristian ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [dpdk-dev] [PATCH v2] sched: fix overflow errors in WRR weighting code 2018-01-02 16:13 ` Dumitrescu, Cristian @ 2018-01-03 13:45 ` Dewar, Alan 0 siblings, 0 replies; 4+ messages in thread From: Dewar, Alan @ 2018-01-03 13:45 UTC (permalink / raw) To: 'Dumitrescu, Cristian', 'alangordondewar@gmail.com' Cc: 'dev@dpdk.org', 'Alan Dewar' Hi Cristian, Responses in-line below... > -----Original Message----- > From: Dumitrescu, Cristian [mailto:cristian.dumitrescu@intel.com] > Sent: Tuesday, January 02, 2018 4:14 PM > To: alangordondewar@gmail.com > Cc: dev@dpdk.org; Alan Dewar <alan.dewar@att.com> > Subject: RE: [PATCH v2] sched: fix overflow errors in WRR weighting code > > Hi Alan, > > NAK for now. > > There is a good reason for truncating the WRR cost to 8-bit value, which is keeping the size of the rte_sched_pipe structure to single cache line (64 bytes). This is done for performance reasons at the expense of some accuracy loss for the scheduling of the 4x queues per traffic class. > Ah, I wasn't aware of this restriction, but I fully understand why you don't want to increase the size of the rte_sched_pipe structure. > Is there a way to make the improvement while working with 8-bit WRR cost values in the pipe structure? Off the top of my head, I think that we might be able to improve the behavior of WRR queues a little, but not as well as we could with 32-bit wrr_cost array in the rte_sched_pipe structure. I'll need to play around with it and see what happens. 
> > > -----Original Message----- > > From: alangordondewar@gmail.com [mailto:alangordondewar@gmail.com] > > Sent: Thursday, November 30, 2017 9:05 AM > > To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com> > > Cc: dev@dpdk.org; Alan Dewar <alan.dewar@att.com> > > Subject: [PATCH v2] sched: fix overflow errors in WRR weighting code > > > > From: Alan Dewar <alan.dewar@att.com> > > > > Revised patch - this version fixes an issue when a small wrr_cost is > > shifted so far right that its value becomes zero. > > > > The WRR code calculates the lowest common denominator between the four > > WRR weights as a uint32_t value and divides the LCD by each of the WRR > > weights and casts the results as a uint8_t. This casting can cause > > the ratios of the computed wrr costs to be wrong. For example with > > WRR weights of 3, 5, 7 and 11, the LCD is computed to be 1155. The > > WRR costs get computed as: > > Picking prime numbers for the weights is generally a bad idea. If you would pick e.g. 4, 6, 8 and 12 rather than 3, 5, 7 and 11 you would avoid any issues due to the 8-bit truncation. > I selected 3, 5, 7 and 11 as a pathological case to illustrate part of the problem; I think that it is unlikely that customers will actually use these numbers. However the DPDK doesn't impose any restrictions on what values wrr_weights can have; any values between 1 and 255 are allowed. In the product where we are using the DPDK as part of a software dataplane, we currently restricted wrr_weights to values between 1 and 100. However we have no control over the values that our customers select within that range. To impose any further restriction would not be backwards compatible with previous versions of our software. This non-backwards compatibility is not acceptable to customers when they upgrade the software on their routers with the newer version, and their previously accepted configuration is rejected by the new version of the software. 
> > > > 1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105. > > > > When the value 385 is cast into an uint8_t it ends up as 129. > > Rather than casting straight into a uint8_t, this patch shifts the > > computed WRR costs right so that the largest value is only eight bits > > wide. > > > > In grinder_schedule, the packet length is multiplied by the WRR cost > > and added to the grinder's wrr_tokens value. The grinder's wrr_tokens > > field is a uint16_t, so combination of a packet length of 1500 bytes > > and a wrr cost of 44 will overflow this field on the first packet. > > > > This patch increases the width of the grinder's wrr_tokens and > > wrr_mask fields from uint16_t to uint32_t. > > > > Increasing the size of the grinder fields is OK, but I am not sure whether it is really helpful, as the values saved in the pipe structure are 8-bit. > I will investigate this further, but I don't think that we will see much benefit. > > > In grinder_wrr_store, the remaining tokens in the grinder's wrr_tokens > > array are copied to the appropriate pipe's wrr_tokens array. However > > the pipe's wrr_tokens array is only a uint8_t array so unused tokens > > were quite frequently lost which upsets the balance of traffic across > > the four WRR queues. > > > > This patch increases the width of the pipe's wrr_tokens array from a > > uint8_t to uint32_t. > > This is not allowed for performance reasons, as having 16-bit or 32-bit WRR cost values in pipe structure would increase the size of the pipe structure from one cache line to two cache lines. Understood, it might be interesting to try to determine what the performance hit is. 
Regards Alan > > > > > Signed-off-by: Alan Dewar <alan.dewar@att.com> > > --- > > v2 - fixed bug in the wrr_cost calculation code that could result in a > > zero wrr_cost > > > > lib/librte_sched/rte_sched.c | 59 +++++++++++++++++++++++++++++--- > > ----- > > lib/librte_sched/rte_sched_common.h | 15 ++++++++++ > > 2 files changed, 61 insertions(+), 13 deletions(-) > > > > diff --git a/lib/librte_sched/rte_sched.c > > b/lib/librte_sched/rte_sched.c index 7252f85..324743d 100644 > > --- a/lib/librte_sched/rte_sched.c > > +++ b/lib/librte_sched/rte_sched.c > > @@ -130,7 +130,7 @@ struct rte_sched_pipe { > > uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; > > > > /* Weighted Round Robin (WRR) */ > > - uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; > > + uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; > > > > /* TC oversubscription */ > > uint32_t tc_ov_credits; > > @@ -205,8 +205,8 @@ struct rte_sched_grinder { > > struct rte_mbuf *pkt; > > > > /* WRR */ > > - uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > - uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > + uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > + uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > }; > > > > @@ -542,6 +542,17 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms, > > uint32_t rate) > > return time; > > } > > > > +static uint32_t rte_sched_reduce_to_byte(uint32_t value) { > > + uint32_t shift = 0; > > + > > + while (value & 0xFFFFFF00) { > > + value >>= 1; > > + shift++; > > + } > > + return shift; > > +} > > + > > static void > > rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, > > struct rte_sched_port_params *params) { @@ -583,6 +594,8 @@ > > rte_sched_port_config_pipe_profile_table(struct > > rte_sched_port *port, struct rte > > uint32_t > > wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > uint32_t lcd, lcd1, lcd2; > > uint32_t qindex; > > + uint32_t low_pos; > > + 
uint32_t shift; > > > > qindex = j * > > RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS; > > > > @@ -594,12 +607,28 @@ rte_sched_port_config_pipe_profile_table(struct > > rte_sched_port *port, struct rte > > lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]); > > lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]); > > lcd = rte_get_lcd(lcd1, lcd2); > > + low_pos = rte_min_pos_4_u32(wrr_cost); > > > > wrr_cost[0] = lcd / wrr_cost[0]; > > wrr_cost[1] = lcd / wrr_cost[1]; > > wrr_cost[2] = lcd / wrr_cost[2]; > > wrr_cost[3] = lcd / wrr_cost[3]; > > > > + shift = > > rte_sched_reduce_to_byte(wrr_cost[low_pos]); > > + wrr_cost[0] >>= shift; > > + wrr_cost[1] >>= shift; > > + wrr_cost[2] >>= shift; > > + wrr_cost[3] >>= shift; > > + > > + if (wrr_cost[0] == 0) > > + wrr_cost[0]++; > > + if (wrr_cost[1] == 0) > > + wrr_cost[1]++; > > + if (wrr_cost[2] == 0) > > + wrr_cost[2]++; > > + if (wrr_cost[3] == 0) > > + wrr_cost[3]++; > > + > > dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0]; > > dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1]; > > dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2]; @@ -1941,15 > > +1970,19 @@ grinder_wrr_load(struct rte_sched_port *port, uint32_t > > pos) > > > > qindex = tc_index * 4; > > > > - grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) << > > RTE_SCHED_WRR_SHIFT; > > - grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1]) > > << RTE_SCHED_WRR_SHIFT; > > - grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2]) > > << RTE_SCHED_WRR_SHIFT; > > - grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3]) > > << RTE_SCHED_WRR_SHIFT; > > + grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] << > > + RTE_SCHED_WRR_SHIFT; > > + grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] << > > + RTE_SCHED_WRR_SHIFT; > > + grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] << > > + RTE_SCHED_WRR_SHIFT; > > + grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] << > > + RTE_SCHED_WRR_SHIFT; > > > > - grinder->wrr_mask[0] = (qmask 
& 0x1) * 0xFFFF; > > - grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF; > > - grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF; > > - grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF; > > + grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF; > > + grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF; > > + grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF; > > + grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF; > > > > grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex]; > > grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1]; @@ > > -1981,14 +2014,14 @@ static inline void grinder_wrr(struct > > rte_sched_port *port, uint32_t pos) { > > struct rte_sched_grinder *grinder = port->grinder + pos; > > - uint16_t wrr_tokens_min; > > + uint32_t wrr_tokens_min; > > > > grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0]; > > grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1]; > > grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2]; > > grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3]; > > > > - grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens); > > + grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens); > > wrr_tokens_min = grinder->wrr_tokens[grinder->qpos]; > > > > grinder->wrr_tokens[0] -= wrr_tokens_min; diff --git > > a/lib/librte_sched/rte_sched_common.h > > b/lib/librte_sched/rte_sched_common.h > > index aed144b..a3c6bc2 100644 > > --- a/lib/librte_sched/rte_sched_common.h > > +++ b/lib/librte_sched/rte_sched_common.h > > @@ -77,6 +77,21 @@ rte_min_pos_4_u16(uint16_t *x) > > return pos0; > > } > > > > +static inline uint32_t > > +rte_min_pos_4_u32(uint32_t *x) > > +{ > > + uint32_t pos0 = 0; > > + uint32_t pos1 = 2; > > + > > + if (x[1] <= x[0]) > > + pos0 = 1; > > + if (x[3] <= x[2]) > > + pos1 = 3; > > + if (x[pos1] <= x[pos0]) > > + pos0 = pos1; > > + > > + return pos0; > > +} > > #endif > > > > /* > > -- > > 2.1.4 > > Regards, > Cristian ^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2018-01-03 13:46 UTC | newest] Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2017-11-30 9:04 [dpdk-dev] [PATCH v2] sched: fix overflow errors in WRR weighting code alangordondewar 2017-11-30 10:47 ` Luca Boccassi 2018-01-02 16:13 ` Dumitrescu, Cristian 2018-01-03 13:45 ` Dewar, Alan
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).