* [dpdk-dev] [PATCH] lib/distributor: fix deadlock issue for aarch64
@ 2019-10-08 9:55 Ruifeng Wang
2019-10-08 12:53 ` Hunt, David
` (3 more replies)
0 siblings, 4 replies; 23+ messages in thread
From: Ruifeng Wang @ 2019-10-08 9:55 UTC (permalink / raw)
To: david.hunt
Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, Ruifeng Wang, stable
Distributor and worker threads rely on data structs in cache lines
for synchronization. The shared data structs were not protected.
This caused a deadlock issue on weaker memory ordering platforms
such as aarch64.
Fix this issue by adding memory barriers to ensure synchronization
among cores.
Bugzilla ID: 342
Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
Cc: stable@dpdk.org
Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
lib/librte_distributor/rte_distributor.c | 28 ++++++++++------
lib/librte_distributor/rte_distributor_v20.c | 34 +++++++++++++-------
2 files changed, 41 insertions(+), 21 deletions(-)
diff --git a/lib/librte_distributor/rte_distributor.c b/lib/librte_distributor/rte_distributor.c
index 21eb1fb0a..7bf96e224 100644
--- a/lib/librte_distributor/rte_distributor.c
+++ b/lib/librte_distributor/rte_distributor.c
@@ -50,7 +50,8 @@ rte_distributor_request_pkt_v1705(struct rte_distributor *d,
retptr64 = &(buf->retptr64[0]);
/* Spin while handshake bits are set (scheduler clears it) */
- while (unlikely(*retptr64 & RTE_DISTRIB_GET_BUF)) {
+ while (unlikely(__atomic_load_n(retptr64, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)) {
rte_pause();
uint64_t t = rte_rdtsc()+100;
@@ -76,7 +77,8 @@ rte_distributor_request_pkt_v1705(struct rte_distributor *d,
* Finally, set the GET_BUF to signal to distributor that cache
* line is ready for processing
*/
- *retptr64 |= RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(retptr64, *retptr64 | RTE_DISTRIB_GET_BUF,
+ __ATOMIC_RELEASE);
}
BIND_DEFAULT_SYMBOL(rte_distributor_request_pkt, _v1705, 17.05);
MAP_STATIC_SYMBOL(void rte_distributor_request_pkt(struct rte_distributor *d,
@@ -99,7 +101,8 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
}
/* If bit is set, return */
- if (buf->bufptr64[0] & RTE_DISTRIB_GET_BUF)
+ if (__atomic_load_n(&(buf->bufptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)
return -1;
/* since bufptr64 is signed, this should be an arithmetic shift */
@@ -116,6 +119,8 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
* on the next cacheline while we're working.
*/
buf->bufptr64[0] |= RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(buf->bufptr64[0]),
+ buf->bufptr64[0] | RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return count;
}
@@ -183,7 +188,8 @@ rte_distributor_return_pkt_v1705(struct rte_distributor *d,
RTE_DISTRIB_FLAG_BITS) | RTE_DISTRIB_RETURN_BUF;
/* set the GET_BUF but even if we got no returns */
- buf->retptr64[0] |= RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(buf->retptr64[0]),
+ buf->retptr64[0] | RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return 0;
}
@@ -273,7 +279,8 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
unsigned int count = 0;
unsigned int i;
- if (buf->retptr64[0] & RTE_DISTRIB_GET_BUF) {
+ if (__atomic_load_n(&(buf->retptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF) {
for (i = 0; i < RTE_DIST_BURST_SIZE; i++) {
if (buf->retptr64[i] & RTE_DISTRIB_RETURN_BUF) {
oldbuf = ((uintptr_t)(buf->retptr64[i] >>
@@ -287,7 +294,7 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
d->returns.start = ret_start;
d->returns.count = ret_count;
/* Clear for the worker to populate with more returns */
- buf->retptr64[0] = 0;
+ __atomic_store_n(&(buf->retptr64[0]), 0, __ATOMIC_RELEASE);
}
return count;
}
@@ -307,7 +314,8 @@ release(struct rte_distributor *d, unsigned int wkr)
struct rte_distributor_buffer *buf = &(d->bufs[wkr]);
unsigned int i;
- while (!(d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF))
+ while (!(__atomic_load_n(&(d->bufs[wkr].bufptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF))
rte_pause();
handle_returns(d, wkr);
@@ -328,7 +336,8 @@ release(struct rte_distributor *d, unsigned int wkr)
d->backlog[wkr].count = 0;
/* Clear the GET bit */
- buf->bufptr64[0] &= ~RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(buf->bufptr64[0]),
+ buf->bufptr64[0] & ~RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return buf->count;
}
@@ -574,7 +583,8 @@ rte_distributor_clear_returns_v1705(struct rte_distributor *d)
/* throw away returns, so workers can exit */
for (wkr = 0; wkr < d->num_workers; wkr++)
- d->bufs[wkr].retptr64[0] = 0;
+ __atomic_store_n(&(d->bufs[wkr].retptr64[0]), 0,
+ __ATOMIC_RELEASE);
}
BIND_DEFAULT_SYMBOL(rte_distributor_clear_returns, _v1705, 17.05);
MAP_STATIC_SYMBOL(void rte_distributor_clear_returns(struct rte_distributor *d),
diff --git a/lib/librte_distributor/rte_distributor_v20.c b/lib/librte_distributor/rte_distributor_v20.c
index cdc0969a8..3a5810c6d 100644
--- a/lib/librte_distributor/rte_distributor_v20.c
+++ b/lib/librte_distributor/rte_distributor_v20.c
@@ -34,9 +34,10 @@ rte_distributor_request_pkt_v20(struct rte_distributor_v20 *d,
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
| RTE_DISTRIB_GET_BUF;
- while (unlikely(buf->bufptr64 & RTE_DISTRIB_FLAGS_MASK))
+ while (unlikely(__atomic_load_n(&(buf->bufptr64), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_FLAGS_MASK))
rte_pause();
- buf->bufptr64 = req;
+ __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
}
VERSION_SYMBOL(rte_distributor_request_pkt, _v20, 2.0);
@@ -45,7 +46,8 @@ rte_distributor_poll_pkt_v20(struct rte_distributor_v20 *d,
unsigned worker_id)
{
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
- if (buf->bufptr64 & RTE_DISTRIB_GET_BUF)
+ if (__atomic_load_n(&(buf->bufptr64), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)
return NULL;
/* since bufptr64 is signed, this should be an arithmetic shift */
@@ -73,7 +75,7 @@ rte_distributor_return_pkt_v20(struct rte_distributor_v20 *d,
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
| RTE_DISTRIB_RETURN_BUF;
- buf->bufptr64 = req;
+ __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
return 0;
}
VERSION_SYMBOL(rte_distributor_return_pkt, _v20, 2.0);
@@ -117,7 +119,7 @@ handle_worker_shutdown(struct rte_distributor_v20 *d, unsigned int wkr)
{
d->in_flight_tags[wkr] = 0;
d->in_flight_bitmask &= ~(1UL << wkr);
- d->bufs[wkr].bufptr64 = 0;
+ __atomic_store_n(&(d->bufs[wkr].bufptr64), 0, __ATOMIC_RELEASE);
if (unlikely(d->backlog[wkr].count != 0)) {
/* On return of a packet, we need to move the
* queued packets for this core elsewhere.
@@ -165,13 +167,17 @@ process_returns(struct rte_distributor_v20 *d)
const int64_t data = d->bufs[wkr].bufptr64;
uintptr_t oldbuf = 0;
- if (data & RTE_DISTRIB_GET_BUF) {
+ if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF) {
flushed++;
if (d->backlog[wkr].count)
- d->bufs[wkr].bufptr64 =
- backlog_pop(&d->backlog[wkr]);
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ backlog_pop(&d->backlog[wkr]),
+ __ATOMIC_RELEASE);
else {
- d->bufs[wkr].bufptr64 = RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ RTE_DISTRIB_GET_BUF,
+ __ATOMIC_RELEASE);
d->in_flight_tags[wkr] = 0;
d->in_flight_bitmask &= ~(1UL << wkr);
}
@@ -251,7 +257,8 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
}
}
- if ((data & RTE_DISTRIB_GET_BUF) &&
+ if ((__atomic_load_n(&data, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF) &&
(d->backlog[wkr].count || next_mb)) {
if (d->backlog[wkr].count)
@@ -280,13 +287,16 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
* if they are ready */
for (wkr = 0; wkr < d->num_workers; wkr++)
if (d->backlog[wkr].count &&
- (d->bufs[wkr].bufptr64 & RTE_DISTRIB_GET_BUF)) {
+ (__atomic_load_n(&(d->bufs[wkr].bufptr64),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF)) {
int64_t oldbuf = d->bufs[wkr].bufptr64 >>
RTE_DISTRIB_FLAG_BITS;
store_return(oldbuf, d, &ret_start, &ret_count);
- d->bufs[wkr].bufptr64 = backlog_pop(&d->backlog[wkr]);
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ backlog_pop(&d->backlog[wkr]),
+ __ATOMIC_RELEASE);
}
d->returns.start = ret_start;
--
2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH] lib/distributor: fix deadlock issue for aarch64
2019-10-08 9:55 [dpdk-dev] [PATCH] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
@ 2019-10-08 12:53 ` Hunt, David
2019-10-08 17:05 ` Aaron Conole
` (2 subsequent siblings)
3 siblings, 0 replies; 23+ messages in thread
From: Hunt, David @ 2019-10-08 12:53 UTC (permalink / raw)
To: Ruifeng Wang; +Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, stable
On 08/10/2019 10:55, Ruifeng Wang wrote:
> Distributor and worker threads rely on data structs in cache line
> for synchronization. The shared data structs were not protected.
> This caused deadlock issue on weaker memory ordering platforms as
> aarch64.
> Fix this issue by adding memory barriers to ensure synchronization
> among cores.
>
> Bugzilla ID: 342
> Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
> Cc: stable@dpdk.org
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> lib/librte_distributor/rte_distributor.c | 28 ++++++++++------
> lib/librte_distributor/rte_distributor_v20.c | 34 +++++++++++++-------
> 2 files changed, 41 insertions(+), 21 deletions(-)
>
--snip--
I tested this on my system, and saw no performance degradation. Looks
good. Thanks.
Acked-by: David Hunt <david.hunt@intel.com>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH] lib/distributor: fix deadlock issue for aarch64
2019-10-08 9:55 [dpdk-dev] [PATCH] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
2019-10-08 12:53 ` Hunt, David
@ 2019-10-08 17:05 ` Aaron Conole
2019-10-08 19:46 ` [dpdk-dev] [dpdk-stable] " David Marchand
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 0/2] fix distributor unit test Ruifeng Wang
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test Ruifeng Wang
3 siblings, 1 reply; 23+ messages in thread
From: Aaron Conole @ 2019-10-08 17:05 UTC (permalink / raw)
To: Ruifeng Wang
Cc: david.hunt, dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, stable
Ruifeng Wang <ruifeng.wang@arm.com> writes:
> Distributor and worker threads rely on data structs in cache line
> for synchronization. The shared data structs were not protected.
> This caused deadlock issue on weaker memory ordering platforms as
> aarch64.
> Fix this issue by adding memory barriers to ensure synchronization
> among cores.
>
> Bugzilla ID: 342
> Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
> Cc: stable@dpdk.org
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
I see a failure in the distributor_autotest (on one of the builds):
64/82 DPDK:fast-tests / distributor_autotest FAIL 0.37 s (exit status 255 or signal 127 SIGinvalid)
--- command ---
DPDK_TEST='distributor_autotest' /home/travis/build/ovsrobot/dpdk/build/app/test/dpdk-test -l 0-1 --file-prefix=distributor_autotest
--- stdout ---
EAL: Probing VFIO support...
APP: HPET is not enabled, using TSC as default timer
RTE>>distributor_autotest
=== Basic distributor sanity tests ===
Worker 0 handled 32 packets
Sanity test with all zero hashes done.
Worker 0 handled 32 packets
Sanity test with non-zero hashes done
=== testing big burst (single) ===
Sanity test of returned packets done
=== Sanity test with mbuf alloc/free (single) ===
Sanity test with mbuf alloc/free passed
Too few cores to run worker shutdown test
=== Basic distributor sanity tests ===
Worker 0 handled 32 packets
Sanity test with all zero hashes done.
Worker 0 handled 32 packets
Sanity test with non-zero hashes done
=== testing big burst (burst) ===
Sanity test of returned packets done
=== Sanity test with mbuf alloc/free (burst) ===
Line 326: Packet count is incorrect, 1048568, expected 1048576
Test Failed
RTE>>
--- stderr ---
EAL: Detected 2 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: Multi-process socket /var/run/dpdk/distributor_autotest/mp_socket
EAL: Selected IOVA mode 'PA'
EAL: No available hugepages reported in hugepages-1048576kB
-------
Not sure how to help debug further. I'll re-start the job to see if
it 'clears' up - but I guess there may be a delicate synchronization
somewhere that needs to be accounted for.
> lib/librte_distributor/rte_distributor.c | 28 ++++++++++------
> lib/librte_distributor/rte_distributor_v20.c | 34 +++++++++++++-------
> 2 files changed, 41 insertions(+), 21 deletions(-)
>
> diff --git a/lib/librte_distributor/rte_distributor.c b/lib/librte_distributor/rte_distributor.c
> index 21eb1fb0a..7bf96e224 100644
> --- a/lib/librte_distributor/rte_distributor.c
> +++ b/lib/librte_distributor/rte_distributor.c
> @@ -50,7 +50,8 @@ rte_distributor_request_pkt_v1705(struct rte_distributor *d,
>
> retptr64 = &(buf->retptr64[0]);
> /* Spin while handshake bits are set (scheduler clears it) */
> - while (unlikely(*retptr64 & RTE_DISTRIB_GET_BUF)) {
> + while (unlikely(__atomic_load_n(retptr64, __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF)) {
> rte_pause();
> uint64_t t = rte_rdtsc()+100;
>
> @@ -76,7 +77,8 @@ rte_distributor_request_pkt_v1705(struct rte_distributor *d,
> * Finally, set the GET_BUF to signal to distributor that cache
> * line is ready for processing
> */
> - *retptr64 |= RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(retptr64, *retptr64 | RTE_DISTRIB_GET_BUF,
> + __ATOMIC_RELEASE);
> }
> BIND_DEFAULT_SYMBOL(rte_distributor_request_pkt, _v1705, 17.05);
> MAP_STATIC_SYMBOL(void rte_distributor_request_pkt(struct rte_distributor *d,
> @@ -99,7 +101,8 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
> }
>
> /* If bit is set, return */
> - if (buf->bufptr64[0] & RTE_DISTRIB_GET_BUF)
> + if (__atomic_load_n(&(buf->bufptr64[0]), __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF)
> return -1;
>
> /* since bufptr64 is signed, this should be an arithmetic shift */
> @@ -116,6 +119,8 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
> * on the next cacheline while we're working.
> */
> buf->bufptr64[0] |= RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(&(buf->bufptr64[0]),
> + buf->bufptr64[0] | RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
>
> return count;
> }
> @@ -183,7 +188,8 @@ rte_distributor_return_pkt_v1705(struct rte_distributor *d,
> RTE_DISTRIB_FLAG_BITS) | RTE_DISTRIB_RETURN_BUF;
>
> /* set the GET_BUF but even if we got no returns */
> - buf->retptr64[0] |= RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(&(buf->retptr64[0]),
> + buf->retptr64[0] | RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
>
> return 0;
> }
> @@ -273,7 +279,8 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
> unsigned int count = 0;
> unsigned int i;
>
> - if (buf->retptr64[0] & RTE_DISTRIB_GET_BUF) {
> + if (__atomic_load_n(&(buf->retptr64[0]), __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF) {
> for (i = 0; i < RTE_DIST_BURST_SIZE; i++) {
> if (buf->retptr64[i] & RTE_DISTRIB_RETURN_BUF) {
> oldbuf = ((uintptr_t)(buf->retptr64[i] >>
> @@ -287,7 +294,7 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
> d->returns.start = ret_start;
> d->returns.count = ret_count;
> /* Clear for the worker to populate with more returns */
> - buf->retptr64[0] = 0;
> + __atomic_store_n(&(buf->retptr64[0]), 0, __ATOMIC_RELEASE);
> }
> return count;
> }
> @@ -307,7 +314,8 @@ release(struct rte_distributor *d, unsigned int wkr)
> struct rte_distributor_buffer *buf = &(d->bufs[wkr]);
> unsigned int i;
>
> - while (!(d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF))
> + while (!(__atomic_load_n(&(d->bufs[wkr].bufptr64[0]), __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF))
> rte_pause();
>
> handle_returns(d, wkr);
> @@ -328,7 +336,8 @@ release(struct rte_distributor *d, unsigned int wkr)
> d->backlog[wkr].count = 0;
>
> /* Clear the GET bit */
> - buf->bufptr64[0] &= ~RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(&(buf->bufptr64[0]),
> + buf->bufptr64[0] & ~RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
> return buf->count;
>
> }
> @@ -574,7 +583,8 @@ rte_distributor_clear_returns_v1705(struct rte_distributor *d)
>
> /* throw away returns, so workers can exit */
> for (wkr = 0; wkr < d->num_workers; wkr++)
> - d->bufs[wkr].retptr64[0] = 0;
> + __atomic_store_n(&(d->bufs[wkr].retptr64[0]), 0,
> + __ATOMIC_RELEASE);
> }
> BIND_DEFAULT_SYMBOL(rte_distributor_clear_returns, _v1705, 17.05);
> MAP_STATIC_SYMBOL(void rte_distributor_clear_returns(struct rte_distributor *d),
> diff --git a/lib/librte_distributor/rte_distributor_v20.c b/lib/librte_distributor/rte_distributor_v20.c
> index cdc0969a8..3a5810c6d 100644
> --- a/lib/librte_distributor/rte_distributor_v20.c
> +++ b/lib/librte_distributor/rte_distributor_v20.c
> @@ -34,9 +34,10 @@ rte_distributor_request_pkt_v20(struct rte_distributor_v20 *d,
> union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
> | RTE_DISTRIB_GET_BUF;
> - while (unlikely(buf->bufptr64 & RTE_DISTRIB_FLAGS_MASK))
> + while (unlikely(__atomic_load_n(&(buf->bufptr64), __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_FLAGS_MASK))
> rte_pause();
> - buf->bufptr64 = req;
> + __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
> }
> VERSION_SYMBOL(rte_distributor_request_pkt, _v20, 2.0);
>
> @@ -45,7 +46,8 @@ rte_distributor_poll_pkt_v20(struct rte_distributor_v20 *d,
> unsigned worker_id)
> {
> union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> - if (buf->bufptr64 & RTE_DISTRIB_GET_BUF)
> + if (__atomic_load_n(&(buf->bufptr64), __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF)
> return NULL;
>
> /* since bufptr64 is signed, this should be an arithmetic shift */
> @@ -73,7 +75,7 @@ rte_distributor_return_pkt_v20(struct rte_distributor_v20 *d,
> union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
> | RTE_DISTRIB_RETURN_BUF;
> - buf->bufptr64 = req;
> + __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
> return 0;
> }
> VERSION_SYMBOL(rte_distributor_return_pkt, _v20, 2.0);
> @@ -117,7 +119,7 @@ handle_worker_shutdown(struct rte_distributor_v20 *d, unsigned int wkr)
> {
> d->in_flight_tags[wkr] = 0;
> d->in_flight_bitmask &= ~(1UL << wkr);
> - d->bufs[wkr].bufptr64 = 0;
> + __atomic_store_n(&(d->bufs[wkr].bufptr64), 0, __ATOMIC_RELEASE);
> if (unlikely(d->backlog[wkr].count != 0)) {
> /* On return of a packet, we need to move the
> * queued packets for this core elsewhere.
> @@ -165,13 +167,17 @@ process_returns(struct rte_distributor_v20 *d)
> const int64_t data = d->bufs[wkr].bufptr64;
> uintptr_t oldbuf = 0;
>
> - if (data & RTE_DISTRIB_GET_BUF) {
> + if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF) {
> flushed++;
> if (d->backlog[wkr].count)
> - d->bufs[wkr].bufptr64 =
> - backlog_pop(&d->backlog[wkr]);
> + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> + backlog_pop(&d->backlog[wkr]),
> + __ATOMIC_RELEASE);
> else {
> - d->bufs[wkr].bufptr64 = RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> + RTE_DISTRIB_GET_BUF,
> + __ATOMIC_RELEASE);
> d->in_flight_tags[wkr] = 0;
> d->in_flight_bitmask &= ~(1UL << wkr);
> }
> @@ -251,7 +257,8 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
> }
> }
>
> - if ((data & RTE_DISTRIB_GET_BUF) &&
> + if ((__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF) &&
> (d->backlog[wkr].count || next_mb)) {
>
> if (d->backlog[wkr].count)
> @@ -280,13 +287,16 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
> * if they are ready */
> for (wkr = 0; wkr < d->num_workers; wkr++)
> if (d->backlog[wkr].count &&
> - (d->bufs[wkr].bufptr64 & RTE_DISTRIB_GET_BUF)) {
> + (__atomic_load_n(&(d->bufs[wkr].bufptr64),
> + __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF)) {
>
> int64_t oldbuf = d->bufs[wkr].bufptr64 >>
> RTE_DISTRIB_FLAG_BITS;
> store_return(oldbuf, d, &ret_start, &ret_count);
>
> - d->bufs[wkr].bufptr64 = backlog_pop(&d->backlog[wkr]);
> + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> + backlog_pop(&d->backlog[wkr]),
> + __ATOMIC_RELEASE);
> }
>
> d->returns.start = ret_start;
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [dpdk-stable] [PATCH] lib/distributor: fix deadlock issue for aarch64
2019-10-08 17:05 ` Aaron Conole
@ 2019-10-08 19:46 ` David Marchand
2019-10-08 20:08 ` Aaron Conole
2019-10-09 5:52 ` Ruifeng Wang (Arm Technology China)
0 siblings, 2 replies; 23+ messages in thread
From: David Marchand @ 2019-10-08 19:46 UTC (permalink / raw)
To: Aaron Conole
Cc: Ruifeng Wang, David Hunt, dev, hkalra, Gavin Hu,
Honnappa Nagarahalli, nd, dpdk stable
On Tue, Oct 8, 2019 at 7:06 PM Aaron Conole <aconole@redhat.com> wrote:
>
> Ruifeng Wang <ruifeng.wang@arm.com> writes:
>
> > Distributor and worker threads rely on data structs in cache line
> > for synchronization. The shared data structs were not protected.
> > This caused deadlock issue on weaker memory ordering platforms as
> > aarch64.
> > Fix this issue by adding memory barriers to ensure synchronization
> > among cores.
> >
> > Bugzilla ID: 342
> > Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > ---
>
> I see a failure in the distributor_autotest (on one of the builds):
>
> 64/82 DPDK:fast-tests / distributor_autotest FAIL 0.37 s (exit status 255 or signal 127 SIGinvalid)
>
> --- command ---
>
> DPDK_TEST='distributor_autotest' /home/travis/build/ovsrobot/dpdk/build/app/test/dpdk-test -l 0-1 --file-prefix=distributor_autotest
>
> --- stdout ---
>
> EAL: Probing VFIO support...
>
> APP: HPET is not enabled, using TSC as default timer
>
> RTE>>distributor_autotest
>
> === Basic distributor sanity tests ===
>
> Worker 0 handled 32 packets
>
> Sanity test with all zero hashes done.
>
> Worker 0 handled 32 packets
>
> Sanity test with non-zero hashes done
>
> === testing big burst (single) ===
>
> Sanity test of returned packets done
>
> === Sanity test with mbuf alloc/free (single) ===
>
> Sanity test with mbuf alloc/free passed
>
> Too few cores to run worker shutdown test
>
> === Basic distributor sanity tests ===
>
> Worker 0 handled 32 packets
>
> Sanity test with all zero hashes done.
>
> Worker 0 handled 32 packets
>
> Sanity test with non-zero hashes done
>
> === testing big burst (burst) ===
>
> Sanity test of returned packets done
>
> === Sanity test with mbuf alloc/free (burst) ===
>
> Line 326: Packet count is incorrect, 1048568, expected 1048576
>
> Test Failed
>
> RTE>>
>
> --- stderr ---
>
> EAL: Detected 2 lcore(s)
>
> EAL: Detected 1 NUMA nodes
>
> EAL: Multi-process socket /var/run/dpdk/distributor_autotest/mp_socket
>
> EAL: Selected IOVA mode 'PA'
>
> EAL: No available hugepages reported in hugepages-1048576kB
>
> -------
>
> Not sure how to help debug further. I'll re-start the job to see if
> it 'clears' up - but I guess there may be a delicate synchronization
> somewhere that needs to be accounted.
Idem, and with the same loop I used before, it can be caught quickly.
# time (log=/tmp/$$.log; while true; do echo distributor_autotest
|taskset -c 0-1 ./build-gcc-static/app/test/dpdk-test --log-level *:8
-l 0-1 >$log 2>&1; grep -q 'Test OK' $log || break; done; cat $log; rm
-f $log)
[snip]
RTE>>distributor_autotest
EAL: Trying to obtain current memory policy.
EAL: Setting policy MPOL_PREFERRED for socket 0
EAL: Restoring previous memory policy: 0
EAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 2MB
EAL: Trying to obtain current memory policy.
EAL: Setting policy MPOL_PREFERRED for socket 0
EAL: Restoring previous memory policy: 0
EAL: alloc_pages_on_heap(): couldn't allocate physically contiguous space
EAL: Trying to obtain current memory policy.
EAL: Setting policy MPOL_PREFERRED for socket 0
EAL: Restoring previous memory policy: 0
EAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 8MB
=== Basic distributor sanity tests ===
Worker 0 handled 32 packets
Sanity test with all zero hashes done.
Worker 0 handled 32 packets
Sanity test with non-zero hashes done
=== testing big burst (single) ===
Sanity test of returned packets done
=== Sanity test with mbuf alloc/free (single) ===
Sanity test with mbuf alloc/free passed
Too few cores to run worker shutdown test
=== Basic distributor sanity tests ===
Worker 0 handled 32 packets
Sanity test with all zero hashes done.
Worker 0 handled 32 packets
Sanity test with non-zero hashes done
=== testing big burst (burst) ===
Sanity test of returned packets done
=== Sanity test with mbuf alloc/free (burst) ===
Line 326: Packet count is incorrect, 1048568, expected 1048576
Test Failed
RTE>>
real 0m36.668s
user 1m7.293s
sys 0m1.560s
Could be worth running this loop on all tests? (not talking about the
CI, it would be a manual effort to catch lurking issues).
--
David Marchand
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [dpdk-stable] [PATCH] lib/distributor: fix deadlock issue for aarch64
2019-10-08 19:46 ` [dpdk-dev] [dpdk-stable] " David Marchand
@ 2019-10-08 20:08 ` Aaron Conole
2019-10-09 5:52 ` Ruifeng Wang (Arm Technology China)
1 sibling, 0 replies; 23+ messages in thread
From: Aaron Conole @ 2019-10-08 20:08 UTC (permalink / raw)
To: David Marchand
Cc: Ruifeng Wang, David Hunt, dev, hkalra, Gavin Hu,
Honnappa Nagarahalli, nd, dpdk stable
David Marchand <david.marchand@redhat.com> writes:
> On Tue, Oct 8, 2019 at 7:06 PM Aaron Conole <aconole@redhat.com> wrote:
>>
>> Ruifeng Wang <ruifeng.wang@arm.com> writes:
>>
>> > Distributor and worker threads rely on data structs in cache line
>> > for synchronization. The shared data structs were not protected.
>> > This caused deadlock issue on weaker memory ordering platforms as
>> > aarch64.
>> > Fix this issue by adding memory barriers to ensure synchronization
>> > among cores.
>> >
>> > Bugzilla ID: 342
>> > Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
>> > Cc: stable@dpdk.org
>> >
>> > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
>> > ---
>>
>> I see a failure in the distributor_autotest (on one of the builds):
>>
>> 64/82 DPDK:fast-tests / distributor_autotest FAIL 0.37 s (exit
>> status 255 or signal 127 SIGinvalid)
>>
>> --- command ---
>>
>> DPDK_TEST='distributor_autotest'
>> /home/travis/build/ovsrobot/dpdk/build/app/test/dpdk-test -l 0-1
>> --file-prefix=distributor_autotest
>>
>> --- stdout ---
>>
>> EAL: Probing VFIO support...
>>
>> APP: HPET is not enabled, using TSC as default timer
>>
>> RTE>>distributor_autotest
>>
>> === Basic distributor sanity tests ===
>>
>> Worker 0 handled 32 packets
>>
>> Sanity test with all zero hashes done.
>>
>> Worker 0 handled 32 packets
>>
>> Sanity test with non-zero hashes done
>>
>> === testing big burst (single) ===
>>
>> Sanity test of returned packets done
>>
>> === Sanity test with mbuf alloc/free (single) ===
>>
>> Sanity test with mbuf alloc/free passed
>>
>> Too few cores to run worker shutdown test
>>
>> === Basic distributor sanity tests ===
>>
>> Worker 0 handled 32 packets
>>
>> Sanity test with all zero hashes done.
>>
>> Worker 0 handled 32 packets
>>
>> Sanity test with non-zero hashes done
>>
>> === testing big burst (burst) ===
>>
>> Sanity test of returned packets done
>>
>> === Sanity test with mbuf alloc/free (burst) ===
>>
>> Line 326: Packet count is incorrect, 1048568, expected 1048576
>>
>> Test Failed
>>
>> RTE>>
>>
>> --- stderr ---
>>
>> EAL: Detected 2 lcore(s)
>>
>> EAL: Detected 1 NUMA nodes
>>
>> EAL: Multi-process socket /var/run/dpdk/distributor_autotest/mp_socket
>>
>> EAL: Selected IOVA mode 'PA'
>>
>> EAL: No available hugepages reported in hugepages-1048576kB
>>
>> -------
>>
>> Not sure how to help debug further. I'll re-start the job to see if
>> it 'clears' up - but I guess there may be a delicate synchronization
>> somewhere that needs to be accounted.
>
> Idem, and with the same loop I used before, it can be caught quickly.
>
> # time (log=/tmp/$$.log; while true; do echo distributor_autotest
> |taskset -c 0-1 ./build-gcc-static/app/test/dpdk-test --log-level *:8
> -l 0-1 >$log 2>&1; grep -q 'Test OK' $log || break; done; cat $log; rm
> -f $log)
Probably good to document it, yes. It seems to be a good technique for
reproducing failures.
> [snip]
>
> RTE>>distributor_autotest
> EAL: Trying to obtain current memory policy.
> EAL: Setting policy MPOL_PREFERRED for socket 0
> EAL: Restoring previous memory policy: 0
> EAL: request: mp_malloc_sync
> EAL: Heap on socket 0 was expanded by 2MB
> EAL: Trying to obtain current memory policy.
> EAL: Setting policy MPOL_PREFERRED for socket 0
> EAL: Restoring previous memory policy: 0
> EAL: alloc_pages_on_heap(): couldn't allocate physically contiguous space
> EAL: Trying to obtain current memory policy.
> EAL: Setting policy MPOL_PREFERRED for socket 0
> EAL: Restoring previous memory policy: 0
> EAL: request: mp_malloc_sync
> EAL: Heap on socket 0 was expanded by 8MB
> === Basic distributor sanity tests ===
> Worker 0 handled 32 packets
> Sanity test with all zero hashes done.
> Worker 0 handled 32 packets
> Sanity test with non-zero hashes done
> === testing big burst (single) ===
> Sanity test of returned packets done
>
> === Sanity test with mbuf alloc/free (single) ===
> Sanity test with mbuf alloc/free passed
>
> Too few cores to run worker shutdown test
> === Basic distributor sanity tests ===
> Worker 0 handled 32 packets
> Sanity test with all zero hashes done.
> Worker 0 handled 32 packets
> Sanity test with non-zero hashes done
> === testing big burst (burst) ===
> Sanity test of returned packets done
>
> === Sanity test with mbuf alloc/free (burst) ===
> Line 326: Packet count is incorrect, 1048568, expected 1048576
> Test Failed
> RTE>>
> real 0m36.668s
> user 1m7.293s
> sys 0m1.560s
>
> Could be worth running this loop on all tests? (not talking about the
> CI, it would be a manual effort to catch lurking issues).
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [dpdk-stable] [PATCH] lib/distributor: fix deadlock issue for aarch64
2019-10-08 19:46 ` [dpdk-dev] [dpdk-stable] " David Marchand
2019-10-08 20:08 ` Aaron Conole
@ 2019-10-09 5:52 ` Ruifeng Wang (Arm Technology China)
2019-10-17 11:42 ` [dpdk-dev] [EXT] " Harman Kalra
1 sibling, 1 reply; 23+ messages in thread
From: Ruifeng Wang (Arm Technology China) @ 2019-10-09 5:52 UTC (permalink / raw)
To: David Marchand, Aaron Conole
Cc: David Hunt, dev, hkalra, Gavin Hu (Arm Technology China),
Honnappa Nagarahalli, nd, dpdk stable, nd
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Wednesday, October 9, 2019 03:47
> To: Aaron Conole <aconole@redhat.com>
> Cc: Ruifeng Wang (Arm Technology China) <Ruifeng.Wang@arm.com>; David
> Hunt <david.hunt@intel.com>; dev <dev@dpdk.org>; hkalra@marvell.com;
> Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; dpdk
> stable <stable@dpdk.org>
> Subject: Re: [dpdk-stable] [dpdk-dev] [PATCH] lib/distributor: fix deadlock
> issue for aarch64
>
> On Tue, Oct 8, 2019 at 7:06 PM Aaron Conole <aconole@redhat.com> wrote:
> >
> > Ruifeng Wang <ruifeng.wang@arm.com> writes:
> >
> > > Distributor and worker threads rely on data structs in cache line
> > > for synchronization. The shared data structs were not protected.
> > > This caused deadlock issue on weaker memory ordering platforms as
> > > aarch64.
> > > Fix this issue by adding memory barriers to ensure synchronization
> > > among cores.
> > >
> > > Bugzilla ID: 342
> > > Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
> > > Cc: stable@dpdk.org
> > >
> > > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > ---
> >
> > I see a failure in the distributor_autotest (on one of the builds):
> >
> > 64/82 DPDK:fast-tests / distributor_autotest FAIL 0.37 s (exit status 255
> or signal 127 SIGinvalid)
> >
> > --- command ---
> >
> > DPDK_TEST='distributor_autotest'
> > /home/travis/build/ovsrobot/dpdk/build/app/test/dpdk-test -l 0-1
> > --file-prefix=distributor_autotest
> >
> > --- stdout ---
> >
> > EAL: Probing VFIO support...
> >
> > APP: HPET is not enabled, using TSC as default timer
> >
> > RTE>>distributor_autotest
> >
> > === Basic distributor sanity tests ===
> >
> > Worker 0 handled 32 packets
> >
> > Sanity test with all zero hashes done.
> >
> > Worker 0 handled 32 packets
> >
> > Sanity test with non-zero hashes done
> >
> > === testing big burst (single) ===
> >
> > Sanity test of returned packets done
> >
> > === Sanity test with mbuf alloc/free (single) ===
> >
> > Sanity test with mbuf alloc/free passed
> >
> > Too few cores to run worker shutdown test
> >
> > === Basic distributor sanity tests ===
> >
> > Worker 0 handled 32 packets
> >
> > Sanity test with all zero hashes done.
> >
> > Worker 0 handled 32 packets
> >
> > Sanity test with non-zero hashes done
> >
> > === testing big burst (burst) ===
> >
> > Sanity test of returned packets done
> >
> > === Sanity test with mbuf alloc/free (burst) ===
> >
> > Line 326: Packet count is incorrect, 1048568, expected 1048576
> >
> > Test Failed
> >
> > RTE>>
> >
> > --- stderr ---
> >
> > EAL: Detected 2 lcore(s)
> >
> > EAL: Detected 1 NUMA nodes
> >
> > EAL: Multi-process socket /var/run/dpdk/distributor_autotest/mp_socket
> >
> > EAL: Selected IOVA mode 'PA'
> >
> > EAL: No available hugepages reported in hugepages-1048576kB
> >
> > -------
> >
> > Not sure how to help debug further. I'll re-start the job to see if
> > it 'clears' up - but I guess there may be a delicate synchronization
> > somewhere that needs to be accounted.
>
> Idem, and with the same loop I used before, it can be caught quickly.
>
> # time (log=/tmp/$$.log; while true; do echo distributor_autotest
> |taskset -c 0-1 ./build-gcc-static/app/test/dpdk-test --log-level *:8
> -l 0-1 >$log 2>&1; grep -q 'Test OK' $log || break; done; cat $log; rm -f $log)
>
Thanks Aaron and David for your report. I can reproduce this issue with the script.
Will fix it in next version.
> [snip]
>
> RTE>>distributor_autotest
> EAL: Trying to obtain current memory policy.
> EAL: Setting policy MPOL_PREFERRED for socket 0
> EAL: Restoring previous memory policy: 0
> EAL: request: mp_malloc_sync
> EAL: Heap on socket 0 was expanded by 2MB
> EAL: Trying to obtain current memory policy.
> EAL: Setting policy MPOL_PREFERRED for socket 0
> EAL: Restoring previous memory policy: 0
> EAL: alloc_pages_on_heap(): couldn't allocate physically contiguous space
> EAL: Trying to obtain current memory policy.
> EAL: Setting policy MPOL_PREFERRED for socket 0
> EAL: Restoring previous memory policy: 0
> EAL: request: mp_malloc_sync
> EAL: Heap on socket 0 was expanded by 8MB === Basic distributor sanity
> tests === Worker 0 handled 32 packets Sanity test with all zero hashes done.
> Worker 0 handled 32 packets
> Sanity test with non-zero hashes done
> === testing big burst (single) ===
> Sanity test of returned packets done
>
> === Sanity test with mbuf alloc/free (single) === Sanity test with mbuf
> alloc/free passed
>
> Too few cores to run worker shutdown test === Basic distributor sanity tests
> === Worker 0 handled 32 packets Sanity test with all zero hashes done.
> Worker 0 handled 32 packets
> Sanity test with non-zero hashes done
> === testing big burst (burst) ===
> Sanity test of returned packets done
>
> === Sanity test with mbuf alloc/free (burst) === Line 326: Packet count is
> incorrect, 1048568, expected 1048576 Test Failed
> RTE>>
> real 0m36.668s
> user 1m7.293s
> sys 0m1.560s
>
> Could be worth running this loop on all tests? (not talking about the CI, it
> would be a manual effort to catch lurking issues).
>
>
> --
> David Marchand
^ permalink raw reply [flat|nested] 23+ messages in thread
* [dpdk-dev] [PATCH v2 0/2] fix distributor unit test
2019-10-08 9:55 [dpdk-dev] [PATCH] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
2019-10-08 12:53 ` Hunt, David
2019-10-08 17:05 ` Aaron Conole
@ 2019-10-12 2:43 ` Ruifeng Wang
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 2/2] test/distributor: fix false unit test failure Ruifeng Wang
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test Ruifeng Wang
3 siblings, 2 replies; 23+ messages in thread
From: Ruifeng Wang @ 2019-10-12 2:43 UTC (permalink / raw)
To: david.hunt; +Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, Ruifeng Wang
Bug 342 reported distributor_autotest execution suspension
on the aarch64 platform.
The issue was due to a lack of synchronization among threads. The
distributor thread and worker thread could get deadlocked.
Fixed the synchronization issue by adding barriers.
Another issue identified was in the test case. A non-atomic operation on
the stat value could cause the value reset to not be observed by the worker
thread, corrupting the counters. The issue was fixed by using atomic operations.
---
v2:
Fixed intermittent packet count incorrect failure. (Aaron, David)
Additional patch to fix non-atomic operation in unit test.
Ruifeng Wang (2):
lib/distributor: fix deadlock issue for aarch64
test/distributor: fix false unit test failure
app/test/test_distributor.c | 6 ++-
lib/librte_distributor/meson.build | 5 ++
lib/librte_distributor/rte_distributor.c | 39 ++++++++++------
lib/librte_distributor/rte_distributor_v20.c | 49 +++++++++++++-------
4 files changed, 67 insertions(+), 32 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* [dpdk-dev] [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 0/2] fix distributor unit test Ruifeng Wang
@ 2019-10-12 2:43 ` Ruifeng Wang
2019-10-13 2:31 ` Honnappa Nagarahalli
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 2/2] test/distributor: fix false unit test failure Ruifeng Wang
1 sibling, 1 reply; 23+ messages in thread
From: Ruifeng Wang @ 2019-10-12 2:43 UTC (permalink / raw)
To: david.hunt
Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, Ruifeng Wang, stable
Distributor and worker threads rely on data structs in a cache line
for synchronization. The shared data structs were not protected.
This caused a deadlock issue on platforms with weaker memory ordering,
such as aarch64.
Fix this issue by adding memory barriers to ensure synchronization
among cores.
Bugzilla ID: 342
Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
Cc: stable@dpdk.org
Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
lib/librte_distributor/meson.build | 5 ++
lib/librte_distributor/rte_distributor.c | 39 ++++++++++------
lib/librte_distributor/rte_distributor_v20.c | 49 +++++++++++++-------
3 files changed, 63 insertions(+), 30 deletions(-)
diff --git a/lib/librte_distributor/meson.build b/lib/librte_distributor/meson.build
index dba7e3b2a..26577dbc1 100644
--- a/lib/librte_distributor/meson.build
+++ b/lib/librte_distributor/meson.build
@@ -9,3 +9,8 @@ else
endif
headers = files('rte_distributor.h')
deps += ['mbuf']
+
+# for clang 32-bit compiles we need libatomic for 64-bit atomic ops
+if cc.get_id() == 'clang' and dpdk_conf.get('RTE_ARCH_64') == false
+ ext_deps += cc.find_library('atomic')
+endif
diff --git a/lib/librte_distributor/rte_distributor.c b/lib/librte_distributor/rte_distributor.c
index 21eb1fb0a..b653146d0 100644
--- a/lib/librte_distributor/rte_distributor.c
+++ b/lib/librte_distributor/rte_distributor.c
@@ -50,7 +50,8 @@ rte_distributor_request_pkt_v1705(struct rte_distributor *d,
retptr64 = &(buf->retptr64[0]);
/* Spin while handshake bits are set (scheduler clears it) */
- while (unlikely(*retptr64 & RTE_DISTRIB_GET_BUF)) {
+ while (unlikely(__atomic_load_n(retptr64, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)) {
rte_pause();
uint64_t t = rte_rdtsc()+100;
@@ -76,7 +77,8 @@ rte_distributor_request_pkt_v1705(struct rte_distributor *d,
* Finally, set the GET_BUF to signal to distributor that cache
* line is ready for processing
*/
- *retptr64 |= RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(retptr64, *retptr64 | RTE_DISTRIB_GET_BUF,
+ __ATOMIC_RELEASE);
}
BIND_DEFAULT_SYMBOL(rte_distributor_request_pkt, _v1705, 17.05);
MAP_STATIC_SYMBOL(void rte_distributor_request_pkt(struct rte_distributor *d,
@@ -99,7 +101,8 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
}
/* If bit is set, return */
- if (buf->bufptr64[0] & RTE_DISTRIB_GET_BUF)
+ if (__atomic_load_n(&(buf->bufptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)
return -1;
/* since bufptr64 is signed, this should be an arithmetic shift */
@@ -115,7 +118,8 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
* mbuf pointers, so toggle the bit so scheduler can start working
* on the next cacheline while we're working.
*/
- buf->bufptr64[0] |= RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(buf->bufptr64[0]),
+ buf->bufptr64[0] | RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return count;
}
@@ -174,6 +178,7 @@ rte_distributor_return_pkt_v1705(struct rte_distributor *d,
return -EINVAL;
}
+ __atomic_thread_fence(__ATOMIC_ACQUIRE);
for (i = 0; i < RTE_DIST_BURST_SIZE; i++)
/* Switch off the return bit first */
buf->retptr64[i] &= ~RTE_DISTRIB_RETURN_BUF;
@@ -183,7 +188,8 @@ rte_distributor_return_pkt_v1705(struct rte_distributor *d,
RTE_DISTRIB_FLAG_BITS) | RTE_DISTRIB_RETURN_BUF;
/* set the GET_BUF but even if we got no returns */
- buf->retptr64[0] |= RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(buf->retptr64[0]),
+ buf->retptr64[0] | RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return 0;
}
@@ -273,7 +279,8 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
unsigned int count = 0;
unsigned int i;
- if (buf->retptr64[0] & RTE_DISTRIB_GET_BUF) {
+ if (__atomic_load_n(&(buf->retptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF) {
for (i = 0; i < RTE_DIST_BURST_SIZE; i++) {
if (buf->retptr64[i] & RTE_DISTRIB_RETURN_BUF) {
oldbuf = ((uintptr_t)(buf->retptr64[i] >>
@@ -287,7 +294,7 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
d->returns.start = ret_start;
d->returns.count = ret_count;
/* Clear for the worker to populate with more returns */
- buf->retptr64[0] = 0;
+ __atomic_store_n(&(buf->retptr64[0]), 0, __ATOMIC_RELEASE);
}
return count;
}
@@ -307,7 +314,8 @@ release(struct rte_distributor *d, unsigned int wkr)
struct rte_distributor_buffer *buf = &(d->bufs[wkr]);
unsigned int i;
- while (!(d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF))
+ while (!(__atomic_load_n(&(d->bufs[wkr].bufptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF))
rte_pause();
handle_returns(d, wkr);
@@ -328,7 +336,8 @@ release(struct rte_distributor *d, unsigned int wkr)
d->backlog[wkr].count = 0;
/* Clear the GET bit */
- buf->bufptr64[0] &= ~RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(buf->bufptr64[0]),
+ buf->bufptr64[0] & ~RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return buf->count;
}
@@ -355,7 +364,8 @@ rte_distributor_process_v1705(struct rte_distributor *d,
if (unlikely(num_mbufs == 0)) {
/* Flush out all non-full cache-lines to workers. */
for (wid = 0 ; wid < d->num_workers; wid++) {
- if (d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF) {
+ if (__atomic_load_n(&(d->bufs[wid].bufptr64[0]),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF) {
release(d, wid);
handle_returns(d, wid);
}
@@ -367,7 +377,8 @@ rte_distributor_process_v1705(struct rte_distributor *d,
uint16_t matches[RTE_DIST_BURST_SIZE];
unsigned int pkts;
- if (d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF)
+ if (__atomic_load_n(&(d->bufs[wkr].bufptr64[0]),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF)
d->bufs[wkr].count = 0;
if ((num_mbufs - next_idx) < RTE_DIST_BURST_SIZE)
@@ -465,7 +476,8 @@ rte_distributor_process_v1705(struct rte_distributor *d,
/* Flush out all non-full cache-lines to workers. */
for (wid = 0 ; wid < d->num_workers; wid++)
- if ((d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF))
+ if ((__atomic_load_n(&(d->bufs[wid].bufptr64[0]),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF))
release(d, wid);
return num_mbufs;
@@ -574,7 +586,8 @@ rte_distributor_clear_returns_v1705(struct rte_distributor *d)
/* throw away returns, so workers can exit */
for (wkr = 0; wkr < d->num_workers; wkr++)
- d->bufs[wkr].retptr64[0] = 0;
+ __atomic_store_n(&(d->bufs[wkr].retptr64[0]), 0,
+ __ATOMIC_RELEASE);
}
BIND_DEFAULT_SYMBOL(rte_distributor_clear_returns, _v1705, 17.05);
MAP_STATIC_SYMBOL(void rte_distributor_clear_returns(struct rte_distributor *d),
diff --git a/lib/librte_distributor/rte_distributor_v20.c b/lib/librte_distributor/rte_distributor_v20.c
index cdc0969a8..41411e3c1 100644
--- a/lib/librte_distributor/rte_distributor_v20.c
+++ b/lib/librte_distributor/rte_distributor_v20.c
@@ -34,9 +34,10 @@ rte_distributor_request_pkt_v20(struct rte_distributor_v20 *d,
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
| RTE_DISTRIB_GET_BUF;
- while (unlikely(buf->bufptr64 & RTE_DISTRIB_FLAGS_MASK))
+ while (unlikely(__atomic_load_n(&(buf->bufptr64), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_FLAGS_MASK))
rte_pause();
- buf->bufptr64 = req;
+ __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
}
VERSION_SYMBOL(rte_distributor_request_pkt, _v20, 2.0);
@@ -45,7 +46,8 @@ rte_distributor_poll_pkt_v20(struct rte_distributor_v20 *d,
unsigned worker_id)
{
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
- if (buf->bufptr64 & RTE_DISTRIB_GET_BUF)
+ if (__atomic_load_n(&(buf->bufptr64), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)
return NULL;
/* since bufptr64 is signed, this should be an arithmetic shift */
@@ -73,7 +75,7 @@ rte_distributor_return_pkt_v20(struct rte_distributor_v20 *d,
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
| RTE_DISTRIB_RETURN_BUF;
- buf->bufptr64 = req;
+ __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
return 0;
}
VERSION_SYMBOL(rte_distributor_return_pkt, _v20, 2.0);
@@ -117,7 +119,7 @@ handle_worker_shutdown(struct rte_distributor_v20 *d, unsigned int wkr)
{
d->in_flight_tags[wkr] = 0;
d->in_flight_bitmask &= ~(1UL << wkr);
- d->bufs[wkr].bufptr64 = 0;
+ __atomic_store_n(&(d->bufs[wkr].bufptr64), 0, __ATOMIC_RELEASE);
if (unlikely(d->backlog[wkr].count != 0)) {
/* On return of a packet, we need to move the
* queued packets for this core elsewhere.
@@ -165,18 +167,23 @@ process_returns(struct rte_distributor_v20 *d)
const int64_t data = d->bufs[wkr].bufptr64;
uintptr_t oldbuf = 0;
- if (data & RTE_DISTRIB_GET_BUF) {
+ if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF) {
flushed++;
if (d->backlog[wkr].count)
- d->bufs[wkr].bufptr64 =
- backlog_pop(&d->backlog[wkr]);
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ backlog_pop(&d->backlog[wkr]),
+ __ATOMIC_RELEASE);
else {
- d->bufs[wkr].bufptr64 = RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ RTE_DISTRIB_GET_BUF,
+ __ATOMIC_RELEASE);
d->in_flight_tags[wkr] = 0;
d->in_flight_bitmask &= ~(1UL << wkr);
}
oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
- } else if (data & RTE_DISTRIB_RETURN_BUF) {
+ } else if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_RETURN_BUF) {
handle_worker_shutdown(d, wkr);
oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
}
@@ -251,21 +258,26 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
}
}
- if ((data & RTE_DISTRIB_GET_BUF) &&
+ if ((__atomic_load_n(&data, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF) &&
(d->backlog[wkr].count || next_mb)) {
if (d->backlog[wkr].count)
- d->bufs[wkr].bufptr64 =
- backlog_pop(&d->backlog[wkr]);
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ backlog_pop(&d->backlog[wkr]),
+ __ATOMIC_RELEASE);
else {
- d->bufs[wkr].bufptr64 = next_value;
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ next_value,
+ __ATOMIC_RELEASE);
d->in_flight_tags[wkr] = new_tag;
d->in_flight_bitmask |= (1UL << wkr);
next_mb = NULL;
}
oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
- } else if (data & RTE_DISTRIB_RETURN_BUF) {
+ } else if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_RETURN_BUF) {
handle_worker_shutdown(d, wkr);
oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
}
@@ -280,13 +292,16 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
* if they are ready */
for (wkr = 0; wkr < d->num_workers; wkr++)
if (d->backlog[wkr].count &&
- (d->bufs[wkr].bufptr64 & RTE_DISTRIB_GET_BUF)) {
+ (__atomic_load_n(&(d->bufs[wkr].bufptr64),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF)) {
int64_t oldbuf = d->bufs[wkr].bufptr64 >>
RTE_DISTRIB_FLAG_BITS;
store_return(oldbuf, d, &ret_start, &ret_count);
- d->bufs[wkr].bufptr64 = backlog_pop(&d->backlog[wkr]);
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ backlog_pop(&d->backlog[wkr]),
+ __ATOMIC_RELEASE);
}
d->returns.start = ret_start;
--
2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* [dpdk-dev] [PATCH v2 2/2] test/distributor: fix false unit test failure
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 0/2] fix distributor unit test Ruifeng Wang
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
@ 2019-10-12 2:43 ` Ruifeng Wang
1 sibling, 0 replies; 23+ messages in thread
From: Ruifeng Wang @ 2019-10-12 2:43 UTC (permalink / raw)
To: david.hunt
Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, Ruifeng Wang, stable
The sanity test could spuriously fail by reporting a flush count error.
It was caused by a worker stat coherency issue between the distributor
and worker threads.
Fix this issue by using atomic operations to update the worker stats.
Fixes: c3eabff124e6 ("distributor: add unit tests")
Cc: stable@dpdk.org
Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
app/test/test_distributor.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c
index 7090b55f8..ba1f81cf8 100644
--- a/app/test/test_distributor.c
+++ b/app/test/test_distributor.c
@@ -70,12 +70,14 @@ handle_work(void *arg)
buf[i] = NULL;
num = rte_distributor_get_pkt(db, id, buf, buf, num);
while (!quit) {
- worker_stats[id].handled_packets += num;
+ __atomic_fetch_add(&worker_stats[id].handled_packets, num,
+ __ATOMIC_RELAXED);
count += num;
num = rte_distributor_get_pkt(db, id,
buf, buf, num);
}
- worker_stats[id].handled_packets += num;
+ __atomic_fetch_add(&worker_stats[id].handled_packets, num,
+ __ATOMIC_RELAXED);
count += num;
rte_distributor_return_pkt(db, id, buf, num);
return 0;
--
2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
@ 2019-10-13 2:31 ` Honnappa Nagarahalli
2019-10-14 10:00 ` Ruifeng Wang (Arm Technology China)
0 siblings, 1 reply; 23+ messages in thread
From: Honnappa Nagarahalli @ 2019-10-13 2:31 UTC (permalink / raw)
To: Ruifeng Wang (Arm Technology China), david.hunt
Cc: dev, hkalra, Gavin Hu (Arm Technology China),
nd, Ruifeng Wang (Arm Technology China),
stable, Honnappa Nagarahalli, nd
Hi Ruifeng,
Typically, we have followed the convention of adding comments whenever C11 atomic APIs are used. Can you please add comments indicating why acquire or release semantics are used?
> -----Original Message-----
> From: Ruifeng Wang <ruifeng.wang@arm.com>
> Sent: Friday, October 11, 2019 9:44 PM
> To: david.hunt@intel.com
> Cc: dev@dpdk.org; hkalra@marvell.com; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Ruifeng Wang (Arm
> Technology China) <Ruifeng.Wang@arm.com>; stable@dpdk.org
> Subject: [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64
>
> Distributor and worker threads rely on data structs in cache line for
> synchronization. The shared data structs were not protected.
> This caused deadlock issue on weaker memory ordering platforms as aarch64.
> Fix this issue by adding memory barriers to ensure synchronization among
> cores.
>
> Bugzilla ID: 342
> Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
> Cc: stable@dpdk.org
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> lib/librte_distributor/meson.build | 5 ++
> lib/librte_distributor/rte_distributor.c | 39 ++++++++++------
> lib/librte_distributor/rte_distributor_v20.c | 49 +++++++++++++-------
> 3 files changed, 63 insertions(+), 30 deletions(-)
>
> diff --git a/lib/librte_distributor/meson.build
> b/lib/librte_distributor/meson.build
> index dba7e3b2a..26577dbc1 100644
> --- a/lib/librte_distributor/meson.build
> +++ b/lib/librte_distributor/meson.build
> @@ -9,3 +9,8 @@ else
> endif
> headers = files('rte_distributor.h')
> deps += ['mbuf']
> +
> +# for clang 32-bit compiles we need libatomic for 64-bit atomic ops if
> +cc.get_id() == 'clang' and dpdk_conf.get('RTE_ARCH_64') == false
> + ext_deps += cc.find_library('atomic')
> +endif
> diff --git a/lib/librte_distributor/rte_distributor.c
> b/lib/librte_distributor/rte_distributor.c
> index 21eb1fb0a..b653146d0 100644
> --- a/lib/librte_distributor/rte_distributor.c
> +++ b/lib/librte_distributor/rte_distributor.c
> @@ -50,7 +50,8 @@ rte_distributor_request_pkt_v1705(struct
> rte_distributor *d,
>
> retptr64 = &(buf->retptr64[0]);
> /* Spin while handshake bits are set (scheduler clears it) */
> - while (unlikely(*retptr64 & RTE_DISTRIB_GET_BUF)) {
> + while (unlikely(__atomic_load_n(retptr64, __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF)) {
> rte_pause();
> uint64_t t = rte_rdtsc()+100;
>
> @@ -76,7 +77,8 @@ rte_distributor_request_pkt_v1705(struct
> rte_distributor *d,
> * Finally, set the GET_BUF to signal to distributor that cache
> * line is ready for processing
> */
> - *retptr64 |= RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(retptr64, *retptr64 | RTE_DISTRIB_GET_BUF,
> + __ATOMIC_RELEASE);
> }
> BIND_DEFAULT_SYMBOL(rte_distributor_request_pkt, _v1705, 17.05);
> MAP_STATIC_SYMBOL(void rte_distributor_request_pkt(struct rte_distributor
> *d, @@ -99,7 +101,8 @@ rte_distributor_poll_pkt_v1705(struct
> rte_distributor *d,
> }
>
> /* If bit is set, return */
> - if (buf->bufptr64[0] & RTE_DISTRIB_GET_BUF)
> + if (__atomic_load_n(&(buf->bufptr64[0]), __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF)
> return -1;
>
> /* since bufptr64 is signed, this should be an arithmetic shift */ @@ -
> 115,7 +118,8 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
> * mbuf pointers, so toggle the bit so scheduler can start working
> * on the next cacheline while we're working.
> */
> - buf->bufptr64[0] |= RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(&(buf->bufptr64[0]),
> + buf->bufptr64[0] | RTE_DISTRIB_GET_BUF,
> __ATOMIC_RELEASE);
>
> return count;
> }
> @@ -174,6 +178,7 @@ rte_distributor_return_pkt_v1705(struct
> rte_distributor *d,
> return -EINVAL;
> }
>
> + __atomic_thread_fence(__ATOMIC_ACQUIRE);
> for (i = 0; i < RTE_DIST_BURST_SIZE; i++)
> /* Switch off the return bit first */
> buf->retptr64[i] &= ~RTE_DISTRIB_RETURN_BUF; @@ -183,7
> +188,8 @@ rte_distributor_return_pkt_v1705(struct rte_distributor *d,
> RTE_DISTRIB_FLAG_BITS) |
> RTE_DISTRIB_RETURN_BUF;
>
> /* set the GET_BUF but even if we got no returns */
> - buf->retptr64[0] |= RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(&(buf->retptr64[0]),
> + buf->retptr64[0] | RTE_DISTRIB_GET_BUF,
> __ATOMIC_RELEASE);
>
> return 0;
> }
> @@ -273,7 +279,8 @@ handle_returns(struct rte_distributor *d, unsigned
> int wkr)
> unsigned int count = 0;
> unsigned int i;
>
> - if (buf->retptr64[0] & RTE_DISTRIB_GET_BUF) {
> + if (__atomic_load_n(&(buf->retptr64[0]), __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF) {
> for (i = 0; i < RTE_DIST_BURST_SIZE; i++) {
> if (buf->retptr64[i] & RTE_DISTRIB_RETURN_BUF) {
> oldbuf = ((uintptr_t)(buf->retptr64[i] >> @@
> -287,7 +294,7 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
> d->returns.start = ret_start;
> d->returns.count = ret_count;
> /* Clear for the worker to populate with more returns */
> - buf->retptr64[0] = 0;
> + __atomic_store_n(&(buf->retptr64[0]), 0,
> __ATOMIC_RELEASE);
> }
> return count;
> }
> @@ -307,7 +314,8 @@ release(struct rte_distributor *d, unsigned int wkr)
> struct rte_distributor_buffer *buf = &(d->bufs[wkr]);
> unsigned int i;
>
> - while (!(d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF))
> + while (!(__atomic_load_n(&(d->bufs[wkr].bufptr64[0]),
> __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF))
> rte_pause();
>
> handle_returns(d, wkr);
> @@ -328,7 +336,8 @@ release(struct rte_distributor *d, unsigned int wkr)
> d->backlog[wkr].count = 0;
>
> /* Clear the GET bit */
> - buf->bufptr64[0] &= ~RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(&(buf->bufptr64[0]),
> + buf->bufptr64[0] & ~RTE_DISTRIB_GET_BUF,
> __ATOMIC_RELEASE);
> return buf->count;
>
> }
> @@ -355,7 +364,8 @@ rte_distributor_process_v1705(struct rte_distributor
> *d,
> if (unlikely(num_mbufs == 0)) {
> /* Flush out all non-full cache-lines to workers. */
> for (wid = 0 ; wid < d->num_workers; wid++) {
> - if (d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF)
> {
> + if (__atomic_load_n(&(d->bufs[wid].bufptr64[0]),
> + __ATOMIC_ACQUIRE) &
> RTE_DISTRIB_GET_BUF) {
> release(d, wid);
> handle_returns(d, wid);
> }
> @@ -367,7 +377,8 @@ rte_distributor_process_v1705(struct rte_distributor
> *d,
> uint16_t matches[RTE_DIST_BURST_SIZE];
> unsigned int pkts;
>
> - if (d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF)
> + if (__atomic_load_n(&(d->bufs[wkr].bufptr64[0]),
> + __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF)
> d->bufs[wkr].count = 0;
>
> if ((num_mbufs - next_idx) < RTE_DIST_BURST_SIZE) @@ -
> 465,7 +476,8 @@ rte_distributor_process_v1705(struct rte_distributor *d,
>
> /* Flush out all non-full cache-lines to workers. */
> for (wid = 0 ; wid < d->num_workers; wid++)
> - if ((d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF))
> + if ((__atomic_load_n(&(d->bufs[wid].bufptr64[0]),
> + __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF))
> release(d, wid);
>
> return num_mbufs;
> @@ -574,7 +586,8 @@ rte_distributor_clear_returns_v1705(struct
> rte_distributor *d)
>
> /* throw away returns, so workers can exit */
> for (wkr = 0; wkr < d->num_workers; wkr++)
> - d->bufs[wkr].retptr64[0] = 0;
> + __atomic_store_n(&(d->bufs[wkr].retptr64[0]), 0,
> + __ATOMIC_RELEASE);
> }
> BIND_DEFAULT_SYMBOL(rte_distributor_clear_returns, _v1705, 17.05);
> MAP_STATIC_SYMBOL(void rte_distributor_clear_returns(struct
> rte_distributor *d), diff --git a/lib/librte_distributor/rte_distributor_v20.c
> b/lib/librte_distributor/rte_distributor_v20.c
> index cdc0969a8..41411e3c1 100644
> --- a/lib/librte_distributor/rte_distributor_v20.c
> +++ b/lib/librte_distributor/rte_distributor_v20.c
> @@ -34,9 +34,10 @@ rte_distributor_request_pkt_v20(struct
> rte_distributor_v20 *d,
> union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
> | RTE_DISTRIB_GET_BUF;
> - while (unlikely(buf->bufptr64 & RTE_DISTRIB_FLAGS_MASK))
> + while (unlikely(__atomic_load_n(&(buf->bufptr64),
> __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_FLAGS_MASK))
> rte_pause();
> - buf->bufptr64 = req;
> + __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
> }
> VERSION_SYMBOL(rte_distributor_request_pkt, _v20, 2.0);
>
> @@ -45,7 +46,8 @@ rte_distributor_poll_pkt_v20(struct rte_distributor_v20
> *d,
> unsigned worker_id)
> {
> union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> - if (buf->bufptr64 & RTE_DISTRIB_GET_BUF)
> + if (__atomic_load_n(&(buf->bufptr64), __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF)
> return NULL;
>
> /* since bufptr64 is signed, this should be an arithmetic shift */ @@ -
> 73,7 +75,7 @@ rte_distributor_return_pkt_v20(struct rte_distributor_v20 *d,
> union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> uint64_t req = (((int64_t)(uintptr_t)oldpkt) <<
> RTE_DISTRIB_FLAG_BITS)
> | RTE_DISTRIB_RETURN_BUF;
> - buf->bufptr64 = req;
> + __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
> return 0;
> }
> VERSION_SYMBOL(rte_distributor_return_pkt, _v20, 2.0); @@ -117,7 +119,7
> @@ handle_worker_shutdown(struct rte_distributor_v20 *d, unsigned int
> wkr) {
> d->in_flight_tags[wkr] = 0;
> d->in_flight_bitmask &= ~(1UL << wkr);
> - d->bufs[wkr].bufptr64 = 0;
> + __atomic_store_n(&(d->bufs[wkr].bufptr64), 0, __ATOMIC_RELEASE);
> if (unlikely(d->backlog[wkr].count != 0)) {
> /* On return of a packet, we need to move the
> * queued packets for this core elsewhere.
> @@ -165,18 +167,23 @@ process_returns(struct rte_distributor_v20 *d)
> const int64_t data = d->bufs[wkr].bufptr64;
> uintptr_t oldbuf = 0;
>
> - if (data & RTE_DISTRIB_GET_BUF) {
> + if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF) {
> flushed++;
> if (d->backlog[wkr].count)
> - d->bufs[wkr].bufptr64 =
> - backlog_pop(&d-
> >backlog[wkr]);
> + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> + backlog_pop(&d->backlog[wkr]),
> + __ATOMIC_RELEASE);
> else {
> - d->bufs[wkr].bufptr64 =
> RTE_DISTRIB_GET_BUF;
> + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> + RTE_DISTRIB_GET_BUF,
> + __ATOMIC_RELEASE);
> d->in_flight_tags[wkr] = 0;
> d->in_flight_bitmask &= ~(1UL << wkr);
> }
> oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
> - } else if (data & RTE_DISTRIB_RETURN_BUF) {
> + } else if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_RETURN_BUF) {
> handle_worker_shutdown(d, wkr);
> oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
> }
> @@ -251,21 +258,26 @@ rte_distributor_process_v20(struct
> rte_distributor_v20 *d,
> }
> }
>
> - if ((data & RTE_DISTRIB_GET_BUF) &&
> + if ((__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_GET_BUF) &&
> (d->backlog[wkr].count || next_mb)) {
>
> if (d->backlog[wkr].count)
> - d->bufs[wkr].bufptr64 =
> - backlog_pop(&d-
> >backlog[wkr]);
> + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> + backlog_pop(&d-
> >backlog[wkr]),
> + __ATOMIC_RELEASE);
>
> else {
> - d->bufs[wkr].bufptr64 = next_value;
> + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> + next_value,
> + __ATOMIC_RELEASE);
> d->in_flight_tags[wkr] = new_tag;
> d->in_flight_bitmask |= (1UL << wkr);
> next_mb = NULL;
> }
> oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
> - } else if (data & RTE_DISTRIB_RETURN_BUF) {
> + } else if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> + & RTE_DISTRIB_RETURN_BUF) {
> handle_worker_shutdown(d, wkr);
> oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
> }
> @@ -280,13 +292,16 @@ rte_distributor_process_v20(struct
> rte_distributor_v20 *d,
> * if they are ready */
> for (wkr = 0; wkr < d->num_workers; wkr++)
> if (d->backlog[wkr].count &&
> - (d->bufs[wkr].bufptr64 &
> RTE_DISTRIB_GET_BUF)) {
> + (__atomic_load_n(&(d->bufs[wkr].bufptr64),
> + __ATOMIC_ACQUIRE) &
> RTE_DISTRIB_GET_BUF)) {
>
> int64_t oldbuf = d->bufs[wkr].bufptr64 >>
> RTE_DISTRIB_FLAG_BITS;
> store_return(oldbuf, d, &ret_start, &ret_count);
>
> - d->bufs[wkr].bufptr64 = backlog_pop(&d-
> >backlog[wkr]);
> + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> + backlog_pop(&d->backlog[wkr]),
> + __ATOMIC_RELEASE);
> }
>
> d->returns.start = ret_start;
> --
> 2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64
2019-10-13 2:31 ` Honnappa Nagarahalli
@ 2019-10-14 10:00 ` Ruifeng Wang (Arm Technology China)
0 siblings, 0 replies; 23+ messages in thread
From: Ruifeng Wang (Arm Technology China) @ 2019-10-14 10:00 UTC (permalink / raw)
To: Honnappa Nagarahalli, david.hunt
Cc: dev, hkalra, Gavin Hu (Arm Technology China), nd, stable, nd, nd
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Sunday, October 13, 2019 10:32
> To: Ruifeng Wang (Arm Technology China) <Ruifeng.Wang@arm.com>;
> david.hunt@intel.com
> Cc: dev@dpdk.org; hkalra@marvell.com; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>; Ruifeng Wang (Arm Technology
> China) <Ruifeng.Wang@arm.com>; stable@dpdk.org; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64
>
> Hi Ruifeng,
> Typically, we have followed the convention of adding comments
> whenever C11 atomic APIs are used. Can you please add comments
> indicating why acquire or release semantics are used?
>
OK. Comments will be added to explain acquire/release semantics used.
> > -----Original Message-----
> > From: Ruifeng Wang <ruifeng.wang@arm.com>
> > Sent: Friday, October 11, 2019 9:44 PM
> > To: david.hunt@intel.com
> > Cc: dev@dpdk.org; hkalra@marvell.com; Gavin Hu (Arm Technology China)
> > <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Ruifeng Wang
> (Arm
> > Technology China) <Ruifeng.Wang@arm.com>; stable@dpdk.org
> > Subject: [PATCH v2 1/2] lib/distributor: fix deadlock issue for
> > aarch64
> >
> > Distributor and worker threads rely on data structs in cache line for
> > synchronization. The shared data structs were not protected.
> > This caused deadlock issue on weaker memory ordering platforms as
> aarch64.
> > Fix this issue by adding memory barriers to ensure synchronization
> > among cores.
> >
> > Bugzilla ID: 342
> > Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > ---
> > lib/librte_distributor/meson.build | 5 ++
> > lib/librte_distributor/rte_distributor.c | 39 ++++++++++------
> > lib/librte_distributor/rte_distributor_v20.c | 49
> > +++++++++++++-------
> > 3 files changed, 63 insertions(+), 30 deletions(-)
> >
> > diff --git a/lib/librte_distributor/meson.build
> > b/lib/librte_distributor/meson.build
> > index dba7e3b2a..26577dbc1 100644
> > --- a/lib/librte_distributor/meson.build
> > +++ b/lib/librte_distributor/meson.build
> > @@ -9,3 +9,8 @@ else
> > endif
> > headers = files('rte_distributor.h')
> > deps += ['mbuf']
> > +
> > +# for clang 32-bit compiles we need libatomic for 64-bit atomic ops
> > +if
> > +cc.get_id() == 'clang' and dpdk_conf.get('RTE_ARCH_64') == false
> > + ext_deps += cc.find_library('atomic') endif
> > diff --git a/lib/librte_distributor/rte_distributor.c
> > b/lib/librte_distributor/rte_distributor.c
> > index 21eb1fb0a..b653146d0 100644
> > --- a/lib/librte_distributor/rte_distributor.c
> > +++ b/lib/librte_distributor/rte_distributor.c
> > @@ -50,7 +50,8 @@ rte_distributor_request_pkt_v1705(struct
> > rte_distributor *d,
> >
> > retptr64 = &(buf->retptr64[0]);
> > /* Spin while handshake bits are set (scheduler clears it) */
> > - while (unlikely(*retptr64 & RTE_DISTRIB_GET_BUF)) {
> > + while (unlikely(__atomic_load_n(retptr64, __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_GET_BUF)) {
> > rte_pause();
> > uint64_t t = rte_rdtsc()+100;
> >
> > @@ -76,7 +77,8 @@ rte_distributor_request_pkt_v1705(struct
> > rte_distributor *d,
> > * Finally, set the GET_BUF to signal to distributor that cache
> > * line is ready for processing
> > */
> > - *retptr64 |= RTE_DISTRIB_GET_BUF;
> > + __atomic_store_n(retptr64, *retptr64 | RTE_DISTRIB_GET_BUF,
> > + __ATOMIC_RELEASE);
> > }
> > BIND_DEFAULT_SYMBOL(rte_distributor_request_pkt, _v1705, 17.05);
> > MAP_STATIC_SYMBOL(void rte_distributor_request_pkt(struct
> > rte_distributor *d, @@ -99,7 +101,8 @@
> > rte_distributor_poll_pkt_v1705(struct
> > rte_distributor *d,
> > }
> >
> > /* If bit is set, return */
> > - if (buf->bufptr64[0] & RTE_DISTRIB_GET_BUF)
> > + if (__atomic_load_n(&(buf->bufptr64[0]), __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_GET_BUF)
> > return -1;
> >
> > /* since bufptr64 is signed, this should be an arithmetic shift */
> > @@ -
> > 115,7 +118,8 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
> > * mbuf pointers, so toggle the bit so scheduler can start working
> > * on the next cacheline while we're working.
> > */
> > - buf->bufptr64[0] |= RTE_DISTRIB_GET_BUF;
> > + __atomic_store_n(&(buf->bufptr64[0]),
> > + buf->bufptr64[0] | RTE_DISTRIB_GET_BUF,
> > __ATOMIC_RELEASE);
> >
> > return count;
> > }
> > @@ -174,6 +178,7 @@ rte_distributor_return_pkt_v1705(struct
> > rte_distributor *d,
> > return -EINVAL;
> > }
> >
> > + __atomic_thread_fence(__ATOMIC_ACQUIRE);
> > for (i = 0; i < RTE_DIST_BURST_SIZE; i++)
> > /* Switch off the return bit first */
> > buf->retptr64[i] &= ~RTE_DISTRIB_RETURN_BUF; @@ -183,7
> > +188,8 @@ rte_distributor_return_pkt_v1705(struct rte_distributor *d,
> > RTE_DISTRIB_FLAG_BITS) |
> > RTE_DISTRIB_RETURN_BUF;
> >
> > /* set the GET_BUF but even if we got no returns */
> > - buf->retptr64[0] |= RTE_DISTRIB_GET_BUF;
> > + __atomic_store_n(&(buf->retptr64[0]),
> > + buf->retptr64[0] | RTE_DISTRIB_GET_BUF,
> > __ATOMIC_RELEASE);
> >
> > return 0;
> > }
> > @@ -273,7 +279,8 @@ handle_returns(struct rte_distributor *d, unsigned
> > int wkr)
> > unsigned int count = 0;
> > unsigned int i;
> >
> > - if (buf->retptr64[0] & RTE_DISTRIB_GET_BUF) {
> > + if (__atomic_load_n(&(buf->retptr64[0]), __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_GET_BUF) {
> > for (i = 0; i < RTE_DIST_BURST_SIZE; i++) {
> > if (buf->retptr64[i] & RTE_DISTRIB_RETURN_BUF) {
> > oldbuf = ((uintptr_t)(buf->retptr64[i] >> @@
> > -287,7 +294,7 @@ handle_returns(struct rte_distributor *d, unsigned int
> wkr)
> > d->returns.start = ret_start;
> > d->returns.count = ret_count;
> > /* Clear for the worker to populate with more returns */
> > - buf->retptr64[0] = 0;
> > + __atomic_store_n(&(buf->retptr64[0]), 0,
> > __ATOMIC_RELEASE);
> > }
> > return count;
> > }
> > @@ -307,7 +314,8 @@ release(struct rte_distributor *d, unsigned int wkr)
> > struct rte_distributor_buffer *buf = &(d->bufs[wkr]);
> > unsigned int i;
> >
> > - while (!(d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF))
> > + while (!(__atomic_load_n(&(d->bufs[wkr].bufptr64[0]),
> > __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_GET_BUF))
> > rte_pause();
> >
> > handle_returns(d, wkr);
> > @@ -328,7 +336,8 @@ release(struct rte_distributor *d, unsigned int wkr)
> > d->backlog[wkr].count = 0;
> >
> > /* Clear the GET bit */
> > - buf->bufptr64[0] &= ~RTE_DISTRIB_GET_BUF;
> > + __atomic_store_n(&(buf->bufptr64[0]),
> > + buf->bufptr64[0] & ~RTE_DISTRIB_GET_BUF,
> > __ATOMIC_RELEASE);
> > return buf->count;
> >
> > }
> > @@ -355,7 +364,8 @@ rte_distributor_process_v1705(struct
> > rte_distributor *d,
> > if (unlikely(num_mbufs == 0)) {
> > /* Flush out all non-full cache-lines to workers. */
> > for (wid = 0 ; wid < d->num_workers; wid++) {
> > - if (d->bufs[wid].bufptr64[0] &
> RTE_DISTRIB_GET_BUF)
> > {
> > + if (__atomic_load_n(&(d->bufs[wid].bufptr64[0]),
> > + __ATOMIC_ACQUIRE) &
> > RTE_DISTRIB_GET_BUF) {
> > release(d, wid);
> > handle_returns(d, wid);
> > }
> > @@ -367,7 +377,8 @@ rte_distributor_process_v1705(struct
> > rte_distributor *d,
> > uint16_t matches[RTE_DIST_BURST_SIZE];
> > unsigned int pkts;
> >
> > - if (d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF)
> > + if (__atomic_load_n(&(d->bufs[wkr].bufptr64[0]),
> > + __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF)
> > d->bufs[wkr].count = 0;
> >
> > if ((num_mbufs - next_idx) < RTE_DIST_BURST_SIZE) @@ -
> > 465,7 +476,8 @@ rte_distributor_process_v1705(struct rte_distributor
> > *d,
> >
> > /* Flush out all non-full cache-lines to workers. */
> > for (wid = 0 ; wid < d->num_workers; wid++)
> > - if ((d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF))
> > + if ((__atomic_load_n(&(d->bufs[wid].bufptr64[0]),
> > + __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF))
> > release(d, wid);
> >
> > return num_mbufs;
> > @@ -574,7 +586,8 @@ rte_distributor_clear_returns_v1705(struct
> > rte_distributor *d)
> >
> > /* throw away returns, so workers can exit */
> > for (wkr = 0; wkr < d->num_workers; wkr++)
> > - d->bufs[wkr].retptr64[0] = 0;
> > + __atomic_store_n(&(d->bufs[wkr].retptr64[0]), 0,
> > + __ATOMIC_RELEASE);
> > }
> > BIND_DEFAULT_SYMBOL(rte_distributor_clear_returns, _v1705, 17.05);
> > MAP_STATIC_SYMBOL(void rte_distributor_clear_returns(struct
> > rte_distributor *d), diff --git
> > a/lib/librte_distributor/rte_distributor_v20.c
> > b/lib/librte_distributor/rte_distributor_v20.c
> > index cdc0969a8..41411e3c1 100644
> > --- a/lib/librte_distributor/rte_distributor_v20.c
> > +++ b/lib/librte_distributor/rte_distributor_v20.c
> > @@ -34,9 +34,10 @@ rte_distributor_request_pkt_v20(struct
> > rte_distributor_v20 *d,
> > union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> > int64_t req = (((int64_t)(uintptr_t)oldpkt) <<
> RTE_DISTRIB_FLAG_BITS)
> > | RTE_DISTRIB_GET_BUF;
> > - while (unlikely(buf->bufptr64 & RTE_DISTRIB_FLAGS_MASK))
> > + while (unlikely(__atomic_load_n(&(buf->bufptr64),
> > __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_FLAGS_MASK))
> > rte_pause();
> > - buf->bufptr64 = req;
> > + __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
> > }
> > VERSION_SYMBOL(rte_distributor_request_pkt, _v20, 2.0);
> >
> > @@ -45,7 +46,8 @@ rte_distributor_poll_pkt_v20(struct
> > rte_distributor_v20 *d,
> > unsigned worker_id)
> > {
> > union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> > - if (buf->bufptr64 & RTE_DISTRIB_GET_BUF)
> > + if (__atomic_load_n(&(buf->bufptr64), __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_GET_BUF)
> > return NULL;
> >
> > /* since bufptr64 is signed, this should be an arithmetic shift */
> > @@ -
> > 73,7 +75,7 @@ rte_distributor_return_pkt_v20(struct rte_distributor_v20
> *d,
> > union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
> > uint64_t req = (((int64_t)(uintptr_t)oldpkt) <<
> > RTE_DISTRIB_FLAG_BITS)
> > | RTE_DISTRIB_RETURN_BUF;
> > - buf->bufptr64 = req;
> > + __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
> > return 0;
> > }
> > VERSION_SYMBOL(rte_distributor_return_pkt, _v20, 2.0); @@ -117,7
> > +119,7 @@ handle_worker_shutdown(struct rte_distributor_v20 *d,
> > unsigned int
> > wkr) {
> > d->in_flight_tags[wkr] = 0;
> > d->in_flight_bitmask &= ~(1UL << wkr);
> > - d->bufs[wkr].bufptr64 = 0;
> > + __atomic_store_n(&(d->bufs[wkr].bufptr64), 0,
> __ATOMIC_RELEASE);
> > if (unlikely(d->backlog[wkr].count != 0)) {
> > /* On return of a packet, we need to move the
> > * queued packets for this core elsewhere.
> > @@ -165,18 +167,23 @@ process_returns(struct rte_distributor_v20 *d)
> > const int64_t data = d->bufs[wkr].bufptr64;
> > uintptr_t oldbuf = 0;
> >
> > - if (data & RTE_DISTRIB_GET_BUF) {
> > + if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_GET_BUF) {
> > flushed++;
> > if (d->backlog[wkr].count)
> > - d->bufs[wkr].bufptr64 =
> > - backlog_pop(&d-
> > >backlog[wkr]);
> > + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> > + backlog_pop(&d->backlog[wkr]),
> > + __ATOMIC_RELEASE);
> > else {
> > - d->bufs[wkr].bufptr64 =
> > RTE_DISTRIB_GET_BUF;
> > + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> > + RTE_DISTRIB_GET_BUF,
> > + __ATOMIC_RELEASE);
> > d->in_flight_tags[wkr] = 0;
> > d->in_flight_bitmask &= ~(1UL << wkr);
> > }
> > oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
> > - } else if (data & RTE_DISTRIB_RETURN_BUF) {
> > + } else if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_RETURN_BUF) {
> > handle_worker_shutdown(d, wkr);
> > oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
> > }
> > @@ -251,21 +258,26 @@ rte_distributor_process_v20(struct
> > rte_distributor_v20 *d,
> > }
> > }
> >
> > - if ((data & RTE_DISTRIB_GET_BUF) &&
> > + if ((__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_GET_BUF) &&
> > (d->backlog[wkr].count || next_mb)) {
> >
> > if (d->backlog[wkr].count)
> > - d->bufs[wkr].bufptr64 =
> > - backlog_pop(&d-
> > >backlog[wkr]);
> > + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> > + backlog_pop(&d-
> > >backlog[wkr]),
> > + __ATOMIC_RELEASE);
> >
> > else {
> > - d->bufs[wkr].bufptr64 = next_value;
> > + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> > + next_value,
> > + __ATOMIC_RELEASE);
> > d->in_flight_tags[wkr] = new_tag;
> > d->in_flight_bitmask |= (1UL << wkr);
> > next_mb = NULL;
> > }
> > oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
> > - } else if (data & RTE_DISTRIB_RETURN_BUF) {
> > + } else if (__atomic_load_n(&data, __ATOMIC_ACQUIRE)
> > + & RTE_DISTRIB_RETURN_BUF) {
> > handle_worker_shutdown(d, wkr);
> > oldbuf = data >> RTE_DISTRIB_FLAG_BITS;
> > }
> > @@ -280,13 +292,16 @@ rte_distributor_process_v20(struct
> > rte_distributor_v20 *d,
> > * if they are ready */
> > for (wkr = 0; wkr < d->num_workers; wkr++)
> > if (d->backlog[wkr].count &&
> > - (d->bufs[wkr].bufptr64 &
> > RTE_DISTRIB_GET_BUF)) {
> > + (__atomic_load_n(&(d->bufs[wkr].bufptr64),
> > + __ATOMIC_ACQUIRE) &
> > RTE_DISTRIB_GET_BUF)) {
> >
> > int64_t oldbuf = d->bufs[wkr].bufptr64 >>
> > RTE_DISTRIB_FLAG_BITS;
> > store_return(oldbuf, d, &ret_start, &ret_count);
> >
> > - d->bufs[wkr].bufptr64 = backlog_pop(&d-
> > >backlog[wkr]);
> > + __atomic_store_n(&(d->bufs[wkr].bufptr64),
> > + backlog_pop(&d->backlog[wkr]),
> > + __ATOMIC_RELEASE);
> > }
> >
> > d->returns.start = ret_start;
> > --
> > 2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* [dpdk-dev] [PATCH v3 0/2] fix distributor unit test
2019-10-08 9:55 [dpdk-dev] [PATCH] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
` (2 preceding siblings ...)
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 0/2] fix distributor unit test Ruifeng Wang
@ 2019-10-15 9:28 ` Ruifeng Wang
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
` (3 more replies)
3 siblings, 4 replies; 23+ messages in thread
From: Ruifeng Wang @ 2019-10-15 9:28 UTC (permalink / raw)
To: david.hunt; +Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, Ruifeng Wang
Bug 342 reported distributor_autotest execution suspension
on the aarch64 platform.
The issue was due to a lack of synchronization among threads. The distributor
thread and a worker thread may get deadlocked.
Fixed the synchronization issue by adding barriers.
Another issue identified was in the test case. A non-atomic operation on
the stat value could cause a value reset not to be observed by the worker
thread and mess up the counters. The issue was fixed by using atomic operations.
---
v3:
Added comments for using of C11 acquire/release semantics. (Honnappa)
v2:
Fixed intermittent packet count incorrect failure. (Aaron, David)
Fixed Clang build on 32bit systems.
Additional patch to fix non-atomic operation in unit test.
Ruifeng Wang (2):
lib/distributor: fix deadlock issue for aarch64
test/distributor: fix false unit test failure
app/test/test_distributor.c | 6 +-
lib/librte_distributor/meson.build | 5 ++
lib/librte_distributor/rte_distributor.c | 68 ++++++++++++++------
lib/librte_distributor/rte_distributor_v20.c | 59 ++++++++++++-----
4 files changed, 101 insertions(+), 37 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* [dpdk-dev] [PATCH v3 1/2] lib/distributor: fix deadlock issue for aarch64
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test Ruifeng Wang
@ 2019-10-15 9:28 ` Ruifeng Wang
2019-10-25 8:13 ` Hunt, David
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 2/2] test/distributor: fix false unit test failure Ruifeng Wang
` (2 subsequent siblings)
3 siblings, 1 reply; 23+ messages in thread
From: Ruifeng Wang @ 2019-10-15 9:28 UTC (permalink / raw)
To: david.hunt
Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, Ruifeng Wang, stable
Distributor and worker threads rely on data structs in a cache line
for synchronization. The shared data structs were not protected.
This caused a deadlock issue on weaker memory ordering platforms such as
aarch64.
Fix this issue by adding memory barriers to ensure synchronization
among cores.
Bugzilla ID: 342
Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
Cc: stable@dpdk.org
Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
lib/librte_distributor/meson.build | 5 ++
lib/librte_distributor/rte_distributor.c | 68 ++++++++++++++------
lib/librte_distributor/rte_distributor_v20.c | 59 ++++++++++++-----
3 files changed, 97 insertions(+), 35 deletions(-)
diff --git a/lib/librte_distributor/meson.build b/lib/librte_distributor/meson.build
index dba7e3b2a..26577dbc1 100644
--- a/lib/librte_distributor/meson.build
+++ b/lib/librte_distributor/meson.build
@@ -9,3 +9,8 @@ else
endif
headers = files('rte_distributor.h')
deps += ['mbuf']
+
+# for clang 32-bit compiles we need libatomic for 64-bit atomic ops
+if cc.get_id() == 'clang' and dpdk_conf.get('RTE_ARCH_64') == false
+ ext_deps += cc.find_library('atomic')
+endif
diff --git a/lib/librte_distributor/rte_distributor.c b/lib/librte_distributor/rte_distributor.c
index 21eb1fb0a..0a03625c9 100644
--- a/lib/librte_distributor/rte_distributor.c
+++ b/lib/librte_distributor/rte_distributor.c
@@ -49,8 +49,11 @@ rte_distributor_request_pkt_v1705(struct rte_distributor *d,
}
retptr64 = &(buf->retptr64[0]);
- /* Spin while handshake bits are set (scheduler clears it) */
- while (unlikely(*retptr64 & RTE_DISTRIB_GET_BUF)) {
+ /* Spin while handshake bits are set (scheduler clears it).
+ * Sync with worker on GET_BUF flag.
+ */
+ while (unlikely(__atomic_load_n(retptr64, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)) {
rte_pause();
uint64_t t = rte_rdtsc()+100;
@@ -75,8 +78,10 @@ rte_distributor_request_pkt_v1705(struct rte_distributor *d,
/*
* Finally, set the GET_BUF to signal to distributor that cache
* line is ready for processing
+ * Sync with distributor to release retptrs
*/
- *retptr64 |= RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(retptr64, *retptr64 | RTE_DISTRIB_GET_BUF,
+ __ATOMIC_RELEASE);
}
BIND_DEFAULT_SYMBOL(rte_distributor_request_pkt, _v1705, 17.05);
MAP_STATIC_SYMBOL(void rte_distributor_request_pkt(struct rte_distributor *d,
@@ -98,8 +103,11 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
return (pkts[0]) ? 1 : 0;
}
- /* If bit is set, return */
- if (buf->bufptr64[0] & RTE_DISTRIB_GET_BUF)
+ /* If bit is set, return
+ * Sync with distributor to acquire bufptrs
+ */
+ if (__atomic_load_n(&(buf->bufptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)
return -1;
/* since bufptr64 is signed, this should be an arithmetic shift */
@@ -114,8 +122,10 @@ rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
* so now we've got the contents of the cacheline into an array of
* mbuf pointers, so toggle the bit so scheduler can start working
* on the next cacheline while we're working.
+ * Sync with distributor on GET_BUF flag. Release bufptrs.
*/
- buf->bufptr64[0] |= RTE_DISTRIB_GET_BUF;
+ __atomic_store_n(&(buf->bufptr64[0]),
+ buf->bufptr64[0] | RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return count;
}
@@ -174,6 +184,8 @@ rte_distributor_return_pkt_v1705(struct rte_distributor *d,
return -EINVAL;
}
+ /* Sync with distributor to acquire retptrs */
+ __atomic_thread_fence(__ATOMIC_ACQUIRE);
for (i = 0; i < RTE_DIST_BURST_SIZE; i++)
/* Switch off the return bit first */
buf->retptr64[i] &= ~RTE_DISTRIB_RETURN_BUF;
@@ -182,8 +194,11 @@ rte_distributor_return_pkt_v1705(struct rte_distributor *d,
buf->retptr64[i] = (((int64_t)(uintptr_t)oldpkt[i]) <<
RTE_DISTRIB_FLAG_BITS) | RTE_DISTRIB_RETURN_BUF;
- /* set the GET_BUF but even if we got no returns */
- buf->retptr64[0] |= RTE_DISTRIB_GET_BUF;
+ /* set the GET_BUF but even if we got no returns.
+ * Sync with distributor on GET_BUF flag. Release retptrs.
+ */
+ __atomic_store_n(&(buf->retptr64[0]),
+ buf->retptr64[0] | RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return 0;
}
@@ -273,7 +288,9 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
unsigned int count = 0;
unsigned int i;
- if (buf->retptr64[0] & RTE_DISTRIB_GET_BUF) {
+ /* Sync on GET_BUF flag. Acquire retptrs. */
+ if (__atomic_load_n(&(buf->retptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF) {
for (i = 0; i < RTE_DIST_BURST_SIZE; i++) {
if (buf->retptr64[i] & RTE_DISTRIB_RETURN_BUF) {
oldbuf = ((uintptr_t)(buf->retptr64[i] >>
@@ -286,8 +303,10 @@ handle_returns(struct rte_distributor *d, unsigned int wkr)
}
d->returns.start = ret_start;
d->returns.count = ret_count;
- /* Clear for the worker to populate with more returns */
- buf->retptr64[0] = 0;
+ /* Clear for the worker to populate with more returns.
+ * Sync with distributor on GET_BUF flag. Release retptrs.
+ */
+ __atomic_store_n(&(buf->retptr64[0]), 0, __ATOMIC_RELEASE);
}
return count;
}
@@ -307,7 +326,9 @@ release(struct rte_distributor *d, unsigned int wkr)
struct rte_distributor_buffer *buf = &(d->bufs[wkr]);
unsigned int i;
- while (!(d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF))
+ /* Sync with worker on GET_BUF flag */
+ while (!(__atomic_load_n(&(d->bufs[wkr].bufptr64[0]), __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF))
rte_pause();
handle_returns(d, wkr);
@@ -327,8 +348,11 @@ release(struct rte_distributor *d, unsigned int wkr)
d->backlog[wkr].count = 0;
- /* Clear the GET bit */
- buf->bufptr64[0] &= ~RTE_DISTRIB_GET_BUF;
+ /* Clear the GET bit.
+ * Sync with worker on GET_BUF flag. Release bufptrs.
+ */
+ __atomic_store_n(&(buf->bufptr64[0]),
+ buf->bufptr64[0] & ~RTE_DISTRIB_GET_BUF, __ATOMIC_RELEASE);
return buf->count;
}
@@ -355,7 +379,9 @@ rte_distributor_process_v1705(struct rte_distributor *d,
if (unlikely(num_mbufs == 0)) {
/* Flush out all non-full cache-lines to workers. */
for (wid = 0 ; wid < d->num_workers; wid++) {
- if (d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF) {
+ /* Sync with worker on GET_BUF flag. */
+ if (__atomic_load_n(&(d->bufs[wid].bufptr64[0]),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF) {
release(d, wid);
handle_returns(d, wid);
}
@@ -367,7 +393,9 @@ rte_distributor_process_v1705(struct rte_distributor *d,
uint16_t matches[RTE_DIST_BURST_SIZE];
unsigned int pkts;
- if (d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF)
+ /* Sync with worker on GET_BUF flag. */
+ if (__atomic_load_n(&(d->bufs[wkr].bufptr64[0]),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF)
d->bufs[wkr].count = 0;
if ((num_mbufs - next_idx) < RTE_DIST_BURST_SIZE)
@@ -465,7 +493,9 @@ rte_distributor_process_v1705(struct rte_distributor *d,
/* Flush out all non-full cache-lines to workers. */
for (wid = 0 ; wid < d->num_workers; wid++)
- if ((d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF))
+ /* Sync with worker on GET_BUF flag. */
+ if ((__atomic_load_n(&(d->bufs[wid].bufptr64[0]),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF))
release(d, wid);
return num_mbufs;
@@ -574,7 +604,9 @@ rte_distributor_clear_returns_v1705(struct rte_distributor *d)
/* throw away returns, so workers can exit */
for (wkr = 0; wkr < d->num_workers; wkr++)
- d->bufs[wkr].retptr64[0] = 0;
+ /* Sync with worker. Release retptrs. */
+ __atomic_store_n(&(d->bufs[wkr].retptr64[0]), 0,
+ __ATOMIC_RELEASE);
}
BIND_DEFAULT_SYMBOL(rte_distributor_clear_returns, _v1705, 17.05);
MAP_STATIC_SYMBOL(void rte_distributor_clear_returns(struct rte_distributor *d),
diff --git a/lib/librte_distributor/rte_distributor_v20.c b/lib/librte_distributor/rte_distributor_v20.c
index cdc0969a8..ef6d5cb4b 100644
--- a/lib/librte_distributor/rte_distributor_v20.c
+++ b/lib/librte_distributor/rte_distributor_v20.c
@@ -34,9 +34,12 @@ rte_distributor_request_pkt_v20(struct rte_distributor_v20 *d,
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
| RTE_DISTRIB_GET_BUF;
- while (unlikely(buf->bufptr64 & RTE_DISTRIB_FLAGS_MASK))
+ while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
+ & RTE_DISTRIB_FLAGS_MASK))
rte_pause();
- buf->bufptr64 = req;
+
+ /* Sync with distributor on GET_BUF flag. */
+ __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
}
VERSION_SYMBOL(rte_distributor_request_pkt, _v20, 2.0);
@@ -45,7 +48,9 @@ rte_distributor_poll_pkt_v20(struct rte_distributor_v20 *d,
unsigned worker_id)
{
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
- if (buf->bufptr64 & RTE_DISTRIB_GET_BUF)
+ /* Sync with distributor. Acquire bufptr64. */
+ if (__atomic_load_n(&buf->bufptr64, __ATOMIC_ACQUIRE)
+ & RTE_DISTRIB_GET_BUF)
return NULL;
/* since bufptr64 is signed, this should be an arithmetic shift */
@@ -73,7 +78,8 @@ rte_distributor_return_pkt_v20(struct rte_distributor_v20 *d,
union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id];
uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
| RTE_DISTRIB_RETURN_BUF;
- buf->bufptr64 = req;
+ /* Sync with distributor on RETURN_BUF flag. */
+ __atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
return 0;
}
VERSION_SYMBOL(rte_distributor_return_pkt, _v20, 2.0);
@@ -117,7 +123,8 @@ handle_worker_shutdown(struct rte_distributor_v20 *d, unsigned int wkr)
{
d->in_flight_tags[wkr] = 0;
d->in_flight_bitmask &= ~(1UL << wkr);
- d->bufs[wkr].bufptr64 = 0;
+ /* Sync with worker. Release bufptr64. */
+ __atomic_store_n(&(d->bufs[wkr].bufptr64), 0, __ATOMIC_RELEASE);
if (unlikely(d->backlog[wkr].count != 0)) {
/* On return of a packet, we need to move the
* queued packets for this core elsewhere.
@@ -161,17 +168,23 @@ process_returns(struct rte_distributor_v20 *d)
ret_count = d->returns.count;
for (wkr = 0; wkr < d->num_workers; wkr++) {
-
- const int64_t data = d->bufs[wkr].bufptr64;
uintptr_t oldbuf = 0;
+ /* Sync with worker. Acquire bufptr64. */
+ const int64_t data = __atomic_load_n(&(d->bufs[wkr].bufptr64),
+ __ATOMIC_ACQUIRE);
if (data & RTE_DISTRIB_GET_BUF) {
flushed++;
if (d->backlog[wkr].count)
- d->bufs[wkr].bufptr64 =
- backlog_pop(&d->backlog[wkr]);
+ /* Sync with worker. Release bufptr64. */
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ backlog_pop(&d->backlog[wkr]),
+ __ATOMIC_RELEASE);
else {
- d->bufs[wkr].bufptr64 = RTE_DISTRIB_GET_BUF;
+ /* Sync with worker on GET_BUF flag. */
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ RTE_DISTRIB_GET_BUF,
+ __ATOMIC_RELEASE);
d->in_flight_tags[wkr] = 0;
d->in_flight_bitmask &= ~(1UL << wkr);
}
@@ -207,9 +220,10 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
return process_returns(d);
while (next_idx < num_mbufs || next_mb != NULL) {
-
- int64_t data = d->bufs[wkr].bufptr64;
uintptr_t oldbuf = 0;
+ /* Sync with worker. Acquire bufptr64. */
+ int64_t data = __atomic_load_n(&(d->bufs[wkr].bufptr64),
+ __ATOMIC_ACQUIRE);
if (!next_mb) {
next_mb = mbufs[next_idx++];
@@ -255,11 +269,16 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
(d->backlog[wkr].count || next_mb)) {
if (d->backlog[wkr].count)
- d->bufs[wkr].bufptr64 =
- backlog_pop(&d->backlog[wkr]);
+ /* Sync with worker. Release bufptr64. */
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ backlog_pop(&d->backlog[wkr]),
+ __ATOMIC_RELEASE);
else {
- d->bufs[wkr].bufptr64 = next_value;
+ /* Sync with worker. Release bufptr64. */
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ next_value,
+ __ATOMIC_RELEASE);
d->in_flight_tags[wkr] = new_tag;
d->in_flight_bitmask |= (1UL << wkr);
next_mb = NULL;
@@ -280,13 +299,19 @@ rte_distributor_process_v20(struct rte_distributor_v20 *d,
* if they are ready */
for (wkr = 0; wkr < d->num_workers; wkr++)
if (d->backlog[wkr].count &&
- (d->bufs[wkr].bufptr64 & RTE_DISTRIB_GET_BUF)) {
+ /* Sync with worker. Acquire bufptr64. */
+ (__atomic_load_n(&(d->bufs[wkr].bufptr64),
+ __ATOMIC_ACQUIRE) & RTE_DISTRIB_GET_BUF)) {
int64_t oldbuf = d->bufs[wkr].bufptr64 >>
RTE_DISTRIB_FLAG_BITS;
+
store_return(oldbuf, d, &ret_start, &ret_count);
- d->bufs[wkr].bufptr64 = backlog_pop(&d->backlog[wkr]);
+ /* Sync with worker. Release bufptr64. */
+ __atomic_store_n(&(d->bufs[wkr].bufptr64),
+ backlog_pop(&d->backlog[wkr]),
+ __ATOMIC_RELEASE);
}
d->returns.start = ret_start;
--
2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* [dpdk-dev] [PATCH v3 2/2] test/distributor: fix false unit test failure
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test Ruifeng Wang
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
@ 2019-10-15 9:28 ` Ruifeng Wang
2019-10-25 8:13 ` Hunt, David
2019-10-24 19:31 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test David Marchand
2019-10-25 8:33 ` David Marchand
3 siblings, 1 reply; 23+ messages in thread
From: Ruifeng Wang @ 2019-10-15 9:28 UTC (permalink / raw)
To: david.hunt
Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, Ruifeng Wang, stable
The sanity test could spuriously fail by reporting a flush count error.
It was caused by a worker stat coherency issue between the distributor
and worker threads.
Fix this issue by using atomic operations to update worker stat.
Fixes: c3eabff124e6 ("distributor: add unit tests")
Cc: stable@dpdk.org
Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
app/test/test_distributor.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c
index 7090b55f8..ba1f81cf8 100644
--- a/app/test/test_distributor.c
+++ b/app/test/test_distributor.c
@@ -70,12 +70,14 @@ handle_work(void *arg)
buf[i] = NULL;
num = rte_distributor_get_pkt(db, id, buf, buf, num);
while (!quit) {
- worker_stats[id].handled_packets += num;
+ __atomic_fetch_add(&worker_stats[id].handled_packets, num,
+ __ATOMIC_RELAXED);
count += num;
num = rte_distributor_get_pkt(db, id,
buf, buf, num);
}
- worker_stats[id].handled_packets += num;
+ __atomic_fetch_add(&worker_stats[id].handled_packets, num,
+ __ATOMIC_RELAXED);
count += num;
rte_distributor_return_pkt(db, id, buf, num);
return 0;
--
2.17.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [EXT] RE: [dpdk-stable] [PATCH] lib/distributor: fix deadlock issue for aarch64
2019-10-09 5:52 ` Ruifeng Wang (Arm Technology China)
@ 2019-10-17 11:42 ` Harman Kalra
2019-10-17 13:48 ` Ruifeng Wang (Arm Technology China)
0 siblings, 1 reply; 23+ messages in thread
From: Harman Kalra @ 2019-10-17 11:42 UTC (permalink / raw)
To: Ruifeng Wang (Arm Technology China)
Cc: David Marchand, Aaron Conole, David Hunt, dev,
Gavin Hu (Arm Technology China),
Honnappa Nagarahalli, nd, dpdk stable
Hi
I tested this patch, following are my observations:
1. With this patch distributor_autotest getting suspended on arm64 platform
is resolved. But continuous execution of this test results in test failure,
as reported by Aaron.
2. While testing on x86 platform, still I can observe distributor_autotest
getting suspended (stuck) on continuous execution of the test (it took almost
7-8 iterations to reproduce the suspension).
Thanks
On Wed, Oct 09, 2019 at 05:52:03AM +0000, Ruifeng Wang (Arm Technology China) wrote:
> External Email
>
> ----------------------------------------------------------------------
>
> > -----Original Message-----
> > From: David Marchand <david.marchand@redhat.com>
> > Sent: Wednesday, October 9, 2019 03:47
> > To: Aaron Conole <aconole@redhat.com>
> > Cc: Ruifeng Wang (Arm Technology China) <Ruifeng.Wang@arm.com>; David
> > Hunt <david.hunt@intel.com>; dev <dev@dpdk.org>; hkalra@marvell.com;
> > Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Honnappa
> > Nagarahalli <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; dpdk
> > stable <stable@dpdk.org>
> > Subject: Re: [dpdk-stable] [dpdk-dev] [PATCH] lib/distributor: fix deadlock
> > issue for aarch64
> >
> > On Tue, Oct 8, 2019 at 7:06 PM Aaron Conole <aconole@redhat.com> wrote:
> > >
> > > Ruifeng Wang <ruifeng.wang@arm.com> writes:
> > >
> > > > Distributor and worker threads rely on data structs in cache line
> > > > for synchronization. The shared data structs were not protected.
> > > > This caused deadlock issue on weaker memory ordering platforms as
> > > > aarch64.
> > > > Fix this issue by adding memory barriers to ensure synchronization
> > > > among cores.
> > > >
> > > > Bugzilla ID: 342
> > > > Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
> > > > Cc: stable@dpdk.org
> > > >
> > > > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > > ---
> > >
> > > I see a failure in the distributor_autotest (on one of the builds):
> > >
> > > 64/82 DPDK:fast-tests / distributor_autotest FAIL 0.37 s (exit status 255
> > or signal 127 SIGinvalid)
> > >
> > > --- command ---
> > >
> > > DPDK_TEST='distributor_autotest'
> > > /home/travis/build/ovsrobot/dpdk/build/app/test/dpdk-test -l 0-1
> > > --file-prefix=distributor_autotest
> > >
> > > --- stdout ---
> > >
> > > EAL: Probing VFIO support...
> > >
> > > APP: HPET is not enabled, using TSC as default timer
> > >
> > > RTE>>distributor_autotest
> > >
> > > === Basic distributor sanity tests ===
> > >
> > > Worker 0 handled 32 packets
> > >
> > > Sanity test with all zero hashes done.
> > >
> > > Worker 0 handled 32 packets
> > >
> > > Sanity test with non-zero hashes done
> > >
> > > === testing big burst (single) ===
> > >
> > > Sanity test of returned packets done
> > >
> > > === Sanity test with mbuf alloc/free (single) ===
> > >
> > > Sanity test with mbuf alloc/free passed
> > >
> > > Too few cores to run worker shutdown test
> > >
> > > === Basic distributor sanity tests ===
> > >
> > > Worker 0 handled 32 packets
> > >
> > > Sanity test with all zero hashes done.
> > >
> > > Worker 0 handled 32 packets
> > >
> > > Sanity test with non-zero hashes done
> > >
> > > === testing big burst (burst) ===
> > >
> > > Sanity test of returned packets done
> > >
> > > === Sanity test with mbuf alloc/free (burst) ===
> > >
> > > Line 326: Packet count is incorrect, 1048568, expected 1048576
> > >
> > > Test Failed
> > >
> > > RTE>>
> > >
> > > --- stderr ---
> > >
> > > EAL: Detected 2 lcore(s)
> > >
> > > EAL: Detected 1 NUMA nodes
> > >
> > > EAL: Multi-process socket /var/run/dpdk/distributor_autotest/mp_socket
> > >
> > > EAL: Selected IOVA mode 'PA'
> > >
> > > EAL: No available hugepages reported in hugepages-1048576kB
> > >
> > > -------
> > >
> > > Not sure how to help debug further. I'll re-start the job to see if
> > > it 'clears' up - but I guess there may be a delicate synchronization
> > > somewhere that needs to be accounted.
> >
> > Idem, and with the same loop I used before, it can be caught quickly.
> >
> > # time (log=/tmp/$$.log; while true; do echo distributor_autotest
> > |taskset -c 0-1 ./build-gcc-static/app/test/dpdk-test --log-level *:8
> > -l 0-1 >$log 2>&1; grep -q 'Test OK' $log || break; done; cat $log; rm -f $log)
> >
> Thanks Aaron and David for your report. I can reproduce this issue with the script.
> Will fix it in next version.
>
> > [snip]
> >
> > RTE>>distributor_autotest
> > EAL: Trying to obtain current memory policy.
> > EAL: Setting policy MPOL_PREFERRED for socket 0
> > EAL: Restoring previous memory policy: 0
> > EAL: request: mp_malloc_sync
> > EAL: Heap on socket 0 was expanded by 2MB
> > EAL: Trying to obtain current memory policy.
> > EAL: Setting policy MPOL_PREFERRED for socket 0
> > EAL: Restoring previous memory policy: 0
> > EAL: alloc_pages_on_heap(): couldn't allocate physically contiguous space
> > EAL: Trying to obtain current memory policy.
> > EAL: Setting policy MPOL_PREFERRED for socket 0
> > EAL: Restoring previous memory policy: 0
> > EAL: request: mp_malloc_sync
> > EAL: Heap on socket 0 was expanded by 8MB === Basic distributor sanity
> > tests === Worker 0 handled 32 packets Sanity test with all zero hashes done.
> > Worker 0 handled 32 packets
> > Sanity test with non-zero hashes done
> > === testing big burst (single) ===
> > Sanity test of returned packets done
> >
> > === Sanity test with mbuf alloc/free (single) === Sanity test with mbuf
> > alloc/free passed
> >
> > Too few cores to run worker shutdown test === Basic distributor sanity tests
> > === Worker 0 handled 32 packets Sanity test with all zero hashes done.
> > Worker 0 handled 32 packets
> > Sanity test with non-zero hashes done
> > === testing big burst (burst) ===
> > Sanity test of returned packets done
> >
> > === Sanity test with mbuf alloc/free (burst) === Line 326: Packet count is
> > incorrect, 1048568, expected 1048576 Test Failed
> > RTE>>
> > real 0m36.668s
> > user 1m7.293s
> > sys 0m1.560s
> >
> > Could be worth running this loop on all tests? (not talking about the CI, it
> > would be a manual effort to catch lurking issues).
> >
> >
> > --
> > David Marchand
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [EXT] RE: [dpdk-stable] [PATCH] lib/distributor: fix deadlock issue for aarch64
2019-10-17 11:42 ` [dpdk-dev] [EXT] " Harman Kalra
@ 2019-10-17 13:48 ` Ruifeng Wang (Arm Technology China)
0 siblings, 0 replies; 23+ messages in thread
From: Ruifeng Wang (Arm Technology China) @ 2019-10-17 13:48 UTC (permalink / raw)
To: Harman Kalra
Cc: David Marchand, Aaron Conole, David Hunt, dev,
Gavin Hu (Arm Technology China),
Honnappa Nagarahalli, nd, dpdk stable, nd
Hi Harman,
Thank you for testing this.
> -----Original Message-----
> From: Harman Kalra <hkalra@marvell.com>
> Sent: Thursday, October 17, 2019 19:42
> To: Ruifeng Wang (Arm Technology China) <Ruifeng.Wang@arm.com>
> Cc: David Marchand <david.marchand@redhat.com>; Aaron Conole
> <aconole@redhat.com>; David Hunt <david.hunt@intel.com>; dev
> <dev@dpdk.org>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; nd
> <nd@arm.com>; dpdk stable <stable@dpdk.org>
> Subject: Re: [EXT] RE: [dpdk-stable] [dpdk-dev] [PATCH] lib/distributor: fix
> deadlock issue for aarch64
>
> Hi
>
> I tested this patch, following are my observations:
> 1. With this patch distributor_autotest getting suspended on arm64 platform
> is resolved. But continuous execution of this test results in test failure, as
> reported by Aaron.
> 2. While testing on x86 platform, still I can observe distributor_autotest
> getting suspended (stuck) on continuous execution of the test (it took almost
> 7-8 iterations to reproduce the suspension).
Yes, this v1 patch is not complete to solve the issue.
I have posted v3:
http://patches.dpdk.org/project/dpdk/list/?series=6856
With the new patch set, I didn't observe test failure in my test.
Will you try that?
Thanks.
/Ruifeng
>
> Thanks
>
> On Wed, Oct 09, 2019 at 05:52:03AM +0000, Ruifeng Wang (Arm Technology
> China) wrote:
> > External Email
> >
> > ----------------------------------------------------------------------
> >
> > > -----Original Message-----
> > > From: David Marchand <david.marchand@redhat.com>
> > > Sent: Wednesday, October 9, 2019 03:47
> > > To: Aaron Conole <aconole@redhat.com>
> > > Cc: Ruifeng Wang (Arm Technology China) <Ruifeng.Wang@arm.com>;
> > > David Hunt <david.hunt@intel.com>; dev <dev@dpdk.org>;
> > > hkalra@marvell.com; Gavin Hu (Arm Technology China)
> > > <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; dpdk stable
> > > <stable@dpdk.org>
> > > Subject: Re: [dpdk-stable] [dpdk-dev] [PATCH] lib/distributor: fix
> > > deadlock issue for aarch64
> > >
> > > On Tue, Oct 8, 2019 at 7:06 PM Aaron Conole <aconole@redhat.com>
> wrote:
> > > >
> > > > Ruifeng Wang <ruifeng.wang@arm.com> writes:
> > > >
> > > > > Distributor and worker threads rely on data structs in cache
> > > > > line for synchronization. The shared data structs were not protected.
> > > > > This caused deadlock issue on weaker memory ordering platforms
> > > > > as aarch64.
> > > > > Fix this issue by adding memory barriers to ensure
> > > > > synchronization among cores.
> > > > >
> > > > > Bugzilla ID: 342
> > > > > Fixes: 775003ad2f96 ("distributor: add new burst-capable
> > > > > library")
> > > > > Cc: stable@dpdk.org
> > > > >
> > > > > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > > > ---
> > > >
> > > > I see a failure in the distributor_autotest (on one of the builds):
> > > >
> > > > 64/82 DPDK:fast-tests / distributor_autotest FAIL 0.37 s (exit status
> 255
> > > or signal 127 SIGinvalid)
> > > >
> > > > --- command ---
> > > >
> > > > DPDK_TEST='distributor_autotest'
> > > > /home/travis/build/ovsrobot/dpdk/build/app/test/dpdk-test -l 0-1
> > > > --file-prefix=distributor_autotest
> > > >
> > > > --- stdout ---
> > > >
> > > > EAL: Probing VFIO support...
> > > >
> > > > APP: HPET is not enabled, using TSC as default timer
> > > >
> > > > RTE>>distributor_autotest
> > > >
> > > > === Basic distributor sanity tests ===
> > > >
> > > > Worker 0 handled 32 packets
> > > >
> > > > Sanity test with all zero hashes done.
> > > >
> > > > Worker 0 handled 32 packets
> > > >
> > > > Sanity test with non-zero hashes done
> > > >
> > > > === testing big burst (single) ===
> > > >
> > > > Sanity test of returned packets done
> > > >
> > > > === Sanity test with mbuf alloc/free (single) ===
> > > >
> > > > Sanity test with mbuf alloc/free passed
> > > >
> > > > Too few cores to run worker shutdown test
> > > >
> > > > === Basic distributor sanity tests ===
> > > >
> > > > Worker 0 handled 32 packets
> > > >
> > > > Sanity test with all zero hashes done.
> > > >
> > > > Worker 0 handled 32 packets
> > > >
> > > > Sanity test with non-zero hashes done
> > > >
> > > > === testing big burst (burst) ===
> > > >
> > > > Sanity test of returned packets done
> > > >
> > > > === Sanity test with mbuf alloc/free (burst) ===
> > > >
> > > > Line 326: Packet count is incorrect, 1048568, expected 1048576
> > > >
> > > > Test Failed
> > > >
> > > > RTE>>
> > > >
> > > > --- stderr ---
> > > >
> > > > EAL: Detected 2 lcore(s)
> > > >
> > > > EAL: Detected 1 NUMA nodes
> > > >
> > > > EAL: Multi-process socket
> > > > /var/run/dpdk/distributor_autotest/mp_socket
> > > >
> > > > EAL: Selected IOVA mode 'PA'
> > > >
> > > > EAL: No available hugepages reported in hugepages-1048576kB
> > > >
> > > > -------
> > > >
> > > > Not sure how to help debug further. I'll re-start the job to see
> > > > if it 'clears' up - but I guess there may be a delicate
> > > > synchronization somewhere that needs to be accounted.
> > >
> > > Idem, and with the same loop I used before, it can be caught quickly.
> > >
> > > # time (log=/tmp/$$.log; while true; do echo distributor_autotest
> > > |taskset -c 0-1 ./build-gcc-static/app/test/dpdk-test --log-level
> > > |*:8
> > > -l 0-1 >$log 2>&1; grep -q 'Test OK' $log || break; done; cat $log;
> > > rm -f $log)
> > >
> > Thanks Aaron and David for your report. I can reproduce this issue with the
> script.
> > Will fix it in next version.
> >
> > > [snip]
> > >
> > > RTE>>distributor_autotest
> > > EAL: Trying to obtain current memory policy.
> > > EAL: Setting policy MPOL_PREFERRED for socket 0
> > > EAL: Restoring previous memory policy: 0
> > > EAL: request: mp_malloc_sync
> > > EAL: Heap on socket 0 was expanded by 2MB
> > > EAL: Trying to obtain current memory policy.
> > > EAL: Setting policy MPOL_PREFERRED for socket 0
> > > EAL: Restoring previous memory policy: 0
> > > EAL: alloc_pages_on_heap(): couldn't allocate physically contiguous
> > > space
> > > EAL: Trying to obtain current memory policy.
> > > EAL: Setting policy MPOL_PREFERRED for socket 0
> > > EAL: Restoring previous memory policy: 0
> > > EAL: request: mp_malloc_sync
> > > EAL: Heap on socket 0 was expanded by 8MB === Basic distributor
> > > sanity tests === Worker 0 handled 32 packets Sanity test with all zero
> hashes done.
> > > Worker 0 handled 32 packets
> > > Sanity test with non-zero hashes done === testing big burst (single)
> > > === Sanity test of returned packets done
> > >
> > > === Sanity test with mbuf alloc/free (single) === Sanity test with
> > > mbuf alloc/free passed
> > >
> > > Too few cores to run worker shutdown test === Basic distributor
> > > sanity tests === Worker 0 handled 32 packets Sanity test with all zero
> hashes done.
> > > Worker 0 handled 32 packets
> > > Sanity test with non-zero hashes done === testing big burst (burst)
> > > === Sanity test of returned packets done
> > >
> > > === Sanity test with mbuf alloc/free (burst) === Line 326: Packet
> > > count is incorrect, 1048568, expected 1048576 Test Failed
> > > RTE>>
> > > real 0m36.668s
> > > user 1m7.293s
> > > sys 0m1.560s
> > >
> > > Could be worth running this loop on all tests? (not talking about
> > > the CI, it would be a manual effort to catch lurking issues).
> > >
> > >
> > > --
> > > David Marchand
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/2] fix distributor unit test
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test Ruifeng Wang
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 2/2] test/distributor: fix false unit test failure Ruifeng Wang
@ 2019-10-24 19:31 ` David Marchand
2019-10-25 8:11 ` Hunt, David
2019-10-25 8:33 ` David Marchand
3 siblings, 1 reply; 23+ messages in thread
From: David Marchand @ 2019-10-24 19:31 UTC (permalink / raw)
To: David Hunt
Cc: dev, Harman Kalra, Gavin Hu, Honnappa Nagarahalli, nd, Ruifeng Wang
On Tue, Oct 15, 2019 at 11:29 AM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
>
> Bug 342 reported distributor_autotest execution suspension
> on aarch64 platform.
> Issue was due to lack of synchronization among threads. Distributor
> thread and worker thread may get deadlocked.
> Fixed synchronization issue by adding barriers.
>
> Another issue identified was in test case. Non-atomic operation on
> stat value could cause value reset not been observed by worker thread
> and mess counters. The issue was fixed by using atomic operations.
>
> ---
> v3:
> Added comments for using of C11 acquire/release semantics. (Honnappa)
>
> v2:
> Fixed intermittent packet count incorrect failure. (Aaron, David)
> Fixed Clang build on 32bit systems.
> Additional patch to fix non-atomic operation in unit test.
David,
Can you review this series?
Thanks.
--
David Marchand
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/2] fix distributor unit test
2019-10-24 19:31 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test David Marchand
@ 2019-10-25 8:11 ` Hunt, David
2019-10-25 8:18 ` David Marchand
0 siblings, 1 reply; 23+ messages in thread
From: Hunt, David @ 2019-10-25 8:11 UTC (permalink / raw)
To: David Marchand
Cc: dev, Harman Kalra, Gavin Hu, Honnappa Nagarahalli, nd, Ruifeng Wang
On 24/10/2019 20:31, David Marchand wrote:
> On Tue, Oct 15, 2019 at 11:29 AM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
>> Bug 342 reported distributor_autotest execution suspension
>> on aarch64 platform.
>> Issue was due to lack of synchronization among threads. Distributor
>> thread and worker thread may get deadlocked.
>> Fixed synchronization issue by adding barriers.
>>
>> Another issue identified was in test case. Non-atomic operation on
>> stat value could cause value reset not been observed by worker thread
>> and mess counters. The issue was fixed by using atomic operations.
>>
>> ---
>> v3:
>> Added comments for using of C11 acquire/release semantics. (Honnappa)
>>
>> v2:
>> Fixed intermittent packet count incorrect failure. (Aaron, David)
>> Fixed Clang build on 32bit systems.
>> Additional patch to fix non-atomic operation in unit test.
> David,
>
> Can you review this series?
> Thanks.
>
Hi David,
I had tested this previously, including performance comparisons
against original version on x86, and saw no performance degradation, so
I Acked it. I can re-ack on the latest version now.
Thanks,
Dave.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v3 1/2] lib/distributor: fix deadlock issue for aarch64
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
@ 2019-10-25 8:13 ` Hunt, David
0 siblings, 0 replies; 23+ messages in thread
From: Hunt, David @ 2019-10-25 8:13 UTC (permalink / raw)
To: Ruifeng Wang; +Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, stable
On 15/10/2019 10:28, Ruifeng Wang wrote:
> Distributor and worker threads rely on data structs in cache line
> for synchronization. The shared data structs were not protected.
> This caused deadlock issue on weaker memory ordering platforms as
> aarch64.
> Fix this issue by adding memory barriers to ensure synchronization
> among cores.
>
> Bugzilla ID: 342
> Fixes: 775003ad2f96 ("distributor: add new burst-capable library")
> Cc: stable@dpdk.org
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
Acked-by: David Hunt <david.hunt@intel.com>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v3 2/2] test/distributor: fix false unit test failure
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 2/2] test/distributor: fix false unit test failure Ruifeng Wang
@ 2019-10-25 8:13 ` Hunt, David
0 siblings, 0 replies; 23+ messages in thread
From: Hunt, David @ 2019-10-25 8:13 UTC (permalink / raw)
To: Ruifeng Wang; +Cc: dev, hkalra, gavin.hu, honnappa.nagarahalli, nd, stable
On 15/10/2019 10:28, Ruifeng Wang wrote:
> Sanity test could spuriously fail by reporting a flush count error.
> It was caused by a worker stats coherency issue between the distributor
> and worker threads.
> Fix this issue by using atomic operations to update worker stat.
>
> Fixes: c3eabff124e6 ("distributor: add unit tests")
> Cc: stable@dpdk.org
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> app/test/test_distributor.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
> return 0;
Acked-by: David Hunt <david.hunt@intel.com>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/2] fix distributor unit test
2019-10-25 8:11 ` Hunt, David
@ 2019-10-25 8:18 ` David Marchand
2019-10-25 8:20 ` Hunt, David
0 siblings, 1 reply; 23+ messages in thread
From: David Marchand @ 2019-10-25 8:18 UTC (permalink / raw)
To: Hunt, David
Cc: dev, Harman Kalra, Gavin Hu, Honnappa Nagarahalli, nd, Ruifeng Wang
On Fri, Oct 25, 2019 at 10:11 AM Hunt, David <david.hunt@intel.com> wrote:
>
>
> On 24/10/2019 20:31, David Marchand wrote:
> > On Tue, Oct 15, 2019 at 11:29 AM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
> >> Bug 342 reported distributor_autotest execution suspension
> >> on aarch64 platform.
> >> Issue was due to lack of synchronization among threads. Distributor
> >> thread and worker thread may get deadlocked.
> >> Fixed synchronization issue by adding barriers.
> >>
> >> Another issue identified was in test case. Non-atomic operation on
> >> stat value could cause value reset not been observed by worker thread
> >> and mess counters. The issue was fixed by using atomic operations.
> >>
> >> ---
> >> v3:
> >> Added comments for using of C11 acquire/release semantics. (Honnappa)
The comments are also something to maintain, so checking the v3 made
sense to me.
> >>
> >> v2:
> >> Fixed intermittent packet count incorrect failure. (Aaron, David)
> >> Fixed Clang build on 32bit systems.
> >> Additional patch to fix non-atomic operation in unit test.
> > David,
> >
> > Can you review this series?
> > Thanks.
> >
>
> Hi David,
>
> I had tested this previously, including performance comparisons
> against original version on x86, and saw no performance degradation, so
> I Acked it. I can re-ack on the latest version now.
If you think this is fine as is, I will go and apply with your ack.
--
David Marchand
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/2] fix distributor unit test
2019-10-25 8:18 ` David Marchand
@ 2019-10-25 8:20 ` Hunt, David
0 siblings, 0 replies; 23+ messages in thread
From: Hunt, David @ 2019-10-25 8:20 UTC (permalink / raw)
To: David Marchand
Cc: dev, Harman Kalra, Gavin Hu, Honnappa Nagarahalli, nd, Ruifeng Wang
On 25/10/2019 09:18, David Marchand wrote:
> On Fri, Oct 25, 2019 at 10:11 AM Hunt, David <david.hunt@intel.com> wrote:
>>
>> On 24/10/2019 20:31, David Marchand wrote:
>>> On Tue, Oct 15, 2019 at 11:29 AM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
>>>> Bug 342 reported distributor_autotest execution suspension
>>>> on aarch64 platform.
>>>> Issue was due to lack of synchronization among threads. Distributor
>>>> thread and worker thread may get deadlocked.
>>>> Fixed synchronization issue by adding barriers.
>>>>
>>>> Another issue identified was in test case. Non-atomic operation on
>>>> stat value could cause value reset not been observed by worker thread
>>>> and mess counters. The issue was fixed by using atomic operations.
>>>>
>>>> ---
>>>> v3:
>>>> Added comments for using of C11 acquire/release semantics. (Honnappa)
> The comments are also something to maintain, so checking the v3 made
> sense to me.
I agree, and the comments look good to me.
>
>>>> v2:
>>>> Fixed intermittent packet count incorrect failure. (Aaron, David)
>>>> Fixed Clang build on 32bit systems.
>>>> Additional patch to fix non-atomic operation in unit test.
>>> David,
>>>
>>> Can you review this series?
>>> Thanks.
>>>
>> Hi David,
>>
>> I had tested this previously, including performance comparisons
>> against original version on x86, and saw no performance degradation, so
>> I Acked it. I can re-ack on the latest version now.
> If you think this is fine as is, I will go and apply with your ack.
Sure, thanks.
Regards,
Dave.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/2] fix distributor unit test
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test Ruifeng Wang
` (2 preceding siblings ...)
2019-10-24 19:31 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test David Marchand
@ 2019-10-25 8:33 ` David Marchand
3 siblings, 0 replies; 23+ messages in thread
From: David Marchand @ 2019-10-25 8:33 UTC (permalink / raw)
To: Ruifeng Wang
Cc: David Hunt, dev, Harman Kalra, Gavin Hu, Honnappa Nagarahalli, nd
On Tue, Oct 15, 2019 at 11:29 AM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
>
> Bug 342 reported distributor_autotest execution suspension
> on aarch64 platform.
> Issue was due to lack of synchronization among threads. Distributor
> thread and worker thread may get deadlocked.
> Fixed synchronization issue by adding barriers.
>
> Another issue identified was in test case. Non-atomic operation on
> stat value could cause value reset not been observed by worker thread
> and mess counters. The issue was fixed by using atomic operations.
Acked-by: David Hunt <david.hunt@intel.com>
Applied, thanks.
--
David Marchand
^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2019-10-25 8:34 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-08 9:55 [dpdk-dev] [PATCH] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
2019-10-08 12:53 ` Hunt, David
2019-10-08 17:05 ` Aaron Conole
2019-10-08 19:46 ` [dpdk-dev] [dpdk-stable] " David Marchand
2019-10-08 20:08 ` Aaron Conole
2019-10-09 5:52 ` Ruifeng Wang (Arm Technology China)
2019-10-17 11:42 ` [dpdk-dev] [EXT] " Harman Kalra
2019-10-17 13:48 ` Ruifeng Wang (Arm Technology China)
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 0/2] fix distributor unit test Ruifeng Wang
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
2019-10-13 2:31 ` Honnappa Nagarahalli
2019-10-14 10:00 ` Ruifeng Wang (Arm Technology China)
2019-10-12 2:43 ` [dpdk-dev] [PATCH v2 2/2] test/distributor: fix false unit test failure Ruifeng Wang
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test Ruifeng Wang
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 1/2] lib/distributor: fix deadlock issue for aarch64 Ruifeng Wang
2019-10-25 8:13 ` Hunt, David
2019-10-15 9:28 ` [dpdk-dev] [PATCH v3 2/2] test/distributor: fix false unit test failure Ruifeng Wang
2019-10-25 8:13 ` Hunt, David
2019-10-24 19:31 ` [dpdk-dev] [PATCH v3 0/2] fix distributor unit test David Marchand
2019-10-25 8:11 ` Hunt, David
2019-10-25 8:18 ` David Marchand
2019-10-25 8:20 ` Hunt, David
2019-10-25 8:33 ` David Marchand
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).