DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH v1 0/4] ring: some fixes and improvements
@ 2025-05-21 11:14 Konstantin Ananyev
  2025-05-21 11:14 ` [PATCH v1 1/4] ring: introduce extra run-time checks Konstantin Ananyev
                   ` (3 more replies)
  0 siblings, 4 replies; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-21 11:14 UTC (permalink / raw)
  To: dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

First two patches are ‘low risk’ ones.
Third one touches some core functions within rte_ring library and
would probably require extra reviews/testing from different vendors.
4th one enables C11 based code on all x86 platforms by default.
The stretch goal for it – make all supported platforms to use C11
based code and get rid of legacy code in rte_ring_generic_pvt.h.
If there would be some issues with latest two patches – we can limit
ourselves with just first two to apply.

Konstantin Ananyev (4):
  ring: introduce extra run-time checks
  ring/soring: fix head-tail synchronization issue
  ring: fix potential sync issue between head and tail values
  config/x86: enable RTE_USE_C11_MEM_MODEL by default

 config/x86/meson.build           |  1 +
 lib/ring/rte_ring_c11_pvt.h      | 29 ++++++++++++++++++-----------
 lib/ring/rte_ring_elem_pvt.h     |  8 ++++++--
 lib/ring/rte_ring_hts_elem_pvt.h | 14 ++++++++++----
 lib/ring/rte_ring_rts_elem_pvt.h | 14 ++++++++++----
 lib/ring/soring.c                |  2 ++
 6 files changed, 47 insertions(+), 21 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v1 1/4] ring: introduce extra run-time checks
  2025-05-21 11:14 [PATCH v1 0/4] ring: some fixes and improvements Konstantin Ananyev
@ 2025-05-21 11:14 ` Konstantin Ananyev
  2025-05-21 12:14   ` Morten Brørup
  2025-05-21 11:14 ` [PATCH v1 2/4] ring/soring: fix head-tail synchronization issue Konstantin Ananyev
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-21 11:14 UTC (permalink / raw)
  To: dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

Add RTE_ASSERT() to check that different move_tail() flavors
return meaningful  *entries value.
It also helps to ensure that inside move_tail(), it uses correct
head/tail values.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/ring/rte_ring_c11_pvt.h      | 2 +-
 lib/ring/rte_ring_elem_pvt.h     | 8 ++++++--
 lib/ring/rte_ring_hts_elem_pvt.h | 8 ++++++--
 lib/ring/rte_ring_rts_elem_pvt.h | 8 ++++++--
 lib/ring/soring.c                | 2 ++
 5 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
index b9388af0da..0845cd6dcf 100644
--- a/lib/ring/rte_ring_c11_pvt.h
+++ b/lib/ring/rte_ring_c11_pvt.h
@@ -104,10 +104,10 @@ __rte_ring_headtail_move_head(struct rte_ring_headtail *d,
 			n = (behavior == RTE_RING_QUEUE_FIXED) ?
 					0 : *entries;
 
+		*new_head = *old_head + n;
 		if (n == 0)
 			return 0;
 
-		*new_head = *old_head + n;
 		if (is_st) {
 			d->head = *new_head;
 			success = 1;
diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h
index 6eafae121f..2e71040830 100644
--- a/lib/ring/rte_ring_elem_pvt.h
+++ b/lib/ring/rte_ring_elem_pvt.h
@@ -341,8 +341,10 @@ __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
 		uint32_t *old_head, uint32_t *new_head,
 		uint32_t *free_entries)
 {
-	return __rte_ring_headtail_move_head(&r->prod, &r->cons, r->capacity,
+	n = __rte_ring_headtail_move_head(&r->prod, &r->cons, r->capacity,
 			is_sp, n, behavior, old_head, new_head, free_entries);
+	RTE_ASSERT(*free_entries <= r->capacity);
+	return n;
 }
 
 /**
@@ -374,8 +376,10 @@ __rte_ring_move_cons_head(struct rte_ring *r, unsigned int is_sc,
 		uint32_t *old_head, uint32_t *new_head,
 		uint32_t *entries)
 {
-	return __rte_ring_headtail_move_head(&r->cons, &r->prod, 0,
+	n = __rte_ring_headtail_move_head(&r->cons, &r->prod, 0,
 			is_sc, n, behavior, old_head, new_head, entries);
+	RTE_ASSERT(*entries <= r->capacity);
+	return n;
 }
 
 /**
diff --git a/lib/ring/rte_ring_hts_elem_pvt.h b/lib/ring/rte_ring_hts_elem_pvt.h
index e2b82dd1e6..c59e5f6420 100644
--- a/lib/ring/rte_ring_hts_elem_pvt.h
+++ b/lib/ring/rte_ring_hts_elem_pvt.h
@@ -136,8 +136,10 @@ __rte_ring_hts_move_prod_head(struct rte_ring *r, unsigned int num,
 	enum rte_ring_queue_behavior behavior, uint32_t *old_head,
 	uint32_t *free_entries)
 {
-	return __rte_ring_hts_move_head(&r->hts_prod, &r->cons,
+	num = __rte_ring_hts_move_head(&r->hts_prod, &r->cons,
 			r->capacity, num, behavior, old_head, free_entries);
+	RTE_ASSERT(*free_entries <= r->capacity);
+	return num;
 }
 
 /**
@@ -148,8 +150,10 @@ __rte_ring_hts_move_cons_head(struct rte_ring *r, unsigned int num,
 	enum rte_ring_queue_behavior behavior, uint32_t *old_head,
 	uint32_t *entries)
 {
-	return __rte_ring_hts_move_head(&r->hts_cons, &r->prod,
+	num = __rte_ring_hts_move_head(&r->hts_cons, &r->prod,
 			0, num, behavior, old_head, entries);
+	RTE_ASSERT(*entries <= r->capacity);
+	return num;
 }
 
 /**
diff --git a/lib/ring/rte_ring_rts_elem_pvt.h b/lib/ring/rte_ring_rts_elem_pvt.h
index 96825931f8..509fa674fb 100644
--- a/lib/ring/rte_ring_rts_elem_pvt.h
+++ b/lib/ring/rte_ring_rts_elem_pvt.h
@@ -152,8 +152,10 @@ __rte_ring_rts_move_prod_head(struct rte_ring *r, uint32_t num,
 	enum rte_ring_queue_behavior behavior, uint32_t *old_head,
 	uint32_t *free_entries)
 {
-	return __rte_ring_rts_move_head(&r->rts_prod, &r->cons,
+	num = __rte_ring_rts_move_head(&r->rts_prod, &r->cons,
 			r->capacity, num, behavior, old_head, free_entries);
+	RTE_ASSERT(*free_entries <= r->capacity);
+	return num;
 }
 
 /**
@@ -164,8 +166,10 @@ __rte_ring_rts_move_cons_head(struct rte_ring *r, uint32_t num,
 	enum rte_ring_queue_behavior behavior, uint32_t *old_head,
 	uint32_t *entries)
 {
-	return __rte_ring_rts_move_head(&r->rts_cons, &r->prod,
+	num = __rte_ring_rts_move_head(&r->rts_cons, &r->prod,
 			0, num, behavior, old_head, entries);
+	RTE_ASSERT(*entries <= r->capacity);
+	return num;
 }
 
 /**
diff --git a/lib/ring/soring.c b/lib/ring/soring.c
index 797484d6bf..21a1a27e24 100644
--- a/lib/ring/soring.c
+++ b/lib/ring/soring.c
@@ -156,6 +156,7 @@ __rte_soring_move_prod_head(struct rte_soring *r, uint32_t num,
 		n = 0;
 	}
 
+	RTE_ASSERT(*free <= r->capacity);
 	return n;
 }
 
@@ -190,6 +191,7 @@ __rte_soring_move_cons_head(struct rte_soring *r, uint32_t stage, uint32_t num,
 		n = 0;
 	}
 
+	RTE_ASSERT(*avail <= r->capacity);
 	return n;
 }
 
-- 
2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v1 2/4] ring/soring: fix head-tail synchronization issue
  2025-05-21 11:14 [PATCH v1 0/4] ring: some fixes and improvements Konstantin Ananyev
  2025-05-21 11:14 ` [PATCH v1 1/4] ring: introduce extra run-time checks Konstantin Ananyev
@ 2025-05-21 11:14 ` Konstantin Ananyev
  2025-05-21 11:14 ` [PATCH v1 3/4] ring: fix potential sync issue between head and tail values Konstantin Ananyev
  2025-05-21 11:14 ` [PATCH v1 4/4] config/x86: enable RTE_USE_C11_MEM_MODEL by default Konstantin Ananyev
  3 siblings, 0 replies; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-21 11:14 UTC (permalink / raw)
  To: dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

While running soring_stress_autotest on machine with Ampere Altra Max CPU,
observed the following synchronization issue:
...
TEST-CASE MT MT_DEQENQ-MT_STG1-PRCS
test_worker_prcs:_st_ring_dequeue_bulk: check_updt_elem(lc=11, num=42) failed at 11-th iter, offending object: 0x103df1480
...
EAL: PANIC in soring_verify_state():
line:382 from:acquire_state_update: soring=0x103c72c00, stage=0, idx=0x7fb8, expected={.stnum=0, .ftoken=0}, actual={.stnum=0x80000028, .ftoken=0x47fb8};

Few things to note:
- the problem is reproducible only for producer and consumer with
  RTE_RING_SYNC_MT sync type.
- the problem is reproducible only for RTE_USE_C11_MEM_MODEL enabled,
  i.e. when we use __rte_ring_headtail_move_head() implementation from
  rte_ring_c11_pvt.h.
- stage[nb_stage - 1].tail value becomes less than cons.head which should
  never happen.

While debugging it, I figured out that in some cases
__rte_ring_headtail_move_head() gets 'new' cons.head value while
corresponding tail value remains 'old'.
That causing the following calculation to return wrong (way too big value):
*entries = (capacity + stail - *old_head);
And then cons.head erroneously progress over not yet released elements.
Note that this issue happens only on the second iteration of
do { …; success=(CAS(&head); } while(success==0);
loop, i.e. – only when first CAS(&cons.head) attempt fails.

I believe that we are hitting the following race-condition scenario here:
1) soring_dequeue() calls _finalize()
   It updates state[], then does store(stage.tail, release);
   Note that stage.tail itself can still contain the old value:
   release only guarantee that all *previous* stores will be visible.
2) soring_dequeue() calls move_cons_head() again.
   move_cons_head() updates 'cons.head', but still no *release*
   barrier has happened.
3) soring_dequeue() is called on different thread
   (in parallel with previous 2 operations).
     At first iteration move_cons_head() reads 'old' values for both
     'stage.tail' and 'cons.head'.
     Then CAS(cons.head) fails and returns a new value for it,
     while coming next load(stage.tail) still returns 'old' value
     (still no *release* happened).
     Then:
     *entries = (capacity + stail - *old_head);
     calculates wrong value.
In other words – in some rare cases (due to memory re-ordering),
thread can read 'new' 'cons.head' value, but 'old' value for 'stage.tail'.

The reason why that problem doesn’t exist with RTE_USE_C11_MEM_MODEL
disabled - move_head() implementation in rte_ring_generic_pvt.h uses
rte_atomic32_cmpset() – which generates a proper Acquire-Release barrier
for CAS operation.
While in rte_ring_c11_pvt.h – CAS operation is invoked with relaxed
memory-ordering.

To fix that issue for SORING - I introduced an extra release fence straight
after store(&tail)  operations.
As expected that helps – now tail and its counterpart head values
are always synchronized and all tests pass successfully.

One extra thing to note – I think the same problem potentially exists
even in conventional rte_ring with default (MP/MC case) behavior.
Though chances to hit it in practice are negligible.
At least, I wasn’t able to make it happen so far, even though I tried really hard.

As alternative way to fix that issue – use Acquire-Release memory ordering
for CAS(&head) operation in move_head().
That would guarantee that if 'head' value is updated, then its
counterpart 'tail' latest value will also become visible.
Again, in that case conventional rte_ring will also be covered.

Fixes: b5458e2cc483 ("ring: introduce staged ordered ring")

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/ring/soring.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/ring/soring.c b/lib/ring/soring.c
index 21a1a27e24..7bcbf35516 100644
--- a/lib/ring/soring.c
+++ b/lib/ring/soring.c
@@ -123,6 +123,8 @@ __rte_soring_stage_finalize(struct soring_stage_headtail *sht, uint32_t stage,
 	rte_atomic_store_explicit(&sht->tail.raw, ot.raw,
 			rte_memory_order_release);
 
+	/* make sure that new tail value is visible */
+	rte_atomic_thread_fence(rte_memory_order_release);
 	return i;
 }
 
@@ -217,6 +219,9 @@ __rte_soring_update_tail(struct __rte_ring_headtail *rht,
 		/* unsupported mode, shouldn't be here */
 		RTE_ASSERT(0);
 	}
+
+	/* make sure that new tail value is visible */
+	rte_atomic_thread_fence(rte_memory_order_release);
 }
 
 static __rte_always_inline uint32_t
-- 
2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v1 3/4] ring: fix potential sync issue between head and tail values
  2025-05-21 11:14 [PATCH v1 0/4] ring: some fixes and improvements Konstantin Ananyev
  2025-05-21 11:14 ` [PATCH v1 1/4] ring: introduce extra run-time checks Konstantin Ananyev
  2025-05-21 11:14 ` [PATCH v1 2/4] ring/soring: fix head-tail synchronization issue Konstantin Ananyev
@ 2025-05-21 11:14 ` Konstantin Ananyev
  2025-05-21 20:26   ` Morten Brørup
  2025-05-22 22:03   ` Wathsala Wathawana Vithanage
  2025-05-21 11:14 ` [PATCH v1 4/4] config/x86: enable RTE_USE_C11_MEM_MODEL by default Konstantin Ananyev
  3 siblings, 2 replies; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-21 11:14 UTC (permalink / raw)
  To: dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

This patch aims several purposes:
- provide an alternative (and I think a better) way to fix the
  issue discussed in previous patch:
  "ring/soring: fix synchronization issue between head and tail values"
- make sure that such problem wouldn’t happen within other usages of
  __rte_ring_headtail_move_head() – both current rte_ring
  implementation and possible future use-cases.
- step towards unification of move_head() implementations and
  removing rte_ring_generic_pvt.h
It uses Acquire-Release memory ordering for CAS operation in move_head().
That guarantees that corresponding ‘tail’ updates will be visible
before current ‘head’ is updated.
As I said before: I think that in theory the problem described in
previous patch might happen with our conventional rte_ring too
(when RTE_USE_C11_MEM_MODEL enabled).
But, so far I didn’t manage to reproduce it in reality.
For that reason and also because it touches a critical rte_ring code-path,
I put these changes into a separate patch. Expect all interested
stakeholders to come-up with their comments and observations.
Regarding performance impact – on my boxes both ring_perf_autotest and
ring_stress_autotest – show a mixed set of results: some of them become
few cycles faster, another few cycles slower.
But so far, I didn’t notice any real degradations with that patch.

Fixes: b5458e2cc483 ("ring: introduce staged ordered ring")
Fixes: 1cc363b8ce06 ("ring: introduce HTS ring mode")
Fixes: e6ba4731c0f3 ("ring: introduce RTS ring mode")
Fixes: 49594a63147a ("ring/c11: relax ordering for load and store of the head")

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/ring/rte_ring_c11_pvt.h      | 27 +++++++++++++++++----------
 lib/ring/rte_ring_hts_elem_pvt.h |  6 ++++--
 lib/ring/rte_ring_rts_elem_pvt.h |  6 ++++--
 lib/ring/soring.c                |  5 -----
 4 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
index 0845cd6dcf..6d1c46df9a 100644
--- a/lib/ring/rte_ring_c11_pvt.h
+++ b/lib/ring/rte_ring_c11_pvt.h
@@ -77,20 +77,19 @@ __rte_ring_headtail_move_head(struct rte_ring_headtail *d,
 	int success;
 	unsigned int max = n;
 
+	/* Ensure the head is read before tail */
 	*old_head = rte_atomic_load_explicit(&d->head,
-			rte_memory_order_relaxed);
+			rte_memory_order_acquire);
 	do {
 		/* Reset n to the initial burst count */
 		n = max;
 
-		/* Ensure the head is read before tail */
-		rte_atomic_thread_fence(rte_memory_order_acquire);
-
-		/* load-acquire synchronize with store-release of ht->tail
-		 * in update_tail.
+		/*
+		 * Read s->tail value. Note that it will be loaded after
+		 * d->head load, but before CAS operation for the d->head.
 		 */
 		stail = rte_atomic_load_explicit(&s->tail,
-					rte_memory_order_acquire);
+					rte_memory_order_relaxed);
 
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
@@ -112,11 +111,19 @@ __rte_ring_headtail_move_head(struct rte_ring_headtail *d,
 			d->head = *new_head;
 			success = 1;
 		} else
-			/* on failure, *old_head is updated */
+			/*
+			 * on failure, *old_head is updated.
+			 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist
+			 * barrier to prevent:
+			 *  - OOO reads of cons tail value
+			 *  - OOO copy of elems from the ring
+			 *  Also RELEASE guarantees that latest tail value
+			 *  will become visible before the new head value.
+			 */
 			success = rte_atomic_compare_exchange_strong_explicit(
 					&d->head, old_head, *new_head,
-					rte_memory_order_relaxed,
-					rte_memory_order_relaxed);
+					rte_memory_order_acq_rel,
+					rte_memory_order_acquire);
 	} while (unlikely(success == 0));
 	return n;
 }
diff --git a/lib/ring/rte_ring_hts_elem_pvt.h b/lib/ring/rte_ring_hts_elem_pvt.h
index c59e5f6420..cc593433b9 100644
--- a/lib/ring/rte_ring_hts_elem_pvt.h
+++ b/lib/ring/rte_ring_hts_elem_pvt.h
@@ -116,13 +116,15 @@ __rte_ring_hts_move_head(struct rte_ring_hts_headtail *d,
 		np.pos.head = op.pos.head + n;
 
 	/*
-	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
+	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
 	 *  - OOO reads of cons tail value
 	 *  - OOO copy of elems from the ring
+	 *   Also RELEASE guarantees that latest tail value
+	 *   will become visible before the new head value.
 	 */
 	} while (rte_atomic_compare_exchange_strong_explicit(&d->ht.raw,
 			(uint64_t *)(uintptr_t)&op.raw, np.raw,
-			rte_memory_order_acquire,
+			rte_memory_order_acq_rel,
 			rte_memory_order_acquire) == 0);
 
 	*old_head = op.pos.head;
diff --git a/lib/ring/rte_ring_rts_elem_pvt.h b/lib/ring/rte_ring_rts_elem_pvt.h
index 509fa674fb..860b13cc61 100644
--- a/lib/ring/rte_ring_rts_elem_pvt.h
+++ b/lib/ring/rte_ring_rts_elem_pvt.h
@@ -131,13 +131,15 @@ __rte_ring_rts_move_head(struct rte_ring_rts_headtail *d,
 		nh.val.cnt = oh.val.cnt + 1;
 
 	/*
-	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
+	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
 	 *  - OOO reads of cons tail value
 	 *  - OOO copy of elems to the ring
+	 *  Also RELEASE guarantees that latest tail value
+	 *  will become visible before the new head value.
 	 */
 	} while (rte_atomic_compare_exchange_strong_explicit(&d->head.raw,
 			(uint64_t *)(uintptr_t)&oh.raw, nh.raw,
-			rte_memory_order_acquire,
+			rte_memory_order_acq_rel,
 			rte_memory_order_acquire) == 0);
 
 	*old_head = oh.val.pos;
diff --git a/lib/ring/soring.c b/lib/ring/soring.c
index 7bcbf35516..21a1a27e24 100644
--- a/lib/ring/soring.c
+++ b/lib/ring/soring.c
@@ -123,8 +123,6 @@ __rte_soring_stage_finalize(struct soring_stage_headtail *sht, uint32_t stage,
 	rte_atomic_store_explicit(&sht->tail.raw, ot.raw,
 			rte_memory_order_release);
 
-	/* make sure that new tail value is visible */
-	rte_atomic_thread_fence(rte_memory_order_release);
 	return i;
 }
 
@@ -219,9 +217,6 @@ __rte_soring_update_tail(struct __rte_ring_headtail *rht,
 		/* unsupported mode, shouldn't be here */
 		RTE_ASSERT(0);
 	}
-
-	/* make sure that new tail value is visible */
-	rte_atomic_thread_fence(rte_memory_order_release);
 }
 
 static __rte_always_inline uint32_t
-- 
2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v1 4/4] config/x86: enable RTE_USE_C11_MEM_MODEL by default
  2025-05-21 11:14 [PATCH v1 0/4] ring: some fixes and improvements Konstantin Ananyev
                   ` (2 preceding siblings ...)
  2025-05-21 11:14 ` [PATCH v1 3/4] ring: fix potential sync issue between head and tail values Konstantin Ananyev
@ 2025-05-21 11:14 ` Konstantin Ananyev
  2025-05-21 19:47   ` Morten Brørup
  3 siblings, 1 reply; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-21 11:14 UTC (permalink / raw)
  To: dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

As an attempt to reduce legacy code usage within rte_ring -
enable RTE_USE_C11_MEM_MODEL by default on all x86 platforms.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 config/x86/meson.build | 1 +
 1 file changed, 1 insertion(+)

diff --git a/config/x86/meson.build b/config/x86/meson.build
index c3564b0011..5528eb4960 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -21,6 +21,7 @@ endif
 
 dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
 dpdk_conf.set('RTE_MAX_LCORE', 128)
+dpdk_conf.set('RTE_USE_C11_MEM_MODEL', true)
 
 epyc_zen_cores = {
     '__znver5__':768,
-- 
2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 1/4] ring: introduce extra run-time checks
  2025-05-21 11:14 ` [PATCH v1 1/4] ring: introduce extra run-time checks Konstantin Ananyev
@ 2025-05-21 12:14   ` Morten Brørup
  2025-05-21 12:34     ` Konstantin Ananyev
  0 siblings, 1 reply; 18+ messages in thread
From: Morten Brørup @ 2025-05-21 12:14 UTC (permalink / raw)
  To: Konstantin Ananyev, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Wednesday, 21 May 2025 13.14
> 
> Add RTE_ASSERT() to check that different move_tail() flavors
> return meaningful  *entries value.
> It also helps to ensure that inside move_tail(), it uses correct
> head/tail values.
> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> ---
>  lib/ring/rte_ring_c11_pvt.h      | 2 +-
>  lib/ring/rte_ring_elem_pvt.h     | 8 ++++++--
>  lib/ring/rte_ring_hts_elem_pvt.h | 8 ++++++--
>  lib/ring/rte_ring_rts_elem_pvt.h | 8 ++++++--
>  lib/ring/soring.c                | 2 ++
>  5 files changed, 21 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
> index b9388af0da..0845cd6dcf 100644
> --- a/lib/ring/rte_ring_c11_pvt.h
> +++ b/lib/ring/rte_ring_c11_pvt.h
> @@ -104,10 +104,10 @@ __rte_ring_headtail_move_head(struct
> rte_ring_headtail *d,
>  			n = (behavior == RTE_RING_QUEUE_FIXED) ?
>  					0 : *entries;
> 
> +		*new_head = *old_head + n;
>  		if (n == 0)
>  			return 0;
> 
> -		*new_head = *old_head + n;
>  		if (is_st) {
>  			d->head = *new_head;
>  			success = 1;

Is there a need to assign a value to *new_head if n==0?

I don't think your suggestion is multi-thread safe.
If d->head moves, the value in *new_head will be incorrect.

Instead, suggest:

-		if (n == 0)
-			return 0;

		*new_head = *old_head + n;
		if (is_st) {
			d->head = *new_head;
			success = 1;
		} else
			/* on failure, *old_head is updated */
			success = rte_atomic_compare_exchange_strong_explicit(
					&d->head, old_head, *new_head,
					rte_memory_order_relaxed,
					rte_memory_order_relaxed);
	} while (unlikely(success == 0));


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 1/4] ring: introduce extra run-time checks
  2025-05-21 12:14   ` Morten Brørup
@ 2025-05-21 12:34     ` Konstantin Ananyev
  2025-05-21 18:36       ` Morten Brørup
  0 siblings, 1 reply; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-21 12:34 UTC (permalink / raw)
  To: Morten Brørup, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc



> > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > Sent: Wednesday, 21 May 2025 13.14
> >
> > Add RTE_ASSERT() to check that different move_tail() flavors
> > return meaningful  *entries value.
> > It also helps to ensure that inside move_tail(), it uses correct
> > head/tail values.
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> > ---
> >  lib/ring/rte_ring_c11_pvt.h      | 2 +-
> >  lib/ring/rte_ring_elem_pvt.h     | 8 ++++++--
> >  lib/ring/rte_ring_hts_elem_pvt.h | 8 ++++++--
> >  lib/ring/rte_ring_rts_elem_pvt.h | 8 ++++++--
> >  lib/ring/soring.c                | 2 ++
> >  5 files changed, 21 insertions(+), 7 deletions(-)
> >
> > diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
> > index b9388af0da..0845cd6dcf 100644
> > --- a/lib/ring/rte_ring_c11_pvt.h
> > +++ b/lib/ring/rte_ring_c11_pvt.h
> > @@ -104,10 +104,10 @@ __rte_ring_headtail_move_head(struct
> > rte_ring_headtail *d,
> >  			n = (behavior == RTE_RING_QUEUE_FIXED) ?
> >  					0 : *entries;
> >
> > +		*new_head = *old_head + n;
> >  		if (n == 0)
> >  			return 0;
> >
> > -		*new_head = *old_head + n;
> >  		if (is_st) {
> >  			d->head = *new_head;
> >  			success = 1;
> 
> Is there a need to assign a value to *new_head if n==0?

Not really, main reason I just moved this line up - to keep compiler happy.
Otherwise it complained that *new_head might be left uninitialized.
 
> I don't think your suggestion is multi-thread safe.
> If d->head moves, the value in *new_head will be incorrect.

If d->head moves, then *old_head will also be incorrect.
For that case we do have CAS() below, it will return zero if (d->head != *old_head)
and we shall go to the next iteration (attempt).
Basically - if n == 0, your *old_head and *new_head might be invalid and should not be used
(and they are not used).  

> Instead, suggest:
> 
> -		if (n == 0)
> -			return 0;
> 
> 		*new_head = *old_head + n;
> 		if (is_st) {
> 			d->head = *new_head;
> 			success = 1;
> 		} else
> 			/* on failure, *old_head is updated */
> 			success = rte_atomic_compare_exchange_strong_explicit(
> 					&d->head, old_head, *new_head,
> 					rte_memory_order_relaxed,
> 					rte_memory_order_relaxed);
> 	} while (unlikely(success == 0));

That's possible, but if (n ==0) we probably don't want to update the head -
as we are not moving head - it is pointless, while still expensive. 




^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 1/4] ring: introduce extra run-time checks
  2025-05-21 12:34     ` Konstantin Ananyev
@ 2025-05-21 18:36       ` Morten Brørup
  2025-05-21 19:38         ` Konstantin Ananyev
  0 siblings, 1 reply; 18+ messages in thread
From: Morten Brørup @ 2025-05-21 18:36 UTC (permalink / raw)
  To: Konstantin Ananyev, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Wednesday, 21 May 2025 14.35
> 
> > > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > > Sent: Wednesday, 21 May 2025 13.14
> > >
> > > Add RTE_ASSERT() to check that different move_tail() flavors
> > > return meaningful  *entries value.
> > > It also helps to ensure that inside move_tail(), it uses correct
> > > head/tail values.
> > >
> > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> > > ---
> > >  lib/ring/rte_ring_c11_pvt.h      | 2 +-
> > >  lib/ring/rte_ring_elem_pvt.h     | 8 ++++++--
> > >  lib/ring/rte_ring_hts_elem_pvt.h | 8 ++++++--
> > >  lib/ring/rte_ring_rts_elem_pvt.h | 8 ++++++--
> > >  lib/ring/soring.c                | 2 ++
> > >  5 files changed, 21 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/lib/ring/rte_ring_c11_pvt.h
> b/lib/ring/rte_ring_c11_pvt.h
> > > index b9388af0da..0845cd6dcf 100644
> > > --- a/lib/ring/rte_ring_c11_pvt.h
> > > +++ b/lib/ring/rte_ring_c11_pvt.h
> > > @@ -104,10 +104,10 @@ __rte_ring_headtail_move_head(struct
> > > rte_ring_headtail *d,
> > >  			n = (behavior == RTE_RING_QUEUE_FIXED) ?
> > >  					0 : *entries;
> > >
> > > +		*new_head = *old_head + n;
> > >  		if (n == 0)
> > >  			return 0;
> > >
> > > -		*new_head = *old_head + n;
> > >  		if (is_st) {
> > >  			d->head = *new_head;
> > >  			success = 1;
> >
> > Is there a need to assign a value to *new_head if n==0?
> 
> Not really, main reason I just moved this line up - to keep compiler
> happy.
> Otherwise it complained that *new_head might be left uninitialized.

Your change might give the impression that *new_head is used by a caller. (Like I asked about.)
To please the compiler, you could mark new_head __rte_unused, or:

-		if (n == 0)
+		if (n == 0) {
+			RTE_SET_USED(new_head);
			return 0;
+		}

> 
> > I don't think your suggestion is multi-thread safe.
> > If d->head moves, the value in *new_head will be incorrect.
> 
> If d->head moves, then *old_head will also be incorrect.
> For that case we do have CAS() below, it will return zero if (d->head
> != *old_head)
> and we shall go to the next iteration (attempt).

Exactly.
And with my suggestion the same will happen if n==0, and the next attempt will update them both, until they are both correct.

> Basically - if n == 0, your *old_head and *new_head might be invalid
> and should not be used
> (and they are not used).
> 
> > Instead, suggest:
> >
> > -		if (n == 0)
> > -			return 0;
> >
> > 		*new_head = *old_head + n;
> > 		if (is_st) {
> > 			d->head = *new_head;
> > 			success = 1;
> > 		} else
> > 			/* on failure, *old_head is updated */
> > 			success =
> rte_atomic_compare_exchange_strong_explicit(
> > 					&d->head, old_head, *new_head,
> > 					rte_memory_order_relaxed,
> > 					rte_memory_order_relaxed);
> > 	} while (unlikely(success == 0));
> 
> That's possible, but if (n ==0) we probably don't want to update the
> head -
> as we are not moving head - it is pointless, while still expensive.

Agree. Let's avoid unnecessary cost!
My suggestion was only relevant if *new_head needed to be updated when n==0.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 1/4] ring: introduce extra run-time checks
  2025-05-21 18:36       ` Morten Brørup
@ 2025-05-21 19:38         ` Konstantin Ananyev
  2025-05-21 22:02           ` Morten Brørup
  0 siblings, 1 reply; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-21 19:38 UTC (permalink / raw)
  To: Morten Brørup, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc



> > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > Sent: Wednesday, 21 May 2025 14.35
> >
> > > > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > > > Sent: Wednesday, 21 May 2025 13.14
> > > >
> > > > Add RTE_ASSERT() to check that different move_tail() flavors
> > > > return meaningful  *entries value.
> > > > It also helps to ensure that inside move_tail(), it uses correct
> > > > head/tail values.
> > > >
> > > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> > > > ---
> > > >  lib/ring/rte_ring_c11_pvt.h      | 2 +-
> > > >  lib/ring/rte_ring_elem_pvt.h     | 8 ++++++--
> > > >  lib/ring/rte_ring_hts_elem_pvt.h | 8 ++++++--
> > > >  lib/ring/rte_ring_rts_elem_pvt.h | 8 ++++++--
> > > >  lib/ring/soring.c                | 2 ++
> > > >  5 files changed, 21 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/lib/ring/rte_ring_c11_pvt.h
> > b/lib/ring/rte_ring_c11_pvt.h
> > > > index b9388af0da..0845cd6dcf 100644
> > > > --- a/lib/ring/rte_ring_c11_pvt.h
> > > > +++ b/lib/ring/rte_ring_c11_pvt.h
> > > > @@ -104,10 +104,10 @@ __rte_ring_headtail_move_head(struct
> > > > rte_ring_headtail *d,
> > > >  			n = (behavior == RTE_RING_QUEUE_FIXED) ?
> > > >  					0 : *entries;
> > > >
> > > > +		*new_head = *old_head + n;
> > > >  		if (n == 0)
> > > >  			return 0;
> > > >
> > > > -		*new_head = *old_head + n;
> > > >  		if (is_st) {
> > > >  			d->head = *new_head;
> > > >  			success = 1;
> > >
> > > Is there a need to assign a value to *new_head if n==0?
> >
> > Not really, main reason I just moved this line up - to keep compiler
> > happy.
> > Otherwise it complained that *new_head might be left uninitialized.
> 
> Your change might give the impression that *new_head is used by a caller. (Like I asked about.)
> To please the compiler, you could mark new_head __rte_unused, or:
> 
> -		if (n == 0)
> +		if (n == 0) {
> +			RTE_SET_USED(new_head);
> 			return 0;
> +		}
> 
> >

Makes sense, will re-spin.
Do you have any comments for other patches in the series?
Thanks
Konstantin 


> > > I don't think your suggestion is multi-thread safe.
> > > If d->head moves, the value in *new_head will be incorrect.
> >
> > If d->head moves, then *old_head will also be incorrect.
> > For that case we do have CAS() below, it will return zero if (d->head
> > != *old_head)
> > and we shall go to the next iteration (attempt).
> 
> Exactly.
> And with my suggestion the same will happen if n==0, and the next attempt will update them both, until they are both correct.
> 
> > Basically - if n == 0, your *old_head and *new_head might be invalid
> > and should not be used
> > (and they are not used).
> >
> > > Instead, suggest:
> > >
> > > -		if (n == 0)
> > > -			return 0;
> > >
> > > 		*new_head = *old_head + n;
> > > 		if (is_st) {
> > > 			d->head = *new_head;
> > > 			success = 1;
> > > 		} else
> > > 			/* on failure, *old_head is updated */
> > > 			success =
> > rte_atomic_compare_exchange_strong_explicit(
> > > 					&d->head, old_head, *new_head,
> > > 					rte_memory_order_relaxed,
> > > 					rte_memory_order_relaxed);
> > > 	} while (unlikely(success == 0));
> >
> > That's possible, but if (n ==0) we probably don't want to update the
> > head -
> > as we are not moving head - it is pointless, while still expensive.
> 
> Agree. Let's avoid unnecessary cost!
> My suggestion was only relevant if *new_head needed to be updated when n==0.
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 4/4] config/x86: enable RTE_USE_C11_MEM_MODEL by default
  2025-05-21 11:14 ` [PATCH v1 4/4] config/x86: enable RTE_USE_C11_MEM_MODEL by default Konstantin Ananyev
@ 2025-05-21 19:47   ` Morten Brørup
  0 siblings, 0 replies; 18+ messages in thread
From: Morten Brørup @ 2025-05-21 19:47 UTC (permalink / raw)
  To: Konstantin Ananyev, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Wednesday, 21 May 2025 13.15
> 
> As an attempt to reduce legacy code usage within rte_ring -
> enable RTE_USE_C11_MEM_MODEL by default on all x86 platforms.

In theory, after requiring C11, RTE_USE_C11_MEM_MODEL should be the default on all platforms, and the legacy memory model should be optional for performance.

But IIRC, significant performance differences made the legacy memory model better for some platforms, so keeping legacy the default for those platforms makes sense.

If this patch series solved the C11 performance issues on those other platforms, we can get rid of the ring operations using the legacy memory model. Such a change would be suitable for an LTS release.

That said, we could postpone this patch 4 to the 25.11 release.
In theory, patch 3 replaced a rte_atomic_thread_fence() - affecting all memory operations - with some atomic functions affecting only the ring head/tail memory operations.
However, if that theoretical change is 100 % irrelevant on any x86 CPU in reality, then it's acceptable to not wait for 25.11.

> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> ---
>  config/x86/meson.build | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/config/x86/meson.build b/config/x86/meson.build
> index c3564b0011..5528eb4960 100644
> --- a/config/x86/meson.build
> +++ b/config/x86/meson.build
> @@ -21,6 +21,7 @@ endif
> 
>  dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
>  dpdk_conf.set('RTE_MAX_LCORE', 128)
> +dpdk_conf.set('RTE_USE_C11_MEM_MODEL', true)
> 
>  epyc_zen_cores = {
>      '__znver5__':768,
> --
> 2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 3/4] ring: fix potential sync issue between head and tail values
  2025-05-21 11:14 ` [PATCH v1 3/4] ring: fix potential sync issue between head and tail values Konstantin Ananyev
@ 2025-05-21 20:26   ` Morten Brørup
  2025-05-22 22:03   ` Wathsala Wathawana Vithanage
  1 sibling, 0 replies; 18+ messages in thread
From: Morten Brørup @ 2025-05-21 20:26 UTC (permalink / raw)
  To: Konstantin Ananyev, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Wednesday, 21 May 2025 13.15
> 
> This patch aims several purposes:
> - provide an alternative (and I think a better) way to fix the
>   issue discussed in previous patch:
>   "ring/soring: fix synchronization issue between head and tail values"
> - make sure that such problem wouldn’t happen within other usages of
>   __rte_ring_headtail_move_head() – both current rte_ring
>   implementation and possible future use-cases.
> - step towards unification of move_head() implementations and
>   removing rte_ring_generic_pvt.h
> It uses Acquire-Release memory ordering for CAS operation in
> move_head().
> That guarantees that corresponding ‘tail’ updates will be visible
> before current ‘head’ is updated.
> As I said before: I think that in theory the problem described in
> previous patch might happen with our conventional rte_ring too
> (when RTE_USE_C11_MEM_MODEL enabled).
> But, so far I didn’t manage to reproduce it in reality.

Overall, I think the code becomes more elegant and much easier to understand with this patch, where the atomic operations are performed explicitly on the head/tail, eliminating the need for the broad-reaching rte_atomic_thread_fence().

The detailed inline code comments are also a good improvement.

> For that reason and also because it touches a critical rte_ring code-
> path,
> I put these changes into a separate patch. Expect all interested
> stakeholders to come-up with their comments and observations.
> Regarding performance impact – on my boxes both ring_perf_autotest and
> ring_stress_autotest – show a mixed set of results: some of them become
> few cycles faster, another few cycles slower.
> But so far, I didn’t notice any real degradations with that patch.

Maybe it was the broad-reaching rte_atomic_thread_fence() that made the C11 variant slow on other architectures.
This makes me curious about performance results on other architectures with this patch.

> 
> Fixes: b5458e2cc483 ("ring: introduce staged ordered ring")
> Fixes: 1cc363b8ce06 ("ring: introduce HTS ring mode")
> Fixes: e6ba4731c0f3 ("ring: introduce RTS ring mode")
> Fixes: 49594a63147a ("ring/c11: relax ordering for load and store of
> the head")
> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> ---
>  lib/ring/rte_ring_c11_pvt.h      | 27 +++++++++++++++++----------
>  lib/ring/rte_ring_hts_elem_pvt.h |  6 ++++--
>  lib/ring/rte_ring_rts_elem_pvt.h |  6 ++++--
>  lib/ring/soring.c                |  5 -----
>  4 files changed, 25 insertions(+), 19 deletions(-)
> 
> diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
> index 0845cd6dcf..6d1c46df9a 100644
> --- a/lib/ring/rte_ring_c11_pvt.h
> +++ b/lib/ring/rte_ring_c11_pvt.h
> @@ -77,20 +77,19 @@ __rte_ring_headtail_move_head(struct
> rte_ring_headtail *d,
>  	int success;
>  	unsigned int max = n;
> 
> +	/* Ensure the head is read before tail */

Maybe "d->head" and "s->tail" instead of "head" and "tail".

>  	*old_head = rte_atomic_load_explicit(&d->head,
> -			rte_memory_order_relaxed);
> +			rte_memory_order_acquire);
>  	do {
>  		/* Reset n to the initial burst count */
>  		n = max;
> 
> -		/* Ensure the head is read before tail */
> -		rte_atomic_thread_fence(rte_memory_order_acquire);
> -
> -		/* load-acquire synchronize with store-release of ht->tail
> -		 * in update_tail.
> +		/*
> +		 * Read s->tail value. Note that it will be loaded after
> +		 * d->head load, but before CAS operation for the d->head.
>  		 */
>  		stail = rte_atomic_load_explicit(&s->tail,
> -					rte_memory_order_acquire);
> +					rte_memory_order_relaxed);
> 
>  		/* The subtraction is done between two unsigned 32bits
> value
>  		 * (the result is always modulo 32 bits even if we have
> @@ -112,11 +111,19 @@ __rte_ring_headtail_move_head(struct
> rte_ring_headtail *d,
>  			d->head = *new_head;
>  			success = 1;
>  		} else
> -			/* on failure, *old_head is updated */
> +			/*
> +			 * on failure, *old_head is updated.
> +			 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist
> +			 * barrier to prevent:
> +			 *  - OOO reads of cons tail value
> +			 *  - OOO copy of elems from the ring

It's not really the ACQ_REL that does this. It's the ACQUIRE.
So this comment needs some adjustment.

Also maybe "s->tail" instead of "cons tail".

> +			 *  Also RELEASE guarantees that latest tail value

Maybe "latest s->tail" instead of "latest tail".

> +			 *  will become visible before the new head value.

Maybe "new d->head value" instead of "new head value".

> +			 */
>  			success =
> rte_atomic_compare_exchange_strong_explicit(
>  					&d->head, old_head, *new_head,
> -					rte_memory_order_relaxed,
> -					rte_memory_order_relaxed);
> +					rte_memory_order_acq_rel,
> +					rte_memory_order_acquire);
>  	} while (unlikely(success == 0));
>  	return n;
>  }

I haven't reviewed the remaining changes in detail, but I think my feedback that the comments should mention ACQUIRE instead of ACQ_REL also apply to the other files.

> diff --git a/lib/ring/rte_ring_hts_elem_pvt.h
> b/lib/ring/rte_ring_hts_elem_pvt.h

> diff --git a/lib/ring/rte_ring_rts_elem_pvt.h
> b/lib/ring/rte_ring_rts_elem_pvt.h

> diff --git a/lib/ring/soring.c b/lib/ring/soring.c


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 1/4] ring: introduce extra run-time checks
  2025-05-21 19:38         ` Konstantin Ananyev
@ 2025-05-21 22:02           ` Morten Brørup
  2025-05-26  8:39             ` Konstantin Ananyev
  0 siblings, 1 reply; 18+ messages in thread
From: Morten Brørup @ 2025-05-21 22:02 UTC (permalink / raw)
  To: Konstantin Ananyev, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Wednesday, 21 May 2025 21.39
> 
> > > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > > Sent: Wednesday, 21 May 2025 14.35
> > >
> > > > > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > > > > Sent: Wednesday, 21 May 2025 13.14
> > > > >
> > > > > Add RTE_ASSERT() to check that different move_tail() flavors
> > > > > return meaningful  *entries value.
> > > > > It also helps to ensure that inside move_tail(), it uses
> correct
> > > > > head/tail values.
> > > > >
> > > > > Signed-off-by: Konstantin Ananyev
> <konstantin.ananyev@huawei.com>
> > > > > ---
> > > > >  lib/ring/rte_ring_c11_pvt.h      | 2 +-
> > > > >  lib/ring/rte_ring_elem_pvt.h     | 8 ++++++--
> > > > >  lib/ring/rte_ring_hts_elem_pvt.h | 8 ++++++--
> > > > >  lib/ring/rte_ring_rts_elem_pvt.h | 8 ++++++--
> > > > >  lib/ring/soring.c                | 2 ++
> > > > >  5 files changed, 21 insertions(+), 7 deletions(-)
> > > > >
> > > > > diff --git a/lib/ring/rte_ring_c11_pvt.h
> > > b/lib/ring/rte_ring_c11_pvt.h
> > > > > index b9388af0da..0845cd6dcf 100644
> > > > > --- a/lib/ring/rte_ring_c11_pvt.h
> > > > > +++ b/lib/ring/rte_ring_c11_pvt.h
> > > > > @@ -104,10 +104,10 @@ __rte_ring_headtail_move_head(struct
> > > > > rte_ring_headtail *d,
> > > > >  			n = (behavior == RTE_RING_QUEUE_FIXED) ?
> > > > >  					0 : *entries;
> > > > >
> > > > > +		*new_head = *old_head + n;
> > > > >  		if (n == 0)
> > > > >  			return 0;
> > > > >
> > > > > -		*new_head = *old_head + n;
> > > > >  		if (is_st) {
> > > > >  			d->head = *new_head;
> > > > >  			success = 1;
> > > >
> > > > Is there a need to assign a value to *new_head if n==0?
> > >
> > > Not really, main reason I just moved this line up - to keep
> compiler
> > > happy.
> > > Otherwise it complained that *new_head might be left uninitialized.
> >
> > Your change might give the impression that *new_head is used by a
> caller. (Like I asked about.)
> > To please the compiler, you could mark new_head __rte_unused, or:
> >
> > -		if (n == 0)
> > +		if (n == 0) {
> > +			RTE_SET_USED(new_head);
> > 			return 0;
> > +		}
> >
> > >
> 
> Makes sense, will re-spin.

I'm having second thoughts about treating this compiler warning as a false positive!

Are you 100 % sure that no caller uses *new_head?
I suppose you are, because it wasn't set before this patch either, so the existing code would have a bug if some caller uses it.

But the documentation does not mention that *new_head is not set if the function returns 0.
So, some future caller might use *new_head, thus introducing a bug when n==0.

But most importantly for the performance discussion, the costly CAS is only pointless when n==0.
So, if n==0 is very unlikely, we could accept this pointless cost.
And it would save us the cost of "if (n==0) return 0;" in the hot code path.

And, as a consequence, some of the callers of this function could also remove their special handling of __rte_ring_headtail_move_head() returning 0. (Likewise, only if a return value of 0 is unlikely, and the special handling has a cost in the hot code path for non-zero return values.)

> Do you have any comments for other patches in the series?
> Thanks
> Konstantin
> 
> 
> > > > I don't think your suggestion is multi-thread safe.
> > > > If d->head moves, the value in *new_head will be incorrect.
> > >
> > > If d->head moves, then *old_head will also be incorrect.
> > > For that case we do have CAS() below, it will return zero if (d-
> >head
> > > != *old_head)
> > > and we shall go to the next iteration (attempt).
> >
> > Exactly.
> > And with my suggestion the same will happen if n==0, and the next
> attempt will update them both, until they are both correct.
> >
> > > Basically - if n == 0, your *old_head and *new_head might be
> invalid
> > > and should not be used
> > > (and they are not used).
> > >
> > > > Instead, suggest:
> > > >
> > > > -		if (n == 0)
> > > > -			return 0;
> > > >
> > > > 		*new_head = *old_head + n;
> > > > 		if (is_st) {
> > > > 			d->head = *new_head;
> > > > 			success = 1;
> > > > 		} else
> > > > 			/* on failure, *old_head is updated */
> > > > 			success =
> > > rte_atomic_compare_exchange_strong_explicit(
> > > > 					&d->head, old_head, *new_head,
> > > > 					rte_memory_order_relaxed,
> > > > 					rte_memory_order_relaxed);
> > > > 	} while (unlikely(success == 0));
> > >
> > > That's possible, but if (n ==0) we probably don't want to update
> the
> > > head -
> > > as we are not moving head - it is pointless, while still expensive.
> >
> > Agree. Let's avoid unnecessary cost!
> > My suggestion was only relevant if *new_head needed to be updated
> when n==0.
> >


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 3/4] ring: fix potential sync issue between head and tail values
  2025-05-21 11:14 ` [PATCH v1 3/4] ring: fix potential sync issue between head and tail values Konstantin Ananyev
  2025-05-21 20:26   ` Morten Brørup
@ 2025-05-22 22:03   ` Wathsala Wathawana Vithanage
  2025-05-26 11:54     ` Konstantin Ananyev
  1 sibling, 1 reply; 18+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-05-22 22:03 UTC (permalink / raw)
  To: Konstantin Ananyev, dev
  Cc: Honnappa Nagarahalli, jerinj, hemant.agrawal, drc, nd

Hi Konstantin,

In rte_ring the store-release on tail update guarantees that CAS
won't get reordered with the store-released of the tail update.

So, the sequence of events would look like this (combined view
of head and tail update)

Relaxed-load(new_head,  N)                              ----------------> (A)
Relaxed-CAS(d->head, new_head, old_head)   ----------------> (B)
Store-release-store(d->tail, new_head)             ----------------> (C)                                               

If we look at address dependencies, then...

(B) depends on (A) due to new_head address dependency.
(C) depends on (A) due to new_head address dependency.

So, dependency graph looks like this
       (A)
    /       \
   v        v
 (B)     (C)

There is no implicit dependence between (B) and (C), I think 
this is the issue you are bringing up. 
Even though there is no dependence between the two, 
the store-release of (C) ensures that (B) won't drop below it.
Therefore, the above graph can be turned into an ordered
sequence as shown below..

(A) -> (B) -> (C)

I haven't looked at the so-ring yet. Could it be possible that the
issue is due to something else introduced in that code?

Thanks,

--wathsala

> This patch aims several purposes:
> - provide an alternative (and I think a better) way to fix the
>   issue discussed in previous patch:
>   "ring/soring: fix synchronization issue between head and tail values"
> - make sure that such problem wouldn’t happen within other usages of
>   __rte_ring_headtail_move_head() – both current rte_ring
>   implementation and possible future use-cases.
> - step towards unification of move_head() implementations and
>   removing rte_ring_generic_pvt.h
> It uses Acquire-Release memory ordering for CAS operation in move_head().
> That guarantees that corresponding ‘tail’ updates will be visible before current
> ‘head’ is updated.
> As I said before: I think that in theory the problem described in previous patch
> might happen with our conventional rte_ring too (when
> RTE_USE_C11_MEM_MODEL enabled).
> But, so far I didn’t manage to reproduce it in reality.
> For that reason and also because it touches a critical rte_ring code-path, I put
> these changes into a separate patch. Expect all interested stakeholders to come-
> up with their comments and observations.
> Regarding performance impact – on my boxes both ring_perf_autotest and
> ring_stress_autotest – show a mixed set of results: some of them become few
> cycles faster, another few cycles slower.
> But so far, I didn’t notice any real degradations with that patch.
> 
> Fixes: b5458e2cc483 ("ring: introduce staged ordered ring")
> Fixes: 1cc363b8ce06 ("ring: introduce HTS ring mode")
> Fixes: e6ba4731c0f3 ("ring: introduce RTS ring mode")
> Fixes: 49594a63147a ("ring/c11: relax ordering for load and store of the head")
> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> ---
>  lib/ring/rte_ring_c11_pvt.h      | 27 +++++++++++++++++----------
>  lib/ring/rte_ring_hts_elem_pvt.h |  6 ++++--  lib/ring/rte_ring_rts_elem_pvt.h
> |  6 ++++--
>  lib/ring/soring.c                |  5 -----
>  4 files changed, 25 insertions(+), 19 deletions(-)
> 
> diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h index
> 0845cd6dcf..6d1c46df9a 100644
> --- a/lib/ring/rte_ring_c11_pvt.h
> +++ b/lib/ring/rte_ring_c11_pvt.h
> @@ -77,20 +77,19 @@ __rte_ring_headtail_move_head(struct
> rte_ring_headtail *d,
>  	int success;
>  	unsigned int max = n;
> 
> +	/* Ensure the head is read before tail */
>  	*old_head = rte_atomic_load_explicit(&d->head,
> -			rte_memory_order_relaxed);
> +			rte_memory_order_acquire);
>  	do {
>  		/* Reset n to the initial burst count */
>  		n = max;
> 
> -		/* Ensure the head is read before tail */
> -		rte_atomic_thread_fence(rte_memory_order_acquire);
> -
> -		/* load-acquire synchronize with store-release of ht->tail
> -		 * in update_tail.
> +		/*
> +		 * Read s->tail value. Note that it will be loaded after
> +		 * d->head load, but before CAS operation for the d->head.
>  		 */
>  		stail = rte_atomic_load_explicit(&s->tail,
> -					rte_memory_order_acquire);
> +					rte_memory_order_relaxed);
> 
>  		/* The subtraction is done between two unsigned 32bits value
>  		 * (the result is always modulo 32 bits even if we have @@ -
> 112,11 +111,19 @@ __rte_ring_headtail_move_head(struct rte_ring_headtail
> *d,
>  			d->head = *new_head;
>  			success = 1;
>  		} else
> -			/* on failure, *old_head is updated */
> +			/*
> +			 * on failure, *old_head is updated.
> +			 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist
> +			 * barrier to prevent:
> +			 *  - OOO reads of cons tail value
> +			 *  - OOO copy of elems from the ring
> +			 *  Also RELEASE guarantees that latest tail value
> +			 *  will become visible before the new head value.
> +			 */
>  			success =
> rte_atomic_compare_exchange_strong_explicit(
>  					&d->head, old_head, *new_head,
> -					rte_memory_order_relaxed,
> -					rte_memory_order_relaxed);
> +					rte_memory_order_acq_rel,
> +					rte_memory_order_acquire);
>  	} while (unlikely(success == 0));
>  	return n;
>  }
> diff --git a/lib/ring/rte_ring_hts_elem_pvt.h b/lib/ring/rte_ring_hts_elem_pvt.h
> index c59e5f6420..cc593433b9 100644
> --- a/lib/ring/rte_ring_hts_elem_pvt.h
> +++ b/lib/ring/rte_ring_hts_elem_pvt.h
> @@ -116,13 +116,15 @@ __rte_ring_hts_move_head(struct
> rte_ring_hts_headtail *d,
>  		np.pos.head = op.pos.head + n;
> 
>  	/*
> -	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
> +	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
>  	 *  - OOO reads of cons tail value
>  	 *  - OOO copy of elems from the ring
> +	 *   Also RELEASE guarantees that latest tail value
> +	 *   will become visible before the new head value.
>  	 */
>  	} while (rte_atomic_compare_exchange_strong_explicit(&d->ht.raw,
>  			(uint64_t *)(uintptr_t)&op.raw, np.raw,
> -			rte_memory_order_acquire,
> +			rte_memory_order_acq_rel,
>  			rte_memory_order_acquire) == 0);
> 
>  	*old_head = op.pos.head;
> diff --git a/lib/ring/rte_ring_rts_elem_pvt.h b/lib/ring/rte_ring_rts_elem_pvt.h
> index 509fa674fb..860b13cc61 100644
> --- a/lib/ring/rte_ring_rts_elem_pvt.h
> +++ b/lib/ring/rte_ring_rts_elem_pvt.h
> @@ -131,13 +131,15 @@ __rte_ring_rts_move_head(struct
> rte_ring_rts_headtail *d,
>  		nh.val.cnt = oh.val.cnt + 1;
> 
>  	/*
> -	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
> +	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
>  	 *  - OOO reads of cons tail value
>  	 *  - OOO copy of elems to the ring
> +	 *  Also RELEASE guarantees that latest tail value
> +	 *  will become visible before the new head value.
>  	 */
>  	} while (rte_atomic_compare_exchange_strong_explicit(&d->head.raw,
>  			(uint64_t *)(uintptr_t)&oh.raw, nh.raw,
> -			rte_memory_order_acquire,
> +			rte_memory_order_acq_rel,
>  			rte_memory_order_acquire) == 0);
> 
>  	*old_head = oh.val.pos;
> diff --git a/lib/ring/soring.c b/lib/ring/soring.c index 7bcbf35516..21a1a27e24
> 100644
> --- a/lib/ring/soring.c
> +++ b/lib/ring/soring.c
> @@ -123,8 +123,6 @@ __rte_soring_stage_finalize(struct
> soring_stage_headtail *sht, uint32_t stage,
>  	rte_atomic_store_explicit(&sht->tail.raw, ot.raw,
>  			rte_memory_order_release);
> 
> -	/* make sure that new tail value is visible */
> -	rte_atomic_thread_fence(rte_memory_order_release);
>  	return i;
>  }
> 
> @@ -219,9 +217,6 @@ __rte_soring_update_tail(struct __rte_ring_headtail
> *rht,
>  		/* unsupported mode, shouldn't be here */
>  		RTE_ASSERT(0);
>  	}
> -
> -	/* make sure that new tail value is visible */
> -	rte_atomic_thread_fence(rte_memory_order_release);
>  }
> 
>  static __rte_always_inline uint32_t
> --
> 2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 1/4] ring: introduce extra run-time checks
  2025-05-21 22:02           ` Morten Brørup
@ 2025-05-26  8:39             ` Konstantin Ananyev
  2025-05-26  9:57               ` Morten Brørup
  0 siblings, 1 reply; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-26  8:39 UTC (permalink / raw)
  To: Morten Brørup, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc



> > > > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > > > Sent: Wednesday, 21 May 2025 14.35
> > > >
> > > > > > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > > > > > Sent: Wednesday, 21 May 2025 13.14
> > > > > >
> > > > > > Add RTE_ASSERT() to check that different move_tail() flavors
> > > > > > return meaningful  *entries value.
> > > > > > It also helps to ensure that inside move_tail(), it uses
> > correct
> > > > > > head/tail values.
> > > > > >
> > > > > > Signed-off-by: Konstantin Ananyev
> > <konstantin.ananyev@huawei.com>
> > > > > > ---
> > > > > >  lib/ring/rte_ring_c11_pvt.h      | 2 +-
> > > > > >  lib/ring/rte_ring_elem_pvt.h     | 8 ++++++--
> > > > > >  lib/ring/rte_ring_hts_elem_pvt.h | 8 ++++++--
> > > > > >  lib/ring/rte_ring_rts_elem_pvt.h | 8 ++++++--
> > > > > >  lib/ring/soring.c                | 2 ++
> > > > > >  5 files changed, 21 insertions(+), 7 deletions(-)
> > > > > >
> > > > > > diff --git a/lib/ring/rte_ring_c11_pvt.h
> > > > b/lib/ring/rte_ring_c11_pvt.h
> > > > > > index b9388af0da..0845cd6dcf 100644
> > > > > > --- a/lib/ring/rte_ring_c11_pvt.h
> > > > > > +++ b/lib/ring/rte_ring_c11_pvt.h
> > > > > > @@ -104,10 +104,10 @@ __rte_ring_headtail_move_head(struct
> > > > > > rte_ring_headtail *d,
> > > > > >  			n = (behavior == RTE_RING_QUEUE_FIXED) ?
> > > > > >  					0 : *entries;
> > > > > >
> > > > > > +		*new_head = *old_head + n;
> > > > > >  		if (n == 0)
> > > > > >  			return 0;
> > > > > >
> > > > > > -		*new_head = *old_head + n;
> > > > > >  		if (is_st) {
> > > > > >  			d->head = *new_head;
> > > > > >  			success = 1;
> > > > >
> > > > > Is there a need to assign a value to *new_head if n==0?
> > > >
> > > > Not really, main reason I just moved this line up - to keep
> > compiler
> > > > happy.
> > > > Otherwise it complained that *new_head might be left uninitialized.
> > >
> > > Your change might give the impression that *new_head is used by a
> > caller. (Like I asked about.)
> > > To please the compiler, you could mark new_head __rte_unused, or:
> > >
> > > -		if (n == 0)
> > > +		if (n == 0) {
> > > +			RTE_SET_USED(new_head);
> > > 			return 0;
> > > +		}

Actually, that wouldn't help.
For some reason, after introducing RTE_ASSERT()  gcc13 believes that now cons_next can
be used (stored) uninitialized here:

        n = __rte_ring_move_cons_head(r, (int)is_sc, n, behavior,
                        &cons_head, &cons_next, &entries);
        if (n == 0)
                goto end;

        __rte_ring_dequeue_elems(r, cons_head, obj_table, esize, n);

        __rte_ring_update_tail(&r->cons, cons_head, cons_next, is_sc, 0);

end:
   ...

For me it is a false positive, somehow it missed that if (n==0) then update_tail()
wouldn't be called at all. Full error message below.
So making new_head always initialized, even if we are not going to use, seems
like the simplest and cleanest way to fix it.

est-pipeline_runtime.c.o -c ../app/test-pipeline/runtime.c
In file included from ../lib/eal/include/rte_bitops.h:24,
                 from ../lib/eal/include/rte_memory.h:18,
                 from ../app/test-pipeline/runtime.c:19:
In function '__rte_ring_update_tail',
    inlined from '__rte_ring_do_dequeue_elem' at ../lib/ring/rte_ring_elem_pvt.h:472:2,
    inlined from 'rte_ring_sc_dequeue_bulk_elem' at ../lib/ring/rte_ring_elem.h:344:9,
    inlined from 'rte_ring_sc_dequeue_bulk' at ../lib/ring/rte_ring.h:402:9,
    inlined from 'app_main_loop_worker' at ../app/test-pipeline/runtime.c:91:10:
../lib/eal/include/rte_stdatomic.h:139:9: error: 'cons_next' may be used uninitialized [-Werror=maybe-uninitialized]
  139 |         __atomic_store_n(ptr, val, memorder)
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../lib/ring/rte_ring_c11_pvt.h:39:9: note: in expansion of macro 'rte_atomic_store_explicit'
   39 |         rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release);
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~
In file included from ../lib/ring/rte_ring_elem.h:20,
                 from ../lib/ring/rte_ring.h:38,
                 from ../lib/mempool/rte_mempool.h:49,
                 from ../lib/mbuf/rte_mbuf.h:38,
                 from ../lib/net/rte_ether.h:20,
                 from ../app/test-pipeline/runtime.c:31:
../lib/ring/rte_ring_elem_pvt.h: In function 'app_main_loop_worker':
../lib/ring/rte_ring_elem_pvt.h:462:29: note: 'cons_next' was declared here
  462 |         uint32_t cons_head, cons_next;
      |                             ^~~~~~~~~
In function '__rte_ring_update_tail',
    inlined from '__rte_ring_do_enqueue_elem' at ../lib/ring/rte_ring_elem_pvt.h:425:2,
    inlined from 'rte_ring_sp_enqueue_bulk_elem' at ../lib/ring/rte_ring_elem.h:159:9,
    inlined from 'rte_ring_sp_enqueue_bulk' at ../lib/ring/rte_ring.h:267:9,
    inlined from 'app_main_loop_worker' at ../app/test-pipeline/runtime.c:101:11 

> > >
> > > >
> >
> > Makes sense, will re-spin.


> I'm having second thoughts about treating this compiler warning as a false positive!
> 
> Are you 100 % sure that no caller uses *new_head?
 
Yes, I believe so. If not, then we do have a severe bug in our rte_ring.

> I suppose you are, because it wasn't set before this patch either, so the existing code would have a bug if some caller uses it.
> But the documentation does not mention that *new_head is not set if the function returns 0.
> So, some future caller might use *new_head, thus introducing a bug when n==0.
> 
> But most importantly for the performance discussion, the costly CAS is only pointless when n==0.
> So, if n==0 is very unlikely, we could accept this pointless cost.
> And it would save us the cost of "if (n==0) return 0;" in the hot code path.
> 
> And, as a consequence, some of the callers of this function could also remove their special handing of
> __rte_ring_headtail_move_head() returning 0. (Likewise, only if a return value of 0 is unlikely, and the special handling has a cost in
> the hot cod path for non-zero return values.)

I don't think it is a good idea.
First of all about the cost - I suppose that situation when 'n==0' is not so uncommon:
There is a contention on the ring (many threads try to enqueue/dequeue) to/from it.
Doing unnecessary CAS() (when n==0) we introduce extra cache-snooping to already busy memory sub-system,
i.e. we slowing down not only current threads, but all other producers/consumers of the ring.
About removing extra branches: I don't think there would be many to remove.
Usually after move_head() finishes we have to do two operations:
__rte_ring_dequeue_elems() ;
__rte_ring_update_tail();
Both have to be performed only when 'n != 0'.
Regarding the doc for the move_head() function - sure it can be updated to note explicitly
that both *old_head and *new_head will contain up-to-date values only when return value
is not zero. 

> > > > > I don't think your suggestion is multi-thread safe.
> > > > > If d->head moves, the value in *new_head will be incorrect.
> > > >
> > > > If d->head moves, then *old_head will also be incorrect.
> > > > For that case we do have CAS() below, it will return zero if (d-
> > >head
> > > > != *old_head)
> > > > and we shall go to the next iteration (attempt).
> > >
> > > Exactly.
> > > And with my suggestion the same will happen if n==0, and the next
> > attempt will update them both, until they are both correct.
> > >
> > > > Basically - if n == 0, your *old_head and *new_head might be
> > invalid
> > > > and should not be used
> > > > (and they are not used).
> > > >
> > > > > Instead, suggest:
> > > > >
> > > > > -		if (n == 0)
> > > > > -			return 0;
> > > > >
> > > > > 		*new_head = *old_head + n;
> > > > > 		if (is_st) {
> > > > > 			d->head = *new_head;
> > > > > 			success = 1;
> > > > > 		} else
> > > > > 			/* on failure, *old_head is updated */
> > > > > 			success =
> > > > rte_atomic_compare_exchange_strong_explicit(
> > > > > 					&d->head, old_head, *new_head,
> > > > > 					rte_memory_order_relaxed,
> > > > > 					rte_memory_order_relaxed);
> > > > > 	} while (unlikely(success == 0));
> > > >
> > > > That's possible, but if (n ==0) we probably don't want to update
> > the
> > > > head -
> > > > as we are not moving head - it is pointless, while still expensive.
> > >
> > > Agree. Let's avoid unnecessary cost!
> > > My suggestion was only relevant if *new_head needed to be updated
> > when n==0.
> > >
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 1/4] ring: introduce extra run-time checks
  2025-05-26  8:39             ` Konstantin Ananyev
@ 2025-05-26  9:57               ` Morten Brørup
  0 siblings, 0 replies; 18+ messages in thread
From: Morten Brørup @ 2025-05-26  9:57 UTC (permalink / raw)
  To: Konstantin Ananyev, dev; +Cc: honnappa.nagarahalli, jerinj, hemant.agrawal, drc

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Monday, 26 May 2025 10.39
> 
> > > > > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > > > > Sent: Wednesday, 21 May 2025 14.35
> > > > >
> > > > > > > From: Konstantin Ananyev
> [mailto:konstantin.ananyev@huawei.com]
> > > > > > > Sent: Wednesday, 21 May 2025 13.14
> > > > > > >
> > > > > > > Add RTE_ASSERT() to check that different move_tail()
> flavors
> > > > > > > return meaningful  *entries value.
> > > > > > > It also helps to ensure that inside move_tail(), it uses
> > > correct
> > > > > > > head/tail values.
> > > > > > >
> > > > > > > Signed-off-by: Konstantin Ananyev
> > > <konstantin.ananyev@huawei.com>
> > > > > > > ---
> > > > > > >  lib/ring/rte_ring_c11_pvt.h      | 2 +-
> > > > > > >  lib/ring/rte_ring_elem_pvt.h     | 8 ++++++--
> > > > > > >  lib/ring/rte_ring_hts_elem_pvt.h | 8 ++++++--
> > > > > > >  lib/ring/rte_ring_rts_elem_pvt.h | 8 ++++++--
> > > > > > >  lib/ring/soring.c                | 2 ++
> > > > > > >  5 files changed, 21 insertions(+), 7 deletions(-)
> > > > > > >
> > > > > > > diff --git a/lib/ring/rte_ring_c11_pvt.h
> > > > > b/lib/ring/rte_ring_c11_pvt.h
> > > > > > > index b9388af0da..0845cd6dcf 100644
> > > > > > > --- a/lib/ring/rte_ring_c11_pvt.h
> > > > > > > +++ b/lib/ring/rte_ring_c11_pvt.h
> > > > > > > @@ -104,10 +104,10 @@ __rte_ring_headtail_move_head(struct
> > > > > > > rte_ring_headtail *d,
> > > > > > >  			n = (behavior == RTE_RING_QUEUE_FIXED) ?
> > > > > > >  					0 : *entries;
> > > > > > >
> > > > > > > +		*new_head = *old_head + n;
> > > > > > >  		if (n == 0)
> > > > > > >  			return 0;
> > > > > > >
> > > > > > > -		*new_head = *old_head + n;
> > > > > > >  		if (is_st) {
> > > > > > >  			d->head = *new_head;
> > > > > > >  			success = 1;
> > > > > >
> > > > > > Is there a need to assign a value to *new_head if n==0?
> > > > >
> > > > > Not really, main reason I just moved this line up - to keep
> > > compiler
> > > > > happy.
> > > > > Otherwise it complained that *new_head might be left
> uninitialized.
> > > >
> > > > Your change might give the impression that *new_head is used by a
> > > caller. (Like I asked about.)
> > > > To please the compiler, you could mark new_head __rte_unused, or:
> > > >
> > > > -		if (n == 0)
> > > > +		if (n == 0) {
> > > > +			RTE_SET_USED(new_head);
> > > > 			return 0;
> > > > +		}
> 
> Actually, that wouldn't help.
> By some reason, after introducing RTE_ASSERT()  gcc13 believes that now
> cons_next can
> be used (stored) unfinalized here:
> 
>         n = __rte_ring_move_cons_head(r, (int)is_sc, n, behavior,
>                         &cons_head, &cons_next, &entries);
>         if (n == 0)
>                 goto end;
> 
>         __rte_ring_dequeue_elems(r, cons_head, obj_table, esize, n);
> 
>         __rte_ring_update_tail(&r->cons, cons_head, cons_next, is_sc,
> 0);
> 
> end:
>    ...
> 
> For me it is a false positive, somehow it missed that if (n==0) then
> update_tail()
> wouldn't be called  at all. Full error message below.
> So making new_head always initialized, even if we are not going to use,
> seems
> like the simplest and cleanest way to fix it.

NAK.
Initializing new_head with potential garbage will prevent the compiler from detecting that it is being used uninitialized afterwards.

If the compiler is too stupid to understand "goto end", then please rewrite the affected code instead:
-	if (n == 0)
-		goto end;
+	if (n != 0) {
	__rte_ring_dequeue_elems();
	__rte_ring_update_tail();
-end:
+	}
	...

> 
> est-pipeline_runtime.c.o -c ../app/test-pipeline/runtime.c
> In file included from ../lib/eal/include/rte_bitops.h:24,
>                  from ../lib/eal/include/rte_memory.h:18,
>                  from ../app/test-pipeline/runtime.c:19:
> In function '__rte_ring_update_tail',
>     inlined from '__rte_ring_do_dequeue_elem' at
> ../lib/ring/rte_ring_elem_pvt.h:472:2,
>     inlined from 'rte_ring_sc_dequeue_bulk_elem' at
> ../lib/ring/rte_ring_elem.h:344:9,
>     inlined from 'rte_ring_sc_dequeue_bulk' at
> ../lib/ring/rte_ring.h:402:9,
>     inlined from 'app_main_loop_worker' at ../app/test-
> pipeline/runtime.c:91:10:
> ../lib/eal/include/rte_stdatomic.h:139:9: error: 'cons_next' may be
> used uninitialized [-Werror=maybe-uninitialized]
>   139 |         __atomic_store_n(ptr, val, memorder)
>       |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> ../lib/ring/rte_ring_c11_pvt.h:39:9: note: in expansion of macro
> 'rte_atomic_store_explicit'
>    39 |         rte_atomic_store_explicit(&ht->tail, new_val,
> rte_memory_order_release);
>       |         ^~~~~~~~~~~~~~~~~~~~~~~~~
> In file included from ../lib/ring/rte_ring_elem.h:20,
>                  from ../lib/ring/rte_ring.h:38,
>                  from ../lib/mempool/rte_mempool.h:49,
>                  from ../lib/mbuf/rte_mbuf.h:38,
>                  from ../lib/net/rte_ether.h:20,
>                  from ../app/test-pipeline/runtime.c:31:
> ../lib/ring/rte_ring_elem_pvt.h: In function 'app_main_loop_worker':
> ../lib/ring/rte_ring_elem_pvt.h:462:29: note: 'cons_next' was declared
> here
>   462 |         uint32_t cons_head, cons_next;
>       |                             ^~~~~~~~~
> In function '__rte_ring_update_tail',
>     inlined from '__rte_ring_do_enqueue_elem' at
> ../lib/ring/rte_ring_elem_pvt.h:425:2,
>     inlined from 'rte_ring_sp_enqueue_bulk_elem' at
> ../lib/ring/rte_ring_elem.h:159:9,
>     inlined from 'rte_ring_sp_enqueue_bulk' at
> ../lib/ring/rte_ring.h:267:9,
>     inlined from 'app_main_loop_worker' at ../app/test-
> pipeline/runtime.c:101:11
> 
> > > >
> > > > >
> > >
> > > Makes sense, will re-spin.
> 
> 
> > I'm having second thoughts about treating this compiler warning as a
> false positive!
> >
> > Are you 100 % sure that no caller uses *new_head?
> 
> Yes, I believe so. If not, then we do have a severe bug in our rte_ring,
> 
> > I suppose you are, because it wasn't set before this patch either, so
> the existing code would have a bug if some caller uses it.
> > But the documentation does not mention that *new_head is not set if
> the function returns 0.
> > So, some future caller might use *new_head, thus introducing a bug
> when n==0.
> >
> > But most importantly for the performance discussion, the costly CAS
> is only pointless when n==0.
> > So, if n==0 is very unlikely, we could accept this pointless cost.
> > And it would save us the cost of "if (n==0) return 0;" in the hot
> code path.
> >
> > And, as a consequence, some of the callers of this function could
> also remove their special handing of
> > __rte_ring_headtail_move_head() returning 0. (Likewise, only if a
> return value of 0 is unlikely, and the special handling has a cost in
> the hot code path for non-zero return values.)
> 
> I don't think it is a good idea.
> First of all about the cost - I suppose that situation when 'n==0' is
> not so uncommon:
> There is a contention on the ring (many threads try to enqueue/dequeue)
> to/from it.
> Doing unnecessary CAS() (when n==0) we introduce extra cache-snooping
> to already busy memory sub-system,
> i.e. we slowing down not only current threads, but all other
> producers/consumers of the ring.
> About removing extra branches: I don't think there would be many to
> remove.
> Usually after move_head() finishes we have to do two operations:
> __rte_ring_dequeue_elems() ;
> __rte_ring_update_tail();
> Both have to be performed only when 'n != 0'.

OK, thank you for the analysis.
You convinced me my suggestion was not a good idea.

> Regarding the doc for the move_head() function - sure it can be updated
> to note explicitly
> that both *old_head and *new_head will contain up-to-date values only
> when return value
> is not zero.

Maybe there's a ripple-down effect regarding documentation...
Some of the output parameters of the functions calling __rte_ring_headtail_move_head() are also conditionally valid.

And if any of these output parameters are used anyway, your patch series might have uncovered a bug.

Note:
For the multi-threaded rings, the output parameter *entries is only up-to-date when the return value is non-zero.
If d->head was fetched into *old_head, and another thread raced to update d->head before the CAS() (which is not called when n==0, but would have failed), then *entries was set based on the old *old_head.
But that race might as well occur when returning from the function, so it shouldn't be a problem.
Your added RTE_ASSERT(*entries <= r->capacity)'s should be valid, regardless if *entries is up-to-date or based on the old *old_head.
This also means that the documentation needs no update regarding the validity of the *entries output parameter.

> 
> > > > > > I don't think your suggestion is multi-thread safe.
> > > > > > If d->head moves, the value in *new_head will be incorrect.
> > > > >
> > > > > If d->head moves, then *old_head will also be incorrect.
> > > > > For that case we do have CAS() below, it will return zero if
> (d-
> > > >head
> > > > > != *old_head)
> > > > > and we shall go to the next iteration (attempt).
> > > >
> > > > Exactly.
> > > > And with my suggestion the same will happen if n==0, and the next
> > > attempt will update them both, until they are both correct.
> > > >
> > > > > Basically - if n == 0, your *old_head and *new_head might be
> > > invalid
> > > > > and should not be used
> > > > > (and they are not used).
> > > > >
> > > > > > Instead, suggest:
> > > > > >
> > > > > > -		if (n == 0)
> > > > > > -			return 0;
> > > > > >
> > > > > > 		*new_head = *old_head + n;
> > > > > > 		if (is_st) {
> > > > > > 			d->head = *new_head;
> > > > > > 			success = 1;
> > > > > > 		} else
> > > > > > 			/* on failure, *old_head is updated */
> > > > > > 			success =
> > > > > rte_atomic_compare_exchange_strong_explicit(
> > > > > > 					&d->head, old_head, *new_head,
> > > > > > 					rte_memory_order_relaxed,
> > > > > > 					rte_memory_order_relaxed);
> > > > > > 	} while (unlikely(success == 0));
> > > > >
> > > > > That's possible, but if (n ==0) we probably don't want to
> update
> > > the
> > > > > head -
> > > > > as we are not moving head - it is pointless, while still
> expensive.
> > > >
> > > > Agree. Let's avoid unnecessary cost!
> > > > My suggestion was only relevant if *new_head needed to be updated
> > > when n==0.
> > > >
> >


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 3/4] ring: fix potential sync issue between head and tail values
  2025-05-22 22:03   ` Wathsala Wathawana Vithanage
@ 2025-05-26 11:54     ` Konstantin Ananyev
  2025-05-27 21:33       ` Wathsala Wathawana Vithanage
  0 siblings, 1 reply; 18+ messages in thread
From: Konstantin Ananyev @ 2025-05-26 11:54 UTC (permalink / raw)
  To: Wathsala Wathawana Vithanage, dev
  Cc: Honnappa Nagarahalli, jerinj, hemant.agrawal, drc, nd

Hi Wathsala,

> 
> Hi Konstantin,
> 
> In rte_ring the store-release on tail update guarantees that CAS
> won't get reordered with the store-released of the tail update.
> 
> So, the sequence of events would look like this (combined view
> of head and tail update)
> 
> Releaxed-load(new_head,  N)                              ----------------> (A)
> Relaxed-CAS(d->head, new_head, old_head)   ----------------> (B)
> Store-release-store(d->tail, new_head)             ----------------> (C)
> 
> If we look at address dependencies, then...
> 
> (B) depends on (A) due to new_head address dependency.
> (C) depends on (A) due to new_head address dependency.
> 
> So, dependency graph looks like this
>        (A)
>     /       \
>    v        v
>  (B)     (C)
> 
> There is no implicit dependence between (B) and (C), I think
> this is the issue you are bringing up.
> Even though there is no dependence between the two,
> the store-release of (C) ensures that (B) won't drop below it.
> Therefore, the above graph can be turned into an ordered
> sequence as shown below..
> 
> (A) -> (B) -> (C)

I do agree that with current implementation of __rte_ring_headtail_move_head()
in lib/ring/rte_ring_c11_pvt.h the order of these 3 operations (A->B->C) should be sequential.
The problem I am talking about is a different one:
thread can see 'latest' 'cons.head' value, with 'previous' value for 'prod.tail' or visa-versa.
In other words: 'cons.head' value depends on 'prod.tail', so before making latest 'cons.head'
value visible to other threads, we need to ensure that latest 'prod.tail' is also visible. 
Let me try to explain it on the example:

Suppose at some moment we have:
prod={.head=2,.tail=1};
cons={.head=0,.tail=0};
I.e. thead #1 is in process to enqueue one more element into the ring.

       Thread #1                                                                             Thread #2
T0:
  store(&prod.tail, 2, release);
  /*AFAIU: this is somewhat equivalent to: wmb(); prod.tail=2;
  * I.E. - it guarantees that all stores initiated before that operation will be visible
  * by other threads at the same moment or before new value of prod.tail wlll become
  * visible, but it doesn't guarantee that new prod.tail  value will be visible  to other
  * threads immediately.
  */ 
...
move_cons_head(...,n=2)                                                    move_cons_head(...,n=1)
...                                                                                              ...
T1:
  *old_head = load(&cons.head, relaxed);                          
  fence(acquire);
  /*old_head == 0, no surprises */
  stail = load(&prod.tail, acquire);
  /*stail == 2, no surprises */
 *entries = (capacity + stail - *old_head);
*new_head = *old_head + n;
 /* *entries == (2 - 0) == 2; *new_head==2; all good */ 
...
T2:
                                                                                                *old_head = load(&cons.head, relaxed);
                                                                                                fence(acquire);
                                                                                                /*old_head == 0, no surprises */
                                                                                                stail = load(&prod.tail, acquire);
/* !!!!! stail == 1  !!!!! for Thread 2
 * Even though we do use acquire here - there was no *release* after store(&prod.tail).
 * So, still no guarantee that Thread 2 will see latest prod.tail value.
 */
                                                                                               *entries = (capacity + stail - *old_head);
                                                                                               /* *entries == (1 - 0) == 1, still ok */
                                                                                               *new_head = *old_head + n;
                                                                                               /* *new_head == 1 */
T3:
  success = CAS(&cons.head,
    old_head /*==0*/, *new_head /*==2*/,
    relaxed, relaxed);
  /*success==1, cons.head==2*/
 ...   
T4:   
                                                                                           success = CAS(&cons.head,
                                                                                            old_head /*==0*/, *new_head /*==1*/,
                                                                                          relaxed, relaxed);
                                                                                          /*success==0, *old_head==2*/
/* CAS() failed and provided Thread 2 with the latest value for cons.head(==2)
 *  Thread 2 repeats attempt, starts second iteration
 */
                                                                                          fence(acquire);
                                                                                          stail = load(&prod.tail, acquire);
/* !!!!! stail == 1  !!!!! for Thread 2
 * Still no *release* had happened after store(&prod.tail) at T0.
 * So, still no guarantee that Thread 2 will see latest prod.tail value.
 */
                                                                                          *entries = (capacity + stail - *old_head);
                                                                                          *new_head = *old_head + n;
                                                                                          
/* !!!!! *entries == (1 - 2) == -1(UINT32_MAX); *new_head==(2+1)==3; !!!!!
 *  we are about to corrupt our ring !!!!!
 */                                                                                        
                                                                          
> 
> I haven't looked at the so-ring yet. Could it be possible that the
> issue is due to something else introduced in that code?

Well, as I said, so far I wasn't able to reproduce this problem with conventional
ring (ring_stress_autotest), only soring_stress_autotest is failing and
for now - only on one specific ARM platform.
Regarding soring specific fix (without touching common code) - 
sure it is also possible, pls see patch #2.
There I just added 'fence(release);' straight after 'store(&tail);'   
That's seems enough to fix that problem within the soring only.
Though, from my understanding rte_ring might also be affected,
that's why I went ahead and prepared that patch. 
If you feel that I am missing something here - pls shout.
Konstantin 


> Thanks,
> 
> --wathsala
> 
> > This patch serves several purposes:
> > - provide an alternative (and I think a better) way to fix the
> >   issue discussed in previous patch:
> >   "ring/soring: fix synchronization issue between head and tail values"
> > - make sure that such problem wouldn’t happen within other usages of
> >   __rte_ring_headtail_move_head() – both current rte_ring
> >   implementation and possible future use-cases.
> > - step towards unification of move_head() implementations and
> >   removing rte_ring_generic_pvt.h
> > It uses Acquire-Release memory ordering for CAS operation in move_head().
> > That guarantees that corresponding ‘tail’ updates will be visible before current
> > ‘head’ is updated.
> > As I said before: I think that in theory the problem described in previous patch
> > might happen with our conventional rte_ring too (when
> > RTE_USE_C11_MEM_MODEL enabled).
> > But, so far I didn’t manage to reproduce it in reality.
> > For that reason and also because it touches a critical rte_ring code-path, I put
> > these changes into a separate patch. Expect all interested stakeholders to come-
> > up with their comments and observations.
> > Regarding performance impact – on my boxes both ring_perf_autotest and
> > ring_stress_autotest – show a mixed set of results: some of them become few
> > cycles faster, another few cycles slower.
> > But so far, I didn’t notice any real degradations with that patch.
> >
> > Fixes: b5458e2cc483 ("ring: introduce staged ordered ring")
> > Fixes: 1cc363b8ce06 ("ring: introduce HTS ring mode")
> > Fixes: e6ba4731c0f3 ("ring: introduce RTS ring mode")
> > Fixes: 49594a63147a ("ring/c11: relax ordering for load and store of the head")
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> > ---
> >  lib/ring/rte_ring_c11_pvt.h      | 27 +++++++++++++++++----------
> >  lib/ring/rte_ring_hts_elem_pvt.h |  6 ++++--  lib/ring/rte_ring_rts_elem_pvt.h
> > |  6 ++++--
> >  lib/ring/soring.c                |  5 -----
> >  4 files changed, 25 insertions(+), 19 deletions(-)
> >
> > diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h index
> > 0845cd6dcf..6d1c46df9a 100644
> > --- a/lib/ring/rte_ring_c11_pvt.h
> > +++ b/lib/ring/rte_ring_c11_pvt.h
> > @@ -77,20 +77,19 @@ __rte_ring_headtail_move_head(struct
> > rte_ring_headtail *d,
> >  	int success;
> >  	unsigned int max = n;
> >
> > +	/* Ensure the head is read before tail */
> >  	*old_head = rte_atomic_load_explicit(&d->head,
> > -			rte_memory_order_relaxed);
> > +			rte_memory_order_acquire);
> >  	do {
> >  		/* Reset n to the initial burst count */
> >  		n = max;
> >
> > -		/* Ensure the head is read before tail */
> > -		rte_atomic_thread_fence(rte_memory_order_acquire);
> > -
> > -		/* load-acquire synchronize with store-release of ht->tail
> > -		 * in update_tail.
> > +		/*
> > +		 * Read s->tail value. Note that it will be loaded after
> > +		 * d->head load, but before CAS operation for the d->head.
> >  		 */
> >  		stail = rte_atomic_load_explicit(&s->tail,
> > -					rte_memory_order_acquire);
> > +					rte_memory_order_relaxed);
> >
> >  		/* The subtraction is done between two unsigned 32bits value
> >  		 * (the result is always modulo 32 bits even if we have @@ -
> > 112,11 +111,19 @@ __rte_ring_headtail_move_head(struct rte_ring_headtail
> > *d,
> >  			d->head = *new_head;
> >  			success = 1;
> >  		} else
> > -			/* on failure, *old_head is updated */
> > +			/*
> > +			 * on failure, *old_head is updated.
> > +			 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist
> > +			 * barrier to prevent:
> > +			 *  - OOO reads of cons tail value
> > +			 *  - OOO copy of elems from the ring
> > +			 *  Also RELEASE guarantees that latest tail value
> > +			 *  will become visible before the new head value.
> > +			 */
> >  			success =
> > rte_atomic_compare_exchange_strong_explicit(
> >  					&d->head, old_head, *new_head,
> > -					rte_memory_order_relaxed,
> > -					rte_memory_order_relaxed);
> > +					rte_memory_order_acq_rel,
> > +					rte_memory_order_acquire);
> >  	} while (unlikely(success == 0));
> >  	return n;
> >  }
> > diff --git a/lib/ring/rte_ring_hts_elem_pvt.h b/lib/ring/rte_ring_hts_elem_pvt.h
> > index c59e5f6420..cc593433b9 100644
> > --- a/lib/ring/rte_ring_hts_elem_pvt.h
> > +++ b/lib/ring/rte_ring_hts_elem_pvt.h
> > @@ -116,13 +116,15 @@ __rte_ring_hts_move_head(struct
> > rte_ring_hts_headtail *d,
> >  		np.pos.head = op.pos.head + n;
> >
> >  	/*
> > -	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
> > +	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
> >  	 *  - OOO reads of cons tail value
> >  	 *  - OOO copy of elems from the ring
> > +	 *   Also RELEASE guarantees that latest tail value
> > +	 *   will become visible before the new head value.
> >  	 */
> >  	} while (rte_atomic_compare_exchange_strong_explicit(&d->ht.raw,
> >  			(uint64_t *)(uintptr_t)&op.raw, np.raw,
> > -			rte_memory_order_acquire,
> > +			rte_memory_order_acq_rel,
> >  			rte_memory_order_acquire) == 0);
> >
> >  	*old_head = op.pos.head;
> > diff --git a/lib/ring/rte_ring_rts_elem_pvt.h b/lib/ring/rte_ring_rts_elem_pvt.h
> > index 509fa674fb..860b13cc61 100644
> > --- a/lib/ring/rte_ring_rts_elem_pvt.h
> > +++ b/lib/ring/rte_ring_rts_elem_pvt.h
> > @@ -131,13 +131,15 @@ __rte_ring_rts_move_head(struct
> > rte_ring_rts_headtail *d,
> >  		nh.val.cnt = oh.val.cnt + 1;
> >
> >  	/*
> > -	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
> > +	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
> >  	 *  - OOO reads of cons tail value
> >  	 *  - OOO copy of elems to the ring
> > +	 *  Also RELEASE guarantees that latest tail value
> > +	 *  will become visible before the new head value.
> >  	 */
> >  	} while (rte_atomic_compare_exchange_strong_explicit(&d->head.raw,
> >  			(uint64_t *)(uintptr_t)&oh.raw, nh.raw,
> > -			rte_memory_order_acquire,
> > +			rte_memory_order_acq_rel,
> >  			rte_memory_order_acquire) == 0);
> >
> >  	*old_head = oh.val.pos;
> > diff --git a/lib/ring/soring.c b/lib/ring/soring.c index 7bcbf35516..21a1a27e24
> > 100644
> > --- a/lib/ring/soring.c
> > +++ b/lib/ring/soring.c
> > @@ -123,8 +123,6 @@ __rte_soring_stage_finalize(struct
> > soring_stage_headtail *sht, uint32_t stage,
> >  	rte_atomic_store_explicit(&sht->tail.raw, ot.raw,
> >  			rte_memory_order_release);
> >
> > -	/* make sure that new tail value is visible */
> > -	rte_atomic_thread_fence(rte_memory_order_release);
> >  	return i;
> >  }
> >
> > @@ -219,9 +217,6 @@ __rte_soring_update_tail(struct __rte_ring_headtail
> > *rht,
> >  		/* unsupported mode, shouldn't be here */
> >  		RTE_ASSERT(0);
> >  	}
> > -
> > -	/* make sure that new tail value is visible */
> > -	rte_atomic_thread_fence(rte_memory_order_release);
> >  }
> >
> >  static __rte_always_inline uint32_t
> > --
> > 2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 3/4] ring: fix potential sync issue between head and tail values
  2025-05-26 11:54     ` Konstantin Ananyev
@ 2025-05-27 21:33       ` Wathsala Wathawana Vithanage
  2025-05-27 22:47         ` Wathsala Wathawana Vithanage
  0 siblings, 1 reply; 18+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-05-27 21:33 UTC (permalink / raw)
  To: Konstantin Ananyev, dev
  Cc: Honnappa Nagarahalli, jerinj, hemant.agrawal, drc, nd, nd


Hi Konstantin,

Thanks for the clarification, I now see your concern. 

So, in summary what you are saying is that the tail update from
Thread #1 that happened at T0 is not observed by Thread #2
at T2 when it computes new_head and entries.
That cannot happen in Arm v8/v9 because tail update is a 
store-release and a load-acquire that program order follows it
can only be issued after all the cores have observed the
store-release (there is a synchronization with relationship to
store-release and load-acquire pairs).

In the example you have provided Thread #1's 
store(&prod.tail, 2, release) is observed by all the cores in the
system by the time same thread performs load(&prod.tail, acquire)
at T2. As per Arm v8/v9 memory model Thread #2 should observe
prod.tail=2 (not prod.tail=1).

Arm Architecture Reference Manual section B2.10.11 states…

"Where a Load-Acquire appears in program order after a Store-Release, 
the memory access generated by the Store-Release instruction is
observed by each PE to the extent that PE is required to observe the
access coherently, before the memory access generated by the
Load-Acquire instruction is observed by that PE, to the extent that the
PE is required to observe the access coherently."

> -----Original Message-----
> From: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> Sent: Monday, May 26, 2025 6:54 AM
> To: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>;
> dev@dpdk.org
> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> jerinj@marvell.com; hemant.agrawal@nxp.com; drc@linux.ibm.com; nd
> <nd@arm.com>
> Subject: RE: [PATCH v1 3/4] ring: fix potential sync issue between head and tail
> values
> 
> Hi Wathsala,
> 
> >
> > Hi Konstanin,
> >
> > In rte_ring the store-release on tail update guarantees that CAS
> > won't get reordered with the store-released of the tail update.
> >
> > So, the sequence of events would look like this (combined view
> > of head and tail update)
> >
> > Releaxed-load(new_head,  N)                              ----------------> (A)
> > Relaxed-CAS(d->head, new_head, old_head)   ----------------> (B)
> > Store-release-store(d->tail, new_head)             ----------------> (C)
> >
> > If we look at address dependencies, then...
> >
> > (B) depends on (A) due to new_head address dependency.
> > (C) depends on (A) due to new_head address dependency.
> >
> > So, dependency graph looks like this
> >        (A)
> >     /       \
> >    v        v
> >  (B)     (C)
> >
> > There is no implicit dependence between (B) and (C), I think
> > this is the issue you are bringing up.
> > Even though there is no dependence between the two,
> > the store-release of (C) ensures that (B) won't drop below it.
> > Therefore, the above graph can be turned into an ordered
> > sequence as shown below..
> >
> > (A) -> (B) -> (C)
> 
> I do agree that with current implementation of
> __rte_ring_headtail_move_head()
> in lib/ring/rte_ring_c11_pvt.h the order of these 3 operations (A->B->C)
> should be sequential.
> The problem I am talking about is a different one:
> thread can see 'latest' 'cons.head' value, with 'previous' value for 'prod.tail' or
> visa-versa.
> In other words: 'cons.head' value depends on 'prod.tail', so before making
> latest 'cons.head'
> value visible to other threads, we need to ensure that latest 'prod.tail' is also
> visible.
> Let me try to explain it on the example:
> 
> Suppose at some moment we have:
> prod={.head=2,.tail=1};
> cons={.head=0,.tail=0};
> I.e. thead #1 is in process to enqueue one more element into the ring.
> 
>        Thread #1                                                                             Thread #2
> T0:
>   store(&prod.tail, 2, release);
>   /*AFAIU: this is somewhat equivalent to: wmb(); prod.tail=2;
>   * I.E. - it guarantees that all stores initiated before that operation will be
> visible
>   * by other threads at the same moment or before new value of prod.tail will
> become
>   * visible, but it doesn't guarantee that new prod.tail  value will be visible  to
> other
>   * threads immediately.
>   */
> ...
> move_cons_head(...,n=2)                                                    move_cons_head(...,n=1)
> ...                                                                                              ...
> T1:
>   *old_head = load(&cons.head, relaxed);
>   fence(acquire);
>   /*old_head == 0, no surprises */
>   stail = load(&prod.tail, acquire);
>   /*stail == 2, no surprises */
>  *entries = (capacity + stail - *old_head);
> *new_head = *old_head + n;
>  /* *entries == (2 - 0) == 2; *new_head==2; all good */
> ...
> T2:
>                                                                                                 *old_head =
> load(&cons.head, relaxed);
>                                                                                                 fence(acquire);
>                                                                                                 /*old_head == 0, no surprises
> */
>                                                                                                 stail = load(&prod.tail,
> acquire);
> /* !!!!! stail == 1  !!!!! for Thread 2
>  * Even though we do use acquire here - there was no *release* after
> store(&prod.tail).
>  * So, still no guarantee that Thread 2 will see latest prod.tail value.
>  */
>                                                                                                *entries = (capacity + stail -
> *old_head);
>                                                                                                /* *entries == (1 - 0) == 1, still
> ok */
>                                                                                                *new_head = *old_head + n;
>                                                                                                /* *new_head == 1 */
> T3:
>   success = CAS(&cons.head,
>     old_head /*==0*/, *new_head /*==2*/,
>     relaxed, relaxed);
>   /*success==1, cons.head==2*/
>  ...
> T4:
>                                                                                            success = CAS(&cons.head,
>                                                                                            old_head /*==0/, *new_head
> /*==1*/,
>                                                                                           relaxed, relaxed);
>                                                                                           /*success==0, *old_head==2*/
> /* CAS() failed and provided Thread 2 with the latest value for cons.head(==2)
>  *  Thread 2 repeats attempt, starts second iteration
>  */
>                                                                                           fence(acquire);
>                                                                                           stail = load(&prod.tail, acquire);
> /* !!!!! stail == 1  !!!!! for Thread 2
>  * Still no *release* had happened after store(&prod.tail) at T0.
>  * So, still no guarantee that Thread 2 will see latest prod.tail value.
>  */
>                                                                                           *entries = (capacity + stail -
> *old_head);
>                                                                                           *new_head = *old_head + n;
> 
> /* !!!!! *entries == (1 - 2) == -1(UINT32_MAX); *new_head==(2+1)==3; !!!!!
>  *  we are about to corrupt our ring !!!!!
>  */
> 
> >
> > I haven't looked at the so-ring yet. Could it be possible that the
> > issue is due to something else introduced in that code?
> 
> Well, as I said, so far I wasn't able to reproduce this problem with
> conventional
> ring (ring_stress_autotest), only soring_stress_autotest is failing and
> for now - only on one specific ARM platform.
> Regarding soring specific fix (without touching common code) -
> sure it is also possible, pls see patch #2.
> There I just added 'fence(release);' straight after 'store(&tail);'
> That seems enough to fix that problem within the soring only.
> Though, from my understanding rte_ring might also be affected,
> that's why I went ahead and prepared that patch.
> If you feel that I am missing something here - pls shout.
> Konstantin
> 
> 
> > Thanks,
> >
> > --wathsala
> >
> > > This patch aims several purposes:
> > > - provide an alternative (and I think a better) way to fix the
> > >   issue discussed in previous patch:
> > >   "ring/soring: fix synchronization issue between head and tail values"
> > > - make sure that such problem wouldn’t happen within other usages of
> > >   __rte_ring_headtail_move_head() – both current rte_ring
> > >   implementation and possible future use-cases.
> > > - step towards unification of move_head() implementations and
> > >   removing rte_ring_generic_pvt.h
> > > It uses Acquire-Release memory ordering for CAS operation in
> move_head().
> > > That guarantees that corresponding ‘tail’ updates will be visible before
> current
> > > ‘head’ is updated.
> > > As I said before: I think that in theory the problem described in previous
> patch
> > > might happen with our conventional rte_ring too (when
> > > RTE_USE_C11_MEM_MODEL enabled).
> > > But, so far I didn’t manage to reproduce it in reality.
> > > For that reason and also because it touches a critical rte_ring code-path, I
> put
> > > these changes into a separate patch. Expect all interested stakeholders to
> come-
> > > up with their comments and observations.
> > > Regarding performance impact – on my boxes both ring_perf_autotest and
> > > ring_stress_autotest – show a mixed set of results: some of them become
> few
> > > cycles faster, another few cycles slower.
> > > But so far, I didn’t notice any real degradations with that patch.
> > >
> > > Fixes: b5458e2cc483 ("ring: introduce staged ordered ring")
> > > Fixes: 1cc363b8ce06 ("ring: introduce HTS ring mode")
> > > Fixes: e6ba4731c0f3 ("ring: introduce RTS ring mode")
> > > Fixes: 49594a63147a ("ring/c11: relax ordering for load and store of the
> head")
> > >
> > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> > > ---
> > >  lib/ring/rte_ring_c11_pvt.h      | 27 +++++++++++++++++----------
> > >  lib/ring/rte_ring_hts_elem_pvt.h |  6 ++++--
> lib/ring/rte_ring_rts_elem_pvt.h
> > > |  6 ++++--
> > >  lib/ring/soring.c                |  5 -----
> > >  4 files changed, 25 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h index
> > > 0845cd6dcf..6d1c46df9a 100644
> > > --- a/lib/ring/rte_ring_c11_pvt.h
> > > +++ b/lib/ring/rte_ring_c11_pvt.h
> > > @@ -77,20 +77,19 @@ __rte_ring_headtail_move_head(struct
> > > rte_ring_headtail *d,
> > >  	int success;
> > >  	unsigned int max = n;
> > >
> > > +	/* Ensure the head is read before tail */
> > >  	*old_head = rte_atomic_load_explicit(&d->head,
> > > -			rte_memory_order_relaxed);
> > > +			rte_memory_order_acquire);
> > >  	do {
> > >  		/* Reset n to the initial burst count */
> > >  		n = max;
> > >
> > > -		/* Ensure the head is read before tail */
> > > -		rte_atomic_thread_fence(rte_memory_order_acquire);
> > > -
> > > -		/* load-acquire synchronize with store-release of ht->tail
> > > -		 * in update_tail.
> > > +		/*
> > > +		 * Read s->tail value. Note that it will be loaded after
> > > +		 * d->head load, but before CAS operation for the d->head.
> > >  		 */
> > >  		stail = rte_atomic_load_explicit(&s->tail,
> > > -					rte_memory_order_acquire);
> > > +					rte_memory_order_relaxed);
> > >
> > >  		/* The subtraction is done between two unsigned 32bits value
> > >  		 * (the result is always modulo 32 bits even if we have @@ -
> > > 112,11 +111,19 @@ __rte_ring_headtail_move_head(struct
> rte_ring_headtail
> > > *d,
> > >  			d->head = *new_head;
> > >  			success = 1;
> > >  		} else
> > > -			/* on failure, *old_head is updated */
> > > +			/*
> > > +			 * on failure, *old_head is updated.
> > > +			 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist
> > > +			 * barrier to prevent:
> > > +			 *  - OOO reads of cons tail value
> > > +			 *  - OOO copy of elems from the ring
> > > +			 *  Also RELEASE guarantees that latest tail value
> > > +			 *  will become visible before the new head value.
> > > +			 */
> > >  			success =
> > > rte_atomic_compare_exchange_strong_explicit(
> > >  					&d->head, old_head, *new_head,
> > > -					rte_memory_order_relaxed,
> > > -					rte_memory_order_relaxed);
> > > +					rte_memory_order_acq_rel,
> > > +					rte_memory_order_acquire);
> > >  	} while (unlikely(success == 0));
> > >  	return n;
> > >  }
> > > diff --git a/lib/ring/rte_ring_hts_elem_pvt.h
> b/lib/ring/rte_ring_hts_elem_pvt.h
> > > index c59e5f6420..cc593433b9 100644
> > > --- a/lib/ring/rte_ring_hts_elem_pvt.h
> > > +++ b/lib/ring/rte_ring_hts_elem_pvt.h
> > > @@ -116,13 +116,15 @@ __rte_ring_hts_move_head(struct
> > > rte_ring_hts_headtail *d,
> > >  		np.pos.head = op.pos.head + n;
> > >
> > >  	/*
> > > -	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
> > > +	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
> > >  	 *  - OOO reads of cons tail value
> > >  	 *  - OOO copy of elems from the ring
> > > +	 *   Also RELEASE guarantees that latest tail value
> > > +	 *   will become visible before the new head value.
> > >  	 */
> > >  	} while (rte_atomic_compare_exchange_strong_explicit(&d->ht.raw,
> > >  			(uint64_t *)(uintptr_t)&op.raw, np.raw,
> > > -			rte_memory_order_acquire,
> > > +			rte_memory_order_acq_rel,
> > >  			rte_memory_order_acquire) == 0);
> > >
> > >  	*old_head = op.pos.head;
> > > diff --git a/lib/ring/rte_ring_rts_elem_pvt.h
> b/lib/ring/rte_ring_rts_elem_pvt.h
> > > index 509fa674fb..860b13cc61 100644
> > > --- a/lib/ring/rte_ring_rts_elem_pvt.h
> > > +++ b/lib/ring/rte_ring_rts_elem_pvt.h
> > > @@ -131,13 +131,15 @@ __rte_ring_rts_move_head(struct
> > > rte_ring_rts_headtail *d,
> > >  		nh.val.cnt = oh.val.cnt + 1;
> > >
> > >  	/*
> > > -	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
> > > +	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
> > >  	 *  - OOO reads of cons tail value
> > >  	 *  - OOO copy of elems to the ring
> > > +	 *  Also RELEASE guarantees that latest tail value
> > > +	 *  will become visible before the new head value.
> > >  	 */
> > >  	} while (rte_atomic_compare_exchange_strong_explicit(&d-
> >head.raw,
> > >  			(uint64_t *)(uintptr_t)&oh.raw, nh.raw,
> > > -			rte_memory_order_acquire,
> > > +			rte_memory_order_acq_rel,
> > >  			rte_memory_order_acquire) == 0);
> > >
> > >  	*old_head = oh.val.pos;
> > > diff --git a/lib/ring/soring.c b/lib/ring/soring.c index
> 7bcbf35516..21a1a27e24
> > > 100644
> > > --- a/lib/ring/soring.c
> > > +++ b/lib/ring/soring.c
> > > @@ -123,8 +123,6 @@ __rte_soring_stage_finalize(struct
> > > soring_stage_headtail *sht, uint32_t stage,
> > >  	rte_atomic_store_explicit(&sht->tail.raw, ot.raw,
> > >  			rte_memory_order_release);
> > >
> > > -	/* make sure that new tail value is visible */
> > > -	rte_atomic_thread_fence(rte_memory_order_release);
> > >  	return i;
> > >  }
> > >
> > > @@ -219,9 +217,6 @@ __rte_soring_update_tail(struct
> __rte_ring_headtail
> > > *rht,
> > >  		/* unsupported mode, shouldn't be here */
> > >  		RTE_ASSERT(0);
> > >  	}
> > > -
> > > -	/* make sure that new tail value is visible */
> > > -	rte_atomic_thread_fence(rte_memory_order_release);
> > >  }
> > >
> > >  static __rte_always_inline uint32_t
> > > --
> > > 2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH v1 3/4] ring: fix potential sync issue between head and tail values
  2025-05-27 21:33       ` Wathsala Wathawana Vithanage
@ 2025-05-27 22:47         ` Wathsala Wathawana Vithanage
  0 siblings, 0 replies; 18+ messages in thread
From: Wathsala Wathawana Vithanage @ 2025-05-27 22:47 UTC (permalink / raw)
  To: Wathsala Wathawana Vithanage, Konstantin Ananyev, dev
  Cc: Honnappa Nagarahalli, jerinj, hemant.agrawal, drc, nd, nd

> Hi Konstantin,
> 
> Thanks for the clarification, I now see your concern.
> 
> So, in summary what you are saying is that the tail update from
> Thread #1 that happened at T0 is not observed by Thread #2
> at T2 when it computes new_head and the entries value.
> That cannot happen in Arm v8/v9 because tail update is a
> store-release and a load-acquire that program order follows it
> can only be issued after all the cores have observed the
> store-release (there is a synchronization with relationship to
> store-release and load-acquire pairs).
> 
> In the example you have provided Thread #1's
> store(&prod.tail, 2, release) is observed by all the cores in the
> system by the time same thread performs load(&prod.tail, acquire)
> at T2. As per Arm v8/v9 memory model Thread #2 should observe
> prod.tail=2 (not prod.tail=1).
> 
> Arm Architecture Reference Manual section B2.10.11 states…
> 
> "Where a Load-Acquire appears in program order after a Store-Release,
> the memory access generated by the Store-Release instruction is
> observed by each PE to the extent that PE is required to observe the
> access coherently, before the memory access generated by the
> Load-Acquire instruction is observed by that PE, to the extent that the
> PE is required to observe the access coherently."
> 

In soring, is this the pair that updates the tail and moves the head?

__rte_soring_update_tail:
	__rte_ring_update_tail(&rht->ht, head, next, st, enq); 

__rte_soring_move_cons_head:
	 __rte_ring_headtail_move_head(&r->cons.ht,  &r->stage[stage].ht, 0, ...);


If so, &rht->ht and &r->stage[stage].ht are the same address? If they are
not, then you will run into the issue you have seen (a.k.a "Other-multi-copy
 atomic" which is legit in Arm v8 and above).

Thanks.

--wathsala


> > -----Original Message-----
> > From: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> > Sent: Monday, May 26, 2025 6:54 AM
> > To: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>;
> > dev@dpdk.org
> > Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> > jerinj@marvell.com; hemant.agrawal@nxp.com; drc@linux.ibm.com; nd
> > <nd@arm.com>
> > Subject: RE: [PATCH v1 3/4] ring: fix potential sync issue between head and
> tail
> > values
> >
> > Hi Wathsala,
> >
> > >
> > > Hi Konstanin,
> > >
> > > In rte_ring the store-release on tail update guarantees that CAS
> > > won't get reordered with the store-released of the tail update.
> > >
> > > So, the sequence of events would look like this (combined view
> > > of head and tail update)
> > >
> > > Releaxed-load(new_head,  N)                              ----------------> (A)
> > > Relaxed-CAS(d->head, new_head, old_head)   ----------------> (B)
> > > Store-release-store(d->tail, new_head)             ----------------> (C)
> > >
> > > If we look at address dependencies, then...
> > >
> > > (B) depends on (A) due to new_head address dependency.
> > > (C) depends on (A) due to new_head address dependency.
> > >
> > > So, dependency graph looks like this
> > >        (A)
> > >     /       \
> > >    v        v
> > >  (B)     (C)
> > >
> > > There is no implicit dependence between (B) and (C), I think
> > > this is the issue you are brining up.
> > > Even though there is no dependence between the two,
> > > the store-release of (C) ensures that (B) won't drop below it.
> > > Therefore, the above graph can be turned into an ordered
> > > sequence as shown below..
> > >
> > > (A) -> (B) -> (C)
> >
> > I do agree that with current implementation of
> > __rte_ring_headtail_move_head()
> > in lib/ring/rte_ring_c11_pvt.h the order of these 3 operations (A->B->C)
> > should be sequential.
> > The problem I am talking about is a different one:
> > thread can see 'latest' 'cons.head' value, with 'previous' value for 'prod.tail' or
> > visa-versa.
> > In other words: 'cons.head' value depends on 'prod.tail', so before making
> > latest 'cons.head'
> > value visible to other threads, we need to ensure that latest 'prod.tail' is also
> > visible.
> > Let me try to explain it on the example:
> >
> > Suppose at some moment we have:
> > prod={.head=2,.tail=1};
> > cons={.head=0,.tail=0};
> > I.e. thead #1 is in process to enqueue one more element into the ring.
> >
> >        Thread #1                                                                             Thread #2
> > T0:
> >   store(&prod.tail, 2, release);
> >   /*AFAIU: this is somewhat equivalent to: wmb(); prod.tail=2;
> >   * I.E. - it guarantees that all stores initiated before that operation will be
> > visible
> >   * by other threads at the same moment or before new value of prod.tail wlll
> > become
> >   * visible, but it doesn't guarantee that new prod.tail  value will be visible  to
> > other
> >   * threads immediately.
> >   */
> > ...
> > move_cons_head(...,n=2)
> move_cons_head(...,n=1)
> > ...                                                                                              ...
> > T1:
> >   *old_head = load(&cons.head, relaxed);
> >   fence(acquire);
> >   /*old_head == 0, no surprises */
> >   stail = load(&prod.tail, acquire);
> >   /*stail == 2, no surprises */
> >  *entries = (capacity + stail - *old_head);
> > *new_head = *old_head + n;
> >  /* *entries == (2 - 0) == 2; *new_head==2; all good */
> > ...
> > T2:
> >                                                                                                 *old_head =
> > load(&cons.head, relaxed);
> >                                                                                                 fence(acquire);
> >                                                                                                 /*old_head == 0, no
> surprises
> > */
> >                                                                                                 stail = load(&prod.tail,
> > acquire);
> > /* !!!!! stail == 1  !!!!! for Thread 2
> >  * Even though we do use acquire here - there was no *release* after
> > store(&prod.tail).
> >  * So, still no guarantee that Thread 2 will see latest prod.tail value.
> >  */
> >                                                                                                *entries = (capacity + stail -
> > *old_head);
> >                                                                                                /* *entries == (1 - 0) == 1,
> still
> > ok */
> >                                                                                                *new_head = *old_head + n;
> >                                                                                                /* *new_head == 1 */
> > T3:
> >   success = CAS(&cons.head,
> >   old_head /*==0*/, *new_head /*==2*/,
> >     relaxed, relaxed);
> >   /*success==1, cons.head==2*/
> >  ...
> > T4:
> >                                                                                            success = CAS(&cons.head,
>                                                                                            old_head /*==0*/, *new_head
> > /*==1*/,
> >                                                                                           relaxed, relaxed);
> >                                                                                           /*success==0, *old_head==2*/
> /* CAS() failed and provided Thread 2 with latest value for cons.head(==2)
> >  *  Thread 2 repeats attempt, starts second iteration
> >  */
> >                                                                                           fence(acquire);
> >                                                                                           stail = load(&prod.tail, acquire);
> > /* !!!!! stail == 1  !!!!! for Thread 2
> >  * Still no *release* had happened after store(&prod.tail) at T0.
> >  * So, still no guarantee that Thread 2 will see latest prod.tail value.
> >  */
> >                                                                                           *entries = (capacity + stail -
> > *old_head);
> >                                                                                           *new_head = *old_head + n;
> >
> > /* !!!!! *entries == (1 - 2) == -1(UINT32_MAX); *new_head==(2+1)==3; !!!!!
> >  *  we are about to corrupt our ring !!!!!
> >  */
> >
> > >
> > > I haven't looked at the so-ring yet. Could it be possible that the
> > > issue is due to something else introduced in that code?
> >
> > Well, as I said, so far I wasn't able to reproduce this problem with
> > conventional
> > ring (ring_stress_autotest), only soring_stress_autotest is failing and
> > for now - only on one specific ARM platform.
> > Regarding soring specific fix (without touching common code) -
> > sure it is also possible, pls see patch #2.
> > There I just added 'fence(release);' straight after 'store(&tail);'
> > That seems enough to fix that problem within the soring only.
> > Though, from my understanding rte_ring might also be affected,
> > that's why I went ahead and prepared that patch.
> > If you feel that I am missing something here - pls shout.
> > Konstantin
> >
> >
> > > Thanks,
> > >
> > > --wathsala
> > >
> > > > This patch aims several purposes:
> > > > - provide an alternative (and I think a better) way to fix the
> > > >   issue discussed in previous patch:
> > > >   "ring/soring: fix synchronization issue between head and tail values"
> > > > - make sure that such problem wouldn’t happen within other usages of
> > > >   __rte_ring_headtail_move_head() – both current rte_ring
> > > >   implementation and possible future use-cases.
> > > > - step towards unification of move_head() implementations and
> > > >   removing rte_ring_generic_pvt.h
> > > > It uses Acquire-Release memory ordering for CAS operation in
> > move_head().
> > > > That guarantees that corresponding ‘tail’ updates will be visible before
> > current
> > > > ‘head’ is updated.
> > > > As I said before: I think that in theory the problem described in previous
> > patch
> > > > might happen with our conventional rte_ring too (when
> > > > RTE_USE_C11_MEM_MODEL enabled).
> > > > But, so far I didn’t manage to reproduce it in reality.
> > > > For that reason and also because it touches a critical rte_ring code-path, I
> > put
> > > > these changes into a separate patch. Expect all interested stakeholders to
> > come-
> > > > up with their comments and observations.
> > > > Regarding performance impact – on my boxes both ring_perf_autotest
> and
> > > > ring_stress_autotest – show a mixed set of results: some of them
> become
> > few
> > > > cycles faster, another few cycles slower.
> > > > But so far, I didn’t notice any real degradations with that patch.
> > > >
> > > > Fixes: b5458e2cc483 ("ring: introduce staged ordered ring")
> > > > Fixes: 1cc363b8ce06 ("ring: introduce HTS ring mode")
> > > > Fixes: e6ba4731c0f3 ("ring: introduce RTS ring mode")
> > > > Fixes: 49594a63147a ("ring/c11: relax ordering for load and store of the
> > head")
> > > >
> > > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> > > > ---
> > > >  lib/ring/rte_ring_c11_pvt.h      | 27 +++++++++++++++++----------
> > > >  lib/ring/rte_ring_hts_elem_pvt.h |  6 ++++--
> > lib/ring/rte_ring_rts_elem_pvt.h
> > > > |  6 ++++--
> > > >  lib/ring/soring.c                |  5 -----
> > > >  4 files changed, 25 insertions(+), 19 deletions(-)
> > > >
> > > > diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
> index
> > > > 0845cd6dcf..6d1c46df9a 100644
> > > > --- a/lib/ring/rte_ring_c11_pvt.h
> > > > +++ b/lib/ring/rte_ring_c11_pvt.h
> > > > @@ -77,20 +77,19 @@ __rte_ring_headtail_move_head(struct
> > > > rte_ring_headtail *d,
> > > >  	int success;
> > > >  	unsigned int max = n;
> > > >
> > > > +	/* Ensure the head is read before tail */
> > > >  	*old_head = rte_atomic_load_explicit(&d->head,
> > > > -			rte_memory_order_relaxed);
> > > > +			rte_memory_order_acquire);
> > > >  	do {
> > > >  		/* Reset n to the initial burst count */
> > > >  		n = max;
> > > >
> > > > -		/* Ensure the head is read before tail */
> > > > -		rte_atomic_thread_fence(rte_memory_order_acquire);
> > > > -
> > > > -		/* load-acquire synchronize with store-release of ht->tail
> > > > -		 * in update_tail.
> > > > +		/*
> > > > +		 * Read s->tail value. Note that it will be loaded after
> > > > +		 * d->head load, but before CAS operation for the d->head.
> > > >  		 */
> > > >  		stail = rte_atomic_load_explicit(&s->tail,
> > > > -					rte_memory_order_acquire);
> > > > +					rte_memory_order_relaxed);
> > > >
> > > >  		/* The subtraction is done between two unsigned 32bits value
> > > >  		 * (the result is always modulo 32 bits even if we have @@ -
> > > > 112,11 +111,19 @@ __rte_ring_headtail_move_head(struct
> > rte_ring_headtail
> > > > *d,
> > > >  			d->head = *new_head;
> > > >  			success = 1;
> > > >  		} else
> > > > -			/* on failure, *old_head is updated */
> > > > +			/*
> > > > +			 * on failure, *old_head is updated.
> > > > +			 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist
> > > > +			 * barrier to prevent:
> > > > +			 *  - OOO reads of cons tail value
> > > > +			 *  - OOO copy of elems from the ring
> > > > +			 *  Also RELEASE guarantees that latest tail value
> > > > +			 *  will become visible before the new head value.
> > > > +			 */
> > > >  			success =
> > > > rte_atomic_compare_exchange_strong_explicit(
> > > >  					&d->head, old_head, *new_head,
> > > > -					rte_memory_order_relaxed,
> > > > -					rte_memory_order_relaxed);
> > > > +					rte_memory_order_acq_rel,
> > > > +					rte_memory_order_acquire);
> > > >  	} while (unlikely(success == 0));
> > > >  	return n;
> > > >  }
> > > > diff --git a/lib/ring/rte_ring_hts_elem_pvt.h
> > b/lib/ring/rte_ring_hts_elem_pvt.h
> > > > index c59e5f6420..cc593433b9 100644
> > > > --- a/lib/ring/rte_ring_hts_elem_pvt.h
> > > > +++ b/lib/ring/rte_ring_hts_elem_pvt.h
> > > > @@ -116,13 +116,15 @@ __rte_ring_hts_move_head(struct
> > > > rte_ring_hts_headtail *d,
> > > >  		np.pos.head = op.pos.head + n;
> > > >
> > > >  	/*
> > > > -	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
> > > > +	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
> > > >  	 *  - OOO reads of cons tail value
> > > >  	 *  - OOO copy of elems from the ring
> > > > +	 *   Also RELEASE guarantees that latest tail value
> > > > +	 *   will become visible before the new head value.
> > > >  	 */
> > > >  	} while (rte_atomic_compare_exchange_strong_explicit(&d->ht.raw,
> > > >  			(uint64_t *)(uintptr_t)&op.raw, np.raw,
> > > > -			rte_memory_order_acquire,
> > > > +			rte_memory_order_acq_rel,
> > > >  			rte_memory_order_acquire) == 0);
> > > >
> > > >  	*old_head = op.pos.head;
> > > > diff --git a/lib/ring/rte_ring_rts_elem_pvt.h
> > b/lib/ring/rte_ring_rts_elem_pvt.h
> > > > index 509fa674fb..860b13cc61 100644
> > > > --- a/lib/ring/rte_ring_rts_elem_pvt.h
> > > > +++ b/lib/ring/rte_ring_rts_elem_pvt.h
> > > > @@ -131,13 +131,15 @@ __rte_ring_rts_move_head(struct
> > > > rte_ring_rts_headtail *d,
> > > >  		nh.val.cnt = oh.val.cnt + 1;
> > > >
> > > >  	/*
> > > > -	 * this CAS(ACQUIRE, ACQUIRE) serves as a hoist barrier to prevent:
> > > > +	 * this CAS(ACQ_REL, ACQUIRE) serves as a hoist barrier to prevent:
> > > >  	 *  - OOO reads of cons tail value
> > > >  	 *  - OOO copy of elems to the ring
> > > > +	 *  Also RELEASE guarantees that latest tail value
> > > > +	 *  will become visible before the new head value.
> > > >  	 */
> > > >  	} while (rte_atomic_compare_exchange_strong_explicit(&d-
> > >head.raw,
> > > >  			(uint64_t *)(uintptr_t)&oh.raw, nh.raw,
> > > > -			rte_memory_order_acquire,
> > > > +			rte_memory_order_acq_rel,
> > > >  			rte_memory_order_acquire) == 0);
> > > >
> > > >  	*old_head = oh.val.pos;
> > > > diff --git a/lib/ring/soring.c b/lib/ring/soring.c index
> > 7bcbf35516..21a1a27e24
> > > > 100644
> > > > --- a/lib/ring/soring.c
> > > > +++ b/lib/ring/soring.c
> > > > @@ -123,8 +123,6 @@ __rte_soring_stage_finalize(struct
> > > > soring_stage_headtail *sht, uint32_t stage,
> > > >  	rte_atomic_store_explicit(&sht->tail.raw, ot.raw,
> > > >  			rte_memory_order_release);
> > > >
> > > > -	/* make sure that new tail value is visible */
> > > > -	rte_atomic_thread_fence(rte_memory_order_release);
> > > >  	return i;
> > > >  }
> > > >
> > > > @@ -219,9 +217,6 @@ __rte_soring_update_tail(struct
> > __rte_ring_headtail
> > > > *rht,
> > > >  		/* unsupported mode, shouldn't be here */
> > > >  		RTE_ASSERT(0);
> > > >  	}
> > > > -
> > > > -	/* make sure that new tail value is visible */
> > > > -	rte_atomic_thread_fence(rte_memory_order_release);
> > > >  }
> > > >
> > > >  static __rte_always_inline uint32_t
> > > > --
> > > > 2.43.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2025-05-27 22:48 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-21 11:14 [PATCH v1 0/4] ring: some fixes and improvements Konstantin Ananyev
2025-05-21 11:14 ` [PATCH v1 1/4] ring: introduce extra run-time checks Konstantin Ananyev
2025-05-21 12:14   ` Morten Brørup
2025-05-21 12:34     ` Konstantin Ananyev
2025-05-21 18:36       ` Morten Brørup
2025-05-21 19:38         ` Konstantin Ananyev
2025-05-21 22:02           ` Morten Brørup
2025-05-26  8:39             ` Konstantin Ananyev
2025-05-26  9:57               ` Morten Brørup
2025-05-21 11:14 ` [PATCH v1 2/4] ring/soring: fix head-tail synchronization issue Konstantin Ananyev
2025-05-21 11:14 ` [PATCH v1 3/4] ring: fix potential sync issue between head and tail values Konstantin Ananyev
2025-05-21 20:26   ` Morten Brørup
2025-05-22 22:03   ` Wathsala Wathawana Vithanage
2025-05-26 11:54     ` Konstantin Ananyev
2025-05-27 21:33       ` Wathsala Wathawana Vithanage
2025-05-27 22:47         ` Wathsala Wathawana Vithanage
2025-05-21 11:14 ` [PATCH v1 4/4] config/x86: enable RTE_USE_C11_MEM_MODEL by default Konstantin Ananyev
2025-05-21 19:47   ` Morten Brørup

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).