DPDK patches and discussions
 help / color / Atom feed
* [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange
@ 2019-06-23  2:41 Phil Yang
  2019-06-23  2:41 ` [dpdk-dev] [PATCH v1 2/3] test/atomic: add 128b compare and swap test Phil Yang
                   ` (7 more replies)
  0 siblings, 8 replies; 91+ messages in thread
From: Phil Yang @ 2019-06-23  2:41 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Add 128-bit atomic compare exchange on aarch64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
This patch depends on
http://patchwork.dpdk.org/patch/54840/

v2:
eal/stack: fix 'pointer-sign' warning

 .../common/include/arch/arm/rte_atomic_64.h        | 184 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  15 +-
 3 files changed, 198 insertions(+), 13 deletions(-)


diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..4ef900c 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,186 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*----------------------- 128 bit atomic operations -------------------------*/
+/* Split memory order 'mo' into its load (acquire) and store (release) side. */
+#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define RTE_HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || \
+			 (mo) == __ATOMIC_ACQ_REL || \
+			 (mo) == __ATOMIC_SEQ_CST)
+
+#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
+		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define RTE_MO_STORE(mo) (RTE_HAS_RLS((mo)) \
+		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#ifdef __ARM_FEATURE_ATOMICS
+static inline rte_int128_t
+__rte_casp(rte_int128_t *dst, rte_int128_t old, rte_int128_t updated, int mo)
+{
+	/* LSE 128-bit CAS; returns the value read from *dst. CONSUME maps to
+	 * acquire and SEQ_CST to acquire+release, as on the LL/SC path.
+	 * CASP register pairs must start at an even-numbered register, so
+	 * pin the operands to x0/x1 and x2/x3.
+	 */
+	register uint64_t x0 __asm ("x0") = (uint64_t)old.val[0];
+	register uint64_t x1 __asm ("x1") = (uint64_t)old.val[1];
+	register uint64_t x2 __asm ("x2") = (uint64_t)updated.val[0];
+	register uint64_t x3 __asm ("x3") = (uint64_t)updated.val[1];
+
+	if (mo == __ATOMIC_RELAXED) {
+		asm volatile(
+				"casp %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"
+				: [old0] "+r" (x0),
+				  [old1] "+r" (x1)
+				: [upd0] "r" (x2),
+				  [upd1] "r" (x3),
+				  [dst] "r" (dst)
+				: "memory");
+	} else if (mo == __ATOMIC_ACQUIRE || mo == __ATOMIC_CONSUME) {
+		asm volatile(
+				"caspa %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"
+				: [old0] "+r" (x0),
+				  [old1] "+r" (x1)
+				: [upd0] "r" (x2),
+				  [upd1] "r" (x3),
+				  [dst] "r" (dst)
+				: "memory");
+	} else if (mo == __ATOMIC_ACQ_REL || mo == __ATOMIC_SEQ_CST) {
+		asm volatile(
+				"caspal %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"
+				: [old0] "+r" (x0),
+				  [old1] "+r" (x1)
+				: [upd0] "r" (x2),
+				  [upd1] "r" (x3),
+				  [dst] "r" (dst)
+				: "memory");
+	} else if (mo == __ATOMIC_RELEASE) {
+		asm volatile(
+				"caspl %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"
+				: [old0] "+r" (x0),
+				  [old1] "+r" (x1)
+				: [upd0] "r" (x2),
+				  [upd1] "r" (x3),
+				  [dst] "r" (dst)
+				: "memory");
+	} else {
+		rte_panic("Invalid memory order\n");
+	}
+
+	old.val[0] = x0;
+	old.val[1] = x1;
+
+	return old;
+}
+#else
+static inline rte_int128_t
+__rte_ldx128(const rte_int128_t *src, int mo)
+{
+	rte_int128_t ret;
+	if (mo == __ATOMIC_ACQUIRE)
+		asm volatile(
+				"ldaxp %0, %1, %2"
+				: "=&r" (ret.val[0]),
+				  "=&r" (ret.val[1])
+				: "Q" (src->val[0])
+				: "memory");
+	else if (mo == __ATOMIC_RELAXED)
+		asm volatile(
+				"ldxp %0, %1, %2"
+				: "=&r" (ret.val[0]),
+				  "=&r" (ret.val[1])
+				: "Q" (src->val[0])
+				: "memory");
+	else
+		rte_panic("Invalid memory order\n");
+	/* Only atomic once a following __rte_stx128() to *src succeeds. */
+	return ret;
+}
+
+static inline uint32_t
+__rte_stx128(rte_int128_t *dst, const rte_int128_t src, int mo)
+{
+	uint32_t ret;
+	if (mo == __ATOMIC_RELEASE)
+		asm volatile(
+				"stlxp %w0, %1, %2, %3"
+				: "=&r" (ret)
+				: "r" (src.val[0]),
+				  "r" (src.val[1]),
+				  "Q" (dst->val[0])
+				: "memory");
+	else if (mo == __ATOMIC_RELAXED)
+		asm volatile(
+				"stxp %w0, %1, %2, %3"
+				: "=&r" (ret)
+				: "r" (src.val[0]),
+				  "r" (src.val[1]),
+				  "Q" (dst->val[0])
+				: "memory");
+	else
+		rte_panic("Invalid memory order\n");
+
+	/* Exclusive-store status: 0 = stored, 1 = failed, caller must retry */
+	return ret;
+}
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do a strong CAS; 'weak' is ignored. */
+	RTE_SET_USED(weak);
+	/* The failure memory order is ignored: the success order must be
+	 * at least as strong, so it is applied to both outcomes.
+	 */
+	RTE_SET_USED(failure);
+
+#ifdef __ARM_FEATURE_ATOMICS
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	old = __rte_casp(dst, expected, desired, success);
+#else
+	int ldx_mo = RTE_MO_LOAD(success);
+	int stx_mo = RTE_MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* The exclusive load pair is not guaranteed to be atomic on its own;
+	 * a successful exclusive store of src or old confirms that it was.
+	 */
+	do {
+		old = __rte_ldx128(dst, ldx_mo);
+		if (likely(old.int128 == expected.int128))
+			ret = __rte_stx128(dst, desired, stx_mo);
+		else
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain the
+			 * atomically read value of dst. This means, 'old' needs
+			 * to be stored back to ensure it was read atomically.
+			 */
+			ret = __rte_stx128(dst, old, stx_mo);
+	} while (unlikely(ret));
+#endif
+
+	/* Write *exp back unconditionally to avoid a branch: on success
+	 * 'old' equals what *exp already holds, otherwise it is the value
+	 * read from *dst. The local copy 'expected' keeps the caller's
+	 * original value for the comparison below.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index 6232c57..23cf48f 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -212,18 +212,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 static inline int __rte_experimental
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
 			   rte_int128_t *exp,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 9958543..7dd1aa4 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,18 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 16-byte aligned 128-bit integer, viewable as two uint64_t or one __int128.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+		__extension__ __int128 int128;
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1105,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v1 2/3] test/atomic: add 128b compare and swap test
  2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
@ 2019-06-23  2:41 ` Phil Yang
  2019-06-23  2:41 ` [dpdk-dev] [PATCH v1 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-06-23  2:41 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 app/test/test_atomic.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 1 deletion(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..b248063 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,6 +62,22 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores wait for a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
@@ -73,6 +90,10 @@ static rte_atomic64_t a64;
 static rte_atomic64_t count;
 static rte_atomic32_t synchro;
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+#endif
+
 static int
 test_atomic_usual(__attribute__((unused)) void *arg)
 {
@@ -216,6 +237,71 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+/*
+ * Each successful rte_atomic128_cmp_exchange() call adds 2 to the first 64
+ * bits of the shared 128-bit counter and 1 to the second 64 bits; the call
+ * returns non-zero when the exchange succeeded and is retried otherwise.
+ * The test runs N rounds, and each round performs one compare and swap
+ * with each of four memory orders (acquire, release, acq_rel, relaxed).
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +426,37 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of the rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Each successful 128-bit
+	 * atomic compare and swap operation updates the global 128-bit
+	 * counter by 2 for the first 64 bits and by 1 for the second 64
+	 * bits. Each slave core iterates this test 10K times.
+	 * At the end of the test, verify that the first 64 bits of the
+	 * 128-bit counter and the second 64 bits differ by the total number
+	 * of iterations. If they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL, SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v1 3/3] eal/stack: enable lock-free stack for aarch64
  2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2019-06-23  2:41 ` [dpdk-dev] [PATCH v1 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-06-23  2:41 ` Phil Yang
  2019-06-23  3:15 ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-06-23  2:41 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 doc/guides/rel_notes/release_19_08.rst  | 3 +++
 lib/librte_stack/rte_stack_lf_c11.h     | 4 ++--
 lib/librte_stack/rte_stack_lf_generic.h | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 8c3932d..b79ae28 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -88,6 +88,9 @@ New Features
   * Added multi-queue support to allow one af_xdp vdev with multiple netdev
     queues
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..67c21fd 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(first);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
@@ -88,7 +88,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(obj_table);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..488fd9f 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(first);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
@@ -84,7 +84,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(obj_table);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2019-06-23  2:41 ` [dpdk-dev] [PATCH v1 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-06-23  2:41 ` [dpdk-dev] [PATCH v1 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-06-23  3:15 ` Phil Yang
  2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 2/3] test/atomic: add 128b compare and swap test Phil Yang
                     ` (2 more replies)
  2019-06-28  8:11 ` [dpdk-dev] [PATCH v3 " Phil Yang
                   ` (4 subsequent siblings)
  7 siblings, 3 replies; 91+ messages in thread
From: Phil Yang @ 2019-06-23  3:15 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Add 128-bit atomic compare exchange on aarch64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
This patch depends on 'eal/stack: fix 'pointer-sign' warning'
http://patchwork.dpdk.org/patch/54840/

v2:
Fixed coding style warning.

 .../common/include/arch/arm/rte_atomic_64.h        | 184 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  15 +-
 3 files changed, 198 insertions(+), 13 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..ae29ce6 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,186 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*----------------------- 128 bit atomic operations -------------------------*/
+/* Split memory order 'mo' into its load (acquire) and store (release) side. */
+#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define RTE_HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || \
+			 (mo) == __ATOMIC_ACQ_REL || \
+			 (mo) == __ATOMIC_SEQ_CST)
+
+#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
+		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define RTE_MO_STORE(mo) (RTE_HAS_RLS((mo)) \
+		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#ifdef __ARM_FEATURE_ATOMICS
+static inline rte_int128_t
+__rte_casp(rte_int128_t *dst, rte_int128_t old, rte_int128_t updated, int mo)
+{
+	/* LSE 128-bit CAS; returns the value read from *dst. CONSUME maps to
+	 * acquire and SEQ_CST to acquire+release, as on the LL/SC path.
+	 * CASP register pairs must start at an even-numbered register, so
+	 * pin the operands to x0/x1 and x2/x3.
+	 */
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];
+
+	if (mo == __ATOMIC_RELAXED) {
+		asm volatile(
+				"casp %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"
+				: [old0] "+r" (x0),
+				  [old1] "+r" (x1)
+				: [upd0] "r" (x2),
+				  [upd1] "r" (x3),
+				  [dst] "r" (dst)
+				: "memory");
+	} else if (mo == __ATOMIC_ACQUIRE || mo == __ATOMIC_CONSUME) {
+		asm volatile(
+				"caspa %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"
+				: [old0] "+r" (x0),
+				  [old1] "+r" (x1)
+				: [upd0] "r" (x2),
+				  [upd1] "r" (x3),
+				  [dst] "r" (dst)
+				: "memory");
+	} else if (mo == __ATOMIC_ACQ_REL || mo == __ATOMIC_SEQ_CST) {
+		asm volatile(
+				"caspal %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"
+				: [old0] "+r" (x0),
+				  [old1] "+r" (x1)
+				: [upd0] "r" (x2),
+				  [upd1] "r" (x3),
+				  [dst] "r" (dst)
+				: "memory");
+	} else if (mo == __ATOMIC_RELEASE) {
+		asm volatile(
+				"caspl %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"
+				: [old0] "+r" (x0),
+				  [old1] "+r" (x1)
+				: [upd0] "r" (x2),
+				  [upd1] "r" (x3),
+				  [dst] "r" (dst)
+				: "memory");
+	} else {
+		rte_panic("Invalid memory order\n");
+	}
+
+	old.val[0] = x0;
+	old.val[1] = x1;
+
+	return old;
+}
+#else
+static inline rte_int128_t
+__rte_ldx128(const rte_int128_t *src, int mo)
+{
+	rte_int128_t ret;
+	if (mo == __ATOMIC_ACQUIRE)
+		asm volatile(
+				"ldaxp %0, %1, %2"
+				: "=&r" (ret.val[0]),
+				  "=&r" (ret.val[1])
+				: "Q" (src->val[0])
+				: "memory");
+	else if (mo == __ATOMIC_RELAXED)
+		asm volatile(
+				"ldxp %0, %1, %2"
+				: "=&r" (ret.val[0]),
+				  "=&r" (ret.val[1])
+				: "Q" (src->val[0])
+				: "memory");
+	else
+		rte_panic("Invalid memory order\n");
+	/* Only atomic once a following __rte_stx128() to *src succeeds. */
+	return ret;
+}
+
+static inline uint32_t
+__rte_stx128(rte_int128_t *dst, const rte_int128_t src, int mo)
+{
+	uint32_t ret;
+	if (mo == __ATOMIC_RELEASE)
+		asm volatile(
+				"stlxp %w0, %1, %2, %3"
+				: "=&r" (ret)
+				: "r" (src.val[0]),
+				  "r" (src.val[1]),
+				  "Q" (dst->val[0])
+				: "memory");
+	else if (mo == __ATOMIC_RELAXED)
+		asm volatile(
+				"stxp %w0, %1, %2, %3"
+				: "=&r" (ret)
+				: "r" (src.val[0]),
+				  "r" (src.val[1]),
+				  "Q" (dst->val[0])
+				: "memory");
+	else
+		rte_panic("Invalid memory order\n");
+
+	/* Exclusive-store status: 0 = stored, 1 = failed, caller must retry */
+	return ret;
+}
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do a strong CAS; 'weak' is ignored. */
+	RTE_SET_USED(weak);
+	/* The failure memory order is ignored: the success order must be
+	 * at least as strong, so it is applied to both outcomes.
+	 */
+	RTE_SET_USED(failure);
+
+#ifdef __ARM_FEATURE_ATOMICS
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	old = __rte_casp(dst, expected, desired, success);
+#else
+	int ldx_mo = RTE_MO_LOAD(success);
+	int stx_mo = RTE_MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* The exclusive load pair is not guaranteed to be atomic on its own;
+	 * a successful exclusive store of src or old confirms that it was.
+	 */
+	do {
+		old = __rte_ldx128(dst, ldx_mo);
+		if (likely(old.int128 == expected.int128))
+			ret = __rte_stx128(dst, desired, stx_mo);
+		else
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain the
+			 * atomically read value of dst. This means, 'old' needs
+			 * to be stored back to ensure it was read atomically.
+			 */
+			ret = __rte_stx128(dst, old, stx_mo);
+	} while (unlikely(ret));
+#endif
+
+	/* Write *exp back unconditionally to avoid a branch: on success
+	 * 'old' equals what *exp already holds, otherwise it is the value
+	 * read from *dst. The local copy 'expected' keeps the caller's
+	 * original value for the comparison below.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index 6232c57..23cf48f 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -212,18 +212,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 static inline int __rte_experimental
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
 			   rte_int128_t *exp,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 9958543..7dd1aa4 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,18 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 16-byte aligned 128-bit integer, viewable as two uint64_t or one __int128.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+		__extension__ __int128 int128;
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1105,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v2 2/3] test/atomic: add 128b compare and swap test
  2019-06-23  3:15 ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
@ 2019-06-23  3:15   ` Phil Yang
  2019-06-24 15:09     ` Eads, Gage
  2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-06-24 14:46   ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Eads, Gage
  2 siblings, 1 reply; 91+ messages in thread
From: Phil Yang @ 2019-06-23  3:15 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 app/test/test_atomic.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 119 insertions(+), 1 deletion(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..da09bc4 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,6 +62,22 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores wait for a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
@@ -73,6 +90,10 @@ static rte_atomic64_t a64;
 static rte_atomic64_t count;
 static rte_atomic32_t synchro;
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+#endif
+
 static int
 test_atomic_usual(__attribute__((unused)) void *arg)
 {
@@ -216,6 +237,72 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+/*
+ * Each successful rte_atomic128_cmp_exchange() call adds 2 to the first 64
+ * bits of the shared 128-bit counter and 1 to the second 64 bits; the call
+ * returns non-zero when the exchange succeeded and is retried otherwise.
+ * The test runs N rounds, and each round performs one compare and swap
+ * with each of four memory orders (acquire, release, acq_rel, relaxed).
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +427,37 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of the rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Each time a 128-bit atomic
+	 * compare and swap operation succeeds, it updates the global
+	 * 128-bit counter by 2 for the first 64 bits and by 1 for the second
+	 * 64 bits. Each slave core iterates this test 10K times.
+	 * At the end of the test, verify that the first 64 bits of the 128-bit
+	 * counter and the second 64 bits differ by the total number of
+	 * iterations. If they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL, SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64
  2019-06-23  3:15 ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-06-23  3:15   ` Phil Yang
  2019-06-24 15:15     ` Eads, Gage
  2019-06-24 14:46   ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Eads, Gage
  2 siblings, 1 reply; 91+ messages in thread
From: Phil Yang @ 2019-06-23  3:15 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 doc/guides/rel_notes/release_19_08.rst  | 3 +++
 lib/librte_stack/rte_stack_lf_c11.h     | 4 ++--
 lib/librte_stack/rte_stack_lf_generic.h | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 8c3932d..b79ae28 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -88,6 +88,9 @@ New Features
   * Added multi-queue support to allow one af_xdp vdev with multiple netdev
     queues
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..67c21fd 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(first);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
@@ -88,7 +88,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(obj_table);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..488fd9f 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(first);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
@@ -84,7 +84,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(obj_table);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-23  3:15 ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-06-24 14:46   ` Eads, Gage
  2019-06-24 15:35     ` Phil Yang (Arm Technology China)
  2 siblings, 1 reply; 91+ messages in thread
From: Eads, Gage @ 2019-06-24 14:46 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Hi Phil,

> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> index 9958543..7dd1aa4 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -1081,6 +1081,18 @@ static inline void
> rte_atomic64_clear(rte_atomic64_t *v)
> 
>  /*------------------------ 128 bit atomic operations -------------------------*/
> 
> +/**
> + * 128-bit integer structure.
> + */
> +RTE_STD_C11
> +typedef struct {
> +	RTE_STD_C11
> +	union {
> +		uint64_t val[2];
> +		__extension__ __int128 int128;
> +	};
> +} __rte_aligned(16) rte_int128_t;
> +
>  #ifdef __DOXYGEN__
> 

This change breaks 32-bit x86 builds*. A couple ways to resolve this are 1) with RTE_ARCH_* ifdefs, or 2) keep duplicate definitions of the struct in the aarch64 and x86 header files.

Thanks,
Gage

*http://mails.dpdk.org/archives/test-report/2019-June/086586.html

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/3] test/atomic: add 128b compare and swap test
  2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-06-24 15:09     ` Eads, Gage
  2019-06-24 15:29       ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: Eads, Gage @ 2019-06-24 15:09 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Hi Phil,

Looks good overall, just a few documentation issues.

<snip>

> + *
> + * - Test "128b compare and swap" (aarch64 and x86_64 only)
> + *
> + *   - Initialize 128-bit atomic variables to zero.
> + *
> + *   - Invoke ``test_atomici128_cmp_exchange()`` on each lcore. Before
> doing
> + *     anything else, the cores are waiting a synchro. Each lcore does
> + *     these compare and swap (CAS) operations several times::
> + *
> + *       Relaxed CAS update counter.val[0] + 2; counter.val[0] + 1;
> + *       Acquired CAS update counter.val[0] + 2; counter.val[0] + 1;
> + *       Released CAS update counter.val[0] + 2; counter.val[0] + 1;
> + *       Acquired_Released CAS update counter.val[0] + 2; counter.val[0] + 1;

The array index in "counter.val[0] + 1", is incorrect, I believe.

Just a nitpick, but "Relaxed CAS update" can go last to match the order in the code.

<snip>

> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> +/*
> + * rte_atomic128_cmp_exchange() should update a 128 bits counter's
> +first 64
> + * bits by 2 and the second 64 bits by 1 in this test. It should return
> +true
> + * if the compare exchange operation successful.

"operation successful" -> "operation is successful"

> + * This test repeat 128 bits compare and swap operations 10K rounds. In
> +each

"repeat" -> "repeats"

Thanks,
Gage

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64
  2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-06-24 15:15     ` Eads, Gage
  2019-06-24 15:22       ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: Eads, Gage @ 2019-06-24 15:15 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> Subject: [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64
> 
> Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
>  doc/guides/rel_notes/release_19_08.rst  | 3 +++
>  lib/librte_stack/rte_stack_lf_c11.h     | 4 ++--
>  lib/librte_stack/rte_stack_lf_generic.h | 4 ++--
>  3 files changed, 7 insertions(+), 4 deletions(-)
> 

Please update doc/guides/prog_guide/env_abstraction_layer.rst as well -- it states that the lock-free stack is "currently limited to the x86_64 platform."

Thanks,
Gage

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64
  2019-06-24 15:15     ` Eads, Gage
@ 2019-06-24 15:22       ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-06-24 15:22 UTC (permalink / raw)
  To: Eads, Gage, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Eads, Gage <gage.eads@intel.com>
> Sent: Monday, June 24, 2019 11:15 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; jerinj@marvell.com; hemant.agrawal@nxp.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64
> 
> > Subject: [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64
> >
> > Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > ---
> >  doc/guides/rel_notes/release_19_08.rst  | 3 +++
> >  lib/librte_stack/rte_stack_lf_c11.h     | 4 ++--
> >  lib/librte_stack/rte_stack_lf_generic.h | 4 ++--
> >  3 files changed, 7 insertions(+), 4 deletions(-)
> >
> 
> Please update doc/guides/prog_guide/env_abstraction_layer.rst as well -- it
> states that the lock-free stack is "currently limited to the x86_64 platform."
Thanks, Gage. I will update it in next version.

> Thanks,
> Gage

Thanks,
Phil

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/3] test/atomic: add 128b compare and swap test
  2019-06-24 15:09     ` Eads, Gage
@ 2019-06-24 15:29       ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-06-24 15:29 UTC (permalink / raw)
  To: Eads, Gage, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Eads, Gage <gage.eads@intel.com>
> Sent: Monday, June 24, 2019 11:10 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; jerinj@marvell.com; hemant.agrawal@nxp.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v2 2/3] test/atomic: add 128b compare and swap test
> 
> Hi Phil,
> 
> Looks good overall, just a few documentation issues.
> 
> <snip>
> 
> > + *
> > + * - Test "128b compare and swap" (aarch64 and x86_64 only)
> > + *
> > + *   - Initialize 128-bit atomic variables to zero.
> > + *
> > + *   - Invoke ``test_atomici128_cmp_exchange()`` on each lcore. Before
> > doing
> > + *     anything else, the cores are waiting a synchro. Each lcore does
> > + *     these compare and swap (CAS) operations several times::
> > + *
> > + *       Relaxed CAS update counter.val[0] + 2; counter.val[0] + 1;
> > + *       Acquired CAS update counter.val[0] + 2; counter.val[0] + 1;
> > + *       Released CAS update counter.val[0] + 2; counter.val[0] + 1;
> > + *       Acquired_Released CAS update counter.val[0] + 2; counter.val[0] + 1;
> 
Hi Gage,

> The array index in "counter.val[0] + 1", is incorrect, I believe.
Yes, you are correct. I will fix it. It should be 'counter.val[1] + 1'.

> 
> Just a nitpick, but "Relaxed CAS update" can go last to match the order in the
> code.
Sure. Thank you for your correction.

> 
> <snip>
> 
> > +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> > +/*
> > + * rte_atomic128_cmp_exchange() should update a 128 bits counter's
> > +first 64
> > + * bits by 2 and the second 64 bits by 1 in this test. It should
> > +return true
> > + * if the compare exchange operation successful.
> 
> "operation successful" -> "operation is successful"
Yes.

> 
> > + * This test repeat 128 bits compare and swap operations 10K rounds.
> > +In each
> 
> "repeat" -> "repeats"
Yes.

> 
> Thanks,
> Gage

Thanks,
Phil

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-24 14:46   ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Eads, Gage
@ 2019-06-24 15:35     ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-06-24 15:35 UTC (permalink / raw)
  To: Eads, Gage, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Eads, Gage <gage.eads@intel.com>
> Sent: Monday, June 24, 2019 10:46 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; jerinj@marvell.com; hemant.agrawal@nxp.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange
> 
> Hi Phil,
> 
> > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > index 9958543..7dd1aa4 100644
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > @@ -1081,6 +1081,18 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t *v)
> >
> >  /*------------------------ 128 bit atomic operations
> > -------------------------*/
> >
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +RTE_STD_C11
> > +typedef struct {
> > +	RTE_STD_C11
> > +	union {
> > +		uint64_t val[2];
> > +		__extension__ __int128 int128;
> > +	};
> > +} __rte_aligned(16) rte_int128_t;
> > +
> >  #ifdef __DOXYGEN__
> >
> 
Hi Gage,

> This change breaks 32-bit x86 builds*. A couple ways to resolve this are 1)
> with RTE_ARCH_* ifdefs, or 2) keep duplicate definitions of the struct in the
> aarch64 and x86 header files.
OK. Let's follow the first approach. I will update it in the new version. Thanks!

> 
> Thanks,
> Gage
> 
> *http://mails.dpdk.org/archives/test-report/2019-June/086586.html

Thanks,
Phil

^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
                   ` (2 preceding siblings ...)
  2019-06-23  3:15 ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
@ 2019-06-28  8:11 ` " Phil Yang
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 2/3] test/atomic: add 128b compare and swap test Phil Yang
                     ` (3 more replies)
  2019-07-22  8:44 ` [dpdk-dev] [PATCH v4 " Phil Yang
                   ` (3 subsequent siblings)
  7 siblings, 4 replies; 91+ messages in thread
From: Phil Yang @ 2019-06-28  8:11 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Add 128-bit atomic compare exchange on aarch64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
v3:
1. Avoid code duplication by using macros. (Jerin Jacob)
2. Treat an invalid memory order as the strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix the 32-bit x86 build issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

 .../common/include/arch/arm/rte_atomic_64.h        | 165 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
 3 files changed, 181 insertions(+), 13 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..2080c4d 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,167 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define RTE_HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || \
+			 (mo) == __ATOMIC_ACQ_REL || \
+			 (mo) == __ATOMIC_SEQ_CST)
+
+#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
+		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define RTE_MO_STORE(mo) (RTE_HAS_RLS((mo)) \
+		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#ifdef __ARM_FEATURE_ATOMICS
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                               \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                     \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];                \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];            \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];            \
+	asm volatile(                                                           \
+			op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"       \
+			: [old0] "+r" (x0),                                             \
+			  [old1] "+r" (x1)                                              \
+			: [upd0] "r" (x2),                                              \
+			  [upd1] "r" (x3),                                              \
+			  [dst] "r" (dst)                                               \
+			: "memory");                                                    \
+	old.val[0] = x0;                                                        \
+	old.val[1] = x1;                                                        \
+	return old;                                                             \
+}
+
+__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
+__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
+__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
+__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
+#else
+#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+ldx_op_name(const rte_int128_t *src)                                        \
+{                                                                           \
+	rte_int128_t ret;                                                       \
+	asm volatile(                                                           \
+			op_string " %0, %1, %2"                                         \
+			: "=&r" (ret.val[0]),                                           \
+			  "=&r" (ret.val[1])                                            \
+			: "Q" (src->val[0])                                             \
+			: "memory");                                                    \
+	return ret;                                                             \
+}
+
+__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
+__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
+
+#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
+static inline uint32_t                                                      \
+stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
+{                                                                           \
+	uint32_t ret;                                                           \
+	asm volatile(                                                           \
+			op_string " %w0, %1, %2, %3"                                    \
+			: "=&r" (ret)                                                   \
+			: "r" (src.val[0]),                                             \
+			  "r" (src.val[1]),                                             \
+			  "Q" (dst->val[0])                                             \
+			: "memory");                                                    \
+	/* Return 0 on success, 1 on failure */                                 \
+	return ret;                                                             \
+}
+
+__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
+__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+#ifdef __ARM_FEATURE_ATOMICS
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	if (success == __ATOMIC_RELAXED)
+		old = __rte_cas_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __rte_cas_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __rte_cas_release(dst, expected, desired);
+	else
+		old = __rte_cas_acq_rel(dst, expected, desired);
+#else
+	int ldx_mo = RTE_MO_LOAD(success);
+	int stx_mo = RTE_MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* A 128-bit exclusive load is not guaranteed to be atomic by itself;
+	 * src or old must be stored back successfully to confirm atomicity.
+	 */
+	do {
+		if (ldx_mo == __ATOMIC_RELAXED)
+			old = __rte_ldx_relaxed(dst);
+		else
+			old = __rte_ldx_acquire(dst);
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, desired);
+			else
+				ret = __rte_stx_release(dst, desired);
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain the
+			 * atomically read value of dst. This means, 'old' needs
+			 * to be stored back to ensure it was read atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, old);
+			else
+				ret = __rte_stx_release(dst, old);
+		}
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally updating expected removes
+	 * an 'if' statement.
+	 * expected should already be in register if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index 6232c57..23cf48f 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -212,18 +212,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 static inline int __rte_experimental
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
 			   rte_int128_t *exp,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 9958543..2355e50 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+		__extension__ __int128 int128;
+	};
+} __rte_aligned(16) rte_int128_t;
+#endif
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v3 2/3] test/atomic: add 128b compare and swap test
  2019-06-28  8:11 ` [dpdk-dev] [PATCH v3 " Phil Yang
@ 2019-06-28  8:11   ` Phil Yang
  2019-06-29  0:17     ` Eads, Gage
  2019-07-19  4:03     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 91+ messages in thread
From: Phil Yang @ 2019-06-28  8:11 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 app/test/test_atomic.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..78541e0 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores wait for a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -73,6 +90,10 @@ static rte_atomic64_t a64;
 static rte_atomic64_t count;
 static rte_atomic32_t synchro;
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+#endif
+
 static int
 test_atomic_usual(__attribute__((unused)) void *arg)
 {
@@ -216,6 +237,72 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats the 128-bit compare and swap operations N rounds. In each
+ * iteration it runs the compare and swap with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +427,37 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of the rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Each successful 128-bit
+	 * atomic compare and swap operation updates the global 128-bit
+	 * counter by 2 for the first 64 bits and by 1 for the second 64
+	 * bits. Each slave core iterates this test N times.
+	 * At the end of the test, verify that the first 64 bits of the
+	 * 128-bit counter exceed the second 64 bits by the total number of
+	 * operations (4 * N * number of slave cores). If so, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL, SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64
  2019-06-28  8:11 ` [dpdk-dev] [PATCH v3 " Phil Yang
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-06-28  8:11   ` Phil Yang
  2019-06-29  0:18     ` Eads, Gage
  2019-07-19  4:18     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
  2019-07-03 12:25   ` [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
  2019-07-19  6:24   ` Jerin Jacob Kollanukkaran
  3 siblings, 2 replies; 91+ messages in thread
From: Phil Yang @ 2019-06-28  8:11 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu,
	nd, gage.eads

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 doc/guides/prog_guide/env_abstraction_layer.rst | 4 ++--
 doc/guides/rel_notes/release_19_08.rst          | 3 +++
 lib/librte_stack/rte_stack_lf_c11.h             | 4 ++--
 lib/librte_stack/rte_stack_lf_generic.h         | 4 ++--
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index f15bcd9..d569f95 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -592,8 +592,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 3da2667..e2e00b9 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -99,6 +99,9 @@ New Features
   Updated ``librte_telemetry`` to fetch the global metrics from the
   ``librte_metrics`` library.
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..67c21fd 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(first);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
@@ -88,7 +88,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(obj_table);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..488fd9f 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(first);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
@@ -84,7 +84,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
+#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
 	RTE_SET_USED(obj_table);
 	RTE_SET_USED(last);
 	RTE_SET_USED(list);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/3] test/atomic: add 128b compare and swap test
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-06-29  0:17     ` Eads, Gage
  2019-07-19  4:03     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
  1 sibling, 0 replies; 91+ messages in thread
From: Eads, Gage @ 2019-06-29  0:17 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd



> Add 128b atomic compare and swap test for aarch64 and x86_64.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

Acked-by: Gage Eads <gage.eads@intel.com>

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-06-29  0:18     ` Eads, Gage
  2019-07-19  4:18     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
  1 sibling, 0 replies; 91+ messages in thread
From: Eads, Gage @ 2019-06-29  0:18 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

Acked-by: Gage Eads <gage.eads@intel.com>

Thanks,
Gage

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-28  8:11 ` [dpdk-dev] [PATCH v3 " Phil Yang
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-07-03 12:25   ` Jerin Jacob Kollanukkaran
  2019-07-03 13:07     ` Jerin Jacob Kollanukkaran
  2019-07-19  6:24   ` Jerin Jacob Kollanukkaran
  3 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-03 12:25 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd, gage.eads

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Friday, June 28, 2019 1:42 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> gavin.hu@arm.com; nd@arm.com; gage.eads@intel.com
> Subject: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
> 
> Add 128-bit atomic compare exchange on aarch64.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
> v3:
> 1. Avoid duplication code with macro. (Jerin Jocob) 2. Make invalid memory
> order to strongest barrier. (Jerin Jocob) 3. Update
> doc/guides/prog_guide/env_abstraction_layer.rst. (Eads Gage) 4. Fix 32-bit x86
> builds issue. (Eads Gage) 5. Correct documentation issues in UT. (Eads Gage)
> 
>  .../common/include/arch/arm/rte_atomic_64.h        | 165
> +++++++++++++++++++++
>  .../common/include/arch/x86/rte_atomic_64.h        |  12 --
>  lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
>  3 files changed, 181 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> index 97060e4..2080c4d 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2015 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
> 
>  #ifndef _RTE_ATOMIC_ARM64_H_
> @@ -14,6 +15,9 @@ extern "C" {
>  #endif
> 
>  #include "generic/rte_atomic.h"
> +#include <rte_branch_prediction.h>
> +#include <rte_compat.h>
> +#include <rte_debug.h>
> 
>  #define dsb(opt) asm volatile("dsb " #opt : : : "memory")  #define dmb(opt) asm
> volatile("dmb " #opt : : : "memory") @@ -40,6 +44,167 @@ extern "C" {
> 
>  #define rte_cio_rmb() dmb(oshld)
> 
> +/*------------------------ 128 bit atomic operations
> +-------------------------*/
> +
> +#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> +__ATOMIC_RELEASE) #define RTE_HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE
> || \
> +			 (mo) == __ATOMIC_ACQ_REL || \
> +			 (mo) == __ATOMIC_SEQ_CST)
> +
> +#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
> +		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED) #define
> RTE_MO_STORE(mo)
> +(RTE_HAS_RLS((mo)) \
> +		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
> +
> +#ifdef __ARM_FEATURE_ATOMICS
> +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> +static inline rte_int128_t                                                  \
> +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> +		rte_int128_t updated)                                               \
> +{                                                                           \
> +	/* caspX instructions register pair must start from even-numbered
> +	 * register at operand 1.
> +	 * So, specify registers for local variables here.
> +	 */                                                                     \
> +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \

I understand the CASP limitation that the register pair has to start at an
even-numbered register. Is there any way to remove the explicit x0 register
allocation and let the compiler choose the registers? For some reason, with
optimize(O3), gcc generates correct code but clang does not.

Hardcoding specific registers prevents the compiler from optimizing the code,
especially when the function is inline.



^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-03 12:25   ` [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
@ 2019-07-03 13:07     ` Jerin Jacob Kollanukkaran
  2019-07-05  4:20       ` Honnappa Nagarahalli
  0 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-03 13:07 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd, gage.eads

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran
> Sent: Wednesday, July 3, 2019 5:56 PM
> To: Phil Yang <phil.yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com;
> gage.eads@intel.com
> Subject: RE: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Friday, June 28, 2019 1:42 PM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; hemant.agrawal@nxp.com;
> > Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com;
> > gage.eads@intel.com
> > Subject: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> > exchange
> >
> > Add 128-bit atomic compare exchange on aarch64.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > ---
> > v3:
> > 1. Avoid duplication code with macro. (Jerin Jocob) 2. Make invalid
> > memory order to strongest barrier. (Jerin Jocob) 3. Update
> > doc/guides/prog_guide/env_abstraction_layer.rst. (Eads Gage) 4. Fix
> > 32-bit x86 builds issue. (Eads Gage) 5. Correct documentation issues
> > in UT. (Eads Gage)
> >
> >  .../common/include/arch/arm/rte_atomic_64.h        | 165
> > +++++++++++++++++++++
> >  .../common/include/arch/x86/rte_atomic_64.h        |  12 --
> >  lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
> >  3 files changed, 181 insertions(+), 13 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> > b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> > index 97060e4..2080c4d 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2015 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_ATOMIC_ARM64_H_
> > @@ -14,6 +15,9 @@ extern "C" {
> >  #endif
> >
> >  #include "generic/rte_atomic.h"
> > +#include <rte_branch_prediction.h>
> > +#include <rte_compat.h>
> > +#include <rte_debug.h>
> >
> >  #define dsb(opt) asm volatile("dsb " #opt : : : "memory")  #define
> > dmb(opt) asm volatile("dmb " #opt : : : "memory") @@ -40,6 +44,167 @@
> > extern "C" {
> >
> >  #define rte_cio_rmb() dmb(oshld)
> >
> > +/*------------------------ 128 bit atomic operations
> > +-------------------------*/
> > +
> > +#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> > +__ATOMIC_RELEASE) #define RTE_HAS_RLS(mo) ((mo) ==
> __ATOMIC_RELEASE
> > || \
> > +			 (mo) == __ATOMIC_ACQ_REL || \
> > +			 (mo) == __ATOMIC_SEQ_CST)
> > +
> > +#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
> > +		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED) #define
> > RTE_MO_STORE(mo)
> > +(RTE_HAS_RLS((mo)) \
> > +		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
> > +
> > +#ifdef __ARM_FEATURE_ATOMICS
> > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> > +static inline rte_int128_t                                                  \
> > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> > +		rte_int128_t updated)                                               \
> > +{                                                                           \
> > +	/* caspX instructions register pair must start from even-numbered
> > +	 * register at operand 1.
> > +	 * So, specify registers for local variables here.
> > +	 */                                                                     \
> > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
> 
> I understand CASP limitation on register has to be even and odd.
> Is there anyway to remove explicit x0 register allocation and choose compiler to
> decide the register. Some reason with optimize(03) gcc makes correctly but not
> clang.
> 
> Hardcoding to specific register makes compiler to not optimize the stuff,
> especially if it is inline function.

It looks like the limitation was fixed recently in gcc.
https://patches.linaro.org/patch/147991/

Not sure about old gcc and clang. ARM compiler experts may know the exact status.



 


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-03 13:07     ` Jerin Jacob Kollanukkaran
@ 2019-07-05  4:20       ` Honnappa Nagarahalli
  2019-07-05  4:37         ` Pavan Nikhilesh Bhagavatula
  0 siblings, 1 reply; 91+ messages in thread
From: Honnappa Nagarahalli @ 2019-07-05  4:20 UTC (permalink / raw)
  To: jerinj, Phil Yang (Arm Technology China), dev
  Cc: thomas, hemant.agrawal, Gavin Hu (Arm Technology China),
	Honnappa Nagarahalli, nd, gage.eads, nd

<snip>

> > > Subject: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> > > exchange
> > >
> > > Add 128-bit atomic compare exchange on aarch64.
> > >
> > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > > ---
> > > v3:
> > > 1. Avoid duplication code with macro. (Jerin Jocob) 2. Make invalid
> > > memory order to strongest barrier. (Jerin Jocob) 3. Update
> > > doc/guides/prog_guide/env_abstraction_layer.rst. (Eads Gage) 4. Fix
> > > 32-bit x86 builds issue. (Eads Gage) 5. Correct documentation issues
> > > in UT. (Eads Gage)
> > >
> > >  .../common/include/arch/arm/rte_atomic_64.h        | 165
> > > +++++++++++++++++++++
> > >  .../common/include/arch/x86/rte_atomic_64.h        |  12 --
> > >  lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
> > >  3 files changed, 181 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> > > b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> > > index 97060e4..2080c4d 100644
> > > --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> > > +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> > > @@ -1,5 +1,6 @@
> > >  /* SPDX-License-Identifier: BSD-3-Clause
> > >   * Copyright(c) 2015 Cavium, Inc
> > > + * Copyright(c) 2019 Arm Limited
> > >   */
> > >
> > >  #ifndef _RTE_ATOMIC_ARM64_H_
> > > @@ -14,6 +15,9 @@ extern "C" {
> > >  #endif
> > >
> > >  #include "generic/rte_atomic.h"
> > > +#include <rte_branch_prediction.h>
> > > +#include <rte_compat.h>
> > > +#include <rte_debug.h>
> > >
> > >  #define dsb(opt) asm volatile("dsb " #opt : : : "memory")  #define
> > > dmb(opt) asm volatile("dmb " #opt : : : "memory") @@ -40,6 +44,167
> > > @@ extern "C" {
> > >
> > >  #define rte_cio_rmb() dmb(oshld)
> > >
> > > +/*------------------------ 128 bit atomic operations
> > > +-------------------------*/
> > > +
> > > +#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> > > +__ATOMIC_RELEASE) #define RTE_HAS_RLS(mo) ((mo) ==
> > __ATOMIC_RELEASE
> > > || \
> > > +			 (mo) == __ATOMIC_ACQ_REL || \
> > > +			 (mo) == __ATOMIC_SEQ_CST)
> > > +
> > > +#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
> > > +		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED) #define
> > > RTE_MO_STORE(mo)
> > > +(RTE_HAS_RLS((mo)) \
> > > +		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
> > > +
> > > +#ifdef __ARM_FEATURE_ATOMICS
> > > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)
> \
> > > +static inline rte_int128_t                                                  \
> > > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> > > +		rte_int128_t updated)                                               \
> > > +{                                                                           \
> > > +	/* caspX instructions register pair must start from even-numbered
> > > +	 * register at operand 1.
> > > +	 * So, specify registers for local variables here.
> > > +	 */                                                                     \
> > > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
> >
> > I understand CASP limitation on register has to be even and odd.
> > Is there anyway to remove explicit x0 register allocation and choose
> > compiler to decide the register. Some reason with optimize(03) gcc
> > makes correctly but not clang.
> >
> > Hardcoding to specific register makes compiler to not optimize the
> > stuff, especially if it is inline function.
> 
> It look like the limitation fixed recently in gcc.
> https://patches.linaro.org/patch/147991/
> 
> Not sure about old gcc and clang. ARM compiler experts may know the exact
> status
> 
We could use syntax as follows, an example is in [1]
static inline rte_int128_t
__rte_casp(rte_int128_t *dst, rte_int128_t old, rte_int128_t updated, int mo)
{
		__asm__ volatile("caspl %0, %H0, %1, %H1, [%2]"
				 : "+r" (old)
				 : "r" (updated), "r" (dst)
				 : "memory");
	return old;       
}

[1] https://godbolt.org/z/EUJnuG

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-05  4:20       ` Honnappa Nagarahalli
@ 2019-07-05  4:37         ` Pavan Nikhilesh Bhagavatula
  2019-07-09  9:27           ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2019-07-05  4:37 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Jerin Jacob Kollanukkaran,
	Phil Yang (Arm Technology China),
	dev
  Cc: thomas, hemant.agrawal, Gavin Hu (Arm Technology China),
	nd, gage.eads, nd



>-----Original Message-----
>From: dev <dev-bounces@dpdk.org> On Behalf Of Honnappa
>Nagarahalli
>Sent: Friday, July 5, 2019 9:51 AM
>To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Phil Yang (Arm
>Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
>Cc: thomas@monjalon.net; hemant.agrawal@nxp.com; Gavin Hu (Arm
>Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
><Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>;
>gage.eads@intel.com; nd <nd@arm.com>
>Subject: Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit
>atomic compare exchange
>
><snip>
>
>> > > Subject: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic
>compare
>> > > exchange
>> > >
>> > > Add 128-bit atomic compare exchange on aarch64.
>> > >
>> > > Signed-off-by: Phil Yang <phil.yang@arm.com>
>> > > Tested-by: Honnappa Nagarahalli
><honnappa.nagarahalli@arm.com>
>> > > Reviewed-by: Honnappa Nagarahalli
><honnappa.nagarahalli@arm.com>
>> > > ---
>> > > v3:
>> > > 1. Avoid duplication code with macro. (Jerin Jocob) 2. Make invalid
>> > > memory order to strongest barrier. (Jerin Jocob) 3. Update
>> > > doc/guides/prog_guide/env_abstraction_layer.rst. (Eads Gage) 4.
>Fix
>> > > 32-bit x86 builds issue. (Eads Gage) 5. Correct documentation
>issues
>> > > in UT. (Eads Gage)
>> > >
>> > >  .../common/include/arch/arm/rte_atomic_64.h        | 165
>> > > +++++++++++++++++++++
>> > >  .../common/include/arch/x86/rte_atomic_64.h        |  12 --
>> > >  lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
>> > >  3 files changed, 181 insertions(+), 13 deletions(-)
>> > >
>> > > diff --git
>a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> > > b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> > > index 97060e4..2080c4d 100644
>> > > --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> > > +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> > > @@ -1,5 +1,6 @@
>> > >  /* SPDX-License-Identifier: BSD-3-Clause
>> > >   * Copyright(c) 2015 Cavium, Inc
>> > > + * Copyright(c) 2019 Arm Limited
>> > >   */
>> > >
>> > >  #ifndef _RTE_ATOMIC_ARM64_H_
>> > > @@ -14,6 +15,9 @@ extern "C" {
>> > >  #endif
>> > >
>> > >  #include "generic/rte_atomic.h"
>> > > +#include <rte_branch_prediction.h>
>> > > +#include <rte_compat.h>
>> > > +#include <rte_debug.h>
>> > >
>> > >  #define dsb(opt) asm volatile("dsb " #opt : : : "memory")  #define
>> > > dmb(opt) asm volatile("dmb " #opt : : : "memory") @@ -40,6
>+44,167
>> > > @@ extern "C" {
>> > >
>> > >  #define rte_cio_rmb() dmb(oshld)
>> > >
>> > > +/*------------------------ 128 bit atomic operations
>> > > +-------------------------*/
>> > > +
>> > > +#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED &&
>(mo) !=
>> > > +__ATOMIC_RELEASE) #define RTE_HAS_RLS(mo) ((mo) ==
>> > __ATOMIC_RELEASE
>> > > || \
>> > > +			 (mo) == __ATOMIC_ACQ_REL || \
>> > > +			 (mo) == __ATOMIC_SEQ_CST)
>> > > +
>> > > +#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
>> > > +		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED) #define
>> > > RTE_MO_STORE(mo)
>> > > +(RTE_HAS_RLS((mo)) \
>> > > +		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
>> > > +
>> > > +#ifdef __ARM_FEATURE_ATOMICS
>> > > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)
>> \
>> > > +static inline rte_int128_t                                                  \
>> > > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
>> > > +		rte_int128_t updated)                                               \
>> > > +{                                                                           \
>> > > +	/* caspX instructions register pair must start from even-
>numbered
>> > > +	 * register at operand 1.
>> > > +	 * So, specify registers for local variables here.
>> > > +	 */                                                                     \
>> > > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
>> >
>> > I understand CASP limitation on register has to be even and odd.
>> > Is there anyway to remove explicit x0 register allocation and choose
>> > compiler to decide the register. Some reason with optimize(03) gcc
>> > makes correctly but not clang.
>> >
>> > Hardcoding to specific register makes compiler to not optimize the
>> > stuff, especially if it is inline function.
>>
>> It look like the limitation fixed recently in gcc.
>> https://patches.linaro.org/patch/147991/
>>
>> Not sure about old gcc and clang. ARM compiler experts may know
>the exact
>> status
>>
>We could use syntax as follows, an example is in [1]
>static inline rte_int128_t
>__rte_casp(rte_int128_t *dst, rte_int128_t old, rte_int128_t updated,
>int mo)
>{
>		__asm__ volatile("caspl %0, %H0, %1, %H1, [%2]"
>				 : "+r" (old)
>				 : "r" (updated), "r" (dst)
>				 : "memory");
>	return old;
>}

We have used this format for mempool/octeontx2 but clang wasn't too happy.

dpdk/drivers/mempool/octeontx2/otx2_mempool_ops.c:151:15: error: value size does not match register size specified by the constraint and modifier [-Werror,-Wasm-operand-widths]
                [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
                            ^
dpdk/drivers/mempool/octeontx2/otx2_mempool_ops.c:82:9: note: use constraint modifier "w"
                "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"

Had to change it to hand coded asm

http://patches.dpdk.org/patch/56110/

>
>[1] https://godbolt.org/z/EUJnuG

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-05  4:37         ` Pavan Nikhilesh Bhagavatula
@ 2019-07-09  9:27           ` Phil Yang (Arm Technology China)
  2019-07-09 11:14             ` Jerin Jacob Kollanukkaran
  0 siblings, 1 reply; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-09  9:27 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Honnappa Nagarahalli, jerinj, dev
  Cc: thomas, hemant.agrawal, Gavin Hu (Arm Technology China),
	nd, gage.eads, nd, nd

> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>
> Sent: Friday, July 5, 2019 12:37 PM
> To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> jerinj@marvell.com; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; hemant.agrawal@nxp.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>;
> gage.eads@intel.com; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> 

<snip>

> >> > > +#ifdef __ARM_FEATURE_ATOMICS
> >> > > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)
> >> \
> >> > > +static inline rte_int128_t                                                  \
> >> > > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> >> > > +		rte_int128_t updated)                                               \
> >> > > +{                                                                           \
> >> > > +	/* caspX instructions register pair must start from even-
> >numbered
> >> > > +	 * register at operand 1.
> >> > > +	 * So, specify registers for local variables here.
> >> > > +	 */                                                                     \
> >> > > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];
> \
> >> >
> >> > I understand CASP limitation on register has to be even and odd.
> >> > Is there anyway to remove explicit x0 register allocation and choose
> >> > compiler to decide the register. Some reason with optimize(03) gcc
> >> > makes correctly but not clang.
> >> >
> >> > Hardcoding to specific register makes compiler to not optimize the
> >> > stuff, especially if it is inline function.
> >>
> >> It look like the limitation fixed recently in gcc.
> >> https://patches.linaro.org/patch/147991/
> >>
> >> Not sure about old gcc and clang. ARM compiler experts may know
> >the exact
> >> status
> >>
> >We could use syntax as follows, an example is in [1]
> >static inline rte_int128_t
> >__rte_casp(rte_int128_t *dst, rte_int128_t old, rte_int128_t updated,
> >int mo)
> >{
> >		__asm__ volatile("caspl %0, %H0, %1, %H1, [%2]"
> >				 : "+r" (old)
> >				 : "r" (updated), "r" (dst)
> >				 : "memory");
> >	return old;
> >}
> 
> We have used this format for mempool/octeontx2 but clang wasn't too
> happy.
> 
> dpdk/drivers/mempool/octeontx2/otx2_mempool_ops.c:151:15: error:
> value size does not match register size specified by the constraint and
> modifier [-Werror,-Wasm-operand-widths]
>                 [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
>                             ^
> dpdk/drivers/mempool/octeontx2/otx2_mempool_ops.c:82:9: note: use
> constraint modifier "w"
>                 "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
> 
> Had to change it to hand coded asm
> 
> http://patches.dpdk.org/patch/56110/

Hi Jerin,

The update from the compiler team is 'the LSE CASP fix has not been backported to older GCC branches'.
So, currently, this seems to be the only approach that works for all versions of GCC and Clang.
I think we can add another optimization patch for this once all the compilers are ready.

Thanks,
Phil
> 
> >
> >[1] https://godbolt.org/z/EUJnuG

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-09  9:27           ` Phil Yang (Arm Technology China)
@ 2019-07-09 11:14             ` Jerin Jacob Kollanukkaran
  0 siblings, 0 replies; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-09 11:14 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China),
	Pavan Nikhilesh Bhagavatula, Honnappa Nagarahalli, dev
  Cc: thomas, hemant.agrawal, Gavin Hu (Arm Technology China),
	nd, gage.eads, nd, nd

> -----Original Message-----
> From: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>
> Sent: Tuesday, July 9, 2019 2:58 PM
> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; hemant.agrawal@nxp.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>;
> gage.eads@intel.com; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> > -----Original Message-----
> > From: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>
> > Sent: Friday, July 5, 2019 12:37 PM
> > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> > jerinj@marvell.com; Phil Yang (Arm Technology China)
> > <Phil.Yang@arm.com>; dev@dpdk.org
> > Cc: thomas@monjalon.net; hemant.agrawal@nxp.com; Gavin Hu (Arm
> > Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>;
> > gage.eads@intel.com; nd <nd@arm.com>
> > Subject: RE: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic
> > compare exchange
> >
> >
> 
> <snip>
> 
> > >> > > +#ifdef __ARM_FEATURE_ATOMICS
> > >> > > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)
> > >> \
> > >> > > +static inline rte_int128_t                                                  \
> > >> > > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> > >> > > +		rte_int128_t updated)                                               \
> > >> > > +{                                                                           \
> > >> > > +	/* caspX instructions register pair must start from even-
> > >numbered
> > >> > > +	 * register at operand 1.
> > >> > > +	 * So, specify registers for local variables here.
> > >> > > +	 */                                                                     \
> > >> > > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];
> > \
> > >> >
> > >> > I understand CASP limitation on register has to be even and odd.
> > >> > Is there anyway to remove explicit x0 register allocation and
> > >> > choose compiler to decide the register. Some reason with
> > >> > optimize(03) gcc makes correctly but not clang.
> > >> >
> > >> > Hardcoding to specific register makes compiler to not optimize
> > >> > the stuff, especially if it is inline function.
> > >>
> > >> It look like the limitation fixed recently in gcc.
> > >> https://patches.linaro.org/patch/147991/
> > >>
> > >> Not sure about old gcc and clang. ARM compiler experts may know
> > >the exact
> > >> status
> > >>
> > >We could use syntax as follows, an example is in [1] static inline
> > >rte_int128_t __rte_casp(rte_int128_t *dst, rte_int128_t old,
> > >rte_int128_t updated, int mo) {
> > >		__asm__ volatile("caspl %0, %H0, %1, %H1, [%2]"
> > >				 : "+r" (old)
> > >				 : "r" (updated), "r" (dst)
> > >				 : "memory");
> > >	return old;
> > >}
> >
> > We have used this format for mempool/octeontx2 but clang wasn't too
> > happy.
> >
> > dpdk/drivers/mempool/octeontx2/otx2_mempool_ops.c:151:15: error:
> > value size does not match register size specified by the constraint
> > and modifier [-Werror,-Wasm-operand-widths]
> >                 [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
> >                             ^
> > dpdk/drivers/mempool/octeontx2/otx2_mempool_ops.c:82:9: note: use
> > constraint modifier "w"
> >                 "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
> >
> > Had to change it to hand coded asm
> >
> > http://patches.dpdk.org/patch/56110/
> 
> Hi Jerin,
> 
> The update from the compiler team is 'the LSE CASP fix has not been
> backported to older GCC branches'.
> So, currently, this seems the only approach works for all versions of GCC and
> Clang.
> I think we can add another optimization patch for this once all the compilers
> were ready.

We are on the same page.


> 
> Thanks,
> Phil
> >
> > >
> > >[1] https://godbolt.org/z/EUJnuG

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 2/3] test/atomic: add 128b compare and swap test
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-06-29  0:17     ` Eads, Gage
@ 2019-07-19  4:03     ` " Jerin Jacob Kollanukkaran
  1 sibling, 0 replies; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-19  4:03 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd, gage.eads

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Friday, June 28, 2019 1:42 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> gavin.hu@arm.com; nd@arm.com; gage.eads@intel.com
> Subject: [EXT] [PATCH v3 2/3] test/atomic: add 128b compare and swap test
> 
> Add 128b atomic compare and swap test for aarch64 and x86_64.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
>  app/test/test_atomic.c | 122
> 
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64) static
> +rte_int128_t count128;

Moving this down to [1] can remove one piece of #ifdef clutter.

> +
>  static int
>  test_atomic_usual(__attribute__((unused)) void *arg)  { @@ -216,6 +237,72
> @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
>  	return 0;
>  }
> 
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)

[1] See above,

With above change,

Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>




^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64
  2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-06-29  0:18     ` Eads, Gage
@ 2019-07-19  4:18     ` " Jerin Jacob Kollanukkaran
  2019-07-19  4:42       ` Eads, Gage
  1 sibling, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-19  4:18 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd, gage.eads

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Friday, June 28, 2019 1:42 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> gavin.hu@arm.com; nd@arm.com; gage.eads@intel.com
> Subject: [EXT] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64
> Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> 
>  Removed Items
>  -------------
> diff --git a/lib/librte_stack/rte_stack_lf_c11.h
> b/lib/librte_stack/rte_stack_lf_c11.h
> index 3d677ae..67c21fd 100644
> --- a/lib/librte_stack/rte_stack_lf_c11.h
> +++ b/lib/librte_stack/rte_stack_lf_c11.h
> @@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list
> *list,
>  			  struct rte_stack_lf_elem *last,
>  			  unsigned int num)
>  {
> -#ifndef RTE_ARCH_X86_64
> +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
>  	RTE_SET_USED(first);
>  	RTE_SET_USED(last);
>  	RTE_SET_USED(list);
> @@ -88,7 +88,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>  			 void **obj_table,
>  			 struct rte_stack_lf_elem **last)
>  {
> -#ifndef RTE_ARCH_X86_64
> +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
>  	RTE_SET_USED(obj_table);
>  	RTE_SET_USED(last);
>  	RTE_SET_USED(list);
> diff --git a/lib/librte_stack/rte_stack_lf_generic.h
> b/lib/librte_stack/rte_stack_lf_generic.h
> index 3182151..488fd9f 100644
> --- a/lib/librte_stack/rte_stack_lf_generic.h
> +++ b/lib/librte_stack/rte_stack_lf_generic.h
> @@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list
> *list,
>  			  struct rte_stack_lf_elem *last,
>  			  unsigned int num)
>  {
> -#ifndef RTE_ARCH_X86_64
> +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
>  	RTE_SET_USED(first);
>  	RTE_SET_USED(last);
>  	RTE_SET_USED(list);
> @@ -84,7 +84,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>  			 void **obj_table,
>  			 struct rte_stack_lf_elem **last)
>  {
> -#ifndef RTE_ARCH_X86_64
> +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
>  	RTE_SET_USED(obj_table);
>  	RTE_SET_USED(last);
>  	RTE_SET_USED(list);


Can we remove all of this #ifdef clutter by adding the following?

$ git diff
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0c2..46af08b83 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,7 +5,7 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_

-#ifdef RTE_USE_C11_MEM_MODEL
+#if defined (RTE_USE_C11_MEM_MODEL) && defined(RTE_ARCH_X86_64) && defined(RTE_ARCH_ARM64)
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"




^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-19  4:18     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
@ 2019-07-19  4:42       ` Eads, Gage
  2019-07-19  5:02         ` Jerin Jacob Kollanukkaran
  0 siblings, 1 reply; 91+ messages in thread
From: Eads, Gage @ 2019-07-19  4:42 UTC (permalink / raw)
  To: Jerin Jacob Kollanukkaran, Phil Yang, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd



> -----Original Message-----
> From: Jerin Jacob Kollanukkaran [mailto:jerinj@marvell.com]
> Sent: Thursday, July 18, 2019 11:18 PM
> To: Phil Yang <phil.yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com; Eads,
> Gage <gage.eads@intel.com>
> Subject: RE: [EXT] [PATCH v3 3/3] eal/stack: enable lock-free stack for
> aarch64
> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Friday, June 28, 2019 1:42 PM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; hemant.agrawal@nxp.com;
> > Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com;
> > gage.eads@intel.com
> > Subject: [EXT] [PATCH v3 3/3] eal/stack: enable lock-free stack for
> > aarch64 Enable both c11 atomic and non c11 atomic lock-free stack for
> aarch64.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> >
> >  Removed Items
> >  -------------
> > diff --git a/lib/librte_stack/rte_stack_lf_c11.h
> > b/lib/librte_stack/rte_stack_lf_c11.h
> > index 3d677ae..67c21fd 100644
> > --- a/lib/librte_stack/rte_stack_lf_c11.h
> > +++ b/lib/librte_stack/rte_stack_lf_c11.h
> > @@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list
> > *list,
> >  			  struct rte_stack_lf_elem *last,
> >  			  unsigned int num)
> >  {
> > -#ifndef RTE_ARCH_X86_64
> > +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
> >  	RTE_SET_USED(first);
> >  	RTE_SET_USED(last);
> >  	RTE_SET_USED(list);
> > @@ -88,7 +88,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list
> *list,
> >  			 void **obj_table,
> >  			 struct rte_stack_lf_elem **last)
> >  {
> > -#ifndef RTE_ARCH_X86_64
> > +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
> >  	RTE_SET_USED(obj_table);
> >  	RTE_SET_USED(last);
> >  	RTE_SET_USED(list);
> > diff --git a/lib/librte_stack/rte_stack_lf_generic.h
> > b/lib/librte_stack/rte_stack_lf_generic.h
> > index 3182151..488fd9f 100644
> > --- a/lib/librte_stack/rte_stack_lf_generic.h
> > +++ b/lib/librte_stack/rte_stack_lf_generic.h
> > @@ -36,7 +36,7 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list
> > *list,
> >  			  struct rte_stack_lf_elem *last,
> >  			  unsigned int num)
> >  {
> > -#ifndef RTE_ARCH_X86_64
> > +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
> >  	RTE_SET_USED(first);
> >  	RTE_SET_USED(last);
> >  	RTE_SET_USED(list);
> > @@ -84,7 +84,7 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list
> *list,
> >  			 void **obj_table,
> >  			 struct rte_stack_lf_elem **last)
> >  {
> > -#ifndef RTE_ARCH_X86_64
> > +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_ARM64)
> >  	RTE_SET_USED(obj_table);
> >  	RTE_SET_USED(last);
> >  	RTE_SET_USED(list);
> 
> 
> Can we remove this all #ifdef clutter by adding the following
> 
> $ git diff
> diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
> index f5581f0c2..46af08b83 100644
> --- a/lib/librte_stack/rte_stack_lf.h
> +++ b/lib/librte_stack/rte_stack_lf.h
> @@ -5,7 +5,7 @@
>  #ifndef _RTE_STACK_LF_H_
>  #define _RTE_STACK_LF_H_
> 
> -#ifdef RTE_USE_C11_MEM_MODEL
> +#if defined (RTE_USE_C11_MEM_MODEL) && defined(RTE_ARCH_X86_64)
> &&
> +defined(RTE_ARCH_ARM64)

I assume you meant (defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))?

>  #include "rte_stack_lf_c11.h"
>  #else
>  #include "rte_stack_lf_generic.h"
> 
> 

The ifdefs in those two headers prevent DPDK from trying to build rte_atomic128_cmp_exchange() on architectures that don't implement it. So the proposal wouldn't quite work, since rte_stack_lf_generic.h calls rte_atomic128_cmp_exchange().

Something like this could work:

#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
#include "rte_stack_lf_stubs.h"
#else
#ifdef RTE_USE_C11_MEM_MODEL
#include "rte_stack_lf_c11.h"
#else
#include "rte_stack_lf_generic.h"
#endif
#endif

Where rte_stack_lf_stubs.h is a new header containing stub implementations of __rte_stack_lf_count, __rte_stack_lf_push_elems, and __rte_stack_lf_pop_elems. It still has some ifdef clutter, but less overall.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-19  4:42       ` Eads, Gage
@ 2019-07-19  5:02         ` Jerin Jacob Kollanukkaran
  2019-07-19  5:15           ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-19  5:02 UTC (permalink / raw)
  To: Eads, Gage, Phil Yang, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> > Can we remove this all #ifdef clutter by adding the following
> >
> > $ git diff
> > diff --git a/lib/librte_stack/rte_stack_lf.h
> > b/lib/librte_stack/rte_stack_lf.h index f5581f0c2..46af08b83 100644
> > --- a/lib/librte_stack/rte_stack_lf.h
> > +++ b/lib/librte_stack/rte_stack_lf.h
> > @@ -5,7 +5,7 @@
> >  #ifndef _RTE_STACK_LF_H_
> >  #define _RTE_STACK_LF_H_
> >
> > -#ifdef RTE_USE_C11_MEM_MODEL
> > +#if defined (RTE_USE_C11_MEM_MODEL) &&
> defined(RTE_ARCH_X86_64)
> > &&
> > +defined(RTE_ARCH_ARM64)
> 
> I assume you meant (defined(RTE_ARCH_X86_64) ||
> defined(RTE_ARCH_ARM64))?

Yup.

> 
> >  #include "rte_stack_lf_c11.h"
> >  #else
> >  #include "rte_stack_lf_generic.h"
> >
> >
> 
> The ifdefs in those two headers prevent DPDK from trying to build
> rte_atomic128_cmp_exchange() on architectures that don't implement it. So
> the proposal wouldn't quite work, since rte_stack_lf_generic.h calls
> rte_atomic128_cmp_exchange().
> 
> Something like this could work:
> 
> #if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)) #include
> rte_stack_lf_stubs.h #else #ifdef RTE_USE_C11_MEM_MODEL #include
> "rte_stack_lf_c11.h"
> #else
> #include "rte_stack_lf_generic.h"
> #endif
> #endif
> 
> Where rte_stack_lf_stubs.h is a new header containing stub
> implementations of __rte_stack_lf_count, __rte_stack_lf_push_elems, and
> __rte_stack_lf_pop_elems. It still has some ifdef clutter, but less overall.

Agree. I prefer to take this route to reduce the ifdef clutter across generic and c11 files.


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-19  5:02         ` Jerin Jacob Kollanukkaran
@ 2019-07-19  5:15           ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-19  5:15 UTC (permalink / raw)
  To: jerinj, Eads, Gage, dev
  Cc: thomas, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Friday, July 19, 2019 1:03 PM
> To: Eads, Gage <gage.eads@intel.com>; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; hemant.agrawal@nxp.com; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology
> China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v3 3/3] eal/stack: enable lock-free stack for
> aarch64
> 
> > > Can we remove this all #ifdef clutter by adding the following
> > >
> > > $ git diff
> > > diff --git a/lib/librte_stack/rte_stack_lf.h
> > > b/lib/librte_stack/rte_stack_lf.h index f5581f0c2..46af08b83 100644
> > > --- a/lib/librte_stack/rte_stack_lf.h
> > > +++ b/lib/librte_stack/rte_stack_lf.h
> > > @@ -5,7 +5,7 @@
> > >  #ifndef _RTE_STACK_LF_H_
> > >  #define _RTE_STACK_LF_H_
> > >
> > > -#ifdef RTE_USE_C11_MEM_MODEL
> > > +#if defined (RTE_USE_C11_MEM_MODEL) &&
> > defined(RTE_ARCH_X86_64)
> > > &&
> > > +defined(RTE_ARCH_ARM64)
> >
> > I assume you meant (defined(RTE_ARCH_X86_64) ||
> > defined(RTE_ARCH_ARM64))?
> 
> Yup.
> 
> >
> > >  #include "rte_stack_lf_c11.h"
> > >  #else
> > >  #include "rte_stack_lf_generic.h"
> > >
> > >
> >
> > The ifdefs in those two headers prevent DPDK from trying to build
> > rte_atomic128_cmp_exchange() on architectures that don't implement it.
> So
> > the proposal wouldn't quite work, since rte_stack_lf_generic.h calls
> > rte_atomic128_cmp_exchange().
> >
> > Something like this could work:
> >
> > #if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)) #include
> > rte_stack_lf_stubs.h #else #ifdef RTE_USE_C11_MEM_MODEL #include
> > "rte_stack_lf_c11.h"
> > #else
> > #include "rte_stack_lf_generic.h"
> > #endif
> > #endif
> >
> > Where rte_stack_lf_stubs.h is a new header containing stub
> > implementations of __rte_stack_lf_count, __rte_stack_lf_push_elems,
> and
> > __rte_stack_lf_pop_elems. It still has some ifdef clutter, but less overall.
> 
> Agree. I prefer to take this route to reduce the ifdef clutter across generic
> and c11 files.

Got it. I will update it in the next version.

Thanks,
Phil Yang

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-28  8:11 ` [dpdk-dev] [PATCH v3 " Phil Yang
                     ` (2 preceding siblings ...)
  2019-07-03 12:25   ` [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
@ 2019-07-19  6:24   ` Jerin Jacob Kollanukkaran
  2019-07-19 11:01     ` Phil Yang (Arm Technology China)
  3 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-19  6:24 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd, gage.eads

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Friday, June 28, 2019 1:42 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> gavin.hu@arm.com; nd@arm.com; gage.eads@intel.com
> Subject: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> External Email
> 
> ----------------------------------------------------------------------
> Add 128-bit atomic compare exchange on aarch64.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
> +#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> +__ATOMIC_RELEASE) #define RTE_HAS_RLS(mo) ((mo) ==
> __ATOMIC_RELEASE || \
> +			 (mo) == __ATOMIC_ACQ_REL || \
> +			 (mo) == __ATOMIC_SEQ_CST)
> +
> +#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
> +		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED) #define
> RTE_MO_STORE(mo)
> +(RTE_HAS_RLS((mo)) \
> +		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
> +

Names that start with RTE_ are public symbols. If these are generic enough,
move them to the common layer so that every architecture can use them.
Otherwise, make them internal.



> +#ifdef __ARM_FEATURE_ATOMICS

This define was added in gcc 9.1, and I believe clang does not support it yet,
so with older gcc and with clang it will be undefined.
I think that, with meson + native build, we can detect the presence of
ATOMIC support by running a.out. I am not sure about the make and cross build cases.
I don't want to block this feature because of this. IMO, we can add this code
with the existing __ARM_FEATURE_ATOMICS scheme and later find a method
to enhance it. But please check how to fix it.

> +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> +static inline rte_int128_t                                                  \
> +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> +		rte_int128_t updated)                                               \
> +{                                                                           \
> +	/* caspX instructions register pair must start from even-numbered
> +	 * register at operand 1.
> +	 * So, specify registers for local variables here.
> +	 */                                                                     \
> +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \

Since the x0 register is used directly in the code, and
cas_op_name() and rte_atomic128_cmp_exchange() are inline functions,
depending on the parent function's load, we may corrupt the x0 register, i.e.
break the arm64 ABI. I am not sure whether a clobber list will help here.
Making it no_inline will help, but I am not sure about the performance impact.
Maybe you can check with the compiler team.

We burned our hands with this scheme, see
5b40ec6b966260e0ff66a8a2c689664f75d6a0e6 ("mempool/octeontx2: fix possible arm64 ABI break")

Probably we can choose a scheme for rc2 and adjust it as and when we have complete clarity.

> +	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];                \
> +	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];            \
> +	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];            \
> +	asm volatile(                                                           \
> +			op_string " %[old0], %[old1], %[upd0], %[upd1],
> [%[dst]]"       \
> +			: [old0] "+r" (x0),                                             \
> +			  [old1] "+r" (x1)                                              \
> +			: [upd0] "r" (x2),                                              \
> +			  [upd1] "r" (x3),                                              \
> +			  [dst] "r" (dst)                                               \
> +			: "memory");                                                    \

Shouldn't we add x0, x1, x2, x3 to the clobber list?


>  static inline int __rte_experimental
>  rte_atomic128_cmp_exchange(rte_int128_t *dst,
>  			   rte_int128_t *exp,
> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> index 9958543..2355e50 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -1081,6 +1081,20 @@ static inline void
> rte_atomic64_clear(rte_atomic64_t *v)
> 
>  /*------------------------ 128 bit atomic operations -------------------------*/
> 
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)

There is nothing specific to x86 and arm64 here. Can we remove this #ifdef?

> +/**
> + * 128-bit integer structure.
> + */
> +RTE_STD_C11
> +typedef struct {
> +	RTE_STD_C11
> +	union {
> +		uint64_t val[2];
> +		__extension__ __int128 int128;
> +	};
> +} __rte_aligned(16) rte_int128_t;
> +#endif
> +
>  #ifdef __DOXYGEN__

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-19  6:24   ` Jerin Jacob Kollanukkaran
@ 2019-07-19 11:01     ` Phil Yang (Arm Technology China)
  2019-07-19 12:35       ` Jerin Jacob Kollanukkaran
  0 siblings, 1 reply; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-19 11:01 UTC (permalink / raw)
  To: jerinj, dev
  Cc: thomas, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, gage.eads, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Friday, July 19, 2019 2:25 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; hemant.agrawal@nxp.com; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology
> China) <Gavin.Hu@arm.com>; nd <nd@arm.com>; gage.eads@intel.com
> Subject: RE: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Friday, June 28, 2019 1:42 PM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>;
> > hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> > gavin.hu@arm.com; nd@arm.com; gage.eads@intel.com
> > Subject: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> > exchange
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > Add 128-bit atomic compare exchange on aarch64.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > ---
> > +#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> > +__ATOMIC_RELEASE) #define RTE_HAS_RLS(mo) ((mo) ==
> > __ATOMIC_RELEASE || \
> > +			 (mo) == __ATOMIC_ACQ_REL || \
> > +			 (mo) == __ATOMIC_SEQ_CST)
> > +
> > +#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
> > +		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED) #define
> > RTE_MO_STORE(mo)
> > +(RTE_HAS_RLS((mo)) \
> > +		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
> > +
> 
> The one starts with RTE_ are public symbols, If it is generic enough,
> Move to common layer so that every architecturse can use.
> If you think, otherwise make it internal

Let's keep it internal. I will remove the 'RTE_' tag. 

> 
> 
> 
> > +#ifdef __ARM_FEATURE_ATOMICS
> 
> This define is added in gcc 9.1 and I believe for clang it is not supported yet.
> So old gcc and clang this will be undefined.
> I think, With meson + native build, we  can find the presence of
> ATOMIC support by running a.out. Not sure about make and cross build case.
> I don't want block this feature because of this, IMO, We can add this code
> with  existing __ARM_FEATURE_ATOMICS scheme and later find a method
> to enhance it. But please check how to fix it.

OK.

> 
> > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> > +static inline rte_int128_t                                                  \
> > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> > +		rte_int128_t updated)                                               \
> > +{                                                                           \
> > +	/* caspX instructions register pair must start from even-numbered
> > +	 * register at operand 1.
> > +	 * So, specify registers for local variables here.
> > +	 */                                                                     \
> > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
> 
> Since direct x0 register used in the code and
> cas_op_name() and rte_atomic128_cmp_exchange() is inline function,
> Based on parent function load, we may corrupt x0 register aka

x0/x1 and x2/x3 are used a lot and often contain live values.
Maybe changing them to some relatively less frequently used registers like x14/x15 and x16/x17 might help in this case?
According to the PCS (Procedure Call Standard), x14-x17 are also temporary registers.

> Break arm64 ABI. Not sure clobber list will help here or not?

In my understanding, for a register variable, if the specified register contains a live value, the compiler will move the live value into a free register.
Since x0~x3 are present in the input/output operands, and x0/x1's value needs to be restored to the variable 'old' as a return value,
I didn't add them to the clobber list.

> Making it as no_inline will help but not sure about the performance impact.
> May be you can check with compiler team.
> 
> We burned our hands with this scheme, see
> 5b40ec6b966260e0ff66a8a2c689664f75d6a0e6 ("mempool/octeontx2: fix
> possible arm64 ABI break")
> 
> Probably we can choose a scheme for rc2 and adjust as when we have
> complete clarity.
> 
> > +	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];                \
> > +	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];            \
> > +	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];            \
> > +	asm volatile(                                                           \
> > +			op_string " %[old0], %[old1], %[upd0], %[upd1],
> > [%[dst]]"       \
> > +			: [old0] "+r" (x0),                                             \
> > +			  [old1] "+r" (x1)                                              \
> > +			: [upd0] "r" (x2),                                              \
> > +			  [upd1] "r" (x3),                                              \
> > +			  [dst] "r" (dst)                                               \
> > +			: "memory");                                                    \
> 
> Should n't we add x0,x1, x2, x3 in clobber list?

Same as above.

> 
> 
> >  static inline int __rte_experimental
> >  rte_atomic128_cmp_exchange(rte_int128_t *dst,
> >  			   rte_int128_t *exp,
> > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > index 9958543..2355e50 100644
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > @@ -1081,6 +1081,20 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t *v)
> >
> >  /*------------------------ 128 bit atomic operations -------------------------*/
> >
> > +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> 
> There is nothing specific to x86 and arm64 here, Can we remove this #ifdef ?

Without this constraint, it will break 32-bit x86 builds.
http://mails.dpdk.org/archives/test-report/2019-June/086586.html 

> 
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +RTE_STD_C11
> > +typedef struct {
> > +	RTE_STD_C11
> > +	union {
> > +		uint64_t val[2];
> > +		__extension__ __int128 int128;
> > +	};
> > +} __rte_aligned(16) rte_int128_t;
> > +#endif
> > +
> >  #ifdef __DOXYGEN__

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-19 11:01     ` Phil Yang (Arm Technology China)
@ 2019-07-19 12:35       ` Jerin Jacob Kollanukkaran
  2019-07-19 13:56         ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-19 12:35 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China), dev
  Cc: thomas, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, gage.eads, nd

> > > +#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> > > +__ATOMIC_RELEASE) #define RTE_HAS_RLS(mo) ((mo) ==
> > > __ATOMIC_RELEASE || \
> > > +			 (mo) == __ATOMIC_ACQ_REL || \
> > > +			 (mo) == __ATOMIC_SEQ_CST)
> > > +
> > > +#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
> > > +		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED) #define
> > > RTE_MO_STORE(mo)
> > > +(RTE_HAS_RLS((mo)) \
> > > +		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
> > > +
> >
> > The one starts with RTE_ are public symbols, If it is generic enough,
> > Move to common layer so that every architecturse can use.
> > If you think, otherwise make it internal
> 
> Let's keep it internal. I will remove the 'RTE_' tag.

Probably change to __HAS_ACQ to avoid collision(just in case)

> >
> >
> >
> > > +#ifdef __ARM_FEATURE_ATOMICS
> >
> > This define is added in gcc 9.1 and I believe for clang it is not supported yet.
> > So old gcc and clang this will be undefined.
> > I think, With meson + native build, we  can find the presence of
> > ATOMIC support by running a.out. Not sure about make and cross build case.
> > I don't want block this feature because of this, IMO, We can add this
> > code with  existing __ARM_FEATURE_ATOMICS scheme and later find a
> > method to enhance it. But please check how to fix it.
> 
> OK.

After thinking about this a bit, I think that, in order to support old gcc (< 9.1) and clang,
we can introduce a config option that is disabled by default and enabled only
in specific configs (where we know LSE is supported) and in the meson config.

i.e
#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)


> 
> >
> > > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> > > +static inline rte_int128_t                                                  \
> > > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> > > +		rte_int128_t updated)                                               \
> > > +{                                                                           \
> > > +	/* caspX instructions register pair must start from even-numbered
> > > +	 * register at operand 1.
> > > +	 * So, specify registers for local variables here.
> > > +	 */                                                                     \
> > > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
> >
> > Since direct x0 register used in the code and
> > cas_op_name() and rte_atomic128_cmp_exchange() is inline function,
> > Based on parent function load, we may corrupt x0 register aka
> 
> Since x0/x1 and x2/x3 are used a lot and often contain live values.
> Maybe to change them to some relatively less frequently used registers like
> x14/x15 and x16/x17 might help for this case?
> According to the PCS (Procedure Call Standard), x14-x17 are also temporary
> registers.

x14-x17 are temporary registers, but since
cas_op_name() and rte_atomic128_cmp_exchange() are inline functions,
their values _may_ still be corrupted, depending on the parent function's register usage.


> 
> > Break arm64 ABI. Not sure clobber list will help here or not?
> 
> In my understanding, for the register variable, if it contains a live value in the
> specified register, the compiler will move the live value into a free register.
> Since x0~x3 are present in the input/output operands and x0/x1's value needs to
> be restored to the variable 'old' as a return value.
> So I didn't add them into the clobber list.

OK

> 
> > Making it as no_inline will help but not sure about the performance impact.
> > May be you can check with compiler team.
> >
> > We burned our hands with this scheme, see
> > 5b40ec6b966260e0ff66a8a2c689664f75d6a0e6 ("mempool/octeontx2: fix
> > possible arm64 ABI break")
> >
> > Probably we can choose a scheme for rc2 and adjust as when we have
> > complete clarity.
> >
> > >
> > > +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> >
> > There is nothing specific to x86 and arm64 here, Can we remove this #ifdef ?
> 
> Without this constraint, it will break 32-bit x86 builds.
> http://mails.dpdk.org/archives/test-report/2019-June/086586.html

OK. #ifdef RTE_ARCH_64 would help then.

> 
> >
> > > +/**
> > > + * 128-bit integer structure.
> > > + */
> > > +RTE_STD_C11
> > > +typedef struct {
> > > +	RTE_STD_C11
> > > +	union {
> > > +		uint64_t val[2];
> > > +		__extension__ __int128 int128;

Instead of guarding this complete structure with RTE_ARCH_64,
how about guarding only the __int128 member:
#ifdef RTE_ARCH_64
__extension__ __int128 int128;
#endif
so that rte_int128_t will be available for 32-bit as well.


> > > +	};
> > > +} __rte_aligned(16) rte_int128_t;
> > > +#endif
> > > +
> > >  #ifdef __DOXYGEN__

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-19 12:35       ` Jerin Jacob Kollanukkaran
@ 2019-07-19 13:56         ` Phil Yang (Arm Technology China)
  2019-07-19 14:50           ` Eads, Gage
  0 siblings, 1 reply; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-19 13:56 UTC (permalink / raw)
  To: jerinj, gage.eads, dev
  Cc: thomas, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Friday, July 19, 2019 8:35 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; hemant.agrawal@nxp.com; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology
> China) <Gavin.Hu@arm.com>; nd <nd@arm.com>; gage.eads@intel.com; nd
> <nd@arm.com>
> Subject: RE: [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> > > > +#define RTE_HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> > > > +__ATOMIC_RELEASE) #define RTE_HAS_RLS(mo) ((mo) ==
> > > > __ATOMIC_RELEASE || \
> > > > +			 (mo) == __ATOMIC_ACQ_REL || \
> > > > +			 (mo) == __ATOMIC_SEQ_CST)
> > > > +
> > > > +#define RTE_MO_LOAD(mo)  (RTE_HAS_ACQ((mo)) \
> > > > +		? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED) #define
> > > > RTE_MO_STORE(mo)
> > > > +(RTE_HAS_RLS((mo)) \
> > > > +		? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
> > > > +
> > >
> > > The one starts with RTE_ are public symbols, If it is generic enough,
> > > Move to common layer so that every architecturse can use.
> > > If you think, otherwise make it internal
> >
> > Let's keep it internal. I will remove the 'RTE_' tag.
> 
> Probably change to __HAS_ACQ to avoid collision(just in case)

OK.

> 
> > >
> > >
> > >
> > > > +#ifdef __ARM_FEATURE_ATOMICS
> > >
> > > This define is added in gcc 9.1 and I believe for clang it is not supported
> yet.
> > > So old gcc and clang this will be undefined.
> > > I think, With meson + native build, we  can find the presence of
> > > ATOMIC support by running a.out. Not sure about make and cross build
> case.
> > > I don't want block this feature because of this, IMO, We can add this
> > > code with  existing __ARM_FEATURE_ATOMICS scheme and later find a
> > > method to enhance it. But please check how to fix it.
> >
> > OK.
> 
> After thinking on this a bit, I think,  in order to support old gcc(< gcc 9.1) and
> clang,
> We can introduce a config option, where, by default it is disabled and enable
> In specific config(where we know, lse is supported) and meson config.
> 
> i.e
> #if defined(__ARM_FEATURE_ATOMICS) ||
> defined(RTE_ARM_FEATURE_ATOMICS)

Cool

> 
> 
> >
> > >
> > > > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)
> \
> > > > +static inline rte_int128_t                                                  \
> > > > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> > > > +		rte_int128_t updated)                                               \
> > > > +{                                                                           \
> > > > +	/* caspX instructions register pair must start from even-numbered
> > > > +	 * register at operand 1.
> > > > +	 * So, specify registers for local variables here.
> > > > +	 */                                                                     \
> > > > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
> > >
> > > Since direct x0 register used in the code and
> > > cas_op_name() and rte_atomic128_cmp_exchange() is inline function,
> > > Based on parent function load, we may corrupt x0 register aka
> >
> > Since x0/x1 and x2/x3 are used a lot and often contain live values.
> > Maybe to change them to some relatively less frequently used registers
> like
> > x14/x15 and x16/x17 might help for this case?
> > According to the PCS (Procedure Call Standard), x14-x17 are also temporary
> > registers.
> 
> X14-x17 are temporary registers but since
> cas_op_name() and rte_atomic128_cmp_exchange() are inline functions,
> Based on the parent function register usage, it _may_ corrupt.

Just checked how Linux Kernel does similar things:
https://github.com/torvalds/linux/blob/master/arch/arm64/include/asm/atomic_lse.h#L19 

Same methods.

I will finish the benchmarking for the no_inline approach. If there is no significant performance loss, I think we can make it no_inline.

> 
> 
> >
> > > Break arm64 ABI. Not sure clobber list will help here or not?
> >
> > In my understanding, for the register variable, if it contains a live value in
> the
> > specified register, the compiler will move the live value into a free register.
> > Since x0~x3 are present in the input/output operands and x0/x1's value
> needs to
> > be restored to the variable 'old' as a return value.
> > So I didn't add them into the clobber list.
> 
> OK
> 
> >
> > > Making it as no_inline will help but not sure about the performance
> impact.
> > > May be you can check with compiler team.
> > >
> > > We burned our hands with this scheme, see
> > > 5b40ec6b966260e0ff66a8a2c689664f75d6a0e6 ("mempool/octeontx2: fix
> > > possible arm64 ABI break")
> > >
> > > Probably we can choose a scheme for rc2 and adjust as when we have
> > > complete clarity.
> > >
> > > >
> > > > +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> > >
> > > There is nothing specific to x86 and arm64 here, Can we remove this
> #ifdef ?
> >
> > Without this constraint, it will break 32-bit x86 builds.
> > http://mails.dpdk.org/archives/test-report/2019-June/086586.html
> 
> OK . #ifdef RTE_ARCH_64 would help then.

OK.

> 
> >
> > >
> > > > +/**
> > > > + * 128-bit integer structure.
> > > > + */
> > > > +RTE_STD_C11
> > > > +typedef struct {
> > > > +	RTE_STD_C11
> > > > +	union {
> > > > +		uint64_t val[2];
> > > > +		__extension__ __int128 int128;
> 
> Instead of guarding  RTE_ARCH_64 on this complete structure,
> How about it only under
> #ifdef RTE_ARCH_64
> __extension__ __int128 int128;
> #endif
> So that it rte_int128_t will be available for 32bit as well.

Agreed, it should work. But I am not sure.

Hi Gage,

What do you think about this?

> 
> 
> > > > +	};
> > > > +} __rte_aligned(16) rte_int128_t;
> > > > +#endif
> > > > +
> > > >  #ifdef __DOXYGEN__

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-19 13:56         ` Phil Yang (Arm Technology China)
@ 2019-07-19 14:50           ` Eads, Gage
  0 siblings, 0 replies; 91+ messages in thread
From: Eads, Gage @ 2019-07-19 14:50 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China), jerinj, dev
  Cc: thomas, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> > > > > +/**
> > > > > + * 128-bit integer structure.
> > > > > + */
> > > > > +RTE_STD_C11
> > > > > +typedef struct {
> > > > > +	RTE_STD_C11
> > > > > +	union {
> > > > > +		uint64_t val[2];
> > > > > +		__extension__ __int128 int128;
> >
> > Instead of guarding  RTE_ARCH_64 on this complete structure, How about
> > it only under #ifdef RTE_ARCH_64 __extension__ __int128 int128; #endif
> > So that it rte_int128_t will be available for 32bit as well.
> 
> Agree, it should be work. But I am not sure.
> 
> Hi Gage,
> 
> How do you think about this?
> 

I don't see any harm in that.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
                   ` (3 preceding siblings ...)
  2019-06-28  8:11 ` [dpdk-dev] [PATCH v3 " Phil Yang
@ 2019-07-22  8:44 ` " Phil Yang
  2019-07-22  8:44   ` [dpdk-dev] [PATCH v4 2/3] test/atomic: add 128b compare and swap test Phil Yang
                     ` (2 more replies)
  2019-07-22 13:06 ` [dpdk-dev] [PATCH v5 " Phil Yang
                   ` (2 subsequent siblings)
  7 siblings, 3 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-22  8:44 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Add 128-bit atomic compare exchange on aarch64.

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

---
v4:
1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions. (Jerin Jacob)
2. Fix possible arm64 ABI break by making cas_op_name noinline. (Jerin Jacob)
3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage Eads/Jerin Jacob)

v3:
1. Avoid code duplication with a macro. (Jerin Jacob)
2. Treat an invalid memory order as the strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix 32-bit x86 builds issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

v2:
Initial version.

 config/arm/meson.build                             |   1 +
 config/common_base                                 |   5 +
 config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
 .../common/include/arch/arm/rte_atomic_64.h        | 162 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
 6 files changed, 185 insertions(+), 13 deletions(-)

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..a88f21e 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -68,6 +68,7 @@ flags_thunderx_extra = [
 	['RTE_USE_C11_MEM_MODEL', false]]
 flags_thunderx2_extra = [
 	['RTE_MACHINE', '"thunderx2"'],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_CACHE_LINE_SIZE', 64],
 	['RTE_MAX_NUMA_NODES', 2],
 	['RTE_MAX_LCORE', 256],
diff --git a/config/common_base b/config/common_base
index 8ef75c2..8862495 100644
--- a/config/common_base
+++ b/config/common_base
@@ -1067,3 +1067,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y
 # Compile the eventdev application
 #
 CONFIG_RTE_APP_EVENTDEV=y
+
+#
+# Compile ARM LSE ATOMIC instructions statically
+#
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64b..17b6dec 100644
--- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
+++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
@@ -6,6 +6,7 @@
 
 CONFIG_RTE_MACHINE="thunderx2"
 
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..88b7ff4 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,164 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+					  (mo) == __ATOMIC_SEQ_CST)
+
+#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                               \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                     \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];                \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];            \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];            \
+	asm volatile(                                                           \
+			op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"       \
+			: [old0] "+r" (x0),                                             \
+			  [old1] "+r" (x1)                                              \
+			: [upd0] "r" (x2),                                              \
+			  [upd1] "r" (x3),                                              \
+			  [dst] "r" (dst)                                               \
+			: "memory");                                                    \
+	old.val[0] = x0;                                                        \
+	old.val[1] = x1;                                                        \
+	return old;                                                             \
+}
+
+__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
+__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
+__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
+__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
+#else
+#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+ldx_op_name(const rte_int128_t *src)                                        \
+{                                                                           \
+	rte_int128_t ret;                                                       \
+	asm volatile(                                                           \
+			op_string " %0, %1, %2"                                         \
+			: "=&r" (ret.val[0]),                                           \
+			  "=&r" (ret.val[1])                                            \
+			: "Q" (src->val[0])                                             \
+			: "memory");                                                    \
+	return ret;                                                             \
+}
+
+__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
+__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
+
+#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
+static inline uint32_t                                                      \
+stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
+{                                                                           \
+	uint32_t ret;                                                           \
+	asm volatile(                                                           \
+			op_string " %w0, %1, %2, %3"                                    \
+			: "=&r" (ret)                                                   \
+			: "r" (src.val[0]),                                             \
+			  "r" (src.val[1]),                                             \
+			  "Q" (dst->val[0])                                             \
+			: "memory");                                                    \
+	/* Return 0 on success, 1 on failure */                                 \
+	return ret;                                                             \
+}
+
+__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
+__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+#ifdef __ARM_FEATURE_ATOMICS
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	if (success == __ATOMIC_RELAXED)
+		old = __rte_cas_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __rte_cas_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __rte_cas_release(dst, expected, desired);
+	else
+		old = __rte_cas_acq_rel(dst, expected, desired);
+#else
+	int ldx_mo = __MO_LOAD(success);
+	int stx_mo = __MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* ldx128 can not guarantee atomic,
+	 * Must write back src or old to verify atomicity of ldx128;
+	 */
+	do {
+		if (ldx_mo == __ATOMIC_RELAXED)
+			old = __rte_ldx_relaxed(dst);
+		else
+			old = __rte_ldx_acquire(dst);
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, desired);
+			else
+				ret = __rte_stx_release(dst, desired);
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain the
+			 * atomically read value of dst. This means, 'old' needs
+			 * to be stored back to ensure it was read atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, old);
+			else
+				ret = __rte_stx_release(dst, old);
+		}
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally updating expected removes
+	 * an 'if' statement.
+	 * expected should already be in register if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index e087c6c..1217129 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -212,18 +212,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dc..e6ab15a 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+#ifdef RTE_ARCH_64
+		__extension__ __int128 int128;
+#endif
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v4 2/3] test/atomic: add 128b compare and swap test
  2019-07-22  8:44 ` [dpdk-dev] [PATCH v4 " Phil Yang
@ 2019-07-22  8:44   ` Phil Yang
  2019-07-22  8:44   ` [dpdk-dev] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-07-22 10:20   ` [dpdk-dev] [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
  2 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-22  8:44 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Gage Eads <gage.eads@intel.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>

---
 app/test/test_atomic.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..ff6ff88 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores are waiting a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -216,6 +233,74 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats 128 bits compare and swap operations N rounds. In each
+ * iteration it runs compare and swap operation with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +425,37 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Once each 128-bit atomic
+	 * compare and swap operation is successful, it updates the global
+	 * 128-bit counter by 2 for the first 64-bit and 1 for the second
+	 * 64-bit. Each slave core iterates this test N times.
+	 * At the end of test, verify whether the first 64-bits of the 128-bit
+	 * counter and the second 64-bits differ by the total iterations. If
+	 * they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL, SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22  8:44 ` [dpdk-dev] [PATCH v4 " Phil Yang
  2019-07-22  8:44   ` [dpdk-dev] [PATCH v4 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-07-22  8:44   ` Phil Yang
  2019-07-22 10:22     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
  2019-07-22 10:20   ` [dpdk-dev] [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
  2 siblings, 1 reply; 91+ messages in thread
From: Phil Yang @ 2019-07-22  8:44 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Suggested-by: Gage Eads <gage.eads@intel.com>
Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

---
 doc/guides/prog_guide/env_abstraction_layer.rst |  4 +-
 doc/guides/rel_notes/release_19_08.rst          |  3 ++
 lib/librte_stack/rte_stack_lf.h                 |  4 ++
 lib/librte_stack/rte_stack_lf_c11.h             | 16 -------
 lib/librte_stack/rte_stack_lf_generic.h         | 16 -------
 lib/librte_stack/rte_stack_lf_stubs.h           | 59 +++++++++++++++++++++++++
 6 files changed, 68 insertions(+), 34 deletions(-)
 create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index f15bcd9..d569f95 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -592,8 +592,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 0a3f840..25d45c1 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -212,6 +212,9 @@ New Features
 
   Added multiple cores feature to compression perf tool application.
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0..e67630c 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,11 +5,15 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_
 
+#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
+#include "rte_stack_lf_stubs.h"
+#else
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"
 #endif
+#endif
 
 /**
  * @internal Push several objects on the lock-free stack (MT-safe).
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..999359f 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	 * to the LIFO len update.
 	 */
 	__atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	uint64_t len;
 	int success;
@@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_C11_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..3abbb53 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	rte_atomic64_add((rte_atomic64_t *)&list->len, num);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_GENERIC_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
new file mode 100644
index 0000000..d924bc6
--- /dev/null
+++ b/lib/librte_stack/rte_stack_lf_stubs.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_STACK_LF_STUBS_H_
+#define _RTE_STACK_LF_STUBS_H_
+
+#include <rte_common.h>
+#include <rte_atomic.h>
+
+static __rte_always_inline unsigned int
+__rte_stack_lf_count(struct rte_stack *s)
+{
+	/* stack_lf_push() and stack_lf_pop() do not update the list's contents
+	 * and stack_lf->len atomically, which can cause the list to appear
+	 * shorter than it actually is if this function is called while other
+	 * threads are modifying the list.
+	 *
+	 * However, given the inherently approximate nature of the get_count
+	 * callback -- even if the list and its size were updated atomically,
+	 * the size could change between when get_count executes and when the
+	 * value is returned to the caller -- this is acceptable.
+	 *
+	 * The stack_lf->len updates are placed such that the list may appear to
+	 * have fewer elements than it does, but will never appear to have more
+	 * elements. If the mempool is near-empty to the point that this is a
+	 * concern, the user should consider increasing the mempool size.
+	 */
+	return (unsigned int)rte_atomic64_read((rte_atomic64_t *)
+			&s->stack_lf.used.len);
+}
+
+static __rte_always_inline void
+__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
+			  struct rte_stack_lf_elem *first,
+			  struct rte_stack_lf_elem *last,
+			  unsigned int num)
+{
+	RTE_SET_USED(first);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+}
+
+static __rte_always_inline struct rte_stack_lf_elem *
+__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
+			 unsigned int num,
+			 void **obj_table,
+			 struct rte_stack_lf_elem **last)
+{
+	RTE_SET_USED(obj_table);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+
+	return NULL;
+}
+
+#endif /* _RTE_STACK_LF_STUBS_H_ */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-22  8:44 ` [dpdk-dev] [PATCH v4 " Phil Yang
  2019-07-22  8:44   ` [dpdk-dev] [PATCH v4 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-07-22  8:44   ` [dpdk-dev] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-07-22 10:20   ` Jerin Jacob Kollanukkaran
  2019-07-22 11:50     ` Phil Yang (Arm Technology China)
  2 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-22 10:20 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Monday, July 22, 2019 2:14 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> gage.eads@intel.com; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> Subject: [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare exchange
> Add 128-bit atomic compare exchange on aarch64.
> 
> Suggested-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> +
> +#
> +# Compile ARM LSE ATOMIC instructions statically #

There is NO value for the keyword "statically" here. Right?

> +CONFIG_RTE_ARM_FEATURE_ATOMICS=n


> diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> index cc5c64b..17b6dec 100644
> --- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> +++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> @@ -6,6 +6,7 @@
> 
>  CONFIG_RTE_MACHINE="thunderx2"
> 
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=y


Add for octeontx2 as well.

>  CONFIG_RTE_CACHE_LINE_SIZE=64
>  CONFIG_RTE_MAX_NUMA_NODES=2
>  CONFIG_RTE_MAX_LCORE=256


> +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> +				rte_int128_t *exp,
> +				const rte_int128_t *src,
> +				unsigned int weak,
> +				int success,
> +				int failure)
> +{
> +	/* Always do strong CAS */
> +	RTE_SET_USED(weak);
> +	/* Ignore memory ordering for failure, memory order for
> +	 * success must be stronger or equal
> +	 */
> +	RTE_SET_USED(failure);
> +	/* Find invalid memory order */
> +	RTE_ASSERT(success == __ATOMIC_RELAXED
> +			|| success == __ATOMIC_ACQUIRE
> +			|| success == __ATOMIC_RELEASE
> +			|| success == __ATOMIC_ACQ_REL
> +			|| success == __ATOMIC_SEQ_CST);
> +
> +#ifdef __ARM_FEATURE_ATOMICS

Shouldn't it be #if defined(__ARM_FEATURE_ATOMICS) ||  defined(RTE_ARM_FEATURE_ATOMICS) ?




^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22  8:44   ` [dpdk-dev] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-07-22 10:22     ` " Jerin Jacob Kollanukkaran
  2019-07-22 11:51       ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-22 10:22 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Monday, July 22, 2019 2:14 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> gage.eads@intel.com; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> Subject: [EXT] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64
> 
> Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.


Please explain in the git commit message why "rte_stack_lf_stubs.h" is
introduced, to have a better history.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-22 10:20   ` [dpdk-dev] [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
@ 2019-07-22 11:50     ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-22 11:50 UTC (permalink / raw)
  To: jerinj, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Monday, July 22, 2019 6:20 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; gage.eads@intel.com;
> hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Monday, July 22, 2019 2:14 PM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>;
> > gage.eads@intel.com; hemant.agrawal@nxp.com;
> > Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> > Subject: [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> > Add 128-bit atomic compare exchange on aarch64.
> >
> > Suggested-by: Jerin Jacob <jerinj@marvell.com>
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > +
> > +#
> > +# Compile ARM LSE ATOMIC instructions statically #
> 
> There is NO value for the keyword "statically" here. Right?
Agree.  Will remove it.

> 
> > +CONFIG_RTE_ARM_FEATURE_ATOMICS=n
> 
> 
> > diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> > b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> > index cc5c64b..17b6dec 100644
> > --- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> > +++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> > @@ -6,6 +6,7 @@
> >
> >  CONFIG_RTE_MACHINE="thunderx2"
> >
> > +CONFIG_RTE_ARM_FEATURE_ATOMICS=y
> 
> 
> Add for octeontx2 as well.

OK. Will add it in v5.

> 
> >  CONFIG_RTE_CACHE_LINE_SIZE=64
> >  CONFIG_RTE_MAX_NUMA_NODES=2
> >  CONFIG_RTE_MAX_LCORE=256
> 
> 
> > +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> > +				rte_int128_t *exp,
> > +				const rte_int128_t *src,
> > +				unsigned int weak,
> > +				int success,
> > +				int failure)
> > +{
> > +	/* Always do strong CAS */
> > +	RTE_SET_USED(weak);
> > +	/* Ignore memory ordering for failure, memory order for
> > +	 * success must be stronger or equal
> > +	 */
> > +	RTE_SET_USED(failure);
> > +	/* Find invalid memory order */
> > +	RTE_ASSERT(success == __ATOMIC_RELAXED
> > +			|| success == __ATOMIC_ACQUIRE
> > +			|| success == __ATOMIC_RELEASE
> > +			|| success == __ATOMIC_ACQ_REL
> > +			|| success == __ATOMIC_SEQ_CST);
> > +
> > +#ifdef __ARM_FEATURE_ATOMICS
> 
> Shouldn't it be #if defined(__ARM_FEATURE_ATOMICS) ||
> defined(RTE_ARM_FEATURE_ATOMICS) ?

Yes. That was a mistake. Will update in the version 5. Thanks.

Thanks,
Phil Yang

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22 10:22     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
@ 2019-07-22 11:51       ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-22 11:51 UTC (permalink / raw)
  To: jerinj, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Monday, July 22, 2019 6:22 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; gage.eads@intel.com;
> hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v4 3/3] eal/stack: enable lock-free stack for
> aarch64
> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Monday, July 22, 2019 2:14 PM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>;
> > gage.eads@intel.com; hemant.agrawal@nxp.com;
> > Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> > Subject: [EXT] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64
> >
> > Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> 
> 
> Probably tell why "rte_stack_lf_stubs.h" introduced in git commit to have
> Better history.

Agree. I will update it. Thanks

Thanks,
Phil Yang

^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
                   ` (4 preceding siblings ...)
  2019-07-22  8:44 ` [dpdk-dev] [PATCH v4 " Phil Yang
@ 2019-07-22 13:06 ` " Phil Yang
  2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 2/3] test/atomic: add 128b compare and swap test Phil Yang
                     ` (2 more replies)
  2019-07-22 16:22 ` [dpdk-dev] [PATCH v6 " Phil Yang
  2019-07-23  5:57 ` [dpdk-dev] [PATCH v7 " Phil Yang
  7 siblings, 3 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-22 13:06 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Add 128-bit atomic compare exchange on aarch64.

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

---
v5:
1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 by default. (Jerin Jacob)
2. Record the reason for introducing "rte_stack_lf_stubs.h" in git commit.
(Jerin Jacob)
3. Fixed a conditional MACRO error in rte_atomic128_cmp_exchange. (Jerin
Jacob)

v4:
1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions. (Jerin Jacob)
2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin Jacob)
3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage Eads/Jerin Jacob)

v3:
1. Avoid code duplication with a macro. (Jerin Jacob)
2. Map invalid memory orders to the strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix 32-bit x86 builds issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

v2:
Initial version.

 config/arm/meson.build                             |   2 +
 config/common_base                                 |   5 +
 config/defconfig_arm64-octeontx2-linuxapp-gcc      |   3 +
 config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
 .../common/include/arch/arm/rte_atomic_64.h        | 162 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
 7 files changed, 189 insertions(+), 13 deletions(-)

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..1f5f471 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -68,6 +68,7 @@ flags_thunderx_extra = [
 	['RTE_USE_C11_MEM_MODEL', false]]
 flags_thunderx2_extra = [
 	['RTE_MACHINE', '"thunderx2"'],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_CACHE_LINE_SIZE', 64],
 	['RTE_MAX_NUMA_NODES', 2],
 	['RTE_MAX_LCORE', 256],
@@ -76,6 +77,7 @@ flags_octeontx2_extra = [
 	['RTE_MACHINE', '"octeontx2"'],
 	['RTE_MAX_NUMA_NODES', 1],
 	['RTE_MAX_LCORE', 24],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_EAL_IGB_UIO', false],
 	['RTE_USE_C11_MEM_MODEL', true]]
 
diff --git a/config/common_base b/config/common_base
index 8ef75c2..16dea5a 100644
--- a/config/common_base
+++ b/config/common_base
@@ -1067,3 +1067,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y
 # Compile the eventdev application
 #
 CONFIG_RTE_APP_EVENTDEV=y
+
+#
+# Compile ARM LSE ATOMIC instructions
+#
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
index f20da24..a6508e8 100644
--- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
+++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
@@ -19,3 +19,6 @@ CONFIG_RTE_EAL_IGB_UIO=n
 
 # Max supported NIX LFs
 CONFIG_RTE_MAX_VFIO_GROUPS=128
+
+# arm64 LSE ATOMIC support
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64b..17b6dec 100644
--- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
+++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
@@ -6,6 +6,7 @@
 
 CONFIG_RTE_MACHINE="thunderx2"
 
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..a040d69 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,164 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+					  (mo) == __ATOMIC_SEQ_CST)
+
+#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                               \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                     \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];                \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];            \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];            \
+	asm volatile(                                                           \
+			op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"       \
+			: [old0] "+r" (x0),                                             \
+			  [old1] "+r" (x1)                                              \
+			: [upd0] "r" (x2),                                              \
+			  [upd1] "r" (x3),                                              \
+			  [dst] "r" (dst)                                               \
+			: "memory");                                                    \
+	old.val[0] = x0;                                                        \
+	old.val[1] = x1;                                                        \
+	return old;                                                             \
+}
+
+__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
+__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
+__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
+__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
+#else
+#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+ldx_op_name(const rte_int128_t *src)                                        \
+{                                                                           \
+	rte_int128_t ret;                                                       \
+	asm volatile(                                                           \
+			op_string " %0, %1, %2"                                         \
+			: "=&r" (ret.val[0]),                                           \
+			  "=&r" (ret.val[1])                                            \
+			: "Q" (src->val[0])                                             \
+			: "memory");                                                    \
+	return ret;                                                             \
+}
+
+__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
+__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
+
+#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
+static inline uint32_t                                                      \
+stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
+{                                                                           \
+	uint32_t ret;                                                           \
+	asm volatile(                                                           \
+			op_string " %w0, %1, %2, %3"                                    \
+			: "=&r" (ret)                                                   \
+			: "r" (src.val[0]),                                             \
+			  "r" (src.val[1]),                                             \
+			  "Q" (dst->val[0])                                             \
+			: "memory");                                                    \
+	/* Return 0 on success, 1 on failure */                                 \
+	return ret;                                                             \
+}
+
+__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
+__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	if (success == __ATOMIC_RELAXED)
+		old = __rte_cas_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __rte_cas_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __rte_cas_release(dst, expected, desired);
+	else
+		old = __rte_cas_acq_rel(dst, expected, desired);
+#else
+	int ldx_mo = __MO_LOAD(success);
+	int stx_mo = __MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* ldx128 alone cannot guarantee atomicity;
+	 * src or old must be written back to verify atomicity of ldx128.
+	 */
+	do {
+		if (ldx_mo == __ATOMIC_RELAXED)
+			old = __rte_ldx_relaxed(dst);
+		else
+			old = __rte_ldx_acquire(dst);
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, desired);
+			else
+				ret = __rte_stx_release(dst, desired);
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain the
+			 * atomically read value of dst. This means, 'old' needs
+			 * to be stored back to ensure it was read atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, old);
+			else
+				ret = __rte_stx_release(dst, old);
+		}
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally updating expected removes
+	 * an 'if' statement.
+	 * expected should already be in register if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index e087c6c..1217129 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -212,18 +212,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dc..e6ab15a 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+#ifdef RTE_ARCH_64
+		__extension__ __int128 int128;
+#endif
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v5 2/3] test/atomic: add 128b compare and swap test
  2019-07-22 13:06 ` [dpdk-dev] [PATCH v5 " Phil Yang
@ 2019-07-22 13:06   ` Phil Yang
  2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-07-22 14:19   ` [dpdk-dev] [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
  2 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-22 13:06 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Gage Eads <gage.eads@intel.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>

---
 app/test/test_atomic.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..ff6ff88 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores wait for a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -216,6 +233,74 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats 128 bits compare and swap operations N rounds. In each
+ * iteration it runs compare and swap operation with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +425,37 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Once each 128-bit atomic
+	 * compare and swap operation is successful, it updates the global
+	 * 128-bit counter by 2 for the first 64-bit and 1 for the second
+	 * 64-bit. Each slave core iterates this test N times.
+	 * At the end of test, verify whether the first 64 bits of the 128-bit
+	 * counter and the second 64 bits differ by the total iterations. If
+	 * they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL, SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22 13:06 ` [dpdk-dev] [PATCH v5 " Phil Yang
  2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-07-22 13:06   ` Phil Yang
  2019-07-22 14:14     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
  2019-07-22 14:34     ` [dpdk-dev] " Eads, Gage
  2019-07-22 14:19   ` [dpdk-dev] [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
  2 siblings, 2 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-22 13:06 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Introduced a new header, rte_stack_lf_stubs.h, to reduce the ifdef clutter
across generic and c11 files. The rte_stack_lf_stubs.h contains stub
implementations of __rte_stack_lf_count, __rte_stack_lf_push_elems and
__rte_stack_lf_pop_elems.

Suggested-by: Gage Eads <gage.eads@intel.com>
Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

---
 doc/guides/prog_guide/env_abstraction_layer.rst |  4 +-
 doc/guides/rel_notes/release_19_08.rst          |  3 ++
 lib/librte_stack/rte_stack_lf.h                 |  4 ++
 lib/librte_stack/rte_stack_lf_c11.h             | 16 -------
 lib/librte_stack/rte_stack_lf_generic.h         | 16 -------
 lib/librte_stack/rte_stack_lf_stubs.h           | 59 +++++++++++++++++++++++++
 6 files changed, 68 insertions(+), 34 deletions(-)
 create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index f15bcd9..d569f95 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -592,8 +592,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 0a3f840..25d45c1 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -212,6 +212,9 @@ New Features
 
   Added multiple cores feature to compression perf tool application.
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0..e67630c 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,11 +5,15 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_
 
+#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
+#include "rte_stack_lf_stubs.h"
+#else
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"
 #endif
+#endif
 
 /**
  * @internal Push several objects on the lock-free stack (MT-safe).
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..999359f 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	 * to the LIFO len update.
 	 */
 	__atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	uint64_t len;
 	int success;
@@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_C11_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..3abbb53 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	rte_atomic64_add((rte_atomic64_t *)&list->len, num);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_GENERIC_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
new file mode 100644
index 0000000..d924bc6
--- /dev/null
+++ b/lib/librte_stack/rte_stack_lf_stubs.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_STACK_LF_STUBS_H_
+#define _RTE_STACK_LF_STUBS_H_
+
+#include <rte_common.h>
+#include <rte_atomic.h>
+
+static __rte_always_inline unsigned int
+__rte_stack_lf_count(struct rte_stack *s)
+{
+	/* stack_lf_push() and stack_lf_pop() do not update the list's contents
+	 * and stack_lf->len atomically, which can cause the list to appear
+	 * shorter than it actually is if this function is called while other
+	 * threads are modifying the list.
+	 *
+	 * However, given the inherently approximate nature of the get_count
+	 * callback -- even if the list and its size were updated atomically,
+	 * the size could change between when get_count executes and when the
+	 * value is returned to the caller -- this is acceptable.
+	 *
+	 * The stack_lf->len updates are placed such that the list may appear to
+	 * have fewer elements than it does, but will never appear to have more
+	 * elements. If the mempool is near-empty to the point that this is a
+	 * concern, the user should consider increasing the mempool size.
+	 */
+	return (unsigned int)rte_atomic64_read((rte_atomic64_t *)
+			&s->stack_lf.used.len);
+}
+
+static __rte_always_inline void
+__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
+			  struct rte_stack_lf_elem *first,
+			  struct rte_stack_lf_elem *last,
+			  unsigned int num)
+{
+	RTE_SET_USED(first);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+}
+
+static __rte_always_inline struct rte_stack_lf_elem *
+__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
+			 unsigned int num,
+			 void **obj_table,
+			 struct rte_stack_lf_elem **last)
+{
+	RTE_SET_USED(obj_table);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+
+	return NULL;
+}
+
+#endif /* _RTE_STACK_LF_STUBS_H_ */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-07-22 14:14     ` " Jerin Jacob Kollanukkaran
  2019-07-22 15:19       ` Phil Yang (Arm Technology China)
  2019-07-22 14:34     ` [dpdk-dev] " Eads, Gage
  1 sibling, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-22 14:14 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Monday, July 22, 2019 6:36 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> gage.eads@intel.com; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> Subject: [EXT] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64
> 
> Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> 
> Introduced a new header, rte_stack_lf_stubs.h, to reduce the ifdef clutter
> across generic and c11 files. The rte_stack_lf_stubs.h contains stub
> implementations of __rte_stack_lf_count, __rte_stack_lf_push_elems and
> __rte_stack_lf_pop_elems.
> 
> Suggested-by: Gage Eads <gage.eads@intel.com>
> Suggested-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

# Build issue with 32 bit build, Looks like new header file missing in SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include

In file included from /home/jerin/dpdk.org/build/include/rte_stack.h:98,
                 from /home/jerin/dpdk.org/drivers/mempool/stack/rte_mempool_stack.c:7:
/home/jerin/dpdk.org/build/include/rte_stack_lf.h:9:10: fatal error: rte_stack_lf_stubs.h: No such file or directory
    9 | #include "rte_stack_lf_stubs.h"
      |          ^~~~~~~~~~~~~~~~~~~~~~



^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-22 13:06 ` [dpdk-dev] [PATCH v5 " Phil Yang
  2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-07-22 14:19   ` Jerin Jacob Kollanukkaran
  2019-07-22 16:23     ` Phil Yang (Arm Technology China)
  2 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-22 14:19 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

It looks good. Some minor comments.

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Monday, July 22, 2019 6:36 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> gage.eads@intel.com; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> Subject: [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> ----------------------------------------------------------------------
> Add 128-bit atomic compare exchange on aarch64.
> 
> Suggested-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> 
> diff --git a/config/common_base b/config/common_base index
> 8ef75c2..16dea5a 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -1067,3 +1067,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y  # Compile the
> eventdev application  #  CONFIG_RTE_APP_EVENTDEV=y
> +
> +#
> +# Compile ARM LSE ATOMIC instructions
> +#
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=n


Move this config after "Compile Environment Abstraction Layer" section.
Now it is at end of file. Better to group the configs.


> diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc
> b/config/defconfig_arm64-octeontx2-linuxapp-gcc
> index f20da24..a6508e8 100644
> --- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
> +++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
> @@ -19,3 +19,6 @@ CONFIG_RTE_EAL_IGB_UIO=n
> 
>  # Max supported NIX LFs
>  CONFIG_RTE_MAX_VFIO_GROUPS=128
> +
> +# arm64 LSE ATOMIC support
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=y

Move this config after CONFIG_RTE_MAX_LCORE=24


> diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> index cc5c64b..17b6dec 100644
> --- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> +++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> @@ -6,6 +6,7 @@
> 
>  CONFIG_RTE_MACHINE="thunderx2"
> 
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=y
>  CONFIG_RTE_CACHE_LINE_SIZE=64
>  CONFIG_RTE_MAX_NUMA_NODES=2
>  CONFIG_RTE_MAX_LCORE=256

Move the new config here

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-07-22 14:14     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
@ 2019-07-22 14:34     ` " Eads, Gage
  2019-07-22 14:43       ` Phil Yang (Arm Technology China)
  1 sibling, 1 reply; 91+ messages in thread
From: Eads, Gage @ 2019-07-22 14:34 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> diff --git a/lib/librte_stack/rte_stack_lf_stubs.h
> b/lib/librte_stack/rte_stack_lf_stubs.h
> new file mode 100644
> index 0000000..d924bc6
> --- /dev/null
> +++ b/lib/librte_stack/rte_stack_lf_stubs.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2019 Arm Limited
> + */
> +
> +#ifndef _RTE_STACK_LF_STUBS_H_
> +#define _RTE_STACK_LF_STUBS_H_
> +
> +#include <rte_common.h>
> +#include <rte_atomic.h>
> +
> +static __rte_always_inline unsigned int __rte_stack_lf_count(struct
> +rte_stack *s) {
> +	/* stack_lf_push() and stack_lf_pop() do not update the list's
> contents
> +	 * and stack_lf->len atomically, which can cause the list to appear
> +	 * shorter than it actually is if this function is called while other
> +	 * threads are modifying the list.
> +	 *
> +	 * However, given the inherently approximate nature of the
> get_count
> +	 * callback -- even if the list and its size were updated atomically,
> +	 * the size could change between when get_count executes and
> when the
> +	 * value is returned to the caller -- this is acceptable.
> +	 *
> +	 * The stack_lf->len updates are placed such that the list may appear
> to
> +	 * have fewer elements than it does, but will never appear to have
> more
> +	 * elements. If the mempool is near-empty to the point that this is a
> +	 * concern, the user should consider increasing the mempool size.
> +	 */
> +	return (unsigned int)rte_atomic64_read((rte_atomic64_t *)
> +			&s->stack_lf.used.len);
> +}

Since the stub functions are not functional, these comments should be removed and the function simply return 0. The rte_atomic.h include can be removed as well.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22 14:34     ` [dpdk-dev] " Eads, Gage
@ 2019-07-22 14:43       ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-22 14:43 UTC (permalink / raw)
  To: Eads, Gage, dev
  Cc: thomas, jerinj, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Eads, Gage <gage.eads@intel.com>
> Sent: Monday, July 22, 2019 10:35 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; jerinj@marvell.com; hemant.agrawal@nxp.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64
> 
> > diff --git a/lib/librte_stack/rte_stack_lf_stubs.h
> > b/lib/librte_stack/rte_stack_lf_stubs.h
> > new file mode 100644
> > index 0000000..d924bc6
> > --- /dev/null
> > +++ b/lib/librte_stack/rte_stack_lf_stubs.h
> > @@ -0,0 +1,59 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2019 Arm Limited
> > + */
> > +
> > +#ifndef _RTE_STACK_LF_STUBS_H_
> > +#define _RTE_STACK_LF_STUBS_H_
> > +
> > +#include <rte_common.h>
> > +#include <rte_atomic.h>
> > +
> > +static __rte_always_inline unsigned int __rte_stack_lf_count(struct
> > +rte_stack *s) {
> > +	/* stack_lf_push() and stack_lf_pop() do not update the list's
> > contents
> > +	 * and stack_lf->len atomically, which can cause the list to appear
> > +	 * shorter than it actually is if this function is called while other
> > +	 * threads are modifying the list.
> > +	 *
> > +	 * However, given the inherently approximate nature of the
> > get_count
> > +	 * callback -- even if the list and its size were updated atomically,
> > +	 * the size could change between when get_count executes and
> > when the
> > +	 * value is returned to the caller -- this is acceptable.
> > +	 *
> > +	 * The stack_lf->len updates are placed such that the list may appear
> > to
> > +	 * have fewer elements than it does, but will never appear to have
> > more
> > +	 * elements. If the mempool is near-empty to the point that this is a
> > +	 * concern, the user should consider increasing the mempool size.
> > +	 */
> > +	return (unsigned int)rte_atomic64_read((rte_atomic64_t *)
> > +			&s->stack_lf.used.len);
> > +}
> 
> Since the stub functions are not functional, these comments should be
> removed and the function simply return 0. The rte_atomic.h include can be
> removed as well.

OK. Will do it. Thanks.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22 14:14     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
@ 2019-07-22 15:19       ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-22 15:19 UTC (permalink / raw)
  To: jerinj, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Monday, July 22, 2019 10:15 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; gage.eads@intel.com;
> hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v5 3/3] eal/stack: enable lock-free stack for
> aarch64
> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Monday, July 22, 2019 6:36 PM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>;
> > gage.eads@intel.com; hemant.agrawal@nxp.com;
> > Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> > Subject: [EXT] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64
> >
> > Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> >
> > Introduced a new header, rte_stack_lf_stubs.h, to reduce the ifdef clutter
> > across generic and c11 files. The rte_stack_lf_stubs.h contains stub
> > implementations of __rte_stack_lf_count, __rte_stack_lf_push_elems
> and
> > __rte_stack_lf_pop_elems.
> >
> > Suggested-by: Gage Eads <gage.eads@intel.com>
> > Suggested-by: Jerin Jacob <jerinj@marvell.com>
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> 
> # Build issue with 32 bit build, Looks like new header file missing in SYMLINK-
> $(CONFIG_RTE_LIBRTE_STACK)-include
> 
> In file included from /home/jerin/dpdk.org/build/include/rte_stack.h:98,
>                  from
> /home/jerin/dpdk.org/drivers/mempool/stack/rte_mempool_stack.c:7:
> /home/jerin/dpdk.org/build/include/rte_stack_lf.h:9:10: fatal error:
> rte_stack_lf_stubs.h: No such file or directory
>     9 | #include "rte_stack_lf_stubs.h"
>       |          ^~~~~~~~~~~~~~~~~~~~~~
> 

Yes. Thanks. Will fix it in next version.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
                   ` (5 preceding siblings ...)
  2019-07-22 13:06 ` [dpdk-dev] [PATCH v5 " Phil Yang
@ 2019-07-22 16:22 ` " Phil Yang
  2019-07-22 16:22   ` [dpdk-dev] [PATCH v6 2/3] test/atomic: add 128b compare and swap test Phil Yang
                     ` (2 more replies)
  2019-07-23  5:57 ` [dpdk-dev] [PATCH v7 " Phil Yang
  7 siblings, 3 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-22 16:22 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Add 128-bit atomic compare exchange on aarch64.

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

---
v6:
1. Put the RTE_ARM_FEATURE_ATOMICS flag into EAL group. (Jerin Jacob)
2. Keep rte_stack_lf_stubs.h doing nothing. (Gage Eads)
3. Fixed 32 bit build issue.

v5:
1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 by default. (Jerin Jacob)
2. Record the reason for introducing "rte_stack_lf_stubs.h" in git commit.
(Jerin Jacob)
3. Fixed a conditional MACRO error in rte_atomic128_cmp_exchange. (Jerin
Jacob)

v4:
1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions. (Jerin Jacob)
2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin Jacob)
3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage Eads/Jerin Jacob)

v3:
1. Avoid code duplication with a macro. (Jerin Jacob)
2. Make invalid memory order to strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix 32-bit x86 builds issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

v2:
Initial version.

 config/arm/meson.build                             |   2 +
 config/common_base                                 |   2 +
 config/defconfig_arm64-octeontx2-linuxapp-gcc      |   1 +
 config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
 .../common/include/arch/arm/rte_atomic_64.h        | 162 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
 7 files changed, 184 insertions(+), 13 deletions(-)

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..9f28271 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -71,11 +71,13 @@ flags_thunderx2_extra = [
 	['RTE_CACHE_LINE_SIZE', 64],
 	['RTE_MAX_NUMA_NODES', 2],
 	['RTE_MAX_LCORE', 256],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_USE_C11_MEM_MODEL', true]]
 flags_octeontx2_extra = [
 	['RTE_MACHINE', '"octeontx2"'],
 	['RTE_MAX_NUMA_NODES', 1],
 	['RTE_MAX_LCORE', 24],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_EAL_IGB_UIO', false],
 	['RTE_USE_C11_MEM_MODEL', true]]
 
diff --git a/config/common_base b/config/common_base
index 8ef75c2..de6d1e0 100644
--- a/config/common_base
+++ b/config/common_base
@@ -82,6 +82,8 @@ CONFIG_RTE_MAX_LCORE=128
 CONFIG_RTE_MAX_NUMA_NODES=8
 CONFIG_RTE_MAX_HEAPS=32
 CONFIG_RTE_MAX_MEMSEG_LISTS=64
+# Use LSE ATOMIC instructions
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
 # each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
 # or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
 CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
index f20da24..7687dbe 100644
--- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
+++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
@@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
 CONFIG_RTE_CACHE_LINE_SIZE=128
 CONFIG_RTE_MAX_NUMA_NODES=1
 CONFIG_RTE_MAX_LCORE=24
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 
 # Doesn't support NUMA
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64b..af4a89c 100644
--- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
+++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
@@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..a040d69 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,164 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+					  (mo) == __ATOMIC_SEQ_CST)
+
+#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                               \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                     \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];                \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];            \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];            \
+	asm volatile(                                                           \
+			op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"       \
+			: [old0] "+r" (x0),                                             \
+			  [old1] "+r" (x1)                                              \
+			: [upd0] "r" (x2),                                              \
+			  [upd1] "r" (x3),                                              \
+			  [dst] "r" (dst)                                               \
+			: "memory");                                                    \
+	old.val[0] = x0;                                                        \
+	old.val[1] = x1;                                                        \
+	return old;                                                             \
+}
+
+__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
+__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
+__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
+__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
+#else
+#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+ldx_op_name(const rte_int128_t *src)                                        \
+{                                                                           \
+	rte_int128_t ret;                                                       \
+	asm volatile(                                                           \
+			op_string " %0, %1, %2"                                         \
+			: "=&r" (ret.val[0]),                                           \
+			  "=&r" (ret.val[1])                                            \
+			: "Q" (src->val[0])                                             \
+			: "memory");                                                    \
+	return ret;                                                             \
+}
+
+__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
+__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
+
+#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
+static inline uint32_t                                                      \
+stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
+{                                                                           \
+	uint32_t ret;                                                           \
+	asm volatile(                                                           \
+			op_string " %w0, %1, %2, %3"                                    \
+			: "=&r" (ret)                                                   \
+			: "r" (src.val[0]),                                             \
+			  "r" (src.val[1]),                                             \
+			  "Q" (dst->val[0])                                             \
+			: "memory");                                                    \
+	/* Return 0 on success, 1 on failure */                                 \
+	return ret;                                                             \
+}
+
+__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
+__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	if (success == __ATOMIC_RELAXED)
+		old = __rte_cas_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __rte_cas_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __rte_cas_release(dst, expected, desired);
+	else
+		old = __rte_cas_acq_rel(dst, expected, desired);
+#else
+	int ldx_mo = __MO_LOAD(success);
+	int stx_mo = __MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* ldxp alone does not guarantee a 128-bit atomic read; a successful
+	 * stxp of either 'desired' or 'old' is needed to confirm atomicity.
+	 */
+	do {
+		if (ldx_mo == __ATOMIC_RELAXED)
+			old = __rte_ldx_relaxed(dst);
+		else
+			old = __rte_ldx_acquire(dst);
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, desired);
+			else
+				ret = __rte_stx_release(dst, desired);
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain the
+			 * atomically read value of dst. This means, 'old' needs
+			 * to be stored back to ensure it was read atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, old);
+			else
+				ret = __rte_stx_release(dst, old);
+		}
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally updating *exp with 'old' removes
+	 * an 'if' statement (on success they are equal).
+	 * expected should already be in register if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index e087c6c..1217129 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -212,18 +212,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dc..e6ab15a 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+#ifdef RTE_ARCH_64
+		__extension__ __int128 int128;
+#endif
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v6 2/3] test/atomic: add 128b compare and swap test
  2019-07-22 16:22 ` [dpdk-dev] [PATCH v6 " Phil Yang
@ 2019-07-22 16:22   ` Phil Yang
  2019-07-22 16:22   ` [dpdk-dev] [PATCH v6 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-07-22 16:57   ` [dpdk-dev] [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
  2 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-22 16:22 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Gage Eads <gage.eads@intel.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>

---
 app/test/test_atomic.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..ff6ff88 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores are waiting a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -216,6 +233,74 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats the 128-bit compare and swap operations N times. In each
+ * iteration it runs the compare and swap with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +425,37 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of the rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Once each 128-bit atomic
+	 * compare and swap operation is successful, it updates the global
+	 * 128-bit counter by 2 for the first 64-bit and 1 for the second
+	 * 64-bit. Each slave core iterates this test N times.
+	 * At the end of the test, verify whether the first 64 bits of the
+	 * 128-bit counter and the second 64 bits differ by the total number of
+	 * iterations. If they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL, SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v6 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22 16:22 ` [dpdk-dev] [PATCH v6 " Phil Yang
  2019-07-22 16:22   ` [dpdk-dev] [PATCH v6 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-07-22 16:22   ` Phil Yang
  2019-07-22 16:59     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
  2019-07-22 16:57   ` [dpdk-dev] [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
  2 siblings, 1 reply; 91+ messages in thread
From: Phil Yang @ 2019-07-22 16:22 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Introduced a new header to reduce the ifdef clutter across generic and c11
files. The rte_stack_lf_stubs.h contains stub implementations of
__rte_stack_lf_count, __rte_stack_lf_push_elems and
__rte_stack_lf_pop_elems.

Suggested-by: Gage Eads <gage.eads@intel.com>
Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

---
 doc/guides/prog_guide/env_abstraction_layer.rst |  4 +--
 doc/guides/rel_notes/release_19_08.rst          |  3 ++
 lib/librte_stack/Makefile                       |  3 +-
 lib/librte_stack/rte_stack_lf.h                 |  4 +++
 lib/librte_stack/rte_stack_lf_c11.h             | 16 ---------
 lib/librte_stack/rte_stack_lf_generic.h         | 16 ---------
 lib/librte_stack/rte_stack_lf_stubs.h           | 44 +++++++++++++++++++++++++
 7 files changed, 55 insertions(+), 35 deletions(-)
 create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index f15bcd9..d569f95 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -592,8 +592,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 0a3f840..25d45c1 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -212,6 +212,9 @@ New Features
 
   Added multiple cores feature to compression perf tool application.
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
index 8d18ce5..c337ab7 100644
--- a/lib/librte_stack/Makefile
+++ b/lib/librte_stack/Makefile
@@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include := rte_stack.h \
 					      rte_stack_std.h \
 					      rte_stack_lf.h \
 					      rte_stack_lf_generic.h \
-					      rte_stack_lf_c11.h
+					      rte_stack_lf_c11.h \
+						  rte_stack_lf_stubs.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0..e67630c 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,11 +5,15 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_
 
+#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
+#include "rte_stack_lf_stubs.h"
+#else
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"
 #endif
+#endif
 
 /**
  * @internal Push several objects on the lock-free stack (MT-safe).
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..999359f 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	 * to the LIFO len update.
 	 */
 	__atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	uint64_t len;
 	int success;
@@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_C11_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..3abbb53 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	rte_atomic64_add((rte_atomic64_t *)&list->len, num);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_GENERIC_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
new file mode 100644
index 0000000..a05abf1
--- /dev/null
+++ b/lib/librte_stack/rte_stack_lf_stubs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_STACK_LF_STUBS_H_
+#define _RTE_STACK_LF_STUBS_H_
+
+#include <rte_common.h>
+
+static __rte_always_inline unsigned int
+__rte_stack_lf_count(struct rte_stack *s)
+{
+	RTE_SET_USED(s);
+
+	return 0;
+}
+
+static __rte_always_inline void
+__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
+			  struct rte_stack_lf_elem *first,
+			  struct rte_stack_lf_elem *last,
+			  unsigned int num)
+{
+	RTE_SET_USED(first);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+}
+
+static __rte_always_inline struct rte_stack_lf_elem *
+__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
+			 unsigned int num,
+			 void **obj_table,
+			 struct rte_stack_lf_elem **last)
+{
+	RTE_SET_USED(obj_table);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+
+	return NULL;
+}
+
+#endif /* _RTE_STACK_LF_STUBS_H_ */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-22 14:19   ` [dpdk-dev] [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
@ 2019-07-22 16:23     ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-22 16:23 UTC (permalink / raw)
  To: jerinj, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Monday, July 22, 2019 10:20 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; gage.eads@intel.com;
> hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> It looks good. Some minor comments.

Thanks. Updated in the new version.

> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Monday, July 22, 2019 6:36 PM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>;
> > gage.eads@intel.com; hemant.agrawal@nxp.com;
> > Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> > Subject: [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare
> > exchange
> > ----------------------------------------------------------------------
> > Add 128-bit atomic compare exchange on aarch64.
> >
> > Suggested-by: Jerin Jacob <jerinj@marvell.com>
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> >
> > diff --git a/config/common_base b/config/common_base index
> > 8ef75c2..16dea5a 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -1067,3 +1067,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y  # Compile
> the
> > eventdev application  #  CONFIG_RTE_APP_EVENTDEV=y
> > +
> > +#
> > +# Compile ARM LSE ATOMIC instructions
> > +#
> > +CONFIG_RTE_ARM_FEATURE_ATOMICS=n
> 
> 
> Move this config after "Compile Environment Abstraction Layer" section.
> Now it is at end of file. Better to group the configs.
> 
> 
> > diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc
> > b/config/defconfig_arm64-octeontx2-linuxapp-gcc
> > index f20da24..a6508e8 100644
> > --- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
> > +++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
> > @@ -19,3 +19,6 @@ CONFIG_RTE_EAL_IGB_UIO=n
> >
> >  # Max supported NIX LFs
> >  CONFIG_RTE_MAX_VFIO_GROUPS=128
> > +
> > +# arm64 LSE ATOMIC support
> > +CONFIG_RTE_ARM_FEATURE_ATOMICS=y
> 
> Move this config after CONFIG_RTE_MAX_LCORE=24
> 
> 
> > diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> > b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> > index cc5c64b..17b6dec 100644
> > --- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> > +++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> > @@ -6,6 +6,7 @@
> >
> >  CONFIG_RTE_MACHINE="thunderx2"
> >
> > +CONFIG_RTE_ARM_FEATURE_ATOMICS=y
> >  CONFIG_RTE_CACHE_LINE_SIZE=64
> >  CONFIG_RTE_MAX_NUMA_NODES=2
> >  CONFIG_RTE_MAX_LCORE=256
> 
> Move the new config here

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-22 16:22 ` [dpdk-dev] [PATCH v6 " Phil Yang
  2019-07-22 16:22   ` [dpdk-dev] [PATCH v6 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-07-22 16:22   ` [dpdk-dev] [PATCH v6 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-07-22 16:57   ` Jerin Jacob Kollanukkaran
  2019-07-23  3:28     ` Phil Yang (Arm Technology China)
  2 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-22 16:57 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Monday, July 22, 2019 9:53 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> gage.eads@intel.com; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> Subject: [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> Add 128-bit atomic compare exchange on aarch64.
> 
> Suggested-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> 
> ---
> diff --git a/config/common_base b/config/common_base index
> 8ef75c2..de6d1e0 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -82,6 +82,8 @@ CONFIG_RTE_MAX_LCORE=128
>  CONFIG_RTE_MAX_NUMA_NODES=8
>  CONFIG_RTE_MAX_HEAPS=32
>  CONFIG_RTE_MAX_MEMSEG_LISTS=64


Add new line here. 

> +# Use LSE ATOMIC instructions

I think, you can change the comment to "Use ARM LSE ATOMIC instructions"

> +CONFIG_RTE_ARM_FEATURE_ATOMICS=n

This patch series has the following checkpatch warnings. Please fix them.

With above fixes you can add my acked-by in 1/3 and 3/3 patches in next revision.


I think, you can ignore following warning.
WARNING:MACRO_WITH_FLOW_CONTROL: Macros with flow control statements should be avoided

[master]dell[dpdk.org] $ ./devtools/checkpatches.sh

### eal/arm64: add 128-bit atomic compare exchange

WARNING:LONG_LINE: line over 80 characters
#103: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:60:
+               rte_int128_t updated)                                               \

WARNING:LONG_LINE: line over 80 characters
#108: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:65:
+        */                                                                     \

WARNING:LINE_CONTINUATIONS: Avoid unnecessary line continuations
#108: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:65:
+        */                                                                     \

WARNING:LONG_LINE: line over 80 characters
#109: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:66:
+       register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \

WARNING:LONG_LINE: line over 80 characters
#110: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:67:
+       register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];                \

WARNING:LONG_LINE: line over 80 characters
#111: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:68:
+       register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];            \

WARNING:LONG_LINE: line over 80 characters
#112: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:69:
+       register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];            \

WARNING:LONG_LINE: line over 80 characters
#113: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:70:
+       asm volatile(                                                           \

WARNING:LONG_LINE: line over 80 characters
#115: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:72:
+                       : [old0] "+r" (x0),                                             \

WARNING:LONG_LINE: line over 80 characters
#116: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:73:
+                         [old1] "+r" (x1)                                              \

WARNING:LONG_LINE: line over 80 characters
#118: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:75:                                                                                                                                                              
+                         [upd1] "r" (x3),                                              \

WARNING:LONG_LINE: line over 80 characters
#119: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:76:
+                         [dst] "r" (dst)                                               \

WARNING:LONG_LINE: line over 80 characters
#120: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:77:
+                       : "memory");                                                    \

WARNING:LONG_LINE: line over 80 characters
#121: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:78:
+       old.val[0] = x0;                                                        \

WARNING:LONG_LINE: line over 80 characters
#122: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:79:
+       old.val[1] = x1;                                                        \

WARNING:LONG_LINE: line over 80 characters
#123: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:80:
+       return old;                                                             \

WARNING:LONG_LINE: line over 80 characters
#135: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:92:
+       rte_int128_t ret;                                                       \

WARNING:LONG_LINE: line over 80 characters
#136: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:93:
+       asm volatile(                                                           \

WARNING:LONG_LINE: line over 80 characters
#138: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:95:
+                       : "=&r" (ret.val[0]),                                           \

WARNING:LONG_LINE: line over 80 characters
#139: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:96:
+                         "=&r" (ret.val[1])                                            \

WARNING:LONG_LINE: line over 80 characters
#140: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:97:
+                       : "Q" (src->val[0])                                             \

WARNING:LONG_LINE: line over 80 characters


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v6 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-22 16:22   ` [dpdk-dev] [PATCH v6 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-07-22 16:59     ` " Jerin Jacob Kollanukkaran
  0 siblings, 0 replies; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-22 16:59 UTC (permalink / raw)
  To: Phil Yang, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Monday, July 22, 2019 9:53 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> gage.eads@intel.com; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> Subject: [EXT] [PATCH v6 3/3] eal/stack: enable lock-free stack for aarch64
> 
> Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> 
> Introduced a new header to reduce the ifdef clutter across generic and c11
> files. The rte_stack_lf_stubs.h contains stub implementations of
> __rte_stack_lf_count, __rte_stack_lf_push_elems and
> __rte_stack_lf_pop_elems.
> 
> Suggested-by: Gage Eads <gage.eads@intel.com>
> Suggested-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

Acked-by: Jerin Jacob <jerinj@marvell.com>

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-22 16:57   ` [dpdk-dev] [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
@ 2019-07-23  3:28     ` Phil Yang (Arm Technology China)
  2019-07-23  7:09       ` Jerin Jacob Kollanukkaran
  0 siblings, 1 reply; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-23  3:28 UTC (permalink / raw)
  To: jerinj, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Tuesday, July 23, 2019 12:57 AM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; gage.eads@intel.com;
> hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Monday, July 22, 2019 9:53 PM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>;
> > gage.eads@intel.com; hemant.agrawal@nxp.com;
> > Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; nd@arm.com
> > Subject: [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare
> > exchange
> >
> > Add 128-bit atomic compare exchange on aarch64.
> >
> > Suggested-by: Jerin Jacob <jerinj@marvell.com>
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> >
> > ---
> > diff --git a/config/common_base b/config/common_base index
> > 8ef75c2..de6d1e0 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -82,6 +82,8 @@ CONFIG_RTE_MAX_LCORE=128
> >  CONFIG_RTE_MAX_NUMA_NODES=8
> >  CONFIG_RTE_MAX_HEAPS=32
> >  CONFIG_RTE_MAX_MEMSEG_LISTS=64
> 
> 
> Add new line here.
> 
> > +# Use LSE ATOMIC instructions
> 
> I think, you can change the comment to "Use ARM LSE ATOMIC instructions"

OK. Will fix it.

> 
> > +CONFIG_RTE_ARM_FEATURE_ATOMICS=n
> 
> This patches series has following check patch warning. Please fix it
> 
> With above fixes you can add my acked-by in 1/3 and 3/3 patches in next
> revision.

Thank you for your review and your comments.

> 
> 
> I think, you can ignore following warning.
> WARNING:MACRO_WITH_FLOW_CONTROL: Macros with flow control
> statements should be avoided
> 
> [master]dell[dpdk.org] $ ./devtools/checkpatches.sh

Yes. I ran this check before upstreaming this patch.
I ignored the 'WARNING:MACRO_WITH_FLOW_CONTROL: Macros with flow control statements should be avoided' warnings.
However, I didn't see any LONG_LINE warnings in the CI report here: http://mails.dpdk.org/archives/test-report/2019-July/090889.html
BTW, I think these 3 warnings in the report might be unavoidable.

> 
> ### eal/arm64: add 128-bit atomic compare exchange
> 
> WARNING:LONG_LINE: line over 80 characters

The default tab size is 8. You might need to change it to 4 in the /usr/src/linux/scripts/checkpatch tool. FYI. 

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index a09333f..d3865a4 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -1211,7 +1211,7 @@ sub expand_tabs {
         if ($c eq "\t") {
             $res .= ' ';
             $n++;
-            for (; ($n % 8) != 0; $n++) {
+            for (; ($n % 4) != 0; $n++) {
                 $res .= ' ';
             }
             next;
@@ -3688,11 +3688,11 @@ sub process {
             #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> 
             cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";

             if ($check && $s ne '' &&
-                (($sindent % 8) != 0 ||
+                (($sindent % 4) != 0 ||
                  ($sindent < $indent) ||
                  ($sindent == $indent &&
                   ($s !~ /^\s*(?:\}|\{|else\b)/)) ||
-                 ($sindent > $indent + 8))) {
+                 ($sindent > $indent + 4))) {
                 WARN("SUSPECT_CODE_INDENT",
                      "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
             }


> #103: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:60:
> +               rte_int128_t updated)                                               \
> 
> WARNING:LONG_LINE: line over 80 characters
> #108: FILE: lib/librte_eal/common/include/arch/arm/rte_atomic_64.h:65:
> +        */                                                                     \

^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v7 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
                   ` (6 preceding siblings ...)
  2019-07-22 16:22 ` [dpdk-dev] [PATCH v6 " Phil Yang
@ 2019-07-23  5:57 ` " Phil Yang
  2019-07-23  5:57   ` [dpdk-dev] [PATCH v7 2/3] test/atomic: add 128b compare and swap test Phil Yang
                     ` (2 more replies)
  7 siblings, 3 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-23  5:57 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Add 128-bit atomic compare exchange on aarch64.

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>

---
v7:
1. Adjust code comment.

v6:
1. Put the RTE_ARM_FEATURE_ATOMICS flag into EAL group. (Jerin Jacob)
2. Keep rte_stack_lf_stubs.h doing nothing. (Gage Eads)
3. Fixed 32 bit build issue.

v5:
1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 by default. (Jerin Jacob)
2. Record the reason for introducing "rte_stack_lf_stubs.h" in git commit.
(Jerin Jacob)
3. Fixed a conditional MACRO error in rte_atomic128_cmp_exchange. (Jerin
Jacob)

v4:
1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions. (Jerin Jacob)
2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin Jacob)
3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage Eads/Jerin Jacob)

v3:
1. Avoid code duplication with a macro. (Jerin Jacob)
2. Treat an invalid memory order as the strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix 32-bit x86 builds issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

v2:
Initial version.

 config/arm/meson.build                             |   2 +
 config/common_base                                 |   3 +
 config/defconfig_arm64-octeontx2-linuxapp-gcc      |   1 +
 config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
 .../common/include/arch/arm/rte_atomic_64.h        | 162 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
 7 files changed, 185 insertions(+), 13 deletions(-)

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..9f28271 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -71,11 +71,13 @@ flags_thunderx2_extra = [
 	['RTE_CACHE_LINE_SIZE', 64],
 	['RTE_MAX_NUMA_NODES', 2],
 	['RTE_MAX_LCORE', 256],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_USE_C11_MEM_MODEL', true]]
 flags_octeontx2_extra = [
 	['RTE_MACHINE', '"octeontx2"'],
 	['RTE_MAX_NUMA_NODES', 1],
 	['RTE_MAX_LCORE', 24],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_EAL_IGB_UIO', false],
 	['RTE_USE_C11_MEM_MODEL', true]]
 
diff --git a/config/common_base b/config/common_base
index 8ef75c2..2054480 100644
--- a/config/common_base
+++ b/config/common_base
@@ -82,6 +82,9 @@ CONFIG_RTE_MAX_LCORE=128
 CONFIG_RTE_MAX_NUMA_NODES=8
 CONFIG_RTE_MAX_HEAPS=32
 CONFIG_RTE_MAX_MEMSEG_LISTS=64
+
+# Use ARM LSE ATOMIC instructions
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
 # each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
 # or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
 CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
index f20da24..7687dbe 100644
--- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
+++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
@@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
 CONFIG_RTE_CACHE_LINE_SIZE=128
 CONFIG_RTE_MAX_NUMA_NODES=1
 CONFIG_RTE_MAX_LCORE=24
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 
 # Doesn't support NUMA
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64b..af4a89c 100644
--- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
+++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
@@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..a040d69 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,164 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+					  (mo) == __ATOMIC_SEQ_CST)
+
+#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                               \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                     \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];                \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];                \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];            \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];            \
+	asm volatile(                                                           \
+			op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"       \
+			: [old0] "+r" (x0),                                             \
+			  [old1] "+r" (x1)                                              \
+			: [upd0] "r" (x2),                                              \
+			  [upd1] "r" (x3),                                              \
+			  [dst] "r" (dst)                                               \
+			: "memory");                                                    \
+	old.val[0] = x0;                                                        \
+	old.val[1] = x1;                                                        \
+	return old;                                                             \
+}
+
+__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
+__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
+__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
+__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
+#else
+#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+ldx_op_name(const rte_int128_t *src)                                        \
+{                                                                           \
+	rte_int128_t ret;                                                       \
+	asm volatile(                                                           \
+			op_string " %0, %1, %2"                                         \
+			: "=&r" (ret.val[0]),                                           \
+			  "=&r" (ret.val[1])                                            \
+			: "Q" (src->val[0])                                             \
+			: "memory");                                                    \
+	return ret;                                                             \
+}
+
+__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
+__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
+
+#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
+static inline uint32_t                                                      \
+stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
+{                                                                           \
+	uint32_t ret;                                                           \
+	asm volatile(                                                           \
+			op_string " %w0, %1, %2, %3"                                    \
+			: "=&r" (ret)                                                   \
+			: "r" (src.val[0]),                                             \
+			  "r" (src.val[1]),                                             \
+			  "Q" (dst->val[0])                                             \
+			: "memory");                                                    \
+	/* Return 0 on success, 1 on failure */                                 \
+	return ret;                                                             \
+}
+
+__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
+__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	if (success == __ATOMIC_RELAXED)
+		old = __rte_cas_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __rte_cas_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __rte_cas_release(dst, expected, desired);
+	else
+		old = __rte_cas_acq_rel(dst, expected, desired);
+#else
+	int ldx_mo = __MO_LOAD(success);
+	int stx_mo = __MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* The 128-bit exclusive load alone is not guaranteed to be atomic;
+	 * src or old must be written back to verify the load's atomicity.
+	 */
+	do {
+		if (ldx_mo == __ATOMIC_RELAXED)
+			old = __rte_ldx_relaxed(dst);
+		else
+			old = __rte_ldx_acquire(dst);
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, desired);
+			else
+				ret = __rte_stx_release(dst, desired);
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain the
+			 * atomically read value of dst. This means, 'old' needs
+			 * to be stored back to ensure it was read atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, old);
+			else
+				ret = __rte_stx_release(dst, old);
+		}
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally updating expected removes
+	 * an 'if' statement.
+	 * expected should already be in register if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index e087c6c..1217129 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -212,18 +212,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dc..e6ab15a 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+#ifdef RTE_ARCH_64
+		__extension__ __int128 int128;
+#endif
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v7 2/3] test/atomic: add 128b compare and swap test
  2019-07-23  5:57 ` [dpdk-dev] [PATCH v7 " Phil Yang
@ 2019-07-23  5:57   ` Phil Yang
  2019-07-23  5:57   ` [dpdk-dev] [PATCH v7 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-07-23  7:05   ` [dpdk-dev] [PATCH v8 1/3] eal/arm64: add 128-bit atomic compare exchange jerinj
  2 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-23  5:57 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Gage Eads <gage.eads@intel.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>

---
 app/test/test_atomic.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..ff6ff88 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores are waiting a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -216,6 +233,74 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats 128 bits compare and swap operations N rounds. In each
+ * iteration it runs compare and swap operation with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128, &expected,
+					&desired, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +425,37 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Once each 128-bit atomic
+	 * compare and swap operation is successful, it updates the global
+	 * 128-bit counter by 2 for the first 64-bit and 1 for the second
+	 * 64-bit. Each slave core iterates this test N times.
+	 * At the end of test, verify whether the first 64-bits of the 128-bit
+	 * counter and the second 64-bits differ by the total iterations. If
+	 * they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL, SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v7 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-23  5:57 ` [dpdk-dev] [PATCH v7 " Phil Yang
  2019-07-23  5:57   ` [dpdk-dev] [PATCH v7 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-07-23  5:57   ` Phil Yang
  2019-07-23  7:05   ` [dpdk-dev] [PATCH v8 1/3] eal/arm64: add 128-bit atomic compare exchange jerinj
  2 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-07-23  5:57 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, gage.eads, hemant.agrawal, Honnappa.Nagarahalli,
	gavin.hu, nd

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Introduced a new header to reduce the ifdef clutter across generic and c11
files. The rte_stack_lf_stubs.h contains stub implementations of
__rte_stack_lf_count, __rte_stack_lf_push_elems and
__rte_stack_lf_pop_elems.

Suggested-by: Gage Eads <gage.eads@intel.com>
Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>

---
 doc/guides/prog_guide/env_abstraction_layer.rst |  4 +--
 doc/guides/rel_notes/release_19_08.rst          |  3 ++
 lib/librte_stack/Makefile                       |  3 +-
 lib/librte_stack/rte_stack_lf.h                 |  4 +++
 lib/librte_stack/rte_stack_lf_c11.h             | 16 ---------
 lib/librte_stack/rte_stack_lf_generic.h         | 16 ---------
 lib/librte_stack/rte_stack_lf_stubs.h           | 44 +++++++++++++++++++++++++
 7 files changed, 55 insertions(+), 35 deletions(-)
 create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index 1487ea5..2335367 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -641,8 +641,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 50c83ce..55c792a 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -212,6 +212,9 @@ New Features
 
   Added multiple cores feature to compression perf tool application.
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
index 8d18ce5..c337ab7 100644
--- a/lib/librte_stack/Makefile
+++ b/lib/librte_stack/Makefile
@@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include := rte_stack.h \
 					      rte_stack_std.h \
 					      rte_stack_lf.h \
 					      rte_stack_lf_generic.h \
-					      rte_stack_lf_c11.h
+					      rte_stack_lf_c11.h \
+						  rte_stack_lf_stubs.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0..e67630c 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,11 +5,15 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_
 
+#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
+#include "rte_stack_lf_stubs.h"
+#else
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"
 #endif
+#endif
 
 /**
  * @internal Push several objects on the lock-free stack (MT-safe).
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..999359f 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	 * to the LIFO len update.
 	 */
 	__atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	uint64_t len;
 	int success;
@@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_C11_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..3abbb53 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	rte_atomic64_add((rte_atomic64_t *)&list->len, num);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_GENERIC_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
new file mode 100644
index 0000000..a05abf1
--- /dev/null
+++ b/lib/librte_stack/rte_stack_lf_stubs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_STACK_LF_STUBS_H_
+#define _RTE_STACK_LF_STUBS_H_
+
+#include <rte_common.h>
+
+static __rte_always_inline unsigned int
+__rte_stack_lf_count(struct rte_stack *s)
+{
+	RTE_SET_USED(s);
+
+	return 0;
+}
+
+static __rte_always_inline void
+__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
+			  struct rte_stack_lf_elem *first,
+			  struct rte_stack_lf_elem *last,
+			  unsigned int num)
+{
+	RTE_SET_USED(first);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+}
+
+static __rte_always_inline struct rte_stack_lf_elem *
+__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
+			 unsigned int num,
+			 void **obj_table,
+			 struct rte_stack_lf_elem **last)
+{
+	RTE_SET_USED(obj_table);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+
+	return NULL;
+}
+
+#endif /* _RTE_STACK_LF_STUBS_H_ */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v8 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-23  5:57 ` [dpdk-dev] [PATCH v7 " Phil Yang
  2019-07-23  5:57   ` [dpdk-dev] [PATCH v7 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-07-23  5:57   ` [dpdk-dev] [PATCH v7 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-07-23  7:05   ` jerinj
  2019-07-23  7:05     ` [dpdk-dev] [PATCH v8 2/3] test/atomic: add 128b compare and swap test jerinj
                       ` (2 more replies)
  2 siblings, 3 replies; 91+ messages in thread
From: jerinj @ 2019-07-23  7:05 UTC (permalink / raw)
  To: dev, Thomas Monjalon, Jerin Jacob, Gavin Hu, Jan Viktorin,
	Bruce Richardson, Konstantin Ananyev
  Cc: Phil Yang, Honnappa Nagarahalli

From: Phil Yang <phil.yang@arm.com>

Add 128-bit atomic compare exchange on aarch64.

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---

v8:
Fixed "WARNING:LONG_LINE: line over 80 characters" warnings with latest kernel
checkpatch.pl

v7:
1. Adjust code comment.

v6:
1. Put the RTE_ARM_FEATURE_ATOMICS flag into EAL group. (Jerin Jacob)
2. Keep rte_stack_lf_stubs.h doing nothing. (Gage Eads)
3. Fixed 32 bit build issue.

v5:
1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 by default. (Jerin Jacob)
2. Record the reason for introducing "rte_stack_lf_stubs.h" in the git
commit.
(Jerin Jacob)
3. Fixed a conditional MACRO error in rte_atomic128_cmp_exchange. (Jerin
Jacob)

v4:
1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions.
(Jerin Jacob)
2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin
Jacob)
3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage
Eads/Jerin Jacob)

v3:
1. Avoid code duplication with a macro. (Jerin Jacob)
2. Make invalid memory order to strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix 32-bit x86 builds issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

v2:
Initial version.

 config/arm/meson.build                        |   2 +
 config/common_base                            |   3 +
 config/defconfig_arm64-octeontx2-linuxapp-gcc |   1 +
 config/defconfig_arm64-thunderx2-linuxapp-gcc |   1 +
 .../common/include/arch/arm/rte_atomic_64.h   | 163 ++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h   |  12 --
 .../common/include/generic/rte_atomic.h       |  17 +-
 7 files changed, 186 insertions(+), 13 deletions(-)

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e16..9f2827140 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -71,11 +71,13 @@ flags_thunderx2_extra = [
 	['RTE_CACHE_LINE_SIZE', 64],
 	['RTE_MAX_NUMA_NODES', 2],
 	['RTE_MAX_LCORE', 256],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_USE_C11_MEM_MODEL', true]]
 flags_octeontx2_extra = [
 	['RTE_MACHINE', '"octeontx2"'],
 	['RTE_MAX_NUMA_NODES', 1],
 	['RTE_MAX_LCORE', 24],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_EAL_IGB_UIO', false],
 	['RTE_USE_C11_MEM_MODEL', true]]
 
diff --git a/config/common_base b/config/common_base
index 8ef75c203..205448013 100644
--- a/config/common_base
+++ b/config/common_base
@@ -82,6 +82,9 @@ CONFIG_RTE_MAX_LCORE=128
 CONFIG_RTE_MAX_NUMA_NODES=8
 CONFIG_RTE_MAX_HEAPS=32
 CONFIG_RTE_MAX_MEMSEG_LISTS=64
+
+# Use ARM LSE ATOMIC instructions
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
 # each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
 # or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
 CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
index f20da2442..7687dbec8 100644
--- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
+++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
@@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
 CONFIG_RTE_CACHE_LINE_SIZE=128
 CONFIG_RTE_MAX_NUMA_NODES=1
 CONFIG_RTE_MAX_LCORE=24
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 
 # Doesn't support NUMA
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64ba0..af4a89c48 100644
--- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
+++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
@@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e444..14d869bc9 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,165 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+					  (mo) == __ATOMIC_SEQ_CST)
+
+#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                       \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                 \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
+	asm volatile(                                                       \
+		op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
+		: [old0] "+r" (x0),                                         \
+		[old1] "+r" (x1)                                            \
+		: [upd0] "r" (x2),                                          \
+		[upd1] "r" (x3),                                            \
+		[dst] "r" (dst)                                             \
+		: "memory");                                                \
+	old.val[0] = x0;                                                    \
+	old.val[1] = x1;                                                    \
+	return old;                                                         \
+}
+
+__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
+__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
+__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
+__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
+#else
+#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+ldx_op_name(const rte_int128_t *src)                                        \
+{                                                                           \
+	rte_int128_t ret;                                                   \
+	asm volatile(                                                       \
+			op_string " %0, %1, %2"                             \
+			: "=&r" (ret.val[0]),                               \
+			  "=&r" (ret.val[1])                                \
+			: "Q" (src->val[0])                                 \
+			: "memory");                                        \
+	return ret;                                                         \
+}
+
+__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
+__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
+
+#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
+static inline uint32_t                                                      \
+stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
+{                                                                           \
+	uint32_t ret;                                                       \
+	asm volatile(                                                       \
+			op_string " %w0, %1, %2, %3"                        \
+			: "=&r" (ret)                                       \
+			: "r" (src.val[0]),                                 \
+			  "r" (src.val[1]),                                 \
+			  "Q" (dst->val[0])                                 \
+			: "memory");                                        \
+	/* Return 0 on success, 1 on failure */                             \
+	return ret;                                                         \
+}
+
+__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
+__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	if (success == __ATOMIC_RELAXED)
+		old = __rte_cas_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __rte_cas_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __rte_cas_release(dst, expected, desired);
+	else
+		old = __rte_cas_acq_rel(dst, expected, desired);
+#else
+	int ldx_mo = __MO_LOAD(success);
+	int stx_mo = __MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* A 128-bit load-exclusive alone is not guaranteed to be atomic.
+	 * Write back src or old to verify the atomicity of the load.
+	 */
+	do {
+		if (ldx_mo == __ATOMIC_RELAXED)
+			old = __rte_ldx_relaxed(dst);
+		else
+			old = __rte_ldx_acquire(dst);
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, desired);
+			else
+				ret = __rte_stx_release(dst, desired);
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain
+			 * the atomically read value of dst. This means, 'old'
+			 * needs to be stored back to ensure it was read
+			 * atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, old);
+			else
+				ret = __rte_stx_release(dst, old);
+		}
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally updating expected removes
+	 * an 'if' statement.
+	 * expected should already be in register if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index e087c6c32..12171296b 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -212,18 +212,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dcae..e6ab15a97 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+#ifdef RTE_ARCH_64
+		__extension__ __int128 int128;
+#endif
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.22.0


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v8 2/3] test/atomic: add 128b compare and swap test
  2019-07-23  7:05   ` [dpdk-dev] [PATCH v8 1/3] eal/arm64: add 128-bit atomic compare exchange jerinj
@ 2019-07-23  7:05     ` jerinj
  2019-07-23  7:05     ` [dpdk-dev] [PATCH v8 3/3] eal/stack: enable lock-free stack for aarch64 jerinj
  2019-08-14  8:27     ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2 siblings, 0 replies; 91+ messages in thread
From: jerinj @ 2019-07-23  7:05 UTC (permalink / raw)
  To: dev; +Cc: thomas, Phil Yang, Honnappa Nagarahalli, Gage Eads, Jerin Jacob

From: Phil Yang <phil.yang@arm.com>

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Gage Eads <gage.eads@intel.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>
---
 app/test/test_atomic.c | 125 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30ec0..0dad92387 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores are waiting a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -216,6 +233,78 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats 128 bits compare and swap operations N rounds. In each
+ * iteration it runs compare and swap operation with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+				&expected, &desired, 1,
+				__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +429,38 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of the rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Once each 128-bit atomic
+	 * compare and swap operation is successful, it updates the global
+	 * 128-bit counter by 2 for the first 64-bit and 1 for the second
+	 * 64-bit. Each slave core iterates this test N times.
+	 * At the end of the test, verify that the first 64 bits of the 128-bit
+	 * counter and the second 64 bits differ by the total iterations. If
+	 * they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL,
+				 SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.22.0


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v8 3/3] eal/stack: enable lock-free stack for aarch64
  2019-07-23  7:05   ` [dpdk-dev] [PATCH v8 1/3] eal/arm64: add 128-bit atomic compare exchange jerinj
  2019-07-23  7:05     ` [dpdk-dev] [PATCH v8 2/3] test/atomic: add 128b compare and swap test jerinj
@ 2019-07-23  7:05     ` jerinj
  2019-08-14  8:27     ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2 siblings, 0 replies; 91+ messages in thread
From: jerinj @ 2019-07-23  7:05 UTC (permalink / raw)
  To: dev, Anatoly Burakov, John McNamara, Marko Kovacevic, Gage Eads,
	Olivier Matz
  Cc: thomas, Phil Yang, Jerin Jacob, Honnappa Nagarahalli

From: Phil Yang <phil.yang@arm.com>

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Introduced a new header to reduce the ifdef clutter across generic and c11
files. The rte_stack_lf_stubs.h contains stub implementations of
__rte_stack_lf_count, __rte_stack_lf_push_elems and
__rte_stack_lf_pop_elems.

Suggested-by: Gage Eads <gage.eads@intel.com>
Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 .../prog_guide/env_abstraction_layer.rst      |  4 +-
 doc/guides/rel_notes/release_19_08.rst        |  3 ++
 lib/librte_stack/Makefile                     |  3 +-
 lib/librte_stack/rte_stack_lf.h               |  4 ++
 lib/librte_stack/rte_stack_lf_c11.h           | 16 -------
 lib/librte_stack/rte_stack_lf_generic.h       | 16 -------
 lib/librte_stack/rte_stack_lf_stubs.h         | 44 +++++++++++++++++++
 7 files changed, 55 insertions(+), 35 deletions(-)
 create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index 1487ea550..23353678d 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -641,8 +641,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 50c83ce51..55c792af1 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -212,6 +212,9 @@ New Features
 
   Added multiple cores feature to compression perf tool application.
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
index 8d18ce520..c337ab767 100644
--- a/lib/librte_stack/Makefile
+++ b/lib/librte_stack/Makefile
@@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include := rte_stack.h \
 					      rte_stack_std.h \
 					      rte_stack_lf.h \
 					      rte_stack_lf_generic.h \
-					      rte_stack_lf_c11.h
+					      rte_stack_lf_c11.h \
+						  rte_stack_lf_stubs.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0c2..e67630c27 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,11 +5,15 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_
 
+#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
+#include "rte_stack_lf_stubs.h"
+#else
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"
 #endif
+#endif
 
 /**
  * @internal Push several objects on the lock-free stack (MT-safe).
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677aed1..999359f08 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	 * to the LIFO len update.
 	 */
 	__atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	uint64_t len;
 	int success;
@@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_C11_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 31821514f..3abbb5342 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	rte_atomic64_add((rte_atomic64_t *)&list->len, num);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_GENERIC_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
new file mode 100644
index 000000000..a05abf1f1
--- /dev/null
+++ b/lib/librte_stack/rte_stack_lf_stubs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_STACK_LF_STUBS_H_
+#define _RTE_STACK_LF_STUBS_H_
+
+#include <rte_common.h>
+
+static __rte_always_inline unsigned int
+__rte_stack_lf_count(struct rte_stack *s)
+{
+	RTE_SET_USED(s);
+
+	return 0;
+}
+
+static __rte_always_inline void
+__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
+			  struct rte_stack_lf_elem *first,
+			  struct rte_stack_lf_elem *last,
+			  unsigned int num)
+{
+	RTE_SET_USED(first);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+}
+
+static __rte_always_inline struct rte_stack_lf_elem *
+__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
+			 unsigned int num,
+			 void **obj_table,
+			 struct rte_stack_lf_elem **last)
+{
+	RTE_SET_USED(obj_table);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+
+	return NULL;
+}
+
+#endif /* _RTE_STACK_LF_STUBS_H_ */
-- 
2.22.0


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-23  3:28     ` Phil Yang (Arm Technology China)
@ 2019-07-23  7:09       ` Jerin Jacob Kollanukkaran
  2019-07-23  7:53         ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-23  7:09 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China), dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> >
> > [master]dell[dpdk.org] $ ./devtools/checkpatches.sh
> 
> Yes. I did this check before upstream this patch.
> I ignored ' WARNING:MACRO_WITH_FLOW_CONTROL: Macros with flow
> control statements should be avoided ' warnings.
> However, I didn't see any LONG_LINE warnings in the CI report here.

Yes. But Not sure why?

I sent v8 with "WARNING:LONG_LINE: line over 80 characters" warning fixes with latest kernel
checkpatch.pl. Please review.

> http://mails.dpdk.org/archives/test-report/2019-July/090889.html
> BTW, I think these 3 warnings in the report might be ineluctable.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-23  7:09       ` Jerin Jacob Kollanukkaran
@ 2019-07-23  7:53         ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-07-23  7:53 UTC (permalink / raw)
  To: jerinj, dev
  Cc: thomas, gage.eads, hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Tuesday, July 23, 2019 3:10 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; gage.eads@intel.com;
> hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> > >
> > > [master]dell[dpdk.org] $ ./devtools/checkpatches.sh
> >
> > Yes. I did this check before upstream this patch.
> > I ignored ' WARNING:MACRO_WITH_FLOW_CONTROL: Macros with flow
> > control statements should be avoided ' warnings.
> > However, I didn't see any LONG_LINE warnings in the CI report here.
> 
> Yes. But Not sure why?

Not sure. 
It might be because DPDK CI changed the tabstop width or indent width, which makes it differ from the kernel checkpatch.pl.

> 
> I sent v8 with "WARNING:LONG_LINE: line over 80 characters" warning fixes
> with latest kernel
> checkpatch.pl. Please review.

+1
I really appreciate your help.

> 
> > http://mails.dpdk.org/archives/test-report/2019-July/090889.html
> > BTW, I think these 3 warnings in the report might be ineluctable.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-07-23  7:05   ` [dpdk-dev] [PATCH v8 1/3] eal/arm64: add 128-bit atomic compare exchange jerinj
  2019-07-23  7:05     ` [dpdk-dev] [PATCH v8 2/3] test/atomic: add 128b compare and swap test jerinj
  2019-07-23  7:05     ` [dpdk-dev] [PATCH v8 3/3] eal/stack: enable lock-free stack for aarch64 jerinj
@ 2019-08-14  8:27     ` Phil Yang
  2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and swap test Phil Yang
                         ` (3 more replies)
  2 siblings, 4 replies; 91+ messages in thread
From: Phil Yang @ 2019-08-14  8:27 UTC (permalink / raw)
  To: thomas, jerinj, gage.eads, dev
  Cc: hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Add 128-bit atomic compare exchange on aarch64.

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---

v9:
Updated 19.11 release note.

v8:
Fixed "WARNING:LONG_LINE: line over 80 characters" warnings with latest kernel
checkpatch.pl

v7:
1. Adjust code comment.

v6:
1. Put the RTE_ARM_FEATURE_ATOMICS flag into EAL group. (Jerin Jacob)
2. Keep rte_stack_lf_stubs.h doing nothing. (Gage Eads)
3. Fixed 32 bit build issue.

v5:
1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 by default. (Jerin Jacob)
2. Record the reason for introducing "rte_stack_lf_stubs.h" in the git
commit.
(Jerin Jacob)
3. Fixed a conditional MACRO error in rte_atomic128_cmp_exchange. (Jerin
Jacob)

v4:
1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions.
(Jerin Jacob)
2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin
Jacob)
3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage
Eads/Jerin Jacob)

v3:
1. Avoid code duplication with a macro. (Jerin Jacob)
2. Map an invalid memory order to the strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix 32-bit x86 builds issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

v2:
Initial version.

 config/arm/meson.build                             |   2 +
 config/common_base                                 |   3 +
 config/defconfig_arm64-octeontx2-linuxapp-gcc      |   1 +
 config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
 .../common/include/arch/arm/rte_atomic_64.h        | 163 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
 7 files changed, 186 insertions(+), 13 deletions(-)

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..9f28271 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -71,11 +71,13 @@ flags_thunderx2_extra = [
 	['RTE_CACHE_LINE_SIZE', 64],
 	['RTE_MAX_NUMA_NODES', 2],
 	['RTE_MAX_LCORE', 256],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_USE_C11_MEM_MODEL', true]]
 flags_octeontx2_extra = [
 	['RTE_MACHINE', '"octeontx2"'],
 	['RTE_MAX_NUMA_NODES', 1],
 	['RTE_MAX_LCORE', 24],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_EAL_IGB_UIO', false],
 	['RTE_USE_C11_MEM_MODEL', true]]
 
diff --git a/config/common_base b/config/common_base
index 8ef75c2..2054480 100644
--- a/config/common_base
+++ b/config/common_base
@@ -82,6 +82,9 @@ CONFIG_RTE_MAX_LCORE=128
 CONFIG_RTE_MAX_NUMA_NODES=8
 CONFIG_RTE_MAX_HEAPS=32
 CONFIG_RTE_MAX_MEMSEG_LISTS=64
+
+# Use ARM LSE ATOMIC instructions
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
 # each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
 # or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
 CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
index f20da24..7687dbe 100644
--- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
+++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
@@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
 CONFIG_RTE_CACHE_LINE_SIZE=128
 CONFIG_RTE_MAX_NUMA_NODES=1
 CONFIG_RTE_MAX_LCORE=24
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 
 # Doesn't support NUMA
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64b..af4a89c 100644
--- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
+++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
@@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..14d869b 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,165 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+					  (mo) == __ATOMIC_SEQ_CST)
+
+#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
+#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                       \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                 \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
+	asm volatile(                                                       \
+		op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
+		: [old0] "+r" (x0),                                         \
+		[old1] "+r" (x1)                                            \
+		: [upd0] "r" (x2),                                          \
+		[upd1] "r" (x3),                                            \
+		[dst] "r" (dst)                                             \
+		: "memory");                                                \
+	old.val[0] = x0;                                                    \
+	old.val[1] = x1;                                                    \
+	return old;                                                         \
+}
+
+__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
+__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
+__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
+__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
+#else
+#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+ldx_op_name(const rte_int128_t *src)                                        \
+{                                                                           \
+	rte_int128_t ret;                                                   \
+	asm volatile(                                                       \
+			op_string " %0, %1, %2"                             \
+			: "=&r" (ret.val[0]),                               \
+			  "=&r" (ret.val[1])                                \
+			: "Q" (src->val[0])                                 \
+			: "memory");                                        \
+	return ret;                                                         \
+}
+
+__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
+__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
+
+#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
+static inline uint32_t                                                      \
+stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
+{                                                                           \
+	uint32_t ret;                                                       \
+	asm volatile(                                                       \
+			op_string " %w0, %1, %2, %3"                        \
+			: "=&r" (ret)                                       \
+			: "r" (src.val[0]),                                 \
+			  "r" (src.val[1]),                                 \
+			  "Q" (dst->val[0])                                 \
+			: "memory");                                        \
+	/* Return 0 on success, 1 on failure */                             \
+	return ret;                                                         \
+}
+
+__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
+__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
+#endif
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	if (success == __ATOMIC_RELAXED)
+		old = __rte_cas_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __rte_cas_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __rte_cas_release(dst, expected, desired);
+	else
+		old = __rte_cas_acq_rel(dst, expected, desired);
+#else
+	int ldx_mo = __MO_LOAD(success);
+	int stx_mo = __MO_STORE(success);
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* The exclusive 128-bit load alone is not guaranteed to be atomic;
+	 * src or old must be written back to verify the load's atomicity.
+	 */
+	do {
+		if (ldx_mo == __ATOMIC_RELAXED)
+			old = __rte_ldx_relaxed(dst);
+		else
+			old = __rte_ldx_acquire(dst);
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, desired);
+			else
+				ret = __rte_stx_release(dst, desired);
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain
+			 * the atomically read value of dst. This means, 'old'
+			 * needs to be stored back to ensure it was read
+			 * atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __rte_stx_relaxed(dst, old);
+			else
+				ret = __rte_stx_release(dst, old);
+		}
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally updating expected removes
+	 * an 'if' statement.
+	 * expected should already be in register if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index 1335d92..cfe7067 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -183,18 +183,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dc..e6ab15a 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+#ifdef RTE_ARCH_64
+		__extension__ __int128 int128;
+#endif
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and swap test
  2019-08-14  8:27     ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
@ 2019-08-14  8:27       ` Phil Yang
  2019-10-14 15:45         ` David Marchand
  2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 91+ messages in thread
From: Phil Yang @ 2019-08-14  8:27 UTC (permalink / raw)
  To: thomas, jerinj, gage.eads, dev
  Cc: hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Add 128b atomic compare and swap test for aarch64 and x86_64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Gage Eads <gage.eads@intel.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>
---
 app/test/test_atomic.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..0dad923 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores wait for a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -216,6 +233,78 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats 128 bits compare and swap operations N rounds. In each
+ * iteration it runs compare and swap operation with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+				&expected, &desired, 1,
+				__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +429,38 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Once each 128-bit atomic
+	 * compare and swap operation is successful, it updates the global
+	 * 128-bit counter by 2 for the first 64-bit and 1 for the second
+	 * 64-bit. Each slave core iterates this test N times.
+	 * At the end of test, verify whether the first 64-bits of the 128-bit
+	 * counter and the second 64-bits differ by the total iterations. If
+	 * they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL,
+				 SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v9 3/3] eal/stack: enable lock-free stack for aarch64
  2019-08-14  8:27     ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-08-14  8:27       ` Phil Yang
  2019-10-14 15:45         ` David Marchand
  2019-10-14 15:43       ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange David Marchand
  2019-10-15 11:38       ` [dpdk-dev] [PATCH v10 " Phil Yang
  3 siblings, 1 reply; 91+ messages in thread
From: Phil Yang @ 2019-08-14  8:27 UTC (permalink / raw)
  To: thomas, jerinj, gage.eads, dev
  Cc: hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Introduced a new header to reduce the ifdef clutter across generic and c11
files. The rte_stack_lf_stubs.h contains stub implementations of
__rte_stack_lf_count, __rte_stack_lf_push_elems and
__rte_stack_lf_pop_elems.

Suggested-by: Gage Eads <gage.eads@intel.com>
Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 doc/guides/prog_guide/env_abstraction_layer.rst |  4 +--
 doc/guides/rel_notes/release_19_11.rst          |  3 ++
 lib/librte_stack/Makefile                       |  3 +-
 lib/librte_stack/rte_stack_lf.h                 |  4 +++
 lib/librte_stack/rte_stack_lf_c11.h             | 16 ---------
 lib/librte_stack/rte_stack_lf_generic.h         | 16 ---------
 lib/librte_stack/rte_stack_lf_stubs.h           | 44 +++++++++++++++++++++++++
 7 files changed, 55 insertions(+), 35 deletions(-)
 create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index 94f30fd..6e59fae 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -648,8 +648,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index 8490d89..60ffd70 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -56,6 +56,9 @@ New Features
      Also, make sure to start the actual text at the margin.
      =========================================================
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
index 8d18ce5..c337ab7 100644
--- a/lib/librte_stack/Makefile
+++ b/lib/librte_stack/Makefile
@@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include := rte_stack.h \
 					      rte_stack_std.h \
 					      rte_stack_lf.h \
 					      rte_stack_lf_generic.h \
-					      rte_stack_lf_c11.h
+					      rte_stack_lf_c11.h \
+						  rte_stack_lf_stubs.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0..e67630c 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,11 +5,15 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_
 
+#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
+#include "rte_stack_lf_stubs.h"
+#else
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"
 #endif
+#endif
 
 /**
  * @internal Push several objects on the lock-free stack (MT-safe).
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..999359f 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	 * to the LIFO len update.
 	 */
 	__atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	uint64_t len;
 	int success;
@@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_C11_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..3abbb53 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	rte_atomic64_add((rte_atomic64_t *)&list->len, num);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_GENERIC_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
new file mode 100644
index 0000000..a05abf1
--- /dev/null
+++ b/lib/librte_stack/rte_stack_lf_stubs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_STACK_LF_STUBS_H_
+#define _RTE_STACK_LF_STUBS_H_
+
+#include <rte_common.h>
+
+static __rte_always_inline unsigned int
+__rte_stack_lf_count(struct rte_stack *s)
+{
+	RTE_SET_USED(s);
+
+	return 0;
+}
+
+static __rte_always_inline void
+__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
+			  struct rte_stack_lf_elem *first,
+			  struct rte_stack_lf_elem *last,
+			  unsigned int num)
+{
+	RTE_SET_USED(first);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+}
+
+static __rte_always_inline struct rte_stack_lf_elem *
+__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
+			 unsigned int num,
+			 void **obj_table,
+			 struct rte_stack_lf_elem **last)
+{
+	RTE_SET_USED(obj_table);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+
+	return NULL;
+}
+
+#endif /* _RTE_STACK_LF_STUBS_H_ */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-08-14  8:27     ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-10-14 15:43       ` David Marchand
  2019-10-15 11:32         ` Phil Yang (Arm Technology China)
  2019-10-15 11:38       ` [dpdk-dev] [PATCH v10 " Phil Yang
  3 siblings, 1 reply; 91+ messages in thread
From: David Marchand @ 2019-10-14 15:43 UTC (permalink / raw)
  To: Phil Yang
  Cc: Thomas Monjalon, Jerin Jacob Kollanukkaran, Gage Eads, dev,
	Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu, nd

On Wed, Aug 14, 2019 at 10:29 AM Phil Yang <phil.yang@arm.com> wrote:
>
> Add 128-bit atomic compare exchange on aarch64.

A bit short, seeing the complexity of the code and the additional
RTE_ARM_FEATURE_ATOMICS config flag.


Comments inline.

>
> Suggested-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> ---
>
> v9:
> Updated 19.11 release note.
>
> v8:
> Fixed "WARNING:LONG_LINE: line over 80 characters" warnings with latest kernel
> checkpatch.pl
>
> v7:
> 1. Adjust code comment.
>
> v6:
> 1. Put the RTE_ARM_FEATURE_ATOMICS flag into EAL group. (Jerin Jocob)
> 2. Keep rte_stack_lf_stubs.h doing nothing. (Gage Eads)
> 3. Fixed 32 bit build issue.
>
> v5:
> 1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 in default. (Jerin Jocob)
> 2. Record the reason of introducing "rte_stack_lf_stubs.h" in git
> commit.
> (Jerin, Jocob)
> 3. Fixed a conditional MACRO error in rte_atomic128_cmp_exchange. (Jerin
> Jocob)
>
> v4:
> 1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions.
> (Jerin Jocob)
> 2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin
> Jocob)
> 3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage
> Eads/Jerin Jocob)
>
> v3:
> 1. Avoid duplication code with macro. (Jerin Jocob)
> 2. Make invalid memory order to strongest barrier. (Jerin Jocob)
> 3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
> 4. Fix 32-bit x86 builds issue. (Gage Eads)
> 5. Correct documentation issues in UT. (Gage Eads)
>
> v2:
> Initial version.
>
>  config/arm/meson.build                             |   2 +
>  config/common_base                                 |   3 +
>  config/defconfig_arm64-octeontx2-linuxapp-gcc      |   1 +
>  config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
>  .../common/include/arch/arm/rte_atomic_64.h        | 163 +++++++++++++++++++++
>  .../common/include/arch/x86/rte_atomic_64.h        |  12 --
>  lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
>  7 files changed, 186 insertions(+), 13 deletions(-)
>
> diff --git a/config/arm/meson.build b/config/arm/meson.build
> index 979018e..9f28271 100644
> --- a/config/arm/meson.build
> +++ b/config/arm/meson.build
> @@ -71,11 +71,13 @@ flags_thunderx2_extra = [
>         ['RTE_CACHE_LINE_SIZE', 64],
>         ['RTE_MAX_NUMA_NODES', 2],
>         ['RTE_MAX_LCORE', 256],
> +       ['RTE_ARM_FEATURE_ATOMICS', true],
>         ['RTE_USE_C11_MEM_MODEL', true]]
>  flags_octeontx2_extra = [
>         ['RTE_MACHINE', '"octeontx2"'],
>         ['RTE_MAX_NUMA_NODES', 1],
>         ['RTE_MAX_LCORE', 24],
> +       ['RTE_ARM_FEATURE_ATOMICS', true],
>         ['RTE_EAL_IGB_UIO', false],
>         ['RTE_USE_C11_MEM_MODEL', true]]
>
> diff --git a/config/common_base b/config/common_base
> index 8ef75c2..2054480 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -82,6 +82,9 @@ CONFIG_RTE_MAX_LCORE=128
>  CONFIG_RTE_MAX_NUMA_NODES=8
>  CONFIG_RTE_MAX_HEAPS=32
>  CONFIG_RTE_MAX_MEMSEG_LISTS=64
> +
> +# Use ARM LSE ATOMIC instructions
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=n
>  # each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
>  # or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
>  CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
> diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
> index f20da24..7687dbe 100644
> --- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
> +++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
> @@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
>  CONFIG_RTE_CACHE_LINE_SIZE=128
>  CONFIG_RTE_MAX_NUMA_NODES=1
>  CONFIG_RTE_MAX_LCORE=24
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=y
>
>  # Doesn't support NUMA
>  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> index cc5c64b..af4a89c 100644
> --- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> +++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> @@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
>  CONFIG_RTE_CACHE_LINE_SIZE=64
>  CONFIG_RTE_MAX_NUMA_NODES=2
>  CONFIG_RTE_MAX_LCORE=256
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=y
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> index 97060e4..14d869b 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2015 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
>
>  #ifndef _RTE_ATOMIC_ARM64_H_
> @@ -14,6 +15,9 @@ extern "C" {
>  #endif
>
>  #include "generic/rte_atomic.h"
> +#include <rte_branch_prediction.h>
> +#include <rte_compat.h>
> +#include <rte_debug.h>
>
>  #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
>  #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
> @@ -40,6 +44,165 @@ extern "C" {
>
>  #define rte_cio_rmb() dmb(oshld)
>
> +/*------------------------ 128 bit atomic operations -------------------------*/
> +
> +#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
> +#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
> +                                         (mo) == __ATOMIC_SEQ_CST)
> +
> +#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
> +#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED)

Those first 4 macros only make sense when LSE is not available (see below [1]).
Besides, they are used only once, why not directly use those
conditions where needed?


> +
> +#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
> +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> +static __rte_noinline rte_int128_t                                          \
> +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> +               rte_int128_t updated)                                       \
> +{                                                                           \
> +       /* caspX instructions register pair must start from even-numbered
> +        * register at operand 1.
> +        * So, specify registers for local variables here.
> +        */                                                                 \
> +       register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
> +       register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
> +       register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
> +       register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
> +       asm volatile(                                                       \
> +               op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
> +               : [old0] "+r" (x0),                                         \
> +               [old1] "+r" (x1)                                            \
> +               : [upd0] "r" (x2),                                          \
> +               [upd1] "r" (x3),                                            \
> +               [dst] "r" (dst)                                             \
> +               : "memory");                                                \
> +       old.val[0] = x0;                                                    \
> +       old.val[1] = x1;                                                    \
> +       return old;                                                         \
> +}
> +
> +__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
> +__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
> +__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
> +__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")

If LSE is available, we expose __rte_cas_XX (explicitly) *non*
inlined functions, while without LSE, we expose inlined __rte_ldx_XX
and __rte_stx_XX functions.
So we have a first disparity with non-inlined vs inlined functions
depending on a #ifdef.
Then, we have a second disparity with two sets of "apis" depending on
this #ifdef.

And we expose those sets with a rte_ prefix, meaning people will try
to use them, but those are not part of a public api.

Can't we do without them ? (see below [2] for a proposal with ldr/stx,
cas should be the same)


> +#else
> +#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
> +static inline rte_int128_t                                                  \
> +ldx_op_name(const rte_int128_t *src)                                        \
> +{                                                                           \
> +       rte_int128_t ret;                                                   \
> +       asm volatile(                                                       \
> +                       op_string " %0, %1, %2"                             \
> +                       : "=&r" (ret.val[0]),                               \
> +                         "=&r" (ret.val[1])                                \
> +                       : "Q" (src->val[0])                                 \
> +                       : "memory");                                        \
> +       return ret;                                                         \
> +}
> +
> +__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
> +__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
> +
> +#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
> +static inline uint32_t                                                      \
> +stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
> +{                                                                           \
> +       uint32_t ret;                                                       \
> +       asm volatile(                                                       \
> +                       op_string " %w0, %1, %2, %3"                        \
> +                       : "=&r" (ret)                                       \
> +                       : "r" (src.val[0]),                                 \
> +                         "r" (src.val[1]),                                 \
> +                         "Q" (dst->val[0])                                 \
> +                       : "memory");                                        \
> +       /* Return 0 on success, 1 on failure */                             \
> +       return ret;                                                         \
> +}
> +
> +__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
> +__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
> +#endif
> +
> +static inline int __rte_experimental

The __rte_experimental tag comes first.


> +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> +                               rte_int128_t *exp,
> +                               const rte_int128_t *src,
> +                               unsigned int weak,
> +                               int success,
> +                               int failure)
> +{
> +       /* Always do strong CAS */
> +       RTE_SET_USED(weak);
> +       /* Ignore memory ordering for failure, memory order for
> +        * success must be stronger or equal
> +        */
> +       RTE_SET_USED(failure);
> +       /* Find invalid memory order */
> +       RTE_ASSERT(success == __ATOMIC_RELAXED
> +                       || success == __ATOMIC_ACQUIRE
> +                       || success == __ATOMIC_RELEASE
> +                       || success == __ATOMIC_ACQ_REL
> +                       || success == __ATOMIC_SEQ_CST);
> +
> +#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
> +       rte_int128_t expected = *exp;
> +       rte_int128_t desired = *src;
> +       rte_int128_t old;
> +
> +       if (success == __ATOMIC_RELAXED)
> +               old = __rte_cas_relaxed(dst, expected, desired);
> +       else if (success == __ATOMIC_ACQUIRE)
> +               old = __rte_cas_acquire(dst, expected, desired);
> +       else if (success == __ATOMIC_RELEASE)
> +               old = __rte_cas_release(dst, expected, desired);
> +       else
> +               old = __rte_cas_acq_rel(dst, expected, desired);
> +#else

1: the first four macros (on the memory ordering constraints) can be
moved here then undef'd once unused.
Or you can just do without them.


> +       int ldx_mo = __MO_LOAD(success);
> +       int stx_mo = __MO_STORE(success);
> +       uint32_t ret = 1;
> +       register rte_int128_t expected = *exp;
> +       register rte_int128_t desired = *src;
> +       register rte_int128_t old;
> +
> +       /* ldx128 can not guarantee atomic,
> +        * Must write back src or old to verify atomicity of ldx128;
> +        */
> +       do {
> +               if (ldx_mo == __ATOMIC_RELAXED)
> +                       old = __rte_ldx_relaxed(dst);
> +               else
> +                       old = __rte_ldx_acquire(dst);

2: how about using a simple macro that gets passed the op string?

Something like (untested):

#define __READ_128(op_string, src, dst) \
    asm volatile(                      \
        op_string " %0, %1, %2"    \
        : "=&r" (dst.val[0]),      \
          "=&r" (dst.val[1])       \
        : "Q" (src->val[0])        \
        : "memory")

Then used like this:

        if (ldx_mo == __ATOMIC_RELAXED)
            __READ_128("ldxp", dst, old);
        else
            __READ_128("ldaxp", dst, old);

#undef __READ_128

> +
> +               if (likely(old.int128 == expected.int128)) {
> +                       if (stx_mo == __ATOMIC_RELAXED)
> +                               ret = __rte_stx_relaxed(dst, desired);
> +                       else
> +                               ret = __rte_stx_release(dst, desired);
> +               } else {
> +                       /* In the failure case (since 'weak' is ignored and only
> +                        * weak == 0 is implemented), expected should contain
> +                        * the atomically read value of dst. This means, 'old'
> +                        * needs to be stored back to ensure it was read
> +                        * atomically.
> +                        */
> +                       if (stx_mo == __ATOMIC_RELAXED)
> +                               ret = __rte_stx_relaxed(dst, old);
> +                       else
> +                               ret = __rte_stx_release(dst, old);

And:

#define __STORE_128(op_string, dst, val, ret) \
    asm volatile(                        \
        op_string " %w0, %1, %2, %3"     \
        : "=&r" (ret)                    \
        : "r" (val.val[0]),              \
          "r" (val.val[1]),              \
          "Q" (dst->val[0])              \
        : "memory")

Used like this:

        if (likely(old.int128 == expected.int128)) {
            if (stx_mo == __ATOMIC_RELAXED)
                __STORE_128("stxp", dst, desired, ret);
            else
                __STORE_128("stlxp", dst, desired, ret);
        } else {
            /* In the failure case (since 'weak' is ignored and only
             * weak == 0 is implemented), expected should contain
             * the atomically read value of dst. This means, 'old'
             * needs to be stored back to ensure it was read
             * atomically.
             */
            if (stx_mo == __ATOMIC_RELAXED)
                __STORE_128("stxp", dst, old, ret);
            else
                __STORE_128("stlxp", dst, old, ret);
        }

#undef __STORE_128


> +               }
> +       } while (unlikely(ret));
> +#endif
> +
> +       /* Unconditionally updating expected removes
> +        * an 'if' statement.
> +        * expected should already be in register if
> +        * not in the cache.
> +        */
> +       *exp = old;
> +
> +       return (old.int128 == expected.int128);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> index 1335d92..cfe7067 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> @@ -183,18 +183,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
>
>  /*------------------------ 128 bit atomic operations -------------------------*/
>
> -/**
> - * 128-bit integer structure.
> - */
> -RTE_STD_C11
> -typedef struct {
> -       RTE_STD_C11
> -       union {
> -               uint64_t val[2];
> -               __extension__ __int128 int128;
> -       };
> -} __rte_aligned(16) rte_int128_t;
> -
>  __rte_experimental
>  static inline int
>  rte_atomic128_cmp_exchange(rte_int128_t *dst,
> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
> index 24ff7dc..e6ab15a 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
>
>  /*------------------------ 128 bit atomic operations -------------------------*/
>
> +/**
> + * 128-bit integer structure.
> + */
> +RTE_STD_C11
> +typedef struct {
> +       RTE_STD_C11
> +       union {
> +               uint64_t val[2];
> +#ifdef RTE_ARCH_64
> +               __extension__ __int128 int128;
> +#endif

You hid this field for x86.
What is the reason?


> +       };
> +} __rte_aligned(16) rte_int128_t;
> +
>  #ifdef __DOXYGEN__
>
>  /**
> @@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
>   *     *exp = *dst
>   * @endcode
>   *
> - * @note This function is currently only available for the x86-64 platform.
> + * @note This function is currently available for the x86-64 and aarch64
> + * platforms.
>   *
>   * @note The success and failure arguments must be one of the __ATOMIC_* values
>   * defined in the C++11 standard. For details on their behavior, refer to the
> --
> 2.7.4
>



--
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and swap test
  2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-10-14 15:45         ` David Marchand
  2019-10-15 11:32           ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: David Marchand @ 2019-10-14 15:45 UTC (permalink / raw)
  To: Phil Yang
  Cc: Thomas Monjalon, Jerin Jacob Kollanukkaran, Gage Eads, dev,
	Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu, nd

On Wed, Aug 14, 2019 at 10:29 AM Phil Yang <phil.yang@arm.com> wrote:
>
> Add 128b atomic compare and swap test for aarch64 and x86_64.
>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Acked-by: Gage Eads <gage.eads@intel.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> Tested-by: Jerin Jacob <jerinj@marvell.com>
> ---
>  app/test/test_atomic.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 123 insertions(+), 2 deletions(-)
>
> diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
> index 43be30e..0dad923 100644
> --- a/app/test/test_atomic.c
> +++ b/app/test/test_atomic.c
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2019 Arm Limited
>   */
>
>  #include <stdio.h>
> @@ -20,7 +21,7 @@
>   * Atomic Variables
>   * ================
>   *
> - * - The main test function performs three subtests. The first test
> + * - The main test function performs four subtests. The first test
>   *   checks that the usual inc/dec/add/sub functions are working
>   *   correctly:
>   *
> @@ -61,11 +62,27 @@
>   *       atomic_sub(&count, tmp+1);
>   *
>   *   - At the end of the test, the *count* value must be 0.
> + *
> + * - Test "128b compare and swap" (aarch64 and x86_64 only)
> + *
> + *   - Initialize 128-bit atomic variables to zero.
> + *
> + *   - Invoke ``test_atomici128_cmp_exchange()`` on each lcore. Before doing

Typo, atomic128.


> + *     anything else, the cores are waiting a synchro. Each lcore does
> + *     these compare and swap (CAS) operations several times::
> + *
> + *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
> + *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
> + *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
> + *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
> + *
> + *   - At the end of the test, the *count128* first 64-bit value and
> + *     second 64-bit value differ by the total iterations.
>   */
>
>  #define NUM_ATOMIC_TYPES 3
>
> -#define N 10000
> +#define N 1000000

This changes the number of iterations for this test.
Did you evaluate the impact on the test duration?
I suppose this is fairly quick, but could you explain why this has
been extended?
The commitlog does not give hints.


>
>  static rte_atomic16_t a16;
>  static rte_atomic32_t a32;
> @@ -216,6 +233,78 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
>         return 0;
>  }
>
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> +static rte_int128_t count128;
> +
> +/*
> + * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
> + * bits by 2 and the second 64 bits by 1 in this test. It should return true
> + * if the compare exchange operation is successful.
> + * This test repeats 128 bits compare and swap operations 10K rounds. In each

s/10K/N/


> + * iteration it runs compare and swap operation with different memory models.
> + */
> +static int
> +test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
> +{
> +       rte_int128_t expected;
> +       int success;
> +       unsigned int i;
> +
> +       while (rte_atomic32_read(&synchro) == 0)
> +               ;
> +
> +       expected = count128;
> +
> +       for (i = 0; i < N; i++) {
> +               do {
> +                       rte_int128_t desired;
> +
> +                       desired.val[0] = expected.val[0] + 2;
> +                       desired.val[1] = expected.val[1] + 1;
> +
> +                       success = rte_atomic128_cmp_exchange(&count128,
> +                               &expected, &desired, 1,
> +                               __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
> +               } while (success == 0);
> +
> +               do {
> +                       rte_int128_t desired;
> +
> +                       desired.val[0] = expected.val[0] + 2;
> +                       desired.val[1] = expected.val[1] + 1;
> +
> +                       success = rte_atomic128_cmp_exchange(&count128,
> +                                       &expected, &desired, 1,
> +                                       __ATOMIC_RELEASE, __ATOMIC_RELAXED);
> +               } while (success == 0);
> +
> +               do {
> +                       rte_int128_t desired;
> +
> +                       desired.val[0] = expected.val[0] + 2;
> +                       desired.val[1] = expected.val[1] + 1;
> +
> +                       success = rte_atomic128_cmp_exchange(&count128,
> +                                       &expected, &desired, 1,
> +                                       __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
> +               } while (success == 0);
> +
> +               do {
> +                       rte_int128_t desired;
> +
> +                       desired.val[0] = expected.val[0] + 2;
> +                       desired.val[1] = expected.val[1] + 1;
> +
> +                       success = rte_atomic128_cmp_exchange(&count128,
> +                                       &expected, &desired, 1,
> +                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
> +               } while (success == 0);
> +       }
> +
> +       return 0;
> +}
> +#endif
> +
>  static int
>  test_atomic(void)
>  {
> @@ -340,6 +429,38 @@ test_atomic(void)
>                 return -1;
>         }
>
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> +       /*
> +        * This case tests the functionality of rte_atomic128b_cmp_exchange
> +        * API. It calls rte_atomic128b_cmp_exchange with four kinds of memory
> +        * models successively on each slave core. Once each 128-bit atomic
> +        * compare and swap operation is successful, it updates the global
> +        * 128-bit counter by 2 for the first 64-bit and 1 for the second
> +        * 64-bit. Each slave core iterates this test 10K times.

N times.


> +        * At the end of test, verify whether the first 64-bits of the 128-bit
> +        * counter and the second 64bits is differ by the total iterations. If
> +        * it is, the test passes.
> +        */
> +       printf("128b compare and swap test\n");
> +       uint64_t iterations = 0;
> +
> +       rte_atomic32_clear(&synchro);
> +       count128.val[0] = 0;
> +       count128.val[1] = 0;
> +
> +       rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL,
> +                                SKIP_MASTER);
> +       rte_atomic32_set(&synchro, 1);
> +       rte_eal_mp_wait_lcore();
> +       rte_atomic32_clear(&synchro);
> +
> +       iterations = count128.val[0] - count128.val[1];
> +       if (iterations != 4*N*(rte_lcore_count()-1)) {
> +               printf("128b compare and swap failed\n");
> +               return -1;
> +       }
> +#endif
> +
>         return 0;
>  }
>
> --
> 2.7.4
>


--
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 3/3] eal/stack: enable lock-free stack for aarch64
  2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-10-14 15:45         ` David Marchand
  2019-10-15 11:32           ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: David Marchand @ 2019-10-14 15:45 UTC (permalink / raw)
  To: Phil Yang
  Cc: Thomas Monjalon, Jerin Jacob Kollanukkaran, Gage Eads, dev,
	Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu, nd

On Wed, Aug 14, 2019 at 10:30 AM Phil Yang <phil.yang@arm.com> wrote:
>
> Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
>
> Introduced a new header to reduce the ifdef clutter across generic and c11
> files. The rte_stack_lf_stubs.h contains stub implementations of
> __rte_stack_lf_count, __rte_stack_lf_push_elems and
> __rte_stack_lf_pop_elems.
>
> Suggested-by: Gage Eads <gage.eads@intel.com>
> Suggested-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> ---
>  doc/guides/prog_guide/env_abstraction_layer.rst |  4 +--
>  doc/guides/rel_notes/release_19_11.rst          |  3 ++
>  lib/librte_stack/Makefile                       |  3 +-
>  lib/librte_stack/rte_stack_lf.h                 |  4 +++
>  lib/librte_stack/rte_stack_lf_c11.h             | 16 ---------
>  lib/librte_stack/rte_stack_lf_generic.h         | 16 ---------
>  lib/librte_stack/rte_stack_lf_stubs.h           | 44 +++++++++++++++++++++++++
>  7 files changed, 55 insertions(+), 35 deletions(-)
>  create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h
>
> diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
> index 94f30fd..6e59fae 100644
> --- a/doc/guides/prog_guide/env_abstraction_layer.rst
> +++ b/doc/guides/prog_guide/env_abstraction_layer.rst
> @@ -648,8 +648,8 @@ Known Issues
>    Alternatively, applications can use the lock-free stack mempool handler. When
>    considering this handler, note that:
>
> -  - It is currently limited to the x86_64 platform, because it uses an
> -    instruction (16-byte compare-and-swap) that is not yet available on other
> +  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
> +    an instruction (16-byte compare-and-swap) that is not yet available on other
>      platforms.
>    - It has worse average-case performance than the non-preemptive rte_ring, but
>      software caching (e.g. the mempool cache) can mitigate this by reducing the
> diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
> index 8490d89..60ffd70 100644
> --- a/doc/guides/rel_notes/release_19_11.rst
> +++ b/doc/guides/rel_notes/release_19_11.rst
> @@ -56,6 +56,9 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =========================================================
>
> +* **Added Lock-free Stack for aarch64.**
> +
> +  The lock-free stack implementation is enabled for aarch64 platforms.
>
>  Removed Items
>  -------------
> diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
> index 8d18ce5..c337ab7 100644
> --- a/lib/librte_stack/Makefile
> +++ b/lib/librte_stack/Makefile
> @@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include := rte_stack.h \
>                                               rte_stack_std.h \
>                                               rte_stack_lf.h \
>                                               rte_stack_lf_generic.h \
> -                                             rte_stack_lf_c11.h
> +                                             rte_stack_lf_c11.h \
> +                                                 rte_stack_lf_stubs.h

Please, use the same indentation type as the other lines.


>
>  include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
> index f5581f0..e67630c 100644
> --- a/lib/librte_stack/rte_stack_lf.h
> +++ b/lib/librte_stack/rte_stack_lf.h
> @@ -5,11 +5,15 @@
>  #ifndef _RTE_STACK_LF_H_
>  #define _RTE_STACK_LF_H_
>
> +#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
> +#include "rte_stack_lf_stubs.h"
> +#else
>  #ifdef RTE_USE_C11_MEM_MODEL
>  #include "rte_stack_lf_c11.h"
>  #else
>  #include "rte_stack_lf_generic.h"
>  #endif
> +#endif
>
>  /**
>   * @internal Push several objects on the lock-free stack (MT-safe).
> diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
> index 3d677ae..999359f 100644
> --- a/lib/librte_stack/rte_stack_lf_c11.h
> +++ b/lib/librte_stack/rte_stack_lf_c11.h
> @@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
>                           struct rte_stack_lf_elem *last,
>                           unsigned int num)
>  {
> -#ifndef RTE_ARCH_X86_64
> -       RTE_SET_USED(first);
> -       RTE_SET_USED(last);
> -       RTE_SET_USED(list);
> -       RTE_SET_USED(num);
> -#else
>         struct rte_stack_lf_head old_head;
>         int success;
>
> @@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
>          * to the LIFO len update.
>          */
>         __atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
> -#endif
>  }
>
>  static __rte_always_inline struct rte_stack_lf_elem *
> @@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>                          void **obj_table,
>                          struct rte_stack_lf_elem **last)
>  {
> -#ifndef RTE_ARCH_X86_64
> -       RTE_SET_USED(obj_table);
> -       RTE_SET_USED(last);
> -       RTE_SET_USED(list);
> -       RTE_SET_USED(num);
> -
> -       return NULL;
> -#else
>         struct rte_stack_lf_head old_head;
>         uint64_t len;
>         int success;
> @@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>         } while (success == 0);
>
>         return old_head.top;
> -#endif
>  }
>
>  #endif /* _RTE_STACK_LF_C11_H_ */
> diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
> index 3182151..3abbb53 100644
> --- a/lib/librte_stack/rte_stack_lf_generic.h
> +++ b/lib/librte_stack/rte_stack_lf_generic.h
> @@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
>                           struct rte_stack_lf_elem *last,
>                           unsigned int num)
>  {
> -#ifndef RTE_ARCH_X86_64
> -       RTE_SET_USED(first);
> -       RTE_SET_USED(last);
> -       RTE_SET_USED(list);
> -       RTE_SET_USED(num);
> -#else
>         struct rte_stack_lf_head old_head;
>         int success;
>
> @@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
>         } while (success == 0);
>
>         rte_atomic64_add((rte_atomic64_t *)&list->len, num);
> -#endif
>  }
>
>  static __rte_always_inline struct rte_stack_lf_elem *
> @@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>                          void **obj_table,
>                          struct rte_stack_lf_elem **last)
>  {
> -#ifndef RTE_ARCH_X86_64
> -       RTE_SET_USED(obj_table);
> -       RTE_SET_USED(last);
> -       RTE_SET_USED(list);
> -       RTE_SET_USED(num);
> -
> -       return NULL;
> -#else
>         struct rte_stack_lf_head old_head;
>         int success;
>
> @@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>         } while (success == 0);
>
>         return old_head.top;
> -#endif
>  }
>
>  #endif /* _RTE_STACK_LF_GENERIC_H_ */
> diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
> new file mode 100644
> index 0000000..a05abf1
> --- /dev/null
> +++ b/lib/librte_stack/rte_stack_lf_stubs.h
> @@ -0,0 +1,44 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2019 Arm Limited
> + */
> +
> +#ifndef _RTE_STACK_LF_STUBS_H_
> +#define _RTE_STACK_LF_STUBS_H_
> +
> +#include <rte_common.h>
> +
> +static __rte_always_inline unsigned int
> +__rte_stack_lf_count(struct rte_stack *s)
> +{
> +       RTE_SET_USED(s);
> +
> +       return 0;
> +}
> +
> +static __rte_always_inline void
> +__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
> +                         struct rte_stack_lf_elem *first,
> +                         struct rte_stack_lf_elem *last,
> +                         unsigned int num)
> +{
> +       RTE_SET_USED(first);
> +       RTE_SET_USED(last);
> +       RTE_SET_USED(list);
> +       RTE_SET_USED(num);
> +}
> +
> +static __rte_always_inline struct rte_stack_lf_elem *
> +__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
> +                        unsigned int num,
> +                        void **obj_table,
> +                        struct rte_stack_lf_elem **last)
> +{
> +       RTE_SET_USED(obj_table);
> +       RTE_SET_USED(last);
> +       RTE_SET_USED(list);
> +       RTE_SET_USED(num);
> +
> +       return NULL;
> +}
> +
> +#endif /* _RTE_STACK_LF_STUBS_H_ */
> --
> 2.7.4
>


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-14 15:43       ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange David Marchand
@ 2019-10-15 11:32         ` Phil Yang (Arm Technology China)
  2019-10-15 12:16           ` David Marchand
  0 siblings, 1 reply; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-10-15 11:32 UTC (permalink / raw)
  To: David Marchand
  Cc: thomas, jerinj, Gage Eads, dev, hemant.agrawal,
	Honnappa Nagarahalli, Gavin Hu (Arm Technology China),
	nd, nd

Hi David,

Thanks for your comments. I have addressed most of them in v10.  Please review it.
Some comments inline.
 
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Monday, October 14, 2019 11:44 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>
> Cc: thomas@monjalon.net; jerinj@marvell.com; Gage Eads
> <gage.eads@intel.com>; dev <dev@dpdk.org>; hemant.agrawal@nxp.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic
> compare exchange
> 
> On Wed, Aug 14, 2019 at 10:29 AM Phil Yang <phil.yang@arm.com> wrote:
> >
> > Add 128-bit atomic compare exchange on aarch64.
> 
> A bit short, seeing the complexity of the code and the additional
> RTE_ARM_FEATURE_ATOMICS config flag.
Updated in v10. 

<snip>

> >
> > +/*------------------------ 128 bit atomic operations -------------------------*/
> > +
> > +#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> __ATOMIC_RELEASE)
> > +#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) ==
> __ATOMIC_ACQ_REL || \
> > +                                         (mo) == __ATOMIC_SEQ_CST)
> > +
> > +#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE :
> __ATOMIC_RELAXED)
> > +#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE :
> __ATOMIC_RELAXED)
> 
> Those 4 first macros only make sense when LSE is not available (see below
> [1]).
> Besides, they are used only once, why not directly use those
> conditions where needed?

Agreed. In v10 I removed __MO_LOAD and __MO_STORE, and kept __HAS_ACQ and __HAS_RLS under the non-LSE conditional branch.
I think they make the code easier to read.

> 
> 
> > +
> > +#if defined(__ARM_FEATURE_ATOMICS) ||
> defined(RTE_ARM_FEATURE_ATOMICS)
> > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> > +static __rte_noinline rte_int128_t                                          \
> > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> > +               rte_int128_t updated)                                       \
> > +{                                                                           \
> > +       /* caspX instructions register pair must start from even-numbered
> > +        * register at operand 1.
> > +        * So, specify registers for local variables here.
> > +        */                                                                 \
> > +       register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
> > +       register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
> > +       register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
> > +       register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
> > +       asm volatile(                                                       \
> > +               op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
> > +               : [old0] "+r" (x0),                                         \
> > +               [old1] "+r" (x1)                                            \
> > +               : [upd0] "r" (x2),                                          \
> > +               [upd1] "r" (x3),                                            \
> > +               [dst] "r" (dst)                                             \
> > +               : "memory");                                                \
> > +       old.val[0] = x0;                                                    \
> > +       old.val[1] = x1;                                                    \
> > +       return old;                                                         \
> > +}
> > +
> > +__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
> > +__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
> > +__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
> > +__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
> 
> If LSE is available, we expose __rte_cas_XX (explicitely) *non*
> inlined functions, while without LSE, we expose inlined __rte_ldr_XX
> and __rte_stx_XX functions.
> So we have a first disparity with non-inlined vs inlined functions
> depending on a #ifdef.
> Then, we have a second disparity with two sets of "apis" depending on
> this #ifdef.
> 
> And we expose those sets with a rte_ prefix, meaning people will try
> to use them, but those are not part of a public api.
> 
> Can't we do without them ? (see below [2] for a proposal with ldr/stx,
> cas should be the same)

No, that doesn't work, because we need to check the return value at the end of the loop for these macros.

> 
> 
> > +#else
> > +#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
> > +static inline rte_int128_t                                                  \
> > +ldx_op_name(const rte_int128_t *src)                                        \
> > +{                                                                           \
> > +       rte_int128_t ret;                                                   \
> > +       asm volatile(                                                       \
> > +                       op_string " %0, %1, %2"                             \
> > +                       : "=&r" (ret.val[0]),                               \
> > +                         "=&r" (ret.val[1])                                \
> > +                       : "Q" (src->val[0])                                 \
> > +                       : "memory");                                        \
> > +       return ret;                                                         \
> > +}
> > +
> > +__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
> > +__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
> > +
> > +#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
> > +static inline uint32_t                                                      \
> > +stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
> > +{                                                                           \
> > +       uint32_t ret;                                                       \
> > +       asm volatile(                                                       \
> > +                       op_string " %w0, %1, %2, %3"                        \
> > +                       : "=&r" (ret)                                       \
> > +                       : "r" (src.val[0]),                                 \
> > +                         "r" (src.val[1]),                                 \
> > +                         "Q" (dst->val[0])                                 \
> > +                       : "memory");                                        \
> > +       /* Return 0 on success, 1 on failure */                             \
> > +       return ret;                                                         \
> > +}
> > +
> > +__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
> > +__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
> > +#endif
> > +
> > +static inline int __rte_experimental
> 
> The __rte_experimental tag comes first.

Updated in v10.

> 
> 
> > +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> > +                               rte_int128_t *exp,
> > +                               const rte_int128_t *src,
> > +                               unsigned int weak,
> > +                               int success,
> > +                               int failure)
> > +{
> > +       /* Always do strong CAS */
> > +       RTE_SET_USED(weak);
> > +       /* Ignore memory ordering for failure, memory order for
> > +        * success must be stronger or equal
> > +        */
> > +       RTE_SET_USED(failure);
> > +       /* Find invalid memory order */
> > +       RTE_ASSERT(success == __ATOMIC_RELAXED
> > +                       || success == __ATOMIC_ACQUIRE
> > +                       || success == __ATOMIC_RELEASE
> > +                       || success == __ATOMIC_ACQ_REL
> > +                       || success == __ATOMIC_SEQ_CST);
> > +
> > +#if defined(__ARM_FEATURE_ATOMICS) ||
> defined(RTE_ARM_FEATURE_ATOMICS)
> > +       rte_int128_t expected = *exp;
> > +       rte_int128_t desired = *src;
> > +       rte_int128_t old;
> > +
> > +       if (success == __ATOMIC_RELAXED)
> > +               old = __rte_cas_relaxed(dst, expected, desired);
> > +       else if (success == __ATOMIC_ACQUIRE)
> > +               old = __rte_cas_acquire(dst, expected, desired);
> > +       else if (success == __ATOMIC_RELEASE)
> > +               old = __rte_cas_release(dst, expected, desired);
> > +       else
> > +               old = __rte_cas_acq_rel(dst, expected, desired);
> > +#else
> 
> 1: the four first macros (on the memory ordering constraints) can be
> moved here then undef'd once unused.
> Or you can just do without them.

Updated in v10.

> 
> 
> > +       int ldx_mo = __MO_LOAD(success);
> > +       int stx_mo = __MO_STORE(success);
> > +       uint32_t ret = 1;
> > +       register rte_int128_t expected = *exp;
> > +       register rte_int128_t desired = *src;
> > +       register rte_int128_t old;
> > +
> > +       /* ldx128 can not guarantee atomic,
> > +        * Must write back src or old to verify atomicity of ldx128;
> > +        */
> > +       do {
> > +               if (ldx_mo == __ATOMIC_RELAXED)
> > +                       old = __rte_ldx_relaxed(dst);
> > +               else
> > +                       old = __rte_ldx_acquire(dst);
> 
> 2: how about using a simple macro that gets passed the op string?
> 
> Something like (untested):
> 
> #define __READ_128(op_string, src, dst) \
>     asm volatile(                      \
>         op_string " %0, %1, %2"    \
>         : "=&r" (dst.val[0]),      \
>           "=&r" (dst.val[1])       \
>         : "Q" (src->val[0])        \
>         : "memory")
> 
> Then used like this:
> 
>         if (ldx_mo == __ATOMIC_RELAXED)
>             __READ_128("ldxp", dst, old);
>         else
>             __READ_128("ldaxp", dst, old);
> 
> #undef __READ_128
> 
> > +
> > +               if (likely(old.int128 == expected.int128)) {
> > +                       if (stx_mo == __ATOMIC_RELAXED)
> > +                               ret = __rte_stx_relaxed(dst, desired);
> > +                       else
> > +                               ret = __rte_stx_release(dst, desired);
> > +               } else {
> > +                       /* In the failure case (since 'weak' is ignored and only
> > +                        * weak == 0 is implemented), expected should contain
> > +                        * the atomically read value of dst. This means, 'old'
> > +                        * needs to be stored back to ensure it was read
> > +                        * atomically.
> > +                        */
> > +                       if (stx_mo == __ATOMIC_RELAXED)
> > +                               ret = __rte_stx_relaxed(dst, old);
> > +                       else
> > +                               ret = __rte_stx_release(dst, old);
> 
> And:
> 
> #define __STORE_128(op_string, dst, val, ret) \
>     asm volatile(                        \
>         op_string " %w0, %1, %2, %3"     \
>         : "=&r" (ret)                    \
>         : "r" (val.val[0]),              \
>           "r" (val.val[1]),              \
>           "Q" (dst->val[0])              \
>         : "memory")
> 
> Used like this:
> 
>         if (likely(old.int128 == expected.int128)) {
>             if (stx_mo == __ATOMIC_RELAXED)
>                 __STORE_128("stxp", dst, desired, ret);
>             else
>                 __STORE_128("stlxp", dst, desired, ret);
>         } else {
>             /* In the failure case (since 'weak' is ignored and only
>              * weak == 0 is implemented), expected should contain
>              * the atomically read value of dst. This means, 'old'
>              * needs to be stored back to ensure it was read
>              * atomically.
>              */
>             if (stx_mo == __ATOMIC_RELAXED)
>                 __STORE_128("stxp", dst, old, ret);
>             else
>                 __STORE_128("stlxp", dst, old, ret);
>         }
> 
> #undef __STORE_128
> 
> 
> > +               }
> > +       } while (unlikely(ret));
> > +#endif
> > +
> > +       /* Unconditionally updating expected removes
> > +        * an 'if' statement.
> > +        * expected should already be in register if
> > +        * not in the cache.
> > +        */
> > +       *exp = old;
> > +
> > +       return (old.int128 == expected.int128);
> > +}
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > index 1335d92..cfe7067 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > @@ -183,18 +183,6 @@ static inline void
> rte_atomic64_clear(rte_atomic64_t *v)
> >
> >  /*------------------------ 128 bit atomic operations -------------------------*/
> >
> > -/**
> > - * 128-bit integer structure.
> > - */
> > -RTE_STD_C11
> > -typedef struct {
> > -       RTE_STD_C11
> > -       union {
> > -               uint64_t val[2];
> > -               __extension__ __int128 int128;
> > -       };
> > -} __rte_aligned(16) rte_int128_t;
> > -
> >  __rte_experimental
> >  static inline int
> >  rte_atomic128_cmp_exchange(rte_int128_t *dst,
> > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> > index 24ff7dc..e6ab15a 100644
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > @@ -1081,6 +1081,20 @@ static inline void
> rte_atomic64_clear(rte_atomic64_t *v)
> >
> >  /*------------------------ 128 bit atomic operations -------------------------*/
> >
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +RTE_STD_C11
> > +typedef struct {
> > +       RTE_STD_C11
> > +       union {
> > +               uint64_t val[2];
> > +#ifdef RTE_ARCH_64
> > +               __extension__ __int128 int128;
> > +#endif
> 
> You hid this field for x86.
> What is the reason?
No, we did not hide it for x86. The RTE_ARCH_64 flag covers x86 as well.

Thanks,
Phil

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and swap test
  2019-10-14 15:45         ` David Marchand
@ 2019-10-15 11:32           ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-10-15 11:32 UTC (permalink / raw)
  To: David Marchand
  Cc: thomas, jerinj, Gage Eads, dev, hemant.agrawal,
	Honnappa Nagarahalli, Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Monday, October 14, 2019 11:45 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>
> Cc: thomas@monjalon.net; jerinj@marvell.com; Gage Eads
> <gage.eads@intel.com>; dev <dev@dpdk.org>; hemant.agrawal@nxp.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and
> swap test
> 
> On Wed, Aug 14, 2019 at 10:29 AM Phil Yang <phil.yang@arm.com> wrote:
> >
> > Add 128b atomic compare and swap test for aarch64 and x86_64.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Acked-by: Gage Eads <gage.eads@intel.com>
> > Acked-by: Jerin Jacob <jerinj@marvell.com>
> > Tested-by: Jerin Jacob <jerinj@marvell.com>
> > ---
> >  app/test/test_atomic.c | 125
> ++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 123 insertions(+), 2 deletions(-)
> >
> > diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
> > index 43be30e..0dad923 100644
> > --- a/app/test/test_atomic.c
> > +++ b/app/test/test_atomic.c
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2010-2014 Intel Corporation
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #include <stdio.h>
> > @@ -20,7 +21,7 @@
> >   * Atomic Variables
> >   * ================
> >   *
> > - * - The main test function performs three subtests. The first test
> > + * - The main test function performs four subtests. The first test
> >   *   checks that the usual inc/dec/add/sub functions are working
> >   *   correctly:
> >   *
> > @@ -61,11 +62,27 @@
> >   *       atomic_sub(&count, tmp+1);
> >   *
> >   *   - At the end of the test, the *count* value must be 0.
> > + *
> > + * - Test "128b compare and swap" (aarch64 and x86_64 only)
> > + *
> > + *   - Initialize 128-bit atomic variables to zero.
> > + *
> > + *   - Invoke ``test_atomici128_cmp_exchange()`` on each lcore. Before
> doing
> 
> Typo, atomic128.

Updated in v10.

> 
> 
> > + *     anything else, the cores are waiting a synchro. Each lcore does
> > + *     these compare and swap (CAS) operations several times::
> > + *
> > + *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
> > + *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
> > + *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
> > + *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
> > + *
> > + *   - At the end of the test, the *count128* first 64-bit value and
> > + *     second 64-bit value differ by the total iterations.
> >   */
> >
> >  #define NUM_ATOMIC_TYPES 3
> >
> > -#define N 10000
> > +#define N 1000000
> 
> This change the number of iterations for this test.
> Did you evaluate the impact on the test duration?
> I suppose this is fairly quick, but could you explain why this has
> been extended?

Extending the test to 1 million iterations lets it check the stability of these atomic APIs,
especially the newly added 128-bit atomics.
Yes, I did the evaluation. It has no impact on the test duration as the test case is simple.

> The commitlog does not give hints.
Thanks for pointing this out. I have updated the v10 commit log accordingly.
 
> 
> 
> >
> >  static rte_atomic16_t a16;
> >  static rte_atomic32_t a32;
> > @@ -216,6 +233,78 @@
> test_atomic_dec_and_test(__attribute__((unused)) void *arg)
> >         return 0;
> >  }
> >
> > +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> > +static rte_int128_t count128;
> > +
> > +/*
> > + * rte_atomic128_cmp_exchange() should update a 128 bits counter's first
> 64
> > + * bits by 2 and the second 64 bits by 1 in this test. It should return true
> > + * if the compare exchange operation is successful.
> > + * This test repeats 128 bits compare and swap operations 10K rounds. In
> each
> 
> s/10K/N/

Updated in v10.

<snip>

> > +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> > +       /*
> > +        * This case tests the functionality of rte_atomic128b_cmp_exchange
> > +        * API. It calls rte_atomic128b_cmp_exchange with four kinds of
> memory
> > +        * models successively on each slave core. Once each 128-bit atomic
> > +        * compare and swap operation is successful, it updates the global
> > +        * 128-bit counter by 2 for the first 64-bit and 1 for the second
> > +        * 64-bit. Each slave core iterates this test 10K times.
> 
> N times.

Updated in v10.

Thanks,
Phil

> 
> 
> > +        * At the end of test, verify whether the first 64-bits of the 128-bit
> > +        * counter and the second 64bits is differ by the total iterations. If
> > +        * it is, the test passes.
> > +        */
> > +       printf("128b compare and swap test\n");
> > +       uint64_t iterations = 0;
> > +
> > +       rte_atomic32_clear(&synchro);
> > +       count128.val[0] = 0;
> > +       count128.val[1] = 0;
> > +
> > +       rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL,
> > +                                SKIP_MASTER);
> > +       rte_atomic32_set(&synchro, 1);
> > +       rte_eal_mp_wait_lcore();
> > +       rte_atomic32_clear(&synchro);
> > +
> > +       iterations = count128.val[0] - count128.val[1];
> > +       if (iterations != 4*N*(rte_lcore_count()-1)) {
> > +               printf("128b compare and swap failed\n");
> > +               return -1;
> > +       }
> > +#endif
> > +
> >         return 0;
> >  }
> >
> > --
> > 2.7.4
> >
> 
> 
> --
> David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 3/3] eal/stack: enable lock-free stack for aarch64
  2019-10-14 15:45         ` David Marchand
@ 2019-10-15 11:32           ` Phil Yang (Arm Technology China)
  0 siblings, 0 replies; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-10-15 11:32 UTC (permalink / raw)
  To: David Marchand
  Cc: thomas, jerinj, Gage Eads, dev, hemant.agrawal,
	Honnappa Nagarahalli, Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Monday, October 14, 2019 11:45 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>
> Cc: thomas@monjalon.net; jerinj@marvell.com; Gage Eads
> <gage.eads@intel.com>; dev <dev@dpdk.org>; hemant.agrawal@nxp.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v9 3/3] eal/stack: enable lock-free stack for
> aarch64
> 
> On Wed, Aug 14, 2019 at 10:30 AM Phil Yang <phil.yang@arm.com> wrote:
> >
> > Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
> >
> > Introduced a new header to reduce the ifdef clutter across generic and c11
> > files. The rte_stack_lf_stubs.h contains stub implementations of
> > __rte_stack_lf_count, __rte_stack_lf_push_elems and
> > __rte_stack_lf_pop_elems.
> >
> > Suggested-by: Gage Eads <gage.eads@intel.com>
> > Suggested-by: Jerin Jacob <jerinj@marvell.com>
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Acked-by: Jerin Jacob <jerinj@marvell.com>
> > ---

<snip>

> >  -------------
> > diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
> > index 8d18ce5..c337ab7 100644
> > --- a/lib/librte_stack/Makefile
> > +++ b/lib/librte_stack/Makefile
> > @@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include :=
> rte_stack.h \
> >                                               rte_stack_std.h \
> >                                               rte_stack_lf.h \
> >                                               rte_stack_lf_generic.h \
> > -                                             rte_stack_lf_c11.h
> > +                                             rte_stack_lf_c11.h \
> > +                                                 rte_stack_lf_stubs.h
> 
> Please, use the same indentation type than the other lines.

Updated in v10. 

Thanks,
Phil
 


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v10 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-08-14  8:27     ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
                         ` (2 preceding siblings ...)
  2019-10-14 15:43       ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange David Marchand
@ 2019-10-15 11:38       ` " Phil Yang
  2019-10-15 11:38         ` [dpdk-dev] [PATCH v10 2/3] test/atomic: add 128b compare and swap test Phil Yang
                           ` (2 more replies)
  3 siblings, 3 replies; 91+ messages in thread
From: Phil Yang @ 2019-10-15 11:38 UTC (permalink / raw)
  To: david.marchand, jerinj, gage.eads, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

This patch adds the implementation of the 128-bit atomic compare
exchange API on AArch64. The operation can be performed using the
64-bit 'ldxp/stxp' instructions. Moreover, on platforms with the LSE
atomic extension, it is implemented with 'casp' instructions for
better performance.

Since the '__ARM_FEATURE_ATOMICS' flag is only provided by GCC-9, this
patch adds a new config flag 'RTE_ARM_FEATURE_ATOMICS' to enable the
'cas' version on older compilers.

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
v10:
1. Removed the rte prefix from all internal functions.
2. Removed the __MO_LOAD and __MO_STORE macros and kept the __HAS_ACQ
and __HAS_RLS definitions under the non-LSE conditional branch.
3. Undefined the macros once they are no longer used.
4. Reworded the commit logs of patches 1/3 and 2/3 to be more specific.

v9:
Updated 19.11 release note.

v8:
Fixed "WARNING:LONG_LINE: line over 80 characters" warnings with latest kernel
checkpatch.pl

v7:
1. Adjust code comment.

v6:
1. Put the RTE_ARM_FEATURE_ATOMICS flag into the EAL group. (Jerin Jacob)
2. Keep rte_stack_lf_stubs.h doing nothing. (Gage Eads)
3. Fixed 32 bit build issue.

v5:
1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 by default. (Jerin Jacob)
2. Record the reason for introducing "rte_stack_lf_stubs.h" in the git
commit. (Jerin Jacob)
3. Fixed a conditional macro error in rte_atomic128_cmp_exchange. (Jerin
Jacob)

v4:
1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions.
(Jerin Jacob)
2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin
Jacob)
3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage
Eads/Jerin Jacob)

v3:
1. Avoid code duplication by using a macro. (Jerin Jacob)
2. Map an invalid memory order to the strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix 32-bit x86 builds issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

v2:
Initial version.

 config/arm/meson.build                             |   2 +
 config/common_base                                 |   3 +
 config/defconfig_arm64-octeontx2-linuxapp-gcc      |   1 +
 config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
 .../common/include/arch/arm/rte_atomic_64.h        | 173 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  17 +-
 7 files changed, 196 insertions(+), 13 deletions(-)

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..9f28271 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -71,11 +71,13 @@ flags_thunderx2_extra = [
 	['RTE_CACHE_LINE_SIZE', 64],
 	['RTE_MAX_NUMA_NODES', 2],
 	['RTE_MAX_LCORE', 256],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_USE_C11_MEM_MODEL', true]]
 flags_octeontx2_extra = [
 	['RTE_MACHINE', '"octeontx2"'],
 	['RTE_MAX_NUMA_NODES', 1],
 	['RTE_MAX_LCORE', 24],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_EAL_IGB_UIO', false],
 	['RTE_USE_C11_MEM_MODEL', true]]
 
diff --git a/config/common_base b/config/common_base
index e843a21..a96beb9 100644
--- a/config/common_base
+++ b/config/common_base
@@ -82,6 +82,9 @@ CONFIG_RTE_MAX_LCORE=128
 CONFIG_RTE_MAX_NUMA_NODES=8
 CONFIG_RTE_MAX_HEAPS=32
 CONFIG_RTE_MAX_MEMSEG_LISTS=64
+
+# Use ARM LSE ATOMIC instructions
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
 # each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
 # or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
 CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
index f20da24..7687dbe 100644
--- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
+++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
@@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
 CONFIG_RTE_CACHE_LINE_SIZE=128
 CONFIG_RTE_MAX_NUMA_NODES=1
 CONFIG_RTE_MAX_LCORE=24
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 
 # Doesn't support NUMA
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64b..af4a89c 100644
--- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
+++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
@@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..7854c07 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,175 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                       \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                 \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
+	asm volatile(                                                       \
+		op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
+		: [old0] "+r" (x0),                                         \
+		[old1] "+r" (x1)                                            \
+		: [upd0] "r" (x2),                                          \
+		[upd1] "r" (x3),                                            \
+		[dst] "r" (dst)                                             \
+		: "memory");                                                \
+	old.val[0] = x0;                                                    \
+	old.val[1] = x1;                                                    \
+	return old;                                                         \
+}
+
+__ATOMIC128_CAS_OP(__cas_relaxed, "casp")
+__ATOMIC128_CAS_OP(__cas_acquire, "caspa")
+__ATOMIC128_CAS_OP(__cas_release, "caspl")
+__ATOMIC128_CAS_OP(__cas_acq_rel, "caspal")
+
+#undef __ATOMIC128_CAS_OP
+
+#else
+#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
+static inline rte_int128_t                                                  \
+ldx_op_name(const rte_int128_t *src)                                        \
+{                                                                           \
+	rte_int128_t ret;                                                   \
+	asm volatile(                                                       \
+			op_string " %0, %1, %2"                             \
+			: "=&r" (ret.val[0]),                               \
+			  "=&r" (ret.val[1])                                \
+			: "Q" (src->val[0])                                 \
+			: "memory");                                        \
+	return ret;                                                         \
+}
+
+__ATOMIC128_LDX_OP(__ldx_relaxed, "ldxp")
+__ATOMIC128_LDX_OP(__ldx_acquire, "ldaxp")
+
+#undef __ATOMIC128_LDX_OP
+
+#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
+static inline uint32_t                                                      \
+stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
+{                                                                           \
+	uint32_t ret;                                                       \
+	asm volatile(                                                       \
+			op_string " %w0, %1, %2, %3"                        \
+			: "=&r" (ret)                                       \
+			: "r" (src.val[0]),                                 \
+			  "r" (src.val[1]),                                 \
+			  "Q" (dst->val[0])                                 \
+			: "memory");                                        \
+	/* Return 0 on success, 1 on failure */                             \
+	return ret;                                                         \
+}
+
+__ATOMIC128_STX_OP(__stx_relaxed, "stxp")
+__ATOMIC128_STX_OP(__stx_release, "stlxp")
+
+#undef __ATOMIC128_STX_OP
+
+#endif
+
+__rte_experimental
+static inline int
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+	if (success == __ATOMIC_RELAXED)
+		old = __cas_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __cas_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __cas_release(dst, expected, desired);
+	else
+		old = __cas_acq_rel(dst, expected, desired);
+#else
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+					  (mo) == __ATOMIC_SEQ_CST)
+
+	int ldx_mo = __HAS_ACQ(success) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED;
+	int stx_mo = __HAS_RLS(success) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED;
+
+#undef __HAS_ACQ
+#undef __HAS_RLS
+
+	uint32_t ret = 1;
+	register rte_int128_t expected = *exp;
+	register rte_int128_t desired = *src;
+	register rte_int128_t old;
+
+	/* The 128-bit exclusive load alone does not guarantee atomicity;
+	 * src or old must be written back to verify that the load was atomic.
+	 */
+	do {
+		if (ldx_mo == __ATOMIC_RELAXED)
+			old = __ldx_relaxed(dst);
+		else
+			old = __ldx_acquire(dst);
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __stx_relaxed(dst, desired);
+			else
+				ret = __stx_release(dst, desired);
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain
+			 * the atomically read value of dst. This means, 'old'
+			 * needs to be stored back to ensure it was read
+			 * atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				ret = __stx_relaxed(dst, old);
+			else
+				ret = __stx_release(dst, old);
+		}
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally writing old back to *exp removes
+	 * an 'if' statement.
+	 * The value should already be in a register, if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index 1335d92..cfe7067 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -183,18 +183,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dc..e6ab15a 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+#ifdef RTE_ARCH_64
+		__extension__ __int128 int128;
+#endif
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v10 2/3] test/atomic: add 128b compare and swap test
  2019-10-15 11:38       ` [dpdk-dev] [PATCH v10 " Phil Yang
@ 2019-10-15 11:38         ` Phil Yang
  2019-10-15 11:38         ` [dpdk-dev] [PATCH v10 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-10-18 11:21         ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-10-15 11:38 UTC (permalink / raw)
  To: david.marchand, jerinj, gage.eads, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Add 128b atomic compare and swap test for aarch64 and x86_64.
Extend the test iteration from 10 thousand to 1 million times to test
the stability of the atomic APIs.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Gage Eads <gage.eads@intel.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>
---
 app/test/test_atomic.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..14bd3a8 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores wait for a synchro signal. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -216,6 +233,78 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats 128 bits compare and swap operations N rounds. In each
+ * iteration it runs compare and swap operation with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+				&expected, &desired, 1,
+				__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +429,38 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Once each 128-bit atomic
+	 * compare and swap operation is successful, it updates the global
+	 * 128-bit counter by 2 for the first 64-bit and 1 for the second
+	 * 64-bit. Each slave core iterates this test N times.
+	 * At the end of test, verify whether the first 64 bits of the 128-bit
+	 * counter and the second 64 bits differ by the total iterations. If
+	 * they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL,
+				 SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v10 3/3] eal/stack: enable lock-free stack for aarch64
  2019-10-15 11:38       ` [dpdk-dev] [PATCH v10 " Phil Yang
  2019-10-15 11:38         ` [dpdk-dev] [PATCH v10 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-10-15 11:38         ` Phil Yang
  2019-10-18 11:21         ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2 siblings, 0 replies; 91+ messages in thread
From: Phil Yang @ 2019-10-15 11:38 UTC (permalink / raw)
  To: david.marchand, jerinj, gage.eads, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Enable both the C11-atomic and non-C11-atomic lock-free stack for aarch64.

Introduced a new header to reduce the ifdef clutter across generic and c11
files. The rte_stack_lf_stubs.h contains stub implementations of
__rte_stack_lf_count, __rte_stack_lf_push_elems and
__rte_stack_lf_pop_elems.

Suggested-by: Gage Eads <gage.eads@intel.com>
Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 doc/guides/prog_guide/env_abstraction_layer.rst |  4 +--
 doc/guides/rel_notes/release_19_11.rst          |  3 ++
 lib/librte_stack/Makefile                       |  3 +-
 lib/librte_stack/rte_stack_lf.h                 |  4 +++
 lib/librte_stack/rte_stack_lf_c11.h             | 16 ---------
 lib/librte_stack/rte_stack_lf_generic.h         | 16 ---------
 lib/librte_stack/rte_stack_lf_stubs.h           | 44 +++++++++++++++++++++++++
 7 files changed, 55 insertions(+), 35 deletions(-)
 create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index 94f30fd..6e59fae 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -648,8 +648,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index 85953b9..4f82f54 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -115,6 +115,9 @@ New Features
   Added eBPF JIT support for arm64 architecture to improve the eBPF program
   performance.
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
index 8d18ce5..b5e5bed 100644
--- a/lib/librte_stack/Makefile
+++ b/lib/librte_stack/Makefile
@@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include := rte_stack.h \
 					      rte_stack_std.h \
 					      rte_stack_lf.h \
 					      rte_stack_lf_generic.h \
-					      rte_stack_lf_c11.h
+					      rte_stack_lf_c11.h \
+					      rte_stack_lf_stubs.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0..e67630c 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,11 +5,15 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_
 
+#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
+#include "rte_stack_lf_stubs.h"
+#else
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"
 #endif
+#endif
 
 /**
  * @internal Push several objects on the lock-free stack (MT-safe).
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..999359f 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	 * to the LIFO len update.
 	 */
 	__atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	uint64_t len;
 	int success;
@@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_C11_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..3abbb53 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	rte_atomic64_add((rte_atomic64_t *)&list->len, num);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_GENERIC_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
new file mode 100644
index 0000000..a05abf1
--- /dev/null
+++ b/lib/librte_stack/rte_stack_lf_stubs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_STACK_LF_STUBS_H_
+#define _RTE_STACK_LF_STUBS_H_
+
+#include <rte_common.h>
+
+static __rte_always_inline unsigned int
+__rte_stack_lf_count(struct rte_stack *s)
+{
+	RTE_SET_USED(s);
+
+	return 0;
+}
+
+static __rte_always_inline void
+__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
+			  struct rte_stack_lf_elem *first,
+			  struct rte_stack_lf_elem *last,
+			  unsigned int num)
+{
+	RTE_SET_USED(first);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+}
+
+static __rte_always_inline struct rte_stack_lf_elem *
+__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
+			 unsigned int num,
+			 void **obj_table,
+			 struct rte_stack_lf_elem **last)
+{
+	RTE_SET_USED(obj_table);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+
+	return NULL;
+}
+
+#endif /* _RTE_STACK_LF_STUBS_H_ */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-15 11:32         ` Phil Yang (Arm Technology China)
@ 2019-10-15 12:16           ` David Marchand
  2019-10-16  9:04             ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: David Marchand @ 2019-10-15 12:16 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China)
  Cc: thomas, jerinj, Gage Eads, dev, hemant.agrawal,
	Honnappa Nagarahalli, Gavin Hu (Arm Technology China),
	nd

On Tue, Oct 15, 2019 at 1:32 PM Phil Yang (Arm Technology China)
<Phil.Yang@arm.com> wrote:
> > -----Original Message-----
> > From: David Marchand <david.marchand@redhat.com>
> > If LSE is available, we expose __rte_cas_XX (explicitely) *non*
> > inlined functions, while without LSE, we expose inlined __rte_ldr_XX
> > and __rte_stx_XX functions.
> > So we have a first disparity with non-inlined vs inlined functions
> > depending on a #ifdef.

You did not comment on the inline / no inline part and I still see
this in the v10.
Is this __rte_noinline on the CAS function intentional?


> > Then, we have a second disparity with two sets of "apis" depending on
> > this #ifdef.
> >
> > And we expose those sets with a rte_ prefix, meaning people will try
> > to use them, but those are not part of a public api.
> >
> > Can't we do without them ? (see below [2] for a proposal with ldr/stx,
> > cas should be the same)
>
> No, it doesn't work.
> Because we need to verify the return value at the end of the loop for these macros.

Do you mean the return value for the stores?

> > #define __STORE_128(op_string, dst, val, ret) \
> >     asm volatile(                        \
> >         op_string " %w0, %1, %2, %3"     \
> >         : "=&r" (ret)                    \
> >         : "r" (val.val[0]),              \
> >           "r" (val.val[1]),              \
> >           "Q" (dst->val[0])              \
> >         : "memory")

The ret variable is still passed in this macro and the while loop can
check it later.


> > > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > index 24ff7dc..e6ab15a 100644
> > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > @@ -1081,6 +1081,20 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t *v)
> > >
> > >  /*------------------------ 128 bit atomic operations -------------------------*/
> > >
> > > +/**
> > > + * 128-bit integer structure.
> > > + */
> > > +RTE_STD_C11
> > > +typedef struct {
> > > +       RTE_STD_C11
> > > +       union {
> > > +               uint64_t val[2];
> > > +#ifdef RTE_ARCH_64
> > > +               __extension__ __int128 int128;
> > > +#endif
> >
> > You hid this field for x86.
> > What is the reason?
> No, we are not hid it for x86. The RTE_ARCH_64 flag covered x86 as well.

Ah indeed, I read it wrong, ARCH_64 ... AARCH64 ... :-)



--
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-15 12:16           ` David Marchand
@ 2019-10-16  9:04             ` Phil Yang (Arm Technology China)
  2019-10-17 12:45               ` David Marchand
  0 siblings, 1 reply; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-10-16  9:04 UTC (permalink / raw)
  To: David Marchand
  Cc: thomas, jerinj, Gage Eads, dev, hemant.agrawal,
	Honnappa Nagarahalli, Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Tuesday, October 15, 2019 8:16 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>
> Cc: thomas@monjalon.net; jerinj@marvell.com; Gage Eads
> <gage.eads@intel.com>; dev <dev@dpdk.org>; hemant.agrawal@nxp.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic
> compare exchange
> 
> On Tue, Oct 15, 2019 at 1:32 PM Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com> wrote:
> > > -----Original Message-----
> > > From: David Marchand <david.marchand@redhat.com>
> > > If LSE is available, we expose __rte_cas_XX (explicitely) *non*
> > > inlined functions, while without LSE, we expose inlined __rte_ldr_XX
> > > and __rte_stx_XX functions.
> > > So we have a first disparity with non-inlined vs inlined functions
> > > depending on a #ifdef.
> 
> You did not comment on the inline / no inline part and I still see
> this in the v10.
> Is this __rte_noinline on the CAS function intentional?

Apologies for missing this item. Yes, it is intentional, to avoid an ABI break.
Please check
5b40ec6b966260e0ff66a8a2c689664f75d6a0e6 ("mempool/octeontx2: fix possible arm64 ABI break")

> 
> 
> > > Then, we have a second disparity with two sets of "apis" depending on
> > > this #ifdef.
> > >
> > > And we expose those sets with a rte_ prefix, meaning people will try
> > > to use them, but those are not part of a public api.
> > >
> > > Can't we do without them ? (see below [2] for a proposal with ldr/stx,
> > > cas should be the same)
> >
> > No, it doesn't work.
> > Because we need to verify the return value at the end of the loop for these
> macros.
> 
> Do you mean the return value for the stores?

My mistake: I missed the ret argument in the macro. This approach works.

However, I suggest keeping them as static inline functions rather than as macros inside the rte_atomic128_cmp_exchange API.
One reason is that the function names can indicate the memory ordering of these operations.
Moreover, the inline functions pass the values in registers, so they should not cost much more than the macros.
I also think these 128-bit load and store functions could be used in other places, once they have proved valuable in the rte_atomic128_cmp_exchange API. But let's keep them private at the current stage.
BTW, the Linux kernel implements this in the same way. https://github.com/torvalds/linux/blob/master/arch/arm64/include/asm/atomic_lse.h#L19 
 
> > > #define __STORE_128(op_string, dst, val, ret) \
> > >     asm volatile(                        \
> > >         op_string " %w0, %1, %2, %3"     \
> > >         : "=&r" (ret)                    \
> > >         : "r" (val.val[0]),              \
> > >           "r" (val.val[1]),              \
> > >           "Q" (dst->val[0])              \
> > >         : "memory")
> 
> The ret variable is still passed in this macro and the while loop can
> check it later.
> 
> 
> > > > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > index 24ff7dc..e6ab15a 100644
> > > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > @@ -1081,6 +1081,20 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t *v)
> > > >
> > > >  /*------------------------ 128 bit atomic operations -------------------------*/
> > > >
> > > > +/**
> > > > + * 128-bit integer structure.
> > > > + */
> > > > +RTE_STD_C11
> > > > +typedef struct {
> > > > +       RTE_STD_C11
> > > > +       union {
> > > > +               uint64_t val[2];
> > > > +#ifdef RTE_ARCH_64
> > > > +               __extension__ __int128 int128;
> > > > +#endif
> > >
> > > You hid this field for x86.
> > > What is the reason?
> > No, we are not hid it for x86. The RTE_ARCH_64 flag covered x86 as well.
> 
> Ah indeed, I read it wrong, ARCH_64 ... AARCH64 ... :-)
> 
> 
> 
> --
> David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-16  9:04             ` Phil Yang (Arm Technology China)
@ 2019-10-17 12:45               ` David Marchand
  0 siblings, 0 replies; 91+ messages in thread
From: David Marchand @ 2019-10-17 12:45 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China)
  Cc: thomas, jerinj, Gage Eads, dev, hemant.agrawal,
	Honnappa Nagarahalli, Gavin Hu (Arm Technology China),
	nd

On Wed, Oct 16, 2019 at 11:04 AM Phil Yang (Arm Technology China)
<Phil.Yang@arm.com> wrote:
>
> > -----Original Message-----
> > From: David Marchand <david.marchand@redhat.com>
> > Sent: Tuesday, October 15, 2019 8:16 PM
> > To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>
> > Cc: thomas@monjalon.net; jerinj@marvell.com; Gage Eads
> > <gage.eads@intel.com>; dev <dev@dpdk.org>; hemant.agrawal@nxp.com;
> > Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm
> > Technology China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic
> > compare exchange
> >
> > On Tue, Oct 15, 2019 at 1:32 PM Phil Yang (Arm Technology China)
> > <Phil.Yang@arm.com> wrote:
> > > > -----Original Message-----
> > > > From: David Marchand <david.marchand@redhat.com>
> > > > If LSE is available, we expose __rte_cas_XX (explicitly) *non*
> > > > inlined functions, while without LSE, we expose inlined __rte_ldr_XX
> > > > and __rte_stx_XX functions.
> > > > So we have a first disparity with non-inlined vs inlined functions
> > > > depending on a #ifdef.
> >
> > You did not comment on the inline / no inline part and I still see
> > this in the v10.
> > Is this __rte_noinline on the CAS function intentional?
>
> Apologies for missing this item. Yes, it is to avoid an ABI break.
> Please check
> 5b40ec6b966260e0ff66a8a2c689664f75d6a0e6 ("mempool/octeontx2: fix possible arm64 ABI break")

Looked at the kernel parts on LSE CAS (thanks for the pointer) but I
see inlines are used:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/include/asm/atomic_lse.h#n365?h=v5.4-rc3

What is special in the kernel or in dpdk that makes this different?


>
> >
> >
> > > > Then, we have a second disparity with two sets of "apis" depending on
> > > > this #ifdef.
> > > >
> > > > And we expose those sets with a rte_ prefix, meaning people will try
> > > > to use them, but those are not part of a public api.
> > > >
> > > > Can't we do without them ? (see below [2] for a proposal with ldr/stx,
> > > > cas should be the same)
> > >
> > > No, it doesn't work.
> > > Because we need to verify the return value at the end of the loop for these
> > macros.
> >
> > Do you mean the return value for the stores?
>
> It is my bad. I missed the ret option in the macro. This approach works.

Ok, thanks for confirming.


>
> However, I suggest keeping them as static inline functions rather than as macros inside the rte_atomic128_cmp_exchange API.
> One reason is that the API names can indicate the memory ordering of these operations.

API?
Those inlines are not part of a public API and we agree this patch is
not about adding 128 bits load/store apis.

My proposal gives us small code that looks like:
        if (ldx_mo == __ATOMIC_RELAXED)
            __READ_128("ldxp", dst, old);
        else
            __READ_128("ldaxp", dst, old);

I am not a memory order guru, but with this, I can figure out that the
asm instruction depends on it.
And, since we are looking at internals of an implementation, this is
mainly for people looking at/maintaining these low level details.


> Moreover, it uses the register type to pass the value in the inline function, so it should not have too much cost compared with the macro.

This is not a problem of cost, this is about hiding architecture
details from the final user.
If you expose something, you can expect someone will start using it
and will complain later if you break it.


> I also think these 128bit load and store functions can be used in other places, once it has been proved valuable in rte_atomic128_cmp_exchange API. But let's keep them private for the current stage.

Yes I agree this could be introduced in the future.


> BTW, Linux kernel implemented in the same way. https://github.com/torvalds/linux/blob/master/arch/arm64/include/asm/atomic_lse.h#L19

Ok, the kernel exposes its internals, but I think kernel developers are
more vigilant than dpdk developers on what is part of the public API
and what is internal.


--
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-15 11:38       ` [dpdk-dev] [PATCH v10 " Phil Yang
  2019-10-15 11:38         ` [dpdk-dev] [PATCH v10 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-10-15 11:38         ` [dpdk-dev] [PATCH v10 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-10-18 11:21         ` Phil Yang
  2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 2/3] test/atomic: add 128b compare and swap test Phil Yang
                             ` (2 more replies)
  2 siblings, 3 replies; 91+ messages in thread
From: Phil Yang @ 2019-10-18 11:21 UTC (permalink / raw)
  To: david.marchand, jerinj, gage.eads, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

This patch adds the implementation of the 128-bit atomic compare
exchange API on AArch64. The operation can be performed using the
64-bit 'ldxp/stxp' instructions. Moreover, on platforms with the LSE
atomic extension, it is implemented with 'casp' instructions for
better performance.

Since the '__ARM_FEATURE_ATOMICS' flag is only supported by GCC-9, this
patch adds a new config flag 'RTE_ARM_FEATURE_ATOMICS' to enable the
'cas' version on older compilers.

Since the x0 register is used directly in the code, and cas_op_name()
and rte_atomic128_cmp_exchange() are inline functions, depending on the
parent function's register load, the x0 register may get corrupted,
i.e. the arm64 ABI may be broken.
Define CAS operations as rte_noinline functions to avoid the ABI
break[1].

[1]5b40ec6b9662 ("mempool/octeontx2: fix possible arm64 ABI break").

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
v11:
1. Renamed cas_op_name by adding the data width tag 128.
2. Replaced __ldx/__stx inline functions with macros.
3. Explained the reason for defining CAS operations as non-inline
functions in the commit log.

v10:
1.Removed all the rte tag for internal functions.
2.Removed __MO_LOAD and _MO_STORE macros and keep define __HAS_ACQ
and __HAS_REL under non LSE conditional branch.
3.Undef the macro once it is unused.
4.Reword the 1/3 and 2/3 patches' commitlog more specific.

v9:
Updated 19.11 release note.

v8:
Fixed "WARNING:LONG_LINE: line over 80 characters" warnings with latest kernel
checkpatch.pl

v7:
1. Adjust code comment.

v6:
1. Put the RTE_ARM_FEATURE_ATOMICS flag into EAL group. (Jerin Jacob)
2. Keep rte_stack_lf_stubs.h doing nothing. (Gage Eads)
3. Fixed 32 bit build issue.

v5:
1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 by default. (Jerin Jacob)
2. Record the reason for introducing "rte_stack_lf_stubs.h" in the git
commit.
(Jerin Jacob)
3. Fixed a conditional MACRO error in rte_atomic128_cmp_exchange. (Jerin
Jacob)

v4:
1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions.
(Jerin Jacob)
2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin
Jacob)
3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage
Eads/Jerin Jacob)

v3:
1. Avoid code duplication by using a macro. (Jerin Jacob)
2. Treat an invalid memory order as the strongest barrier. (Jerin Jacob)
3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
4. Fix 32-bit x86 builds issue. (Gage Eads)
5. Correct documentation issues in UT. (Gage Eads)

v2:
Initial version.

 config/arm/meson.build                             |   2 +
 config/common_base                                 |   3 +
 config/defconfig_arm64-octeontx2-linuxapp-gcc      |   1 +
 config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
 .../common/include/arch/arm/rte_atomic_64.h        | 151 +++++++++++++++++++++
 .../common/include/arch/x86/rte_atomic_64.h        |  12 --
 lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
 7 files changed, 174 insertions(+), 13 deletions(-)

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..9f28271 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -71,11 +71,13 @@ flags_thunderx2_extra = [
 	['RTE_CACHE_LINE_SIZE', 64],
 	['RTE_MAX_NUMA_NODES', 2],
 	['RTE_MAX_LCORE', 256],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_USE_C11_MEM_MODEL', true]]
 flags_octeontx2_extra = [
 	['RTE_MACHINE', '"octeontx2"'],
 	['RTE_MAX_NUMA_NODES', 1],
 	['RTE_MAX_LCORE', 24],
+	['RTE_ARM_FEATURE_ATOMICS', true],
 	['RTE_EAL_IGB_UIO', false],
 	['RTE_USE_C11_MEM_MODEL', true]]
 
diff --git a/config/common_base b/config/common_base
index e843a21..a96beb9 100644
--- a/config/common_base
+++ b/config/common_base
@@ -82,6 +82,9 @@ CONFIG_RTE_MAX_LCORE=128
 CONFIG_RTE_MAX_NUMA_NODES=8
 CONFIG_RTE_MAX_HEAPS=32
 CONFIG_RTE_MAX_MEMSEG_LISTS=64
+
+# Use ARM LSE ATOMIC instructions
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
 # each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
 # or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
 CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
index f20da24..7687dbe 100644
--- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
+++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
@@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
 CONFIG_RTE_CACHE_LINE_SIZE=128
 CONFIG_RTE_MAX_NUMA_NODES=1
 CONFIG_RTE_MAX_LCORE=24
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 
 # Doesn't support NUMA
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64b..af4a89c 100644
--- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
+++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
@@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..d9ebccc 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,153 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
+		rte_int128_t updated)                                       \
+{                                                                           \
+	/* caspX instructions register pair must start from even-numbered
+	 * register at operand 1.
+	 * So, specify registers for local variables here.
+	 */                                                                 \
+	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
+	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
+	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
+	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
+	asm volatile(                                                       \
+		op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
+		: [old0] "+r" (x0),                                         \
+		[old1] "+r" (x1)                                            \
+		: [upd0] "r" (x2),                                          \
+		[upd1] "r" (x3),                                            \
+		[dst] "r" (dst)                                             \
+		: "memory");                                                \
+	old.val[0] = x0;                                                    \
+	old.val[1] = x1;                                                    \
+	return old;                                                         \
+}
+
+__ATOMIC128_CAS_OP(__cas_128_relaxed, "casp")
+__ATOMIC128_CAS_OP(__cas_128_acquire, "caspa")
+__ATOMIC128_CAS_OP(__cas_128_release, "caspl")
+__ATOMIC128_CAS_OP(__cas_128_acq_rel, "caspal")
+
+#undef __ATOMIC128_CAS_OP
+
+#endif
+
+__rte_experimental
+static inline int
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+				rte_int128_t *exp,
+				const rte_int128_t *src,
+				unsigned int weak,
+				int success,
+				int failure)
+{
+	/* Always do strong CAS */
+	RTE_SET_USED(weak);
+	/* Ignore memory ordering for failure, memory order for
+	 * success must be stronger or equal
+	 */
+	RTE_SET_USED(failure);
+	/* Find invalid memory order */
+	RTE_ASSERT(success == __ATOMIC_RELAXED
+			|| success == __ATOMIC_ACQUIRE
+			|| success == __ATOMIC_RELEASE
+			|| success == __ATOMIC_ACQ_REL
+			|| success == __ATOMIC_SEQ_CST);
+
+	rte_int128_t expected = *exp;
+	rte_int128_t desired = *src;
+	rte_int128_t old;
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+	if (success == __ATOMIC_RELAXED)
+		old = __cas_128_relaxed(dst, expected, desired);
+	else if (success == __ATOMIC_ACQUIRE)
+		old = __cas_128_acquire(dst, expected, desired);
+	else if (success == __ATOMIC_RELEASE)
+		old = __cas_128_release(dst, expected, desired);
+	else
+		old = __cas_128_acq_rel(dst, expected, desired);
+#else
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+					  (mo) == __ATOMIC_SEQ_CST)
+
+	int ldx_mo = __HAS_ACQ(success) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED;
+	int stx_mo = __HAS_RLS(success) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED;
+
+#undef __HAS_ACQ
+#undef __HAS_RLS
+
+	uint32_t ret = 1;
+
+	/* The 128-bit exclusive load (ldxp/ldaxp) is not guaranteed to be
+	 * atomic on its own; src or old must be stored back to verify it.
+	 */
+	do {
+
+#define __LOAD_128(op_string, src, dst) {\
+	asm volatile(                    \
+		op_string " %0, %1, %2"  \
+		: "=&r" (dst.val[0]),    \
+		  "=&r" (dst.val[1])     \
+		: "Q" (src->val[0])      \
+		: "memory"); }
+
+		if (ldx_mo == __ATOMIC_RELAXED)
+			__LOAD_128("ldxp", dst, old)
+		else
+			__LOAD_128("ldaxp", dst, old)
+
+#undef __LOAD_128
+
+#define __STORE_128(op_string, dst, src, ret) {\
+	asm volatile(                          \
+		op_string " %w0, %1, %2, %3"   \
+		: "=&r" (ret)                  \
+		: "r" (src.val[0]),            \
+		  "r" (src.val[1]),            \
+		  "Q" (dst->val[0])            \
+		: "memory"); }
+
+		if (likely(old.int128 == expected.int128)) {
+			if (stx_mo == __ATOMIC_RELAXED)
+				__STORE_128("stxp", dst, desired, ret)
+			else
+				__STORE_128("stlxp", dst, desired, ret)
+		} else {
+			/* In the failure case (since 'weak' is ignored and only
+			 * weak == 0 is implemented), expected should contain
+			 * the atomically read value of dst. This means, 'old'
+			 * needs to be stored back to ensure it was read
+			 * atomically.
+			 */
+			if (stx_mo == __ATOMIC_RELAXED)
+				__STORE_128("stxp", dst, old, ret)
+			else
+				__STORE_128("stlxp", dst, old, ret)
+		}
+#undef __STORE_128
+
+	} while (unlikely(ret));
+#endif
+
+	/* Unconditionally updating expected removes
+	 * an 'if' statement.
+	 * expected should already be in register if
+	 * not in the cache.
+	 */
+	*exp = old;
+
+	return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index 1335d92..cfe7067 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -183,18 +183,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-	RTE_STD_C11
-	union {
-		uint64_t val[2];
-		__extension__ __int128 int128;
-	};
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dc..e6ab15a 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+#ifdef RTE_ARCH_64
+		__extension__ __int128 int128;
+#endif
+	};
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v11 2/3] test/atomic: add 128b compare and swap test
  2019-10-18 11:21         ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
@ 2019-10-18 11:21           ` Phil Yang
  2019-10-21  8:25             ` David Marchand
  2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
  2019-10-18 14:16           ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange David Marchand
  2 siblings, 1 reply; 91+ messages in thread
From: Phil Yang @ 2019-10-18 11:21 UTC (permalink / raw)
  To: david.marchand, jerinj, gage.eads, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Add 128b atomic compare and swap test for aarch64 and x86_64.
Extend the test iteration from 10 thousand to 1 million times to test
the stability of the atomic APIs.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Gage Eads <gage.eads@intel.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Tested-by: Jerin Jacob <jerinj@marvell.com>
---
 app/test/test_atomic.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index 43be30e..14bd3a8 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2019 Arm Limited
  */
 
 #include <stdio.h>
@@ -20,7 +21,7 @@
  * Atomic Variables
  * ================
  *
- * - The main test function performs three subtests. The first test
+ * - The main test function performs four subtests. The first test
  *   checks that the usual inc/dec/add/sub functions are working
  *   correctly:
  *
@@ -61,11 +62,27 @@
  *       atomic_sub(&count, tmp+1);
  *
  *   - At the end of the test, the *count* value must be 0.
+ *
+ * - Test "128b compare and swap" (aarch64 and x86_64 only)
+ *
+ *   - Initialize 128-bit atomic variables to zero.
+ *
+ *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
+ *     anything else, the cores are waiting a synchro. Each lcore does
+ *     these compare and swap (CAS) operations several times::
+ *
+ *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
+ *
+ *   - At the end of the test, the *count128* first 64-bit value and
+ *     second 64-bit value differ by the total iterations.
  */
 
 #define NUM_ATOMIC_TYPES 3
 
-#define N 10000
+#define N 1000000
 
 static rte_atomic16_t a16;
 static rte_atomic32_t a32;
@@ -216,6 +233,78 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+static rte_int128_t count128;
+
+/*
+ * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
+ * bits by 2 and the second 64 bits by 1 in this test. It should return true
+ * if the compare exchange operation is successful.
+ * This test repeats 128 bits compare and swap operations N rounds. In each
+ * iteration it runs compare and swap operation with different memory models.
+ */
+static int
+test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
+{
+	rte_int128_t expected;
+	int success;
+	unsigned int i;
+
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	expected = count128;
+
+	for (i = 0; i < N; i++) {
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+				&expected, &desired, 1,
+				__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_RELEASE, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
+		} while (success == 0);
+
+		do {
+			rte_int128_t desired;
+
+			desired.val[0] = expected.val[0] + 2;
+			desired.val[1] = expected.val[1] + 1;
+
+			success = rte_atomic128_cmp_exchange(&count128,
+					&expected, &desired, 1,
+					__ATOMIC_RELAXED, __ATOMIC_RELAXED);
+		} while (success == 0);
+	}
+
+	return 0;
+}
+#endif
+
 static int
 test_atomic(void)
 {
@@ -340,6 +429,38 @@ test_atomic(void)
 		return -1;
 	}
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+	/*
+	 * This case tests the functionality of the rte_atomic128_cmp_exchange
+	 * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
+	 * models successively on each slave core. Each time a 128-bit atomic
+	 * compare and swap operation succeeds, it updates the global
+	 * 128-bit counter by 2 for the first 64 bits and 1 for the second
+	 * 64 bits. Each slave core iterates this test N times.
+	 * At the end of the test, verify whether the first 64 bits of the
+	 * 128-bit counter and the second 64 bits differ by the total number
+	 * of iterations. If they do, the test passes.
+	 */
+	printf("128b compare and swap test\n");
+	uint64_t iterations = 0;
+
+	rte_atomic32_clear(&synchro);
+	count128.val[0] = 0;
+	count128.val[1] = 0;
+
+	rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL,
+				 SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+	rte_eal_mp_wait_lcore();
+	rte_atomic32_clear(&synchro);
+
+	iterations = count128.val[0] - count128.val[1];
+	if (iterations != 4*N*(rte_lcore_count()-1)) {
+		printf("128b compare and swap failed\n");
+		return -1;
+	}
+#endif
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [dpdk-dev] [PATCH v11 3/3] eal/stack: enable lock-free stack for aarch64
  2019-10-18 11:21         ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-10-18 11:21           ` Phil Yang
  2019-10-21  8:26             ` David Marchand
  2019-10-18 14:16           ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange David Marchand
  2 siblings, 1 reply; 91+ messages in thread
From: Phil Yang @ 2019-10-18 11:21 UTC (permalink / raw)
  To: david.marchand, jerinj, gage.eads, dev
  Cc: thomas, hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.

Introduced a new header to reduce the ifdef clutter across generic and c11
files. The rte_stack_lf_stubs.h contains stub implementations of
__rte_stack_lf_count, __rte_stack_lf_push_elems and
__rte_stack_lf_pop_elems.

Suggested-by: Gage Eads <gage.eads@intel.com>
Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 doc/guides/prog_guide/env_abstraction_layer.rst |  4 +--
 doc/guides/rel_notes/release_19_11.rst          |  3 ++
 lib/librte_stack/Makefile                       |  3 +-
 lib/librte_stack/rte_stack_lf.h                 |  4 +++
 lib/librte_stack/rte_stack_lf_c11.h             | 16 ---------
 lib/librte_stack/rte_stack_lf_generic.h         | 16 ---------
 lib/librte_stack/rte_stack_lf_stubs.h           | 44 +++++++++++++++++++++++++
 7 files changed, 55 insertions(+), 35 deletions(-)
 create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h

diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index 94f30fd..6e59fae 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -648,8 +648,8 @@ Known Issues
   Alternatively, applications can use the lock-free stack mempool handler. When
   considering this handler, note that:
 
-  - It is currently limited to the x86_64 platform, because it uses an
-    instruction (16-byte compare-and-swap) that is not yet available on other
+  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
+    an instruction (16-byte compare-and-swap) that is not yet available on other
     platforms.
   - It has worse average-case performance than the non-preemptive rte_ring, but
     software caching (e.g. the mempool cache) can mitigate this by reducing the
diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index 85953b9..4f82f54 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -115,6 +115,9 @@ New Features
   Added eBPF JIT support for arm64 architecture to improve the eBPF program
   performance.
 
+* **Added Lock-free Stack for aarch64.**
+
+  The lock-free stack implementation is enabled for aarch64 platforms.
 
 Removed Items
 -------------
diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
index 8d18ce5..b5e5bed 100644
--- a/lib/librte_stack/Makefile
+++ b/lib/librte_stack/Makefile
@@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include := rte_stack.h \
 					      rte_stack_std.h \
 					      rte_stack_lf.h \
 					      rte_stack_lf_generic.h \
-					      rte_stack_lf_c11.h
+					      rte_stack_lf_c11.h \
+					      rte_stack_lf_stubs.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
index f5581f0..e67630c 100644
--- a/lib/librte_stack/rte_stack_lf.h
+++ b/lib/librte_stack/rte_stack_lf.h
@@ -5,11 +5,15 @@
 #ifndef _RTE_STACK_LF_H_
 #define _RTE_STACK_LF_H_
 
+#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
+#include "rte_stack_lf_stubs.h"
+#else
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_stack_lf_c11.h"
 #else
 #include "rte_stack_lf_generic.h"
 #endif
+#endif
 
 /**
  * @internal Push several objects on the lock-free stack (MT-safe).
diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
index 3d677ae..999359f 100644
--- a/lib/librte_stack/rte_stack_lf_c11.h
+++ b/lib/librte_stack/rte_stack_lf_c11.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	 * to the LIFO len update.
 	 */
 	__atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	uint64_t len;
 	int success;
@@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_C11_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
index 3182151..3abbb53 100644
--- a/lib/librte_stack/rte_stack_lf_generic.h
+++ b/lib/librte_stack/rte_stack_lf_generic.h
@@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 			  struct rte_stack_lf_elem *last,
 			  unsigned int num)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(first);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	rte_atomic64_add((rte_atomic64_t *)&list->len, num);
-#endif
 }
 
 static __rte_always_inline struct rte_stack_lf_elem *
@@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 			 void **obj_table,
 			 struct rte_stack_lf_elem **last)
 {
-#ifndef RTE_ARCH_X86_64
-	RTE_SET_USED(obj_table);
-	RTE_SET_USED(last);
-	RTE_SET_USED(list);
-	RTE_SET_USED(num);
-
-	return NULL;
-#else
 	struct rte_stack_lf_head old_head;
 	int success;
 
@@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
 	} while (success == 0);
 
 	return old_head.top;
-#endif
 }
 
 #endif /* _RTE_STACK_LF_GENERIC_H_ */
diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
new file mode 100644
index 0000000..a05abf1
--- /dev/null
+++ b/lib/librte_stack/rte_stack_lf_stubs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_STACK_LF_STUBS_H_
+#define _RTE_STACK_LF_STUBS_H_
+
+#include <rte_common.h>
+
+static __rte_always_inline unsigned int
+__rte_stack_lf_count(struct rte_stack *s)
+{
+	RTE_SET_USED(s);
+
+	return 0;
+}
+
+static __rte_always_inline void
+__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
+			  struct rte_stack_lf_elem *first,
+			  struct rte_stack_lf_elem *last,
+			  unsigned int num)
+{
+	RTE_SET_USED(first);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+}
+
+static __rte_always_inline struct rte_stack_lf_elem *
+__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
+			 unsigned int num,
+			 void **obj_table,
+			 struct rte_stack_lf_elem **last)
+{
+	RTE_SET_USED(obj_table);
+	RTE_SET_USED(last);
+	RTE_SET_USED(list);
+	RTE_SET_USED(num);
+
+	return NULL;
+}
+
+#endif /* _RTE_STACK_LF_STUBS_H_ */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-18 11:21         ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
  2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 2/3] test/atomic: add 128b compare and swap test Phil Yang
  2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-10-18 14:16           ` David Marchand
  2019-10-18 14:24             ` Jerin Jacob
  2 siblings, 1 reply; 91+ messages in thread
From: David Marchand @ 2019-10-18 14:16 UTC (permalink / raw)
  To: Jerin Jacob Kollanukkaran, Phil Yang
  Cc: Gage Eads, dev, Thomas Monjalon, Hemant Agrawal,
	Honnappa Nagarahalli, Gavin Hu, nd, Bruce Richardson

On Fri, Oct 18, 2019 at 1:22 PM Phil Yang <phil.yang@arm.com> wrote:
>
> This patch adds the implementation of the 128-bit atomic compare
> exchange API on AArch64. The operation can be performed with the
> 64-bit 'ldxp/stxp' instructions. Moreover, on platforms with the LSE
> atomic extension, it is implemented with 'casp' instructions for
> better performance.
>
> Since the '__ARM_FEATURE_ATOMICS' flag is only supported by GCC-9,
> this patch adds a new config flag 'RTE_ARM_FEATURE_ATOMICS' to enable
> the 'cas' version on older compilers.

Jerin, Phil,

I am getting a build error on the octeontx2 target:

{standard input}: Assembler messages:
{standard input}:672: Error: selected processor does not support `casp
x0,x1,x2,x3,[x4]'
{standard input}:690: Error: selected processor does not support
`caspa x0,x1,x2,x3,[x4]'
{standard input}:708: Error: selected processor does not support
`caspl x0,x1,x2,x3,[x4]'
{standard input}:726: Error: selected processor does not support
`caspal x0,x1,x2,x3,[x4]'
ninja: build stopped: subcommand failed.

Looking into the meson logs, I can see:

Native C compiler: ccache gcc (gcc 9.2.1 "gcc (GCC) 9.2.1 20190827
(Red Hat 9.2.1-1)")
Cross C compiler: aarch64-linux-gnu-gcc (gcc 8.2.1)
Host machine cpu family: aarch64
Host machine cpu: armv8-a
Target machine cpu family: aarch64
Target machine cpu: armv8-a
Build machine cpu family: x86_64
Build machine cpu: x86_64
...
Message: Implementer : Cavium
Compiler for C supports arguments -mcpu=octeontx2: NO
Message: []
Fetching value of define "__ARM_NEON" : 1
Fetching value of define "__ARM_FEATURE_CRC32" :
Fetching value of define "__ARM_FEATURE_CRYPTO" :


My toolchain does not support the octeontx2 target, but
RTE_ARM_FEATURE_ATOMICS ends up being set in the configuration anyway.
I tried with the Linaro toolchains (4.7.1, 7.4) mentioned in the DPDK
documentation and got the same result.

Looking at config/arm/meson.build, the "extra machine specific flags"
are appended to the configuration, regardless of what the compiler
replied when testing the machine args.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-18 14:16           ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange David Marchand
@ 2019-10-18 14:24             ` Jerin Jacob
  2019-10-18 14:33               ` David Marchand
  0 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob @ 2019-10-18 14:24 UTC (permalink / raw)
  To: David Marchand
  Cc: Jerin Jacob Kollanukkaran, Phil Yang, Gage Eads, dev,
	Thomas Monjalon, Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu,
	nd, Bruce Richardson

On Fri, Oct 18, 2019 at 7:46 PM David Marchand
<david.marchand@redhat.com> wrote:
>
> On Fri, Oct 18, 2019 at 1:22 PM Phil Yang <phil.yang@arm.com> wrote:
> >
> > This patch adds the implementation of the 128-bit atomic compare
> > exchange API on AArch64. The operation can be performed with the
> > 64-bit 'ldxp/stxp' instructions. Moreover, on platforms with the LSE
> > atomic extension, it is implemented with 'casp' instructions for
> > better performance.
> >
> > Since the '__ARM_FEATURE_ATOMICS' flag is only supported by GCC-9,
> > this patch adds a new config flag 'RTE_ARM_FEATURE_ATOMICS' to enable
> > the 'cas' version on older compilers.
>
> Jerin, Phil,
>
> I am getting a build error on the octeontx2 target:
>
> {standard input}: Assembler messages:
> {standard input}:672: Error: selected processor does not support `casp
> x0,x1,x2,x3,[x4]'
> {standard input}:690: Error: selected processor does not support
> `caspa x0,x1,x2,x3,[x4]'
> {standard input}:708: Error: selected processor does not support
> `caspl x0,x1,x2,x3,[x4]'
> {standard input}:726: Error: selected processor does not support
> `caspal x0,x1,x2,x3,[x4]'
> ninja: build stopped: subcommand failed.
>
> Looking into the meson logs, I can see:
>
> Native C compiler: ccache gcc (gcc 9.2.1 "gcc (GCC) 9.2.1 20190827
> (Red Hat 9.2.1-1)")
> Cross C compiler: aarch64-linux-gnu-gcc (gcc 8.2.1)
> Host machine cpu family: aarch64
> Host machine cpu: armv8-a
> Target machine cpu family: aarch64
> Target machine cpu: armv8-a
> Build machine cpu family: x86_64
> Build machine cpu: x86_64
> ...
> Message: Implementer : Cavium
> Compiler for C supports arguments -mcpu=octeontx2: NO

The compiler needs either +lse or -mcpu=octeontx2 to generate casp instructions.
Could you try this patch? I can submit it if it works for you.

[master][dpdk-next-net-mrvl] $ git diff
diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e16..466522786 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -96,7 +96,7 @@ machine_args_cavium = [
        ['0xa2', ['-mcpu=thunderxt81'], flags_thunderx_extra],
        ['0xa3', ['-mcpu=thunderxt83'], flags_thunderx_extra],
        ['0xaf', ['-march=armv8.1-a+crc+crypto','-mcpu=thunderx2t99'],
flags_thunderx2_extra],
-       ['0xb2', ['-mcpu=octeontx2'], flags_octeontx2_extra]]
+       ['0xb2',
['-march=armv8.2-a+crc+crypto+lse','-mcpu=octeontx2'],
flags_octeontx2_extra]]

 ## Arm implementer ID (ARM DDI 0487C.a, Section G7.2.106, Page G7-5321)
 impl_generic = ['Generic armv8', flags_generic, machine_args_generic]

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-18 14:24             ` Jerin Jacob
@ 2019-10-18 14:33               ` David Marchand
  2019-10-18 14:36                 ` Jerin Jacob
  0 siblings, 1 reply; 91+ messages in thread
From: David Marchand @ 2019-10-18 14:33 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Jerin Jacob Kollanukkaran, Phil Yang, Gage Eads, dev,
	Thomas Monjalon, Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu,
	nd, Bruce Richardson

On Fri, Oct 18, 2019 at 4:25 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
>
> On Fri, Oct 18, 2019 at 7:46 PM David Marchand
> <david.marchand@redhat.com> wrote:
> >
> > On Fri, Oct 18, 2019 at 1:22 PM Phil Yang <phil.yang@arm.com> wrote:
> > >
> > > This patch adds the implementation of the 128-bit atomic compare
> > > exchange API on AArch64. The operation can be performed with the
> > > 64-bit 'ldxp/stxp' instructions. Moreover, on platforms with the LSE
> > > atomic extension, it is implemented with 'casp' instructions for
> > > better performance.
> > >
> > > Since the '__ARM_FEATURE_ATOMICS' flag is only supported by GCC-9,
> > > this patch adds a new config flag 'RTE_ARM_FEATURE_ATOMICS' to enable
> > > the 'cas' version on older compilers.
> >
> > Jerin, Phil,
> >
> > I am getting a build error on the octeontx2 target:
> >
> > {standard input}: Assembler messages:
> > {standard input}:672: Error: selected processor does not support `casp
> > x0,x1,x2,x3,[x4]'
> > {standard input}:690: Error: selected processor does not support
> > `caspa x0,x1,x2,x3,[x4]'
> > {standard input}:708: Error: selected processor does not support
> > `caspl x0,x1,x2,x3,[x4]'
> > {standard input}:726: Error: selected processor does not support
> > `caspal x0,x1,x2,x3,[x4]'
> > ninja: build stopped: subcommand failed.
> >
> > Looking into the meson logs, I can see:
> >
> > Native C compiler: ccache gcc (gcc 9.2.1 "gcc (GCC) 9.2.1 20190827
> > (Red Hat 9.2.1-1)")
> > Cross C compiler: aarch64-linux-gnu-gcc (gcc 8.2.1)
> > Host machine cpu family: aarch64
> > Host machine cpu: armv8-a
> > Target machine cpu family: aarch64
> > Target machine cpu: armv8-a
> > Build machine cpu family: x86_64
> > Build machine cpu: x86_64
> > ...
> > Message: Implementer : Cavium
> > Compiler for C supports arguments -mcpu=octeontx2: NO
>
> The compiler needs either +lse or -mcpu=octeontx2 to generate casp instructions.
> Could you try this patch? I can submit it if it works for you.

Ah cool, I was looking at the march stuff.
Tried your patch, it works fine.

I'd say we can squash your bits in the current patch, since this was
unneeded before this patch.
Is this okay for you?


>
> [master][dpdk-next-net-mrvl] $ git diff
> diff --git a/config/arm/meson.build b/config/arm/meson.build
> index 979018e16..466522786 100644
> --- a/config/arm/meson.build
> +++ b/config/arm/meson.build
> @@ -96,7 +96,7 @@ machine_args_cavium = [
>         ['0xa2', ['-mcpu=thunderxt81'], flags_thunderx_extra],
>         ['0xa3', ['-mcpu=thunderxt83'], flags_thunderx_extra],
>         ['0xaf', ['-march=armv8.1-a+crc+crypto','-mcpu=thunderx2t99'],
> flags_thunderx2_extra],
> -       ['0xb2', ['-mcpu=octeontx2'], flags_octeontx2_extra]]
> +       ['0xb2',
> ['-march=armv8.2-a+crc+crypto+lse','-mcpu=octeontx2'],
> flags_octeontx2_extra]]
>
>  ## Arm implementer ID (ARM DDI 0487C.a, Section G7.2.106, Page G7-5321)
>  impl_generic = ['Generic armv8', flags_generic, machine_args_generic]

Thanks for the quick reply.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-18 14:33               ` David Marchand
@ 2019-10-18 14:36                 ` Jerin Jacob
  2019-10-21  8:24                   ` David Marchand
  0 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob @ 2019-10-18 14:36 UTC (permalink / raw)
  To: David Marchand
  Cc: Jerin Jacob Kollanukkaran, Phil Yang, Gage Eads, dev,
	Thomas Monjalon, Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu,
	nd, Bruce Richardson

On Fri, Oct 18, 2019 at 8:04 PM David Marchand
<david.marchand@redhat.com> wrote:
>
> On Fri, Oct 18, 2019 at 4:25 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
> >
> > On Fri, Oct 18, 2019 at 7:46 PM David Marchand
> > <david.marchand@redhat.com> wrote:
> > >
> > > On Fri, Oct 18, 2019 at 1:22 PM Phil Yang <phil.yang@arm.com> wrote:
> > > >
> > > > This patch adds the implementation of the 128-bit atomic compare
> > > > exchange API on AArch64. The operation can be performed with the
> > > > 64-bit 'ldxp/stxp' instructions. Moreover, on platforms with the LSE
> > > > atomic extension, it is implemented with 'casp' instructions for
> > > > better performance.
> > > >
> > > > Since the '__ARM_FEATURE_ATOMICS' flag is only supported by GCC-9,
> > > > this patch adds a new config flag 'RTE_ARM_FEATURE_ATOMICS' to enable
> > > > the 'cas' version on older compilers.
> > >
> > > Jerin, Phil,
> > >
> > > I am getting a build error on the octeontx2 target:
> > >
> > > {standard input}: Assembler messages:
> > > {standard input}:672: Error: selected processor does not support `casp
> > > x0,x1,x2,x3,[x4]'
> > > {standard input}:690: Error: selected processor does not support
> > > `caspa x0,x1,x2,x3,[x4]'
> > > {standard input}:708: Error: selected processor does not support
> > > `caspl x0,x1,x2,x3,[x4]'
> > > {standard input}:726: Error: selected processor does not support
> > > `caspal x0,x1,x2,x3,[x4]'
> > > ninja: build stopped: subcommand failed.
> > >
> > > Looking into the meson logs, I can see:
> > >
> > > Native C compiler: ccache gcc (gcc 9.2.1 "gcc (GCC) 9.2.1 20190827
> > > (Red Hat 9.2.1-1)")
> > > Cross C compiler: aarch64-linux-gnu-gcc (gcc 8.2.1)
> > > Host machine cpu family: aarch64
> > > Host machine cpu: armv8-a
> > > Target machine cpu family: aarch64
> > > Target machine cpu: armv8-a
> > > Build machine cpu family: x86_64
> > > Build machine cpu: x86_64
> > > ...
> > > Message: Implementer : Cavium
> > > Compiler for C supports arguments -mcpu=octeontx2: NO
> >
> > The compiler needs either +lse or -mcpu=octeontx2 to generate casp instructions.
> > Could you try this patch? I can submit it if it works for you.
>
> Ah cool, I was looking at the march stuff.
> Tried your patch, it works fine.
>
> I'd say we can squash your bits in the current patch, since this was
> unneeded before this patch.
> Is this okay for you?

Yup.

>
>
> >
> > [master][dpdk-next-net-mrvl] $ git diff
> > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > index 979018e16..466522786 100644
> > --- a/config/arm/meson.build
> > +++ b/config/arm/meson.build
> > @@ -96,7 +96,7 @@ machine_args_cavium = [
> >         ['0xa2', ['-mcpu=thunderxt81'], flags_thunderx_extra],
> >         ['0xa3', ['-mcpu=thunderxt83'], flags_thunderx_extra],
> >         ['0xaf', ['-march=armv8.1-a+crc+crypto','-mcpu=thunderx2t99'],
> > flags_thunderx2_extra],
> > -       ['0xb2', ['-mcpu=octeontx2'], flags_octeontx2_extra]]
> > +       ['0xb2',
> > ['-march=armv8.2-a+crc+crypto+lse','-mcpu=octeontx2'],
> > flags_octeontx2_extra]]
> >
> >  ## Arm implementer ID (ARM DDI 0487C.a, Section G7.2.106, Page G7-5321)
> >  impl_generic = ['Generic armv8', flags_generic, machine_args_generic]
>
> Thanks for the quick reply.
>
>
> --
> David Marchand
>

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-10-18 14:36                 ` Jerin Jacob
@ 2019-10-21  8:24                   ` David Marchand
  0 siblings, 0 replies; 91+ messages in thread
From: David Marchand @ 2019-10-21  8:24 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Jerin Jacob Kollanukkaran, Phil Yang, Gage Eads, dev,
	Thomas Monjalon, Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu,
	nd, Bruce Richardson

On Fri, Oct 18, 2019 at 4:36 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
>
> On Fri, Oct 18, 2019 at 8:04 PM David Marchand
> <david.marchand@redhat.com> wrote:
> >
> > On Fri, Oct 18, 2019 at 4:25 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > >
> > > On Fri, Oct 18, 2019 at 7:46 PM David Marchand
> > > <david.marchand@redhat.com> wrote:
> > > >
> > > > On Fri, Oct 18, 2019 at 1:22 PM Phil Yang <phil.yang@arm.com> wrote:
> > > > >
> > > > > This patch adds the implementation of the 128-bit atomic compare
> > > > > exchange API on AArch64. The operation can be performed with the
> > > > > 64-bit 'ldxp/stxp' instructions. Moreover, on platforms with the LSE
> > > > > atomic extension, it is implemented with 'casp' instructions for
> > > > > better performance.
> > > > >
> > > > > Since the '__ARM_FEATURE_ATOMICS' flag is only supported by GCC-9,
> > > > > this patch adds a new config flag 'RTE_ARM_FEATURE_ATOMICS' to enable
> > > > > the 'cas' version on older compilers.
> > > >
> > > > Jerin, Phil,
> > > >
> > > > I am getting a build error on the octeontx2 target:
> > > >
> > > > {standard input}: Assembler messages:
> > > > {standard input}:672: Error: selected processor does not support `casp
> > > > x0,x1,x2,x3,[x4]'
> > > > {standard input}:690: Error: selected processor does not support
> > > > `caspa x0,x1,x2,x3,[x4]'
> > > > {standard input}:708: Error: selected processor does not support
> > > > `caspl x0,x1,x2,x3,[x4]'
> > > > {standard input}:726: Error: selected processor does not support
> > > > `caspal x0,x1,x2,x3,[x4]'
> > > > ninja: build stopped: subcommand failed.
> > > >
> > > > Looking into the meson logs, I can see:
> > > >
> > > > Native C compiler: ccache gcc (gcc 9.2.1 "gcc (GCC) 9.2.1 20190827
> > > > (Red Hat 9.2.1-1)")
> > > > Cross C compiler: aarch64-linux-gnu-gcc (gcc 8.2.1)
> > > > Host machine cpu family: aarch64
> > > > Host machine cpu: armv8-a
> > > > Target machine cpu family: aarch64
> > > > Target machine cpu: armv8-a
> > > > Build machine cpu family: x86_64
> > > > Build machine cpu: x86_64
> > > > ...
> > > > Message: Implementer : Cavium
> > > > Compiler for C supports arguments -mcpu=octeontx2: NO
> > >
> > > The compiler needs either +lse or -mcpu=octeontx2 to generate casp instructions.
> > > Could you try this patch? I can submit it if it works for you.
> >
> > Ah cool, I was looking at the march stuff.
> > Tried your patch, it works fine.
> >
> > I'd say we can squash your bits in the current patch, since this was
> > unneeded before this patch.
> > Is this okay for you?
>
> Yup.
>
> >
> >
> > >
> > > [master][dpdk-next-net-mrvl] $ git diff
> > > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > > index 979018e16..466522786 100644
> > > --- a/config/arm/meson.build
> > > +++ b/config/arm/meson.build
> > > @@ -96,7 +96,7 @@ machine_args_cavium = [
> > >         ['0xa2', ['-mcpu=thunderxt81'], flags_thunderx_extra],
> > >         ['0xa3', ['-mcpu=thunderxt83'], flags_thunderx_extra],
> > >         ['0xaf', ['-march=armv8.1-a+crc+crypto','-mcpu=thunderx2t99'],
> > > flags_thunderx2_extra],
> > > -       ['0xb2', ['-mcpu=octeontx2'], flags_octeontx2_extra]]
> > > +       ['0xb2',
> > > ['-march=armv8.2-a+crc+crypto+lse','-mcpu=octeontx2'],
> > > flags_octeontx2_extra]]
> > >
> > >  ## Arm implementer ID (ARM DDI 0487C.a, Section G7.2.106, Page G7-5321)
> > >  impl_generic = ['Generic armv8', flags_generic, machine_args_generic]
> >
> > Thanks for the quick reply.
> >

Applied with above fix.
Thanks.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v11 2/3] test/atomic: add 128b compare and swap test
  2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 2/3] test/atomic: add 128b compare and swap test Phil Yang
@ 2019-10-21  8:25             ` David Marchand
  0 siblings, 0 replies; 91+ messages in thread
From: David Marchand @ 2019-10-21  8:25 UTC (permalink / raw)
  To: Phil Yang
  Cc: Jerin Jacob Kollanukkaran, Gage Eads, dev, Thomas Monjalon,
	Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu, nd

On Fri, Oct 18, 2019 at 1:22 PM Phil Yang <phil.yang@arm.com> wrote:
>
> Add 128b atomic compare and swap test for aarch64 and x86_64.
> Extend the test iteration from 10 thousand to 1 million times to test
> the stability of the atomic APIs.
>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Acked-by: Gage Eads <gage.eads@intel.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> Tested-by: Jerin Jacob <jerinj@marvell.com>
> ---
>  app/test/test_atomic.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 123 insertions(+), 2 deletions(-)
>
> diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
> index 43be30e..14bd3a8 100644
> --- a/app/test/test_atomic.c
> +++ b/app/test/test_atomic.c
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2019 Arm Limited
>   */
>
>  #include <stdio.h>
> @@ -20,7 +21,7 @@
>   * Atomic Variables
>   * ================
>   *
> - * - The main test function performs three subtests. The first test
> + * - The main test function performs four subtests. The first test
>   *   checks that the usual inc/dec/add/sub functions are working
>   *   correctly:
>   *
> @@ -61,11 +62,27 @@
>   *       atomic_sub(&count, tmp+1);
>   *
>   *   - At the end of the test, the *count* value must be 0.
> + *
> + * - Test "128b compare and swap" (aarch64 and x86_64 only)
> + *
> + *   - Initialize 128-bit atomic variables to zero.
> + *
> + *   - Invoke ``test_atomic128_cmp_exchange()`` on each lcore. Before doing
> + *     anything else, the cores are waiting a synchro. Each lcore does
> + *     these compare and swap (CAS) operations several times::
> + *
> + *       Acquired CAS update counter.val[0] + 2; counter.val[1] + 1;
> + *       Released CAS update counter.val[0] + 2; counter.val[1] + 1;
> + *       Acquired_Released CAS update counter.val[0] + 2; counter.val[1] + 1;
> + *       Relaxed CAS update counter.val[0] + 2; counter.val[1] + 1;
> + *
> + *   - At the end of the test, the *count128* first 64-bit value and
> + *     second 64-bit value differ by the total iterations.
>   */
>
>  #define NUM_ATOMIC_TYPES 3
>
> -#define N 10000
> +#define N 1000000
>
>  static rte_atomic16_t a16;
>  static rte_atomic32_t a32;
> @@ -216,6 +233,78 @@ test_atomic_dec_and_test(__attribute__((unused)) void *arg)
>         return 0;
>  }
>
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> +static rte_int128_t count128;
> +
> +/*
> + * rte_atomic128_cmp_exchange() should update a 128 bits counter's first 64
> + * bits by 2 and the second 64 bits by 1 in this test. It should return true
> + * if the compare exchange operation is successful.
> + * This test repeats 128 bits compare and swap operations N rounds. In each
> + * iteration it runs compare and swap operation with different memory models.
> + */
> +static int
> +test_atomic128_cmp_exchange(__attribute__((unused)) void *arg)
> +{
> +       rte_int128_t expected;
> +       int success;
> +       unsigned int i;
> +
> +       while (rte_atomic32_read(&synchro) == 0)
> +               ;
> +
> +       expected = count128;
> +
> +       for (i = 0; i < N; i++) {
> +               do {
> +                       rte_int128_t desired;
> +
> +                       desired.val[0] = expected.val[0] + 2;
> +                       desired.val[1] = expected.val[1] + 1;
> +
> +                       success = rte_atomic128_cmp_exchange(&count128,
> +                               &expected, &desired, 1,
> +                               __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
> +               } while (success == 0);
> +
> +               do {
> +                       rte_int128_t desired;
> +
> +                       desired.val[0] = expected.val[0] + 2;
> +                       desired.val[1] = expected.val[1] + 1;
> +
> +                       success = rte_atomic128_cmp_exchange(&count128,
> +                                       &expected, &desired, 1,
> +                                       __ATOMIC_RELEASE, __ATOMIC_RELAXED);
> +               } while (success == 0);
> +
> +               do {
> +                       rte_int128_t desired;
> +
> +                       desired.val[0] = expected.val[0] + 2;
> +                       desired.val[1] = expected.val[1] + 1;
> +
> +                       success = rte_atomic128_cmp_exchange(&count128,
> +                                       &expected, &desired, 1,
> +                                       __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
> +               } while (success == 0);
> +
> +               do {
> +                       rte_int128_t desired;
> +
> +                       desired.val[0] = expected.val[0] + 2;
> +                       desired.val[1] = expected.val[1] + 1;
> +
> +                       success = rte_atomic128_cmp_exchange(&count128,
> +                                       &expected, &desired, 1,
> +                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
> +               } while (success == 0);
> +       }
> +
> +       return 0;
> +}
> +#endif
> +
>  static int
>  test_atomic(void)
>  {
> @@ -340,6 +429,38 @@ test_atomic(void)
>                 return -1;
>         }
>
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
> +       /*
> +        * This case tests the functionality of the rte_atomic128_cmp_exchange
> +        * API. It calls rte_atomic128_cmp_exchange with four kinds of memory
> +        * models successively on each slave core. Each successful 128-bit
> +        * atomic compare and swap operation updates the global 128-bit
> +        * counter by 2 for the first 64 bits and by 1 for the second
> +        * 64 bits. Each slave core iterates this test N times.
> +        * At the end of the test, verify that the first 64 bits of the
> +        * 128-bit counter and the second 64 bits differ by the total number
> +        * of iterations. If they do, the test passes.
> +        */
> +       printf("128b compare and swap test\n");
> +       uint64_t iterations = 0;
> +
> +       rte_atomic32_clear(&synchro);
> +       count128.val[0] = 0;
> +       count128.val[1] = 0;
> +
> +       rte_eal_mp_remote_launch(test_atomic128_cmp_exchange, NULL,
> +                                SKIP_MASTER);
> +       rte_atomic32_set(&synchro, 1);
> +       rte_eal_mp_wait_lcore();
> +       rte_atomic32_clear(&synchro);
> +
> +       iterations = count128.val[0] - count128.val[1];
> +       if (iterations != 4*N*(rte_lcore_count()-1)) {
> +               printf("128b compare and swap failed\n");
> +               return -1;
> +       }
> +#endif
> +
>         return 0;
>  }
>
> --
> 2.7.4
>

Applied, thanks.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v11 3/3] eal/stack: enable lock-free stack for aarch64
  2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
@ 2019-10-21  8:26             ` David Marchand
  0 siblings, 0 replies; 91+ messages in thread
From: David Marchand @ 2019-10-21  8:26 UTC (permalink / raw)
  To: Phil Yang
  Cc: Jerin Jacob Kollanukkaran, Gage Eads, dev, Thomas Monjalon,
	Hemant Agrawal, Honnappa Nagarahalli, Gavin Hu, nd

On Fri, Oct 18, 2019 at 1:22 PM Phil Yang <phil.yang@arm.com> wrote:
>
> Enable both c11 atomic and non c11 atomic lock-free stack for aarch64.
>
> Introduced a new header to reduce the ifdef clutter across generic and c11
> files. The rte_stack_lf_stubs.h contains stub implementations of
> __rte_stack_lf_count, __rte_stack_lf_push_elems and
> __rte_stack_lf_pop_elems.
>
> Suggested-by: Gage Eads <gage.eads@intel.com>
> Suggested-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> ---
>  doc/guides/prog_guide/env_abstraction_layer.rst |  4 +--
>  doc/guides/rel_notes/release_19_11.rst          |  3 ++
>  lib/librte_stack/Makefile                       |  3 +-
>  lib/librte_stack/rte_stack_lf.h                 |  4 +++
>  lib/librte_stack/rte_stack_lf_c11.h             | 16 ---------
>  lib/librte_stack/rte_stack_lf_generic.h         | 16 ---------
>  lib/librte_stack/rte_stack_lf_stubs.h           | 44 +++++++++++++++++++++++++
>  7 files changed, 55 insertions(+), 35 deletions(-)
>  create mode 100644 lib/librte_stack/rte_stack_lf_stubs.h
>
> diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
> index 94f30fd..6e59fae 100644
> --- a/doc/guides/prog_guide/env_abstraction_layer.rst
> +++ b/doc/guides/prog_guide/env_abstraction_layer.rst
> @@ -648,8 +648,8 @@ Known Issues
>    Alternatively, applications can use the lock-free stack mempool handler. When
>    considering this handler, note that:
>
> -  - It is currently limited to the x86_64 platform, because it uses an
> -    instruction (16-byte compare-and-swap) that is not yet available on other
> +  - It is currently limited to the aarch64 and x86_64 platforms, because it uses
> +    an instruction (16-byte compare-and-swap) that is not yet available on other
>      platforms.
>    - It has worse average-case performance than the non-preemptive rte_ring, but
>      software caching (e.g. the mempool cache) can mitigate this by reducing the
> diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
> index 85953b9..4f82f54 100644
> --- a/doc/guides/rel_notes/release_19_11.rst
> +++ b/doc/guides/rel_notes/release_19_11.rst
> @@ -115,6 +115,9 @@ New Features
>    Added eBPF JIT support for arm64 architecture to improve the eBPF program
>    performance.
>
> +* **Added Lock-free Stack for aarch64.**
> +
> +  The lock-free stack implementation is enabled for aarch64 platforms.

Missing empty line (paragraphs are separated by two empty lines).
This entry should go with the Core libs entries, i.e. at the beginning of this list.

I made the changes.

>
>  Removed Items
>  -------------
> diff --git a/lib/librte_stack/Makefile b/lib/librte_stack/Makefile
> index 8d18ce5..b5e5bed 100644
> --- a/lib/librte_stack/Makefile
> +++ b/lib/librte_stack/Makefile
> @@ -24,6 +24,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_STACK)-include := rte_stack.h \
>                                               rte_stack_std.h \
>                                               rte_stack_lf.h \
>                                               rte_stack_lf_generic.h \
> -                                             rte_stack_lf_c11.h
> +                                             rte_stack_lf_c11.h \
> +                                             rte_stack_lf_stubs.h
>
>  include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/lib/librte_stack/rte_stack_lf.h b/lib/librte_stack/rte_stack_lf.h
> index f5581f0..e67630c 100644
> --- a/lib/librte_stack/rte_stack_lf.h
> +++ b/lib/librte_stack/rte_stack_lf.h
> @@ -5,11 +5,15 @@
>  #ifndef _RTE_STACK_LF_H_
>  #define _RTE_STACK_LF_H_
>
> +#if !(defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64))
> +#include "rte_stack_lf_stubs.h"
> +#else
>  #ifdef RTE_USE_C11_MEM_MODEL
>  #include "rte_stack_lf_c11.h"
>  #else
>  #include "rte_stack_lf_generic.h"
>  #endif
> +#endif
>
>  /**
>   * @internal Push several objects on the lock-free stack (MT-safe).
> diff --git a/lib/librte_stack/rte_stack_lf_c11.h b/lib/librte_stack/rte_stack_lf_c11.h
> index 3d677ae..999359f 100644
> --- a/lib/librte_stack/rte_stack_lf_c11.h
> +++ b/lib/librte_stack/rte_stack_lf_c11.h
> @@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
>                           struct rte_stack_lf_elem *last,
>                           unsigned int num)
>  {
> -#ifndef RTE_ARCH_X86_64
> -       RTE_SET_USED(first);
> -       RTE_SET_USED(last);
> -       RTE_SET_USED(list);
> -       RTE_SET_USED(num);
> -#else
>         struct rte_stack_lf_head old_head;
>         int success;
>
> @@ -79,7 +73,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
>          * to the LIFO len update.
>          */
>         __atomic_add_fetch(&list->len, num, __ATOMIC_RELEASE);
> -#endif
>  }
>
>  static __rte_always_inline struct rte_stack_lf_elem *
> @@ -88,14 +81,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>                          void **obj_table,
>                          struct rte_stack_lf_elem **last)
>  {
> -#ifndef RTE_ARCH_X86_64
> -       RTE_SET_USED(obj_table);
> -       RTE_SET_USED(last);
> -       RTE_SET_USED(list);
> -       RTE_SET_USED(num);
> -
> -       return NULL;
> -#else
>         struct rte_stack_lf_head old_head;
>         uint64_t len;
>         int success;
> @@ -169,7 +154,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>         } while (success == 0);
>
>         return old_head.top;
> -#endif
>  }
>
>  #endif /* _RTE_STACK_LF_C11_H_ */
> diff --git a/lib/librte_stack/rte_stack_lf_generic.h b/lib/librte_stack/rte_stack_lf_generic.h
> index 3182151..3abbb53 100644
> --- a/lib/librte_stack/rte_stack_lf_generic.h
> +++ b/lib/librte_stack/rte_stack_lf_generic.h
> @@ -36,12 +36,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
>                           struct rte_stack_lf_elem *last,
>                           unsigned int num)
>  {
> -#ifndef RTE_ARCH_X86_64
> -       RTE_SET_USED(first);
> -       RTE_SET_USED(last);
> -       RTE_SET_USED(list);
> -       RTE_SET_USED(num);
> -#else
>         struct rte_stack_lf_head old_head;
>         int success;
>
> @@ -75,7 +69,6 @@ __rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
>         } while (success == 0);
>
>         rte_atomic64_add((rte_atomic64_t *)&list->len, num);
> -#endif
>  }
>
>  static __rte_always_inline struct rte_stack_lf_elem *
> @@ -84,14 +77,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>                          void **obj_table,
>                          struct rte_stack_lf_elem **last)
>  {
> -#ifndef RTE_ARCH_X86_64
> -       RTE_SET_USED(obj_table);
> -       RTE_SET_USED(last);
> -       RTE_SET_USED(list);
> -       RTE_SET_USED(num);
> -
> -       return NULL;
> -#else
>         struct rte_stack_lf_head old_head;
>         int success;
>
> @@ -159,7 +144,6 @@ __rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
>         } while (success == 0);
>
>         return old_head.top;
> -#endif
>  }
>
>  #endif /* _RTE_STACK_LF_GENERIC_H_ */
> diff --git a/lib/librte_stack/rte_stack_lf_stubs.h b/lib/librte_stack/rte_stack_lf_stubs.h
> new file mode 100644
> index 0000000..a05abf1
> --- /dev/null
> +++ b/lib/librte_stack/rte_stack_lf_stubs.h
> @@ -0,0 +1,44 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2019 Arm Limited
> + */
> +
> +#ifndef _RTE_STACK_LF_STUBS_H_
> +#define _RTE_STACK_LF_STUBS_H_
> +
> +#include <rte_common.h>
> +
> +static __rte_always_inline unsigned int
> +__rte_stack_lf_count(struct rte_stack *s)
> +{
> +       RTE_SET_USED(s);
> +
> +       return 0;
> +}
> +
> +static __rte_always_inline void
> +__rte_stack_lf_push_elems(struct rte_stack_lf_list *list,
> +                         struct rte_stack_lf_elem *first,
> +                         struct rte_stack_lf_elem *last,
> +                         unsigned int num)
> +{
> +       RTE_SET_USED(first);
> +       RTE_SET_USED(last);
> +       RTE_SET_USED(list);
> +       RTE_SET_USED(num);
> +}
> +
> +static __rte_always_inline struct rte_stack_lf_elem *
> +__rte_stack_lf_pop_elems(struct rte_stack_lf_list *list,
> +                        unsigned int num,
> +                        void **obj_table,
> +                        struct rte_stack_lf_elem **last)
> +{
> +       RTE_SET_USED(obj_table);
> +       RTE_SET_USED(last);
> +       RTE_SET_USED(list);
> +       RTE_SET_USED(num);
> +
> +       return NULL;
> +}
> +
> +#endif /* _RTE_STACK_LF_STUBS_H_ */
> --
> 2.7.4
>

Applied, thanks.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-08-14 10:24 ` Phil Yang (Arm Technology China)
@ 2019-08-14 12:40   ` Jerin Jacob Kollanukkaran
  0 siblings, 0 replies; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-08-14 12:40 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China), thomas, gage.eads, dev
  Cc: hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>
> Sent: Wednesday, August 14, 2019 3:55 PM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; thomas@monjalon.net;
> gage.eads@intel.com; dev@dpdk.org
> Cc: hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: [EXT] RE: [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> 
> External Email
> 
> ----------------------------------------------------------------------
> > -----Original Message-----
> > From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> > Sent: Wednesday, August 14, 2019 4:46 PM
> > To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>;
> > thomas@monjalon.net; gage.eads@intel.com; dev@dpdk.org
> > Cc: hemant.agrawal@nxp.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > <Gavin.Hu@arm.com>; nd <nd@arm.com>
> > Subject: RE: [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare
> > exchange
> >
> > > -----Original Message-----
> > > From: Phil Yang <phil.yang@arm.com>
> > > Sent: Wednesday, August 14, 2019 1:58 PM
> > > To: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>;
> > > gage.eads@intel.com; dev@dpdk.org
> > > Cc: hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> > > gavin.hu@arm.com; nd@arm.com
> > > Subject: [EXT] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare
> > > exchange
> > > +#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> > > +__ATOMIC_RELEASE) #define __HAS_RLS(mo) ((mo) ==
> > > __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
> > > +					  (mo) == __ATOMIC_SEQ_CST)
> > > +
> > > +#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE :
> > > +__ATOMIC_RELAXED) #define __MO_STORE(mo) (__HAS_RLS((mo)) ?
> > > +__ATOMIC_RELEASE : __ATOMIC_RELAXED)
> > > +
> > > +#if defined(__ARM_FEATURE_ATOMICS) ||
> > > defined(RTE_ARM_FEATURE_ATOMICS)
> > > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)
> \
> > > +static __rte_noinline rte_int128_t                                          \
> >
> >
> > Could you check the cost of making it as __rte_noinline?
> > If it is costly, How about having two versions, one with
> > __rte_noinline to make compliance with arm64 procedure call standard
> > for old gcc and clang.
> > Other one without explicit register hardcoding + inline for latest gcc
> 
> Hi Jerin,

Hi Phil Yang,

> According to the stack_lf_perf_autotest, making it as __rte_noinline has no
> overhead on ThunderX2 with GCC 8.3.
> The 'Average cycles per object push/pop' numbers for __rte_noinline and
> __rte_always_inline versions are nearly the same.

I tested with octeontx2 as well. It yields similar results.
No change is needed in this patch then.


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
  2019-08-14  8:45 [dpdk-dev] [PATCH v9 " Jerin Jacob Kollanukkaran
@ 2019-08-14 10:24 ` Phil Yang (Arm Technology China)
  2019-08-14 12:40   ` Jerin Jacob Kollanukkaran
  0 siblings, 1 reply; 91+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-08-14 10:24 UTC (permalink / raw)
  To: jerinj, thomas, gage.eads, dev
  Cc: hemant.agrawal, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China),
	nd, nd

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Wednesday, August 14, 2019 4:46 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>;
> thomas@monjalon.net; gage.eads@intel.com; dev@dpdk.org
> Cc: hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
> 
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Wednesday, August 14, 2019 1:58 PM
> > To: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>;
> > gage.eads@intel.com; dev@dpdk.org
> > Cc: hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> > gavin.hu@arm.com; nd@arm.com
> > Subject: [EXT] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare
> > exchange
> > +#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> > +__ATOMIC_RELEASE) #define __HAS_RLS(mo) ((mo) ==
> > __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
> > +					  (mo) == __ATOMIC_SEQ_CST)
> > +
> > +#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE :
> > +__ATOMIC_RELAXED) #define __MO_STORE(mo) (__HAS_RLS((mo)) ?
> > +__ATOMIC_RELEASE : __ATOMIC_RELAXED)
> > +
> > +#if defined(__ARM_FEATURE_ATOMICS) ||
> > defined(RTE_ARM_FEATURE_ATOMICS)
> > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> > +static __rte_noinline rte_int128_t                                          \
> 
> 
> Could you check the cost of making it as __rte_noinline?
> If it is costly, How about having two versions, one with __rte_noinline
> to make compliance with arm64 procedure call standard for
> old gcc and clang.
> Other one without explicit register hardcoding + inline for latest
> gcc

Hi Jerin,

According to the stack_lf_perf_autotest, marking it as __rte_noinline adds no overhead on ThunderX2 with GCC 8.3.
The 'Average cycles per object push/pop' numbers for the __rte_noinline and __rte_always_inline versions are nearly the same.

Test results :
###### Two NUMA Node ######
#### __rte_noinline ####

RTE>>stack_lf_perf_autotest
<snip>
### Testing using two NUMA nodes ###
Average cycles per object push/pop (bulk size: 8): 24.10
Average cycles per object push/pop (bulk size: 32): 6.85

### Testing on all 18 lcores ###
Average cycles per object push/pop (bulk size: 8): 680.39
Average cycles per object push/pop (bulk size: 32): 146.38
Test OK

#### __rte_always_inline ####
RTE>>stack_lf_perf_autotest
<snip>
### Testing using two NUMA nodes ###
Average cycles per object push/pop (bulk size: 8): 24.29
Average cycles per object push/pop (bulk size: 32): 6.92

### Testing on all 18 lcores ###
Average cycles per object push/pop (bulk size: 8): 683.92
Average cycles per object push/pop (bulk size: 32): 145.11
Test OK

###### Single NUMA ######
#### __rte_always_inline ####

RTE>>stack_lf_perf_autotest
<snip>
### Testing on all 18 lcores ###
Average cycles per object push/pop (bulk size: 8): 582.92
Average cycles per object push/pop (bulk size: 32): 125.57
Test OK
#### __rte_noinline ####

RTE>>stack_lf_perf_autotest
<snip>
### Testing on all 18 lcores ###
Average cycles per object push/pop (bulk size: 8): 537.56
Average cycles per object push/pop (bulk size: 32): 122.98
Test OK

Thanks,
Phil Yang

> 
> 
> > +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> > +		rte_int128_t updated)                                       \
> > +{                                                                           \
> > +	/* caspX instructions register pair must start from even-numbered
> > +	 * register at operand 1.
> > +	 * So, specify registers for local variables here.
> > +	 */                                                                 \
> > +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
> > +	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
> > +	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
> > +	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
> > +	asm volatile(                                                       \
> > +		op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
> > +		: [old0] "+r" (x0),                                         \
> > +		[old1] "+r" (x1)                                            \
> > +		: [upd0] "r" (x2),                                          \
> > +		[upd1] "r" (x3),                                            \
> > +		[dst] "r" (dst)                                             \
> > +		: "memory");                                                \
> > +	old.val[0] = x0;                                                    \
> > +	old.val[1] = x1;                                                    \
> > +	return old;                                                         \
> > +}
> > +

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange
@ 2019-08-14  8:45 " Jerin Jacob Kollanukkaran
  2019-08-14 10:24 ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 91+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-08-14  8:45 UTC (permalink / raw)
  To: Phil Yang, thomas, gage.eads, dev
  Cc: hemant.agrawal, Honnappa.Nagarahalli, gavin.hu, nd

> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Wednesday, August 14, 2019 1:58 PM
> To: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> gage.eads@intel.com; dev@dpdk.org
> Cc: hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> gavin.hu@arm.com; nd@arm.com
> Subject: [EXT] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare
> exchange
> +#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) !=
> +__ATOMIC_RELEASE) #define __HAS_RLS(mo) ((mo) ==
> __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
> +					  (mo) == __ATOMIC_SEQ_CST)
> +
> +#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE :
> +__ATOMIC_RELAXED) #define __MO_STORE(mo) (__HAS_RLS((mo)) ?
> +__ATOMIC_RELEASE : __ATOMIC_RELAXED)
> +
> +#if defined(__ARM_FEATURE_ATOMICS) ||
> defined(RTE_ARM_FEATURE_ATOMICS)
> +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> +static __rte_noinline rte_int128_t                                          \


Could you check the cost of marking it as __rte_noinline?
If it is costly, how about having two versions: one with __rte_noinline,
to comply with the arm64 procedure call standard on
old gcc and clang, and
another without explicit register hardcoding, plus inline, for the latest
gcc?


> +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> +		rte_int128_t updated)                                       \
> +{                                                                           \
> +	/* caspX instructions register pair must start from even-numbered
> +	 * register at operand 1.
> +	 * So, specify registers for local variables here.
> +	 */                                                                 \
> +	register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
> +	register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
> +	register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
> +	register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
> +	asm volatile(                                                       \
> +		op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
> +		: [old0] "+r" (x0),                                         \
> +		[old1] "+r" (x1)                                            \
> +		: [upd0] "r" (x2),                                          \
> +		[upd1] "r" (x3),                                            \
> +		[dst] "r" (dst)                                             \
> +		: "memory");                                                \
> +	old.val[0] = x0;                                                    \
> +	old.val[1] = x1;                                                    \
> +	return old;                                                         \
> +}
> +

^ permalink raw reply	[flat|nested] 91+ messages in thread

end of thread, back to index

Thread overview: 91+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-23  2:41 [dpdk-dev] [PATCH v1 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
2019-06-23  2:41 ` [dpdk-dev] [PATCH v1 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-06-23  2:41 ` [dpdk-dev] [PATCH v1 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-06-23  3:15 ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-06-24 15:09     ` Eads, Gage
2019-06-24 15:29       ` Phil Yang (Arm Technology China)
2019-06-23  3:15   ` [dpdk-dev] [PATCH v2 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-06-24 15:15     ` Eads, Gage
2019-06-24 15:22       ` Phil Yang (Arm Technology China)
2019-06-24 14:46   ` [dpdk-dev] [PATCH v2 1/3] eal/arm64: add 128-bit atomic compare exchange Eads, Gage
2019-06-24 15:35     ` Phil Yang (Arm Technology China)
2019-06-28  8:11 ` [dpdk-dev] [PATCH v3 " Phil Yang
2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-06-29  0:17     ` Eads, Gage
2019-07-19  4:03     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
2019-06-28  8:11   ` [dpdk-dev] [PATCH v3 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-06-29  0:18     ` Eads, Gage
2019-07-19  4:18     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
2019-07-19  4:42       ` Eads, Gage
2019-07-19  5:02         ` Jerin Jacob Kollanukkaran
2019-07-19  5:15           ` Phil Yang (Arm Technology China)
2019-07-03 12:25   ` [dpdk-dev] [EXT] [PATCH v3 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
2019-07-03 13:07     ` Jerin Jacob Kollanukkaran
2019-07-05  4:20       ` Honnappa Nagarahalli
2019-07-05  4:37         ` Pavan Nikhilesh Bhagavatula
2019-07-09  9:27           ` Phil Yang (Arm Technology China)
2019-07-09 11:14             ` Jerin Jacob Kollanukkaran
2019-07-19  6:24   ` Jerin Jacob Kollanukkaran
2019-07-19 11:01     ` Phil Yang (Arm Technology China)
2019-07-19 12:35       ` Jerin Jacob Kollanukkaran
2019-07-19 13:56         ` Phil Yang (Arm Technology China)
2019-07-19 14:50           ` Eads, Gage
2019-07-22  8:44 ` [dpdk-dev] [PATCH v4 " Phil Yang
2019-07-22  8:44   ` [dpdk-dev] [PATCH v4 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-07-22  8:44   ` [dpdk-dev] [PATCH v4 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-07-22 10:22     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
2019-07-22 11:51       ` Phil Yang (Arm Technology China)
2019-07-22 10:20   ` [dpdk-dev] [EXT] [PATCH v4 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
2019-07-22 11:50     ` Phil Yang (Arm Technology China)
2019-07-22 13:06 ` [dpdk-dev] [PATCH v5 " Phil Yang
2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-07-22 13:06   ` [dpdk-dev] [PATCH v5 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-07-22 14:14     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
2019-07-22 15:19       ` Phil Yang (Arm Technology China)
2019-07-22 14:34     ` [dpdk-dev] " Eads, Gage
2019-07-22 14:43       ` Phil Yang (Arm Technology China)
2019-07-22 14:19   ` [dpdk-dev] [EXT] [PATCH v5 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
2019-07-22 16:23     ` Phil Yang (Arm Technology China)
2019-07-22 16:22 ` [dpdk-dev] [PATCH v6 " Phil Yang
2019-07-22 16:22   ` [dpdk-dev] [PATCH v6 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-07-22 16:22   ` [dpdk-dev] [PATCH v6 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-07-22 16:59     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
2019-07-22 16:57   ` [dpdk-dev] [EXT] [PATCH v6 1/3] eal/arm64: add 128-bit atomic compare exchange Jerin Jacob Kollanukkaran
2019-07-23  3:28     ` Phil Yang (Arm Technology China)
2019-07-23  7:09       ` Jerin Jacob Kollanukkaran
2019-07-23  7:53         ` Phil Yang (Arm Technology China)
2019-07-23  5:57 ` [dpdk-dev] [PATCH v7 " Phil Yang
2019-07-23  5:57   ` [dpdk-dev] [PATCH v7 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-07-23  5:57   ` [dpdk-dev] [PATCH v7 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-07-23  7:05   ` [dpdk-dev] [PATCH v8 1/3] eal/arm64: add 128-bit atomic compare exchange jerinj
2019-07-23  7:05     ` [dpdk-dev] [PATCH v8 2/3] test/atomic: add 128b compare and swap test jerinj
2019-07-23  7:05     ` [dpdk-dev] [PATCH v8 3/3] eal/stack: enable lock-free stack for aarch64 jerinj
2019-08-14  8:27     ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-10-14 15:45         ` David Marchand
2019-10-15 11:32           ` Phil Yang (Arm Technology China)
2019-08-14  8:27       ` [dpdk-dev] [PATCH v9 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-10-14 15:45         ` David Marchand
2019-10-15 11:32           ` Phil Yang (Arm Technology China)
2019-10-14 15:43       ` [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange David Marchand
2019-10-15 11:32         ` Phil Yang (Arm Technology China)
2019-10-15 12:16           ` David Marchand
2019-10-16  9:04             ` Phil Yang (Arm Technology China)
2019-10-17 12:45               ` David Marchand
2019-10-15 11:38       ` [dpdk-dev] [PATCH v10 " Phil Yang
2019-10-15 11:38         ` [dpdk-dev] [PATCH v10 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-10-15 11:38         ` [dpdk-dev] [PATCH v10 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-10-18 11:21         ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange Phil Yang
2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 2/3] test/atomic: add 128b compare and swap test Phil Yang
2019-10-21  8:25             ` David Marchand
2019-10-18 11:21           ` [dpdk-dev] [PATCH v11 3/3] eal/stack: enable lock-free stack for aarch64 Phil Yang
2019-10-21  8:26             ` David Marchand
2019-10-18 14:16           ` [dpdk-dev] [PATCH v11 1/3] eal/arm64: add 128-bit atomic compare exchange David Marchand
2019-10-18 14:24             ` Jerin Jacob
2019-10-18 14:33               ` David Marchand
2019-10-18 14:36                 ` Jerin Jacob
2019-10-21  8:24                   ` David Marchand
2019-08-14  8:45 [dpdk-dev] [PATCH v9 " Jerin Jacob Kollanukkaran
2019-08-14 10:24 ` Phil Yang (Arm Technology China)
2019-08-14 12:40   ` Jerin Jacob Kollanukkaran

DPDK patches and discussions

Archives are clonable:
	git clone --mirror http://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ http://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev


Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/ public-inbox