DPDK patches and discussions
* [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme
@ 2021-09-02  5:32 Feifei Wang
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 1/5] eal: " Feifei Wang
                   ` (7 more replies)
  0 siblings, 8 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-02  5:32 UTC (permalink / raw)
  Cc: dev, nd, Feifei Wang

Add new APIs for the wait_until scheme, and apply these new APIs in
libraries to replace rte_pause.

Feifei Wang (5):
  eal: add new API for wait until scheme
  eal: use wait until scheme for read pflock
  eal: use wait until scheme for mcslock
  lib/bpf: use wait until scheme for Rx/Tx iteration
  lib/distributor: use wait until scheme

 lib/bpf/bpf_pkt.c                        |  11 +-
 lib/distributor/rte_distributor_single.c |  10 +-
 lib/eal/arm/include/rte_pause_64.h       | 271 ++++++++++++++++----
 lib/eal/include/generic/rte_mcslock.h    |   9 +-
 lib/eal/include/generic/rte_pause.h      | 309 +++++++++++++++++++++++
 lib/eal/include/generic/rte_pflock.h     |   5 +-
 6 files changed, 543 insertions(+), 72 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v1 1/5] eal: add new API for wait until scheme
  2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
@ 2021-09-02  5:32 ` Feifei Wang
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 2/5] eal: use wait until scheme for read pflock Feifei Wang
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-02  5:32 UTC (permalink / raw)
  To: Ruifeng Wang; +Cc: dev, nd, Feifei Wang

For the 'wait until' scheme, add new APIs to cover more cases:
1. add wait_until_unequal APIs
2. add wait_until_part_equal APIs
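
For illustration, a caller that currently open-codes a polling loop could use
the new helpers roughly as below. This is only a sketch against the signatures
added by this patch; the status-word layout and the helper names are made up
for the example:

#include <stdint.h>
#include <rte_pause.h>

#define WRITER_BIT 0x0001 /* hypothetical: bit 0 marks an active writer */

static inline void
wait_writer_done(volatile uint16_t *status)
{
	/* Instead of:
	 *   while ((__atomic_load_n(status, __ATOMIC_ACQUIRE) & WRITER_BIT) != 0)
	 *           rte_pause();
	 * wait until the masked bit equals 0; on aarch64 this can map to WFE
	 * instead of busy polling.
	 */
	rte_wait_until_part_equal_16(status, WRITER_BIT, 0, __ATOMIC_ACQUIRE);
}

static inline void
wait_value_change(volatile uint32_t *counter, uint32_t seen)
{
	/* Wait until *counter no longer holds the previously observed value. */
	rte_wait_until_unequal_32(counter, seen, __ATOMIC_RELAXED);
}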

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/arm/include/rte_pause_64.h  | 271 +++++++++++++++++++-----
 lib/eal/include/generic/rte_pause.h | 309 ++++++++++++++++++++++++++++
 2 files changed, 526 insertions(+), 54 deletions(-)

diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
index e87d10b8cc..19716276fc 100644
--- a/lib/eal/arm/include/rte_pause_64.h
+++ b/lib/eal/arm/include/rte_pause_64.h
@@ -31,20 +31,12 @@ static inline void rte_pause(void)
 /* Put processor into low power WFE(Wait For Event) state. */
 #define __WFE() { asm volatile("wfe" : : : "memory"); }
 
-static __rte_always_inline void
-rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
-		int memorder)
-{
-	uint16_t value;
-
-	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
-
-	/*
-	 * Atomic exclusive load from addr, it returns the 16-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
+/*
+ * Atomic exclusive load from addr, it returns the 16-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
 #define __LOAD_EXC_16(src, dst, memorder) {               \
 	if (memorder == __ATOMIC_RELAXED) {               \
 		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
@@ -58,6 +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			: "memory");                      \
 	} }
 
+/*
+ * Atomic exclusive load from addr, it returns the 32-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_32(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+/*
+ * Atomic exclusive load from addr, it returns the 64-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_64(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+		int memorder)
+{
+	uint16_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
 	__LOAD_EXC_16(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			__LOAD_EXC_16(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_16
 }
 
 static __rte_always_inline void
@@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 32-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_32(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_32(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 			__LOAD_EXC_32(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_32
 }
 
 static __rte_always_inline void
@@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 64-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_64(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_64(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -143,6 +141,171 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 		} while (value != expected);
 	}
 }
+
+static __rte_always_inline void
+rte_wait_until_part_equal_16(volatile uint16_t *addr, uint16_t mask,
+		uint16_t expected, int memorder)
+{
+	uint16_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_16(addr, value, memorder)
+	if ((value & mask) != expected) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_16(addr, value, memorder)
+		} while ((value & mask) != expected);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_part_equal_32(volatile uint32_t *addr, uint32_t mask,
+		uint32_t expected, int memorder)
+{
+	uint32_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_32(addr, value, memorder)
+	if ((value & mask) != expected) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_32(addr, value, memorder)
+		} while ((value & mask) != expected);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_part_equal_64(volatile uint64_t *addr, uint64_t mask,
+		uint64_t expected, int memorder)
+{
+	uint64_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_64(addr, value, memorder)
+	if ((value & mask) != expected) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_64(addr, value, memorder)
+		} while ((value & mask) != expected);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_unequal_16(volatile uint16_t *addr, uint16_t original,
+		int memorder)
+{
+	uint16_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_16(addr, value, memorder)
+	if (value == original) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_16(addr, value, memorder)
+		} while (value == original);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_unequal_32(volatile uint32_t *addr, uint32_t original,
+		int memorder)
+{
+	uint32_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_32(addr, value, memorder)
+	if (value == original) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_32(addr, value, memorder)
+		} while (value == original);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_unequal_64(volatile uint64_t *addr, uint64_t original,
+		int memorder)
+{
+	uint64_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_64(addr, value, memorder)
+	if (value == original) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_64(addr, value, memorder)
+		} while (value == original);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_part_unequal_16(volatile uint16_t *addr, uint16_t mask,
+		uint16_t original, int memorder)
+{
+	uint16_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_16(addr, value, memorder)
+	if ((value & mask) == original) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_16(addr, value, memorder)
+		} while ((value & mask) == original);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_part_unequal_32(volatile uint32_t *addr, uint32_t mask,
+		uint32_t original, int memorder)
+{
+	uint32_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_32(addr, value, memorder)
+	if ((value & mask) == original) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_32(addr, value, memorder)
+		} while ((value & mask) == original);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_part_unequal_64(volatile uint64_t *addr, uint64_t mask,
+		uint64_t original, int memorder)
+{
+	uint64_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	__LOAD_EXC_64(addr, value, memorder)
+	if ((value & mask) == original) {
+		__SEVL()
+		do {
+			__WFE()
+			__LOAD_EXC_64(addr, value, memorder)
+		} while ((value & mask) == original);
+	}
+}
+
+#undef __LOAD_EXC_16
+#undef __LOAD_EXC_32
 #undef __LOAD_EXC_64
 
 #undef __SEVL
diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
index 668ee4a184..943a886f01 100644
--- a/lib/eal/include/generic/rte_pause.h
+++ b/lib/eal/include/generic/rte_pause.h
@@ -81,6 +81,222 @@ static __rte_always_inline void
 rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 		int memorder);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for the masked bits of *addr to equal a 16-bit expected value, with
+ * a relaxed memory ordering model meaning the loads around this API can be
+ * reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of the value bits of interest.
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_part_equal_16(volatile uint16_t *addr, uint16_t mask,
+		uint16_t expected, int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for the masked bits of *addr to equal a 32-bit expected value, with
+ * a relaxed memory ordering model meaning the loads around this API can be
+ * reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of the value bits of interest.
+ * @param expected
+ *  A 32-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_part_equal_32(volatile uint32_t *addr, uint32_t mask,
+		uint32_t expected, int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for the masked bits of *addr to equal a 64-bit expected value, with
+ * a relaxed memory ordering model meaning the loads around this API can be
+ * reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of the value bits of interest.
+ * @param expected
+ *  A 64-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_part_equal_64(volatile uint64_t *addr, uint64_t mask,
+		uint64_t expected, int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for *addr to become unequal to a 16-bit original value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param original
+ *  A 16-bit original value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_unequal_16(volatile uint16_t *addr, uint16_t original,
+		int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for *addr to become unequal to a 32-bit original value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param original
+ *  A 32-bit original value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_unequal_32(volatile uint32_t *addr, uint32_t original,
+		int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for *addr to become unequal to a 64-bit original value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param original
+ *  A 64-bit original value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_unequal_64(volatile uint64_t *addr, uint64_t original,
+		int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for the masked bits of *addr to become unequal to a 16-bit original
+ * value, with a relaxed memory ordering model meaning the loads around this
+ * API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of the value bits of interest.
+ * @param original
+ *  A 16-bit original value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_part_unequal_16(volatile uint16_t *addr, uint16_t mask,
+		uint16_t original, int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for the masked bits of *addr to become unequal to a 32-bit original
+ * value, with a relaxed memory ordering model meaning the loads around this
+ * API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of the value bits of interest.
+ * @param original
+ *  A 32-bit original value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_part_unequal_32(volatile uint32_t *addr, uint32_t mask,
+		uint32_t original, int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wait for the masked bits of *addr to become unequal to a 64-bit original
+ * value, with a relaxed memory ordering model meaning the loads around this
+ * API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of the value bits of interest.
+ * @param original
+ *  A 64-bit original value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_part_unequal_64(volatile uint64_t *addr, uint64_t mask,
+		uint64_t original, int memorder);
+
 #ifndef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
 static __rte_always_inline void
 rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
@@ -111,6 +327,99 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 	while (__atomic_load_n(addr, memorder) != expected)
 		rte_pause();
 }
+
+static __rte_always_inline void
+rte_wait_until_part_equal_16(volatile uint16_t *addr, uint16_t mask,
+		uint16_t expected, int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while ((__atomic_load_n(addr, memorder) & mask) != expected)
+		rte_pause();
+
+}
+
+static __rte_always_inline void
+rte_wait_until_part_equal_32(volatile uint32_t *addr, uint32_t mask,
+		uint32_t expected, int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while ((__atomic_load_n(addr, memorder) & mask) != expected)
+		rte_pause();
+
+}
+
+static __rte_always_inline void
+rte_wait_until_part_equal_64(volatile uint64_t *addr, uint64_t mask,
+		uint64_t expected, int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while ((__atomic_load_n(addr, memorder) & mask) != expected)
+		rte_pause();
+
+}
+
+static __rte_always_inline void
+rte_wait_until_unequal_16(volatile uint16_t *addr, uint16_t original,
+		int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while (__atomic_load_n(addr, memorder) == original)
+		rte_pause();
+}
+
+static __rte_always_inline void
+rte_wait_until_unequal_32(volatile uint32_t *addr, uint32_t original,
+		int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while (__atomic_load_n(addr, memorder) == original)
+		rte_pause();
+}
+
+static __rte_always_inline void
+rte_wait_until_unequal_64(volatile uint64_t *addr, uint64_t original,
+		int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while (__atomic_load_n(addr, memorder) == original)
+		rte_pause();
+}
+
+static __rte_always_inline void
+rte_wait_until_part_unequal_16(volatile uint16_t *addr, uint16_t mask,
+		uint16_t original, int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while ((__atomic_load_n(addr, memorder) & mask) == original)
+		rte_pause();
+}
+
+static __rte_always_inline void
+rte_wait_until_part_unequal_32(volatile uint32_t *addr, uint32_t mask,
+		uint32_t original, int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while ((__atomic_load_n(addr, memorder) & mask) == original)
+		rte_pause();
+}
+
+static __rte_always_inline void
+rte_wait_until_part_unequal_64(volatile uint64_t *addr, uint64_t mask,
+		uint64_t original, int memorder)
+{
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
+	while ((__atomic_load_n(addr, memorder) & mask) == original)
+		rte_pause();
+}
 #endif
 
 #endif /* _RTE_PAUSE_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v1 2/5] eal: use wait until scheme for read pflock
  2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 1/5] eal: " Feifei Wang
@ 2021-09-02  5:32 ` Feifei Wang
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 3/5] eal: use wait until scheme for mcslock Feifei Wang
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-02  5:32 UTC (permalink / raw)
  Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for the read pflock to be updated, use
wait_until_part_unequal for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/include/generic/rte_pflock.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/eal/include/generic/rte_pflock.h b/lib/eal/include/generic/rte_pflock.h
index e57c179ef2..5298dec7b8 100644
--- a/lib/eal/include/generic/rte_pflock.h
+++ b/lib/eal/include/generic/rte_pflock.h
@@ -121,9 +121,8 @@ rte_pflock_read_lock(rte_pflock_t *pf)
 		return;
 
 	/* Wait for current write phase to complete. */
-	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE)
-		& RTE_PFLOCK_WBITS) == w)
-		rte_pause();
+	rte_wait_until_part_unequal_16(&pf->rd.in,
+			RTE_PFLOCK_WBITS, w, __ATOMIC_ACQUIRE);
 }
 
 /**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v1 3/5] eal: use wait until scheme for mcslock
  2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 1/5] eal: " Feifei Wang
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 2/5] eal: use wait until scheme for read pflock Feifei Wang
@ 2021-09-02  5:32 ` Feifei Wang
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 4/5] lib/bpf: use wait until scheme for Rx/Tx iteration Feifei Wang
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-02  5:32 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for mcslock to be updated, use wait_until_unequal
for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/include/generic/rte_mcslock.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/lib/eal/include/generic/rte_mcslock.h b/lib/eal/include/generic/rte_mcslock.h
index 9f323bd2a2..dabad0d4e0 100644
--- a/lib/eal/include/generic/rte_mcslock.h
+++ b/lib/eal/include/generic/rte_mcslock.h
@@ -117,8 +117,13 @@ rte_mcslock_unlock(rte_mcslock_t **msl, rte_mcslock_t *me)
 		/* More nodes added to the queue by other CPUs.
 		 * Wait until the next pointer is set.
 		 */
-		while (__atomic_load_n(&me->next, __ATOMIC_RELAXED) == NULL)
-			rte_pause();
+#ifdef RTE_ARCH_32
+		rte_wait_until_unequal_32((volatile uint32_t *)&me->next,
+				0, __ATOMIC_RELAXED);
+#else
+		rte_wait_until_unequal_64((volatile uint64_t *)&me->next,
+				0, __ATOMIC_RELAXED);
+#endif
 	}
 
 	/* Pass lock to next waiter. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v1 4/5] lib/bpf: use wait until scheme for Rx/Tx iteration
  2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
                   ` (2 preceding siblings ...)
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 3/5] eal: use wait until scheme for mcslock Feifei Wang
@ 2021-09-02  5:32 ` Feifei Wang
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 5/5] lib/distributor: use wait until scheme Feifei Wang
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-02  5:32 UTC (permalink / raw)
  To: Konstantin Ananyev, Ferruh Yigit
  Cc: dev, nd, Feifei Wang, stable, Ruifeng Wang

First, fix the bug that the keyword const of the function argument should
be after "*". This is because const before "*" means the value of "cbi"
should not be changed. But we should monitor that cbi->use has changed so
that we can jump out of the loop.

Second, instead of polling for cbi->use to be updated, use the
wait_until_unequal API.
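
For reference, the two const placements discussed above differ as shown in
this standalone illustration (not taken from the patch):

struct bpf_eth_cbi;	/* opaque here; only the two declarations matter */

/* const before '*': the object *cbi must not be written through this
 * pointer inside the function.
 */
void cbi_wait_a(const struct bpf_eth_cbi *cbi);

/* const after '*': the pointer 'cbi' itself cannot be reassigned, but the
 * object it points to may be written through it.
 */
void cbi_wait_b(struct bpf_eth_cbi *const cbi);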

Fixes: a93ff62a8938 ("bpf: introduce basic Rx/Tx filters")
Cc: konstantin.ananyev@intel.com
Cc: stable@dpdk.org

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/bpf/bpf_pkt.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
index 6e8248f0d6..ed63e00219 100644
--- a/lib/bpf/bpf_pkt.c
+++ b/lib/bpf/bpf_pkt.c
@@ -111,9 +111,9 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
  * Waits till datapath finished using given callback.
  */
 static void
-bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
+bpf_eth_cbi_wait(struct bpf_eth_cbi *const cbi)
 {
-	uint32_t nuse, puse;
+	uint32_t puse;
 
 	/* make sure all previous loads and stores are completed */
 	rte_smp_mb();
@@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
 
 	/* in use, busy wait till current RX/TX iteration is finished */
 	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
-		do {
-			rte_pause();
-			rte_compiler_barrier();
-			nuse = cbi->use;
-		} while (nuse == puse);
+		rte_compiler_barrier();
+		rte_wait_until_unequal_32(&cbi->use, puse, __ATOMIC_RELAXED);
 	}
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v1 5/5] lib/distributor: use wait until scheme
  2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
                   ` (3 preceding siblings ...)
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 4/5] lib/bpf: use wait until scheme for Rx/Tx iteration Feifei Wang
@ 2021-09-02  5:32 ` Feifei Wang
  2021-09-02 15:22 ` [dpdk-dev] [RFC PATCH v1 0/5] add new API for " Stephen Hemminger
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-02  5:32 UTC (permalink / raw)
  To: David Hunt; +Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for bufptr64 to be updated, use
rte_wait_until_part_equal for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/distributor/rte_distributor_single.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/lib/distributor/rte_distributor_single.c b/lib/distributor/rte_distributor_single.c
index f4725b1d0b..95de42f41a 100644
--- a/lib/distributor/rte_distributor_single.c
+++ b/lib/distributor/rte_distributor_single.c
@@ -33,9 +33,8 @@ rte_distributor_request_pkt_single(struct rte_distributor_single *d,
 	union rte_distributor_buffer_single *buf = &d->bufs[worker_id];
 	int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
 			| RTE_DISTRIB_GET_BUF;
-	while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
-			& RTE_DISTRIB_FLAGS_MASK))
-		rte_pause();
+	rte_wait_until_part_equal_64((volatile uint64_t *)&buf->bufptr64,
+			RTE_DISTRIB_FLAGS_MASK, 0, __ATOMIC_RELAXED);
 
 	/* Sync with distributor on GET_BUF flag. */
 	__atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
@@ -74,9 +73,8 @@ rte_distributor_return_pkt_single(struct rte_distributor_single *d,
 	union rte_distributor_buffer_single *buf = &d->bufs[worker_id];
 	uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
 			| RTE_DISTRIB_RETURN_BUF;
-	while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
-			& RTE_DISTRIB_FLAGS_MASK))
-		rte_pause();
+	rte_wait_until_part_equal_64((volatile uint64_t *)&buf->bufptr64,
+			RTE_DISTRIB_FLAGS_MASK, 0, __ATOMIC_RELAXED);
 
 	/* Sync with distributor on RETURN_BUF flag. */
 	__atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme
  2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
                   ` (4 preceding siblings ...)
  2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 5/5] lib/distributor: use wait until scheme Feifei Wang
@ 2021-09-02 15:22 ` Stephen Hemminger
  2021-09-03  7:02   ` [dpdk-dev] Re: " Feifei Wang
  2021-09-23  9:58 ` [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme Feifei Wang
  2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
  7 siblings, 1 reply; 42+ messages in thread
From: Stephen Hemminger @ 2021-09-02 15:22 UTC (permalink / raw)
  To: Feifei Wang; +Cc: dev, nd

On Thu,  2 Sep 2021 13:32:48 +0800
Feifei Wang <feifei.wang2@arm.com> wrote:

> Add new APIs for the wait_until scheme, and apply these new APIs in
> libraries to replace rte_pause.
> 
> Feifei Wang (5):
>   eal: add new API for wait until scheme
>   eal: use wait until scheme for read pflock
>   eal: use wait until scheme for mcslock
>   lib/bpf: use wait until scheme for Rx/Tx iteration
>   lib/distributor: use wait until scheme
> 
>  lib/bpf/bpf_pkt.c                        |  11 +-
>  lib/distributor/rte_distributor_single.c |  10 +-
>  lib/eal/arm/include/rte_pause_64.h       | 271 ++++++++++++++++----
>  lib/eal/include/generic/rte_mcslock.h    |   9 +-
>  lib/eal/include/generic/rte_pause.h      | 309 +++++++++++++++++++++++
>  lib/eal/include/generic/rte_pflock.h     |   5 +-
>  6 files changed, 543 insertions(+), 72 deletions(-)
> 

Since these are all inline, would it be possible to make
this a macro and have the caller pass a condition function?

Look at Linux wait_event() for an example of that.
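
For context, the kernel's wait_event() evaluates an arbitrary condition
expression supplied by the caller. A rough user-space analogue of that shape,
purely illustrative and not something proposed in this thread (the
RTE_WAIT_EVENT name is invented), could look like:

/* Spin until "cond" (any expression visible at the call site) becomes true. */
#define RTE_WAIT_EVENT(cond)           \
	do {                           \
		while (!(cond))        \
			rte_pause();   \
	} while (0)

/* e.g. RTE_WAIT_EVENT(__atomic_load_n(&cbi->use, __ATOMIC_RELAXED) != puse); */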

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] Re: [RFC PATCH v1 0/5] add new API for wait until scheme
  2021-09-02 15:22 ` [dpdk-dev] [RFC PATCH v1 0/5] add new API for " Stephen Hemminger
@ 2021-09-03  7:02   ` Feifei Wang
  0 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-03  7:02 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, nd, nd

Hi, Stephen

Thanks for the review. I think it is a good suggestion.
Following the comment, we plan to change this API as follows:

#define wait_until_event_16(addr, mask, expected, op, memorder)
	uint16_t value;
	__LOAD_EXC_16(addr, value, memorder)
	if ((value & mask) op expected) {
		__SEVL()
		do {
			__WFE()
			__LOAD_EXC_16(addr, value, memorder)
		} while ((value & mask) op expected);
	}

1. According to the operand size, there will be three definitions: 16/32/64 bits.
2. 'op' is passed as a symbol (!= or ==); I'm not sure whether this is acceptable in DPDK (see the sketch below).
3. If the case is not 'wait_part_equal/unequal', mask can be 0xFF.
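
On point 2, passing a comparison operator as a macro argument is plain token
substitution for the preprocessor, so it compiles; whether it fits the DPDK
coding style is a separate question. A minimal sketch (the WAIT_EVENT name and
the example function are made up):

#include <stdint.h>
#include <rte_pause.h>

/* Spin while "(*addr & mask) op expected" holds; "==" or "!=" is pasted in
 * as "op" by the preprocessor. */
#define WAIT_EVENT(addr, mask, expected, op, memorder)          \
	do {                                                    \
		while ((__atomic_load_n(addr, memorder) & mask) \
				op expected)                    \
			rte_pause();                            \
	} while (0)

static inline void
example_wait_for_bit(volatile uint32_t *flag)
{
	/* Wait while bit 0 is still 0, i.e. until someone sets it. */
	WAIT_EVENT(flag, 0x1u, 0u, ==, __ATOMIC_ACQUIRE);
}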

Have you any more comments for this change?

Best Regards
Feifei
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Thursday, September 2, 2021 11:22 PM
> To: Feifei Wang <Feifei.Wang2@arm.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme
> 
> On Thu,  2 Sep 2021 13:32:48 +0800
> Feifei Wang <feifei.wang2@arm.com> wrote:
> 
> > Add new APIs for the wait_until scheme, and apply these new APIs in
> > libraries to replace rte_pause.
> >
> > Feifei Wang (5):
> >   eal: add new API for wait until scheme
> >   eal: use wait until scheme for read pflock
> >   eal: use wait until scheme for mcslock
> >   lib/bpf: use wait until scheme for Rx/Tx iteration
> >   lib/distributor: use wait until scheme
> >
> >  lib/bpf/bpf_pkt.c                        |  11 +-
> >  lib/distributor/rte_distributor_single.c |  10 +-
> >  lib/eal/arm/include/rte_pause_64.h       | 271 ++++++++++++++++----
> >  lib/eal/include/generic/rte_mcslock.h    |   9 +-
> >  lib/eal/include/generic/rte_pause.h      | 309 +++++++++++++++++++++++
> >  lib/eal/include/generic/rte_pflock.h     |   5 +-
> >  6 files changed, 543 insertions(+), 72 deletions(-)
> >
> 
> Since these are all inline, would it be possible to make this a macro and have
> the caller pass a condition function?
> 
> Look at Linux wait_event() for an example of that.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme
  2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
                   ` (5 preceding siblings ...)
  2021-09-02 15:22 ` [dpdk-dev] [RFC PATCH v1 0/5] add new API for " Stephen Hemminger
@ 2021-09-23  9:58 ` Feifei Wang
  2021-09-23  9:58   ` [dpdk-dev] [RFC PATCH v2 1/5] eal: " Feifei Wang
                     ` (4 more replies)
  2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
  7 siblings, 5 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-23  9:58 UTC (permalink / raw)
  Cc: dev, nd, Feifei Wang

Add new definitions for the wait scheme, and apply these new definitions
in libraries to replace rte_pause.

v2:
1. use macro to create new wait scheme (Stephen)

Feifei Wang (5):
  eal: add new definitions for wait scheme
  eal: use wait event for read pflock
  eal: use wait event scheme for mcslock
  lib/bpf: use wait event scheme for Rx/Tx iteration
  lib/distributor: use wait event scheme

 lib/bpf/bpf_pkt.c                        |  11 +-
 lib/distributor/rte_distributor_single.c |  10 +-
 lib/eal/arm/include/rte_pause_64.h       | 151 +++++++++++++++--------
 lib/eal/include/generic/rte_mcslock.h    |  12 +-
 lib/eal/include/generic/rte_pause.h      |  78 ++++++++++++
 lib/eal/include/generic/rte_pflock.h     |   4 +-
 6 files changed, 192 insertions(+), 74 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v2 1/5] eal: add new definitions for wait scheme
  2021-09-23  9:58 ` [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme Feifei Wang
@ 2021-09-23  9:58   ` Feifei Wang
  2021-09-23  9:58   ` [dpdk-dev] [RFC PATCH v2 2/5] eal: use wait event for read pflock Feifei Wang
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-23  9:58 UTC (permalink / raw)
  To: Ruifeng Wang; +Cc: dev, nd, Feifei Wang

Introduce macros as a generic interface for address monitoring.
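
For example, the open-coded loops replaced later in this series take roughly
the following shape with the new macros (caller-side sketch; the function and
variable names are illustrative):

#include <limits.h>
#include <stdint.h>
#include <rte_pause.h>

static inline void
wait_examples(volatile uint16_t *rd_in, volatile uint32_t *locked)
{
	/* Wait while bit 0 of *rd_in still equals 1, i.e. until the masked
	 * value breaks the "==" condition. */
	rte_wait_event_16(rd_in, 0x1, 1, ==, __ATOMIC_ACQUIRE);

	/* Wait while the whole 32-bit value is non-zero. */
	rte_wait_event_32(locked, UINT_MAX, 0, !=, __ATOMIC_ACQUIRE);
}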

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/arm/include/rte_pause_64.h  | 151 ++++++++++++++++++----------
 lib/eal/include/generic/rte_pause.h |  78 ++++++++++++++
 2 files changed, 175 insertions(+), 54 deletions(-)

diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
index e87d10b8cc..205510e044 100644
--- a/lib/eal/arm/include/rte_pause_64.h
+++ b/lib/eal/arm/include/rte_pause_64.h
@@ -31,20 +31,12 @@ static inline void rte_pause(void)
 /* Put processor into low power WFE(Wait For Event) state. */
 #define __WFE() { asm volatile("wfe" : : : "memory"); }
 
-static __rte_always_inline void
-rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
-		int memorder)
-{
-	uint16_t value;
-
-	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
-
-	/*
-	 * Atomic exclusive load from addr, it returns the 16-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
+/*
+ * Atomic exclusive load from addr, it returns the 16-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
 #define __LOAD_EXC_16(src, dst, memorder) {               \
 	if (memorder == __ATOMIC_RELAXED) {               \
 		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
@@ -58,6 +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			: "memory");                      \
 	} }
 
+/*
+ * Atomic exclusive load from addr, it returns the 32-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_32(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+/*
+ * Atomic exclusive load from addr, it returns the 64-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_64(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+		int memorder)
+{
+	uint16_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
 	__LOAD_EXC_16(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			__LOAD_EXC_16(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_16
 }
 
 static __rte_always_inline void
@@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 32-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_32(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_32(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 			__LOAD_EXC_32(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_32
 }
 
 static __rte_always_inline void
@@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 64-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_64(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_64(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -143,6 +141,51 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 		} while (value != expected);
 	}
 }
+
+#define rte_wait_event_16(addr, mask, expected, cond, memorder)                \
+do {									       \
+	uint16_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_16(addr, value, memorder)				       \
+	if ((value & mask) cond expected) {				       \
+		__SEVL()						       \
+		do {							       \
+			__WFE()						       \
+			__LOAD_EXC_16(addr, value, memorder)		       \
+		} while ((value & mask) cond expected);			       \
+	}								       \
+} while (0)
+
+#define rte_wait_event_32(addr, mask, expected, cond, memorder)                \
+do {                                                                           \
+	uint32_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_32(addr, value, memorder)                                   \
+	if ((value & mask) cond expected) {                                    \
+		__SEVL()                                                       \
+		do {                                                           \
+			__WFE()                                                \
+			__LOAD_EXC_32(addr, value, memorder)                   \
+		} while ((value & mask) cond expected);                        \
+	}                                                                      \
+} while (0)
+
+#define rte_wait_event_64(addr, mask, expected, cond, memorder)                \
+do {                                                                           \
+	uint64_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_64(addr, value, memorder)                                   \
+	if ((value & mask) cond expected) {                                    \
+		__SEVL()                                                       \
+		do {                                                           \
+			__WFE()                                                \
+			__LOAD_EXC_64(addr, value, memorder)                   \
+		} while ((value & mask) cond expected);                        \
+	}                                                                      \
+} while (0)
+
+#undef __LOAD_EXC_16
+#undef __LOAD_EXC_32
 #undef __LOAD_EXC_64
 
 #undef __SEVL
diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
index 668ee4a184..4e32107eca 100644
--- a/lib/eal/include/generic/rte_pause.h
+++ b/lib/eal/include/generic/rte_pause.h
@@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 	while (__atomic_load_n(addr, memorder) != expected)
 		rte_pause();
 }
+
+/*
+ * Wait until a 16-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_16(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
+
+/*
+ * Wait until a 32-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest.
+ * @param expected
+ *  A 32-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_32(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
+
+/*
+ * Wait until a 64-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest
+ * @param expected
+ *  A 64-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_64(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
 #endif
 
 #endif /* _RTE_PAUSE_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v2 2/5] eal: use wait event for read pflock
  2021-09-23  9:58 ` [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme Feifei Wang
  2021-09-23  9:58   ` [dpdk-dev] [RFC PATCH v2 1/5] eal: " Feifei Wang
@ 2021-09-23  9:58   ` Feifei Wang
  2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 3/5] eal: use wait event scheme for mcslock Feifei Wang
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-23  9:58 UTC (permalink / raw)
  Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for read pflock update, use wait event scheme for
this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/include/generic/rte_pflock.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/eal/include/generic/rte_pflock.h b/lib/eal/include/generic/rte_pflock.h
index e57c179ef2..9865f1349c 100644
--- a/lib/eal/include/generic/rte_pflock.h
+++ b/lib/eal/include/generic/rte_pflock.h
@@ -121,9 +121,7 @@ rte_pflock_read_lock(rte_pflock_t *pf)
 		return;
 
 	/* Wait for current write phase to complete. */
-	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE)
-		& RTE_PFLOCK_WBITS) == w)
-		rte_pause();
+	rte_wait_event_16(&pf->rd.in, RTE_PFLOCK_WBITS, w, ==, __ATOMIC_ACQUIRE);
 }
 
 /**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v2 3/5] eal: use wait event scheme for mcslock
  2021-09-23  9:58 ` [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme Feifei Wang
  2021-09-23  9:58   ` [dpdk-dev] [RFC PATCH v2 1/5] eal: " Feifei Wang
  2021-09-23  9:58   ` [dpdk-dev] [RFC PATCH v2 2/5] eal: use wait event for read pflock Feifei Wang
@ 2021-09-23  9:59   ` Feifei Wang
  2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
  2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 5/5] lib/distributor: use wait event scheme Feifei Wang
  4 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-23  9:59 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for mcslock to be updated, use wait event scheme
for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/include/generic/rte_mcslock.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/lib/eal/include/generic/rte_mcslock.h b/lib/eal/include/generic/rte_mcslock.h
index 9f323bd2a2..264c04021f 100644
--- a/lib/eal/include/generic/rte_mcslock.h
+++ b/lib/eal/include/generic/rte_mcslock.h
@@ -84,8 +84,7 @@ rte_mcslock_lock(rte_mcslock_t **msl, rte_mcslock_t *me)
 	 * to spin on me->locked until the previous lock holder resets
 	 * the me->locked using mcslock_unlock().
 	 */
-	while (__atomic_load_n(&me->locked, __ATOMIC_ACQUIRE))
-		rte_pause();
+	rte_wait_event_32(&me->locked, INT_MAX, 0, !=, __ATOMIC_ACQUIRE);
 }
 
 /**
@@ -117,8 +116,13 @@ rte_mcslock_unlock(rte_mcslock_t **msl, rte_mcslock_t *me)
 		/* More nodes added to the queue by other CPUs.
 		 * Wait until the next pointer is set.
 		 */
-		while (__atomic_load_n(&me->next, __ATOMIC_RELAXED) == NULL)
-			rte_pause();
+#ifdef RTE_ARCH_32
+		rte_wait_event_32((volatile uint32_t *)&me->next, UINT_MAX, 0, ==,
+				__ATOMIC_RELAXED);
+#else
+		rte_wait_event_64((volatile uint64_t *)&me->next, ULLONG_MAX, 0, ==,
+				__ATOMIC_RELAXED);
+#endif
 	}
 
 	/* Pass lock to next waiter. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v2 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
  2021-09-23  9:58 ` [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme Feifei Wang
                     ` (2 preceding siblings ...)
  2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 3/5] eal: use wait event scheme for mcslock Feifei Wang
@ 2021-09-23  9:59   ` Feifei Wang
  2021-09-24 18:07     ` Ananyev, Konstantin
  2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 5/5] lib/distributor: use wait event scheme Feifei Wang
  4 siblings, 1 reply; 42+ messages in thread
From: Feifei Wang @ 2021-09-23  9:59 UTC (permalink / raw)
  To: Konstantin Ananyev, Ferruh Yigit
  Cc: dev, nd, Feifei Wang, stable, Ruifeng Wang

First, fix the bug that the keyword const of the function argument should
be after "*". This is because const before "*" means the value of "cbi"
should not be changed. But we should monitor that cbi->use has changed so
that we can jump out of the loop.

Second, instead of polling for cbi->use to be updated, use the
wait event scheme.

Fixes: a93ff62a8938 ("bpf: introduce basic Rx/Tx filters")
Cc: konstantin.ananyev@intel.com
Cc: stable@dpdk.org

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/bpf/bpf_pkt.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
index 6e8248f0d6..08ed8ff68c 100644
--- a/lib/bpf/bpf_pkt.c
+++ b/lib/bpf/bpf_pkt.c
@@ -111,9 +111,9 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
  * Waits till datapath finished using given callback.
  */
 static void
-bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
+bpf_eth_cbi_wait(struct bpf_eth_cbi *const cbi)
 {
-	uint32_t nuse, puse;
+	uint32_t puse;
 
 	/* make sure all previous loads and stores are completed */
 	rte_smp_mb();
@@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
 
 	/* in use, busy wait till current RX/TX iteration is finished */
 	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
-		do {
-			rte_pause();
-			rte_compiler_barrier();
-			nuse = cbi->use;
-		} while (nuse == puse);
+		rte_compiler_barrier();
+		rte_wait_event_32(&cbi->use, UINT_MAX, puse, ==, __ATOMIC_RELAXED);
 	}
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v2 5/5] lib/distributor: use wait event scheme
  2021-09-23  9:58 ` [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme Feifei Wang
                     ` (3 preceding siblings ...)
  2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
@ 2021-09-23  9:59   ` Feifei Wang
  4 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-23  9:59 UTC (permalink / raw)
  To: David Hunt; +Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for bufptr64 to be updated, use
wait event for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/distributor/rte_distributor_single.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/lib/distributor/rte_distributor_single.c b/lib/distributor/rte_distributor_single.c
index f4725b1d0b..815305444a 100644
--- a/lib/distributor/rte_distributor_single.c
+++ b/lib/distributor/rte_distributor_single.c
@@ -33,9 +33,8 @@ rte_distributor_request_pkt_single(struct rte_distributor_single *d,
 	union rte_distributor_buffer_single *buf = &d->bufs[worker_id];
 	int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
 			| RTE_DISTRIB_GET_BUF;
-	while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
-			& RTE_DISTRIB_FLAGS_MASK))
-		rte_pause();
+	rte_wait_event_64((volatile uint64_t *)&buf->bufptr64,
+			RTE_DISTRIB_FLAGS_MASK, 0, !=, __ATOMIC_RELAXED);
 
 	/* Sync with distributor on GET_BUF flag. */
 	__atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
@@ -74,9 +73,8 @@ rte_distributor_return_pkt_single(struct rte_distributor_single *d,
 	union rte_distributor_buffer_single *buf = &d->bufs[worker_id];
 	uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
 			| RTE_DISTRIB_RETURN_BUF;
-	while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
-			& RTE_DISTRIB_FLAGS_MASK))
-		rte_pause();
+	rte_wait_event_64((volatile uint64_t *)&buf->bufptr64,
+			RTE_DISTRIB_FLAGS_MASK, 0, !=, __ATOMIC_RELAXED);
 
 	/* Sync with distributor on RETURN_BUF flag. */
 	__atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [RFC PATCH v2 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
  2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
@ 2021-09-24 18:07     ` Ananyev, Konstantin
  2021-09-26  2:19       ` [dpdk-dev] Re: " Feifei Wang
  0 siblings, 1 reply; 42+ messages in thread
From: Ananyev, Konstantin @ 2021-09-24 18:07 UTC (permalink / raw)
  To: Feifei Wang, Yigit, Ferruh; +Cc: dev, nd, stable, Ruifeng Wang


> 
> First, fix the bug that keyword const of func arg should be after "*".

I believe there is no bug here.

> This is because const before "*" means the value of "cbi" should not be
> changed. 

Exactly, it says that the function itself will not change the value of "cbi".
It just waits for the value to be changed by someone else.
So please keep parameter list intact.

> But we should monitor that cbi->use changed and then we can
> jump out of loop.
> 
> Second, instead of polling for cbi->use to be updated, use
> wait event scheme.
> 
> Fixes: a93ff62a8938 ("bpf: introduce basic Rx/Tx filters")
> Cc: konstantin.ananyev@intel.com
> Cc: stable@dpdk.org
> 
> Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/bpf/bpf_pkt.c | 11 ++++-------
>  1 file changed, 4 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
> index 6e8248f0d6..08ed8ff68c 100644
> --- a/lib/bpf/bpf_pkt.c
> +++ b/lib/bpf/bpf_pkt.c
> @@ -111,9 +111,9 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
>   * Waits till datapath finished using given callback.
>   */
>  static void
> -bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
> +bpf_eth_cbi_wait(struct bpf_eth_cbi *const cbi)
>  {
> -	uint32_t nuse, puse;
> +	uint32_t puse;
> 
>  	/* make sure all previous loads and stores are completed */
>  	rte_smp_mb();
> @@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
> 
>  	/* in use, busy wait till current RX/TX iteration is finished */
>  	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
> -		do {
> -			rte_pause();
> -			rte_compiler_barrier();
> -			nuse = cbi->use;
> -		} while (nuse == puse);
> +		rte_compiler_barrier();
> +		rte_wait_event_32(&cbi->use, UINT_MAX, puse, ==, __ATOMIC_RELAXED);
>  	}
>  }
> 
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] Re: [RFC PATCH v2 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
  2021-09-24 18:07     ` Ananyev, Konstantin
@ 2021-09-26  2:19       ` Feifei Wang
  0 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-26  2:19 UTC (permalink / raw)
  To: Ananyev, Konstantin, Yigit, Ferruh; +Cc: dev, nd, stable, Ruifeng Wang, nd


> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Saturday, September 25, 2021 2:08 AM
> To: Feifei Wang <Feifei.Wang2@arm.com>; Yigit, Ferruh
> <ferruh.yigit@intel.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>; stable@dpdk.org; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Subject: RE: [RFC PATCH v2 4/5] lib/bpf: use wait event scheme for Rx/Tx
> iteration
> 
> 
> >
> > First, fix the bug that keyword const of func arg should be after "*".
> 
> I believe there is no bug here.
> 
> > This is because const before "*" means the value of "cbi" should not
> > be changed.
> 
> Exactly, it says that the function itself will not change the value of "cbi".
> It just waits for the value to be changed by someone else.
> So please keep parameter list intact.

Thanks for your explanation. The reason I changed it is that I previously used the rte_wait_until_xx(volatile *addr) API here,
and there is a conflict between "const" and "volatile", so the compiler reported a warning.
But now I think that if I keep the parameter as it is, there will be no warning, because the new macro has no "volatile".
I will delete this unnecessary bug fix.
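
To illustrate the conflict I mean, here is a trimmed-down sketch (wait_on() is a
hypothetical helper and the struct is reduced to one field, not the real code):

#include <stdint.h>

struct bpf_eth_cbi {
	uint32_t use;
};

/* old-style helper declared with a volatile pointer parameter */
static void wait_on(volatile uint32_t *addr) { (void)addr; }

static void
cbi_wait(const struct bpf_eth_cbi *cbi)
{
	/*
	 * &cbi->use has type 'const uint32_t *'; passing it to a
	 * 'volatile uint32_t *' parameter discards the 'const' qualifier,
	 * so the compiler warns here. The new macro has no such
	 * parameter, so the const prototype can stay unchanged.
	 */
	wait_on(&cbi->use);
}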
> 
> > But we should monitor that cbi->use changed and then we can jump out
> > of loop.
> >
> > Second, instead of polling for cbi->use to be updated, use wait event
> > scheme.
> >
> > Fixes: a93ff62a8938 ("bpf: introduce basic Rx/Tx filters")
> > Cc: konstantin.ananyev@intel.com
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > ---
> >  lib/bpf/bpf_pkt.c | 11 ++++-------
> >  1 file changed, 4 insertions(+), 7 deletions(-)
> >
> > diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c index
> > 6e8248f0d6..08ed8ff68c 100644
> > --- a/lib/bpf/bpf_pkt.c
> > +++ b/lib/bpf/bpf_pkt.c
> > @@ -111,9 +111,9 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
> >   * Waits till datapath finished using given callback.
> >   */
> >  static void
> > -bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
> > +bpf_eth_cbi_wait(struct bpf_eth_cbi *const cbi)
> >  {
> > -	uint32_t nuse, puse;
> > +	uint32_t puse;
> >
> >  	/* make sure all previous loads and stores are completed */
> >  	rte_smp_mb();
> > @@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
> >
> >  	/* in use, busy wait till current RX/TX iteration is finished */
> >  	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
> > -		do {
> > -			rte_pause();
> > -			rte_compiler_barrier();
> > -			nuse = cbi->use;
> > -		} while (nuse == puse);
> > +		rte_compiler_barrier();
> > +		rte_wait_event_32(&cbi->use, UINT_MAX, puse, ==,
> __ATOMIC_RELAXED);
> >  	}
> >  }
> >
> > --
> > 2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme
  2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
                   ` (6 preceding siblings ...)
  2021-09-23  9:58 ` [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme Feifei Wang
@ 2021-09-26  6:32 ` Feifei Wang
  2021-09-26  6:32   ` [dpdk-dev] [RFC PATCH v3 1/5] eal: " Feifei Wang
                     ` (5 more replies)
  7 siblings, 6 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-26  6:32 UTC (permalink / raw)
  Cc: dev, nd, Feifei Wang

Add new definitions for wait scheme, and apply this new definitions into
lib to replace rte_pause.

v2:
1. use macro to create new wait scheme (Stephen)

v3:
1. delete unnecessary bug fix in bpf (Konstantin)

Feifei Wang (5):
  eal: add new definitions for wait scheme
  eal: use wait event for read pflock
  eal: use wait event scheme for mcslock
  lib/bpf: use wait event scheme for Rx/Tx iteration
  lib/distributor: use wait event scheme

 lib/bpf/bpf_pkt.c                        |   9 +-
 lib/distributor/rte_distributor_single.c |  10 +-
 lib/eal/arm/include/rte_pause_64.h       | 151 +++++++++++++++--------
 lib/eal/include/generic/rte_mcslock.h    |  12 +-
 lib/eal/include/generic/rte_pause.h      |  78 ++++++++++++
 lib/eal/include/generic/rte_pflock.h     |   4 +-
 6 files changed, 191 insertions(+), 73 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait scheme
  2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
@ 2021-09-26  6:32   ` Feifei Wang
  2021-10-07 16:18     ` Ananyev, Konstantin
  2021-09-26  6:32   ` [dpdk-dev] [RFC PATCH v3 2/5] eal: use wait event for read pflock Feifei Wang
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 42+ messages in thread
From: Feifei Wang @ 2021-09-26  6:32 UTC (permalink / raw)
  To: Ruifeng Wang; +Cc: dev, nd, Feifei Wang

Introduce macros as generic interface for address monitoring.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/arm/include/rte_pause_64.h  | 151 ++++++++++++++++++----------
 lib/eal/include/generic/rte_pause.h |  78 ++++++++++++++
 2 files changed, 175 insertions(+), 54 deletions(-)

diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
index e87d10b8cc..205510e044 100644
--- a/lib/eal/arm/include/rte_pause_64.h
+++ b/lib/eal/arm/include/rte_pause_64.h
@@ -31,20 +31,12 @@ static inline void rte_pause(void)
 /* Put processor into low power WFE(Wait For Event) state. */
 #define __WFE() { asm volatile("wfe" : : : "memory"); }
 
-static __rte_always_inline void
-rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
-		int memorder)
-{
-	uint16_t value;
-
-	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
-
-	/*
-	 * Atomic exclusive load from addr, it returns the 16-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
+/*
+ * Atomic exclusive load from addr, it returns the 16-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and a event is generated
+ * implicitly to exit WFE.
+ */
 #define __LOAD_EXC_16(src, dst, memorder) {               \
 	if (memorder == __ATOMIC_RELAXED) {               \
 		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
@@ -58,6 +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			: "memory");                      \
 	} }
 
+/*
+ * Atomic exclusive load from addr, it returns the 32-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and a event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_32(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+/*
+ * Atomic exclusive load from addr, it returns the 64-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and a event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_64(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+		int memorder)
+{
+	uint16_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
 	__LOAD_EXC_16(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			__LOAD_EXC_16(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_16
 }
 
 static __rte_always_inline void
@@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 32-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_32(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_32(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 			__LOAD_EXC_32(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_32
 }
 
 static __rte_always_inline void
@@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 64-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_64(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_64(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -143,6 +141,51 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 		} while (value != expected);
 	}
 }
+
+#define rte_wait_event_16(addr, mask, expected, cond, memorder)                \
+do {									       \
+	uint16_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_16(addr, value, memorder)				       \
+	if ((value & mask) cond expected) {				       \
+		__SEVL()						       \
+		do {							       \
+			__WFE()						       \
+			__LOAD_EXC_16(addr, value, memorder)		       \
+		} while ((value & mask) cond expected);			       \
+	}								       \
+} while (0)
+
+#define rte_wait_event_32(addr, mask, expected, cond, memorder)                \
+do {                                                                           \
+	uint32_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_32(addr, value, memorder)                                   \
+	if ((value & mask) cond expected) {                                    \
+		__SEVL()                                                       \
+		do {                                                           \
+			__WFE()                                                \
+			__LOAD_EXC_32(addr, value, memorder)                   \
+		} while ((value & mask) cond expected);                        \
+	}                                                                      \
+} while (0)
+
+#define rte_wait_event_64(addr, mask, expected, cond, memorder)                \
+do {                                                                           \
+	uint64_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_64(addr, value, memorder)                                   \
+	if ((value & mask) cond expected) {                                    \
+		__SEVL()                                                       \
+		do {                                                           \
+			__WFE()                                                \
+			__LOAD_EXC_64(addr, value, memorder)                   \
+		} while ((value & mask) cond expected);                        \
+	}                                                                      \
+} while (0)
+
+#undef __LOAD_EXC_16
+#undef __LOAD_EXC_32
 #undef __LOAD_EXC_64
 
 #undef __SEVL
diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
index 668ee4a184..4e32107eca 100644
--- a/lib/eal/include/generic/rte_pause.h
+++ b/lib/eal/include/generic/rte_pause.h
@@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 	while (__atomic_load_n(addr, memorder) != expected)
 		rte_pause();
 }
+
+/*
+ * Wait until a 16-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_16(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
+
+/*
+ * Wait until a 32-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest.
+ * @param expected
+ *  A 32-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_32(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
+
+/*
+ * Wait until a 64-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest
+ * @param expected
+ *  A 64-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_64(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
 #endif
 
 #endif /* _RTE_PAUSE_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v3 2/5] eal: use wait event for read pflock
  2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
  2021-09-26  6:32   ` [dpdk-dev] [RFC PATCH v3 1/5] eal: " Feifei Wang
@ 2021-09-26  6:32   ` Feifei Wang
  2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 3/5] eal: use wait event scheme for mcslock Feifei Wang
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-26  6:32 UTC (permalink / raw)
  Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for read pflock update, use wait event scheme for
this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/include/generic/rte_pflock.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/eal/include/generic/rte_pflock.h b/lib/eal/include/generic/rte_pflock.h
index e57c179ef2..9865f1349c 100644
--- a/lib/eal/include/generic/rte_pflock.h
+++ b/lib/eal/include/generic/rte_pflock.h
@@ -121,9 +121,7 @@ rte_pflock_read_lock(rte_pflock_t *pf)
 		return;
 
 	/* Wait for current write phase to complete. */
-	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE)
-		& RTE_PFLOCK_WBITS) == w)
-		rte_pause();
+	rte_wait_event_16(&pf->rd.in, RTE_PFLOCK_WBITS, w, ==, __ATOMIC_ACQUIRE);
 }
 
 /**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v3 3/5] eal: use wait event scheme for mcslock
  2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
  2021-09-26  6:32   ` [dpdk-dev] [RFC PATCH v3 1/5] eal: " Feifei Wang
  2021-09-26  6:32   ` [dpdk-dev] [RFC PATCH v3 2/5] eal: use wait event for read pflock Feifei Wang
@ 2021-09-26  6:33   ` Feifei Wang
  2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-26  6:33 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for mcslock to be updated, use wait event scheme
for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/include/generic/rte_mcslock.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/lib/eal/include/generic/rte_mcslock.h b/lib/eal/include/generic/rte_mcslock.h
index 9f323bd2a2..c8d1c4f38f 100644
--- a/lib/eal/include/generic/rte_mcslock.h
+++ b/lib/eal/include/generic/rte_mcslock.h
@@ -84,8 +84,7 @@ rte_mcslock_lock(rte_mcslock_t **msl, rte_mcslock_t *me)
 	 * to spin on me->locked until the previous lock holder resets
 	 * the me->locked using mcslock_unlock().
 	 */
-	while (__atomic_load_n(&me->locked, __ATOMIC_ACQUIRE))
-		rte_pause();
+	rte_wait_event_32(&me->locked, INT_MAX, 0, !=, __ATOMIC_ACQUIRE);
 }
 
 /**
@@ -117,8 +116,13 @@ rte_mcslock_unlock(rte_mcslock_t **msl, rte_mcslock_t *me)
 		/* More nodes added to the queue by other CPUs.
 		 * Wait until the next pointer is set.
 		 */
-		while (__atomic_load_n(&me->next, __ATOMIC_RELAXED) == NULL)
-			rte_pause();
+#ifdef RTE_ARCH_32
+		rte_wait_event_32((uint32_t *)&me->next, UINT_MAX, 0, ==,
+				__ATOMIC_RELAXED);
+#else
+		rte_wait_event_64((uint64_t *)&me->next, ULONG_MAX, 0, ==,
+				__ATOMIC_RELAXED);
+#endif
 	}
 
 	/* Pass lock to next waiter. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
  2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
                     ` (2 preceding siblings ...)
  2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 3/5] eal: use wait event scheme for mcslock Feifei Wang
@ 2021-09-26  6:33   ` Feifei Wang
  2021-10-07 15:50     ` Ananyev, Konstantin
  2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 5/5] lib/distributor: use wait event scheme Feifei Wang
  2021-10-20  8:45   ` [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme Feifei Wang
  5 siblings, 1 reply; 42+ messages in thread
From: Feifei Wang @ 2021-09-26  6:33 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for cbi->use to be updated, use wait event scheme.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/bpf/bpf_pkt.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
index 6e8248f0d6..3af15ae97b 100644
--- a/lib/bpf/bpf_pkt.c
+++ b/lib/bpf/bpf_pkt.c
@@ -113,7 +113,7 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
 static void
 bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
 {
-	uint32_t nuse, puse;
+	uint32_t puse;
 
 	/* make sure all previous loads and stores are completed */
 	rte_smp_mb();
@@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
 
 	/* in use, busy wait till current RX/TX iteration is finished */
 	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
-		do {
-			rte_pause();
-			rte_compiler_barrier();
-			nuse = cbi->use;
-		} while (nuse == puse);
+		rte_compiler_barrier();
+		rte_wait_event_32(&cbi->use, UINT_MAX, puse, ==, __ATOMIC_RELAXED);
 	}
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [RFC PATCH v3 5/5] lib/distributor: use wait event scheme
  2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
                     ` (3 preceding siblings ...)
  2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
@ 2021-09-26  6:33   ` Feifei Wang
  2021-10-20  8:45   ` [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme Feifei Wang
  5 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-09-26  6:33 UTC (permalink / raw)
  To: David Hunt; +Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for bufptr64 to be updated, use
wait event for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/distributor/rte_distributor_single.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/lib/distributor/rte_distributor_single.c b/lib/distributor/rte_distributor_single.c
index f4725b1d0b..86cab349f4 100644
--- a/lib/distributor/rte_distributor_single.c
+++ b/lib/distributor/rte_distributor_single.c
@@ -33,9 +33,8 @@ rte_distributor_request_pkt_single(struct rte_distributor_single *d,
 	union rte_distributor_buffer_single *buf = &d->bufs[worker_id];
 	int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
 			| RTE_DISTRIB_GET_BUF;
-	while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
-			& RTE_DISTRIB_FLAGS_MASK))
-		rte_pause();
+	rte_wait_event_64(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK,
+			0, !=, __ATOMIC_RELAXED);
 
 	/* Sync with distributor on GET_BUF flag. */
 	__atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
@@ -74,9 +73,8 @@ rte_distributor_return_pkt_single(struct rte_distributor_single *d,
 	union rte_distributor_buffer_single *buf = &d->bufs[worker_id];
 	uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
 			| RTE_DISTRIB_RETURN_BUF;
-	while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
-			& RTE_DISTRIB_FLAGS_MASK))
-		rte_pause();
+	rte_wait_event_64(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK,
+			0, !=, __ATOMIC_RELAXED);
 
 	/* Sync with distributor on RETURN_BUF flag. */
 	__atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
  2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
@ 2021-10-07 15:50     ` Ananyev, Konstantin
  2021-10-07 17:40       ` Ananyev, Konstantin
  0 siblings, 1 reply; 42+ messages in thread
From: Ananyev, Konstantin @ 2021-10-07 15:50 UTC (permalink / raw)
  To: Feifei Wang; +Cc: dev, nd, Ruifeng Wang



> 
> Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/bpf/bpf_pkt.c | 9 +++------
>  1 file changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
> index 6e8248f0d6..3af15ae97b 100644
> --- a/lib/bpf/bpf_pkt.c
> +++ b/lib/bpf/bpf_pkt.c
> @@ -113,7 +113,7 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
>  static void
>  bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
>  {
> -	uint32_t nuse, puse;
> +	uint32_t puse;
> 
>  	/* make sure all previous loads and stores are completed */
>  	rte_smp_mb();
> @@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
> 
>  	/* in use, busy wait till current RX/TX iteration is finished */
>  	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
> -		do {
> -			rte_pause();
> -			rte_compiler_barrier();
> -			nuse = cbi->use;
> -		} while (nuse == puse);
> +		rte_compiler_barrier();
> +		rte_wait_event_32(&cbi->use, UINT_MAX, puse, ==, __ATOMIC_RELAXED);

If we do use atomic load, why do we still need a compiler_barrier() here?

>  	}
>  }
> 
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait scheme
  2021-09-26  6:32   ` [dpdk-dev] [RFC PATCH v3 1/5] eal: " Feifei Wang
@ 2021-10-07 16:18     ` Ananyev, Konstantin
  2021-10-12  8:09       ` [dpdk-dev] Re: " Feifei Wang
  0 siblings, 1 reply; 42+ messages in thread
From: Ananyev, Konstantin @ 2021-10-07 16:18 UTC (permalink / raw)
  To: Feifei Wang, Ruifeng Wang; +Cc: dev, nd


> Introduce macros as generic interface for address monitoring.
> 
> Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/eal/arm/include/rte_pause_64.h  | 151 ++++++++++++++++++----------
>  lib/eal/include/generic/rte_pause.h |  78 ++++++++++++++
>  2 files changed, 175 insertions(+), 54 deletions(-)
> 
> diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
> index e87d10b8cc..205510e044 100644
> --- a/lib/eal/arm/include/rte_pause_64.h
> +++ b/lib/eal/arm/include/rte_pause_64.h
> @@ -31,20 +31,12 @@ static inline void rte_pause(void)
>  /* Put processor into low power WFE(Wait For Event) state. */
>  #define __WFE() { asm volatile("wfe" : : : "memory"); }
> 
> -static __rte_always_inline void
> -rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> -		int memorder)
> -{
> -	uint16_t value;
> -
> -	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> -
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 16-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> +/*
> + * Atomic exclusive load from addr, it returns the 16-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
>  #define __LOAD_EXC_16(src, dst, memorder) {               \
>  	if (memorder == __ATOMIC_RELAXED) {               \
>  		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
> @@ -58,6 +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
>  			: "memory");                      \
>  	} }
> 
> +/*
> + * Atomic exclusive load from addr, it returns the 32-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
> +#define __LOAD_EXC_32(src, dst, memorder) {              \
> +	if (memorder == __ATOMIC_RELAXED) {              \
> +		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} else {                                         \
> +		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} }
> +
> +/*
> + * Atomic exclusive load from addr, it returns the 64-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
> +#define __LOAD_EXC_64(src, dst, memorder) {              \
> +	if (memorder == __ATOMIC_RELAXED) {              \
> +		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} else {                                         \
> +		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> +		int memorder)
> +{
> +	uint16_t value;
> +
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> +
>  	__LOAD_EXC_16(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
>  			__LOAD_EXC_16(addr, value, memorder)
>  		} while (value != expected);
>  	}
> -#undef __LOAD_EXC_16
>  }
> 
>  static __rte_always_inline void
> @@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> 
>  	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> 
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 32-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> -#define __LOAD_EXC_32(src, dst, memorder) {              \
> -	if (memorder == __ATOMIC_RELAXED) {              \
> -		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} else {                                         \
> -		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} }
> -
>  	__LOAD_EXC_32(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
>  			__LOAD_EXC_32(addr, value, memorder)
>  		} while (value != expected);
>  	}
> -#undef __LOAD_EXC_32
>  }
> 
>  static __rte_always_inline void
> @@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> 
>  	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> 
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 64-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> -#define __LOAD_EXC_64(src, dst, memorder) {              \
> -	if (memorder == __ATOMIC_RELAXED) {              \
> -		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} else {                                         \
> -		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} }
> -
>  	__LOAD_EXC_64(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -143,6 +141,51 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>  		} while (value != expected);
>  	}
>  }
> +
> +#define rte_wait_event_16(addr, mask, expected, cond, memorder)                \
> +do {									       \
> +	uint16_t value;                                                        \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +	__LOAD_EXC_16(addr, value, memorder)				       \
> +	if ((value & mask) cond expected) {				       \
> +		__SEVL()						       \
> +		do {							       \
> +			__WFE()						       \
> +			__LOAD_EXC_16(addr, value, memorder)		       \
> +		} while ((value & mask) cond expected);			       \
> +	}								       \
> +} while (0)
> +
> +#define rte_wait_event_32(addr, mask, expected, cond, memorder)                \
> +do {                                                                           \
> +	uint32_t value;                                                        \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +	__LOAD_EXC_32(addr, value, memorder)                                   \
> +	if ((value & mask) cond expected) {                                    \
> +		__SEVL()                                                       \
> +		do {                                                           \
> +			__WFE()                                                \
> +			__LOAD_EXC_32(addr, value, memorder)                   \
> +		} while ((value & mask) cond expected);                        \
> +	}                                                                      \
> +} while (0)
> +
> +#define rte_wait_event_64(addr, mask, expected, cond, memorder)                \
> +do {                                                                           \
> +	uint64_t value;                                                        \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +	__LOAD_EXC_64(addr, value, memorder)                                   \
> +	if ((value & mask) cond expected) {                                    \
> +		__SEVL()                                                       \
> +		do {                                                           \
> +			__WFE()                                                \
> +			__LOAD_EXC_64(addr, value, memorder)                   \
> +		} while ((value & mask) cond expected);                        \
> +	}                                                                      \
> +} while (0)
> +
> +#undef __LOAD_EXC_16
> +#undef __LOAD_EXC_32
>  #undef __LOAD_EXC_64
> 
>  #undef __SEVL
> diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
> index 668ee4a184..4e32107eca 100644
> --- a/lib/eal/include/generic/rte_pause.h
> +++ b/lib/eal/include/generic/rte_pause.h
> @@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>  	while (__atomic_load_n(addr, memorder) != expected)
>  		rte_pause();
>  }
> +
> +/*
> + * Wait until a 16-bit *addr breaks the condition, with a relaxed memory
> + * ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param mask
> + *  A mask of value bits in interest
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + * @param cond
> + *  A symbol representing the condition (==, !=).
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */

Hmm, so now we have 2 APIs doing similar thing:
rte_wait_until_equal_n() and rte_wait_event_n().
Can we probably unite them somehow?
At least make rte_wait_until_equal_n() to use rte_wait_event_n() underneath.
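
For example, the 32-bit one could probably become just (a sketch only, not tested):

static __rte_always_inline void
rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
		int memorder)
{
	/* wait while the whole (unmasked) value still differs from 'expected' */
	rte_wait_event_32(addr, UINT32_MAX, expected, !=, memorder);
}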

> +#define rte_wait_event_16(addr, mask, expected, cond, memorder)		       \
> +do {									       \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \

And why user is not allowed to use __ATOMIC_SEQ_CST here?
BTW, if we expect memorder to always be a constant, might be better BUILD_BUG_ON()?

> +									       \
> +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
> +		rte_pause();						       \
> +} while (0)

Two thoughts with these macros:
1. It is a good practice to put () around macro parameters in the macro body.
It will save you from a lot of unexpected trouble.
2. I think these 3 macros can be united into one.
Something like:

#define rte_wait_event(addr, mask, expected, cond, memorder) do {\
        typeof (*(addr)) val = __atomic_load_n((addr), (memorder)); \
        if ((val & (typeof(val))(mask)) cond (typeof(val))(expected)) \
                break; \
        rte_pause(); \
} while (1);
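
With something like that, the call sites in the other patches would all take
one form, e.g. (a sketch, reusing the arguments from your patches):

/* pflock: wait for the current write phase to complete */
rte_wait_event(&pf->rd.in, RTE_PFLOCK_WBITS, w, ==, __ATOMIC_ACQUIRE);

/* distributor: wait until the flag bits are cleared */
rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK, 0, !=, __ATOMIC_RELAXED);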


> +
> +/*
> + * Wait until a 32-bit *addr breaks the condition, with a relaxed memory
> + * ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param mask
> + *  A mask of value bits in interest.
> + * @param expected
> + *  A 32-bit expected value to be in the memory location.
> + * @param cond
> + *  A symbol representing the condition (==, !=).
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */
> +#define rte_wait_event_32(addr, mask, expected, cond, memorder)		       \
> +do {									       \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +									       \
> +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
> +		rte_pause();						       \
> +} while (0)
> +
> +/*
> + * Wait until a 64-bit *addr breaks the condition, with a relaxed memory
> + * ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param mask
> + *  A mask of value bits in interest
> + * @param expected
> + *  A 64-bit expected value to be in the memory location.
> + * @param cond
> + *  A symbol representing the condition (==, !=).
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */
> +#define rte_wait_event_64(addr, mask, expected, cond, memorder)		       \
> +do {									       \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +									       \
> +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
> +		rte_pause();						       \
> +} while (0)
>  #endif
> 
>  #endif /* _RTE_PAUSE_H_ */
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
  2021-10-07 15:50     ` Ananyev, Konstantin
@ 2021-10-07 17:40       ` Ananyev, Konstantin
  2021-10-20  6:20         ` [dpdk-dev] Re: " Feifei Wang
  0 siblings, 1 reply; 42+ messages in thread
From: Ananyev, Konstantin @ 2021-10-07 17:40 UTC (permalink / raw)
  To: Feifei Wang; +Cc: dev, nd, Ruifeng Wang



> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Thursday, October 7, 2021 4:50 PM
> To: Feifei Wang <feifei.wang2@arm.com>
> Cc: dev@dpdk.org; nd@arm.com; Ruifeng Wang <ruifeng.wang@arm.com>
> Subject: RE: [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
> 
> 
> 
> >
> > Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > ---
> >  lib/bpf/bpf_pkt.c | 9 +++------
> >  1 file changed, 3 insertions(+), 6 deletions(-)
> >
> > diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
> > index 6e8248f0d6..3af15ae97b 100644
> > --- a/lib/bpf/bpf_pkt.c
> > +++ b/lib/bpf/bpf_pkt.c
> > @@ -113,7 +113,7 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
> >  static void
> >  bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
> >  {
> > -	uint32_t nuse, puse;
> > +	uint32_t puse;
> >
> >  	/* make sure all previous loads and stores are completed */
> >  	rte_smp_mb();
> > @@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
> >
> >  	/* in use, busy wait till current RX/TX iteration is finished */
> >  	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
> > -		do {
> > -			rte_pause();
> > -			rte_compiler_barrier();
> > -			nuse = cbi->use;
> > -		} while (nuse == puse);
> > +		rte_compiler_barrier();
> > +		rte_wait_event_32(&cbi->use, UINT_MAX, puse, ==, __ATOMIC_RELAXED);

Probably UINT32_MAX will be a bit better here.

> 
> If we do use atomic load, why do we still need a compiler_barrier() here?
> 
> >  	}
> >  }
> >
> > --
> > 2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] Re: [RFC PATCH v3 1/5] eal: add new definitions for wait scheme
  2021-10-07 16:18     ` Ananyev, Konstantin
@ 2021-10-12  8:09       ` Feifei Wang
  2021-10-13 15:03         ` [dpdk-dev] " Ananyev, Konstantin
  0 siblings, 1 reply; 42+ messages in thread
From: Feifei Wang @ 2021-10-12  8:09 UTC (permalink / raw)
  To: Ananyev, Konstantin, Ruifeng Wang; +Cc: dev, nd, nd

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Friday, October 8, 2021 12:19 AM
> To: Feifei Wang <Feifei.Wang2@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait
> scheme

[snip]

> > diff --git a/lib/eal/include/generic/rte_pause.h
> > b/lib/eal/include/generic/rte_pause.h
> > index 668ee4a184..4e32107eca 100644
> > --- a/lib/eal/include/generic/rte_pause.h
> > +++ b/lib/eal/include/generic/rte_pause.h
> > @@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> uint64_t expected,
> >  	while (__atomic_load_n(addr, memorder) != expected)
> >  		rte_pause();
> >  }
> > +
> > +/*
> > + * Wait until a 16-bit *addr breaks the condition, with a relaxed
> > +memory
> > + * ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param mask
> > + *  A mask of value bits in interest
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + * @param cond
> > + *  A symbol representing the condition (==, !=).
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard
> > +or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> 
> Hmm, so now we have 2 APIs doing similar thing:
> rte_wait_until_equal_n() and rte_wait_event_n().
> Can we probably unite them somehow?
> At least make rte_wait_until_equal_n() to use rte_wait_event_n() underneath.
> 
You are right. We plan to change rte_wait_until_equal API after this new scheme
is achieved.  And then, we will merge wait_until into wait_event definition in the next new
patch series.
 
> > +#define rte_wait_event_16(addr, mask, expected, cond, memorder)
> 		       \
> > +do {									       \
> > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > +__ATOMIC_RELAXED);  \
> 
> And why user is not allowed to use __ATOMIC_SEQ_CST here?
Actually this is just a load operation, and acquire here is enough to make sure 'load
addr value' can be before other operations.
 
> BTW, if we expect memorder to always be a constant, might be better
> BUILD_BUG_ON()?
If I understand correctly, you mean we can replace 'assert' by 'build_bug_on':
RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder !=__ATOMIC_RELAXED);  

> 
> > +									       \
> > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> 	       \
> > +		rte_pause();						       \
> > +} while (0)
> 
> Two thoughts with these macros:
> 1. It is a good practice to put () around macro parameters in the macro body.
> It will save you from a lot of unexpected trouble.
> 2. I think these 3 macros can be united into one.
> Something like:
> 
> #define rte_wait_event(addr, mask, expected, cond, memorder) do {\
>         typeof (*(addr)) val = __atomic_load_n((addr), (memorder)); \
>         if ((val & (typeof(val))(mask)) cond (typeof(val))(expected)) \
>                 break; \
>         rte_pause(); \
> } while (1);
For this point, I think it is because different sizes need different assembly instructions
on the Arm architecture. For example,
the 16-bit load instruction is "ldxrh %w[tmp], [%x[addr]]",
the 32-bit load instruction is "ldxr %w[tmp], [%x[addr]]",
the 64-bit load instruction is "ldxr %x[tmp], [%x[addr]]".
And for consistency, we also use 3 APIs in generic path.
> 
> 
> > +
> > +/*
> > + * Wait until a 32-bit *addr breaks the condition, with a relaxed
> > +memory
> > + * ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param mask
> > + *  A mask of value bits in interest.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + * @param cond
> > + *  A symbol representing the condition (==, !=).
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard
> > +or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> > +#define rte_wait_event_32(addr, mask, expected, cond, memorder)
> 		       \
> > +do {									       \
> > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> __ATOMIC_RELAXED);  \
> > +									       \
> > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> 	       \
> > +		rte_pause();						       \
> > +} while (0)
> > +
> > +/*
> > + * Wait until a 64-bit *addr breaks the condition, with a relaxed
> > +memory
> > + * ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param mask
> > + *  A mask of value bits in interest
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + * @param cond
> > + *  A symbol representing the condition (==, !=).
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard
> > +or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> > +#define rte_wait_event_64(addr, mask, expected, cond, memorder)
> 		       \
> > +do {									       \
> > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> __ATOMIC_RELAXED);  \
> > +									       \
> > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> 	       \
> > +		rte_pause();						       \
> > +} while (0)
> >  #endif
> >
> >  #endif /* _RTE_PAUSE_H_ */
> > --
> > 2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait scheme
  2021-10-12  8:09       ` [dpdk-dev] Re: " Feifei Wang
@ 2021-10-13 15:03         ` Ananyev, Konstantin
  2021-10-13 17:00           ` Stephen Hemminger
  2021-10-14  3:08           ` Feifei Wang
  0 siblings, 2 replies; 42+ messages in thread
From: Ananyev, Konstantin @ 2021-10-13 15:03 UTC (permalink / raw)
  To: Feifei Wang, Ruifeng Wang; +Cc: dev, nd, nd

> 
> [snip]
> 
> > > diff --git a/lib/eal/include/generic/rte_pause.h
> > > b/lib/eal/include/generic/rte_pause.h
> > > index 668ee4a184..4e32107eca 100644
> > > --- a/lib/eal/include/generic/rte_pause.h
> > > +++ b/lib/eal/include/generic/rte_pause.h
> > > @@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> > uint64_t expected,
> > >  	while (__atomic_load_n(addr, memorder) != expected)
> > >  		rte_pause();
> > >  }
> > > +
> > > +/*
> > > + * Wait until a 16-bit *addr breaks the condition, with a relaxed
> > > +memory
> > > + * ordering model meaning the loads around this API can be reordered.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param mask
> > > + *  A mask of value bits in interest
> > > + * @param expected
> > > + *  A 16-bit expected value to be in the memory location.
> > > + * @param cond
> > > + *  A symbol representing the condition (==, !=).
> > > + * @param memorder
> > > + *  Two different memory orders that can be specified:
> > > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > > + *  C++11 memory orders with the same names, see the C++11 standard
> > > +or
> > > + *  the GCC wiki on atomic synchronization for detailed definition.
> > > + */
> >
> > Hmm, so now we have 2 APIs doing similar thing:
> > rte_wait_until_equal_n() and rte_wait_event_n().
> > Can we probably unite them somehow?
> > At least make rte_wait_until_equal_n() to use rte_wait_event_n() underneath.
> >
> You are right. We plan to change rte_wait_until_equal API after this new scheme
> is achieved.  And then, we will merge wait_until into wait_event definition in the next new
> patch series.
> 
> > > +#define rte_wait_event_16(addr, mask, expected, cond, memorder)
> > 		       \
> > > +do {									       \
> > > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > > +__ATOMIC_RELAXED);  \
> >
> > And why user is not allowed to use __ATOMIC_SEQ_CST here?
> Actually this is just a load operation, and acquire here is enough to make sure 'load
> addr value' can be before other operations.
> 
> > BTW, if we expect memorder to always be a constant, might be better
> > BUILD_BUG_ON()?
> If I understand correctly, you mean we can replace 'assert' by 'build_bug_on':
> RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder !=__ATOMIC_RELAXED);

Yes, that was my thought.
In that case I think we should be able to catch wrong memorder at compilation stage.
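
E.g. for the generic version, something like this sketch:

#define rte_wait_event_32(addr, mask, expected, cond, memorder)               \
do {                                                                           \
	/* reject a wrong memorder at build time instead of assert() */       \
	RTE_BUILD_BUG_ON((memorder) != __ATOMIC_ACQUIRE &&                    \
			(memorder) != __ATOMIC_RELAXED);                       \
									       \
	while ((__atomic_load_n(addr, memorder) & mask) cond expected)         \
		rte_pause();                                                   \
} while (0)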

> 
> >
> > > +									       \
> > > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> > 	       \
> > > +		rte_pause();						       \
> > > +} while (0)
> >
> > Two thoughts with these macros:
> > 1. It is a good practice to put () around macro parameters in the macro body.
> > It will save you from a lot of unexpected trouble.
> > 2. I think these 3 macros can be united into one.
> > Something like:
> >
> > #define rte_wait_event(addr, mask, expected, cond, memorder) do {\
> >         typeof (*(addr)) val = __atomic_load_n((addr), (memorder)); \
> >         if ((val & (typeof(val))(mask)) cond (typeof(val))(expected)) \
> >                 break; \
> >         rte_pause(); \
> > } while (1);
> For this point, I think it is because different sizes need different assembly instructions
> on the Arm architecture. For example,
> the 16-bit load instruction is "ldxrh %w[tmp], [%x[addr]]",
> the 32-bit load instruction is "ldxr %w[tmp], [%x[addr]]",
> the 64-bit load instruction is "ldxr %x[tmp], [%x[addr]]".

Ok, but it could be then something like that for arm specific code:
if (sizeof(val) == sizeof(uint16_t)) \
	__LOAD_EXC_16(...); \
else if (sizeof(val) == sizeof(uint32_t)) \	
	__LOAD_EXC_32(...); \
else if (sizeof(val) == sizeof(uint64_t)) \
	__LOAD_EXC_64(...); \
...
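
i.e. one dispatching helper on top of the per-size macros, roughly like this
(a sketch, reusing the __LOAD_EXC_* names from the patch):

#define __LOAD_EXC(src, dst, memorder) do {                 \
	if (sizeof(dst) == sizeof(uint16_t))                \
		__LOAD_EXC_16((src), (dst), (memorder))     \
	else if (sizeof(dst) == sizeof(uint32_t))           \
		__LOAD_EXC_32((src), (dst), (memorder))     \
	else                                                \
		__LOAD_EXC_64((src), (dst), (memorder))     \
} while (0)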

> And for consistency, we also use 3 APIs in generic path.
Honestly, even one multi-line macro doesn't look nice.
Having 3 identical ones looks even worse.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait scheme
  2021-10-13 15:03         ` [dpdk-dev] " Ananyev, Konstantin
@ 2021-10-13 17:00           ` Stephen Hemminger
  2021-10-14  3:14             ` [dpdk-dev] Re: " Feifei Wang
  2021-10-14  3:08           ` Feifei Wang
  1 sibling, 1 reply; 42+ messages in thread
From: Stephen Hemminger @ 2021-10-13 17:00 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: Feifei Wang, Ruifeng Wang, dev, nd

On Wed, 13 Oct 2021 15:03:56 +0000
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:

> > addr value' can be before other operations.
> >   
> > > BTW, if we expect memorder to always be a constant, might be better
> > > BUILD_BUG_ON()?  
> > If I understand correctly, you mean we can replace 'assert' by 'build_bug_on':
> > RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder !=__ATOMIC_RELAXED);  
> 
> Yes, that was my thought.
> In that case I think we should be able to catch wrong memorder at compilation stage.

Maybe:
   RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));
   RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder != __ATOMIC_RELAXED);
 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] Re: [RFC PATCH v3 1/5] eal: add new definitions for wait scheme
  2021-10-13 15:03         ` [dpdk-dev] " Ananyev, Konstantin
  2021-10-13 17:00           ` Stephen Hemminger
@ 2021-10-14  3:08           ` Feifei Wang
  1 sibling, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-14  3:08 UTC (permalink / raw)
  To: Ananyev, Konstantin, Ruifeng Wang; +Cc: dev, nd, nd, nd

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Wednesday, October 13, 2021 11:04 PM
> To: Feifei Wang <Feifei.Wang2@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait
> scheme
> 
> >
> > [snip]
> >
> > > > diff --git a/lib/eal/include/generic/rte_pause.h
> > > > b/lib/eal/include/generic/rte_pause.h
> > > > index 668ee4a184..4e32107eca 100644
> > > > --- a/lib/eal/include/generic/rte_pause.h
> > > > +++ b/lib/eal/include/generic/rte_pause.h
> > > > @@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t
> > > > *addr,
> > > uint64_t expected,
> > > >  	while (__atomic_load_n(addr, memorder) != expected)
> > > >  		rte_pause();
> > > >  }
> > > > +
> > > > +/*
> > > > + * Wait until a 16-bit *addr breaks the condition, with a relaxed
> > > > +memory
> > > > + * ordering model meaning the loads around this API can be reordered.
> > > > + *
> > > > + * @param addr
> > > > + *  A pointer to the memory location.
> > > > + * @param mask
> > > > + *  A mask of value bits in interest
> > > > + * @param expected
> > > > + *  A 16-bit expected value to be in the memory location.
> > > > + * @param cond
> > > > + *  A symbol representing the condition (==, !=).
> > > > + * @param memorder
> > > > + *  Two different memory orders that can be specified:
> > > > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > > > + *  C++11 memory orders with the same names, see the C++11
> > > > +standard or
> > > > + *  the GCC wiki on atomic synchronization for detailed definition.
> > > > + */
> > >
> > > Hmm, so now we have 2 APIs doing similar thing:
> > > rte_wait_until_equal_n() and rte_wait_event_n().
> > > Can we probably unite them somehow?
> > > At least make rte_wait_until_equal_n() to use rte_wait_event_n()
> underneath.
> > >
> > You are right. We plan to change the rte_wait_until_equal API after this
> > new scheme is in place. And then, we will merge wait_until into the
> > wait_event definition in the next patch series.
> >
> > > > +#define rte_wait_event_16(addr, mask, expected, cond, memorder)
> > > 		       \
> > > > +do {
> 	       \
> > > > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > > > +__ATOMIC_RELAXED);  \
> > >
> > > And why user is not allowed to use __ATOMIC_SEQ_CST here?
> > Actually this is just a load operation, and acquire ordering is enough to
> > make sure the load of the addr value is ordered before the operations that follow it.
> >
> > > BTW, if we expect memorder to always be a constant, might be better
> > > BUILD_BUG_ON()?
> > If I understand correctly, you mean we can replace 'assert' by
> 'build_bug_on':
> > RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder
> > !=__ATOMIC_RELAXED);
> 
> Yes, that was my thought.
> In that case I think we should be able to catch wrong memorder at compilation
> stage.
> 
> >
> > >
> > > > +									       \
> > > > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> > > 	       \
> > > > +		rte_pause();						       \
> > > > +} while (0)
> > >
> > > Two thoughts with these macros:
> > > 1. It is good practice to put () around macro parameters in the macro
> body.
> > > It will save you from a lot of unexpected trouble.
> > > 2. I think these 3 macros can be united into one.
> > > Something like:
> > >
> > > #define rte_wait_event(addr, mask, expected, cond, memorder) do {\
> > >         typeof (*(addr)) val = __atomic_load_n((addr), (memorder)); \
> > >         if ((val & (typeof(val))(mask)) cond (typeof(val))(expected)) \
> > >                 break; \
> > >         rte_pause(); \
> > > } while (1);
> > For this point, I think it is because different sizes need different
> > assembly instructions on the Arm architecture. For example, the load
> > 16 bits instruction is "ldxrh %w[tmp], [%x[addr]"
> > load 32 bits instruction is " ldxr %w[tmp], [%x[addr]"
> > load 64 bits instruction is " ldxr %x[tmp], [%x[addr] "
> 
> Ok, but it could be then something like that for arm specific code:
> if (sizeof(val) == sizeof(uint16_t)) \
> 	__LOAD_EXC_16(...); \
> else if (sizeof(val) == sizeof(uint32_t)) \
> 	__LOAD_EXC_32(...); \
> else if (sizeof(val) == sizeof(uint64_t)) \
> 	__LOAD_EXC_64(...); \
> ...
> 
I think we should use "addr" to make the decision:

rte_wait_event(addr, mask, expected, cond, memorder)
if (sizeof(*(addr)) == sizeof(uint16_t))
	uint16_t value;                                                        \
	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
	__LOAD_EXC_16(addr, value, memorder)				       \
	if ((value & mask) cond expected) {				       \
		__SEVL()						       \
		do {							       \
			__WFE()						       \
			__LOAD_EXC_16(addr, value, memorder)		       \
		} while ((value & mask) cond expected);			       \
	}
if (sizeof(*(addr)) == sizeof(uint32_t))
	..........
if (sizeof(*(addr)) == sizeof(uint64_t))
	...........

> > And for consistency, we also use 3 APIs in generic path.
> Honestly, even one multi-line macro doesn't look nice.
> Having 3 identical ones looks even worse.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] Re: [RFC PATCH v3 1/5] eal: add new definitions for wait scheme
  2021-10-13 17:00           ` Stephen Hemminger
@ 2021-10-14  3:14             ` Feifei Wang
  0 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-14  3:14 UTC (permalink / raw)
  To: Stephen Hemminger, Ananyev, Konstantin; +Cc: Ruifeng Wang, dev, nd, nd



> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Thursday, October 14, 2021 1:00 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: Feifei Wang <Feifei.Wang2@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait
> scheme
> 
> On Wed, 13 Oct 2021 15:03:56 +0000
> "Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:
> 
> > > addr value' can be before other operations.
> > >
> > > > BTW, if we expect memorder to always be a constant, might be
> > > > better BUILD_BUG_ON()?
> > > If I understand correctly, you mean we can replace 'assert' by
> 'build_bug_on':
> > > RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder
> > > !=__ATOMIC_RELAXED);
> >
> > Yes, that was my thought.
> > In that case I think we should be able to catch wrong memorder at
> compilation stage.
> 
> Maybe:
>    RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));
>    RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&
> memorder !=__ATOMIC_RELAXED);
> 
Thanks for your comments. One question on this: I do not understand why we should check whether memorder is a constant.
Is it to check whether memorder has been assigned or is NULL?
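A minimal sketch of the difference being discussed, assuming the checks proposed
for this patch (the CHECK_* names below are only for illustration):

#include <assert.h>
#include <rte_common.h>	/* RTE_BUILD_BUG_ON */

/* Checked only when the code actually runs: */
#define CHECK_MEMORDER_RUNTIME(memorder)                            \
	assert((memorder) == __ATOMIC_ACQUIRE ||                    \
	       (memorder) == __ATOMIC_RELAXED)

/*
 * Checked while compiling. This only works when 'memorder' is a
 * compile-time constant, which is what __builtin_constant_p()
 * verifies before the value itself is checked.
 */
#define CHECK_MEMORDER_BUILDTIME(memorder) do {                     \
	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));          \
	RTE_BUILD_BUG_ON((memorder) != __ATOMIC_ACQUIRE &&          \
			 (memorder) != __ATOMIC_RELAXED);           \
} while (0)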

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] Re: [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
  2021-10-07 17:40       ` Ananyev, Konstantin
@ 2021-10-20  6:20         ` Feifei Wang
  0 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-20  6:20 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev, nd, Ruifeng Wang, nd

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Ananyev, Konstantin
> Sent: Friday, October 8, 2021 1:40 AM
> To: Feifei Wang <Feifei.Wang2@arm.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Subject: Re: [dpdk-dev] [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for
> Rx/Tx iteration
> 
> 
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Thursday, October 7, 2021 4:50 PM
> > To: Feifei Wang <feifei.wang2@arm.com>
> > Cc: dev@dpdk.org; nd@arm.com; Ruifeng Wang <ruifeng.wang@arm.com>
> > Subject: RE: [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for
> > Rx/Tx iteration
> >
> >
> >
> > >
> > > Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > ---
> > >  lib/bpf/bpf_pkt.c | 9 +++------
> > >  1 file changed, 3 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c index
> > > 6e8248f0d6..3af15ae97b 100644
> > > --- a/lib/bpf/bpf_pkt.c
> > > +++ b/lib/bpf/bpf_pkt.c
> > > @@ -113,7 +113,7 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
> > > static void  bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)  {
> > > -	uint32_t nuse, puse;
> > > +	uint32_t puse;
> > >
> > >  	/* make sure all previous loads and stores are completed */
> > >  	rte_smp_mb();
> > > @@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
> > >
> > >  	/* in use, busy wait till current RX/TX iteration is finished */
> > >  	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
> > > -		do {
> > > -			rte_pause();
> > > -			rte_compiler_barrier();
> > > -			nuse = cbi->use;
> > > -		} while (nuse == puse);
> > > +		rte_compiler_barrier();
> > > +		rte_wait_event_32(&cbi->use, UINT_MAX, puse, ==,
> > > +__ATOMIC_RELAXED);
> 
> Probably UINT32_MAX will be a bit better here.
That's right, UINT32_MAX is more suitable.
> 
> >
> > If we do use an atomic load, do we still need a compiler_barrier() here?
Yes, the compiler_barrier can be removed here since the atomic load already forces the value to be re-read from memory on every iteration.
> >
> > >  	}
> > >  }
> > >
> > > --
> > > 2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme
  2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
                     ` (4 preceding siblings ...)
  2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 5/5] lib/distributor: use wait event scheme Feifei Wang
@ 2021-10-20  8:45   ` Feifei Wang
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 1/5] eal: " Feifei Wang
                       ` (4 more replies)
  5 siblings, 5 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-20  8:45 UTC (permalink / raw)
  Cc: konstantin.ananyev, dev, nd, Feifei Wang

Add new definitions for the wait scheme, and apply these new definitions in
lib to replace rte_pause.

v2:
1. use macro to create new wait scheme (Stephen)

v3:
1. delete unnecessary bug fix in bpf (Konstantin)

v4:
1. put size into the macro body (Konstantin)
2. replace assert with BUILD_BUG_ON (Stephen)
3. delete unnecessary compiler barrier for bpf (Konstantin)

Feifei Wang (5):
  eal: add new definitions for wait scheme
  eal: use wait event for read pflock
  eal: use wait event scheme for mcslock
  lib/bpf: use wait event scheme for Rx/Tx iteration
  lib/distributor: use wait event scheme

 lib/bpf/bpf_pkt.c                        |   9 +-
 lib/distributor/rte_distributor_single.c |  10 +-
 lib/eal/arm/include/rte_pause_64.h       | 126 +++++++++++++----------
 lib/eal/include/generic/rte_mcslock.h    |   9 +-
 lib/eal/include/generic/rte_pause.h      |  32 ++++++
 lib/eal/include/generic/rte_pflock.h     |   4 +-
 6 files changed, 119 insertions(+), 71 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v4 1/5] eal: add new definitions for wait scheme
  2021-10-20  8:45   ` [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme Feifei Wang
@ 2021-10-20  8:45     ` Feifei Wang
  2021-10-21 16:24       ` Ananyev, Konstantin
  2021-10-22  0:10       ` [dpdk-dev] " Jerin Jacob
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 2/5] eal: use wait event for read pflock Feifei Wang
                       ` (3 subsequent siblings)
  4 siblings, 2 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-20  8:45 UTC (permalink / raw)
  To: Ruifeng Wang; +Cc: konstantin.ananyev, dev, nd, Feifei Wang

Introduce macros as generic interface for address monitoring.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/arm/include/rte_pause_64.h  | 126 ++++++++++++++++------------
 lib/eal/include/generic/rte_pause.h |  32 +++++++
 2 files changed, 104 insertions(+), 54 deletions(-)

diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
index e87d10b8cc..23954c2de2 100644
--- a/lib/eal/arm/include/rte_pause_64.h
+++ b/lib/eal/arm/include/rte_pause_64.h
@@ -31,20 +31,12 @@ static inline void rte_pause(void)
 /* Put processor into low power WFE(Wait For Event) state. */
 #define __WFE() { asm volatile("wfe" : : : "memory"); }
 
-static __rte_always_inline void
-rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
-		int memorder)
-{
-	uint16_t value;
-
-	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
-
-	/*
-	 * Atomic exclusive load from addr, it returns the 16-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
+/*
+ * Atomic exclusive load from addr, it returns the 16-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and a event is generated
+ * implicitly to exit WFE.
+ */
 #define __LOAD_EXC_16(src, dst, memorder) {               \
 	if (memorder == __ATOMIC_RELAXED) {               \
 		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
@@ -58,6 +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			: "memory");                      \
 	} }
 
+/*
+ * Atomic exclusive load from addr, it returns the 32-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and a event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_32(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+/*
+ * Atomic exclusive load from addr, it returns the 64-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and a event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_64(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+		int memorder)
+{
+	uint16_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
 	__LOAD_EXC_16(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			__LOAD_EXC_16(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_16
 }
 
 static __rte_always_inline void
@@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 32-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_32(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_32(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 			__LOAD_EXC_32(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_32
 }
 
 static __rte_always_inline void
@@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 64-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_64(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_64(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -143,6 +141,26 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 		} while (value != expected);
 	}
 }
+
+#define rte_wait_event(addr, mask, expected, cond, memorder, size) \
+do {                                                               \
+	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));         \
+	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&           \
+	memorder != __ATOMIC_RELAXED);                             \
+	RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);  \
+	uint##size_t value;                                        \
+	__LOAD_EXC_##size(addr, value, memorder)                   \
+	if ((value & mask) cond expected) {		           \
+		__SEVL()                                           \
+		do {                                               \
+			__WFE()                                    \
+			__LOAD_EXC_##size(addr, value, memorder)   \
+		} while ((value & mask) cond expected);            \
+	}                                                          \
+} while (0)
+
+#undef __LOAD_EXC_16
+#undef __LOAD_EXC_32
 #undef __LOAD_EXC_64
 
 #undef __SEVL
diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
index 668ee4a184..20a5d2a9fd 100644
--- a/lib/eal/include/generic/rte_pause.h
+++ b/lib/eal/include/generic/rte_pause.h
@@ -111,6 +111,38 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 	while (__atomic_load_n(addr, memorder) != expected)
 		rte_pause();
 }
+
+/*
+ * Wait until *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest.
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ * @param size
+ * The bit size of *addr:
+ * It is used for arm architecture to choose load instructions,
+ * and the optional value is 16, 32 and 64.
+ */
+#define rte_wait_event(addr, mask, expected, cond, memorder, size)     \
+do {                                                                   \
+	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));             \
+	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&               \
+				memorder != __ATOMIC_RELAXED);         \
+	RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);      \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected) \
+		rte_pause();                                           \
+} while (0)
 #endif
 
 #endif /* _RTE_PAUSE_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v4 2/5] eal: use wait event for read pflock
  2021-10-20  8:45   ` [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme Feifei Wang
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 1/5] eal: " Feifei Wang
@ 2021-10-20  8:45     ` Feifei Wang
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 3/5] eal: use wait event scheme for mcslock Feifei Wang
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-20  8:45 UTC (permalink / raw)
  Cc: konstantin.ananyev, dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for read pflock update, use wait event scheme for
this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/include/generic/rte_pflock.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/eal/include/generic/rte_pflock.h b/lib/eal/include/generic/rte_pflock.h
index e57c179ef2..c1c230d131 100644
--- a/lib/eal/include/generic/rte_pflock.h
+++ b/lib/eal/include/generic/rte_pflock.h
@@ -121,9 +121,7 @@ rte_pflock_read_lock(rte_pflock_t *pf)
 		return;
 
 	/* Wait for current write phase to complete. */
-	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE)
-		& RTE_PFLOCK_WBITS) == w)
-		rte_pause();
+	rte_wait_event(&pf->rd.in, RTE_PFLOCK_WBITS, w, ==, __ATOMIC_ACQUIRE, 16);
 }
 
 /**
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v4 3/5] eal: use wait event scheme for mcslock
  2021-10-20  8:45   ` [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme Feifei Wang
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 1/5] eal: " Feifei Wang
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 2/5] eal: use wait event for read pflock Feifei Wang
@ 2021-10-20  8:45     ` Feifei Wang
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 5/5] lib/distributor: use wait event scheme Feifei Wang
  4 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-20  8:45 UTC (permalink / raw)
  To: Honnappa Nagarahalli
  Cc: konstantin.ananyev, dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for mcslock to be updated, use wait event scheme
for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/include/generic/rte_mcslock.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/lib/eal/include/generic/rte_mcslock.h b/lib/eal/include/generic/rte_mcslock.h
index 34f33c64a5..08137c361b 100644
--- a/lib/eal/include/generic/rte_mcslock.h
+++ b/lib/eal/include/generic/rte_mcslock.h
@@ -116,8 +116,13 @@ rte_mcslock_unlock(rte_mcslock_t **msl, rte_mcslock_t *me)
 		/* More nodes added to the queue by other CPUs.
 		 * Wait until the next pointer is set.
 		 */
-		while (__atomic_load_n(&me->next, __ATOMIC_RELAXED) == NULL)
-			rte_pause();
+#ifdef RTE_ARCH_32
+		rte_wait_event((uint32_t *)&me->next, UINT32_MAX, 0, ==,
+				__ATOMIC_RELAXED, 32);
+#else
+		rte_wait_event((uint64_t *)&me->next, UINT64_MAX, 0, ==,
+				__ATOMIC_RELAXED, 64);
+#endif
 	}
 
 	/* Pass lock to next waiter. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v4 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration
  2021-10-20  8:45   ` [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme Feifei Wang
                       ` (2 preceding siblings ...)
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 3/5] eal: use wait event scheme for mcslock Feifei Wang
@ 2021-10-20  8:45     ` Feifei Wang
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 5/5] lib/distributor: use wait event scheme Feifei Wang
  4 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-20  8:45 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for cbi->use to be updated, use wait event scheme.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/bpf/bpf_pkt.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
index 6e8248f0d6..00a5748061 100644
--- a/lib/bpf/bpf_pkt.c
+++ b/lib/bpf/bpf_pkt.c
@@ -113,7 +113,7 @@ bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi)
 static void
 bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
 {
-	uint32_t nuse, puse;
+	uint32_t puse;
 
 	/* make sure all previous loads and stores are completed */
 	rte_smp_mb();
@@ -122,11 +122,8 @@ bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi)
 
 	/* in use, busy wait till current RX/TX iteration is finished */
 	if ((puse & BPF_ETH_CBI_INUSE) != 0) {
-		do {
-			rte_pause();
-			rte_compiler_barrier();
-			nuse = cbi->use;
-		} while (nuse == puse);
+		rte_wait_event(&cbi->use, UINT32_MAX, puse, ==,
+				__ATOMIC_RELAXED, 32);
 	}
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] [PATCH v4 5/5] lib/distributor: use wait event scheme
  2021-10-20  8:45   ` [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme Feifei Wang
                       ` (3 preceding siblings ...)
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
@ 2021-10-20  8:45     ` Feifei Wang
  4 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-20  8:45 UTC (permalink / raw)
  To: David Hunt; +Cc: konstantin.ananyev, dev, nd, Feifei Wang, Ruifeng Wang

Instead of polling for bufptr64 to be updated, use
wait event for this case.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/distributor/rte_distributor_single.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/lib/distributor/rte_distributor_single.c b/lib/distributor/rte_distributor_single.c
index f4725b1d0b..c623bb135d 100644
--- a/lib/distributor/rte_distributor_single.c
+++ b/lib/distributor/rte_distributor_single.c
@@ -33,9 +33,8 @@ rte_distributor_request_pkt_single(struct rte_distributor_single *d,
 	union rte_distributor_buffer_single *buf = &d->bufs[worker_id];
 	int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
 			| RTE_DISTRIB_GET_BUF;
-	while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
-			& RTE_DISTRIB_FLAGS_MASK))
-		rte_pause();
+	rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK,
+			0, !=, __ATOMIC_RELAXED, 64);
 
 	/* Sync with distributor on GET_BUF flag. */
 	__atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
@@ -74,9 +73,8 @@ rte_distributor_return_pkt_single(struct rte_distributor_single *d,
 	union rte_distributor_buffer_single *buf = &d->bufs[worker_id];
 	uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS)
 			| RTE_DISTRIB_RETURN_BUF;
-	while (unlikely(__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED)
-			& RTE_DISTRIB_FLAGS_MASK))
-		rte_pause();
+	rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK,
+			0, !=, __ATOMIC_RELAXED, 64);
 
 	/* Sync with distributor on RETURN_BUF flag. */
 	__atomic_store_n(&(buf->bufptr64), req, __ATOMIC_RELEASE);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/5] eal: add new definitions for wait scheme
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 1/5] eal: " Feifei Wang
@ 2021-10-21 16:24       ` Ananyev, Konstantin
  2021-10-25  9:20         ` [dpdk-dev] Re: " Feifei Wang
  2021-10-22  0:10       ` [dpdk-dev] " Jerin Jacob
  1 sibling, 1 reply; 42+ messages in thread
From: Ananyev, Konstantin @ 2021-10-21 16:24 UTC (permalink / raw)
  To: Feifei Wang, Ruifeng Wang; +Cc: dev, nd

> Introduce macros as generic interface for address monitoring.
> 
> Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/eal/arm/include/rte_pause_64.h  | 126 ++++++++++++++++------------
>  lib/eal/include/generic/rte_pause.h |  32 +++++++
>  2 files changed, 104 insertions(+), 54 deletions(-)
> 
> diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
> index e87d10b8cc..23954c2de2 100644
> --- a/lib/eal/arm/include/rte_pause_64.h
> +++ b/lib/eal/arm/include/rte_pause_64.h
> @@ -31,20 +31,12 @@ static inline void rte_pause(void)
>  /* Put processor into low power WFE(Wait For Event) state. */
>  #define __WFE() { asm volatile("wfe" : : : "memory"); }
> 
> -static __rte_always_inline void
> -rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> -		int memorder)
> -{
> -	uint16_t value;
> -
> -	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> -
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 16-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> +/*
> + * Atomic exclusive load from addr, it returns the 16-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
>  #define __LOAD_EXC_16(src, dst, memorder) {               \
>  	if (memorder == __ATOMIC_RELAXED) {               \
>  		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
> @@ -58,6 +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
>  			: "memory");                      \
>  	} }
> 
> +/*
> + * Atomic exclusive load from addr, it returns the 32-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
> +#define __LOAD_EXC_32(src, dst, memorder) {              \
> +	if (memorder == __ATOMIC_RELAXED) {              \
> +		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} else {                                         \
> +		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} }
> +
> +/*
> + * Atomic exclusive load from addr, it returns the 64-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
> +#define __LOAD_EXC_64(src, dst, memorder) {              \
> +	if (memorder == __ATOMIC_RELAXED) {              \
> +		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} else {                                         \
> +		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> +		int memorder)
> +{
> +	uint16_t value;
> +
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> +
>  	__LOAD_EXC_16(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
>  			__LOAD_EXC_16(addr, value, memorder)
>  		} while (value != expected);
>  	}
> -#undef __LOAD_EXC_16
>  }
> 
>  static __rte_always_inline void
> @@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> 
>  	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> 
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 32-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> -#define __LOAD_EXC_32(src, dst, memorder) {              \
> -	if (memorder == __ATOMIC_RELAXED) {              \
> -		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} else {                                         \
> -		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} }
> -
>  	__LOAD_EXC_32(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
>  			__LOAD_EXC_32(addr, value, memorder)
>  		} while (value != expected);
>  	}
> -#undef __LOAD_EXC_32
>  }
> 
>  static __rte_always_inline void
> @@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> 
>  	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> 
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 64-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> -#define __LOAD_EXC_64(src, dst, memorder) {              \
> -	if (memorder == __ATOMIC_RELAXED) {              \
> -		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} else {                                         \
> -		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} }
> -
>  	__LOAD_EXC_64(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -143,6 +141,26 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>  		} while (value != expected);
>  	}
>  }
> +
> +#define rte_wait_event(addr, mask, expected, cond, memorder, size) \
> +do {                                                               \
> +	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));         \
> +	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&           \
> +	memorder != __ATOMIC_RELAXED);                             \
> +	RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);  \
> +	uint##size_t value;                                        \
> +	__LOAD_EXC_##size(addr, value, memorder)                   \
> +	if ((value & mask) cond expected) {		           \
> +		__SEVL()                                           \
> +		do {                                               \
> +			__WFE()                                    \
> +			__LOAD_EXC_##size(addr, value, memorder)   \
> +		} while ((value & mask) cond expected);            \
> +	}                                                          \
> +} while (0)
> +
> +#undef __LOAD_EXC_16
> +#undef __LOAD_EXC_32
>  #undef __LOAD_EXC_64
> 
>  #undef __SEVL
> diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
> index 668ee4a184..20a5d2a9fd 100644
> --- a/lib/eal/include/generic/rte_pause.h
> +++ b/lib/eal/include/generic/rte_pause.h
> @@ -111,6 +111,38 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>  	while (__atomic_load_n(addr, memorder) != expected)
>  		rte_pause();
>  }
> +
> +/*
> + * Wait until *addr breaks the condition, with a relaxed memory
> + * ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param mask
> + *  A mask of value bits in interest.
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + * @param cond
> + *  A symbol representing the condition (==, !=).
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + * @param size
> + * The bit size of *addr:
> + * It is used for arm architecture to choose load instructions,
> + * and the optional value is 16, 32 and 64.
> + */
> +#define rte_wait_event(addr, mask, expected, cond, memorder, size)     \
> +do {                                                                   \
> +	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));             \
> +	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&               \
> +				memorder != __ATOMIC_RELAXED);         \
> +	RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);      \

I don't really understand why you need 'size' passed as a parameter.
Can't it be:
size_t size = sizeof(*(addr));
And then:
RTE_BUILD_BUG_ON(size != sizeof(uint16_t) && size != sizeof(uint32_t) && size != sizeof(uint64_t));  
?
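For the generic path that could end up looking roughly like this (a sketch only;
the _sketch suffix is not a proposed name):

#define rte_wait_event_sketch(addr, mask, expected, cond, memorder)   \
do {                                                                  \
	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));            \
	RTE_BUILD_BUG_ON((memorder) != __ATOMIC_ACQUIRE &&            \
			 (memorder) != __ATOMIC_RELAXED);             \
	RTE_BUILD_BUG_ON(sizeof(*(addr)) != sizeof(uint16_t) &&       \
			 sizeof(*(addr)) != sizeof(uint32_t) &&       \
			 sizeof(*(addr)) != sizeof(uint64_t));        \
	while ((__atomic_load_n((addr), (memorder)) & (mask))         \
			cond (expected))                              \
		rte_pause();                                          \
} while (0)

The width check is derived from the pointer type, so callers no longer pass
16/32/64 explicitly, and the parameters are parenthesized as suggested below.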

> +	while ((__atomic_load_n(addr, memorder) & mask) cond expected) \
> +		rte_pause();                                           \

Just to repeat my own comment from the previous version review:
put () around macro parameters in the macro body.
It will save you from a lot of unexpected trouble.
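A small example of the kind of trouble meant here (names are illustrative only):

#define MASKED_BAD(v, mask)  (v & mask)      /* parameters not parenthesized */
#define MASKED_OK(v, mask)   ((v) & (mask))  /* parameters parenthesized     */

/*
 * MASKED_BAD(x | y, 0xff) expands to (x | y & 0xff); because '&' binds
 * tighter than '|', the mask is applied to y only.
 * MASKED_OK(x | y, 0xff) expands to ((x | y) & (0xff)) as intended.
 */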

> +} while (0)
>  #endif
> 
>  #endif /* _RTE_PAUSE_H_ */
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/5] eal: add new definitions for wait scheme
  2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 1/5] eal: " Feifei Wang
  2021-10-21 16:24       ` Ananyev, Konstantin
@ 2021-10-22  0:10       ` Jerin Jacob
  2021-10-25  9:30         ` [dpdk-dev] Re: " Feifei Wang
  1 sibling, 1 reply; 42+ messages in thread
From: Jerin Jacob @ 2021-10-22  0:10 UTC (permalink / raw)
  To: Feifei Wang; +Cc: Ruifeng Wang, Ananyev, Konstantin, dpdk-dev, nd

On Wed, Oct 20, 2021 at 2:16 PM Feifei Wang <feifei.wang2@arm.com> wrote:
>
> Introduce macros as generic interface for address monitoring.
>
> Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/eal/arm/include/rte_pause_64.h  | 126 ++++++++++++++++------------
>  lib/eal/include/generic/rte_pause.h |  32 +++++++
>  2 files changed, 104 insertions(+), 54 deletions(-)
>
> diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
> index e87d10b8cc..23954c2de2 100644
> --- a/lib/eal/arm/include/rte_pause_64.h
> +++ b/lib/eal/arm/include/rte_pause_64.h
> @@ -31,20 +31,12 @@ static inline void rte_pause(void)
>  /* Put processor into low power WFE(Wait For Event) state. */
>  #define __WFE() { asm volatile("wfe" : : : "memory"); }
>
> -static __rte_always_inline void
> -rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> -               int memorder)
> -{
> -       uint16_t value;
> -
> -       assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> -
> -       /*
> -        * Atomic exclusive load from addr, it returns the 16-bit content of
> -        * *addr while making it 'monitored',when it is written by someone
> -        * else, the 'monitored' state is cleared and a event is generated

a event -> an event in all occurrences.

> -        * implicitly to exit WFE.
> -        */
> +/*
> + * Atomic exclusive load from addr, it returns the 16-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
>  #define __LOAD_EXC_16(src, dst, memorder) {               \
>         if (memorder == __ATOMIC_RELAXED) {               \
>                 asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
> @@ -58,6 +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
>                         : "memory");                      \
>         } }
>
> +/*
> + * Atomic exclusive load from addr, it returns the 32-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
> +#define __LOAD_EXC_32(src, dst, memorder) {              \
> +       if (memorder == __ATOMIC_RELAXED) {              \
> +               asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> +                       : [tmp] "=&r" (dst)              \
> +                       : [addr] "r"(src)                \
> +                       : "memory");                     \
> +       } else {                                         \
> +               asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> +                       : [tmp] "=&r" (dst)              \
> +                       : [addr] "r"(src)                \
> +                       : "memory");                     \
> +       } }
> +
> +/*
> + * Atomic exclusive load from addr, it returns the 64-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and a event is generated
> + * implicitly to exit WFE.
> + */
> +#define __LOAD_EXC_64(src, dst, memorder) {              \
> +       if (memorder == __ATOMIC_RELAXED) {              \
> +               asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> +                       : [tmp] "=&r" (dst)              \
> +                       : [addr] "r"(src)                \
> +                       : "memory");                     \
> +       } else {                                         \
> +               asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> +                       : [tmp] "=&r" (dst)              \
> +                       : [addr] "r"(src)                \
> +                       : "memory");                     \
> +       } }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> +               int memorder)
> +{
> +       uint16_t value;
> +
> +       assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> +
>         __LOAD_EXC_16(addr, value, memorder)
>         if (value != expected) {
>                 __SEVL()
> @@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
>                         __LOAD_EXC_16(addr, value, memorder)
>                 } while (value != expected);
>         }
> -#undef __LOAD_EXC_16
>  }
>
>  static __rte_always_inline void
> @@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
>
>         assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
>
> -       /*
> -        * Atomic exclusive load from addr, it returns the 32-bit content of
> -        * *addr while making it 'monitored',when it is written by someone
> -        * else, the 'monitored' state is cleared and a event is generated
> -        * implicitly to exit WFE.
> -        */
> -#define __LOAD_EXC_32(src, dst, memorder) {              \
> -       if (memorder == __ATOMIC_RELAXED) {              \
> -               asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> -                       : [tmp] "=&r" (dst)              \
> -                       : [addr] "r"(src)                \
> -                       : "memory");                     \
> -       } else {                                         \
> -               asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> -                       : [tmp] "=&r" (dst)              \
> -                       : [addr] "r"(src)                \
> -                       : "memory");                     \
> -       } }
> -
>         __LOAD_EXC_32(addr, value, memorder)
>         if (value != expected) {
>                 __SEVL()
> @@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
>                         __LOAD_EXC_32(addr, value, memorder)
>                 } while (value != expected);
>         }
> -#undef __LOAD_EXC_32
>  }
>
>  static __rte_always_inline void
> @@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>
>         assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
>
> -       /*
> -        * Atomic exclusive load from addr, it returns the 64-bit content of
> -        * *addr while making it 'monitored',when it is written by someone
> -        * else, the 'monitored' state is cleared and a event is generated
> -        * implicitly to exit WFE.
> -        */
> -#define __LOAD_EXC_64(src, dst, memorder) {              \
> -       if (memorder == __ATOMIC_RELAXED) {              \
> -               asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> -                       : [tmp] "=&r" (dst)              \
> -                       : [addr] "r"(src)                \
> -                       : "memory");                     \
> -       } else {                                         \
> -               asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> -                       : [tmp] "=&r" (dst)              \
> -                       : [addr] "r"(src)                \
> -                       : "memory");                     \
> -       } }
> -
>         __LOAD_EXC_64(addr, value, memorder)
>         if (value != expected) {
>                 __SEVL()
> @@ -143,6 +141,26 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>                 } while (value != expected);
>         }
>  }
> +
> +#define rte_wait_event(addr, mask, expected, cond, memorder, size) \

I think it is better to swap "cond" and "expected" positions to get
better readability.

 rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK, 0, !=,
__ATOMIC_RELAXED, 64);

Vs

 rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK, !=, 0,
__ATOMIC_RELAXED, 64);

> +do {                                                               \

Any reason not to make this an inline function instead of a macro?

> +       RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));         \

Shouldn't we add a __builtin_constant_p(size) check as well?
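Presumably something along the lines of the existing memorder check, e.g.:

	RTE_BUILD_BUG_ON(!__builtin_constant_p(size));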

> +       RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&           \
> +       memorder != __ATOMIC_RELAXED);                             \
> +       RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);  \
> +       uint##size_t value;                                        \
> +       __LOAD_EXC_##size(addr, value, memorder)                   \
> +       if ((value & mask) cond expected) {                        \
> +               __SEVL()                                           \
> +               do {                                               \
> +                       __WFE()                                    \
> +                       __LOAD_EXC_##size(addr, value, memorder)   \
> +               } while ((value & mask) cond expected);            \
> +       }                                                          \
> +} while (0)
> +
> +#undef __LOAD_EXC_16
> +#undef __LOAD_EXC_32
>  #undef __LOAD_EXC_64
>
>  #undef __SEVL
> diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
> index 668ee4a184..20a5d2a9fd 100644
> --- a/lib/eal/include/generic/rte_pause.h
> +++ b/lib/eal/include/generic/rte_pause.h
> @@ -111,6 +111,38 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>         while (__atomic_load_n(addr, memorder) != expected)
>                 rte_pause();
>  }
> +
> +/*
> + * Wait until *addr breaks the condition, with a relaxed memory
> + * ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param mask
> + *  A mask of value bits in interest.
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + * @param cond
> + *  A symbol representing the condition (==, !=).
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + * @param size
> + * The bit size of *addr:
> + * It is used for arm architecture to choose load instructions,
> + * and the optional value is 16, 32 and 64.
> + */
> +#define rte_wait_event(addr, mask, expected, cond, memorder, size)     \
> +do {                                                                   \
> +       RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));             \
> +       RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&               \
> +                               memorder != __ATOMIC_RELAXED);         \
> +       RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);      \
> +       while ((__atomic_load_n(addr, memorder) & mask) cond expected) \
> +               rte_pause();                                           \
> +} while (0)
>  #endif
>
>  #endif /* _RTE_PAUSE_H_ */
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] Re: [PATCH v4 1/5] eal: add new definitions for wait scheme
  2021-10-21 16:24       ` Ananyev, Konstantin
@ 2021-10-25  9:20         ` Feifei Wang
  0 siblings, 0 replies; 42+ messages in thread
From: Feifei Wang @ 2021-10-25  9:20 UTC (permalink / raw)
  To: Ananyev, Konstantin, Ruifeng Wang; +Cc: dev, nd, nd



> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Friday, October 22, 2021 12:25 AM
> To: Feifei Wang <Feifei.Wang2@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>
> Subject: RE: [PATCH v4 1/5] eal: add new definitions for wait scheme
> 
> > Introduce macros as generic interface for address monitoring.
> >
> > Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > ---
> >  lib/eal/arm/include/rte_pause_64.h  | 126
> > ++++++++++++++++------------  lib/eal/include/generic/rte_pause.h |
> > 32 +++++++
> >  2 files changed, 104 insertions(+), 54 deletions(-)
> >
> > diff --git a/lib/eal/arm/include/rte_pause_64.h
> > b/lib/eal/arm/include/rte_pause_64.h
> > index e87d10b8cc..23954c2de2 100644
> > --- a/lib/eal/arm/include/rte_pause_64.h
> > +++ b/lib/eal/arm/include/rte_pause_64.h
> > @@ -31,20 +31,12 @@ static inline void rte_pause(void)
> >  /* Put processor into low power WFE(Wait For Event) state. */
> > #define __WFE() { asm volatile("wfe" : : : "memory"); }
> >
> > -static __rte_always_inline void
> > -rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > -		int memorder)
> > -{
> > -	uint16_t value;
> > -
> > -	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> __ATOMIC_RELAXED);
> > -
> > -	/*
> > -	 * Atomic exclusive load from addr, it returns the 16-bit content of
> > -	 * *addr while making it 'monitored',when it is written by someone
> > -	 * else, the 'monitored' state is cleared and a event is generated
> > -	 * implicitly to exit WFE.
> > -	 */
> > +/*
> > + * Atomic exclusive load from addr, it returns the 16-bit content of
> > + * *addr while making it 'monitored', when it is written by someone
> > + * else, the 'monitored' state is cleared and a event is generated
> > + * implicitly to exit WFE.
> > + */
> >  #define __LOAD_EXC_16(src, dst, memorder) {               \
> >  	if (memorder == __ATOMIC_RELAXED) {               \
> >  		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \ @@ -58,6 +50,52
> @@
> > rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> >  			: "memory");                      \
> >  	} }
> >
> > +/*
> > + * Atomic exclusive load from addr, it returns the 32-bit content of
> > + * *addr while making it 'monitored', when it is written by someone
> > + * else, the 'monitored' state is cleared and a event is generated
> > + * implicitly to exit WFE.
> > + */
> > +#define __LOAD_EXC_32(src, dst, memorder) {              \
> > +	if (memorder == __ATOMIC_RELAXED) {              \
> > +		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> > +			: [tmp] "=&r" (dst)              \
> > +			: [addr] "r"(src)                \
> > +			: "memory");                     \
> > +	} else {                                         \
> > +		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> > +			: [tmp] "=&r" (dst)              \
> > +			: [addr] "r"(src)                \
> > +			: "memory");                     \
> > +	} }
> > +
> > +/*
> > + * Atomic exclusive load from addr, it returns the 64-bit content of
> > + * *addr while making it 'monitored', when it is written by someone
> > + * else, the 'monitored' state is cleared and a event is generated
> > + * implicitly to exit WFE.
> > + */
> > +#define __LOAD_EXC_64(src, dst, memorder) {              \
> > +	if (memorder == __ATOMIC_RELAXED) {              \
> > +		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> > +			: [tmp] "=&r" (dst)              \
> > +			: [addr] "r"(src)                \
> > +			: "memory");                     \
> > +	} else {                                         \
> > +		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> > +			: [tmp] "=&r" (dst)              \
> > +			: [addr] "r"(src)                \
> > +			: "memory");                     \
> > +	} }
> > +
> > +static __rte_always_inline void
> > +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > +		int memorder)
> > +{
> > +	uint16_t value;
> > +
> > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > +__ATOMIC_RELAXED);
> > +
> >  	__LOAD_EXC_16(addr, value, memorder)
> >  	if (value != expected) {
> >  		__SEVL()
> > @@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr,
> uint16_t expected,
> >  			__LOAD_EXC_16(addr, value, memorder)
> >  		} while (value != expected);
> >  	}
> > -#undef __LOAD_EXC_16
> >  }
> >
> >  static __rte_always_inline void
> > @@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr,
> > uint32_t expected,
> >
> >  	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > __ATOMIC_RELAXED);
> >
> > -	/*
> > -	 * Atomic exclusive load from addr, it returns the 32-bit content of
> > -	 * *addr while making it 'monitored',when it is written by someone
> > -	 * else, the 'monitored' state is cleared and a event is generated
> > -	 * implicitly to exit WFE.
> > -	 */
> > -#define __LOAD_EXC_32(src, dst, memorder) {              \
> > -	if (memorder == __ATOMIC_RELAXED) {              \
> > -		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> > -			: [tmp] "=&r" (dst)              \
> > -			: [addr] "r"(src)                \
> > -			: "memory");                     \
> > -	} else {                                         \
> > -		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> > -			: [tmp] "=&r" (dst)              \
> > -			: [addr] "r"(src)                \
> > -			: "memory");                     \
> > -	} }
> > -
> >  	__LOAD_EXC_32(addr, value, memorder)
> >  	if (value != expected) {
> >  		__SEVL()
> > @@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr,
> uint32_t expected,
> >  			__LOAD_EXC_32(addr, value, memorder)
> >  		} while (value != expected);
> >  	}
> > -#undef __LOAD_EXC_32
> >  }
> >
> >  static __rte_always_inline void
> > @@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> > uint64_t expected,
> >
> >  	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > __ATOMIC_RELAXED);
> >
> > -	/*
> > -	 * Atomic exclusive load from addr, it returns the 64-bit content of
> > -	 * *addr while making it 'monitored',when it is written by someone
> > -	 * else, the 'monitored' state is cleared and a event is generated
> > -	 * implicitly to exit WFE.
> > -	 */
> > -#define __LOAD_EXC_64(src, dst, memorder) {              \
> > -	if (memorder == __ATOMIC_RELAXED) {              \
> > -		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> > -			: [tmp] "=&r" (dst)              \
> > -			: [addr] "r"(src)                \
> > -			: "memory");                     \
> > -	} else {                                         \
> > -		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> > -			: [tmp] "=&r" (dst)              \
> > -			: [addr] "r"(src)                \
> > -			: "memory");                     \
> > -	} }
> > -
> >  	__LOAD_EXC_64(addr, value, memorder)
> >  	if (value != expected) {
> >  		__SEVL()
> > @@ -143,6 +141,26 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> uint64_t expected,
> >  		} while (value != expected);
> >  	}
> >  }
> > +
> > +#define rte_wait_event(addr, mask, expected, cond, memorder, size) \
> > +do {                                                               \
> > +	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));         \
> > +	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&           \
> > +	memorder != __ATOMIC_RELAXED);                             \
> > +	RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);  \
> > +	uint##size_t value;                                        \
> > +	__LOAD_EXC_##size(addr, value, memorder)                   \
> > +	if ((value & mask) cond expected) {		           \
> > +		__SEVL()                                           \
> > +		do {                                               \
> > +			__WFE()                                    \
> > +			__LOAD_EXC_##size(addr, value, memorder)   \
> > +		} while ((value & mask) cond expected);            \
> > +	}                                                          \
> > +} while (0)
> > +
> > +#undef __LOAD_EXC_16
> > +#undef __LOAD_EXC_32
> >  #undef __LOAD_EXC_64
> >
> >  #undef __SEVL
> > diff --git a/lib/eal/include/generic/rte_pause.h
> > b/lib/eal/include/generic/rte_pause.h
> > index 668ee4a184..20a5d2a9fd 100644
> > --- a/lib/eal/include/generic/rte_pause.h
> > +++ b/lib/eal/include/generic/rte_pause.h
> > @@ -111,6 +111,38 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> uint64_t expected,
> >  	while (__atomic_load_n(addr, memorder) != expected)
> >  		rte_pause();
> >  }
> > +
> > +/*
> > + * Wait until *addr breaks the condition, with a relaxed memory
> > + * ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param mask
> > + *  A mask of value bits in interest.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + * @param cond
> > + *  A symbol representing the condition (==, !=).
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard
> > +or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + * @param size
> > + * The bit size of *addr:
> > + * It is used for arm architecture to choose load instructions,
> > + * and the optional value is 16, 32 and 64.
> > + */
> > +#define rte_wait_event(addr, mask, expected, cond, memorder, size)     \
> > +do {                                                                   \
> > +	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));             \
> > +	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&               \
> > +				memorder != __ATOMIC_RELAXED);         \
> > +	RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);      \
> 
> I don't really understand why you need 'size' passed as a parameter.
> Can't it be:
> size_t size = sizeof(*(addr));
> And then:
> RTE_BUILD_BUG_ON(size != sizeof(uint16_t) && size != sizeof(uint32_t) &&
> size != sizeof(uint64_t)); ?
> 
> > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected) \
> > +		rte_pause();                                           \
> 
> Just to repeat my own comment from the previous version's review:
> put () around macro parameters in the macro body.
> It will save you from a lot of unexpected trouble.

Sorry, I didn't catch the point.
In this version, I first wanted to use '__LOAD_EXC_##size' to select the load-exclusive macro,
so I used 'size' as a parameter. In the next version, I will update this as:

#define __LOAD_EXC(src, dst, memorder, size) {    \
	if (size == 16)                               \
		__LOAD_EXC_16(src, dst, memorder)     \
	else if (size == 32)                          \
		__LOAD_EXC_32(src, dst, memorder)     \
	else if (size == 64)                          \
		__LOAD_EXC_64(src, dst, memorder)     \
}

#define rte_wait_event(addr, mask, cond, expected, memorder)    \
do {                                                            \
	RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));      \
	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&        \
				memorder != __ATOMIC_RELAXED);  \
	uint32_t size = sizeof(*addr) << 3;                     \
	typeof(*addr) value = 0;                                \
	__LOAD_EXC(addr, value, memorder, size)                 \
	if ((value & mask) cond expected) {                     \
		__SEVL()                                        \
		do {                                            \
			__WFE()                                 \
			__LOAD_EXC(addr, value, memorder, size) \
		} while ((value & mask) cond expected);         \
	}                                                       \
} while (0)
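
As a quick usage sketch for this reworked signature (borrowing the distributor call site that is
quoted later in this thread; illustrative only, not necessarily the final call), the 'size'
argument disappears because sizeof(*addr) now selects the load width:

	/* Spin until none of the flag bits in bufptr64 are set
	 * (sketch based on the example call discussed in this thread). */
	rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK,
		       !=, 0, __ATOMIC_RELAXED);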

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [dpdk-dev] Re: [PATCH v4 1/5] eal: add new definitions for wait scheme
  2021-10-22  0:10       ` [dpdk-dev] " Jerin Jacob
@ 2021-10-25  9:30         ` Feifei Wang
  2021-10-25  9:43           ` [dpdk-dev] " Jerin Jacob
  0 siblings, 1 reply; 42+ messages in thread
From: Feifei Wang @ 2021-10-25  9:30 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: Ruifeng Wang, Ananyev, Konstantin, dpdk-dev, nd, nd

> -----Original Message-----
> From: Jerin Jacob <jerinjacobk@gmail.com>
> Sent: Friday, October 22, 2021 8:10 AM
> To: Feifei Wang <Feifei.Wang2@arm.com>
> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; dpdk-dev <dev@dpdk.org>; nd
> <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v4 1/5] eal: add new definitions for wait scheme
> 
> On Wed, Oct 20, 2021 at 2:16 PM Feifei Wang <feifei.wang2@arm.com>
> wrote:
> >
> > Introduce macros as generic interface for address monitoring.
> >
> > Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > ---
> >  lib/eal/arm/include/rte_pause_64.h  | 126
> > ++++++++++++++++------------  lib/eal/include/generic/rte_pause.h |
> > 32 +++++++
> >  2 files changed, 104 insertions(+), 54 deletions(-)
> >
> > diff --git a/lib/eal/arm/include/rte_pause_64.h
> > b/lib/eal/arm/include/rte_pause_64.h
> > index e87d10b8cc..23954c2de2 100644
> > --- a/lib/eal/arm/include/rte_pause_64.h
> > +++ b/lib/eal/arm/include/rte_pause_64.h
> > @@ -31,20 +31,12 @@ static inline void rte_pause(void)
> >  /* Put processor into low power WFE(Wait For Event) state. */
> > #define __WFE() { asm volatile("wfe" : : : "memory"); }
> >
> > -static __rte_always_inline void
> > -rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > -               int memorder)
> > -{
> > -       uint16_t value;
> > -
> > -       assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> __ATOMIC_RELAXED);
> > -
> > -       /*
> > -        * Atomic exclusive load from addr, it returns the 16-bit content of
> > -        * *addr while making it 'monitored',when it is written by someone
> > -        * else, the 'monitored' state is cleared and a event is generated
> 
> a event -> an event, in all occurrences.
> 
> > -        * implicitly to exit WFE.
> > -        */
> > +/*
> > + * Atomic exclusive load from addr, it returns the 16-bit content of
> > + * *addr while making it 'monitored', when it is written by someone
> > + * else, the 'monitored' state is cleared and a event is generated
> > + * implicitly to exit WFE.
> > + */
> >  #define __LOAD_EXC_16(src, dst, memorder) {               \
> >         if (memorder == __ATOMIC_RELAXED) {               \
> >                 asm volatile("ldxrh %w[tmp], [%x[addr]]"  \ @@ -58,6
> > +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t
> expected,
> >                         : "memory");                      \
> >         } }
> >
> > +/*
> > + * Atomic exclusive load from addr, it returns the 32-bit content of
> > + * *addr while making it 'monitored', when it is written by someone
> > + * else, the 'monitored' state is cleared and a event is generated
> > + * implicitly to exit WFE.
> > + */
> > +#define __LOAD_EXC_32(src, dst, memorder) {              \
> > +       if (memorder == __ATOMIC_RELAXED) {              \
> > +               asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> > +                       : [tmp] "=&r" (dst)              \
> > +                       : [addr] "r"(src)                \
> > +                       : "memory");                     \
> > +       } else {                                         \
> > +               asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> > +                       : [tmp] "=&r" (dst)              \
> > +                       : [addr] "r"(src)                \
> > +                       : "memory");                     \
> > +       } }
> > +
> > +/*
> > + * Atomic exclusive load from addr, it returns the 64-bit content of
> > + * *addr while making it 'monitored', when it is written by someone
> > + * else, the 'monitored' state is cleared and a event is generated
> > + * implicitly to exit WFE.
> > + */
> > +#define __LOAD_EXC_64(src, dst, memorder) {              \
> > +       if (memorder == __ATOMIC_RELAXED) {              \
> > +               asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> > +                       : [tmp] "=&r" (dst)              \
> > +                       : [addr] "r"(src)                \
> > +                       : "memory");                     \
> > +       } else {                                         \
> > +               asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> > +                       : [tmp] "=&r" (dst)              \
> > +                       : [addr] "r"(src)                \
> > +                       : "memory");                     \
> > +       } }
> > +
> > +static __rte_always_inline void
> > +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > +               int memorder)
> > +{
> > +       uint16_t value;
> > +
> > +       assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > + __ATOMIC_RELAXED);
> > +
> >         __LOAD_EXC_16(addr, value, memorder)
> >         if (value != expected) {
> >                 __SEVL()
> > @@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr,
> uint16_t expected,
> >                         __LOAD_EXC_16(addr, value, memorder)
> >                 } while (value != expected);
> >         }
> > -#undef __LOAD_EXC_16
> >  }
> >
> >  static __rte_always_inline void
> > @@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr,
> > uint32_t expected,
> >
> >         assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > __ATOMIC_RELAXED);
> >
> > -       /*
> > -        * Atomic exclusive load from addr, it returns the 32-bit content of
> > -        * *addr while making it 'monitored',when it is written by someone
> > -        * else, the 'monitored' state is cleared and a event is generated
> > -        * implicitly to exit WFE.
> > -        */
> > -#define __LOAD_EXC_32(src, dst, memorder) {              \
> > -       if (memorder == __ATOMIC_RELAXED) {              \
> > -               asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> > -                       : [tmp] "=&r" (dst)              \
> > -                       : [addr] "r"(src)                \
> > -                       : "memory");                     \
> > -       } else {                                         \
> > -               asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> > -                       : [tmp] "=&r" (dst)              \
> > -                       : [addr] "r"(src)                \
> > -                       : "memory");                     \
> > -       } }
> > -
> >         __LOAD_EXC_32(addr, value, memorder)
> >         if (value != expected) {
> >                 __SEVL()
> > @@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr,
> uint32_t expected,
> >                         __LOAD_EXC_32(addr, value, memorder)
> >                 } while (value != expected);
> >         }
> > -#undef __LOAD_EXC_32
> >  }
> >
> >  static __rte_always_inline void
> > @@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> > uint64_t expected,
> >
> >         assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > __ATOMIC_RELAXED);
> >
> > -       /*
> > -        * Atomic exclusive load from addr, it returns the 64-bit content of
> > -        * *addr while making it 'monitored',when it is written by someone
> > -        * else, the 'monitored' state is cleared and a event is generated
> > -        * implicitly to exit WFE.
> > -        */
> > -#define __LOAD_EXC_64(src, dst, memorder) {              \
> > -       if (memorder == __ATOMIC_RELAXED) {              \
> > -               asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> > -                       : [tmp] "=&r" (dst)              \
> > -                       : [addr] "r"(src)                \
> > -                       : "memory");                     \
> > -       } else {                                         \
> > -               asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> > -                       : [tmp] "=&r" (dst)              \
> > -                       : [addr] "r"(src)                \
> > -                       : "memory");                     \
> > -       } }
> > -
> >         __LOAD_EXC_64(addr, value, memorder)
> >         if (value != expected) {
> >                 __SEVL()
> > @@ -143,6 +141,26 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> uint64_t expected,
> >                 } while (value != expected);
> >         }
> >  }
> > +
> > +#define rte_wait_event(addr, mask, expected, cond, memorder, size) \
> 
> I think it is better to swap "cond" and "expected" positions to get better
> readability.
Thanks for the comments; it is more readable than before, and I will update it in the next version.
> 
>  rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK, 0, !=,
> __ATOMIC_RELAXED, 64);
> 
> Vs
> 
>  rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK, !=, 0,
> __ATOMIC_RELAXED, 64);
> 
> > +do {                                                               \
> 
> Any reason not to make this an inline function instead of a macro?
Because inline functions would mean many new APIs for the different cases. We also referred
to the Linux 'wait_event' code as an example. Please see the first version and its discussion:
http://patches.dpdk.org/project/dpdk/cover/20210902053253.3017858-1-feifei.wang2@arm.com/
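
(As background on that reference, here is a minimal sketch of the wait_event-style pattern, with a
hypothetical name rather than the actual Linux implementation: the condition is a caller-supplied
expression that is re-evaluated textually on every iteration, which a plain inline function cannot
take as a parameter.)

	/* Minimal sketch of a wait_event-style macro (hypothetical name). */
	#define wait_event_like(cond_expr)     \
	do {                                   \
		while (!(cond_expr))           \
			rte_pause();           \
	} while (0)

	/* e.g. wait_event_like(flag == 1); */
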
> 
> > +       RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));         \
> 
> Shouldn't we add a __builtin_constant_p(size) check?

Please see the discussion with Konstantin.
'size' will no longer be a parameter, so it is unnecessary to check it with RTE_BUILD_BUG_ON.
> 
> > +       RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&           \
> > +       memorder != __ATOMIC_RELAXED);                             \
> > +       RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);  \
> > +       uint##size_t value;                                        \
> > +       __LOAD_EXC_##size(addr, value, memorder)                   \
> > +       if ((value & mask) cond expected) {                        \
> > +               __SEVL()                                           \
> > +               do {                                               \
> > +                       __WFE()                                    \
> > +                       __LOAD_EXC_##size(addr, value, memorder)   \
> > +               } while ((value & mask) cond expected);            \
> > +       }                                                          \
> > +} while (0)
> > +
> > +#undef __LOAD_EXC_16
> > +#undef __LOAD_EXC_32
> >  #undef __LOAD_EXC_64
> >
> >  #undef __SEVL
> > diff --git a/lib/eal/include/generic/rte_pause.h
> > b/lib/eal/include/generic/rte_pause.h
> > index 668ee4a184..20a5d2a9fd 100644
> > --- a/lib/eal/include/generic/rte_pause.h
> > +++ b/lib/eal/include/generic/rte_pause.h
> > @@ -111,6 +111,38 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> uint64_t expected,
> >         while (__atomic_load_n(addr, memorder) != expected)
> >                 rte_pause();
> >  }
> > +
> > +/*
> > + * Wait until *addr breaks the condition, with a relaxed memory
> > + * ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param mask
> > + *  A mask of value bits in interest.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + * @param cond
> > + *  A symbol representing the condition (==, !=).
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard
> > +or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + * @param size
> > + * The bit size of *addr:
> > + * It is used for arm architecture to choose load instructions,
> > + * and the optional value is 16, 32 and 64.
> > + */
> > +#define rte_wait_event(addr, mask, expected, cond, memorder, size)     \
> > +do {                                                                   \
> > +       RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));             \
> > +       RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&               \
> > +                               memorder != __ATOMIC_RELAXED);         \
> > +       RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);      \
> > +       while ((__atomic_load_n(addr, memorder) & mask) cond expected) \
> > +               rte_pause();                                           \
> > +} while (0)
> >  #endif
> >
> >  #endif /* _RTE_PAUSE_H_ */
> > --
> > 2.25.1
> >

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/5] eal: add new definitions for wait scheme
  2021-10-25  9:30         ` [dpdk-dev] Re: " Feifei Wang
@ 2021-10-25  9:43           ` Jerin Jacob
  0 siblings, 0 replies; 42+ messages in thread
From: Jerin Jacob @ 2021-10-25  9:43 UTC (permalink / raw)
  To: Feifei Wang; +Cc: Ruifeng Wang, Ananyev, Konstantin, dpdk-dev, nd

On Mon, Oct 25, 2021 at 3:01 PM Feifei Wang <Feifei.Wang2@arm.com> wrote:
>
> > -----Original Message-----
> > From: Jerin Jacob <jerinjacobk@gmail.com>
> > Sent: Friday, October 22, 2021 8:10 AM
> > To: Feifei Wang <Feifei.Wang2@arm.com>
> > Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; dpdk-dev <dev@dpdk.org>; nd
> > <nd@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH v4 1/5] eal: add new definitions for wait scheme
> >
> > On Wed, Oct 20, 2021 at 2:16 PM Feifei Wang <feifei.wang2@arm.com>
> > wrote:
> > >
> > > Introduce macros as generic interface for address monitoring.
> > >
> > > Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > ---
> > >  lib/eal/arm/include/rte_pause_64.h  | 126
> > > ++++++++++++++++------------  lib/eal/include/generic/rte_pause.h |
> > > 32 +++++++
> > >  2 files changed, 104 insertions(+), 54 deletions(-)
> > >
> > > diff --git a/lib/eal/arm/include/rte_pause_64.h
> > > b/lib/eal/arm/include/rte_pause_64.h
> > > index e87d10b8cc..23954c2de2 100644
> > > --- a/lib/eal/arm/include/rte_pause_64.h
> > > +++ b/lib/eal/arm/include/rte_pause_64.h
> > > @@ -31,20 +31,12 @@ static inline void rte_pause(void)
> > >  /* Put processor into low power WFE(Wait For Event) state. */
> > > #define __WFE() { asm volatile("wfe" : : : "memory"); }
> > >
> > > -static __rte_always_inline void
> > > -rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > > -               int memorder)
> > > -{
> > > -       uint16_t value;
> > > -
> > > -       assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > __ATOMIC_RELAXED);
> > > -
> > > -       /*
> > > -        * Atomic exclusive load from addr, it returns the 16-bit content of
> > > -        * *addr while making it 'monitored',when it is written by someone
> > > -        * else, the 'monitored' state is cleared and a event is generated
> >
> > a event -> an event, in all occurrences.
> >
> > > -        * implicitly to exit WFE.
> > > -        */
> > > +/*
> > > + * Atomic exclusive load from addr, it returns the 16-bit content of
> > > + * *addr while making it 'monitored', when it is written by someone
> > > + * else, the 'monitored' state is cleared and a event is generated
> > > + * implicitly to exit WFE.
> > > + */
> > >  #define __LOAD_EXC_16(src, dst, memorder) {               \
> > >         if (memorder == __ATOMIC_RELAXED) {               \
> > >                 asm volatile("ldxrh %w[tmp], [%x[addr]]"  \ @@ -58,6
> > > +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t
> > expected,
> > >                         : "memory");                      \
> > >         } }
> > >
> > > +/*
> > > + * Atomic exclusive load from addr, it returns the 32-bit content of
> > > + * *addr while making it 'monitored', when it is written by someone
> > > + * else, the 'monitored' state is cleared and a event is generated
> > > + * implicitly to exit WFE.
> > > + */
> > > +#define __LOAD_EXC_32(src, dst, memorder) {              \
> > > +       if (memorder == __ATOMIC_RELAXED) {              \
> > > +               asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> > > +                       : [tmp] "=&r" (dst)              \
> > > +                       : [addr] "r"(src)                \
> > > +                       : "memory");                     \
> > > +       } else {                                         \
> > > +               asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> > > +                       : [tmp] "=&r" (dst)              \
> > > +                       : [addr] "r"(src)                \
> > > +                       : "memory");                     \
> > > +       } }
> > > +
> > > +/*
> > > + * Atomic exclusive load from addr, it returns the 64-bit content of
> > > + * *addr while making it 'monitored', when it is written by someone
> > > + * else, the 'monitored' state is cleared and a event is generated
> > > + * implicitly to exit WFE.
> > > + */
> > > +#define __LOAD_EXC_64(src, dst, memorder) {              \
> > > +       if (memorder == __ATOMIC_RELAXED) {              \
> > > +               asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> > > +                       : [tmp] "=&r" (dst)              \
> > > +                       : [addr] "r"(src)                \
> > > +                       : "memory");                     \
> > > +       } else {                                         \
> > > +               asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> > > +                       : [tmp] "=&r" (dst)              \
> > > +                       : [addr] "r"(src)                \
> > > +                       : "memory");                     \
> > > +       } }
> > > +
> > > +static __rte_always_inline void
> > > +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > > +               int memorder)
> > > +{
> > > +       uint16_t value;
> > > +
> > > +       assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > > + __ATOMIC_RELAXED);
> > > +
> > >         __LOAD_EXC_16(addr, value, memorder)
> > >         if (value != expected) {
> > >                 __SEVL()
> > > @@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr,
> > uint16_t expected,
> > >                         __LOAD_EXC_16(addr, value, memorder)
> > >                 } while (value != expected);
> > >         }
> > > -#undef __LOAD_EXC_16
> > >  }
> > >
> > >  static __rte_always_inline void
> > > @@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr,
> > > uint32_t expected,
> > >
> > >         assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > > __ATOMIC_RELAXED);
> > >
> > > -       /*
> > > -        * Atomic exclusive load from addr, it returns the 32-bit content of
> > > -        * *addr while making it 'monitored',when it is written by someone
> > > -        * else, the 'monitored' state is cleared and a event is generated
> > > -        * implicitly to exit WFE.
> > > -        */
> > > -#define __LOAD_EXC_32(src, dst, memorder) {              \
> > > -       if (memorder == __ATOMIC_RELAXED) {              \
> > > -               asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> > > -                       : [tmp] "=&r" (dst)              \
> > > -                       : [addr] "r"(src)                \
> > > -                       : "memory");                     \
> > > -       } else {                                         \
> > > -               asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> > > -                       : [tmp] "=&r" (dst)              \
> > > -                       : [addr] "r"(src)                \
> > > -                       : "memory");                     \
> > > -       } }
> > > -
> > >         __LOAD_EXC_32(addr, value, memorder)
> > >         if (value != expected) {
> > >                 __SEVL()
> > > @@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr,
> > uint32_t expected,
> > >                         __LOAD_EXC_32(addr, value, memorder)
> > >                 } while (value != expected);
> > >         }
> > > -#undef __LOAD_EXC_32
> > >  }
> > >
> > >  static __rte_always_inline void
> > > @@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> > > uint64_t expected,
> > >
> > >         assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > > __ATOMIC_RELAXED);
> > >
> > > -       /*
> > > -        * Atomic exclusive load from addr, it returns the 64-bit content of
> > > -        * *addr while making it 'monitored',when it is written by someone
> > > -        * else, the 'monitored' state is cleared and a event is generated
> > > -        * implicitly to exit WFE.
> > > -        */
> > > -#define __LOAD_EXC_64(src, dst, memorder) {              \
> > > -       if (memorder == __ATOMIC_RELAXED) {              \
> > > -               asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> > > -                       : [tmp] "=&r" (dst)              \
> > > -                       : [addr] "r"(src)                \
> > > -                       : "memory");                     \
> > > -       } else {                                         \
> > > -               asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> > > -                       : [tmp] "=&r" (dst)              \
> > > -                       : [addr] "r"(src)                \
> > > -                       : "memory");                     \
> > > -       } }
> > > -
> > >         __LOAD_EXC_64(addr, value, memorder)
> > >         if (value != expected) {
> > >                 __SEVL()
> > > @@ -143,6 +141,26 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> > uint64_t expected,
> > >                 } while (value != expected);
> > >         }
> > >  }
> > > +
> > > +#define rte_wait_event(addr, mask, expected, cond, memorder, size) \
> >
> > I think it is better to swap "cond" and "expected" positions to get better
> > readability.
> Thanks for the comments, it is better than before and I will update in the next version.
> >
> >  rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK, 0, !=,
> > __ATOMIC_RELAXED, 64);
> >
> > Vs
> >
> >  rte_wait_event(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK, !=, 0,
> > __ATOMIC_RELAXED, 64);
> >
> > > +do {                                                               \
> >
> > Any reason not to make this an inline function instead of a macro?
> Because there were many new APIs for different cases. And then we refer to
> Linux 'wait_event' code for an example. Please see the first version and its discussion:
> http://patches.dpdk.org/project/dpdk/cover/20210902053253.3017858-1-feifei.wang2@arm.com/


OK.


> >
> > > +       RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));         \
> >
> > Shouldn't we add a __builtin_constant_p(size) check?
>
> Please see the discussion with Konstantin.
> 'size' will not be as a parameter and then it is unnecessary to check it with build_bug.

Makes sense to remove 'size'. My comment was more in the direction of:
only if 'size' still had to be passed as a parameter.
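
Pulling these review comments together, the generic (rte_pause-based) fallback seems to be heading
toward something like the sketch below, with 'size' derived from the pointer type, the parameters
parenthesized, and 'cond' placed before 'expected'. This only illustrates the direction discussed
here; it is not the merged patch:

	#define rte_wait_event(addr, mask, cond, expected, memorder)            \
	do {                                                                    \
		RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));              \
		RTE_BUILD_BUG_ON((memorder) != __ATOMIC_ACQUIRE &&              \
				 (memorder) != __ATOMIC_RELAXED);               \
		RTE_BUILD_BUG_ON(sizeof(*(addr)) != sizeof(uint16_t) &&         \
				 sizeof(*(addr)) != sizeof(uint32_t) &&         \
				 sizeof(*(addr)) != sizeof(uint64_t));          \
		while ((__atomic_load_n((addr), (memorder)) & (mask))           \
				cond (expected))                                 \
			rte_pause();                                            \
	} while (0)
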

> >
> > > +       RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&           \
> > > +       memorder != __ATOMIC_RELAXED);                             \
> > > +       RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);  \
> > > +       uint##size_t value;                                        \
> > > +       __LOAD_EXC_##size(addr, value, memorder)                   \
> > > +       if ((value & mask) cond expected) {                        \
> > > +               __SEVL()                                           \
> > > +               do {                                               \
> > > +                       __WFE()                                    \
> > > +                       __LOAD_EXC_##size(addr, value, memorder)   \
> > > +               } while ((value & mask) cond expected);            \
> > > +       }                                                          \
> > > +} while (0)
> > > +
> > > +#undef __LOAD_EXC_16
> > > +#undef __LOAD_EXC_32
> > >  #undef __LOAD_EXC_64
> > >
> > >  #undef __SEVL
> > > diff --git a/lib/eal/include/generic/rte_pause.h
> > > b/lib/eal/include/generic/rte_pause.h
> > > index 668ee4a184..20a5d2a9fd 100644
> > > --- a/lib/eal/include/generic/rte_pause.h
> > > +++ b/lib/eal/include/generic/rte_pause.h
> > > @@ -111,6 +111,38 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> > uint64_t expected,
> > >         while (__atomic_load_n(addr, memorder) != expected)
> > >                 rte_pause();
> > >  }
> > > +
> > > +/*
> > > + * Wait until *addr breaks the condition, with a relaxed memory
> > > + * ordering model meaning the loads around this API can be reordered.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param mask
> > > + *  A mask of value bits in interest.
> > > + * @param expected
> > > + *  A 16-bit expected value to be in the memory location.
> > > + * @param cond
> > > + *  A symbol representing the condition (==, !=).
> > > + * @param memorder
> > > + *  Two different memory orders that can be specified:
> > > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > > + *  C++11 memory orders with the same names, see the C++11 standard
> > > +or
> > > + *  the GCC wiki on atomic synchronization for detailed definition.
> > > + * @param size
> > > + * The bit size of *addr:
> > > + * It is used for arm architecture to choose load instructions,
> > > + * and the optional value is 16, 32 and 64.
> > > + */
> > > +#define rte_wait_event(addr, mask, expected, cond, memorder, size)     \
> > > +do {                                                                   \
> > > +       RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));             \
> > > +       RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&               \
> > > +                               memorder != __ATOMIC_RELAXED);         \
> > > +       RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64);      \
> > > +       while ((__atomic_load_n(addr, memorder) & mask) cond expected) \
> > > +               rte_pause();                                           \
> > > +} while (0)
> > >  #endif
> > >
> > >  #endif /* _RTE_PAUSE_H_ */
> > > --
> > > 2.25.1
> > >

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2021-10-25  9:44 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
2021-09-02  5:32 [dpdk-dev] [RFC PATCH v1 0/5] add new API for wait until scheme Feifei Wang
2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 1/5] eal: " Feifei Wang
2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 2/5] eal: use wait until scheme for read pflock Feifei Wang
2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 3/5] eal: use wait until scheme for mcslock Feifei Wang
2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 4/5] lib/bpf: use wait until scheme for Rx/Tx iteration Feifei Wang
2021-09-02  5:32 ` [dpdk-dev] [RFC PATCH v1 5/5] lib/distributor: use wait until scheme Feifei Wang
2021-09-02 15:22 ` [dpdk-dev] [RFC PATCH v1 0/5] add new API for " Stephen Hemminger
2021-09-03  7:02   ` [dpdk-dev] Re: " Feifei Wang
2021-09-23  9:58 ` [dpdk-dev] [RFC PATCH v2 0/5] add new definitions for wait scheme Feifei Wang
2021-09-23  9:58   ` [dpdk-dev] [RFC PATCH v2 1/5] eal: " Feifei Wang
2021-09-23  9:58   ` [dpdk-dev] [RFC PATCH v2 2/5] eal: use wait event for read pflock Feifei Wang
2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 3/5] eal: use wait event scheme for mcslock Feifei Wang
2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
2021-09-24 18:07     ` Ananyev, Konstantin
2021-09-26  2:19       ` [dpdk-dev] Re: " Feifei Wang
2021-09-23  9:59   ` [dpdk-dev] [RFC PATCH v2 5/5] lib/distributor: use wait event scheme Feifei Wang
2021-09-26  6:32 ` [dpdk-dev] [RFC PATCH v3 0/5] add new definitions for wait scheme Feifei Wang
2021-09-26  6:32   ` [dpdk-dev] [RFC PATCH v3 1/5] eal: " Feifei Wang
2021-10-07 16:18     ` Ananyev, Konstantin
2021-10-12  8:09       ` [dpdk-dev] Re: " Feifei Wang
2021-10-13 15:03         ` [dpdk-dev] " Ananyev, Konstantin
2021-10-13 17:00           ` Stephen Hemminger
2021-10-14  3:14             ` [dpdk-dev] Re: " Feifei Wang
2021-10-14  3:08           ` Feifei Wang
2021-09-26  6:32   ` [dpdk-dev] [RFC PATCH v3 2/5] eal: use wait event for read pflock Feifei Wang
2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 3/5] eal: use wait event scheme for mcslock Feifei Wang
2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
2021-10-07 15:50     ` Ananyev, Konstantin
2021-10-07 17:40       ` Ananyev, Konstantin
2021-10-20  6:20         ` [dpdk-dev] Re: " Feifei Wang
2021-09-26  6:33   ` [dpdk-dev] [RFC PATCH v3 5/5] lib/distributor: use wait event scheme Feifei Wang
2021-10-20  8:45   ` [dpdk-dev] [PATCH v4 0/5] add new definitions for wait scheme Feifei Wang
2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 1/5] eal: " Feifei Wang
2021-10-21 16:24       ` Ananyev, Konstantin
2021-10-25  9:20         ` [dpdk-dev] Re: " Feifei Wang
2021-10-22  0:10       ` [dpdk-dev] " Jerin Jacob
2021-10-25  9:30         ` [dpdk-dev] Re: " Feifei Wang
2021-10-25  9:43           ` [dpdk-dev] " Jerin Jacob
2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 2/5] eal: use wait event for read pflock Feifei Wang
2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 3/5] eal: use wait event scheme for mcslock Feifei Wang
2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 4/5] lib/bpf: use wait event scheme for Rx/Tx iteration Feifei Wang
2021-10-20  8:45     ` [dpdk-dev] [PATCH v4 5/5] lib/distributor: use wait event scheme Feifei Wang
