DPDK patches and discussions
* [dpdk-dev] [RFC] eal: add fair reader writer lock
@ 2021-01-12  6:05 Stephen Hemminger
  2021-01-14 17:34 ` [dpdk-dev] [PATCH v1] eal: add ticket based " Stephen Hemminger
  0 siblings, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2021-01-12  6:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

Implement a fair reader/writer lock based on the current DPDK ticket lock.
This lock type acts like rte_rwlock() but, like the ticket lock, it is
fair to multiple writers.  Writers have full priority over readers, so a
stream of readers cannot starve incoming writers, which is a serious bug
in the existing rte_rwlock.

The tests are just a clone of the existing rte_rwlock tests with test
and function names changed, so the new fair locks should be a drop-in
replacement for most users.
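
A minimal usage sketch, using only the functions added below (a reader
section and a writer section protecting one shared counter):

    static rte_fair_rwlock_t lock = RTE_FAIR_RWLOCK_INITIALIZER;
    static uint64_t counter;

    /* reader side: may run concurrently with other readers */
    rte_fair_rwlock_read_lock(&lock);
    uint64_t snapshot = counter;
    (void)snapshot;
    rte_fair_rwlock_read_unlock(&lock);

    /* writer side: exclusive; writers queue FIFO on the ticket lock */
    rte_fair_rwlock_write_lock(&lock);
    counter++;
    rte_fair_rwlock_write_unlock(&lock);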


Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/test/autotest_data.py                     |   6 +
 app/test/meson.build                          |   5 +
 app/test/test_fair_rwlock.c                   | 555 ++++++++++++++++++
 doc/api/doxy-api-index.md                     |   1 +
 lib/librte_eal/arm/include/meson.build        |   1 +
 lib/librte_eal/arm/include/rte_fair_rwlock.h  |  22 +
 .../include/generic/rte_fair_rwlock.h         | 203 +++++++
 lib/librte_eal/include/meson.build            |   1 +
 lib/librte_eal/ppc/include/meson.build        |   1 +
 lib/librte_eal/ppc/include/rte_fair_rwlock.h  |  18 +
 lib/librte_eal/x86/include/meson.build        |   1 +
 lib/librte_eal/x86/include/rte_fair_rwlock.h  |  18 +
 12 files changed, 832 insertions(+)
 create mode 100644 app/test/test_fair_rwlock.c
 create mode 100644 lib/librte_eal/arm/include/rte_fair_rwlock.h
 create mode 100644 lib/librte_eal/include/generic/rte_fair_rwlock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_fair_rwlock.h
 create mode 100644 lib/librte_eal/x86/include/rte_fair_rwlock.h

diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 097638941f19..d7e970d1cde0 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -63,6 +63,12 @@
         "Func":    rwlock_autotest,
         "Report":  None,
     },
+    {
+        "Name":    "Fair read/write lock autotest",
+        "Command": "fair_rwlock_autotest",
+        "Func":    rwlock_autotest,
+        "Report":  None,
+    },
     {
         "Name":    "Lcores autotest",
         "Command": "lcores_autotest",
diff --git a/app/test/meson.build b/app/test/meson.build
index 94fd39fecb82..388547ade3ff 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -46,6 +46,7 @@ test_sources = files('commands.c',
 	'test_event_timer_adapter.c',
 	'test_eventdev.c',
 	'test_external_mem.c',
+	'test_fair_rwlock.c',
 	'test_fbarray.c',
 	'test_fib.c',
 	'test_fib_perf.c',
@@ -206,6 +207,10 @@ fast_tests = [
         ['errno_autotest', true],
         ['ethdev_link_status', true],
         ['event_ring_autotest', true],
+        ['fair_rwlock_test1_autotest', true],
+        ['fair_rwlock_rda_autotest', true],
+        ['fair_rwlock_rds_wrm_autotest', true],
+        ['fair_rwlock_rde_wro_autotest', true],
         ['fib_autotest', true],
         ['fib6_autotest', true],
         ['func_reentrancy_autotest', false],
diff --git a/app/test/test_fair_rwlock.c b/app/test/test_fair_rwlock.c
new file mode 100644
index 000000000000..b3da49e2ad3b
--- /dev/null
+++ b/app/test/test_fair_rwlock.c
@@ -0,0 +1,555 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_fair_rwlock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * fair rwlock test
+ * ===========
+ * Provides UT for rte_fair_rwlock API.
+ * Main concern is on functional testing, but also provides some
+ * performance measurements.
+ * Obviously, proper testing requires execution with more than one lcore.
+ */
+
+#define ITER_NUM	0x80
+
+#define TEST_SEC	5
+
+static rte_fair_rwlock_t sl;
+static rte_fair_rwlock_t sl_tab[RTE_MAX_LCORE];
+static rte_atomic32_t synchro;
+
+enum {
+	LC_TYPE_RDLOCK,
+	LC_TYPE_WRLOCK,
+};
+
+static struct {
+	rte_fair_rwlock_t lock;
+	uint64_t tick;
+	volatile union {
+		uint8_t u8[RTE_CACHE_LINE_SIZE];
+		uint64_t u64[RTE_CACHE_LINE_SIZE / sizeof(uint64_t)];
+	} data;
+} __rte_cache_aligned try_rwlock_data;
+
+struct try_rwlock_lcore {
+	int32_t rc;
+	int32_t type;
+	struct {
+		uint64_t tick;
+		uint64_t fail;
+		uint64_t success;
+	} stat;
+} __rte_cache_aligned;
+
+static struct try_rwlock_lcore try_lcore_data[RTE_MAX_LCORE];
+
+static int
+test_rwlock_per_core(__rte_unused void *arg)
+{
+	rte_fair_rwlock_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_fair_rwlock_write_unlock(&sl);
+
+	rte_fair_rwlock_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_fair_rwlock_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_fair_rwlock_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_fair_rwlock_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_fair_rwlock_t lk = RTE_FAIR_RWLOCK_INITIALIZER;
+static volatile uint64_t rwlock_data;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+#define TEST_RWLOCK_DEBUG 0
+
+static int
+load_loop_fn(__rte_unused void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		while (rte_atomic32_read(&synchro) == 0)
+			;
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		rte_fair_rwlock_write_lock(&lk);
+		++rwlock_data;
+		rte_fair_rwlock_write_unlock(&lk);
+
+		rte_fair_rwlock_read_lock(&lk);
+		if (TEST_RWLOCK_DEBUG && !(lcount % 100))
+			printf("Core [%u] rwlock_data = %"PRIu64"\n",
+				lcore, rwlock_data);
+		rte_fair_rwlock_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_rwlock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nRwlock Perf Test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	rte_atomic32_set(&synchro, 0);
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	rte_atomic32_set(&synchro, 1);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global rwlock and a table of rwlocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_rwlock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, display something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, display something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken during 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+rwlock_test1(void)
+{
+	int i;
+
+	rte_fair_rwlock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_fair_rwlock_init(&sl_tab[i]);
+
+	rte_fair_rwlock_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_fair_rwlock_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_rwlock_per_core, NULL, i);
+	}
+
+	rte_fair_rwlock_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_fair_rwlock_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_fair_rwlock_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_fair_rwlock_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_rwlock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+static int
+try_read(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i;
+
+	rc = rte_fair_rwlock_read_trylock(&try_rwlock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	for (i = 0; i != RTE_DIM(try_rwlock_data.data.u64); i++) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_rwlock_data.data.u64[i] != 0) {
+			printf("%s(%u) error: unexpected data pattern\n",
+				__func__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_rwlock_data.data,
+				sizeof(try_rwlock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+	}
+
+	rte_fair_rwlock_read_unlock(&try_rwlock_data.lock);
+	return rc;
+}
+
+static int
+try_write(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i, v;
+
+	v = RTE_MAX(lc % UINT8_MAX, 1U);
+
+	rc = rte_fair_rwlock_write_trylock(&try_rwlock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	/* update by bytes in reverse order */
+	for (i = RTE_DIM(try_rwlock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_rwlock_data.data.u8[i] != 0) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_rwlock_data.data,
+				sizeof(try_rwlock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_rwlock_data.data.u8[i] = v;
+	}
+
+	/* restore by bytes in reverse order */
+	for (i = RTE_DIM(try_rwlock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_rwlock_data.data.u8[i] != v) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_rwlock_data.data,
+				sizeof(try_rwlock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_rwlock_data.data.u8[i] = 0;
+	}
+
+	rte_fair_rwlock_write_unlock(&try_rwlock_data.lock);
+	return rc;
+}
+
+static int
+try_read_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_rwlock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_RDLOCK;
+
+	ftm = try_rwlock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_read(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static int
+try_write_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_rwlock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_WRLOCK;
+
+	ftm = try_rwlock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_write(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static void
+print_try_lcore_stats(const struct try_rwlock_lcore *tlc, uint32_t lc)
+{
+	uint64_t f, s;
+
+	f = RTE_MAX(tlc->stat.fail, 1ULL);
+	s = RTE_MAX(tlc->stat.success, 1ULL);
+
+	printf("try_lcore_data[%u]={\n"
+		"\trc=%d,\n"
+		"\ttype=%s,\n"
+		"\tfail=%" PRIu64 ",\n"
+		"\tsuccess=%" PRIu64 ",\n"
+		"\tcycles=%" PRIu64 ",\n"
+		"\tcycles/op=%#Lf,\n"
+		"\tcycles/success=%#Lf,\n"
+		"\tsuccess/fail=%#Lf,\n"
+		"};\n",
+		lc,
+		tlc->rc,
+		tlc->type == LC_TYPE_RDLOCK ? "RDLOCK" : "WRLOCK",
+		tlc->stat.fail,
+		tlc->stat.success,
+		tlc->stat.tick,
+		(long double)tlc->stat.tick /
+		(tlc->stat.fail + tlc->stat.success),
+		(long double)tlc->stat.tick / s,
+		(long double)tlc->stat.success / f);
+}
+
+static void
+collect_try_lcore_stats(struct try_rwlock_lcore *tlc,
+	const struct try_rwlock_lcore *lc)
+{
+	tlc->stat.tick += lc->stat.tick;
+	tlc->stat.fail += lc->stat.fail;
+	tlc->stat.success += lc->stat.success;
+}
+
+/*
+ * Process collected results:
+ *  - check status
+ *  - collect and print statistics
+ */
+static int
+process_try_lcore_stats(void)
+{
+	int32_t rc;
+	uint32_t lc, rd, wr;
+	struct try_rwlock_lcore rlc, wlc;
+
+	memset(&rlc, 0, sizeof(rlc));
+	memset(&wlc, 0, sizeof(wlc));
+
+	rlc.type = LC_TYPE_RDLOCK;
+	wlc.type = LC_TYPE_WRLOCK;
+	rd = 0;
+	wr = 0;
+
+	rc = 0;
+	RTE_LCORE_FOREACH(lc) {
+		rc |= try_lcore_data[lc].rc;
+		if (try_lcore_data[lc].type == LC_TYPE_RDLOCK) {
+			collect_try_lcore_stats(&rlc, try_lcore_data + lc);
+			rd++;
+		} else {
+			collect_try_lcore_stats(&wlc, try_lcore_data + lc);
+			wr++;
+		}
+	}
+
+	if (rc == 0) {
+		RTE_LCORE_FOREACH(lc)
+			print_try_lcore_stats(try_lcore_data + lc, lc);
+
+		if (rd != 0) {
+			printf("aggregated stats for %u RDLOCK cores:\n", rd);
+			print_try_lcore_stats(&rlc, rd);
+		}
+
+		if (wr != 0) {
+			printf("aggregated stats for %u WRLOCK cores:\n", wr);
+			print_try_lcore_stats(&wlc, wr);
+		}
+	}
+
+	return rc;
+}
+
+static void
+try_test_reset(void)
+{
+	memset(&try_lcore_data, 0, sizeof(try_lcore_data));
+	memset(&try_rwlock_data, 0, sizeof(try_rwlock_data));
+	try_rwlock_data.tick = TEST_SEC * rte_get_tsc_hz();
+}
+
+/* all lcores grab RDLOCK */
+static int
+try_rwlock_test_rda(void)
+{
+	try_test_reset();
+
+	/* start read test on all available lcores */
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, CALL_MAIN);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* all worker lcores grab RDLOCK, main one grabs WRLOCK */
+static int
+try_rwlock_test_rds_wrm(void)
+{
+	try_test_reset();
+
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, SKIP_MAIN);
+	try_write_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* main and even worker lcores grab RDLOCK, odd lcores grab WRLOCK */
+static int
+try_rwlock_test_rde_wro(void)
+{
+	uint32_t lc, mlc;
+
+	try_test_reset();
+
+	mlc = rte_get_main_lcore();
+
+	RTE_LCORE_FOREACH(lc) {
+		if (lc != mlc) {
+			if ((lc & 1) == 0)
+				rte_eal_remote_launch(try_read_lcore,
+						NULL, lc);
+			else
+				rte_eal_remote_launch(try_write_lcore,
+						NULL, lc);
+		}
+	}
+	try_read_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+static int
+test_rwlock(void)
+{
+	uint32_t i;
+	int32_t rc, ret;
+
+	static const struct {
+		const char *name;
+		int (*ftst)(void);
+	} test[] = {
+		{
+			.name = "rwlock_test1",
+			.ftst = rwlock_test1,
+		},
+		{
+			.name = "try_rwlock_test_rda",
+			.ftst = try_rwlock_test_rda,
+		},
+		{
+			.name = "try_rwlock_test_rds_wrm",
+			.ftst = try_rwlock_test_rds_wrm,
+		},
+		{
+			.name = "try_rwlock_test_rde_wro",
+			.ftst = try_rwlock_test_rde_wro,
+		},
+	};
+
+	ret = 0;
+	for (i = 0; i != RTE_DIM(test); i++) {
+		printf("starting test %s;\n", test[i].name);
+		rc = test[i].ftst();
+		printf("test %s completed with status %d\n", test[i].name, rc);
+		ret |= rc;
+	}
+
+	return ret;
+}
+
+REGISTER_TEST_COMMAND(fair_rwlock_autotest, test_rwlock);
+
+/* subtests used in meson for CI */
+REGISTER_TEST_COMMAND(fair_rwlock_test1_autotest, rwlock_test1);
+REGISTER_TEST_COMMAND(fair_rwlock_rda_autotest, try_rwlock_test_rda);
+REGISTER_TEST_COMMAND(fair_rwlock_rds_wrm_autotest, try_rwlock_test_rds_wrm);
+REGISTER_TEST_COMMAND(fair_rwlock_rde_wro_autotest, try_rwlock_test_rde_wro);
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 748514e24316..988a00c8532d 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -76,6 +76,7 @@ The public API headers are grouped by topics:
   [rwlock]             (@ref rte_rwlock.h),
   [spinlock]           (@ref rte_spinlock.h),
   [ticketlock]         (@ref rte_ticketlock.h),
+  [fair rwlock]        (@ref rte_fair_rwlock.h),
   [RCU]                (@ref rte_rcu_qsbr.h)
 
 - **CPU arch**:
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..e88aaa1dcd40 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -12,6 +12,7 @@ arch_headers = files(
 	'rte_cycles_32.h',
 	'rte_cycles_64.h',
 	'rte_cycles.h',
+	'rte_fair_rwlock.h',
 	'rte_io_64.h',
 	'rte_io.h',
 	'rte_mcslock.h',
diff --git a/lib/librte_eal/arm/include/rte_fair_rwlock.h b/lib/librte_eal/arm/include/rte_fair_rwlock.h
new file mode 100644
index 000000000000..73d8a1c17583
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_fair_rwlock.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_FAIR_RWLOCK_ARM_H_
+#define _RTE_FAIR_RWLOCK_ARM_H_
+
+#ifndef RTE_FORCE_INTRINSICS
+#  error Platform must be built with RTE_FORCE_INTRINSICS
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_fair_rwlock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_FAIR_RWLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_fair_rwlock.h b/lib/librte_eal/include/generic/rte_fair_rwlock.h
new file mode 100644
index 000000000000..b2e9c4d92afc
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_fair_rwlock.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_FAIR_RWLOCK_H_
+#define _RTE_FAIR_RWLOCK_H_
+
+/**
+ * @file
+ *
+ * RTE Fair Read-Write Locks
+ *
+ * This file defines an API for fair read-write locks.
+ * This lock type acts like rte_rwlock but provides fairness:
+ * requests are handled first come, first served.
+ *
+ * All locks must be initialised before use, and only initialised once.
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_ticketlock.h>
+
+/**
+ * The rte_fair_rwlock_t type.
+ */
+typedef struct {
+	rte_ticketlock_t writer;
+	uint16_t readers;
+} rte_fair_rwlock_t;
+
+/**
+ * A static fair_rwlock initializer.
+ */
+#define RTE_FAIR_RWLOCK_INITIALIZER { RTE_TICKETLOCK_INITIALIZER, 0 }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the fair_rwlock to an unlocked state.
+ *
+ * @param rwl
+ *   A pointer to the fair_rwlock structure.
+ */
+__rte_experimental
+static inline void
+rte_fair_rwlock_init(rte_fair_rwlock_t *rwl)
+{
+	rte_ticketlock_init(&rwl->writer);
+	rwl->readers = 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a read lock. Loop until the lock is held.
+ *
+ * @param rwl
+ *   A pointer to a fair_rwlock structure.
+ */
+__rte_experimental
+static inline void
+rte_fair_rwlock_read_lock(rte_fair_rwlock_t *rwl)
+{
+	/* Optimistically, grab a reference */
+	__atomic_fetch_add(&rwl->readers, 1, __ATOMIC_ACQUIRE);
+
+	/* If a writer raced in and got the lock, we need to back out. */
+	if (rte_ticketlock_is_locked(&rwl->writer)) {
+		/* Drop our reference so the Writer can continue */
+		__atomic_sub_fetch(&rwl->readers, 1, __ATOMIC_RELEASE);
+
+		/* Wait for the Writer to finish then get our new reference */
+		rte_ticketlock_lock(&rwl->writer);
+		__atomic_add_fetch(&rwl->readers, 1, __ATOMIC_ACQUIRE);
+		rte_ticketlock_unlock(&rwl->writer);
+	}
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a read lock.
+ *
+ * @param rwl
+ *   A pointer to a fair_rwlock structure.
+ *
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for reading because a
+ *     writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_fair_rwlock_read_trylock(rte_fair_rwlock_t *rwl)
+{
+	/* Get our reference count */
+	__atomic_fetch_add(&rwl->readers, 1, __ATOMIC_ACQUIRE);
+
+	/* If a writer has the lock, we need to back out. */
+	if (rte_ticketlock_is_locked(&rwl->writer)) {
+		__atomic_sub_fetch(&rwl->readers, 1, __ATOMIC_RELEASE);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a read lock.
+ *
+ * @param rwl
+ *   A pointer to the fair_rwlock structure.
+ */
+__rte_experimental
+static inline void
+rte_fair_rwlock_read_unlock(rte_fair_rwlock_t *rwl)
+{
+	__atomic_fetch_sub(&rwl->readers, 1, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a write lock. Loop until the lock is held.
+ *
+ * @param rwl
+ *   A pointer to a fair_rwlock structure.
+ */
+__rte_experimental
+static inline void
+rte_fair_rwlock_write_lock(rte_fair_rwlock_t *rwl)
+{
+	/* Wait for our turn to be writer */
+	rte_ticketlock_lock(&rwl->writer);
+
+	/* Wait for all outstanding readers */
+	rte_wait_until_equal_16(&rwl->readers, 0, __ATOMIC_RELAXED);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a write lock.
+ *
+ * @param rwl
+ *   A pointer to a fair_rwlock structure.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for writing because
+ *     it was already locked for reading or writing
+ */
+__rte_experimental
+static inline int
+rte_fair_rwlock_write_trylock(rte_fair_rwlock_t *rwl)
+{
+	uint16_t x;
+
+	if (!rte_ticketlock_trylock(&rwl->writer))
+		return -EBUSY;
+
+	x = __atomic_load_n(&rwl->readers, __ATOMIC_RELAXED);
+	if (x != 0) {
+		rte_ticketlock_unlock(&rwl->writer);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a write lock.
+ *
+ * @param rwl
+ *   A pointer to a fair_rwlock structure.
+ */
+__rte_experimental
+static inline void
+rte_fair_rwlock_write_unlock(rte_fair_rwlock_t *rwl)
+{
+	/* Unblock readers */
+	rte_ticketlock_unlock(&rwl->writer);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_FAIR_RWLOCK_H_ */
diff --git a/lib/librte_eal/include/meson.build b/lib/librte_eal/include/meson.build
index dc007084ff22..2ba6f3be9e6b 100644
--- a/lib/librte_eal/include/meson.build
+++ b/lib/librte_eal/include/meson.build
@@ -56,6 +56,7 @@ generic_headers = files(
 	'generic/rte_cpuflags.h',
 	'generic/rte_cycles.h',
 	'generic/rte_io.h',
+	'generic/rte_fair_rwlock.h',
 	'generic/rte_mcslock.h',
 	'generic/rte_memcpy.h',
 	'generic/rte_pause.h',
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..47c7dfaa4efe 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -7,6 +7,7 @@ arch_headers = files(
 	'rte_byteorder.h',
 	'rte_cpuflags.h',
 	'rte_cycles.h',
+	'rte_fair_rwlock.h',
 	'rte_io.h',
 	'rte_mcslock.h',
 	'rte_memcpy.h',
diff --git a/lib/librte_eal/ppc/include/rte_fair_rwlock.h b/lib/librte_eal/ppc/include/rte_fair_rwlock.h
new file mode 100644
index 000000000000..bf22251a28c5
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_fair_rwlock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_FAIR_RWLOCK_PPC_64_H_
+#define _RTE_FAIR_RWLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_fair_rwlock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_FAIR_RWLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 549cc21a42ed..a348d51c97a8 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -10,6 +10,7 @@ arch_headers = files(
 	'rte_byteorder.h',
 	'rte_cpuflags.h',
 	'rte_cycles.h',
+	'rte_fair_rwlock.h',
 	'rte_io.h',
 	'rte_mcslock.h',
 	'rte_memcpy.h',
diff --git a/lib/librte_eal/x86/include/rte_fair_rwlock.h b/lib/librte_eal/x86/include/rte_fair_rwlock.h
new file mode 100644
index 000000000000..cde08885062e
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_fair_rwlock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_FAIR_RWLOCK_X86_64_H_
+#define _RTE_FAIR_RWLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_fair_rwlock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_FAIR_RWLOCK_X86_64_H_ */
-- 
2.29.2



* [dpdk-dev] [PATCH v1] eal: add ticket based reader writer lock
  2021-01-12  6:05 [dpdk-dev] [RFC] eal: add fair reader writer lock Stephen Hemminger
@ 2021-01-14 17:34 ` Stephen Hemminger
  2021-01-27 10:25   ` Ruifeng Wang
                     ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-01-14 17:34 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

This patch implements a reader/writer ticket lock.
This lock type acts like rte_rwlock() but uses a ticket algorithm,
so it is fair for multiple writers and readers.
Writers have priority over readers.

The tests are just a clone of the existing rte_rwlock tests with test
and function names changed, so the new ticket rwlocks should be a
drop-in replacement for most users.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
Ps: I have additional tests for rwlock that test for fairness.
Would these be valuable?
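
For reference, a rough sketch of the kind of check meant here (hypothetical
helper, not part of this patch): each worker hammers the write lock for a
fixed time and counts its acquisitions, and the test fails if the per-lcore
counts diverge too much.

	static uint64_t nwrites[RTE_MAX_LCORE];

	static int
	fairness_writer(__rte_unused void *arg)
	{
		uint64_t stop = rte_get_timer_cycles() +
				TEST_SEC * rte_get_timer_hz();

		while (rte_get_timer_cycles() < stop) {
			rte_rwticket_write_lock(&lk);
			rte_pause();	/* hold the lock briefly */
			rte_rwticket_write_unlock(&lk);
			nwrites[rte_lcore_id()]++;
		}
		return 0;
	}

Launched on all lcores with rte_eal_mp_remote_launch(); after
rte_eal_mp_wait_lcore() the test would compare the per-lcore counts and
flag a large spread as unfair.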

 app/test/autotest_data.py                     |   6 +
 app/test/meson.build                          |   5 +
 app/test/test_ticket_rwlock.c                 | 554 ++++++++++++++++++
 doc/api/doxy-api-index.md                     |   1 +
 lib/librte_eal/arm/include/meson.build        |   1 +
 .../arm/include/rte_ticket_rwlock.h           |  22 +
 .../include/generic/rte_ticket_rwlock.h       | 218 +++++++
 lib/librte_eal/include/meson.build            |   1 +
 lib/librte_eal/ppc/include/meson.build        |   1 +
 .../ppc/include/rte_ticket_rwlock.h           |  18 +
 lib/librte_eal/x86/include/meson.build        |   1 +
 .../x86/include/rte_ticket_rwlock.h           |  18 +
 12 files changed, 846 insertions(+)
 create mode 100644 app/test/test_ticket_rwlock.c
 create mode 100644 lib/librte_eal/arm/include/rte_ticket_rwlock.h
 create mode 100644 lib/librte_eal/include/generic/rte_ticket_rwlock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_ticket_rwlock.h
 create mode 100644 lib/librte_eal/x86/include/rte_ticket_rwlock.h

diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 097638941f19..62816c36d873 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -231,6 +231,12 @@
         "Func":    ticketlock_autotest,
         "Report":  None,
     },
+    {
+        "Name":    "Ticket rwlock autotest",
+        "Command": "ticket_rwlock_autotest",
+        "Func":    ticketrwlock_autotest,
+        "Report":  None,
+    },
     {
         "Name":    "MCSlock autotest",
         "Command": "mcslock_autotest",
diff --git a/app/test/meson.build b/app/test/meson.build
index 94fd39fecb82..26bf0c15097d 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -135,6 +135,7 @@ test_sources = files('commands.c',
 	'test_timer_racecond.c',
 	'test_timer_secondary.c',
 	'test_ticketlock.c',
+	'test_ticket_rwlock.c',
 	'test_trace.c',
 	'test_trace_register.c',
 	'test_trace_perf.c',
@@ -245,6 +246,10 @@ fast_tests = [
         ['string_autotest', true],
         ['table_autotest', true],
         ['tailq_autotest', true],
+        ['ticketrwlock_test1_autotest', true],
+        ['ticketrwlock_rda_autotest', true],
+        ['ticketrwlock_rds_wrm_autotest', true],
+        ['ticketrwlock_rde_wro_autotest', true],
         ['timer_autotest', false],
         ['user_delay_us', true],
         ['version_autotest', true],
diff --git a/app/test/test_ticket_rwlock.c b/app/test/test_ticket_rwlock.c
new file mode 100644
index 000000000000..cffc9bf23ef6
--- /dev/null
+++ b/app/test/test_ticket_rwlock.c
@@ -0,0 +1,554 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_ticket_rwlock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * ticket rwlock test
+ * ===========
+ * Provides UT for rte_ticket_rwlock API.
+ * Main concern is on functional testing, but also provides some
+ * performance measurements.
+ * Obviously, proper testing requires execution with more than one lcore.
+ */
+
+#define ITER_NUM	0x80
+
+#define TEST_SEC	5
+
+static rte_rwticketlock_t sl;
+static rte_rwticketlock_t sl_tab[RTE_MAX_LCORE];
+static uint32_t synchro;
+
+enum {
+	LC_TYPE_RDLOCK,
+	LC_TYPE_WRLOCK,
+};
+
+static struct {
+	rte_rwticketlock_t lock;
+	uint64_t tick;
+	volatile union {
+		uint8_t u8[RTE_CACHE_LINE_SIZE];
+		uint64_t u64[RTE_CACHE_LINE_SIZE / sizeof(uint64_t)];
+	} data;
+} __rte_cache_aligned try_rwlock_data;
+
+struct try_rwlock_lcore {
+	int32_t rc;
+	int32_t type;
+	struct {
+		uint64_t tick;
+		uint64_t fail;
+		uint64_t success;
+	} stat;
+} __rte_cache_aligned;
+
+static struct try_rwlock_lcore try_lcore_data[RTE_MAX_LCORE];
+
+static int
+test_rwlock_per_core(__rte_unused void *arg)
+{
+	rte_rwticket_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_rwticket_write_unlock(&sl);
+
+	rte_rwticket_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_rwticket_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_rwticket_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_rwticket_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_rwticketlock_t lk = RTE_RWTICKETLOCK_INITIALIZER;
+static volatile uint64_t rwlock_data;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+#define TEST_RWLOCK_DEBUG 0
+
+static int
+load_loop_fn(__rte_unused void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		rte_rwticket_write_lock(&lk);
+		++rwlock_data;
+		rte_rwticket_write_unlock(&lk);
+
+		rte_rwticket_read_lock(&lk);
+		if (TEST_RWLOCK_DEBUG && !(lcount % 100))
+			printf("Core [%u] rwlock_data = %"PRIu64"\n",
+				lcore, rwlock_data);
+		rte_rwticket_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_rwlock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nTicket rwlock Perf Test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	synchro = 0;
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global rwlock and a table of rwlocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_rwlock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, display something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, display something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken during 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+rwlock_test1(void)
+{
+	int i;
+
+	rte_rwticketlock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_rwticketlock_init(&sl_tab[i]);
+
+	rte_rwticket_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_rwticket_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_rwlock_per_core, NULL, i);
+	}
+
+	rte_rwticket_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_rwticket_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_rwticket_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_rwticket_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_rwlock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+static int
+try_read(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i;
+
+	rc = rte_rwticket_read_trylock(&try_rwlock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	for (i = 0; i != RTE_DIM(try_rwlock_data.data.u64); i++) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_rwlock_data.data.u64[i] != 0) {
+			printf("%s(%u) error: unexpected data pattern\n",
+				__func__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_rwlock_data.data,
+				sizeof(try_rwlock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+	}
+
+	rte_rwticket_read_unlock(&try_rwlock_data.lock);
+	return rc;
+}
+
+static int
+try_write(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i, v;
+
+	v = RTE_MAX(lc % UINT8_MAX, 1U);
+
+	rc = rte_rwticket_write_trylock(&try_rwlock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	/* update by bytes in reverse order */
+	for (i = RTE_DIM(try_rwlock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_rwlock_data.data.u8[i] != 0) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_rwlock_data.data,
+				sizeof(try_rwlock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_rwlock_data.data.u8[i] = v;
+	}
+
+	/* restore by bytes in reverse order */
+	for (i = RTE_DIM(try_rwlock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_rwlock_data.data.u8[i] != v) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_rwlock_data.data,
+				sizeof(try_rwlock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_rwlock_data.data.u8[i] = 0;
+	}
+
+	rte_rwticket_write_unlock(&try_rwlock_data.lock);
+	return rc;
+}
+
+static int
+try_read_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_rwlock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_RDLOCK;
+
+	ftm = try_rwlock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_read(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static int
+try_write_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_rwlock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_WRLOCK;
+
+	ftm = try_rwlock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_write(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static void
+print_try_lcore_stats(const struct try_rwlock_lcore *tlc, uint32_t lc)
+{
+	uint64_t f, s;
+
+	f = RTE_MAX(tlc->stat.fail, 1ULL);
+	s = RTE_MAX(tlc->stat.success, 1ULL);
+
+	printf("try_lcore_data[%u]={\n"
+		"\trc=%d,\n"
+		"\ttype=%s,\n"
+		"\tfail=%" PRIu64 ",\n"
+		"\tsuccess=%" PRIu64 ",\n"
+		"\tcycles=%" PRIu64 ",\n"
+		"\tcycles/op=%#Lf,\n"
+		"\tcycles/success=%#Lf,\n"
+		"\tsuccess/fail=%#Lf,\n"
+		"};\n",
+		lc,
+		tlc->rc,
+		tlc->type == LC_TYPE_RDLOCK ? "RDLOCK" : "WRLOCK",
+		tlc->stat.fail,
+		tlc->stat.success,
+		tlc->stat.tick,
+		(long double)tlc->stat.tick /
+		(tlc->stat.fail + tlc->stat.success),
+		(long double)tlc->stat.tick / s,
+		(long double)tlc->stat.success / f);
+}
+
+static void
+collect_try_lcore_stats(struct try_rwlock_lcore *tlc,
+	const struct try_rwlock_lcore *lc)
+{
+	tlc->stat.tick += lc->stat.tick;
+	tlc->stat.fail += lc->stat.fail;
+	tlc->stat.success += lc->stat.success;
+}
+
+/*
+ * Process collected results:
+ *  - check status
+ *  - collect and print statistics
+ */
+static int
+process_try_lcore_stats(void)
+{
+	int32_t rc;
+	uint32_t lc, rd, wr;
+	struct try_rwlock_lcore rlc, wlc;
+
+	memset(&rlc, 0, sizeof(rlc));
+	memset(&wlc, 0, sizeof(wlc));
+
+	rlc.type = LC_TYPE_RDLOCK;
+	wlc.type = LC_TYPE_WRLOCK;
+	rd = 0;
+	wr = 0;
+
+	rc = 0;
+	RTE_LCORE_FOREACH(lc) {
+		rc |= try_lcore_data[lc].rc;
+		if (try_lcore_data[lc].type == LC_TYPE_RDLOCK) {
+			collect_try_lcore_stats(&rlc, try_lcore_data + lc);
+			rd++;
+		} else {
+			collect_try_lcore_stats(&wlc, try_lcore_data + lc);
+			wr++;
+		}
+	}
+
+	if (rc == 0) {
+		RTE_LCORE_FOREACH(lc)
+			print_try_lcore_stats(try_lcore_data + lc, lc);
+
+		if (rd != 0) {
+			printf("aggregated stats for %u RDLOCK cores:\n", rd);
+			print_try_lcore_stats(&rlc, rd);
+		}
+
+		if (wr != 0) {
+			printf("aggregated stats for %u WRLOCK cores:\n", wr);
+			print_try_lcore_stats(&wlc, wr);
+		}
+	}
+
+	return rc;
+}
+
+static void
+try_test_reset(void)
+{
+	memset(&try_lcore_data, 0, sizeof(try_lcore_data));
+	memset(&try_rwlock_data, 0, sizeof(try_rwlock_data));
+	try_rwlock_data.tick = TEST_SEC * rte_get_tsc_hz();
+}
+
+/* all lcores grab RDLOCK */
+static int
+try_rwlock_test_rda(void)
+{
+	try_test_reset();
+
+	/* start read test on all available lcores */
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, CALL_MAIN);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* all worker lcores grab RDLOCK, main one grabs WRLOCK */
+static int
+try_rwlock_test_rds_wrm(void)
+{
+	try_test_reset();
+
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, SKIP_MAIN);
+	try_write_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* main and even worker lcores grab RDLOCK, odd lcores grab WRLOCK */
+static int
+try_rwlock_test_rde_wro(void)
+{
+	uint32_t lc, mlc;
+
+	try_test_reset();
+
+	mlc = rte_get_main_lcore();
+
+	RTE_LCORE_FOREACH(lc) {
+		if (lc != mlc) {
+			if ((lc & 1) == 0)
+				rte_eal_remote_launch(try_read_lcore,
+						NULL, lc);
+			else
+				rte_eal_remote_launch(try_write_lcore,
+						NULL, lc);
+		}
+	}
+	try_read_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+static int
+test_rwlock(void)
+{
+	uint32_t i;
+	int32_t rc, ret;
+
+	static const struct {
+		const char *name;
+		int (*ftst)(void);
+	} test[] = {
+		{
+			.name = "rwlock_test1",
+			.ftst = rwlock_test1,
+		},
+		{
+			.name = "try_rwlock_test_rda",
+			.ftst = try_rwlock_test_rda,
+		},
+		{
+			.name = "try_rwlock_test_rds_wrm",
+			.ftst = try_rwlock_test_rds_wrm,
+		},
+		{
+			.name = "try_rwlock_test_rde_wro",
+			.ftst = try_rwlock_test_rde_wro,
+		},
+	};
+
+	ret = 0;
+	for (i = 0; i != RTE_DIM(test); i++) {
+		printf("starting test %s;\n", test[i].name);
+		rc = test[i].ftst();
+		printf("test %s completed with status %d\n", test[i].name, rc);
+		ret |= rc;
+	}
+
+	return ret;
+}
+
+REGISTER_TEST_COMMAND(ticketrwlock_autotest, test_rwlock);
+
+/* subtests used in meson for CI */
+REGISTER_TEST_COMMAND(ticketrwlock_test1_autotest, rwlock_test1);
+REGISTER_TEST_COMMAND(ticketrwlock_rda_autotest, try_rwlock_test_rda);
+REGISTER_TEST_COMMAND(ticketrwlock_rds_wrm_autotest, try_rwlock_test_rds_wrm);
+REGISTER_TEST_COMMAND(ticketrwlock_rde_wro_autotest, try_rwlock_test_rde_wro);
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 748514e24316..d76a4c8ba1c4 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -76,6 +76,7 @@ The public API headers are grouped by topics:
   [rwlock]             (@ref rte_rwlock.h),
   [spinlock]           (@ref rte_spinlock.h),
   [ticketlock]         (@ref rte_ticketlock.h),
+  [ticketrwlock]       (@ref rte_ticket_rwlock.h),
   [RCU]                (@ref rte_rcu_qsbr.h)
 
 - **CPU arch**:
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..951a527ffa64 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -28,6 +28,7 @@ arch_headers = files(
 	'rte_rwlock.h',
 	'rte_spinlock.h',
 	'rte_ticketlock.h',
+	'rte_ticket_rwlock.h',
 	'rte_vect.h',
 )
 install_headers(arch_headers, subdir: get_option('include_subdir_arch'))
diff --git a/lib/librte_eal/arm/include/rte_ticket_rwlock.h b/lib/librte_eal/arm/include/rte_ticket_rwlock.h
new file mode 100644
index 000000000000..273137a5abba
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_ticket_rwlock.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_FAIR_RWLOCK_ARM_H_
+#define _RTE_FAIR_RWLOCK_ARM_H_
+
+#ifndef RTE_FORCE_INTRINSICS
+#  error Platform must be built with RTE_FORCE_INTRINSICS
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_ticket_rwlock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_FAIR_RWLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_ticket_rwlock.h b/lib/librte_eal/include/generic/rte_ticket_rwlock.h
new file mode 100644
index 000000000000..b3637358c1f7
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_ticket_rwlock.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_TICKET_RWLOCK_H_
+#define _RTE_TICKET_RWLOCK_H_
+
+/**
+ * @file
+ *
+ * Ticket based reader/writer lock
+ *
+ * This file defines an API for ticket style read-write locks.
+ * This type of lock acts like rte_rwlock but provides fairness:
+ * requests are handled first come, first served.
+ *
+ * All locks must be initialized before use, and only initialized once.
+ *
+ * References:
+ *  "Spinlocks and Read-Write Locks"
+ *     http://locklessinc.com/articles/locks/
+ *  "Scalable Read-Writer Synchronization for Shared-Memory Multiprocessors"
+ *     https://www.cs.rochester.edu/research/synchronization/pseudocode/rw.html
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef union {
+	uint64_t tickets;
+	struct {
+		union {
+			struct {
+				uint16_t write; /* current writer */
+				uint16_t read;	/* current reader */
+			};
+			uint32_t readwrite;	/* atomic for both read and write */
+		};
+		uint16_t next;	/* next ticket */
+	};
+} rte_rwticketlock_t;
+
+/**
+ * A static rwticket initializer.
+ */
+#define RTE_RWTICKETLOCK_INITIALIZER { 0 }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the rwticketlock to an unlocked state.
+ *
+ * @param rwl
+ *   A pointer to the rwticketlock structure.
+ */
+__rte_experimental
+static inline void
+rte_rwticketlock_init(rte_rwticketlock_t *rwl)
+{
+	rwl->tickets = 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ * Take a write lock. Loop until the lock is held.
+ *
+ * @param rwl
+ *   A pointer to a rwticketlock structure.
+ */
+__rte_experimental
+static inline void
+rte_rwticket_write_lock(rte_rwticketlock_t *rwl)
+{
+	uint16_t me;
+
+	me = __atomic_fetch_add(&rwl->next, 1, __ATOMIC_RELAXED);
+	rte_wait_until_equal_16(&rwl->write, me, __ATOMIC_ACQUIRE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a write lock.
+ *
+ * @param rwl
+ *   A pointer to a rwticketlock structure.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for writing because
+ *     it was already locked for reading or writing
+ */
+__rte_experimental
+static inline int
+rte_rwticket_write_trylock(rte_rwticketlock_t *rwl)
+{
+	rte_rwticketlock_t old, new;
+
+	old.tickets = __atomic_load_n(&rwl->tickets, __ATOMIC_RELAXED);
+	if (old.write != old.next)
+		return -EBUSY;
+
+	new.tickets = old.tickets;
+	new.next = old.next + 1;
+	if (__atomic_compare_exchange_n(&rwl->tickets, &old.tickets, new.tickets,
+					0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+		return 0;
+	else
+		return -EBUSY;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a write lock.
+ *
+ * @param rwl
+ *   A pointer to a rwticketlock structure.
+ */
+__rte_experimental
+static inline void
+rte_rwticket_write_unlock(rte_rwticketlock_t *rwl)
+{
+	rte_rwticketlock_t t;
+
+	t.tickets = __atomic_load_n(&rwl->tickets, __ATOMIC_RELAXED);
+	t.write++;
+	t.read++;
+	__atomic_store_n(&rwl->readwrite, t.readwrite, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ *
+ * Take a read lock. Loop until the lock is held.
+ *
+ * @param l
+ *   A pointer to a rwticketlock structure.
+ */
+__rte_experimental
+static inline void
+rte_rwticket_read_lock(rte_rwticketlock_t *rwl)
+{
+	uint16_t me;
+
+	me = __atomic_fetch_add(&rwl->next, 1, __ATOMIC_RELAXED);
+	rte_wait_until_equal_16(&rwl->read, me, __ATOMIC_ACQUIRE);
+	__atomic_fetch_add(&rwl->read, 1, __ATOMIC_RELAXED);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a read lock.
+ *
+ * @param rwl
+ *   A pointer to a rwticketlock structure.
+ *
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for reading because a
+ *     writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_rwticket_read_trylock(rte_rwticketlock_t *rwl)
+{
+	rte_rwticketlock_t old, new;
+	int success;
+
+	old.tickets = __atomic_load_n(&rwl->tickets, __ATOMIC_RELAXED);
+
+	do {
+		uint16_t me = old.next; /* this is our ticket */
+
+		/* does writer have the lock now? */
+		if (old.read != me && old.write != me)
+			return -EBUSY;
+
+		/* expect to be the next reader */
+		new.tickets = old.tickets;
+		old.read = me;
+		new.read = new.next = me + 1;
+		success = __atomic_compare_exchange_n(&rwl->tickets, &old.tickets, new.tickets,
+						      0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+	} while (!success);
+
+	return 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a read lock.
+ *
+ * @param rwl
+ *   A pointer to the rwticketlock structure.
+ */
+__rte_experimental
+static inline void
+rte_rwticket_read_unlock(rte_rwticketlock_t *rwl)
+{
+	__atomic_add_fetch(&rwl->write, 1, __ATOMIC_RELEASE);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_TICKET_RWLOCK_H_ */
diff --git a/lib/librte_eal/include/meson.build b/lib/librte_eal/include/meson.build
index 0dea342e1deb..fe5c19748926 100644
--- a/lib/librte_eal/include/meson.build
+++ b/lib/librte_eal/include/meson.build
@@ -65,6 +65,7 @@ generic_headers = files(
 	'generic/rte_rwlock.h',
 	'generic/rte_spinlock.h',
 	'generic/rte_ticketlock.h',
+	'generic/rte_ticket_rwlock.h',
 	'generic/rte_vect.h',
 )
 install_headers(generic_headers, subdir: 'generic')
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..0bc560327749 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
 	'rte_rwlock.h',
 	'rte_spinlock.h',
 	'rte_ticketlock.h',
+	'rte_ticket_rwlock.h',
 	'rte_vect.h',
 )
 install_headers(arch_headers, subdir: get_option('include_subdir_arch'))
diff --git a/lib/librte_eal/ppc/include/rte_ticket_rwlock.h b/lib/librte_eal/ppc/include/rte_ticket_rwlock.h
new file mode 100644
index 000000000000..4768d5bfa8ef
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_ticket_rwlock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_FAIR_RWLOCK_PPC_64_H_
+#define _RTE_FAIR_RWLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_ticket_rwlock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_FAIR_RWLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 549cc21a42ed..e9169f0d1da5 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -20,6 +20,7 @@ arch_headers = files(
 	'rte_rwlock.h',
 	'rte_spinlock.h',
 	'rte_ticketlock.h',
+	'rte_ticket_rwlock.h',
 	'rte_vect.h',
 )
 install_headers(arch_headers, subdir: get_option('include_subdir_arch'))
diff --git a/lib/librte_eal/x86/include/rte_ticket_rwlock.h b/lib/librte_eal/x86/include/rte_ticket_rwlock.h
new file mode 100644
index 000000000000..83c8bd0899d3
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_ticket_rwlock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_FAIR_RWLOCK_X86_64_H_
+#define _RTE_FAIR_RWLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_ticket_rwlock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_FAIR_RWLOCK_X86_64_H_ */
-- 
2.29.2



* Re: [dpdk-dev] [PATCH v1] eal: add ticket based reader writer lock
  2021-01-14 17:34 ` [dpdk-dev] [PATCH v1] eal: add ticket based " Stephen Hemminger
@ 2021-01-27 10:25   ` Ruifeng Wang
  2021-01-28  1:32     ` Stephen Hemminger
  2021-01-28  1:16   ` [dpdk-dev] [PATCH v2] " Stephen Hemminger
  2021-02-12  1:38   ` [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks Stephen Hemminger
  2 siblings, 1 reply; 27+ messages in thread
From: Ruifeng Wang @ 2021-01-27 10:25 UTC (permalink / raw)
  To: Stephen Hemminger, dev; +Cc: nd

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Stephen Hemminger
> Sent: Friday, January 15, 2021 1:35 AM
> To: dev@dpdk.org
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Subject: [dpdk-dev] [PATCH v1] eal: add ticket based reader writer lock
> 
> This patch implements a reader/writer ticket lock.
> This lock type acts like rte_rwlock() but uses a ticket algorithm and are fair for
> multiple writers and readers.
> Writers have  priority over readers.

The lock is ticket based to be fair. So writers should have no priority?
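
For example, tracing the code with its single ticket counter
(write/read/next all start at 0):

	/* reader A: gets ticket 0, read==0 so it enters, read -> 1      */
	/* writer B: gets ticket 1, spins until write==1, i.e. until
	 *           A calls read_unlock (which does write++)            */
	/* reader C: gets ticket 2, spins until read==2, i.e. until
	 *           B calls write_unlock (which bumps read and write)   */

So arrivals seem to be served strictly in ticket order, with no
preference for either writers or readers.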

> 
> The tests are just a clone of existing rte_rwlock with test and function names
> changed. So the new ticket rwlocks should be drop in replacement for most
> users.
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> Ps: I have additional tests for rwlock that test for fairness.
> Would these be valuable?
> 
>  app/test/autotest_data.py                     |   6 +
>  app/test/meson.build                          |   5 +
>  app/test/test_ticket_rwlock.c                 | 554 ++++++++++++++++++
>  doc/api/doxy-api-index.md                     |   1 +
>  lib/librte_eal/arm/include/meson.build        |   1 +
>  .../arm/include/rte_ticket_rwlock.h           |  22 +
>  .../include/generic/rte_ticket_rwlock.h       | 218 +++++++
>  lib/librte_eal/include/meson.build            |   1 +
>  lib/librte_eal/ppc/include/meson.build        |   1 +
>  .../ppc/include/rte_ticket_rwlock.h           |  18 +
>  lib/librte_eal/x86/include/meson.build        |   1 +
>  .../x86/include/rte_ticket_rwlock.h           |  18 +
>  12 files changed, 846 insertions(+)
>  create mode 100644 app/test/test_ticket_rwlock.c  create mode 100644
> lib/librte_eal/arm/include/rte_ticket_rwlock.h
>  create mode 100644 lib/librte_eal/include/generic/rte_ticket_rwlock.h
>  create mode 100644 lib/librte_eal/ppc/include/rte_ticket_rwlock.h
>  create mode 100644 lib/librte_eal/x86/include/rte_ticket_rwlock.h
> 

<snip>

> diff --git a/lib/librte_eal/include/generic/rte_ticket_rwlock.h
> b/lib/librte_eal/include/generic/rte_ticket_rwlock.h
> new file mode 100644
> index 000000000000..b3637358c1f7
> --- /dev/null
> +++ b/lib/librte_eal/include/generic/rte_ticket_rwlock.h
> @@ -0,0 +1,218 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation  */
> +
> +#ifndef _RTE_TICKET_RWLOCK_H_
> +#define _RTE_TICKET_RWLOCK_H_
> +
> +/**
> + * @file
> + *
> + * Ticket based reader/writer lock
> + *
> + * This file defines an API for ticket style read-write locks.
> + * This types of lock act like rte_rwlock but provide fairness
> + * and requests are handled first come, first serviced.
> + *
> + * All locks must be initialized before use, and only initialized once.
> + *
> + * References:
> + *  "Spinlocks and Read-Write Locks"
> + *     http://locklessinc.com/articles/locks/
> + *  "Scalable Read-Writer Synchronization for Shared-Memory
> Multiprocessors"
> + *
> https://www.cs.rochester.edu/research/synchronization/pseudocode/rw.ht
> ml
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +typedef union {
> +	uint64_t tickets;
> +	struct {
> +		union {
> +			struct {
> +				uint16_t write; /* current writer */
> +				uint16_t read;	/* current reader */
> +			};
> +			uint32_t readwrite;	/* atomic for both read and
> write */
> +		};
> +		uint16_t next;	/* next ticket */
> +	};
> +} rte_rwticketlock_t;
> +
> +/**
> + * A static rwticket initializer.
> + */
> +#define RTE_RWTICKETLOCK_INITIALIZER { 0 }
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Initialize the rwticketlock to an unlocked state.
> + *
> + * @param rwl
> + *   A pointer to the rwticketlock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_rwticketlock_init(rte_rwticketlock_t *rwl) {
> +	rwl->tickets = 0;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + * Take a write lock. Loop until the lock is held.
> + *
> + * @param rwl
> + *   A pointer to a rwticketlock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_rwticket_write_lock(rte_rwticketlock_t *rwl) {
> +	uint16_t me;
> +
> +	me = __atomic_fetch_add(&rwl->next, 1, __ATOMIC_RELAXED);
> +	rte_wait_until_equal_16(&rwl->write, me, __ATOMIC_ACQUIRE); }
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Try to take a write lock.
> + *
> + * @param rwl
> + *   A pointer to a rwticketlock structure.
> + * @return
> + *   - zero if the lock is successfully taken
> + *   - -EBUSY if lock could not be acquired for writing because
> + *     it was already locked for reading or writing
> + */
> +__rte_experimental
> +static inline int
> +rte_rwticket_write_trylock(rte_rwticketlock_t *rwl) {
> +	rte_rwticketlock_t old, new;
> +
> +	old.tickets = __atomic_load_n(&rwl->tickets, __ATOMIC_RELAXED);
> +	if (old.write != old.next)
> +		return -EBUSY;
> +
> +	new.tickets = old.tickets;
> +	new.next = old.next + 1;
> +	if (__atomic_compare_exchange_n(&rwl->tickets, &old.tickets,
> new.tickets,
> +					0, __ATOMIC_ACQUIRE,
> __ATOMIC_RELAXED))
> +		return 0;
> +	else
> +		return -EBUSY;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a write lock.
> + *
> + * @param rwl
> + *   A pointer to a rwticketlock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_rwticket_write_unlock(rte_rwticketlock_t *rwl) {
> +	rte_rwticketlock_t t;
> +
> +	t.tickets = __atomic_load_n(&rwl->tickets, __ATOMIC_RELAXED);
> +	t.write++;
> +	t.read++;
> +	__atomic_store_n(&rwl->readwrite, t.readwrite,
> __ATOMIC_RELEASE); }
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + *
> + * Take a read lock. Loop until the lock is held.
> + *
> + * @param l

Nit, 'rwl'.

> + *   A pointer to a rwticketlock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_rwticket_read_lock(rte_rwticketlock_t *rwl) {
> +	uint16_t me;
> +
> +	me = __atomic_fetch_add(&rwl->next, 1, __ATOMIC_RELAXED);
> +	rte_wait_until_equal_16(&rwl->read, me, __ATOMIC_ACQUIRE);
> +	__atomic_fetch_add(&rwl->read, 1, __ATOMIC_RELAXED); }
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Try to take a read lock.
> + *
> + * @param rwl
> + *   A pointer to a rwticketlock structure.
> + *
> + * @return
> + *   - zero if the lock is successfully taken
> + *   - -EBUSY if lock could not be acquired for reading because a
> + *     writer holds the lock
> + */
> +__rte_experimental
> +static inline int
> +rte_rwticket_read_trylock(rte_rwticketlock_t *rwl) {
> +	rte_rwticketlock_t old, new;
> +	int success;
> +
> +	old.tickets = __atomic_load_n(&rwl->tickets, __ATOMIC_RELAXED);
> +
> +	do {
> +		uint16_t me = old.next; /* this is our ticket */

When __atomic_compare_exchange_n fails, old.tickets needs a reload.
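
Something along these lines (untested sketch, just to illustrate; note the
builtin already copies the observed value back into old.tickets on a failed
compare-exchange, the explicit reload only makes the retry obvious):

	do {
		uint16_t me = old.next;	/* this is our ticket */

		/* does writer have the lock now? */
		if (old.read != me && old.write != me)
			return -EBUSY;

		/* expect to be the next reader */
		new.tickets = old.tickets;
		new.read = new.next = me + 1;
		success = __atomic_compare_exchange_n(&rwl->tickets,
						      &old.tickets, new.tickets,
						      0, __ATOMIC_ACQUIRE,
						      __ATOMIC_RELAXED);
		if (!success)
			old.tickets = __atomic_load_n(&rwl->tickets,
						      __ATOMIC_RELAXED);
	} while (!success);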
 
> +
> +		/* does writer have the lock now? */
> +		if (old.read != me && old.write != me)

Check (old.read != me) should be enough?

> +			return -EBUSY;
> +
> +		/* expect to be the next reader */
> +		new.tickets = old.tickets;
> +		old.read = me;

This line is unnecessary?

> +		new.read = new.next = me + 1;
> +		success = __atomic_compare_exchange_n(&rwl->tickets,
> &old.tickets, new.tickets,
> +						      0, __ATOMIC_ACQUIRE,
> __ATOMIC_RELAXED);
> +	} while (!success);
> +
> +	return 0;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a read lock.
> + *
> + * @param rwl
> + *   A pointer to the rwticketlock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_rwticket_read_unlock(rte_rwticketlock_t *rwl) {
> +	__atomic_add_fetch(&rwl->write, 1, __ATOMIC_RELEASE); }
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_TICKET_RWLOCK_H_ */
> diff --git a/lib/librte_eal/include/meson.build
> b/lib/librte_eal/include/meson.build
> index 0dea342e1deb..fe5c19748926 100644
> --- a/lib/librte_eal/include/meson.build
> +++ b/lib/librte_eal/include/meson.build
> @@ -65,6 +65,7 @@ generic_headers = files(
>  	'generic/rte_rwlock.h',
>  	'generic/rte_spinlock.h',
>  	'generic/rte_ticketlock.h',
> +	'generic/rte_ticket_rwlock.h',
>  	'generic/rte_vect.h',
>  )
>  install_headers(generic_headers, subdir: 'generic')
> diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
> index dae40ede546e..0bc560327749 100644
> --- a/lib/librte_eal/ppc/include/meson.build
> +++ b/lib/librte_eal/ppc/include/meson.build
> @@ -16,6 +16,7 @@ arch_headers = files(
>  	'rte_rwlock.h',
>  	'rte_spinlock.h',
>  	'rte_ticketlock.h',
> +	'rte_ticket_rwlock.h',
>  	'rte_vect.h',
>  )
>  install_headers(arch_headers, subdir: get_option('include_subdir_arch'))
> diff --git a/lib/librte_eal/ppc/include/rte_ticket_rwlock.h
> b/lib/librte_eal/ppc/include/rte_ticket_rwlock.h
> new file mode 100644
> index 000000000000..4768d5bfa8ef
> --- /dev/null
> +++ b/lib/librte_eal/ppc/include/rte_ticket_rwlock.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation
> + */
> +
> +#ifndef _RTE_FAIR_RWLOCK_PPC_64_H_
> +#define _RTE_FAIR_RWLOCK_PPC_64_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_ticket_rwlock.h"
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_FAIR_RWLOCK_PPC_64_H_ */
> diff --git a/lib/librte_eal/x86/include/meson.build
> b/lib/librte_eal/x86/include/meson.build
> index 549cc21a42ed..e9169f0d1da5 100644
> --- a/lib/librte_eal/x86/include/meson.build
> +++ b/lib/librte_eal/x86/include/meson.build
> @@ -20,6 +20,7 @@ arch_headers = files(
>  	'rte_rwlock.h',
>  	'rte_spinlock.h',
>  	'rte_ticketlock.h',
> +	'rte_ticket_rwlock.h',
>  	'rte_vect.h',
>  )
>  install_headers(arch_headers, subdir: get_option('include_subdir_arch'))
> diff --git a/lib/librte_eal/x86/include/rte_ticket_rwlock.h
> b/lib/librte_eal/x86/include/rte_ticket_rwlock.h
> new file mode 100644
> index 000000000000..83c8bd0899d3
> --- /dev/null
> +++ b/lib/librte_eal/x86/include/rte_ticket_rwlock.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation
> + */
> +
> +#ifndef _RTE_FAIR_RWLOCK_X86_64_H_
> +#define _RTE_FAIR_RWLOCK_X86_64_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_ticket_rwlock.h"
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_FAIR_RWLOCK_X86_64_H_ */
> --
> 2.29.2


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [dpdk-dev] [PATCH v2] eal: add ticket based reader writer lock
  2021-01-14 17:34 ` [dpdk-dev] [PATCH v1] eal: add ticket based " Stephen Hemminger
  2021-01-27 10:25   ` Ruifeng Wang
@ 2021-01-28  1:16   ` Stephen Hemminger
  2021-02-12  1:38   ` [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks Stephen Hemminger
  2 siblings, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-01-28  1:16 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Stephen Hemminger

This patch implements a reader/writer ticket lock because the
current DPDK reader/writer lock will starve writers when
presented with a stream of readers.

This lock type acts like rte_rwlock() but uses a ticket algorithm
and is therefore fair across multiple writers and readers. It behaves
like the existing DPDK ticket and MCS locks but adds reader/writer
semantics.

It is referred to as "Simple, non-scalable, fair reader-writer lock"
in the MCS paper from PPoP '91.

The tests are just a clone of the existing rte_rwlock tests with the
test and function names changed, so the new ticket rwlock should be a
drop-in replacement for most users.
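
For reference, intended usage mirrors rte_rwlock. A minimal sketch
(illustrative only; the shared counter and the reader/writer helpers are
placeholders, not APIs added by this patch):

	#include <stdint.h>
	#include <rte_ticket_rwlock.h>

	static rte_rwticketlock_t cnt_lock = RTE_RWTICKETLOCK_INITIALIZER;
	static uint64_t shared_count;	/* placeholder for protected state */

	static void reader(void)
	{
		rte_rwticket_read_lock(&cnt_lock);
		/* several readers may hold the lock concurrently */
		(void)shared_count;
		rte_rwticket_read_unlock(&cnt_lock);
	}

	static void writer(void)
	{
		/* writers and readers are queued in FIFO (ticket) order */
		rte_rwticket_write_lock(&cnt_lock);
		shared_count++;
		rte_rwticket_write_unlock(&cnt_lock);
	}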

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
v2 - fix a minor checkpatch warning and docbook param error

 app/test/autotest_data.py              | 6 ++++++
 app/test/meson.build                   | 5 +++++
 doc/api/doxy-api-index.md              | 1 +
 lib/librte_eal/arm/include/meson.build | 1 +
 lib/librte_eal/include/meson.build     | 1 +
 lib/librte_eal/ppc/include/meson.build | 1 +
 lib/librte_eal/x86/include/meson.build | 1 +
 7 files changed, 16 insertions(+)

diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 097638941f19..62816c36d873 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -231,6 +231,12 @@
         "Func":    ticketlock_autotest,
         "Report":  None,
     },
+    {
+        "Name":    "Ticket rwlock autotest",
+        "Command": "ticket_rwlock_autotest",
+        "Func":    ticketrwlock_autotest,
+        "Report":  None,
+    },
     {
         "Name":    "MCSlock autotest",
         "Command": "mcslock_autotest",
diff --git a/app/test/meson.build b/app/test/meson.build
index 0889ad4c2367..e6ace8e597e6 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -136,6 +136,7 @@ test_sources = files('commands.c',
 	'test_timer_racecond.c',
 	'test_timer_secondary.c',
 	'test_ticketlock.c',
+	'test_ticket_rwlock.c',
 	'test_trace.c',
 	'test_trace_register.c',
 	'test_trace_perf.c',
@@ -247,6 +248,10 @@ fast_tests = [
         ['table_autotest', true],
         ['tailq_autotest', true],
         ['ticketlock_autotest', true],
+        ['ticketrwlock_test1_autotest', true],
+        ['ticketrwlock_rda_autotest', true],
+        ['ticketrwlock_rds_wrm_autotest', true],
+        ['ticketrwlock_rde_wro_autotest', true],
         ['timer_autotest', false],
         ['user_delay_us', true],
         ['version_autotest', true],
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 748514e24316..d76a4c8ba1c4 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -76,6 +76,7 @@ The public API headers are grouped by topics:
   [rwlock]             (@ref rte_rwlock.h),
   [spinlock]           (@ref rte_spinlock.h),
   [ticketlock]         (@ref rte_ticketlock.h),
+  [ticketrwlock]       (@ref rte_ticket_rwlock.h),
   [RCU]                (@ref rte_rcu_qsbr.h)
 
 - **CPU arch**:
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..951a527ffa64 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -28,6 +28,7 @@ arch_headers = files(
 	'rte_rwlock.h',
 	'rte_spinlock.h',
 	'rte_ticketlock.h',
+	'rte_ticket_rwlock.h',
 	'rte_vect.h',
 )
 install_headers(arch_headers, subdir: get_option('include_subdir_arch'))
diff --git a/lib/librte_eal/include/meson.build b/lib/librte_eal/include/meson.build
index 0dea342e1deb..fe5c19748926 100644
--- a/lib/librte_eal/include/meson.build
+++ b/lib/librte_eal/include/meson.build
@@ -65,6 +65,7 @@ generic_headers = files(
 	'generic/rte_rwlock.h',
 	'generic/rte_spinlock.h',
 	'generic/rte_ticketlock.h',
+	'generic/rte_ticket_rwlock.h',
 	'generic/rte_vect.h',
 )
 install_headers(generic_headers, subdir: 'generic')
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..0bc560327749 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
 	'rte_rwlock.h',
 	'rte_spinlock.h',
 	'rte_ticketlock.h',
+	'rte_ticket_rwlock.h',
 	'rte_vect.h',
 )
 install_headers(arch_headers, subdir: get_option('include_subdir_arch'))
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 549cc21a42ed..e9169f0d1da5 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -20,6 +20,7 @@ arch_headers = files(
 	'rte_rwlock.h',
 	'rte_spinlock.h',
 	'rte_ticketlock.h',
+	'rte_ticket_rwlock.h',
 	'rte_vect.h',
 )
 install_headers(arch_headers, subdir: get_option('include_subdir_arch'))
-- 
2.29.2


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v1] eal: add ticket based reader writer lock
  2021-01-27 10:25   ` Ruifeng Wang
@ 2021-01-28  1:32     ` Stephen Hemminger
  0 siblings, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-01-28  1:32 UTC (permalink / raw)
  To: Ruifeng Wang; +Cc: dev, nd

On Wed, 27 Jan 2021 10:25:15 +0000
Ruifeng Wang <Ruifeng.Wang@arm.com> wrote:

> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Stephen Hemminger
> > Sent: Friday, January 15, 2021 1:35 AM
> > To: dev@dpdk.org
> > Cc: Stephen Hemminger <stephen@networkplumber.org>
> > Subject: [dpdk-dev] [PATCH v1] eal: add ticket based reader writer lock
> > 
> > This patch implements a reader/writer ticket lock.
> > This lock type acts like rte_rwlock() but uses a ticket algorithm and are fair for
> > multiple writers and readers.
> > Writers have  priority over readers.  
> 
> The lock is ticket based to be fair. So writers should have no priority?


Read the articles referenced in the code.
The naming matches what the original MCS paper called it.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks
  2021-01-14 17:34 ` [dpdk-dev] [PATCH v1] eal: add ticket based " Stephen Hemminger
  2021-01-27 10:25   ` Ruifeng Wang
  2021-01-28  1:16   ` [dpdk-dev] [PATCH v2] " Stephen Hemminger
@ 2021-02-12  1:38   ` Stephen Hemminger
  2021-02-28 17:21     ` [dpdk-dev] [PATCH v1] pflock: implementation of phase-fair reader writer locks Stephen Hemminger
                       ` (4 more replies)
  2 siblings, 5 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-02-12  1:38 UTC (permalink / raw)
  To: dev; +Cc: honnappa.nagarahalli, Stephen Hemminger

This is a new type of reader-writer lock that provides better fairness
guarantees. According to the research, it is better suited to real-time
applications such as those built on DPDK.

A similar implementation is available in the Concurrency Kit package in FreeBSD.

For information see:
   "Reader-Writer Synchronization for Shared-Memory Multiprocessor
    Real-Time Systems",
    http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
Note: this supersedes the earlier ticket-based variant proposal, which is
being dropped because the phase-fair lock has better fairness properties.

 app/test/meson.build                        |   6 +
 app/test/test_pflock.c                      | 220 ++++++++++++++++
 lib/librte_eal/arm/include/meson.build      |   1 +
 lib/librte_eal/arm/include/rte_pflock.h     |  18 ++
 lib/librte_eal/include/generic/rte_pflock.h | 272 ++++++++++++++++++++
 lib/librte_eal/ppc/include/meson.build      |   1 +
 lib/librte_eal/ppc/include/rte_pflock.h     |  16 ++
 lib/librte_eal/x86/include/meson.build      |   1 +
 lib/librte_eal/x86/include/rte_pflock.h     |  18 ++
 9 files changed, 553 insertions(+)
 create mode 100644 app/test/test_pflock.c
 create mode 100644 lib/librte_eal/arm/include/rte_pflock.h
 create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
 create mode 100644 lib/librte_eal/x86/include/rte_pflock.h

diff --git a/app/test/meson.build b/app/test/meson.build
index 94fd39fecb82..d5cf0ba701e9 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -89,6 +89,7 @@ test_sources = files('commands.c',
 	'test_mcslock.c',
 	'test_mp_secondary.c',
 	'test_per_lcore.c',
+	'test_pflock.c',
 	'test_pmd_perf.c',
 	'test_power.c',
 	'test_power_cpufreq.c',
@@ -227,6 +228,11 @@ fast_tests = [
         ['meter_autotest', true],
         ['multiprocess_autotest', false],
         ['per_lcore_autotest', true],
+        ['pflock_autotest', true],
+        ['pflock_test1_autotest', true],
+        ['pflock_rda_autotest', true],
+        ['pflock_rds_wrm_autotest', true],
+        ['pflock_rde_wro_autotest', true],
         ['prefetch_autotest', true],
         ['rcu_qsbr_autotest', true],
         ['red_autotest', true],
diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c
new file mode 100644
index 000000000000..b6c5d2f8afde
--- /dev/null
+++ b/app/test/test_pflock.c
@@ -0,0 +1,220 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_pflock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * phase fair lock test
+ * ===========
+ * Provides UT for phase fair lock API.
+ * Main concern is on functional testing, but also provides some
+ * performance measurements.
+ * Obviously for proper testing need to be executed with more than one lcore.
+ */
+
+#define ITER_NUM	0x80
+
+#define TEST_SEC	5
+
+static rte_pflock_t sl;
+static rte_pflock_t sl_tab[RTE_MAX_LCORE];
+static uint32_t synchro;
+
+enum {
+	LC_TYPE_RDLOCK,
+	LC_TYPE_WRLOCK,
+};
+
+static int
+test_pflock_per_core(__rte_unused void *arg)
+{
+	rte_pflock_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_pflock_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_pflock_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER;
+static volatile uint64_t pflock_data;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+#define TEST_PFLOCK_DEBUG 0
+
+static int
+load_loop_fn(__rte_unused void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		rte_pflock_write_lock(&lk);
+		++pflock_data;
+		rte_pflock_write_unlock(&lk);
+
+		rte_pflock_read_lock(&lk);
+		if (TEST_PFLOCK_DEBUG && !(lcount % 100))
+			printf("Core [%u] pflock_data = %"PRIu64"\n",
+				lcore, pflock_data);
+		rte_pflock_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_pflock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	synchro = 0;
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global pflock and a table of pflocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_pflock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, display something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, display something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken during 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+pflock_test1(void)
+{
+	int i;
+
+	rte_pflock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_pflock_init(&sl_tab[i]);
+
+	rte_pflock_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
+	}
+
+	rte_pflock_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_pflock_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_pflock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+static int
+test_pflock(void)
+{
+	uint32_t i;
+	int32_t rc, ret;
+
+	static const struct {
+		const char *name;
+		int (*ftst)(void);
+	} test[] = {
+		{
+			.name = "pflock_test1",
+			.ftst = pflock_test1,
+		},
+	};
+
+	ret = 0;
+	for (i = 0; i != RTE_DIM(test); i++) {
+		printf("starting test %s;\n", test[i].name);
+		rc = test[i].ftst();
+		printf("test %s completed with status %d\n", test[i].name, rc);
+		ret |= rc;
+	}
+
+	return ret;
+}
+
+REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..2c3cff61bed6 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -21,6 +21,7 @@ arch_headers = files(
 	'rte_pause_32.h',
 	'rte_pause_64.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch_32.h',
 	'rte_prefetch_64.h',
diff --git a/lib/librte_eal/arm/include/rte_pflock.h b/lib/librte_eal/arm/include/rte_pflock.h
new file mode 100644
index 000000000000..bb9934eec469
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_ARM_H_
+#define _RTE_PFLOCK_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_pflock.h b/lib/librte_eal/include/generic/rte_pflock.h
new file mode 100644
index 000000000000..afa4edeb2830
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_pflock.h
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corp.
+ * Copyright 2011-2015 Samy Al Bahra.
+ * All rights reserved.
+ */
+
+#ifndef _RTE_PFLOCK_H_
+#define _RTE_PFLOCK_H_
+
+/**
+ * @file
+ *
+ * Phase-fair locks
+ *
+ * This file defines an API for Phase Fair reader writer locks,
+ * which is a variant of typical reader-writer locks that prevent
+ * starvation. In this type of lock, readers and writers alternate.
+ * This significantly reduces the worst-case blocking for readers and writers.
+ *
+ * This is an implementation derived from FreeBSD
+ * based on the work described in:
+ *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
+ *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
+ *
+ * All locks must be initialised before use, and only initialised once.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_pause.h>
+
+/**
+ * The rte_pflock_t type.
+ */
+struct rte_pflock {
+	union rte_pflock_ticket {
+		uint32_t tickets;
+		struct {
+			uint16_t in;
+			uint16_t out;
+		};
+	} rd, wr;
+};
+typedef struct rte_pflock rte_pflock_t;
+
+/**
+ * Constants used to map the bits in reader counter
+ *
+ * +-----------------+-+-+
+ * |     Readers     |W|P|
+ * |                 |R|H|
+ * +-----------------+-+-+
+ */
+#define RTE_PFLOCK_LSB   0xFFFFFFF0
+#define RTE_PFLOCK_RINC  0x100		/* Reader increment value. */
+#define RTE_PFLOCK_WBITS 0x3		/* Writer bits in reader. */
+#define RTE_PFLOCK_PRES  0x2		/* Writer present bit. */
+#define RTE_PFLOCK_PHID  0x1		/* Phase ID bit. */
+
+/**
+ * A static pflock initializer.
+ */
+#define RTE_PFLOCK_INITIALIZER {  }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the pflock to an unlocked state.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_init(struct rte_pflock *pf)
+{
+	pf->rd.tickets = 0;
+	pf->wr.tickets = 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a pflock for read.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_lock(rte_pflock_t *pf)
+{
+	uint32_t w;
+
+	/*
+	 * If no writer is present, then the operation has completed
+	 * successfully.
+	 */
+	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC, __ATOMIC_ACQ_REL) & RTE_PFLOCK_WBITS;
+	if (w == 0)
+		return;
+
+	/* Wait for current write phase to complete. */
+	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE) & RTE_PFLOCK_WBITS) == w)
+		rte_pause();
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock locked for reading.
+ *
+ * @param pf
+ *   A pointer to the pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_unlock(rte_pflock_t *pf)
+{
+	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a pflock for reading
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for reading because a
+ *     writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_read_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->rd.tickets, __ATOMIC_RELAXED);
+
+	/* loop until writer shows up */
+	while ((old.in & RTE_PFLOCK_WBITS) == 0) {
+		new.out = old.out;
+		new.in = old.in + RTE_PFLOCK_RINC;
+		if (__atomic_compare_exchange_n(&pf->rd.tickets, &old.tickets, new.tickets,
+						0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+			return 0;	/* got it */
+
+		/* either new reader got in (so retry) or a writer */
+	}
+
+	/* If writer is present then we are busy */
+	return -EBUSY;
+}
+	
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_lock(rte_pflock_t *pf)
+{
+	uint16_t ticket;
+
+	/* Acquire ownership of write-phase. */
+	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_ACQUIRE);
+	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_RELAXED);
+
+	/*
+	 * Acquire ticket on read-side in order to allow them
+	 * to flush. Indicates to any incoming reader that a
+	 * write-phase is pending.
+	 *
+	 * Need ACQUIRE to prevent speculative execution of the wait loop
+	 */
+	ticket = __atomic_fetch_add(&pf->rd.in,
+				    (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				    __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock held for writing.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_unlock(rte_pflock_t *pf)
+{
+	/* Migrate from write phase to read phase. */
+	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELEASE);
+
+	/* Allow other writers to continue. */
+	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for writing because
+ *     another writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_write_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+	uint16_t ticket;
+	
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->wr.tickets, __ATOMIC_RELAXED);
+	new.out = old.out;
+	new.in  = old.in + 1;
+	ticket = new.in;
+
+	/* if writer is already present then too busy */
+	if (old.out != new.in ||
+	    !__atomic_compare_exchange_n(&pf->wr.tickets, &old.tickets, new.tickets,
+					 0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+		return -EBUSY; /* another writer is present already */
+		
+	/*
+	 * We now own the write phase, but still need to tell
+	 * readers and wait for them.
+	 *
+	 * Need ACQUIRE semantics to avoid speculative execution of wait loop
+	 */
+	ticket  = __atomic_fetch_add(&pf->rd.in,
+				 (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				 __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+	return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_PFLOCK_H */
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..7692a531ccba 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -11,6 +11,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/ppc/include/rte_pflock.h b/lib/librte_eal/ppc/include/rte_pflock.h
new file mode 100644
index 000000000000..e7b875ac56a8
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_pflock.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ */
+#ifndef _RTE_PFLOCK_PPC_64_H_
+#define _RTE_PFLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 549cc21a42ed..39222cf724be 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -14,6 +14,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rtm.h',
diff --git a/lib/librte_eal/x86/include/rte_pflock.h b/lib/librte_eal/x86/include/rte_pflock.h
new file mode 100644
index 000000000000..c2d876062c08
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_X86_64_H_
+#define _RTE_PFLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_X86_64_H_ */
-- 
2.30.0


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [dpdk-dev] [PATCH v1] pflock: implementation of phase-fair reader writer locks
  2021-02-12  1:38   ` [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks Stephen Hemminger
@ 2021-02-28 17:21     ` Stephen Hemminger
  2021-03-03 18:30     ` [dpdk-dev] [PATCH v2] " Stephen Hemminger
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-02-28 17:21 UTC (permalink / raw)
  To: honnappa.nagarahalli, dev; +Cc: Stephen Hemminger

This is a new type of reader-writer lock that provides better fairness
guarantees, which makes it a better fit for typical DPDK applications.
The lock internally uses two ticket pools, one for readers and one
for writers.

Phase-fair reader-writer locks ensure that neither readers nor writers
will be starved. Neither readers nor writers are preferred; they execute
in alternating phases. All operations of the same type (reader or writer)
that try to acquire the lock are handled in FIFO order. Write
operations are exclusive, and multiple read operations can run
together (until a write arrives).

A similar implementation is in the Concurrency Kit package in FreeBSD.
For more information see:
   "Reader-Writer Synchronization for Shared-Memory Multiprocessor
    Real-Time Systems",
    http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
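
To make the intended use concrete, a minimal sketch (illustrative only;
the table counter and helper names are placeholders, not part of the
patch):

	#include <stdint.h>
	#include <rte_pflock.h>

	static rte_pflock_t tbl_lock = RTE_PFLOCK_INITIALIZER;
	static uint64_t tbl_version;	/* placeholder for protected state */

	static void lookup(void)
	{
		rte_pflock_read_lock(&tbl_lock);
		/* read phase: runs concurrently with other readers */
		(void)tbl_version;
		rte_pflock_read_unlock(&tbl_lock);
	}

	static void update(void)
	{
		/* write phase: exclusive; read and write phases alternate */
		rte_pflock_write_lock(&tbl_lock);
		tbl_version++;
		rte_pflock_write_unlock(&tbl_lock);
	}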

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
v1 - cleanup whitespace from the RFC version
     Note: checkpatch emits a bogus spelling complaint on this patch;
     it flags the definition "RTE_PFLOCK_PRES" as a spelling error.

 app/test/meson.build                        |   6 +
 app/test/test_pflock.c                      | 220 ++++++++++++++++
 lib/librte_eal/arm/include/meson.build      |   1 +
 lib/librte_eal/arm/include/rte_pflock.h     |  18 ++
 lib/librte_eal/include/generic/rte_pflock.h | 272 ++++++++++++++++++++
 lib/librte_eal/ppc/include/meson.build      |   1 +
 lib/librte_eal/ppc/include/rte_pflock.h     |  16 ++
 lib/librte_eal/x86/include/meson.build      |   1 +
 lib/librte_eal/x86/include/rte_pflock.h     |  18 ++
 9 files changed, 553 insertions(+)
 create mode 100644 app/test/test_pflock.c
 create mode 100644 lib/librte_eal/arm/include/rte_pflock.h
 create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
 create mode 100644 lib/librte_eal/x86/include/rte_pflock.h

diff --git a/app/test/meson.build b/app/test/meson.build
index 561e493a2944..134098de9ac2 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -90,6 +90,7 @@ test_sources = files('commands.c',
 	'test_mcslock.c',
 	'test_mp_secondary.c',
 	'test_per_lcore.c',
+	'test_pflock.c',
 	'test_pmd_perf.c',
 	'test_power.c',
 	'test_power_cpufreq.c',
@@ -228,6 +229,11 @@ fast_tests = [
         ['meter_autotest', true],
         ['multiprocess_autotest', false],
         ['per_lcore_autotest', true],
+        ['pflock_autotest', true],
+        ['pflock_test1_autotest', true],
+        ['pflock_rda_autotest', true],
+        ['pflock_rds_wrm_autotest', true],
+        ['pflock_rde_wro_autotest', true],
         ['prefetch_autotest', true],
         ['rcu_qsbr_autotest', true],
         ['red_autotest', true],
diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c
new file mode 100644
index 000000000000..b6c5d2f8afde
--- /dev/null
+++ b/app/test/test_pflock.c
@@ -0,0 +1,220 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_pflock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * phase fair lock test
+ * ===========
+ * Provides UT for phase fair lock API.
+ * Main concern is on functional testing, but also provides some
+ * performance measurements.
+ * Obviously for proper testing need to be executed with more than one lcore.
+ */
+
+#define ITER_NUM	0x80
+
+#define TEST_SEC	5
+
+static rte_pflock_t sl;
+static rte_pflock_t sl_tab[RTE_MAX_LCORE];
+static uint32_t synchro;
+
+enum {
+	LC_TYPE_RDLOCK,
+	LC_TYPE_WRLOCK,
+};
+
+static int
+test_pflock_per_core(__rte_unused void *arg)
+{
+	rte_pflock_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_pflock_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_pflock_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER;
+static volatile uint64_t pflock_data;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+#define TEST_PFLOCK_DEBUG 0
+
+static int
+load_loop_fn(__rte_unused void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		rte_pflock_write_lock(&lk);
+		++pflock_data;
+		rte_pflock_write_unlock(&lk);
+
+		rte_pflock_read_lock(&lk);
+		if (TEST_PFLOCK_DEBUG && !(lcount % 100))
+			printf("Core [%u] pflock_data = %"PRIu64"\n",
+				lcore, pflock_data);
+		rte_pflock_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_pflock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	synchro = 0;
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global pflock and a table of pflocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_pflock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, display something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, display something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken during 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+pflock_test1(void)
+{
+	int i;
+
+	rte_pflock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_pflock_init(&sl_tab[i]);
+
+	rte_pflock_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
+	}
+
+	rte_pflock_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_pflock_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_pflock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+static int
+test_pflock(void)
+{
+	uint32_t i;
+	int32_t rc, ret;
+
+	static const struct {
+		const char *name;
+		int (*ftst)(void);
+	} test[] = {
+		{
+			.name = "pflock_test1",
+			.ftst = pflock_test1,
+		},
+	};
+
+	ret = 0;
+	for (i = 0; i != RTE_DIM(test); i++) {
+		printf("starting test %s;\n", test[i].name);
+		rc = test[i].ftst();
+		printf("test %s completed with status %d\n", test[i].name, rc);
+		ret |= rc;
+	}
+
+	return ret;
+}
+
+REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..2c3cff61bed6 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -21,6 +21,7 @@ arch_headers = files(
 	'rte_pause_32.h',
 	'rte_pause_64.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch_32.h',
 	'rte_prefetch_64.h',
diff --git a/lib/librte_eal/arm/include/rte_pflock.h b/lib/librte_eal/arm/include/rte_pflock.h
new file mode 100644
index 000000000000..bb9934eec469
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_ARM_H_
+#define _RTE_PFLOCK_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_pflock.h b/lib/librte_eal/include/generic/rte_pflock.h
new file mode 100644
index 000000000000..327526bc8e12
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_pflock.h
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corp.
+ * Copyright 2011-2015 Samy Al Bahra.
+ * All rights reserved.
+ */
+
+#ifndef _RTE_PFLOCK_H_
+#define _RTE_PFLOCK_H_
+
+/**
+ * @file
+ *
+ * Phase-fair locks
+ *
+ * This file defines an API for Phase Fair reader writer locks,
+ * which is a variant of typical reader-writer locks that prevent
+ * starvation. In this type of lock, readers and writers alternate.
+ * This significantly reduces the worst-case blocking for readers and writers.
+ *
+ * This is an implementation derived from FreeBSD
+ * based on the work described in:
+ *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
+ *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
+ *
+ * All locks must be initialised before use, and only initialised once.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_pause.h>
+
+/**
+ * The rte_pflock_t type.
+ */
+struct rte_pflock {
+	union rte_pflock_ticket {
+		uint32_t tickets;
+		struct {
+			uint16_t in;
+			uint16_t out;
+		};
+	} rd, wr;
+};
+typedef struct rte_pflock rte_pflock_t;
+
+/**
+ * Constants used to map the bits in reader counter
+ *
+ * +-----------------+-+-+
+ * |     Readers     |W|P|
+ * |                 |R|H|
+ * +-----------------+-+-+
+ */
+#define RTE_PFLOCK_LSB   0xFFFFFFF0
+#define RTE_PFLOCK_RINC  0x100		/* Reader increment value. */
+#define RTE_PFLOCK_WBITS 0x3		/* Writer bits in reader. */
+#define RTE_PFLOCK_PRES  0x2		/* Writer present bit. */
+#define RTE_PFLOCK_PHID  0x1		/* Phase ID bit. */
+
+/**
+ * A static pflock initializer.
+ */
+#define RTE_PFLOCK_INITIALIZER {  }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the pflock to an unlocked state.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_init(struct rte_pflock *pf)
+{
+	pf->rd.tickets = 0;
+	pf->wr.tickets = 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a pflock for read.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_lock(rte_pflock_t *pf)
+{
+	uint32_t w;
+
+	/*
+	 * If no writer is present, then the operation has completed
+	 * successfully.
+	 */
+	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC, __ATOMIC_ACQ_REL) & RTE_PFLOCK_WBITS;
+	if (w == 0)
+		return;
+
+	/* Wait for current write phase to complete. */
+	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE) & RTE_PFLOCK_WBITS) == w)
+		rte_pause();
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock locked for reading.
+ *
+ * @param pf
+ *   A pointer to the pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_unlock(rte_pflock_t *pf)
+{
+	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a pflock for reading
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for reading because a
+ *     writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_read_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->rd.tickets, __ATOMIC_RELAXED);
+
+	/* loop until writer shows up */
+	while ((old.in & RTE_PFLOCK_WBITS) == 0) {
+		new.out = old.out;
+		new.in = old.in + RTE_PFLOCK_RINC;
+		if (__atomic_compare_exchange_n(&pf->rd.tickets, &old.tickets, new.tickets,
+						0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+			return 0;	/* got it */
+
+		/* either new reader got in (so retry) or a writer */
+	}
+
+	/* If writer is present then we are busy */
+	return -EBUSY;
+}
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_lock(rte_pflock_t *pf)
+{
+	uint16_t ticket;
+
+	/* Acquire ownership of write-phase. */
+	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_ACQUIRE);
+	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_RELAXED);
+
+	/*
+	 * Acquire ticket on read-side in order to allow them
+	 * to flush. Indicates to any incoming reader that a
+	 * write-phase is pending.
+	 *
+	 * Need ACQUIRE to prevent speculative execution of the wait loop
+	 */
+	ticket = __atomic_fetch_add(&pf->rd.in,
+				    (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				    __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock held for writing.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_unlock(rte_pflock_t *pf)
+{
+	/* Migrate from write phase to read phase. */
+	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELEASE);
+
+	/* Allow other writers to continue. */
+	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for writing because
+ *     another writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_write_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+	uint16_t ticket;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->wr.tickets, __ATOMIC_RELAXED);
+	new.out = old.out;
+	new.in  = old.in + 1;
+	ticket = new.in;
+
+	/* if writer is already present then too busy */
+	if (old.out != new.in ||
+	    !__atomic_compare_exchange_n(&pf->wr.tickets, &old.tickets, new.tickets,
+					 0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+		return -EBUSY; /* another writer is present already */
+
+	/*
+	 * We now own the write phase, but still need to tell
+	 * readers and wait for them.
+	 *
+	 * Need ACQUIRE semantics to avoid speculative execution of wait loop
+	 */
+	ticket  = __atomic_fetch_add(&pf->rd.in,
+				 (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				 __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+	return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_PFLOCK_H */
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..7692a531ccba 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -11,6 +11,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/ppc/include/rte_pflock.h b/lib/librte_eal/ppc/include/rte_pflock.h
new file mode 100644
index 000000000000..e7b875ac56a8
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_pflock.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ */
+#ifndef _RTE_PFLOCK_PPC_64_H_
+#define _RTE_PFLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 1a6ad0b17342..f43645c20899 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -10,6 +10,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rtm.h',
diff --git a/lib/librte_eal/x86/include/rte_pflock.h b/lib/librte_eal/x86/include/rte_pflock.h
new file mode 100644
index 000000000000..c2d876062c08
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_X86_64_H_
+#define _RTE_PFLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_X86_64_H_ */
-- 
2.30.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [dpdk-dev] [PATCH v2] pflock: implementation of phase-fair reader writer locks
  2021-02-12  1:38   ` [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks Stephen Hemminger
  2021-02-28 17:21     ` [dpdk-dev] [PATCH v1] pflock: implementation of phase-fair reader writer locks Stephen Hemminger
@ 2021-03-03 18:30     ` Stephen Hemminger
  2021-03-03 19:19     ` [dpdk-dev] [PATCH v3] " Stephen Hemminger
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-03 18:30 UTC (permalink / raw)
  To: honnappa.nagarahalli, dev; +Cc: Stephen Hemminger, Stephen Hemminger

This is a new type of reader-writer lock that provides better fairness
guarantees, which makes it a better fit for typical DPDK applications.
The lock internally uses two ticket pools, one for readers and one
for writers.

Phase-fair reader-writer locks ensure that neither readers nor writers
will be starved. Neither readers nor writers are preferred; they execute
in alternating phases. All operations of the same type (reader or writer)
that try to acquire the lock are handled in FIFO order. Write
operations are exclusive, and multiple read operations can run
together (until a write arrives).

A similar implementation is in the Concurrency Kit package in FreeBSD.
For more information see:
   "Reader-Writer Synchronization for Shared-Memory Multiprocessor
    Real-Time Systems",
    http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
v2 - fix truncation of mask (reported by some compilers)
     and docbook formatting

 app/test/meson.build                        |   6 +
 app/test/test_pflock.c                      | 542 ++++++++++++++++++++
 lib/librte_eal/arm/include/meson.build      |   1 +
 lib/librte_eal/arm/include/rte_pflock.h     |  18 +
 lib/librte_eal/include/generic/rte_pflock.h | 273 ++++++++++
 lib/librte_eal/ppc/include/meson.build      |   1 +
 lib/librte_eal/ppc/include/rte_pflock.h     |  16 +
 lib/librte_eal/x86/include/meson.build      |   1 +
 lib/librte_eal/x86/include/rte_pflock.h     |  18 +
 9 files changed, 876 insertions(+)
 create mode 100644 app/test/test_pflock.c
 create mode 100644 lib/librte_eal/arm/include/rte_pflock.h
 create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
 create mode 100644 lib/librte_eal/x86/include/rte_pflock.h

diff --git a/app/test/meson.build b/app/test/meson.build
index 561e493a2944..134098de9ac2 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -90,6 +90,7 @@ test_sources = files('commands.c',
 	'test_mcslock.c',
 	'test_mp_secondary.c',
 	'test_per_lcore.c',
+	'test_pflock.c',
 	'test_pmd_perf.c',
 	'test_power.c',
 	'test_power_cpufreq.c',
@@ -228,6 +229,11 @@ fast_tests = [
         ['meter_autotest', true],
         ['multiprocess_autotest', false],
         ['per_lcore_autotest', true],
+        ['pflock_autotest', true],
+        ['pflock_test1_autotest', true],
+        ['pflock_rda_autotest', true],
+        ['pflock_rds_wrm_autotest', true],
+        ['pflock_rde_wro_autotest', true],
         ['prefetch_autotest', true],
         ['rcu_qsbr_autotest', true],
         ['red_autotest', true],
diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c
new file mode 100644
index 000000000000..cd36c02cc710
--- /dev/null
+++ b/app/test/test_pflock.c
@@ -0,0 +1,542 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_pflock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * phase fair lock test
+ * ===========
+ * Provides UT for phase fair lock API.
+ * Main concern is on functional testing, but also provides some
+ * performance measurements.
+ * Obviously for proper testing need to be executed with more than one lcore.
+ */
+
+#define ITER_NUM	0x80
+
+#define TEST_SEC	5
+
+static rte_pflock_t sl;
+static rte_pflock_t sl_tab[RTE_MAX_LCORE];
+static uint32_t synchro;
+
+enum {
+	LC_TYPE_RDLOCK,
+	LC_TYPE_WRLOCK,
+};
+
+static struct {
+	rte_pflock_t lock;
+	uint64_t tick;
+	volatile union {
+		uint8_t u8[RTE_CACHE_LINE_SIZE];
+		uint64_t u64[RTE_CACHE_LINE_SIZE / sizeof(uint64_t)];
+	} data;
+} __rte_cache_aligned try_pflock_data;
+
+struct try_pflock_lcore {
+	int32_t rc;
+	int32_t type;
+	struct {
+		uint64_t tick;
+		uint64_t fail;
+		uint64_t success;
+	} stat;
+} __rte_cache_aligned;
+
+static struct try_pflock_lcore try_lcore_data[RTE_MAX_LCORE];
+
+static int
+test_pflock_per_core(__rte_unused void *arg)
+{
+	rte_pflock_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_pflock_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_pflock_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER;
+static volatile uint64_t pflock_data;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+#define TEST_PFLOCK_DEBUG 0
+
+static int
+load_loop_fn(__rte_unused void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		rte_pflock_write_lock(&lk);
+		++pflock_data;
+		rte_pflock_write_unlock(&lk);
+
+		rte_pflock_read_lock(&lk);
+		if (TEST_PFLOCK_DEBUG && !(lcount % 100))
+			printf("Core [%u] pflock_data = %"PRIu64"\n",
+				lcore, pflock_data);
+		rte_pflock_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_pflock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	synchro = 0;
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global pflock and a table of pflocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_pflock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, display something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, display something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken during 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+pflock_test1(void)
+{
+	int i;
+
+	rte_pflock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_pflock_init(&sl_tab[i]);
+
+	rte_pflock_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
+	}
+
+	rte_pflock_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_pflock_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_pflock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+static int
+test_pflock(void)
+{
+	uint32_t i;
+	int32_t rc, ret;
+
+	static const struct {
+		const char *name;
+		int (*ftst)(void);
+	} test[] = {
+		{
+			.name = "pflock_test1",
+			.ftst = pflock_test1,
+		},
+	};
+
+	ret = 0;
+	for (i = 0; i != RTE_DIM(test); i++) {
+		printf("starting test %s;\n", test[i].name);
+		rc = test[i].ftst();
+		printf("test %s completed with status %d\n", test[i].name, rc);
+		ret |= rc;
+	}
+
+	return ret;
+}
+
+static int
+try_read(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i;
+
+	rc = rte_pflock_read_trylock(&try_pflock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	for (i = 0; i != RTE_DIM(try_pflock_data.data.u64); i++) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u64[i] != 0) {
+			printf("%s(%u) error: unexpected data pattern\n",
+				__func__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+	}
+
+	rte_pflock_read_unlock(&try_pflock_data.lock);
+	return rc;
+}
+
+static int
+try_write(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i, v;
+
+	v = RTE_MAX(lc % UINT8_MAX, 1U);
+
+	rc = rte_pflock_write_trylock(&try_pflock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	/* update by bytes in reverse order */
+	for (i = RTE_DIM(try_pflock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u8[i] != 0) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_pflock_data.data.u8[i] = v;
+	}
+
+	/* restore by bytes in reverse order */
+	for (i = RTE_DIM(try_pflock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u8[i] != v) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_pflock_data.data.u8[i] = 0;
+	}
+
+	rte_pflock_write_unlock(&try_pflock_data.lock);
+	return rc;
+}
+
+static int
+try_read_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_pflock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_RDLOCK;
+
+	ftm = try_pflock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_read(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static int
+try_write_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_pflock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_WRLOCK;
+
+	ftm = try_pflock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_write(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static void
+print_try_lcore_stats(const struct try_pflock_lcore *tlc, uint32_t lc)
+{
+	uint64_t f, s;
+
+	f = RTE_MAX(tlc->stat.fail, 1ULL);
+	s = RTE_MAX(tlc->stat.success, 1ULL);
+
+	printf("try_lcore_data[%u]={\n"
+		"\trc=%d,\n"
+		"\ttype=%s,\n"
+		"\tfail=%" PRIu64 ",\n"
+		"\tsuccess=%" PRIu64 ",\n"
+		"\tcycles=%" PRIu64 ",\n"
+		"\tcycles/op=%#Lf,\n"
+		"\tcycles/success=%#Lf,\n"
+		"\tsuccess/fail=%#Lf,\n"
+		"};\n",
+		lc,
+		tlc->rc,
+		tlc->type == LC_TYPE_RDLOCK ? "RDLOCK" : "WRLOCK",
+		tlc->stat.fail,
+		tlc->stat.success,
+		tlc->stat.tick,
+		(long double)tlc->stat.tick /
+		(tlc->stat.fail + tlc->stat.success),
+		(long double)tlc->stat.tick / s,
+		(long double)tlc->stat.success / f);
+}
+
+static void
+collect_try_lcore_stats(struct try_pflock_lcore *tlc,
+	const struct try_pflock_lcore *lc)
+{
+	tlc->stat.tick += lc->stat.tick;
+	tlc->stat.fail += lc->stat.fail;
+	tlc->stat.success += lc->stat.success;
+}
+
+/*
+ * Process collected results:
+ *  - check status
+ *  - collect and print statistics
+ */
+static int
+process_try_lcore_stats(void)
+{
+	int32_t rc;
+	uint32_t lc, rd, wr;
+	struct try_pflock_lcore rlc, wlc;
+
+	memset(&rlc, 0, sizeof(rlc));
+	memset(&wlc, 0, sizeof(wlc));
+
+	rlc.type = LC_TYPE_RDLOCK;
+	wlc.type = LC_TYPE_WRLOCK;
+	rd = 0;
+	wr = 0;
+
+	rc = 0;
+	RTE_LCORE_FOREACH(lc) {
+		rc |= try_lcore_data[lc].rc;
+		if (try_lcore_data[lc].type == LC_TYPE_RDLOCK) {
+			collect_try_lcore_stats(&rlc, try_lcore_data + lc);
+			rd++;
+		} else {
+			collect_try_lcore_stats(&wlc, try_lcore_data + lc);
+			wr++;
+		}
+	}
+
+	if (rc == 0) {
+		RTE_LCORE_FOREACH(lc)
+			print_try_lcore_stats(try_lcore_data + lc, lc);
+
+		if (rd != 0) {
+			printf("aggregated stats for %u RDLOCK cores:\n", rd);
+			print_try_lcore_stats(&rlc, rd);
+		}
+
+		if (wr != 0) {
+			printf("aggregated stats for %u WRLOCK cores:\n", wr);
+			print_try_lcore_stats(&wlc, wr);
+		}
+	}
+
+	return rc;
+}
+
+static void
+try_test_reset(void)
+{
+	memset(&try_lcore_data, 0, sizeof(try_lcore_data));
+	memset(&try_pflock_data, 0, sizeof(try_pflock_data));
+	try_pflock_data.tick = TEST_SEC * rte_get_tsc_hz();
+}
+
+/* all lcores grab RDLOCK */
+static int
+try_pflock_test_rda(void)
+{
+	try_test_reset();
+
+	/* start read test on all available lcores */
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, CALL_MAIN);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* all worker lcores grab RDLOCK, main one grabs WRLOCK */
+static int
+try_pflock_test_rds_wrm(void)
+{
+	try_test_reset();
+
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, SKIP_MAIN);
+	try_write_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* main and even worker lcores grab RDLOCK, odd lcores grab WRLOCK */
+static int
+try_pflock_test_rde_wro(void)
+{
+	uint32_t lc, mlc;
+
+	try_test_reset();
+
+	mlc = rte_get_main_lcore();
+
+	RTE_LCORE_FOREACH(lc) {
+		if (lc != mlc) {
+			if ((lc & 1) == 0)
+				rte_eal_remote_launch(try_read_lcore,
+						NULL, lc);
+			else
+				rte_eal_remote_launch(try_write_lcore,
+						NULL, lc);
+		}
+	}
+	try_read_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
+
+/* subtests used in meson for CI */
+REGISTER_TEST_COMMAND(pflock_test1_autotest, pflock_test1);
+REGISTER_TEST_COMMAND(pflock_rda_autotest, try_pflock_test_rda);
+REGISTER_TEST_COMMAND(pflock_rds_wrm_autotest, try_pflock_test_rds_wrm);
+REGISTER_TEST_COMMAND(pflock_rde_wro_autotest, try_pflock_test_rde_wro);
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..2c3cff61bed6 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -21,6 +21,7 @@ arch_headers = files(
 	'rte_pause_32.h',
 	'rte_pause_64.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch_32.h',
 	'rte_prefetch_64.h',
diff --git a/lib/librte_eal/arm/include/rte_pflock.h b/lib/librte_eal/arm/include/rte_pflock.h
new file mode 100644
index 000000000000..bb9934eec469
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_ARM_H_
+#define _RTE_PFLOCK_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_pflock.h b/lib/librte_eal/include/generic/rte_pflock.h
new file mode 100644
index 000000000000..6808c70c34a2
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_pflock.h
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corp.
+ * Copyright 2011-2015 Samy Al Bahra.
+ * All rights reserved.
+ */
+
+#ifndef _RTE_PFLOCK_H_
+#define _RTE_PFLOCK_H_
+
+/**
+ * @file
+ *
+ * Phase-fair locks
+ *
+ * This file defines an API for phase-fair reader-writer locks,
+ * a variant of the typical reader-writer lock that prevents
+ * starvation. In this type of lock, readers and writers alternate.
+ * This significantly reduces the worst-case blocking for readers and writers.
+ *
+ * This is an implementation derived from FreeBSD
+ * based on the work described in:
+ *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
+ *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
+ *
+ * All locks must be initialised before use, and only initialised once.
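+ *
+ * A minimal usage sketch (purely illustrative; the lock variable and
+ * the protected data are not part of this API):
+ *
+ * @code
+ * static rte_pflock_t lock = RTE_PFLOCK_INITIALIZER;
+ *
+ * // reader side
+ * rte_pflock_read_lock(&lock);
+ * // ... read the shared data ...
+ * rte_pflock_read_unlock(&lock);
+ *
+ * // writer side
+ * rte_pflock_write_lock(&lock);
+ * // ... modify the shared data ...
+ * rte_pflock_write_unlock(&lock);
+ * @endcode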
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_pause.h>
+
+/**
+ * The rte_pflock_t type.
+ */
+struct rte_pflock {
+	union rte_pflock_ticket {
+		uint32_t tickets;
+		struct {
+			uint16_t in;
+			uint16_t out;
+		};
+	} rd, wr;
+};
+typedef struct rte_pflock rte_pflock_t;
+
+/**
+ * Constants used to map the bits in reader counter
+ *
+ * +-----------------+-+-+
+ * |     Readers     |W|P|
+ * |                 |R|H|
+ * +-----------------+-+-+
+ */
+
+#define RTE_PFLOCK_LSB   0xFFF0
+#define RTE_PFLOCK_RINC  0x100		/* Reader increment value. */
+#define RTE_PFLOCK_WBITS 0x3		/* Writer bits in reader. */
+#define RTE_PFLOCK_PRES  0x2		/* Writer present bit. */
+#define RTE_PFLOCK_PHID  0x1		/* Phase ID bit. */
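+
+/*
+ * Example decoding (illustrative): a value of 0x203 in the reader
+ * counter means the reader ticket field holds 2 and a writer is
+ * pending in phase 1, i.e. 2 * RTE_PFLOCK_RINC + RTE_PFLOCK_PRES +
+ * RTE_PFLOCK_PHID.
+ */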
+
+/**
+ * A static pflock initializer.
+ */
+#define RTE_PFLOCK_INITIALIZER {  }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the pflock to an unlocked state.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_init(struct rte_pflock *pf)
+{
+	pf->rd.tickets = 0;
+	pf->wr.tickets = 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a pflock for read.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_lock(rte_pflock_t *pf)
+{
+	uint32_t w;
+
+	/*
+	 * If no writer is present, then the operation has completed
+	 * successfully.
+	 */
+	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC, __ATOMIC_ACQ_REL) & RTE_PFLOCK_WBITS;
+	if (w == 0)
+		return;
+
+	/* Wait for current write phase to complete. */
+	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE) & RTE_PFLOCK_WBITS) == w)
+		rte_pause();
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock locked for reading.
+ *
+ * @param pf
+ *   A pointer to the pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_unlock(rte_pflock_t *pf)
+{
+	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a pflock for reading
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for reading because a
+ *     writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_read_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->rd.tickets, __ATOMIC_RELAXED);
+
+	/* loop until writer shows up */
+	while ((old.in & RTE_PFLOCK_WBITS) == 0) {
+		new.out = old.out;
+		new.in = old.in + RTE_PFLOCK_RINC;
+		if (__atomic_compare_exchange_n(&pf->rd.tickets, &old.tickets, new.tickets,
+						0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+			return 0;	/* got it */
+
+		/* either new reader got in (so retry) or a writer */
+	}
+
+	/* If writer is present then we are busy */
+	return -EBUSY;
+}
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_lock(rte_pflock_t *pf)
+{
+	uint16_t ticket;
+
+	/* Acquire ownership of write-phase. */
+	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_ACQUIRE);
+	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_RELAXED);
+
+	/*
+	 * Acquire a ticket on the read side so that the current readers
+	 * can drain. This also indicates to any incoming reader that a
+	 * write phase is pending.
+	 *
+	 * Need ACQUIRE to prevent speculative execution of the wait loop
+	 */
+	ticket = __atomic_fetch_add(&pf->rd.in,
+				    (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				    __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock held for writing.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_unlock(rte_pflock_t *pf)
+{
+	/* Migrate from write phase to read phase. */
+	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELEASE);
+
+	/* Allow other writers to continue. */
+	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for writing because
+ *     another writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_write_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+	uint16_t ticket;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->wr.tickets, __ATOMIC_RELAXED);
+	new.out = old.out;
+	new.in  = old.in + 1;
+	ticket = new.in;
+
+	/* if a writer is already queued or active then too busy */
+	if (old.in != old.out ||
+	    !__atomic_compare_exchange_n(&pf->wr.tickets, &old.tickets, new.tickets,
+					 0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+		return -EBUSY; /* another writer is present already */
+
+	/*
+	 * We now own the write phase, but still need to tell
+	 * readers and wait for them.
+	 *
+	 * Need ACQUIRE semantics to avoid speculative execution of wait loop
+	 */
+	ticket  = __atomic_fetch_add(&pf->rd.in,
+				 (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				 __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+	return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_H_ */
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..7692a531ccba 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -11,6 +11,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/ppc/include/rte_pflock.h b/lib/librte_eal/ppc/include/rte_pflock.h
new file mode 100644
index 000000000000..e7b875ac56a8
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_pflock.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ */
+#ifndef _RTE_PFLOCK_PPC_64_H_
+#define _RTE_PFLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 1a6ad0b17342..f43645c20899 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -10,6 +10,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rtm.h',
diff --git a/lib/librte_eal/x86/include/rte_pflock.h b/lib/librte_eal/x86/include/rte_pflock.h
new file mode 100644
index 000000000000..c2d876062c08
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_X86_64_H_
+#define _RTE_PFLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_X86_64_H_ */
-- 
2.30.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [dpdk-dev] [PATCH v3] pflock: implementation of phase-fair reader writer locks
  2021-02-12  1:38   ` [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks Stephen Hemminger
  2021-02-28 17:21     ` [dpdk-dev] [PATCH v1] pflock: implementation of phase-fair reader writer locks Stephen Hemminger
  2021-03-03 18:30     ` [dpdk-dev] [PATCH v2] " Stephen Hemminger
@ 2021-03-03 19:19     ` Stephen Hemminger
  2021-03-26 17:17       ` Stephen Hemminger
  2021-03-29  3:14       ` Honnappa Nagarahalli
  2021-03-30  5:00     ` [dpdk-dev] [PATCH v4] pflock: add " Stephen Hemminger
  2021-04-02  1:42     ` [dpdk-dev] [PATCH v5] pflock: implementation of " Stephen Hemminger
  4 siblings, 2 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-03 19:19 UTC (permalink / raw)
  To: honnappa.nagarahalli, dev; +Cc: Stephen Hemminger, Stephen Hemminger

This is a new type of reader-writer lock that provides better fairness
guarantees which makes it better for typical DPDK applications.
They lock internally uses two ticket pools, one for readers and one
for writers.

Phase fair reader writer locks ensure that neither reader or writer will be
starved. Neither reader or writer are preferred, they execute in
alternating phases. All operations of the same time (reader or writer)
that try to acquire the lock are handled in FIFO order.  Write
operations are exclusive, and multiple read operations can be run
together (until a write arrives).

A similar implementation is in Concurrency Kit package in FreeBSD.
For more information see:
   "Reader-Writer Synchronization for Shared-Memory Multiprocessor
    Real-Time Systems",
    http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
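
To illustrate the intended usage, a minimal sketch (the counter and the
two helper functions below are only an example, not part of this patch):

    #include <stdint.h>
    #include <rte_pflock.h>

    static rte_pflock_t lock = RTE_PFLOCK_INITIALIZER;
    static uint64_t counter;

    /* writer: exclusive, writers are served in ticket (FIFO) order */
    static void
    counter_bump(void)
    {
            rte_pflock_write_lock(&lock);
            counter++;
            rte_pflock_write_unlock(&lock);
    }

    /* reader: shares the read phase with other readers */
    static uint64_t
    counter_snapshot(void)
    {
            uint64_t v;

            rte_pflock_read_lock(&lock);
            v = counter;
            rte_pflock_read_unlock(&lock);
            return v;
    }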

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
v3 - fix some spelling errors inherited in app/test/test_pflock

 app/test/meson.build                        |   6 +
 app/test/test_pflock.c                      | 542 ++++++++++++++++++++
 lib/librte_eal/arm/include/meson.build      |   1 +
 lib/librte_eal/arm/include/rte_pflock.h     |  18 +
 lib/librte_eal/include/generic/rte_pflock.h | 273 ++++++++++
 lib/librte_eal/ppc/include/meson.build      |   1 +
 lib/librte_eal/ppc/include/rte_pflock.h     |  16 +
 lib/librte_eal/x86/include/meson.build      |   1 +
 lib/librte_eal/x86/include/rte_pflock.h     |  18 +
 9 files changed, 876 insertions(+)
 create mode 100644 app/test/test_pflock.c
 create mode 100644 lib/librte_eal/arm/include/rte_pflock.h
 create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
 create mode 100644 lib/librte_eal/x86/include/rte_pflock.h

diff --git a/app/test/meson.build b/app/test/meson.build
index 561e493a2944..134098de9ac2 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -90,6 +90,7 @@ test_sources = files('commands.c',
 	'test_mcslock.c',
 	'test_mp_secondary.c',
 	'test_per_lcore.c',
+	'test_pflock.c',
 	'test_pmd_perf.c',
 	'test_power.c',
 	'test_power_cpufreq.c',
@@ -228,6 +229,11 @@ fast_tests = [
         ['meter_autotest', true],
         ['multiprocess_autotest', false],
         ['per_lcore_autotest', true],
+        ['pflock_autotest', true],
+        ['pflock_test1_autotest', true],
+        ['pflock_rda_autotest', true],
+        ['pflock_rds_wrm_autotest', true],
+        ['pflock_rde_wro_autotest', true],
         ['prefetch_autotest', true],
         ['rcu_qsbr_autotest', true],
         ['red_autotest', true],
diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c
new file mode 100644
index 000000000000..f46610b73914
--- /dev/null
+++ b/app/test/test_pflock.c
@@ -0,0 +1,542 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_pflock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * phase fair lock test
+ * ====================
+ * Provides UT for the phase fair lock API.
+ * Main concern is functional testing, but it also provides some
+ * performance measurements.
+ * Obviously, for proper testing it needs to be executed with more than one lcore.
+ */
+
+#define ITER_NUM	0x80
+
+#define TEST_SEC	5
+
+static rte_pflock_t sl;
+static rte_pflock_t sl_tab[RTE_MAX_LCORE];
+static uint32_t synchro;
+
+enum {
+	LC_TYPE_RDLOCK,
+	LC_TYPE_WRLOCK,
+};
+
+static struct {
+	rte_pflock_t lock;
+	uint64_t tick;
+	volatile union {
+		uint8_t u8[RTE_CACHE_LINE_SIZE];
+		uint64_t u64[RTE_CACHE_LINE_SIZE / sizeof(uint64_t)];
+	} data;
+} __rte_cache_aligned try_pflock_data;
+
+struct try_pflock_lcore {
+	int32_t rc;
+	int32_t type;
+	struct {
+		uint64_t tick;
+		uint64_t fail;
+		uint64_t success;
+	} stat;
+} __rte_cache_aligned;
+
+static struct try_pflock_lcore try_lcore_data[RTE_MAX_LCORE];
+
+static int
+test_pflock_per_core(__rte_unused void *arg)
+{
+	rte_pflock_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_pflock_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_pflock_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER;
+static volatile uint64_t pflock_data;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+#define TEST_PFLOCK_DEBUG 0
+
+static int
+load_loop_fn(__rte_unused void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		rte_pflock_write_lock(&lk);
+		++pflock_data;
+		rte_pflock_write_unlock(&lk);
+
+		rte_pflock_read_lock(&lk);
+		if (TEST_PFLOCK_DEBUG && !(lcount % 100))
+			printf("Core [%u] pflock_data = %"PRIu64"\n",
+				lcore, pflock_data);
+		rte_pflock_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_pflock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	synchro = 0;
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global pflock and a table of pflocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_pflock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, displays something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, displays something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken for 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+pflock_test1(void)
+{
+	int i;
+
+	rte_pflock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_pflock_init(&sl_tab[i]);
+
+	rte_pflock_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
+	}
+
+	rte_pflock_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_pflock_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_pflock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+static int
+test_pflock(void)
+{
+	uint32_t i;
+	int32_t rc, ret;
+
+	static const struct {
+		const char *name;
+		int (*ftst)(void);
+	} test[] = {
+		{
+			.name = "pflock_test1",
+			.ftst = pflock_test1,
+		},
+	};
+
+	ret = 0;
+	for (i = 0; i != RTE_DIM(test); i++) {
+		printf("starting test %s;\n", test[i].name);
+		rc = test[i].ftst();
+		printf("test %s completed with status %d\n", test[i].name, rc);
+		ret |= rc;
+	}
+
+	return ret;
+}
+
+static int
+try_read(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i;
+
+	rc = rte_pflock_read_trylock(&try_pflock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	for (i = 0; i != RTE_DIM(try_pflock_data.data.u64); i++) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u64[i] != 0) {
+			printf("%s(%u) error: unexpected data pattern\n",
+				__func__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+	}
+
+	rte_pflock_read_unlock(&try_pflock_data.lock);
+	return rc;
+}
+
+static int
+try_write(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i, v;
+
+	v = RTE_MAX(lc % UINT8_MAX, 1U);
+
+	rc = rte_pflock_write_trylock(&try_pflock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	/* update by bytes in reverse order */
+	for (i = RTE_DIM(try_pflock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u8[i] != 0) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_pflock_data.data.u8[i] = v;
+	}
+
+	/* restore by bytes in reverse order */
+	for (i = RTE_DIM(try_pflock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u8[i] != v) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_pflock_data.data.u8[i] = 0;
+	}
+
+	rte_pflock_write_unlock(&try_pflock_data.lock);
+	return rc;
+}
+
+static int
+try_read_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_pflock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_RDLOCK;
+
+	ftm = try_pflock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_read(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static int
+try_write_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_pflock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_WRLOCK;
+
+	ftm = try_pflock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_write(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static void
+print_try_lcore_stats(const struct try_pflock_lcore *tlc, uint32_t lc)
+{
+	uint64_t f, s;
+
+	f = RTE_MAX(tlc->stat.fail, 1ULL);
+	s = RTE_MAX(tlc->stat.success, 1ULL);
+
+	printf("try_lcore_data[%u]={\n"
+		"\trc=%d,\n"
+		"\ttype=%s,\n"
+		"\tfail=%" PRIu64 ",\n"
+		"\tsuccess=%" PRIu64 ",\n"
+		"\tcycles=%" PRIu64 ",\n"
+		"\tcycles/op=%#Lf,\n"
+		"\tcycles/success=%#Lf,\n"
+		"\tsuccess/fail=%#Lf,\n"
+		"};\n",
+		lc,
+		tlc->rc,
+		tlc->type == LC_TYPE_RDLOCK ? "RDLOCK" : "WRLOCK",
+		tlc->stat.fail,
+		tlc->stat.success,
+		tlc->stat.tick,
+		(long double)tlc->stat.tick /
+		(tlc->stat.fail + tlc->stat.success),
+		(long double)tlc->stat.tick / s,
+		(long double)tlc->stat.success / f);
+}
+
+static void
+collect_try_lcore_stats(struct try_pflock_lcore *tlc,
+	const struct try_pflock_lcore *lc)
+{
+	tlc->stat.tick += lc->stat.tick;
+	tlc->stat.fail += lc->stat.fail;
+	tlc->stat.success += lc->stat.success;
+}
+
+/*
+ * Process collected results:
+ *  - check status
+ *  - collect and print statistics
+ */
+static int
+process_try_lcore_stats(void)
+{
+	int32_t rc;
+	uint32_t lc, rd, wr;
+	struct try_pflock_lcore rlc, wlc;
+
+	memset(&rlc, 0, sizeof(rlc));
+	memset(&wlc, 0, sizeof(wlc));
+
+	rlc.type = LC_TYPE_RDLOCK;
+	wlc.type = LC_TYPE_WRLOCK;
+	rd = 0;
+	wr = 0;
+
+	rc = 0;
+	RTE_LCORE_FOREACH(lc) {
+		rc |= try_lcore_data[lc].rc;
+		if (try_lcore_data[lc].type == LC_TYPE_RDLOCK) {
+			collect_try_lcore_stats(&rlc, try_lcore_data + lc);
+			rd++;
+		} else {
+			collect_try_lcore_stats(&wlc, try_lcore_data + lc);
+			wr++;
+		}
+	}
+
+	if (rc == 0) {
+		RTE_LCORE_FOREACH(lc)
+			print_try_lcore_stats(try_lcore_data + lc, lc);
+
+		if (rd != 0) {
+			printf("aggregated stats for %u RDLOCK cores:\n", rd);
+			print_try_lcore_stats(&rlc, rd);
+		}
+
+		if (wr != 0) {
+			printf("aggregated stats for %u WRLOCK cores:\n", wr);
+			print_try_lcore_stats(&wlc, wr);
+		}
+	}
+
+	return rc;
+}
+
+static void
+try_test_reset(void)
+{
+	memset(&try_lcore_data, 0, sizeof(try_lcore_data));
+	memset(&try_pflock_data, 0, sizeof(try_pflock_data));
+	try_pflock_data.tick = TEST_SEC * rte_get_tsc_hz();
+}
+
+/* all lcores grab RDLOCK */
+static int
+try_pflock_test_rda(void)
+{
+	try_test_reset();
+
+	/* start read test on all available lcores */
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, CALL_MAIN);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* all worker lcores grab RDLOCK, main one grabs WRLOCK */
+static int
+try_pflock_test_rds_wrm(void)
+{
+	try_test_reset();
+
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, SKIP_MAIN);
+	try_write_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* main and even worker lcores grab RDLOCK, odd lcores grab WRLOCK */
+static int
+try_pflock_test_rde_wro(void)
+{
+	uint32_t lc, mlc;
+
+	try_test_reset();
+
+	mlc = rte_get_main_lcore();
+
+	RTE_LCORE_FOREACH(lc) {
+		if (lc != mlc) {
+			if ((lc & 1) == 0)
+				rte_eal_remote_launch(try_read_lcore,
+						NULL, lc);
+			else
+				rte_eal_remote_launch(try_write_lcore,
+						NULL, lc);
+		}
+	}
+	try_read_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
+
+/* subtests used in meson for CI */
+REGISTER_TEST_COMMAND(pflock_test1_autotest, pflock_test1);
+REGISTER_TEST_COMMAND(pflock_rda_autotest, try_pflock_test_rda);
+REGISTER_TEST_COMMAND(pflock_rds_wrm_autotest, try_pflock_test_rds_wrm);
+REGISTER_TEST_COMMAND(pflock_rde_wro_autotest, try_pflock_test_rde_wro);
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..2c3cff61bed6 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -21,6 +21,7 @@ arch_headers = files(
 	'rte_pause_32.h',
 	'rte_pause_64.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch_32.h',
 	'rte_prefetch_64.h',
diff --git a/lib/librte_eal/arm/include/rte_pflock.h b/lib/librte_eal/arm/include/rte_pflock.h
new file mode 100644
index 000000000000..bb9934eec469
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_ARM_H_
+#define _RTE_PFLOCK_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_pflock.h b/lib/librte_eal/include/generic/rte_pflock.h
new file mode 100644
index 000000000000..6808c70c34a2
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_pflock.h
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corp.
+ * Copyright 2011-2015 Samy Al Bahra.
+ * All rights reserved.
+ */
+
+#ifndef _RTE_PFLOCK_H_
+#define _RTE_PFLOCK_H_
+
+/**
+ * @file
+ *
+ * Phase-fair locks
+ *
+ * This file defines an API for phase-fair reader-writer locks,
+ * a variant of the typical reader-writer lock that prevents
+ * starvation. In this type of lock, readers and writers alternate.
+ * This significantly reduces the worst-case blocking for readers and writers.
+ *
+ * This is an implementation derived from FreeBSD
+ * based on the work described in:
+ *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
+ *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
+ *
+ * All locks must be initialised before use, and only initialised once.
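+ *
+ * A minimal usage sketch (purely illustrative; the lock variable and
+ * the protected data are not part of this API):
+ *
+ * @code
+ * static rte_pflock_t lock = RTE_PFLOCK_INITIALIZER;
+ *
+ * // reader side
+ * rte_pflock_read_lock(&lock);
+ * // ... read the shared data ...
+ * rte_pflock_read_unlock(&lock);
+ *
+ * // writer side
+ * rte_pflock_write_lock(&lock);
+ * // ... modify the shared data ...
+ * rte_pflock_write_unlock(&lock);
+ * @endcode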
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_pause.h>
+
+/**
+ * The rte_pflock_t type.
+ */
+struct rte_pflock {
+	union rte_pflock_ticket {
+		uint32_t tickets;
+		struct {
+			uint16_t in;
+			uint16_t out;
+		};
+	} rd, wr;
+};
+typedef struct rte_pflock rte_pflock_t;
+
+/**
+ * Constants used to map the bits in reader counter
+ *
+ * +-----------------+-+-+
+ * |     Readers     |W|P|
+ * |                 |R|H|
+ * +-----------------+-+-+
+ */
+
+#define RTE_PFLOCK_LSB   0xFFF0
+#define RTE_PFLOCK_RINC  0x100		/* Reader increment value. */
+#define RTE_PFLOCK_WBITS 0x3		/* Writer bits in reader. */
+#define RTE_PFLOCK_PRES  0x2		/* Writer present bit. */
+#define RTE_PFLOCK_PHID  0x1		/* Phase ID bit. */
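+
+/*
+ * Example decoding (illustrative): a value of 0x203 in the reader
+ * counter means the reader ticket field holds 2 and a writer is
+ * pending in phase 1, i.e. 2 * RTE_PFLOCK_RINC + RTE_PFLOCK_PRES +
+ * RTE_PFLOCK_PHID.
+ */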
+
+/**
+ * A static pflock initializer.
+ */
+#define RTE_PFLOCK_INITIALIZER {  }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the pflock to an unlocked state.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_init(struct rte_pflock *pf)
+{
+	pf->rd.tickets = 0;
+	pf->wr.tickets = 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a pflock for read.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_lock(rte_pflock_t *pf)
+{
+	uint32_t w;
+
+	/*
+	 * If no writer is present, then the operation has completed
+	 * successfully.
+	 */
+	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC, __ATOMIC_ACQ_REL) & RTE_PFLOCK_WBITS;
+	if (w == 0)
+		return;
+
+	/* Wait for current write phase to complete. */
+	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE) & RTE_PFLOCK_WBITS) == w)
+		rte_pause();
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock locked for reading.
+ *
+ * @param pf
+ *   A pointer to the pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_unlock(rte_pflock_t *pf)
+{
+	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a pflock for reading
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for reading because a
+ *     writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_read_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->rd.tickets, __ATOMIC_RELAXED);
+
+	/* loop until writer shows up */
+	while ((old.in & RTE_PFLOCK_WBITS) == 0) {
+		new.out = old.out;
+		new.in = old.in + RTE_PFLOCK_RINC;
+		if (__atomic_compare_exchange_n(&pf->rd.tickets, &old.tickets, new.tickets,
+						0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+			return 0;	/* got it */
+
+		/* either new reader got in (so retry) or a writer */
+	}
+
+	/* If writer is present then we are busy */
+	return -EBUSY;
+}
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_lock(rte_pflock_t *pf)
+{
+	uint16_t ticket;
+
+	/* Acquire ownership of write-phase. */
+	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_ACQUIRE);
+	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_RELAXED);
+
+	/*
+	 * Acquire a ticket on the read side so that the current readers
+	 * can drain. This also indicates to any incoming reader that a
+	 * write phase is pending.
+	 *
+	 * Need ACQUIRE to prevent speculative execution of the wait loop
+	 */
+	ticket = __atomic_fetch_add(&pf->rd.in,
+				    (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				    __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock held for writing.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_unlock(rte_pflock_t *pf)
+{
+	/* Migrate from write phase to read phase. */
+	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELEASE);
+
+	/* Allow other writers to continue. */
+	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for writing because
+ *     another writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_write_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+	uint16_t ticket;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->wr.tickets, __ATOMIC_RELAXED);
+	new.out = old.out;
+	new.in  = old.in + 1;
+	ticket = new.in;
+
+	/* if a writer is already queued or active then too busy */
+	if (old.in != old.out ||
+	    !__atomic_compare_exchange_n(&pf->wr.tickets, &old.tickets, new.tickets,
+					 0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+		return -EBUSY; /* another writer is present already */
+
+	/*
+	 * We now own the write phase, but still need to tell
+	 * readers and wait for them.
+	 *
+	 * Need ACQUIRE semantics to avoid speculative execution of wait loop
+	 */
+	ticket  = __atomic_fetch_add(&pf->rd.in,
+				 (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				 __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+	return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_H_ */
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..7692a531ccba 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -11,6 +11,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/ppc/include/rte_pflock.h b/lib/librte_eal/ppc/include/rte_pflock.h
new file mode 100644
index 000000000000..e7b875ac56a8
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_pflock.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ */
+#ifndef _RTE_PFLOCK_PPC_64_H_
+#define _RTE_PFLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 1a6ad0b17342..f43645c20899 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -10,6 +10,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rtm.h',
diff --git a/lib/librte_eal/x86/include/rte_pflock.h b/lib/librte_eal/x86/include/rte_pflock.h
new file mode 100644
index 000000000000..c2d876062c08
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_X86_64_H_
+#define _RTE_PFLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_X86_64_H_ */
-- 
2.30.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v3] pflock: implementation of phase-fair reader writer locks
  2021-03-03 19:19     ` [dpdk-dev] [PATCH v3] " Stephen Hemminger
@ 2021-03-26 17:17       ` Stephen Hemminger
  2021-03-29  3:14       ` Honnappa Nagarahalli
  1 sibling, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-26 17:17 UTC (permalink / raw)
  To: honnappa.nagarahalli, dev; +Cc: Stephen Hemminger

On Wed,  3 Mar 2021 11:19:45 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:

> This is a new type of reader-writer lock that provides better fairness
> guarantees which makes it better for typical DPDK applications.
> They lock internally uses two ticket pools, one for readers and one
> for writers.
> 
> Phase fair reader writer locks ensure that neither reader or writer will be
> starved. Neither reader or writer are preferred, they execute in
> alternating phases. All operations of the same time (reader or writer)
> that try to acquire the lock are handled in FIFO order.  Write
> operations are exclusive, and multiple read operations can be run
> together (until a write arrives).
> 
> A similar implementation is in Concurrency Kit package in FreeBSD.
> For more information see:
>    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
>     Real-Time Systems",
>     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

Why has there been no review of this patch?

The only complaint in patchwork is a bogus checkpatch warning about
a possible spelling error.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v3] pflock: implementation of phase-fair reader writer locks
  2021-03-03 19:19     ` [dpdk-dev] [PATCH v3] " Stephen Hemminger
  2021-03-26 17:17       ` Stephen Hemminger
@ 2021-03-29  3:14       ` Honnappa Nagarahalli
  2021-03-29 17:22         ` Stephen Hemminger
  2021-03-29 19:58         ` Stephen Hemminger
  1 sibling, 2 replies; 27+ messages in thread
From: Honnappa Nagarahalli @ 2021-03-29  3:14 UTC (permalink / raw)
  To: Stephen Hemminger, dev; +Cc: Stephen Hemminger, nd, Honnappa Nagarahalli, nd

<snip>

> Subject: [PATCH v3] pflock: implementation of phase-fair reader writer locks
> 
> This is a new type of reader-writer lock that provides better fairness
> guarantees which makes it better for typical DPDK applications.
> They lock internally uses two ticket pools, one for readers and one for
    ^^^^ The

> writers.
> 
> Phase fair reader writer locks ensure that neither reader or writer will be
> starved. Neither reader or writer are preferred, they execute in alternating
> phases. All operations of the same time (reader or writer) that try to acquire
                                                                  ^^^^ type

> the lock are handled in FIFO order.  Write operations are exclusive, and
> multiple read operations can be run together (until a write arrives).
> 
> A similar implementation is in Concurrency Kit package in FreeBSD.
> For more information see:
>    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
>     Real-Time Systems",
>     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> v3 - fix some spelling errors inherited in app/test/test_pflock
> 
>  app/test/meson.build                        |   6 +
>  app/test/test_pflock.c                      | 542 ++++++++++++++++++++
>  lib/librte_eal/arm/include/meson.build      |   1 +
>  lib/librte_eal/arm/include/rte_pflock.h     |  18 +
>  lib/librte_eal/include/generic/rte_pflock.h | 273 ++++++++++
>  lib/librte_eal/ppc/include/meson.build      |   1 +
>  lib/librte_eal/ppc/include/rte_pflock.h     |  16 +
>  lib/librte_eal/x86/include/meson.build      |   1 +
>  lib/librte_eal/x86/include/rte_pflock.h     |  18 +
>  9 files changed, 876 insertions(+)
>  create mode 100644 app/test/test_pflock.c  create mode 100644
> lib/librte_eal/arm/include/rte_pflock.h
>  create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
>  create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
>  create mode 100644 lib/librte_eal/x86/include/rte_pflock.h
> 
> diff --git a/app/test/meson.build b/app/test/meson.build index
> 561e493a2944..134098de9ac2 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -90,6 +90,7 @@ test_sources = files('commands.c',
>  	'test_mcslock.c',
>  	'test_mp_secondary.c',
>  	'test_per_lcore.c',
> +	'test_pflock.c',
>  	'test_pmd_perf.c',
>  	'test_power.c',
>  	'test_power_cpufreq.c',
> @@ -228,6 +229,11 @@ fast_tests = [
>          ['meter_autotest', true],
>          ['multiprocess_autotest', false],
>          ['per_lcore_autotest', true],
> +        ['pflock_autotest', true],
> +        ['pflock_test1_autotest', true],
> +        ['pflock_rda_autotest', true],
> +        ['pflock_rds_wrm_autotest', true],
> +        ['pflock_rde_wro_autotest', true],
>          ['prefetch_autotest', true],
>          ['rcu_qsbr_autotest', true],
>          ['red_autotest', true],
> diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c new file mode
> 100644 index 000000000000..f46610b73914
> --- /dev/null
> +++ b/app/test/test_pflock.c
> @@ -0,0 +1,542 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2014 Intel Corporation  */
Update the copyright

> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <inttypes.h>
> +#include <unistd.h>
> +#include <sys/queue.h>
> +#include <string.h>
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_per_lcore.h>
> +#include <rte_launch.h>
> +#include <rte_pause.h>
> +#include <rte_pflock.h>
> +#include <rte_eal.h>
> +#include <rte_lcore.h>
> +#include <rte_cycles.h>
> +
> +#include "test.h"
> +
> +/*
> + * phase fair lock test
> + * ====================
> + * Provides UT for the phase fair lock API.
> + * Main concern is functional testing, but it also provides some
> + * performance measurements.
> + * Obviously, for proper testing it needs to be executed with more than
> + * one lcore.
> + */
> +
> +#define ITER_NUM	0x80
> +
> +#define TEST_SEC	5
> +
> +static rte_pflock_t sl;
> +static rte_pflock_t sl_tab[RTE_MAX_LCORE]; static uint32_t synchro;
> +
> +enum {
> +	LC_TYPE_RDLOCK,
> +	LC_TYPE_WRLOCK,
> +};
> +
> +static struct {
> +	rte_pflock_t lock;
> +	uint64_t tick;
> +	volatile union {
> +		uint8_t u8[RTE_CACHE_LINE_SIZE];
> +		uint64_t u64[RTE_CACHE_LINE_SIZE / sizeof(uint64_t)];
> +	} data;
> +} __rte_cache_aligned try_pflock_data;
> +
> +struct try_pflock_lcore {
> +	int32_t rc;
> +	int32_t type;
> +	struct {
> +		uint64_t tick;
> +		uint64_t fail;
> +		uint64_t success;
> +	} stat;
> +} __rte_cache_aligned;
> +
> +static struct try_pflock_lcore try_lcore_data[RTE_MAX_LCORE];
> +
> +static int
> +test_pflock_per_core(__rte_unused void *arg) {
> +	rte_pflock_write_lock(&sl);
> +	printf("Global write lock taken on core %u\n", rte_lcore_id());
> +	rte_pflock_write_unlock(&sl);
> +
> +	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
> +	printf("Hello from core %u !\n", rte_lcore_id());
> +	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
> +
> +	rte_pflock_read_lock(&sl);
> +	printf("Global read lock taken on core %u\n", rte_lcore_id());
> +	rte_delay_ms(100);
> +	printf("Release global read lock on core %u\n", rte_lcore_id());
> +	rte_pflock_read_unlock(&sl);
> +
> +	return 0;
> +}
> +
> +static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER; static volatile
> +uint64_t pflock_data; static uint64_t time_count[RTE_MAX_LCORE] = {0};
> +
> +#define MAX_LOOP 10000
> +#define TEST_PFLOCK_DEBUG 0
> +
> +static int
> +load_loop_fn(__rte_unused void *arg)
> +{
> +	uint64_t time_diff = 0, begin;
> +	uint64_t hz = rte_get_timer_hz();
> +	uint64_t lcount = 0;
> +	const unsigned int lcore = rte_lcore_id();
> +
> +	/* wait synchro for workers */
> +	if (lcore != rte_get_main_lcore())
> +		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
> +
> +	begin = rte_rdtsc_precise();
> +	while (lcount < MAX_LOOP) {
> +		rte_pflock_write_lock(&lk);
> +		++pflock_data;
> +		rte_pflock_write_unlock(&lk);
> +
> +		rte_pflock_read_lock(&lk);
> +		if (TEST_PFLOCK_DEBUG && !(lcount % 100))
> +			printf("Core [%u] pflock_data = %"PRIu64"\n",
> +				lcore, pflock_data);
> +		rte_pflock_read_unlock(&lk);
> +
> +		lcount++;
> +		/* delay to make lock duty cycle slightly realistic */
> +		rte_pause();
> +	}
> +
> +	time_diff = rte_rdtsc_precise() - begin;
> +	time_count[lcore] = time_diff * 1000000 / hz;
> +	return 0;
> +}
> +
> +static int
> +test_pflock_perf(void)
> +{
> +	unsigned int i;
> +	uint64_t total = 0;
> +
> +	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
> +
> +	/* clear synchro and start workers */
> +	synchro = 0;
> +	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) <
> 0)
> +		return -1;
> +
> +	/* start synchro and launch test on main */
> +	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
> +	load_loop_fn(NULL);
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	RTE_LCORE_FOREACH(i) {
> +		printf("Core [%u] cost time = %"PRIu64" us\n",
> +			i, time_count[i]);
> +		total += time_count[i];
> +	}
> +
> +	printf("Total cost time = %"PRIu64" us\n", total);
> +	memset(time_count, 0, sizeof(time_count));
> +
> +	return 0;
> +}
> +
> +/*
> + * - There is a global pflock and a table of pflocks (one per lcore).
> + *
> + * - The test function takes all of these locks and launches the
> + *   ``test_pflock_per_core()`` function on each core (except the main).
> + *
> + *   - The function takes the global write lock, displays something,
> + *     then releases the global lock.
> + *   - Then, it takes the per-lcore write lock, displays something, and
> + *     releases the per-core lock.
> + *   - Finally, a read lock is taken for 100 ms, then released.
> + *
> + * - The main function unlocks the per-lcore locks sequentially and
> + *   waits between each lock. This triggers the display of a message
> + *   for each core, in the correct order.
> + *
> + *   Then, it tries to take the global write lock and display the last
> + *   message. The autotest script checks that the message order is correct.
> + */
> +static int
> +pflock_test1(void)
> +{
> +	int i;
> +
> +	rte_pflock_init(&sl);
> +	for (i = 0; i < RTE_MAX_LCORE; i++)
> +		rte_pflock_init(&sl_tab[i]);
> +
> +	rte_pflock_write_lock(&sl);
> +
> +	RTE_LCORE_FOREACH_WORKER(i) {
> +		rte_pflock_write_lock(&sl_tab[i]);
> +		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
> +	}
> +
> +	rte_pflock_write_unlock(&sl);
> +
> +	RTE_LCORE_FOREACH_WORKER(i) {
> +		rte_pflock_write_unlock(&sl_tab[i]);
> +		rte_delay_ms(100);
> +	}
> +
> +	rte_pflock_write_lock(&sl);
> +	/* this message should be the last message of test */
> +	printf("Global write lock taken on main core %u\n", rte_lcore_id());
> +	rte_pflock_write_unlock(&sl);
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	if (test_pflock_perf() < 0)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int
> +test_pflock(void)
> +{
> +	uint32_t i;
> +	int32_t rc, ret;
> +
> +	static const struct {
> +		const char *name;
> +		int (*ftst)(void);
> +	} test[] = {
> +		{
> +			.name = "pflock_test1",
> +			.ftst = pflock_test1,
> +		},
> +	};
> +
> +	ret = 0;
> +	for (i = 0; i != RTE_DIM(test); i++) {
> +		printf("starting test %s;\n", test[i].name);
> +		rc = test[i].ftst();
> +		printf("test %s completed with status %d\n", test[i].name,
> rc);
> +		ret |= rc;
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +try_read(uint32_t lc)
> +{
> +	int32_t rc;
> +	uint32_t i;
> +
> +	rc = rte_pflock_read_trylock(&try_pflock_data.lock);
> +	if (rc != 0)
> +		return rc;
> +
> +	for (i = 0; i != RTE_DIM(try_pflock_data.data.u64); i++) {
> +
> +		/* race condition occurred, lock doesn't work properly */
> +		if (try_pflock_data.data.u64[i] != 0) {
> +			printf("%s(%u) error: unexpected data pattern\n",
> +				__func__, lc);
> +			rte_memdump(stdout, NULL,
> +				(void *)(uintptr_t)&try_pflock_data.data,
> +				sizeof(try_pflock_data.data));
> +			rc = -EFAULT;
> +			break;
> +		}
> +	}
> +
> +	rte_pflock_read_unlock(&try_pflock_data.lock);
> +	return rc;
> +}
> +
> +static int
> +try_write(uint32_t lc)
> +{
> +	int32_t rc;
> +	uint32_t i, v;
> +
> +	v = RTE_MAX(lc % UINT8_MAX, 1U);
> +
> +	rc = rte_pflock_write_trylock(&try_pflock_data.lock);
> +	if (rc != 0)
> +		return rc;
> +
> +	/* update by bytes in reverse order */
> +	for (i = RTE_DIM(try_pflock_data.data.u8); i-- != 0; ) {
> +
> +		/* race condition occurred, lock doesn't work properly */
> +		if (try_pflock_data.data.u8[i] != 0) {
> +			printf("%s:%d(%u) error: unexpected data
> pattern\n",
> +				__func__, __LINE__, lc);
> +			rte_memdump(stdout, NULL,
> +				(void *)(uintptr_t)&try_pflock_data.data,
> +				sizeof(try_pflock_data.data));
> +			rc = -EFAULT;
> +			break;
> +		}
> +
> +		try_pflock_data.data.u8[i] = v;
> +	}
> +
> +	/* restore by bytes in reverse order */
> +	for (i = RTE_DIM(try_pflock_data.data.u8); i-- != 0; ) {
> +
> +		/* race condition occurred, lock doesn't work properly */
> +		if (try_pflock_data.data.u8[i] != v) {
> +			printf("%s:%d(%u) error: unexpected data
> pattern\n",
> +				__func__, __LINE__, lc);
> +			rte_memdump(stdout, NULL,
> +				(void *)(uintptr_t)&try_pflock_data.data,
> +				sizeof(try_pflock_data.data));
> +			rc = -EFAULT;
> +			break;
> +		}
> +
> +		try_pflock_data.data.u8[i] = 0;
> +	}
> +
> +	rte_pflock_write_unlock(&try_pflock_data.lock);
> +	return rc;
> +}
> +
> +static int
> +try_read_lcore(__rte_unused void *data) {
> +	int32_t rc;
> +	uint32_t i, lc;
> +	uint64_t ftm, stm, tm;
> +	struct try_pflock_lcore *lcd;
> +
> +	lc = rte_lcore_id();
> +	lcd = try_lcore_data + lc;
> +	lcd->type = LC_TYPE_RDLOCK;
> +
> +	ftm = try_pflock_data.tick;
> +	stm = rte_get_timer_cycles();
> +
> +	do {
> +		for (i = 0; i != ITER_NUM; i++) {
> +			rc = try_read(lc);
> +			if (rc == 0)
> +				lcd->stat.success++;
> +			else if (rc == -EBUSY)
> +				lcd->stat.fail++;
> +			else
> +				break;
> +			rc = 0;
> +		}
> +		tm = rte_get_timer_cycles() - stm;
> +	} while (tm < ftm && rc == 0);
> +
> +	lcd->rc = rc;
> +	lcd->stat.tick = tm;
> +	return rc;
> +}
> +
> +static int
> +try_write_lcore(__rte_unused void *data) {
> +	int32_t rc;
> +	uint32_t i, lc;
> +	uint64_t ftm, stm, tm;
> +	struct try_pflock_lcore *lcd;
> +
> +	lc = rte_lcore_id();
> +	lcd = try_lcore_data + lc;
> +	lcd->type = LC_TYPE_WRLOCK;
> +
> +	ftm = try_pflock_data.tick;
> +	stm = rte_get_timer_cycles();
> +
> +	do {
> +		for (i = 0; i != ITER_NUM; i++) {
> +			rc = try_write(lc);
> +			if (rc == 0)
> +				lcd->stat.success++;
> +			else if (rc == -EBUSY)
> +				lcd->stat.fail++;
> +			else
> +				break;
> +			rc = 0;
> +		}
> +		tm = rte_get_timer_cycles() - stm;
> +	} while (tm < ftm && rc == 0);
> +
> +	lcd->rc = rc;
> +	lcd->stat.tick = tm;
> +	return rc;
> +}
> +
> +static void
> +print_try_lcore_stats(const struct try_pflock_lcore *tlc, uint32_t lc)
> +{
> +	uint64_t f, s;
> +
> +	f = RTE_MAX(tlc->stat.fail, 1ULL);
> +	s = RTE_MAX(tlc->stat.success, 1ULL);
> +
> +	printf("try_lcore_data[%u]={\n"
> +		"\trc=%d,\n"
> +		"\ttype=%s,\n"
> +		"\tfail=%" PRIu64 ",\n"
> +		"\tsuccess=%" PRIu64 ",\n"
> +		"\tcycles=%" PRIu64 ",\n"
> +		"\tcycles/op=%#Lf,\n"
> +		"\tcycles/success=%#Lf,\n"
> +		"\tsuccess/fail=%#Lf,\n"
> +		"};\n",
> +		lc,
> +		tlc->rc,
> +		tlc->type == LC_TYPE_RDLOCK ? "RDLOCK" : "WRLOCK",
> +		tlc->stat.fail,
> +		tlc->stat.success,
> +		tlc->stat.tick,
> +		(long double)tlc->stat.tick /
> +		(tlc->stat.fail + tlc->stat.success),
> +		(long double)tlc->stat.tick / s,
> +		(long double)tlc->stat.success / f);
> +}
> +
> +static void
> +collect_try_lcore_stats(struct try_pflock_lcore *tlc,
> +	const struct try_pflock_lcore *lc)
> +{
> +	tlc->stat.tick += lc->stat.tick;
> +	tlc->stat.fail += lc->stat.fail;
> +	tlc->stat.success += lc->stat.success;
> +}
> +
> +/*
> + * Process collected results:
> + *  - check status
> + *  - collect and print statistics
> + */
> +static int
> +process_try_lcore_stats(void)
> +{
> +	int32_t rc;
> +	uint32_t lc, rd, wr;
> +	struct try_pflock_lcore rlc, wlc;
> +
> +	memset(&rlc, 0, sizeof(rlc));
> +	memset(&wlc, 0, sizeof(wlc));
> +
> +	rlc.type = LC_TYPE_RDLOCK;
> +	wlc.type = LC_TYPE_WRLOCK;
> +	rd = 0;
> +	wr = 0;
> +
> +	rc = 0;
> +	RTE_LCORE_FOREACH(lc) {
> +		rc |= try_lcore_data[lc].rc;
> +		if (try_lcore_data[lc].type == LC_TYPE_RDLOCK) {
> +			collect_try_lcore_stats(&rlc, try_lcore_data + lc);
> +			rd++;
> +		} else {
> +			collect_try_lcore_stats(&wlc, try_lcore_data + lc);
> +			wr++;
> +		}
> +	}
> +
> +	if (rc == 0) {
> +		RTE_LCORE_FOREACH(lc)
> +			print_try_lcore_stats(try_lcore_data + lc, lc);
> +
> +		if (rd != 0) {
> +			printf("aggregated stats for %u RDLOCK cores:\n",
> +				rd);
> +			print_try_lcore_stats(&rlc, rd);
> +		}
> +
> +		if (wr != 0) {
> +			printf("aggregated stats for %u WRLOCK cores:\n",
> +				wr);
> +			print_try_lcore_stats(&wlc, wr);
> +		}
> +	}
> +
> +	return rc;
> +}
> +
> +static void
> +try_test_reset(void)
> +{
> +	memset(&try_lcore_data, 0, sizeof(try_lcore_data));
> +	memset(&try_pflock_data, 0, sizeof(try_pflock_data));
> +	try_pflock_data.tick = TEST_SEC * rte_get_tsc_hz();
> +}
> +
> +/* all lcores grab RDLOCK */
> +static int
> +try_pflock_test_rda(void)
> +{
> +	try_test_reset();
> +
> +	/* start read test on all available lcores */
> +	rte_eal_mp_remote_launch(try_read_lcore, NULL, CALL_MAIN);
> +	rte_eal_mp_wait_lcore();
> +
> +	return process_try_lcore_stats();
> +}
> +
> +/* all worker lcores grab RDLOCK, main one grabs WRLOCK */
> +static int
> +try_pflock_test_rds_wrm(void)
> +{
> +	try_test_reset();
> +
> +	rte_eal_mp_remote_launch(try_read_lcore, NULL, SKIP_MAIN);
> +	try_write_lcore(NULL);
> +	rte_eal_mp_wait_lcore();
> +
> +	return process_try_lcore_stats();
> +}
> +
> +/* main and even worker lcores grab RDLOCK, odd lcores grab WRLOCK */
> +static int
> +try_pflock_test_rde_wro(void)
> +{
> +	uint32_t lc, mlc;
> +
> +	try_test_reset();
> +
> +	mlc = rte_get_main_lcore();
> +
> +	RTE_LCORE_FOREACH(lc) {
> +		if (lc != mlc) {
> +			if ((lc & 1) == 0)
> +				rte_eal_remote_launch(try_read_lcore,
> +						NULL, lc);
> +			else
> +				rte_eal_remote_launch(try_write_lcore,
> +						NULL, lc);
> +		}
> +	}
> +	try_read_lcore(NULL);
> +	rte_eal_mp_wait_lcore();
> +
> +	return process_try_lcore_stats();
> +}
> +
> +REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
> +
> +/* subtests used in meson for CI */
> +REGISTER_TEST_COMMAND(pflock_test1_autotest, pflock_test1);
> +REGISTER_TEST_COMMAND(pflock_rda_autotest, try_pflock_test_rda);
> +REGISTER_TEST_COMMAND(pflock_rds_wrm_autotest, try_pflock_test_rds_wrm);
> +REGISTER_TEST_COMMAND(pflock_rde_wro_autotest, try_pflock_test_rde_wro);
> diff --git a/lib/librte_eal/arm/include/meson.build
> b/lib/librte_eal/arm/include/meson.build
> index 770766de1a34..2c3cff61bed6 100644
> --- a/lib/librte_eal/arm/include/meson.build
> +++ b/lib/librte_eal/arm/include/meson.build
> @@ -21,6 +21,7 @@ arch_headers = files(
>  	'rte_pause_32.h',
>  	'rte_pause_64.h',
>  	'rte_pause.h',
> +	'rte_pflock.h',
>  	'rte_power_intrinsics.h',
>  	'rte_prefetch_32.h',
>  	'rte_prefetch_64.h',
> diff --git a/lib/librte_eal/arm/include/rte_pflock.h
> b/lib/librte_eal/arm/include/rte_pflock.h
> new file mode 100644
> index 000000000000..bb9934eec469
> --- /dev/null
> +++ b/lib/librte_eal/arm/include/rte_pflock.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation
> + */
> +
> +#ifndef _RTE_PFLOCK_ARM_H_
> +#define _RTE_PFLOCK_ARM_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_pflock.h"
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PFLOCK_ARM_H_ */
> diff --git a/lib/librte_eal/include/generic/rte_pflock.h
> b/lib/librte_eal/include/generic/rte_pflock.h
> new file mode 100644
> index 000000000000..6808c70c34a2
> --- /dev/null
> +++ b/lib/librte_eal/include/generic/rte_pflock.h
> @@ -0,0 +1,273 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corp.
> + * Copyright 2011-2015 Samy Al Bahra.
Any reason for adding the above copyright?

> + * All rights reserved.
> + */
> +
> +#ifndef _RTE_PFLOCK_H_
> +#define _RTE_PFLOCK_H_
> +
> +/**
> + * @file
> + *
> + * Phase-fair locks
> + *
> + * This file defines an API for Phase Fair reader writer locks,
> + * which is a variant of typical reader-writer locks that prevent
> + * starvation. In this type of lock, readers and writers alternate.
> + * This significantly reduces the worst-case blocking for readers and writers.
> + *
> + * This is an implementation derived from FreeBSD
> + * based on the work described in:
> + *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
> + *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
> + *
> + * All locks must be initialised before use, and only initialised once.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_pause.h>
> +
> +/**
> + * The rte_pflock_t type.
> + */
> +struct rte_pflock {
> +	union rte_pflock_ticket {
> +		uint32_t tickets;
> +		struct {
> +			uint16_t in;
> +			uint16_t out;
> +		};
> +	} rd, wr;
Just wondering if placing these on 2 different cache lines would help the performance?

> +};
> +typedef struct rte_pflock rte_pflock_t;
> +
> +/**
> + * Constants used to map the bits in reader counter
> + *
> + * +-----------------+-+-+
> + * |     Readers     |W|P|
> + * |                 |R|H|
> + * +-----------------+-+-+
It would be good to indicate the reserved part.

> + */
> +
> +#define RTE_PFLOCK_LSB   0xFFF0
Based on the value of RTE_PFLOCK_RINC, should this be 0xFF00? 

> +#define RTE_PFLOCK_RINC  0x100		/* Reader increment value. */
Does this mean there can be only 256 concurrent readers?

> +#define RTE_PFLOCK_WBITS 0x3		/* Writer bits in reader. */
> +#define RTE_PFLOCK_PRES  0x2		/* Writer present bit. */
> +#define RTE_PFLOCK_PHID  0x1		/* Phase ID bit. */
> +
> +/**
> + * A static pflock initializer.
> + */
> +#define RTE_PFLOCK_INITIALIZER {  }
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Initialize the pflock to an unlocked state.
> + *
> + * @param pf
> + *   A pointer to the pflock.
> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_init(struct rte_pflock *pf)
> +{
> +	pf->rd.tickets = 0;
> +	pf->wr.tickets = 0;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Take a pflock for read.
> + *
> + * @param pf
> + *   A pointer to a pflock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_read_lock(rte_pflock_t *pf)
> +{
> +	uint32_t w;
> +
> +	/*
> +	 * If no writer is present, then the operation has completed
> +	 * successfully.
> +	 */
> +	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC,
> +			       __ATOMIC_ACQ_REL) & RTE_PFLOCK_WBITS;
Any reason for the RELEASE? I think ACQUIRE is enough as the write to rd.in is not releasing any previous memory operations.


> +	if (w == 0)
> +		return;
> +
> +	/* Wait for current write phase to complete. */
> +	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE) &
> +		RTE_PFLOCK_WBITS) == w)
> +		rte_pause();
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a pflock locked for reading.
> + *
> + * @param pf
> + *   A pointer to the pflock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_read_unlock(rte_pflock_t *pf)
> +{
> +	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC, __ATOMIC_RELEASE);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Try to take a pflock for reading
> + *
> + * @param pf
> + *   A pointer to a pflock structure.
> + * @return
> + *   - zero if the lock is successfully taken
> + *   - -EBUSY if lock could not be acquired for reading because a
> + *     writer holds the lock
> + */
> +__rte_experimental
> +static inline int
> +rte_pflock_read_trylock(rte_pflock_t *pf)
> +{
> +	union rte_pflock_ticket old, new;
> +
> +	/* Get current state of the lock */
> +	old.tickets = __atomic_load_n(&pf->rd.tickets, __ATOMIC_RELAXED);
> +
> +	/* loop until writer shows up */
> +	while ((old.in & RTE_PFLOCK_WBITS) == 0) {
> +		new.out = old.out;
> +		new.in = old.in + RTE_PFLOCK_RINC;
> +		if (__atomic_compare_exchange_n(&pf->rd.tickets,
> +						&old.tickets, new.tickets,
> +						0, __ATOMIC_ACQ_REL,
                                                   ^^^ I think ACQUIRE is enough. We are not releasing anything to other threads.

> +						__ATOMIC_RELAXED))
> +			return 0;	/* got it */
> +
> +		/* either new reader got in (so retry) or a writer */
> +	}
> +
> +	/* If writer is present then we are busy */
> +	return -EBUSY;
> +}
> +
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Take the pflock for write.
> + *
> + * @param pf
> + *   A pointer to the ticketlock.
> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_write_lock(rte_pflock_t *pf)
> +{
> +	uint16_t ticket;
> +
> +	/* Acquire ownership of write-phase. */
> +	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_ACQUIRE);
> +	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_RELAXED);
> +
> +	/*
> +	 * Acquire ticket on read-side in order to allow them
> +	 * to flush. Indicates to any incoming reader that a
> +	 * write-phase is pending.
> +	 *
> +	 * Need ACQUIRE to prevent speculative execution of the wait loop
I do not think the entire wait loop will be executed speculatively. Only the load of pf->rd.out would happen speculatively. There is a dependency on 'ticket' variable. So, the load of the 'ticket' variable should happen after 'ticket' is updated below.
 
> +	 */
> +	ticket = __atomic_fetch_add(&pf->rd.in,
> +				    (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
> +				    __ATOMIC_ACQUIRE);
Since it is OK to execute part of the wait loop above this, we could make this RELAXED.
Also, since we just need to set the 2 bits, is it better to use __atomic_fetch_or? It also matches the use of __atomic_fetch_and in the unlock API.

> +
> +	/* Wait for any pending readers to flush. */
> +	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
RELAXED here will allow the critical section to execute above the wait loop. Hence it is better to make this ACQUIRE.

> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a pflock held for writing.
> + *
> + * @param pf
> + *   A pointer to a pflock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_write_unlock(rte_pflock_t *pf)
> +{
> +	/* Migrate from write phase to read phase. */
> +	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELEASE);
> +
> +	/* Allow other writers to continue. */
> +	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Try to take the pflock for write.
> + *
> + * @param pf
> + *   A pointer to the pflock.
> + * @return
> + *   - zero if the lock is successfully taken
> + *   - -EBUSY if lock could not be acquired for writing because
> + *     another writer holds the lock
What about the readers holding the lock?

> + */
> +__rte_experimental
> +static inline int
> +rte_pflock_write_trylock(rte_pflock_t *pf)
> +{
> +	union rte_pflock_ticket old, new;
> +	uint16_t ticket;
> +
> +	/* Get current state of the lock */
> +	old.tickets = __atomic_load_n(&pf->wr.tickets, __ATOMIC_RELAXED);
> +	new.out = old.out;
> +	new.in  = old.in + 1;
> +	ticket = new.in;
> +
> +	/* if writer is already present then too busy */
> +	if (old.out != new.in ||
> +	    !__atomic_compare_exchange_n(&pf->wr.tickets, &old.tickets,
> +					 new.tickets,
> +					 0, __ATOMIC_ACQ_REL,
> +					 __ATOMIC_RELAXED))
> +		return -EBUSY; /* another writer is present already */
> +
> +	/*
> +	 * We now own the write phase, but still need to tell
> +	 * readers and wait for them.
The write lock is taken only if there are no readers AND no writers (unlike the read lock, which only requires that no writer is waiting).
Since this is a trylock, should we wait for the readers to give up the lock?
I think that if readers are present, we should give up the writer phase and return.

> +	 *
> +	 * Need ACQUIRE semantics to avoid speculative execution of wait loop
> +	 */
> +	ticket  = __atomic_fetch_add(&pf->rd.in,
> +				 (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
> +				 __ATOMIC_ACQUIRE);
> +
> +	/* Wait for any pending readers to flush. */
> +	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
> +	return 0;
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* RTE_PFLOCK_H */
> diff --git a/lib/librte_eal/ppc/include/meson.build
> b/lib/librte_eal/ppc/include/meson.build
> index dae40ede546e..7692a531ccba 100644
> --- a/lib/librte_eal/ppc/include/meson.build
> +++ b/lib/librte_eal/ppc/include/meson.build
> @@ -11,6 +11,7 @@ arch_headers = files(
>  	'rte_mcslock.h',
>  	'rte_memcpy.h',
>  	'rte_pause.h',
> +	'rte_pflock.h',
>  	'rte_power_intrinsics.h',
>  	'rte_prefetch.h',
>  	'rte_rwlock.h',
> diff --git a/lib/librte_eal/ppc/include/rte_pflock.h
> b/lib/librte_eal/ppc/include/rte_pflock.h
> new file mode 100644
> index 000000000000..e7b875ac56a8
> --- /dev/null
> +++ b/lib/librte_eal/ppc/include/rte_pflock.h
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: BSD-3-Clause  */
Copyright header missing?

> +#ifndef _RTE_PFLOCK_PPC_64_H_
> +#define _RTE_PFLOCK_PPC_64_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_pflock.h"
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PFLOCK_PPC_64_H_ */
> diff --git a/lib/librte_eal/x86/include/meson.build
> b/lib/librte_eal/x86/include/meson.build
> index 1a6ad0b17342..f43645c20899 100644
> --- a/lib/librte_eal/x86/include/meson.build
> +++ b/lib/librte_eal/x86/include/meson.build
> @@ -10,6 +10,7 @@ arch_headers = files(
>  	'rte_mcslock.h',
>  	'rte_memcpy.h',
>  	'rte_pause.h',
> +	'rte_pflock.h',
>  	'rte_power_intrinsics.h',
>  	'rte_prefetch.h',
>  	'rte_rtm.h',
> diff --git a/lib/librte_eal/x86/include/rte_pflock.h
> b/lib/librte_eal/x86/include/rte_pflock.h
> new file mode 100644
> index 000000000000..c2d876062c08
> --- /dev/null
> +++ b/lib/librte_eal/x86/include/rte_pflock.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation
> + */
> +
> +#ifndef _RTE_PFLOCK_X86_64_H_
> +#define _RTE_PFLOCK_X86_64_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_pflock.h"
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PFLOCK_X86_64_H_ */
> --
> 2.30.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v3] pflock: implementation of phase-fair reader writer locks
  2021-03-29  3:14       ` Honnappa Nagarahalli
@ 2021-03-29 17:22         ` Stephen Hemminger
  2021-03-29 18:09           ` Honnappa Nagarahalli
  2021-03-29 19:58         ` Stephen Hemminger
  1 sibling, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-29 17:22 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, Stephen Hemminger, nd

On Mon, 29 Mar 2021 03:14:29 +0000
Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:

> <snip>
> 
> > Subject: [PATCH v3] pflock: implementation of phase-fair reader writer locks
> > 
> > This is a new type of reader-writer lock that provides better fairness
> > guarantees which makes it better for typical DPDK applications.
> > They lock internally uses two ticket pools, one for readers and one for  
>     ^^^^ The
> 
> > writers.
> > 
> > Phase fair reader writer locks ensure that neither reader or writer will be
> > starved. Neither reader or writer are preferred, they execute in alternating
> > phases. All operations of the same time (reader or writer) that try to acquire  
>                                                                   ^^^^ type
> 
> > the lock are handled in FIFO order.  Write operations are exclusive, and
> > multiple read operations can be run together (until a write arrives).
> > 
> > A similar implementation is in Concurrency Kit package in FreeBSD.
> > For more information see:
> >    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
> >     Real-Time Systems",
> >     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> > 
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

Any more comments? Other than the typos in the commit log...

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v3] pflock: implementation of phase-fair reader writer locks
  2021-03-29 17:22         ` Stephen Hemminger
@ 2021-03-29 18:09           ` Honnappa Nagarahalli
  0 siblings, 0 replies; 27+ messages in thread
From: Honnappa Nagarahalli @ 2021-03-29 18:09 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Stephen Hemminger, nd, Honnappa Nagarahalli, nd

<snip>

> >
> > > Subject: [PATCH v3] pflock: implementation of phase-fair reader
> > > writer locks
> > >
> > > This is a new type of reader-writer lock that provides better
> > > fairness guarantees which makes it better for typical DPDK applications.
> > > They lock internally uses two ticket pools, one for readers and one
> > > for
> >     ^^^^ The
> >
> > > writers.
> > >
> > > Phase fair reader writer locks ensure that neither reader or writer
> > > will be starved. Neither reader or writer are preferred, they
> > > execute in alternating phases. All operations of the same time
> > > (reader or writer) that try to acquire
> >                                                                   ^^^^
> > type
> >
> > > the lock are handled in FIFO order.  Write operations are exclusive,
> > > and multiple read operations can be run together (until a write arrives).
> > >
> > > A similar implementation is in Concurrency Kit package in FreeBSD.
> > > For more information see:
> > >    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
> > >     Real-Time Systems",
> > >     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> > >
> > > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> 
> Any more comments? Other than the typos in the commit log...
There are additional comments in the code (in librte_eal/include/generic/rte_pflock.h); you need to scroll down, as the test code is at the top. I have not taken a deeper look at the test code yet.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v3] pflock: implementation of phase-fair reader writer locks
  2021-03-29  3:14       ` Honnappa Nagarahalli
  2021-03-29 17:22         ` Stephen Hemminger
@ 2021-03-29 19:58         ` Stephen Hemminger
  2021-03-30  0:18           ` Honnappa Nagarahalli
  1 sibling, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-29 19:58 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, Stephen Hemminger, nd

Meta question: is implementing trylock worth it?
The original did not have it.

There are tradeoffs, though, between the number of readers supported and
the added complexity in the code.

> > diff --git a/lib/librte_eal/include/generic/rte_pflock.h
> > b/lib/librte_eal/include/generic/rte_pflock.h
> > new file mode 100644
> > index 000000000000..6808c70c34a2
> > --- /dev/null
> > +++ b/lib/librte_eal/include/generic/rte_pflock.h
> > @@ -0,0 +1,273 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2021 Microsoft Corp.
> > + * Copyright 2011-2015 Samy Al Bahra.  
> Any reason for adding the above copyright?

Code originally came from Concurrency Kit, so wanted to keep
attribution to original author
 
> > + * The rte_pflock_t type.
> > + */
> > +struct rte_pflock {
> > +	union rte_pflock_ticket {
> > +		uint32_t tickets;
> > +		struct {
> > +			uint16_t in;
> > +			uint16_t out;
> > +		};
> > +	} rd, wr;  
> Just wondering if placing these on 2 different cache lines would help the performance?

That won't work because the implementation of trylock requires compare/exchange of
the whole structure as an atomic operation.
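
To be concrete, the read-side trylock compare-exchanges the whole 32-bit
rd union in one shot, so 'in' and 'out' have to stay in the same atomic
word. Rough sketch of what the trylock does (simplified, no retry loop,
memory orders as per the discussion above):

	union rte_pflock_ticket old, new;

	old.tickets = __atomic_load_n(&pf->rd.tickets, __ATOMIC_RELAXED);
	new.out = old.out;
	new.in  = old.in + RTE_PFLOCK_RINC;
	/* one 32-bit CAS updates 'in' while checking that 'out' and the
	 * writer bits did not change underneath us
	 */
	__atomic_compare_exchange_n(&pf->rd.tickets, &old.tickets,
				    new.tickets, 0,
				    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);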

> 
> > +};
> > +typedef struct rte_pflock rte_pflock_t;
> > +
> > +/**
> > + * Constants used to map the bits in reader counter
> > + *
> > + * +-----------------+-+-+
> > + * |     Readers     |W|P|
> > + * |                 |R|H|
> > + * +-----------------+-+-+  
> It would be good to indicate the reserved part.

Ok

> 
> > + */
> > +
> > +#define RTE_PFLOCK_LSB   0xFFF0  
> Based on the value of RTE_PFLOCK_RINC, should this be 0xFF00? 

The unused bits never get set so it doesn't matter

> 
> > +#define RTE_PFLOCK_RINC  0x100		/* Reader increment value.  
> Does this mean, there can be only 256 concurrent readers?

Yes, there is a tradeoff.  If you assume that the largest atomic operation
is 64 bits, and you want to support trylock then 256 readers is the limit.

The original code has 32 bit counters but no trylock.


> > +__rte_experimental
> > +static inline void
> > +rte_pflock_read_lock(rte_pflock_t *pf)
> > +{
> > +	uint32_t w;
> > +
> > +	/*
> > +	 * If no writer is present, then the operation has completed
> > +	 * successfully.
> > +	 */
> > +	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC,
> > __ATOMIC_ACQ_REL) & RTE_PFLOCK_WBITS;  
> Any reason for the RELEASE? I think ACQUIRE is enough as the write to rd.in is not releasing any previous memory operations.

That makes sense, will fix.
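
i.e. changing the read lock to something like:

	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC, __ATOMIC_ACQUIRE)
		& RTE_PFLOCK_WBITS;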

> > +__rte_experimental
> > +static inline int
> > +rte_pflock_read_trylock(rte_pflock_t *pf) {
> > +	union rte_pflock_ticket old, new;
> > +
> > +	/* Get current state of the lock */
> > +	old.tickets = __atomic_load_n(&pf->rd.tickets,
> > __ATOMIC_RELAXED);
> > +
> > +	/* loop until writer shows up */
> > +	while ((old.in & RTE_PFLOCK_WBITS) == 0) {
> > +		new.out = old.out;
> > +		new.in = old.in + RTE_PFLOCK_RINC;
> > +		if (__atomic_compare_exchange_n(&pf->rd.tickets,
> > &old.tickets, new.tickets,
> > +						0, __ATOMIC_ACQ_REL,  
>               ^^^ I think ACQUIRE is enough. We are not releasing anything to other threads.

Fixed.

> 
> > __ATOMIC_RELAXED))
> > +			return 0;	/* got it */
> > +
> > +		/* either new reader got in (so retry) or a writer */
> > +	}
> > +

> > +__rte_experimental
> > +static inline void
> > +rte_pflock_write_lock(rte_pflock_t *pf) {
> > +	uint16_t ticket;
> > +
> > +	/* Acquire ownership of write-phase. */
> > +	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_ACQUIRE);
> > +	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_RELAXED);
> > +
> > +	/*
> > +	 * Acquire ticket on read-side in order to allow them
> > +	 * to flush. Indicates to any incoming reader that a
> > +	 * write-phase is pending.
> > +	 *
> > +	 * Need ACQUIRE to prevent speculative execution of the wait loop  
> I do not think the entire wait loop will be executed speculatively. Only the load of pf->rd.out would happen speculatively. There is a dependency on 'ticket' variable. So, the load of the 'ticket' variable should happen after 'ticket' is updated below.
>  
> > +	 */
> > +	ticket = __atomic_fetch_add(&pf->rd.in,
> > +				    (ticket & RTE_PFLOCK_PHID) |
> > RTE_PFLOCK_PRES,
> > +				    __ATOMIC_ACQUIRE);  
> Since, it is ok to execute part of the wait loop above this. We could make this RELAXED.
> Also, since we just need to set the 2 bits, is it better to use __atomic_fetch_or? It also matches with the use of __atomic_fetch_and in the unlock API.


	ticket = __atomic_fetch_or(&pf->rd.in, 
				    (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
				    __ATOMIC_RELAXED);

> > +
> > +	/* Wait for any pending readers to flush. */
> > +	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);  
> RELAXED here will allow the critical section to execute above the wait loop. Hence it is better to make this ACQUIRE.

	Would it be better to add a fence instead?
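
	For comparison, the two options would look roughly like this
	(untested sketch):

		/* option 1: acquire semantics on the wait itself */
		rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_ACQUIRE);

		/* option 2: relaxed wait followed by an explicit fence */
		rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
		__atomic_thread_fence(__ATOMIC_ACQUIRE);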
> 
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Try to take the pflock for write.
> > + *
> > + * @param pf
> > + *   A pointer to the pflock.
> > + * @return
> > + *   - zero if the lock is successfully taken
> > + *   - -EBUSY if lock could not be acquired for writing because
> > + *     another writer holds the lock  
> What about the readers holding the lock?

Originally, I had it return -EBUSY, but then all the write trylocks would
fail.  Trylock doesn't seem to play well with the phase-fair nature.

Writing is a two-part operation in this model; if the 1st part succeeds
(which changes the phase), then there is no way to back out/undo the
ticket.



> 
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_pflock_write_trylock(rte_pflock_t *pf) {
> > +	union rte_pflock_ticket old, new;
> > +	uint16_t ticket;
> > +
> > +	/* Get current state of the lock */
> > +	old.tickets = __atomic_load_n(&pf->wr.tickets,
> > __ATOMIC_RELAXED);
> > +	new.out = old.out;
> > +	new.in  = old.in + 1;
> > +	ticket = new.in;
> > +
> > +	/* if writer is already present then too busy */
> > +	if (old.out != new.in ||
> > +	    !__atomic_compare_exchange_n(&pf->wr.tickets, &old.tickets,
> > new.tickets,
> > +					 0, __ATOMIC_ACQ_REL,
> > __ATOMIC_RELAXED))
> > +		return -EBUSY; /* another writer is present already */
> > +
> > +	/*
> > +	 * We now own the write phase, but still need to tell
> > +	 * readers and wait for them.  
> The write lock is taken if there are no readers AND no writers (unlike the read lock which is taken if there are no writers waiting (only))
> Since this is a try lock, should we wait for the readers to give up the lock?
> I think, if the readers are present, we should give up the writer phase and return. 
> 
> > +	 *
> > +	 * Need ACQUIRE semantics to avoid speculative execution of wait
> > loop
> > +	 */
> > +	ticket  = __atomic_fetch_add(&pf->rd.in,
> > +				 (ticket & RTE_PFLOCK_PHID) |
> > RTE_PFLOCK_PRES,
> > +				 __ATOMIC_ACQUIRE);
> > +
> > +	/* Wait for any pending readers to flush. */
> > +	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
> > +	return 0;
> > +}
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* RTE_PFLOCK_H */
> > diff --git a/lib/librte_eal/ppc/include/meson.build
> > b/lib/librte_eal/ppc/include/meson.build
> > index dae40ede546e..7692a531ccba 100644
> > --- a/lib/librte_eal/ppc/include/meson.build
> > +++ b/lib/librte_eal/ppc/include/meson.build
> > @@ -11,6 +11,7 @@ arch_headers = files(
> >  	'rte_mcslock.h',
> >  	'rte_memcpy.h',
> >  	'rte_pause.h',
> > +	'rte_pflock.h',
> >  	'rte_power_intrinsics.h',
> >  	'rte_prefetch.h',
> >  	'rte_rwlock.h',
> > diff --git a/lib/librte_eal/ppc/include/rte_pflock.h
> > b/lib/librte_eal/ppc/include/rte_pflock.h
> > new file mode 100644
> > index 000000000000..e7b875ac56a8
> > --- /dev/null
> > +++ b/lib/librte_eal/ppc/include/rte_pflock.h
> > @@ -0,0 +1,16 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause  */ #ifndef  
> Copyright header missing?
> 
> > +_RTE_PFLOCK_PPC_64_H_ #define _RTE_PFLOCK_PPC_64_H_
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#include "generic/rte_pflock.h"
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* _RTE_PFLOCK_PPC_64_H_ */
> > diff --git a/lib/librte_eal/x86/include/meson.build
> > b/lib/librte_eal/x86/include/meson.build
> > index 1a6ad0b17342..f43645c20899 100644
> > --- a/lib/librte_eal/x86/include/meson.build
> > +++ b/lib/librte_eal/x86/include/meson.build
> > @@ -10,6 +10,7 @@ arch_headers = files(
> >  	'rte_mcslock.h',
> >  	'rte_memcpy.h',
> >  	'rte_pause.h',
> > +	'rte_pflock.h',
> >  	'rte_power_intrinsics.h',
> >  	'rte_prefetch.h',
> >  	'rte_rtm.h',
> > diff --git a/lib/librte_eal/x86/include/rte_pflock.h
> > b/lib/librte_eal/x86/include/rte_pflock.h
> > new file mode 100644
> > index 000000000000..c2d876062c08
> > --- /dev/null
> > +++ b/lib/librte_eal/x86/include/rte_pflock.h
> > @@ -0,0 +1,18 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2021 Microsoft Corporation  */
> > +
> > +#ifndef _RTE_PFLOCK_X86_64_H_
> > +#define _RTE_PFLOCK_X86_64_H_
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#include "generic/rte_pflock.h"
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* _RTE_PFLOCK_X86_64_H_ */
> > --
> > 2.30.1  
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v3] pflock: implementation of phase-fair reader writer locks
  2021-03-29 19:58         ` Stephen Hemminger
@ 2021-03-30  0:18           ` Honnappa Nagarahalli
  2021-03-30  4:56             ` Stephen Hemminger
  0 siblings, 1 reply; 27+ messages in thread
From: Honnappa Nagarahalli @ 2021-03-30  0:18 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Stephen Hemminger, nd, Honnappa Nagarahalli, nd

<snip>

> Subject: Re: [PATCH v3] pflock: implementation of phase-fair reader writer locks
> 
> Meta question: is implementing trylock worth it?
> The original did not have it.
If there is no use for it currently, I suggest not adding it. If someone sees a need, they can always add it later. I am OK if you add it as well.

> 
> There are tradeoffs, though, between the number of readers supported and the added complexity in the code.
If we increase the size of 'in' and 'out' to 32b and 'tickets' to 64b, that should increase the number of readers. Do you see any other complexity?
This would mean a 64b compare-and-swap in trylock, which should be fine I think.
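Roughly (just a sketch of the idea, not tested):

	union rte_pflock_ticket {
		uint64_t tickets;
		struct {
			uint32_t in;
			uint32_t out;
		};
	};

	/* trylock then becomes a 64-bit compare-exchange */
	__atomic_compare_exchange_n(&pf->rd.tickets, &old.tickets,
				    new.tickets, 0,
				    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);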

> 
> > > diff --git a/lib/librte_eal/include/generic/rte_pflock.h
> > > b/lib/librte_eal/include/generic/rte_pflock.h
> > > new file mode 100644
> > > index 000000000000..6808c70c34a2
> > > --- /dev/null
> > > +++ b/lib/librte_eal/include/generic/rte_pflock.h
> > > @@ -0,0 +1,273 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > + * Copyright(c) 2021 Microsoft Corp.
> > > + * Copyright 2011-2015 Samy Al Bahra.
> > Any reason for adding the above copyright?
> 
> Code originally came from Concurrency Kit, so wanted to keep attribution to
> original author
Ack

> 
> > > + * The rte_pflock_t type.
> > > + */
> > > +struct rte_pflock {
> > > +	union rte_pflock_ticket {
> > > +		uint32_t tickets;
> > > +		struct {
> > > +			uint16_t in;
> > > +			uint16_t out;
> > > +		};
> > > +	} rd, wr;
> > Just wondering if placing these on 2 different cache lines would help the
> performance?
> 
> That won't work because the implementation of trylock requires
> compare/exchange of the whole structure as an atomic operation.
I meant placing 'rd' and 'wr' on separate cache lines. It might help in the reader-writer contention case.

> 
> >
> > > +};
> > > +typedef struct rte_pflock rte_pflock_t;
> > > +
> > > +/**
> > > + * Constants used to map the bits in reader counter
> > > + *
> > > + * +-----------------+-+-+
> > > + * |     Readers     |W|P|
> > > + * |                 |R|H|
> > > + * +-----------------+-+-+
> > It would be good to indicate the reserved part.
> 
> Ok
> 
> >
> > > + */
> > > +
> > > +#define RTE_PFLOCK_LSB   0xFFF0
> > Based on the value of RTE_PFLOCK_RINC, should this be 0xFF00?
> 
> The unused bits never get set so it doesn't matter
Agree, it just creates confusion while reading the code.
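Just a thought: defining it in terms of the reader increment would keep the two constants in sync, e.g.

	#define RTE_PFLOCK_LSB   (~(RTE_PFLOCK_RINC - 1) & 0xFFFF) /* 0xFF00 with RINC == 0x100 */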

> 
> >
> > > +#define RTE_PFLOCK_RINC  0x100		/* Reader increment value.
> > Does this mean, there can be only 256 concurrent readers?
> 
> Yes, there is a tradeoff.  If you assume that the largest atomic operation is 64
> bits, and you want to support trylock then 256 readers is the limit.
>
Maybe I am missing something; I see that you are using 32b atomic operations. 'union rte_pflock_ticket' is 32b.

> The original code has 32 bit counters but no trylock.
> 
> 
> > > +__rte_experimental
> > > +static inline void
> > > +rte_pflock_read_lock(rte_pflock_t *pf) {
> > > +	uint32_t w;
> > > +
> > > +	/*
> > > +	 * If no writer is present, then the operation has completed
> > > +	 * successfully.
> > > +	 */
> > > +	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC,
> > > __ATOMIC_ACQ_REL) & RTE_PFLOCK_WBITS;
> > Any reason for the RELEASE? I think ACQUIRE is enough as the write to rd.in is
> not releasing any previous memory operations.
> 
> That make sense, will fix
> 
> > > +__rte_experimental
> > > +static inline int
> > > +rte_pflock_read_trylock(rte_pflock_t *pf) {
> > > +	union rte_pflock_ticket old, new;
> > > +
> > > +	/* Get current state of the lock */
> > > +	old.tickets = __atomic_load_n(&pf->rd.tickets,
> > > __ATOMIC_RELAXED);
> > > +
> > > +	/* loop until writer shows up */
> > > +	while ((old.in & RTE_PFLOCK_WBITS) == 0) {
> > > +		new.out = old.out;
> > > +		new.in = old.in + RTE_PFLOCK_RINC;
> > > +		if (__atomic_compare_exchange_n(&pf->rd.tickets,
> > > &old.tickets, new.tickets,
> > > +						0, __ATOMIC_ACQ_REL,
> >
>               ^^^ I think ACQUIRE is enough. We are not releasing anything to other
> threads.
> 
> Fixed.
> 
> >
> > > __ATOMIC_RELAXED))
> > > +			return 0;	/* got it */
> > > +
> > > +		/* either new reader got in (so retry) or a writer */
> > > +	}
> > > +
> 
> > > +__rte_experimental
> > > +static inline void
> > > +rte_pflock_write_lock(rte_pflock_t *pf) {
> > > +	uint16_t ticket;
> > > +
> > > +	/* Acquire ownership of write-phase. */
> > > +	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_ACQUIRE);
> > > +	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_RELAXED);
> > > +
> > > +	/*
> > > +	 * Acquire ticket on read-side in order to allow them
> > > +	 * to flush. Indicates to any incoming reader that a
> > > +	 * write-phase is pending.
> > > +	 *
> > > +	 * Need ACQUIRE to prevent speculative execution of the wait loop
> > I do not think the entire wait loop will be executed speculatively. Only the load
> of pf->rd.out would happen speculatively. There is a dependency on 'ticket'
> variable. So, the load of the 'ticket' variable should happen after 'ticket' is
> updated below.
> >
> > > +	 */
> > > +	ticket = __atomic_fetch_add(&pf->rd.in,
> > > +				    (ticket & RTE_PFLOCK_PHID) |
> > > RTE_PFLOCK_PRES,
> > > +				    __ATOMIC_ACQUIRE);
> > Since, it is ok to execute part of the wait loop above this. We could make this
> RELAXED.
> > Also, since we just need to set the 2 bits, is it better to use __atomic_fetch_or?
> It also matches with the use of __atomic_fetch_and in the unlock API.
> 
> 
> 	ticket = __atomic_fetch_or(&pf->rd.in,
> 				    (ticket & RTE_PFLOCK_PHID) |
> RTE_PFLOCK_PRES,
> 				    __ATOMIC_RELAXED
Ack

> 
> > > +
> > > +	/* Wait for any pending readers to flush. */
> > > +	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
> > RELAXED here will allow the critical section to execute above the wait loop.
> Hence it is better to make this ACQUIRE.
> 
> 	Would it be better to add a fence instead?
An acquire fence here will place more restrictions on what the micro-architecture can optimize. It does not allow earlier reads to cross the fence (a load-acquire would allow that).
Good to run performance tests to confirm.

> >
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Try to take the pflock for write.
> > > + *
> > > + * @param pf
> > > + *   A pointer to the pflock.
> > > + * @return
> > > + *   - zero if the lock is successfully taken
> > > + *   - -EBUSY if lock could not be acquired for writing because
> > > + *     another writer holds the lock
> > What about the readers holding the lock?
> 
> Originally, I had it return -EBUSY, but then all the write trylock would fail.
OK. It is also not clear what the trylock should do. Without clear requirements we will bind ourselves to an implementation which might not be suitable in the future. Maybe it is better to skip it.

> Trylock doesn't seem to play well with phase fair nature.
> 
> Writing is a two part operation in this model, if the 1st part succeeds (which
> changes the phase), then there is no way to backout/undo the ticket.
The undo operation is similar to 'rte_pflock_write_unlock'? Reset the phase bits and increment the ticket.
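Something like this at the end of write_trylock, perhaps (untested sketch; whether it is fully race free needs more thought):

	/* readers still pending: give up the write phase again */
	if (__atomic_load_n(&pf->rd.out, __ATOMIC_RELAXED) != ticket) {
		/* clear the writer present/phase bits we just set */
		__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELAXED);
		/* and pass the write ticket on, as in write_unlock */
		__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
		return -EBUSY;
	}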

> 
> 
> 
> >
> > > + */
> > > +__rte_experimental
> > > +static inline int
> > > +rte_pflock_write_trylock(rte_pflock_t *pf) {
> > > +	union rte_pflock_ticket old, new;
> > > +	uint16_t ticket;
> > > +
> > > +	/* Get current state of the lock */
> > > +	old.tickets = __atomic_load_n(&pf->wr.tickets,
> > > __ATOMIC_RELAXED);
> > > +	new.out = old.out;
> > > +	new.in  = old.in + 1;
> > > +	ticket = new.in;
> > > +
> > > +	/* if writer is already present then too busy */
> > > +	if (old.out != new.in ||
> > > +	    !__atomic_compare_exchange_n(&pf->wr.tickets, &old.tickets,
> > > new.tickets,
> > > +					 0, __ATOMIC_ACQ_REL,
> > > __ATOMIC_RELAXED))
> > > +		return -EBUSY; /* another writer is present already */
> > > +
> > > +	/*
> > > +	 * We now own the write phase, but still need to tell
> > > +	 * readers and wait for them.
> > The write lock is taken if there are no readers AND no writers (unlike the read
> lock which is taken if there are no writers waiting (only))
> > Since this is a try lock, should we wait for the readers to give up the lock?
> > I think, if the readers are present, we should give up the writer phase and
> return.
> >
> > > +	 *
> > > +	 * Need ACQUIRE semantics to avoid speculative execution of wait
> > > loop
> > > +	 */
> > > +	ticket  = __atomic_fetch_add(&pf->rd.in,
> > > +				 (ticket & RTE_PFLOCK_PHID) |
> > > RTE_PFLOCK_PRES,
> > > +				 __ATOMIC_ACQUIRE);
> > > +
> > > +	/* Wait for any pending readers to flush. */
> > > +	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
> > > +	return 0;
> > > +}
> > > +
> > > +#ifdef __cplusplus
> > > +}
> > > +#endif
> > > +
> > > +#endif /* RTE_PFLOCK_H */
> > > diff --git a/lib/librte_eal/ppc/include/meson.build
> > > b/lib/librte_eal/ppc/include/meson.build
> > > index dae40ede546e..7692a531ccba 100644
> > > --- a/lib/librte_eal/ppc/include/meson.build
> > > +++ b/lib/librte_eal/ppc/include/meson.build
> > > @@ -11,6 +11,7 @@ arch_headers = files(
> > >  	'rte_mcslock.h',
> > >  	'rte_memcpy.h',
> > >  	'rte_pause.h',
> > > +	'rte_pflock.h',
> > >  	'rte_power_intrinsics.h',
> > >  	'rte_prefetch.h',
> > >  	'rte_rwlock.h',
> > > diff --git a/lib/librte_eal/ppc/include/rte_pflock.h
> > > b/lib/librte_eal/ppc/include/rte_pflock.h
> > > new file mode 100644
> > > index 000000000000..e7b875ac56a8
> > > --- /dev/null
> > > +++ b/lib/librte_eal/ppc/include/rte_pflock.h
> > > @@ -0,0 +1,16 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause  */ #ifndef
> > Copyright header missing?
> >
> > > +_RTE_PFLOCK_PPC_64_H_ #define _RTE_PFLOCK_PPC_64_H_
> > > +
> > > +#ifdef __cplusplus
> > > +extern "C" {
> > > +#endif
> > > +
> > > +#include "generic/rte_pflock.h"
> > > +
> > > +#ifdef __cplusplus
> > > +}
> > > +#endif
> > > +
> > > +#endif /* _RTE_PFLOCK_PPC_64_H_ */
> > > diff --git a/lib/librte_eal/x86/include/meson.build
> > > b/lib/librte_eal/x86/include/meson.build
> > > index 1a6ad0b17342..f43645c20899 100644
> > > --- a/lib/librte_eal/x86/include/meson.build
> > > +++ b/lib/librte_eal/x86/include/meson.build
> > > @@ -10,6 +10,7 @@ arch_headers = files(
> > >  	'rte_mcslock.h',
> > >  	'rte_memcpy.h',
> > >  	'rte_pause.h',
> > > +	'rte_pflock.h',
> > >  	'rte_power_intrinsics.h',
> > >  	'rte_prefetch.h',
> > >  	'rte_rtm.h',
> > > diff --git a/lib/librte_eal/x86/include/rte_pflock.h
> > > b/lib/librte_eal/x86/include/rte_pflock.h
> > > new file mode 100644
> > > index 000000000000..c2d876062c08
> > > --- /dev/null
<snip>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v3] pflock: implementation of phase-fair reader writer locks
  2021-03-30  0:18           ` Honnappa Nagarahalli
@ 2021-03-30  4:56             ` Stephen Hemminger
  0 siblings, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-30  4:56 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, Stephen Hemminger, nd

On Tue, 30 Mar 2021 00:18:40 +0000
Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:

> > That won't work because the implementation of trylock requires
> > compare/exchange of the whole structure as an atomic operation.  
> I meant, placing 'rd' and 'wr' on separate cache lines. It might help in the reader-writer contention case.

Except that the ARM default cache line size is 128 bytes, which makes these locks rather large.
The reader is the hot case, and it doesn't look at the writer part.
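
For reference, a split layout would be something like this (sketch only),
which grows the lock from 8 bytes to two cache lines:

	struct rte_pflock {
		struct {
			uint16_t in;
			uint16_t out;
		} rd __rte_cache_aligned;
		struct {
			uint16_t in;
			uint16_t out;
		} wr __rte_cache_aligned;
	};

	/* sizeof(struct rte_pflock) == 2 * RTE_CACHE_LINE_SIZE,
	 * i.e. 256 bytes with 128-byte cache lines.
	 */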

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [dpdk-dev] [PATCH v4] pflock: add phase-fair reader writer locks
  2021-02-12  1:38   ` [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks Stephen Hemminger
                       ` (2 preceding siblings ...)
  2021-03-03 19:19     ` [dpdk-dev] [PATCH v3] " Stephen Hemminger
@ 2021-03-30  5:00     ` Stephen Hemminger
  2021-03-30  5:14       ` Stephen Hemminger
  2021-03-31  4:19       ` Honnappa Nagarahalli
  2021-04-02  1:42     ` [dpdk-dev] [PATCH v5] pflock: implementation of " Stephen Hemminger
  4 siblings, 2 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-30  5:00 UTC (permalink / raw)
  To: Honnappa.Nagarahalli; +Cc: dev, Stephen Hemminger

This is a new type of reader-writer lock that provides better fairness
guarantees, which makes it better suited for typical DPDK applications.
A pflock has two ticket pools, one for readers and one
for writers.

Phase fair reader writer locks ensure that neither reader nor writer will be
starved. Neither readers nor writers are preferred; they execute in
alternating phases. All operations of the same type (reader or writer)
that acquire the lock are handled in FIFO order.  Write
operations are exclusive, and multiple read operations can be run
together (until a write arrives).

A similar implementation is in Concurrency Kit package in FreeBSD.
For more information see:
   "Reader-Writer Synchronization for Shared-Memory Multiprocessor
    Real-Time Systems",
    http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/test/meson.build                        |   2 +
 app/test/test_pflock.c                      | 193 +++++++++++++++++++
 lib/librte_eal/arm/include/meson.build      |   1 +
 lib/librte_eal/arm/include/rte_pflock.h     |  18 ++
 lib/librte_eal/include/generic/rte_pflock.h | 202 ++++++++++++++++++++
 lib/librte_eal/ppc/include/meson.build      |   1 +
 lib/librte_eal/ppc/include/rte_pflock.h     |  17 ++
 lib/librte_eal/x86/include/meson.build      |   1 +
 lib/librte_eal/x86/include/rte_pflock.h     |  18 ++
 9 files changed, 453 insertions(+)
 create mode 100644 app/test/test_pflock.c
 create mode 100644 lib/librte_eal/arm/include/rte_pflock.h
 create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
 create mode 100644 lib/librte_eal/x86/include/rte_pflock.h

diff --git a/app/test/meson.build b/app/test/meson.build
index 76eaaea45746..bd50818f82b0 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -90,6 +90,7 @@ test_sources = files('commands.c',
 	'test_mcslock.c',
 	'test_mp_secondary.c',
 	'test_per_lcore.c',
+	'test_pflock.c',
 	'test_pmd_perf.c',
 	'test_power.c',
 	'test_power_cpufreq.c',
@@ -228,6 +229,7 @@ fast_tests = [
         ['meter_autotest', true],
         ['multiprocess_autotest', false],
         ['per_lcore_autotest', true],
+        ['pflock_autotest', true],
         ['prefetch_autotest', true],
         ['rcu_qsbr_autotest', true],
         ['red_autotest', true],
diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c
new file mode 100644
index 000000000000..5e3c05767767
--- /dev/null
+++ b/app/test/test_pflock.c
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_pflock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * phase fair lock test
+ * ===========
+ * Provides UT for phase fair lock API.
+ * Main concern is on functional testing, but also provides some
+ * performance measurements.
+ * Obviously, for proper testing, it needs to be executed with more than one lcore.
+ */
+
+#define ITER_NUM	0x80
+
+#define TEST_SEC	5
+
+static rte_pflock_t sl;
+static rte_pflock_t sl_tab[RTE_MAX_LCORE];
+static uint32_t synchro;
+
+enum {
+	LC_TYPE_RDLOCK,
+	LC_TYPE_WRLOCK,
+};
+
+static int
+test_pflock_per_core(__rte_unused void *arg)
+{
+	rte_pflock_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_pflock_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_pflock_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER;
+static volatile uint64_t pflock_data;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+#define TEST_PFLOCK_DEBUG 0
+
+static int
+load_loop_fn(__rte_unused void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		rte_pflock_write_lock(&lk);
+		++pflock_data;
+		rte_pflock_write_unlock(&lk);
+
+		rte_pflock_read_lock(&lk);
+		if (TEST_PFLOCK_DEBUG && !(lcount % 100))
+			printf("Core [%u] pflock_data = %"PRIu64"\n",
+				lcore, pflock_data);
+		rte_pflock_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_pflock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	synchro = 0;
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global pflock and a table of pflocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_pflock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, display something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, display something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken during 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+test_pflock(void)
+{
+	int i;
+
+	rte_pflock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_pflock_init(&sl_tab[i]);
+
+	rte_pflock_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
+	}
+
+	rte_pflock_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_pflock_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_pflock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..2c3cff61bed6 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -21,6 +21,7 @@ arch_headers = files(
 	'rte_pause_32.h',
 	'rte_pause_64.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch_32.h',
 	'rte_prefetch_64.h',
diff --git a/lib/librte_eal/arm/include/rte_pflock.h b/lib/librte_eal/arm/include/rte_pflock.h
new file mode 100644
index 000000000000..bb9934eec469
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_ARM_H_
+#define _RTE_PFLOCK_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_pflock.h b/lib/librte_eal/include/generic/rte_pflock.h
new file mode 100644
index 000000000000..7c183633df60
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_pflock.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Derived from Concurrency Kit
+ * Copyright 2011-2015 Samy Al Bahra.
+ */
+
+#ifndef _RTE_PFLOCK_H_
+#define _RTE_PFLOCK_H_
+
+/**
+ * @file
+ *
+ * Phase-fair locks
+ *
+ * This file defines an API for Phase Fair reader writer locks,
+ * which is a variant of typical reader-writer locks that prevent
+ * starvation. In this type of lock, readers and writers alternate.
+ * This significantly reduces the worst-case blocking for readers and writers.
+ *
+ * This is an implementation derived from FreeBSD
+ * based on the work described in:
+ *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
+ *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
+ *
+ * All locks must be initialised before use, and only initialised once.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_pause.h>
+
+/**
+ * The rte_pflock_t type.
+ */
+struct rte_pflock {
+	struct {
+		uint16_t in;
+		uint16_t out;
+	} rd, wr;
+};
+typedef struct rte_pflock rte_pflock_t;
+
+/*
+ * Allocation of bits to reader
+ *
+ * 15                 4 3    2 1 0
+ * +-------------------+------+-+-+
+ * | rin: reads issued |unused| | |
+ * +-------------------+------+-+-+
+ *                             ^ ^
+ *                             | |
+ *     PRES: writer present ---+ |
+ *     PHID: writer phase id ----+
+ *
+ * 15                  4 3      0
+ * +--------------------+-------+
+ * | rout:read complete | unused|
+ * +--------------------+-------+
+ *
+ * The maximum number of readers is 4095
+ */
+
+/* Constants used to map the bits in reader counter */
+#define RTE_PFLOCK_WBITS 0x3	/* Writer bits in reader. */
+#define RTE_PFLOCK_PRES  0x2	/* Writer present bit. */
+#define RTE_PFLOCK_PHID  0x1	/* Phase ID bit. */
+#define RTE_PFLOCK_LSB   0xFFF0 /* reader bits. */
+#define RTE_PFLOCK_RINC  0x10	/* Reader increment. */
+
+/**
+ * A static pflock initializer.
+ */
+#define RTE_PFLOCK_INITIALIZER {  }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the pflock to an unlocked state.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_init(struct rte_pflock *pf)
+{
+	memset(pf, 0, sizeof(*pf));
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a pflock for read.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_lock(rte_pflock_t *pf)
+{
+	uint16_t w;
+
+	/*
+	 * If no writer is present, then the operation has completed
+	 * successfully.
+	 */
+	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC, __ATOMIC_ACQUIRE)
+		& RTE_PFLOCK_WBITS;
+	if (w == 0)
+		return;
+
+	/* Wait for current write phase to complete. */
+	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE) & RTE_PFLOCK_WBITS) == w)
+		rte_pause();
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock locked for reading.
+ *
+ * @param pf
+ *   A pointer to the pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_unlock(rte_pflock_t *pf)
+{
+	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_lock(rte_pflock_t *pf)
+{
+	uint16_t ticket, w;
+
+	/* Acquire ownership of write-phase.
+	 * This is the same as rte_ticketlock_lock().
+	 */
+	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_RELAXED);
+	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_ACQUIRE);
+
+	/*
+	 * Acquire ticket on read-side in order to allow them
+	 * to flush. Indicates to any incoming reader that a
+	 * write-phase is pending.
+	 *
+	 * The load of rd.out in wait loop could be executed
+	 * speculatively.
+	 */
+	w = RTE_PFLOCK_PRES | (ticket & RTE_PFLOCK_PHID);
+	ticket = __atomic_fetch_add(&pf->rd.in, w, __ATOMIC_RELAXED);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_ACQUIRE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock held for writing.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_unlock(rte_pflock_t *pf)
+{
+	/* Migrate from write phase to read phase. */
+	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELEASE);
+
+	/* Allow other writers to continue. */
+	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_PFLOCK_H */
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..7692a531ccba 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -11,6 +11,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/ppc/include/rte_pflock.h b/lib/librte_eal/ppc/include/rte_pflock.h
new file mode 100644
index 000000000000..27c201b11d05
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_pflock.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+#ifndef _RTE_PFLOCK_PPC_64_H_
+#define _RTE_PFLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 1a6ad0b17342..f43645c20899 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -10,6 +10,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rtm.h',
diff --git a/lib/librte_eal/x86/include/rte_pflock.h b/lib/librte_eal/x86/include/rte_pflock.h
new file mode 100644
index 000000000000..c2d876062c08
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_X86_64_H_
+#define _RTE_PFLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_X86_64_H_ */
-- 
2.30.2


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v4] pflock: add phase-fair reader writer locks
  2021-03-30  5:00     ` [dpdk-dev] [PATCH v4] pflock: add " Stephen Hemminger
@ 2021-03-30  5:14       ` Stephen Hemminger
  2021-03-31  4:19       ` Honnappa Nagarahalli
  1 sibling, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-30  5:14 UTC (permalink / raw)
  To: Honnappa.Nagarahalli; +Cc: dev

On Mon, 29 Mar 2021 22:00:47 -0700
Stephen Hemminger <stephen@networkplumber.org> wrote:

> This is a new type of reader-writer lock that provides better fairness
> guarantees, which is better suited for typical DPDK applications.
> A pflock has two ticket pools, one for readers and one
> for writers.
> 
> Phase fair reader writer locks ensure that neither reader nor writer will be
> starved. Neither readers nor writers are preferred; they execute in
> alternating phases. All operations of the same type (reader or writer)
> that acquire the lock are handled in FIFO order.  Write
> operations are exclusive, and multiple read operations can be run
> together (until a write arrives).
> 
> A similar implementation is in Concurrency Kit package in FreeBSD.
> For more information see:
>    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
>     Real-Time Systems",
>     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

v4 incorporates feedback, adds some comments and drops the trylock for now.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v4] pflock: add phase-fair reader writer locks
  2021-03-30  5:00     ` [dpdk-dev] [PATCH v4] pflock: add " Stephen Hemminger
  2021-03-30  5:14       ` Stephen Hemminger
@ 2021-03-31  4:19       ` Honnappa Nagarahalli
  2021-03-31 16:32         ` Stephen Hemminger
  2021-04-02  1:37         ` Stephen Hemminger
  1 sibling, 2 replies; 27+ messages in thread
From: Honnappa Nagarahalli @ 2021-03-31  4:19 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, nd, Honnappa Nagarahalli, nd

<snip>

A few minor comments; overall it looks good. Tested on a few Arm platforms.

> 
> This is a new type of reader-writer lock that provides better fairness
> guarantees, which is better suited for typical DPDK applications.
> A pflock has two ticket pools, one for readers and one for writers.
> 
> Phase fair reader writer locks ensure that neither reader nor writer will be
> starved. Neither readers nor writers are preferred; they execute in alternating
> phases. All operations of the same type (reader or writer) that acquire the
> lock are handled in FIFO order.  Write operations are exclusive, and multiple
> read operations can be run together (until a write arrives).
> 
> A similar implementation is in Concurrency Kit package in FreeBSD.
> For more information see:
>    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
>     Real-Time Systems",
>     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>  app/test/meson.build                        |   2 +
>  app/test/test_pflock.c                      | 193 +++++++++++++++++++
>  lib/librte_eal/arm/include/meson.build      |   1 +
>  lib/librte_eal/arm/include/rte_pflock.h     |  18 ++
>  lib/librte_eal/include/generic/rte_pflock.h | 202 ++++++++++++++++++++
>  lib/librte_eal/ppc/include/meson.build      |   1 +
>  lib/librte_eal/ppc/include/rte_pflock.h     |  17 ++
>  lib/librte_eal/x86/include/meson.build      |   1 +
>  lib/librte_eal/x86/include/rte_pflock.h     |  18 ++
>  9 files changed, 453 insertions(+)
>  create mode 100644 app/test/test_pflock.c  create mode 100644
> lib/librte_eal/arm/include/rte_pflock.h
>  create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
>  create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
>  create mode 100644 lib/librte_eal/x86/include/rte_pflock.h
> 
> diff --git a/app/test/meson.build b/app/test/meson.build index
> 76eaaea45746..bd50818f82b0 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -90,6 +90,7 @@ test_sources = files('commands.c',
>  	'test_mcslock.c',
>  	'test_mp_secondary.c',
>  	'test_per_lcore.c',
> +	'test_pflock.c',
>  	'test_pmd_perf.c',
>  	'test_power.c',
>  	'test_power_cpufreq.c',
> @@ -228,6 +229,7 @@ fast_tests = [
>          ['meter_autotest', true],
>          ['multiprocess_autotest', false],
>          ['per_lcore_autotest', true],
> +        ['pflock_autotest', true],
>          ['prefetch_autotest', true],
>          ['rcu_qsbr_autotest', true],
>          ['red_autotest', true],
> diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c new file mode
> 100644 index 000000000000..5e3c05767767
> --- /dev/null
> +++ b/app/test/test_pflock.c
> @@ -0,0 +1,193 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation  */
> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <inttypes.h>
> +#include <unistd.h>
> +#include <sys/queue.h>
> +#include <string.h>
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_per_lcore.h>
> +#include <rte_launch.h>
> +#include <rte_pause.h>
> +#include <rte_pflock.h>
> +#include <rte_eal.h>
> +#include <rte_lcore.h>
> +#include <rte_cycles.h>
> +
> +#include "test.h"
> +
> +/*
> + * phase fair lock test
> + * ===========
> + * Provides UT for phase fair lock API.
> + * Main concern is on functional testing, but also provides some
> + * performance measurements.
> + * Obviously for proper testing need to be executed with more than one
> lcore.
> + */
> +
> +#define ITER_NUM	0x80
> +
> +#define TEST_SEC	5
The above 2 #defines are not used; you can remove them.

> +
> +static rte_pflock_t sl;
> +static rte_pflock_t sl_tab[RTE_MAX_LCORE]; static uint32_t synchro;
> +
> +enum {
> +	LC_TYPE_RDLOCK,
> +	LC_TYPE_WRLOCK,
> +};
This enum is not used; you can remove it.

> +
> +static int
> +test_pflock_per_core(__rte_unused void *arg) {
> +	rte_pflock_write_lock(&sl);
> +	printf("Global write lock taken on core %u\n", rte_lcore_id());
> +	rte_pflock_write_unlock(&sl);
> +
> +	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
> +	printf("Hello from core %u !\n", rte_lcore_id());
> +	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
> +
> +	rte_pflock_read_lock(&sl);
> +	printf("Global read lock taken on core %u\n", rte_lcore_id());
> +	rte_delay_ms(100);
> +	printf("Release global read lock on core %u\n", rte_lcore_id());
> +	rte_pflock_read_unlock(&sl);
> +
> +	return 0;
> +}
> +
> +static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER; static volatile
> +uint64_t pflock_data; static uint64_t time_count[RTE_MAX_LCORE] = {0};
> +
> +#define MAX_LOOP 10000
> +#define TEST_PFLOCK_DEBUG 0
> +
> +static int
> +load_loop_fn(__rte_unused void *arg)
> +{
> +	uint64_t time_diff = 0, begin;
> +	uint64_t hz = rte_get_timer_hz();
> +	uint64_t lcount = 0;
> +	const unsigned int lcore = rte_lcore_id();
> +
> +	/* wait synchro for workers */
> +	if (lcore != rte_get_main_lcore())
> +		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
> +
> +	begin = rte_rdtsc_precise();
> +	while (lcount < MAX_LOOP) {
> +		rte_pflock_write_lock(&lk);
> +		++pflock_data;
This should be an atomic increment; better to use an atomic fetch-add.
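
For example, something like this (just a sketch of the suggestion, reusing the
__atomic builtins the lock code already uses; inside the write lock, relaxed
ordering is sufficient):

	rte_pflock_write_lock(&lk);
	__atomic_fetch_add(&pflock_data, 1, __ATOMIC_RELAXED);
	rte_pflock_write_unlock(&lk);

The volatile qualifier on pflock_data could then be dropped.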

> +		rte_pflock_write_unlock(&lk);
> +
> +		rte_pflock_read_lock(&lk);
> +		if (TEST_PFLOCK_DEBUG && !(lcount % 100))
> +			printf("Core [%u] pflock_data = %"PRIu64"\n",
> +				lcore, pflock_data);
> +		rte_pflock_read_unlock(&lk);
> +
> +		lcount++;
> +		/* delay to make lock duty cycle slightly realistic */
> +		rte_pause();
> +	}
> +
> +	time_diff = rte_rdtsc_precise() - begin;
> +	time_count[lcore] = time_diff * 1000000 / hz;
> +	return 0;
> +}
> +
> +static int
> +test_pflock_perf(void)
> +{
> +	unsigned int i;
> +	uint64_t total = 0;
> +
> +	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
> +
> +	/* clear synchro and start workers */
> +	synchro = 0;
> +	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) <
> 0)
> +		return -1;
> +
> +	/* start synchro and launch test on main */
> +	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
> +	load_loop_fn(NULL);
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	RTE_LCORE_FOREACH(i) {
> +		printf("Core [%u] cost time = %"PRIu64" us\n",
> +			i, time_count[i]);
> +		total += time_count[i];
> +	}
> +
> +	printf("Total cost time = %"PRIu64" us\n", total);
> +	memset(time_count, 0, sizeof(time_count));
> +
> +	return 0;
> +}
> +
> +/*
> + * - There is a global pflock and a table of pflocks (one per lcore).
> + *
> + * - The test function takes all of these locks and launches the
> + *   ``test_pflock_per_core()`` function on each core (except the main).
> + *
> + *   - The function takes the global write lock, display something,
> + *     then releases the global lock.
> + *   - Then, it takes the per-lcore write lock, display something, and
> + *     releases the per-core lock.
> + *   - Finally, a read lock is taken during 100 ms, then released.
> + *
> + * - The main function unlocks the per-lcore locks sequentially and
> + *   waits between each lock. This triggers the display of a message
> + *   for each core, in the correct order.
> + *
> + *   Then, it tries to take the global write lock and display the last
> + *   message. The autotest script checks that the message order is correct.
> + */
> +static int
> +test_pflock(void)
> +{
> +	int i;
> +
> +	rte_pflock_init(&sl);
> +	for (i = 0; i < RTE_MAX_LCORE; i++)
> +		rte_pflock_init(&sl_tab[i]);
> +
> +	rte_pflock_write_lock(&sl);
> +
> +	RTE_LCORE_FOREACH_WORKER(i) {
> +		rte_pflock_write_lock(&sl_tab[i]);
> +		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
> +	}
> +
> +	rte_pflock_write_unlock(&sl);
> +
> +	RTE_LCORE_FOREACH_WORKER(i) {
> +		rte_pflock_write_unlock(&sl_tab[i]);
> +		rte_delay_ms(100);
> +	}
> +
> +	rte_pflock_write_lock(&sl);
> +	/* this message should be the last message of test */
> +	printf("Global write lock taken on main core %u\n", rte_lcore_id());
> +	rte_pflock_write_unlock(&sl);
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	if (test_pflock_perf() < 0)
Suggest separating out the performance test so that it is not run on the cloud CI platforms (which have issues with performance tests timing out). I think autotest_data.py needs to be modified.

> +		return -1;
> +
> +	return 0;
> +}
> +
> +REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
> diff --git a/lib/librte_eal/arm/include/meson.build
> b/lib/librte_eal/arm/include/meson.build
> index 770766de1a34..2c3cff61bed6 100644
> --- a/lib/librte_eal/arm/include/meson.build
> +++ b/lib/librte_eal/arm/include/meson.build
> @@ -21,6 +21,7 @@ arch_headers = files(
>  	'rte_pause_32.h',
>  	'rte_pause_64.h',
>  	'rte_pause.h',
> +	'rte_pflock.h',
>  	'rte_power_intrinsics.h',
>  	'rte_prefetch_32.h',
>  	'rte_prefetch_64.h',
> diff --git a/lib/librte_eal/arm/include/rte_pflock.h
> b/lib/librte_eal/arm/include/rte_pflock.h
> new file mode 100644
> index 000000000000..bb9934eec469
> --- /dev/null
> +++ b/lib/librte_eal/arm/include/rte_pflock.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation  */
> +
> +#ifndef _RTE_PFLOCK_ARM_H_
> +#define _RTE_PFLOCK_ARM_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_pflock.h"
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PFLOCK_ARM_H_ */
> diff --git a/lib/librte_eal/include/generic/rte_pflock.h
> b/lib/librte_eal/include/generic/rte_pflock.h
> new file mode 100644
> index 000000000000..7c183633df60
> --- /dev/null
> +++ b/lib/librte_eal/include/generic/rte_pflock.h
> @@ -0,0 +1,202 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corp.
> + * All rights reserved.
> + *
> + * Derived from Concurrency Kit
> + * Copyright 2011-2015 Samy Al Bahra.
> + */
> +
> +#ifndef _RTE_PFLOCK_H_
> +#define _RTE_PFLOCK_H_
> +
> +/**
> + * @file
> + *
> + * Phase-fair locks
> + *
> + * This file defines an API for Phase Fair reader writer locks,
> + * which is a variant of typical reader-writer locks that prevent
> + * starvation. In this type of lock, readers and writers alternate.
> + * This significantly reduces the worst-case blocking for readers and writers.
> + *
> + * This is an implementation derived from FreeBSD
> + * based on the work described in:
> + *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
> + *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
> + *
> + * All locks must be initialised before use, and only initialised once.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_pause.h>
> +
> +/**
> + * The rte_pflock_t type.
> + */
> +struct rte_pflock {
> +	struct {
> +		uint16_t in;
> +		uint16_t out;
> +	} rd, wr;
> +};
> +typedef struct rte_pflock rte_pflock_t;
> +
> +/*
> + * Allocation of bits to reader
> + *
> + * 16                 8 7    2 1 0
Typo, this numbering should be 15 4 3 2 1 0

> + * +-------------------+------+-+-+
> + * | rin: reads issued |unused| | |
> + * +-------------------+------+-+-+
> + *                             ^ ^
> + *                             | |
> + *     PRES: writer present ---+ |
> + *     PHID: writer phase id ----+
> + *
> + * 16                2 7    0
Here, it should be 15 4 3 0

> + * +------------------+------+
> + * |rout:read complete|unused|
> + * +------------------+------+
> + *
> + * The maximum number of readers is 4095  */
> +
> +/* Constants used to map the bits in reader counter */
> +#define RTE_PFLOCK_WBITS 0x3	/* Writer bits in reader. */
> +#define RTE_PFLOCK_PRES  0x2	/* Writer present bit. */
> +#define RTE_PFLOCK_PHID  0x1	/* Phase ID bit. */
> +#define RTE_PFLOCK_LSB   0xFFF0 /* reader bits. */
> +#define RTE_PFLOCK_RINC  0x10	/* Reader increment. */
> +
> +/**
> + * A static pflock initializer.
> + */
> +#define RTE_PFLOCK_INITIALIZER {  }
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Initialize the pflock to an unlocked state.
> + *
> + * @param pf
> + *   A pointer to the pflock.
> + */
> +__rte_experimental
> +static inline void
Minor, this API does not need to be inline.

> +rte_pflock_init(struct rte_pflock *pf)
> +{
> +	memset(pf, 0, sizeof(*pf));
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Take a pflock for read.
> + *
> + * @param pf
> + *   A pointer to a pflock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_read_lock(rte_pflock_t *pf)
> +{
> +	uint16_t w;
> +
> +	/*
> +	 * If no writer is present, then the operation has completed
> +	 * successfully.
> +	 */
> +	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC,
> __ATOMIC_ACQUIRE)
> +		& RTE_PFLOCK_WBITS;
> +	if (w == 0)
> +		return;
> +
> +	/* Wait for current write phase to complete. */
> +	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE) &
> RTE_PFLOCK_WBITS) == w)
> +		rte_pause();
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a pflock locked for reading.
> + *
> + * @param pf
> + *   A pointer to the pflock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_read_unlock(rte_pflock_t *pf) {
> +	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC,
> __ATOMIC_RELEASE); }
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Take the pflock for write.
> + *
> + * @param pf
> + *   A pointer to the ticketlock.
Typo                                ^^^^^^^^ pflock

> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_write_lock(rte_pflock_t *pf) {
> +	uint16_t ticket, w;
> +
> +	/* Acquire ownership of write-phase.
> +	 * This is the same as rte_ticketlock_lock().
> +	 */
> +	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_RELAXED);
> +	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_ACQUIRE);
> +
> +	/*
> +	 * Acquire ticket on read-side in order to allow them
> +	 * to flush. Indicates to any incoming reader that a
> +	 * write-phase is pending.
> +	 *
> +	 * The load of rd.out in wait loop could be executed
> +	 * speculatively.
> +	 */
> +	w = RTE_PFLOCK_PRES | (ticket & RTE_PFLOCK_PHID);
> +	ticket = __atomic_fetch_add(&pf->rd.in, w, __ATOMIC_RELAXED);
> +
> +	/* Wait for any pending readers to flush. */
> +	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_ACQUIRE);
> }
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a pflock held for writing.
> + *
> + * @param pf
> + *   A pointer to a pflock structure.
> + */
> +__rte_experimental
> +static inline void
> +rte_pflock_write_unlock(rte_pflock_t *pf) {
> +	/* Migrate from write phase to read phase. */
> +	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB,
> __ATOMIC_RELEASE);
> +
> +	/* Allow other writers to continue. */
> +	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE); }
> +
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* RTE_PFLOCK_H */
> diff --git a/lib/librte_eal/ppc/include/meson.build
> b/lib/librte_eal/ppc/include/meson.build
> index dae40ede546e..7692a531ccba 100644
> --- a/lib/librte_eal/ppc/include/meson.build
> +++ b/lib/librte_eal/ppc/include/meson.build
> @@ -11,6 +11,7 @@ arch_headers = files(
>  	'rte_mcslock.h',
>  	'rte_memcpy.h',
>  	'rte_pause.h',
> +	'rte_pflock.h',
>  	'rte_power_intrinsics.h',
>  	'rte_prefetch.h',
>  	'rte_rwlock.h',
> diff --git a/lib/librte_eal/ppc/include/rte_pflock.h
> b/lib/librte_eal/ppc/include/rte_pflock.h
> new file mode 100644
> index 000000000000..27c201b11d05
> --- /dev/null
> +++ b/lib/librte_eal/ppc/include/rte_pflock.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation  */ #ifndef
> +_RTE_PFLOCK_PPC_64_H_ #define _RTE_PFLOCK_PPC_64_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_pflock.h"
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PFLOCK_PPC_64_H_ */
> diff --git a/lib/librte_eal/x86/include/meson.build
> b/lib/librte_eal/x86/include/meson.build
> index 1a6ad0b17342..f43645c20899 100644
> --- a/lib/librte_eal/x86/include/meson.build
> +++ b/lib/librte_eal/x86/include/meson.build
> @@ -10,6 +10,7 @@ arch_headers = files(
>  	'rte_mcslock.h',
>  	'rte_memcpy.h',
>  	'rte_pause.h',
> +	'rte_pflock.h',
>  	'rte_power_intrinsics.h',
>  	'rte_prefetch.h',
>  	'rte_rtm.h',
> diff --git a/lib/librte_eal/x86/include/rte_pflock.h
> b/lib/librte_eal/x86/include/rte_pflock.h
> new file mode 100644
> index 000000000000..c2d876062c08
> --- /dev/null
> +++ b/lib/librte_eal/x86/include/rte_pflock.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation  */
> +
> +#ifndef _RTE_PFLOCK_X86_64_H_
> +#define _RTE_PFLOCK_X86_64_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_pflock.h"
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PFLOCK_X86_64_H_ */
> --
> 2.30.2


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v4] pflock: add phase-fair reader writer locks
  2021-03-31  4:19       ` Honnappa Nagarahalli
@ 2021-03-31 16:32         ` Stephen Hemminger
  2021-04-02  1:37         ` Stephen Hemminger
  1 sibling, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-03-31 16:32 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, nd

On Wed, 31 Mar 2021 04:19:14 +0000
Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:

> > +
> > +	rte_pflock_write_lock(&sl);
> > +	/* this message should be the last message of test */
> > +	printf("Global write lock taken on main core %u\n", rte_lcore_id());
> > +	rte_pflock_write_unlock(&sl);
> > +
> > +	rte_eal_mp_wait_lcore();
> > +
> > +	if (test_pflock_perf() < 0)  
> Suggest separating out the performance test so that it is not run on the cloud CI platforms (which have issues with performance tests timing out). I think autotest_data.py needs to be modified.

I was just copying the existing pattern of all the lock tests (rwlock, mcslock, ticketlock).
They run the perf test as well.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v4] pflock: add phase-fair reader writer locks
  2021-03-31  4:19       ` Honnappa Nagarahalli
  2021-03-31 16:32         ` Stephen Hemminger
@ 2021-04-02  1:37         ` Stephen Hemminger
  1 sibling, 0 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-04-02  1:37 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, nd

On Wed, 31 Mar 2021 04:19:14 +0000
Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:

> > +__rte_experimental
> > +static inline void  
> Minor, this API does not need to be inline.


Unfortunately, it has to be inline; otherwise GCC warns about an unused
static function in every file that includes the header but doesn't use it.
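
A stripped-down illustration of the problem (not the real header, just the
pattern):

	/* foo.h */
	static void foo(void) { }	/* plain static: warns if unused */

	/* bar.c: includes foo.h but never calls foo() */
	#include "foo.h"

With -Wall, GCC emits "warning: 'foo' defined but not used
[-Wunused-function]" for bar.c; declaring the function 'static inline'
(or tagging it __rte_unused) avoids that.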

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [dpdk-dev] [PATCH v5] pflock: implementation of phase-fair reader writer locks
  2021-02-12  1:38   ` [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks Stephen Hemminger
                       ` (3 preceding siblings ...)
  2021-03-30  5:00     ` [dpdk-dev] [PATCH v4] pflock: add " Stephen Hemminger
@ 2021-04-02  1:42     ` Stephen Hemminger
  2021-04-06 21:56       ` Honnappa Nagarahalli
  2021-04-07 15:09       ` Ananyev, Konstantin
  4 siblings, 2 replies; 27+ messages in thread
From: Stephen Hemminger @ 2021-04-02  1:42 UTC (permalink / raw)
  To: dev, Honnappa Nagarahalli; +Cc: Stephen Hemminger

This is a new type of reader-writer lock that provides better fairness
guarantees, which is better suited for typical DPDK applications.
A pflock has two ticket pools, one for readers and one
for writers.

Phase fair reader writer locks ensure that neither reader nor writer will be
starved. Neither readers nor writers are preferred; they execute in
alternating phases. All operations of the same type (reader or writer)
that acquire the lock are handled in FIFO order.  Write
operations are exclusive, and multiple read operations can be run
together (until a write arrives).

A similar implementation is in Concurrency Kit package in FreeBSD.
For more information see:
   "Reader-Writer Synchronization for Shared-Memory Multiprocessor
    Real-Time Systems",
    http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
v5 - cleanup typos in the lock code comments
     minor revision to unit test.
     Note: the unit test is intentionally the same as other locking tests.
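
For anyone who wants a quick feel for the API before reading the diff, here
is a minimal usage sketch (illustrative only: the helper names are made up,
the rte_pflock_* calls are the ones added by this patch):

	#include <stdint.h>
	#include <rte_pflock.h>

	static rte_pflock_t lock = RTE_PFLOCK_INITIALIZER;
	static uint64_t counter;

	/* Writers get exclusive access, served in FIFO order. */
	static void
	counter_bump(void)
	{
		rte_pflock_write_lock(&lock);
		counter++;
		rte_pflock_write_unlock(&lock);
	}

	/* Readers may run concurrently, but cannot starve a waiting writer. */
	static uint64_t
	counter_read(void)
	{
		uint64_t v;

		rte_pflock_read_lock(&lock);
		v = counter;
		rte_pflock_read_unlock(&lock);
		return v;
	}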

 app/test/meson.build                        |   2 +
 app/test/test_pflock.c                      | 197 +++++++++++++++++++
 lib/librte_eal/arm/include/meson.build      |   1 +
 lib/librte_eal/arm/include/rte_pflock.h     |  18 ++
 lib/librte_eal/include/generic/rte_pflock.h | 205 ++++++++++++++++++++
 lib/librte_eal/ppc/include/meson.build      |   1 +
 lib/librte_eal/ppc/include/rte_pflock.h     |  17 ++
 lib/librte_eal/x86/include/meson.build      |   1 +
 lib/librte_eal/x86/include/rte_pflock.h     |  18 ++
 9 files changed, 460 insertions(+)
 create mode 100644 app/test/test_pflock.c
 create mode 100644 lib/librte_eal/arm/include/rte_pflock.h
 create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
 create mode 100644 lib/librte_eal/x86/include/rte_pflock.h

diff --git a/app/test/meson.build b/app/test/meson.build
index 76eaaea45746..bd50818f82b0 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -90,6 +90,7 @@ test_sources = files('commands.c',
 	'test_mcslock.c',
 	'test_mp_secondary.c',
 	'test_per_lcore.c',
+	'test_pflock.c',
 	'test_pmd_perf.c',
 	'test_power.c',
 	'test_power_cpufreq.c',
@@ -228,6 +229,7 @@ fast_tests = [
         ['meter_autotest', true],
         ['multiprocess_autotest', false],
         ['per_lcore_autotest', true],
+        ['pflock_autotest', true],
         ['prefetch_autotest', true],
         ['rcu_qsbr_autotest', true],
         ['red_autotest', true],
diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c
new file mode 100644
index 000000000000..6922bbc2f813
--- /dev/null
+++ b/app/test/test_pflock.c
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_pflock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * phase fair lock test
+ * ===========
+ * Provides UT for phase fair lock API.
+ * Main concern is on functional testing, but also provides some
+ * performance measurements.
+ * Obviously, for proper testing it needs to be executed with more than one lcore.
+ */
+
+static rte_pflock_t sl;
+static rte_pflock_t sl_tab[RTE_MAX_LCORE];
+static uint32_t synchro;
+
+static int
+test_pflock_per_core(__rte_unused void *arg)
+{
+	rte_pflock_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_pflock_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_pflock_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+
+static int
+load_loop_fn(void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const int use_lock = *(int *)arg;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		if (use_lock)
+			rte_pflock_write_lock(&lk);
+		lcount++;
+		if (use_lock)
+			rte_pflock_write_unlock(&lk);
+
+		if (use_lock) {
+			rte_pflock_read_lock(&lk);
+			rte_pflock_read_unlock(&lk);
+		}
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_pflock_perf(void)
+{
+	unsigned int i;
+	int lock = 0;
+	uint64_t total = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	printf("\nTest with no lock on single core...\n");
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(&lock);
+	printf("Core [%u] Cost Time = %"PRIu64" us\n",
+			lcore, time_count[lcore]);
+	memset(time_count, 0, sizeof(time_count));
+
+	printf("\nTest with phase fair lock on single core...\n");
+	lock = 1;
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(&lock);
+	printf("Core [%u] Cost Time = %"PRIu64" us\n",
+			lcore, time_count[lcore]);
+	memset(time_count, 0, sizeof(time_count));
+
+	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	__atomic_store_n(&synchro, 0, __ATOMIC_RELAXED);
+	if (rte_eal_mp_remote_launch(load_loop_fn, &lock, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(&lock);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global pflock and a table of pflocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_pflock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, display something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, display something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken during 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+test_pflock(void)
+{
+	int i;
+
+	rte_pflock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_pflock_init(&sl_tab[i]);
+
+	rte_pflock_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
+	}
+
+	rte_pflock_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_pflock_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_pflock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..2c3cff61bed6 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -21,6 +21,7 @@ arch_headers = files(
 	'rte_pause_32.h',
 	'rte_pause_64.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch_32.h',
 	'rte_prefetch_64.h',
diff --git a/lib/librte_eal/arm/include/rte_pflock.h b/lib/librte_eal/arm/include/rte_pflock.h
new file mode 100644
index 000000000000..bb9934eec469
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_ARM_H_
+#define _RTE_PFLOCK_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_pflock.h b/lib/librte_eal/include/generic/rte_pflock.h
new file mode 100644
index 000000000000..72cf6440f7f1
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_pflock.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Derived from Concurrency Kit
+ * Copyright 2011-2015 Samy Al Bahra.
+ */
+
+#ifndef _RTE_PFLOCK_H_
+#define _RTE_PFLOCK_H_
+
+/**
+ * @file
+ *
+ * Phase-fair locks
+ *
+ * This file defines an API for Phase Fair reader writer locks,
+ * which is a variant of typical reader-writer locks that prevent
+ * starvation. In this type of lock, readers and writers alternate.
+ * This significantly reduces the worst-case blocking for readers and writers.
+ *
+ * This is an implementation derived from FreeBSD
+ * based on the work described in:
+ *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
+ *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
+ *
+ * All locks must be initialised before use, and only initialised once.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_pause.h>
+
+/**
+ * The rte_pflock_t type.
+ */
+struct rte_pflock {
+	struct {
+		uint16_t in;
+		uint16_t out;
+	} rd, wr;
+};
+typedef struct rte_pflock rte_pflock_t;
+
+/*
+ * Allocation of bits to reader
+ *
+ * 15                 4 3 2 1 0
+ * +-------------------+-+-+-+-+
+ * | rin: reads issued |x|x| | |
+ * +-------------------+-+-+-+-+
+ *                          ^ ^
+ *                          | |
+ * PRES: writer present ----/ |
+ * PHID: writer phase id -----/
+ *
+ * 15                4 3 2 1 0
+ * +------------------+------+
+ * |rout:read complete|unused|
+ * +------------------+------+
+ *
+ * The maximum number of readers is 4095
+ */
+
+/* Constants used to map the bits in reader counter */
+#define RTE_PFLOCK_WBITS 0x3	/* Writer bits in reader. */
+#define RTE_PFLOCK_PRES  0x2	/* Writer present bit. */
+#define RTE_PFLOCK_PHID  0x1	/* Phase ID bit. */
+#define RTE_PFLOCK_LSB   0xFFF0 /* reader bits. */
+#define RTE_PFLOCK_RINC  0x10	/* Reader increment. */
+
+/**
+ * A static pflock initializer.
+ */
+#define RTE_PFLOCK_INITIALIZER {  }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the pflock to an unlocked state.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_init(struct rte_pflock *pf)
+{
+	pf->rd.in = 0;
+	pf->rd.out = 0;
+	pf->wr.in = 0;
+	pf->wr.out = 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a pflock for read.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_lock(rte_pflock_t *pf)
+{
+	uint16_t w;
+
+	/*
+	 * If no writer is present, then the operation has completed
+	 * successfully.
+	 */
+	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC, __ATOMIC_ACQUIRE)
+		& RTE_PFLOCK_WBITS;
+	if (w == 0)
+		return;
+
+	/* Wait for current write phase to complete. */
+	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE)
+		& RTE_PFLOCK_WBITS) == w)
+		rte_pause();
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock locked for reading.
+ *
+ * @param pf
+ *   A pointer to the pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_unlock(rte_pflock_t *pf)
+{
+	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_lock(rte_pflock_t *pf)
+{
+	uint16_t ticket, w;
+
+	/* Acquire ownership of write-phase.
+	 * This is the same as rte_ticketlock_lock().
+	 */
+	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_RELAXED);
+	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_ACQUIRE);
+
+	/*
+	 * Acquire ticket on read-side in order to allow them
+	 * to flush. Indicates to any incoming reader that a
+	 * write-phase is pending.
+	 *
+	 * The load of rd.out in wait loop could be executed
+	 * speculatively.
+	 */
+	w = RTE_PFLOCK_PRES | (ticket & RTE_PFLOCK_PHID);
+	ticket = __atomic_fetch_add(&pf->rd.in, w, __ATOMIC_RELAXED);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_ACQUIRE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock held for writing.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_unlock(rte_pflock_t *pf)
+{
+	/* Migrate from write phase to read phase. */
+	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELEASE);
+
+	/* Allow other writers to continue. */
+	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_PFLOCK_H */
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..7692a531ccba 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -11,6 +11,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/ppc/include/rte_pflock.h b/lib/librte_eal/ppc/include/rte_pflock.h
new file mode 100644
index 000000000000..27c201b11d05
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_pflock.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+#ifndef _RTE_PFLOCK_PPC_64_H_
+#define _RTE_PFLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 1a6ad0b17342..f43645c20899 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -10,6 +10,7 @@ arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rtm.h',
diff --git a/lib/librte_eal/x86/include/rte_pflock.h b/lib/librte_eal/x86/include/rte_pflock.h
new file mode 100644
index 000000000000..c2d876062c08
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_pflock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_X86_64_H_
+#define _RTE_PFLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_X86_64_H_ */
-- 
2.30.2


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v5] pflock: implementation of phase-fair reader writer locks
  2021-04-02  1:42     ` [dpdk-dev] [PATCH v5] pflock: implementation of " Stephen Hemminger
@ 2021-04-06 21:56       ` Honnappa Nagarahalli
  2021-04-06 22:33         ` Stephen Hemminger
  2021-04-07 15:09       ` Ananyev, Konstantin
  1 sibling, 1 reply; 27+ messages in thread
From: Honnappa Nagarahalli @ 2021-04-06 21:56 UTC (permalink / raw)
  To: Stephen Hemminger, dev; +Cc: nd, Honnappa Nagarahalli, nd

<snip>

> 
> This is a new type of reader-writer lock that provides better fairness
> guarantees, which is better suited for typical DPDK applications.
> A pflock has two ticket pools, one for readers and one for writers.
> 
> Phase fair reader writer locks ensure that neither reader nor writer will be
> starved. Neither readers nor writers are preferred; they execute in alternating
> phases. All operations of the same type (reader or writer) that acquire the
> lock are handled in FIFO order.  Write operations are exclusive, and multiple
> read operations can be run together (until a write arrives).
> 
> A similar implementation is in Concurrency Kit package in FreeBSD.
> For more information see:
>    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
>     Real-Time Systems",
>     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Looks good.
Acked-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

One question below

> ---
> v5 - cleanup typos in the lock code comments
>      minor revision to unit test.
>      Note: the unit test is intentionally the same as other locking tests.
> 
>  app/test/meson.build                        |   2 +
>  app/test/test_pflock.c                      | 197 +++++++++++++++++++
>  lib/librte_eal/arm/include/meson.build      |   1 +
>  lib/librte_eal/arm/include/rte_pflock.h     |  18 ++
>  lib/librte_eal/include/generic/rte_pflock.h | 205 ++++++++++++++++++++
>  lib/librte_eal/ppc/include/meson.build      |   1 +
>  lib/librte_eal/ppc/include/rte_pflock.h     |  17 ++
>  lib/librte_eal/x86/include/meson.build      |   1 +
>  lib/librte_eal/x86/include/rte_pflock.h     |  18 ++
>  9 files changed, 460 insertions(+)
>  create mode 100644 app/test/test_pflock.c  create mode 100644
> lib/librte_eal/arm/include/rte_pflock.h
>  create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
>  create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
>  create mode 100644 lib/librte_eal/x86/include/rte_pflock.h
> 

<snip>

> diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c new file mode
> 100644 index 000000000000..6922bbc2f813
> --- /dev/null
> +++ b/app/test/test_pflock.c
> @@ -0,0 +1,197 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Microsoft Corporation  */
> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <inttypes.h>
> +#include <unistd.h>
> +#include <sys/queue.h>
> +#include <string.h>
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_per_lcore.h>
> +#include <rte_launch.h>
> +#include <rte_pause.h>
> +#include <rte_pflock.h>
> +#include <rte_eal.h>
> +#include <rte_lcore.h>
> +#include <rte_cycles.h>
> +
> +#include "test.h"
> +

<snip>

> +
> +/*
> + * - There is a global pflock and a table of pflocks (one per lcore).
> + *
> + * - The test function takes all of these locks and launches the
> + *   ``test_pflock_per_core()`` function on each core (except the main).
> + *
> + *   - The function takes the global write lock, display something,
> + *     then releases the global lock.
> + *   - Then, it takes the per-lcore write lock, display something, and
> + *     releases the per-core lock.
> + *   - Finally, a read lock is taken during 100 ms, then released.
> + *
> + * - The main function unlocks the per-lcore locks sequentially and
> + *   waits between each lock. This triggers the display of a message
> + *   for each core, in the correct order.
> + *
> + *   Then, it tries to take the global write lock and display the last
> + *   message. The autotest script checks that the message order is correct.
How does the autotest script do this?

> + */
> +static int
> +test_pflock(void)
> +{
> +	int i;
> +
> +	rte_pflock_init(&sl);
> +	for (i = 0; i < RTE_MAX_LCORE; i++)
> +		rte_pflock_init(&sl_tab[i]);
> +
> +	rte_pflock_write_lock(&sl);
> +
> +	RTE_LCORE_FOREACH_WORKER(i) {
> +		rte_pflock_write_lock(&sl_tab[i]);
> +		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
> +	}
> +
> +	rte_pflock_write_unlock(&sl);
> +
> +	RTE_LCORE_FOREACH_WORKER(i) {
> +		rte_pflock_write_unlock(&sl_tab[i]);
> +		rte_delay_ms(100);
> +	}
> +
> +	rte_pflock_write_lock(&sl);
> +	/* this message should be the last message of test */
> +	printf("Global write lock taken on main core %u\n", rte_lcore_id());
> +	rte_pflock_write_unlock(&sl);
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	if (test_pflock_perf() < 0)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);

<snip>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v5] pflock: implementation of phase-fair reader writer locks
  2021-04-06 21:56       ` Honnappa Nagarahalli
@ 2021-04-06 22:33         ` Stephen Hemminger
  2021-04-07  0:17           ` Honnappa Nagarahalli
  0 siblings, 1 reply; 27+ messages in thread
From: Stephen Hemminger @ 2021-04-06 22:33 UTC (permalink / raw)
  To: Honnappa Nagarahalli; +Cc: dev, nd

On Tue, 6 Apr 2021 21:56:05 +0000
Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:

> How does the autotest script does this?

This was copy-pasted from the original. I think the original comment was wrong here.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v5] pflock: implementation of phase-fair reader writer locks
  2021-04-06 22:33         ` Stephen Hemminger
@ 2021-04-07  0:17           ` Honnappa Nagarahalli
  0 siblings, 0 replies; 27+ messages in thread
From: Honnappa Nagarahalli @ 2021-04-07  0:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, nd, Honnappa Nagarahalli, nd

<snip>

> Subject: Re: [PATCH v5] pflock: implementation of phase-fair reader writer
> locks
> 
> On Tue, 6 Apr 2021 21:56:05 +0000
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:
> 
> > How does the autotest script does this?
> 
> This was copy-paste from original. Think the original was wrong here.
Ok, would it be possible to automate the test case validation?
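
For example (a sketch only, nothing in the current patch): instead of relying
on the printf ordering, each worker could bump a plain counter under the write
lock and the main core could check the total after rte_eal_mp_wait_lcore():

	static rte_pflock_t cnt_lock = RTE_PFLOCK_INITIALIZER;
	static uint64_t cnt;	/* only ever modified under the write lock */

	static int
	cnt_worker(__rte_unused void *arg)
	{
		unsigned int i;

		for (i = 0; i < MAX_LOOP; i++) {
			rte_pflock_write_lock(&cnt_lock);
			cnt++;
			rte_pflock_write_unlock(&cnt_lock);
		}
		return 0;
	}

	/* after launching cnt_worker on every lcore and waiting: */
	if (cnt != (uint64_t)MAX_LOOP * rte_lcore_count())
		return -1;	/* lost updates would mean the lock is broken */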

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v5] pflock: implementation of phase-fair reader writer locks
  2021-04-02  1:42     ` [dpdk-dev] [PATCH v5] pflock: implementation of " Stephen Hemminger
  2021-04-06 21:56       ` Honnappa Nagarahalli
@ 2021-04-07 15:09       ` Ananyev, Konstantin
  2021-04-14 15:36         ` David Marchand
  1 sibling, 1 reply; 27+ messages in thread
From: Ananyev, Konstantin @ 2021-04-07 15:09 UTC (permalink / raw)
  To: Stephen Hemminger, dev, Honnappa Nagarahalli



> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Stephen Hemminger
> Sent: Friday, April 2, 2021 2:43 AM
> To: dev@dpdk.org; Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Subject: [dpdk-dev] [PATCH v5] pflock: implementation of phase-fair reader writer locks
> 
> This is a new type of reader-writer lock that provides better fairness
> guarantees, which is better suited for typical DPDK applications.
> A pflock has two ticket pools, one for readers and one
> for writers.
> 
> Phase fair reader writer locks ensure that neither reader nor writer will be
> starved. Neither readers nor writers are preferred; they execute in
> alternating phases. All operations of the same type (reader or writer)
> that acquire the lock are handled in FIFO order.  Write
> operations are exclusive, and multiple read operations can be run
> together (until a write arrives).
> 
> A similar implementation is in Concurrency Kit package in FreeBSD.
> For more information see:
>    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
>     Real-Time Systems",
>     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

> 2.30.2


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [dpdk-dev] [PATCH v5] pflock: implementation of phase-fair reader writer locks
  2021-04-07 15:09       ` Ananyev, Konstantin
@ 2021-04-14 15:36         ` David Marchand
  0 siblings, 0 replies; 27+ messages in thread
From: David Marchand @ 2021-04-14 15:36 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Honnappa Nagarahalli, Ananyev, Konstantin

On Wed, Apr 7, 2021 at 5:10 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> > This is a new type of reader-writer lock that provides better fairness
> > guarantees, which is better suited for typical DPDK applications.
> > A pflock has two ticket pools, one for readers and one
> > for writers.
> >
> > Phase fair reader writer locks ensure that neither reader nor writer will be
> > starved. Neither readers nor writers are preferred; they execute in
> > alternating phases. All operations of the same type (reader or writer)
> > that acquire the lock are handled in FIFO order.  Write
> > operations are exclusive, and multiple read operations can be run
> > together (until a write arrives).
> >
> > A similar implementation is in Concurrency Kit package in FreeBSD.
> > For more information see:
> >    "Reader-Writer Synchronization for Shared-Memory Multiprocessor
> >     Real-Time Systems",
> >     http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf
> >
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

I added a short release note update.
I chose "phase-fair" as the unique way to identify this new API.
I updated the patch with this for consistency.
I put this new API under the EAL common code section in MAINTAINERS.


A comparison of when/why to prefer this over the existing locks would
be helpful in the developer guide.
There is also a question from Honnappa about test validation.
Can you follow up on those two points?


Applied, thanks.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2021-04-14 15:36 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-12  6:05 [dpdk-dev] [RFC] eal: add fair reader writer lock Stephen Hemminger
2021-01-14 17:34 ` [dpdk-dev] [PATCH v1] eal: add ticket based " Stephen Hemminger
2021-01-27 10:25   ` Ruifeng Wang
2021-01-28  1:32     ` Stephen Hemminger
2021-01-28  1:16   ` [dpdk-dev] [PATCH v2] " Stephen Hemminger
2021-02-12  1:38   ` [dpdk-dev] [RFC] pflock: add implementation of phase-fair locks Stephen Hemminger
2021-02-28 17:21     ` [dpdk-dev] [PATCH v1] pflock: implementation of phase-fair reader writer locks Stephen Hemminger
2021-03-03 18:30     ` [dpdk-dev] [PATCH v2] " Stephen Hemminger
2021-03-03 19:19     ` [dpdk-dev] [PATCH v3] " Stephen Hemminger
2021-03-26 17:17       ` Stephen Hemminger
2021-03-29  3:14       ` Honnappa Nagarahalli
2021-03-29 17:22         ` Stephen Hemminger
2021-03-29 18:09           ` Honnappa Nagarahalli
2021-03-29 19:58         ` Stephen Hemminger
2021-03-30  0:18           ` Honnappa Nagarahalli
2021-03-30  4:56             ` Stephen Hemminger
2021-03-30  5:00     ` [dpdk-dev] [PATCH v4] pflock: add " Stephen Hemminger
2021-03-30  5:14       ` Stephen Hemminger
2021-03-31  4:19       ` Honnappa Nagarahalli
2021-03-31 16:32         ` Stephen Hemminger
2021-04-02  1:37         ` Stephen Hemminger
2021-04-02  1:42     ` [dpdk-dev] [PATCH v5] pflock: implementation of " Stephen Hemminger
2021-04-06 21:56       ` Honnappa Nagarahalli
2021-04-06 22:33         ` Stephen Hemminger
2021-04-07  0:17           ` Honnappa Nagarahalli
2021-04-07 15:09       ` Ananyev, Konstantin
2021-04-14 15:36         ` David Marchand

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).