* [dpdk-dev] [PATCH 1/2] test/test: introduce new test-case for rte_smp_mb()
@ 2017-12-01 11:12 Konstantin Ananyev
2017-12-01 11:12 ` [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
2017-12-11 17:08 ` [dpdk-dev] [PATCH 1/2] test/test: introduce new test-case for rte_smp_mb() Bruce Richardson
0 siblings, 2 replies; 26+ messages in thread
From: Konstantin Ananyev @ 2017-12-01 11:12 UTC (permalink / raw)
To: dev, dev; +Cc: Konstantin Ananyev
Simple functional test for rte_smp_mb() implementations.
Also, when executed on a single lcore, it can be used as a rough
estimation of how many cycles a particular implementation of
rte_smp_mb() might take.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
test/test/Makefile | 1 +
test/test/test_mb.c | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 279 insertions(+)
create mode 100644 test/test/test_mb.c
diff --git a/test/test/Makefile b/test/test/Makefile
index bb54c9808..3134283a8 100644
--- a/test/test/Makefile
+++ b/test/test/Makefile
@@ -95,6 +95,7 @@ SRCS-y += test_spinlock.c
SRCS-y += test_memory.c
SRCS-y += test_memzone.c
SRCS-y += test_bitmap.c
+SRCS-y += test_mb.c
SRCS-y += test_ring.c
SRCS-y += test_ring_perf.c
diff --git a/test/test/test_mb.c b/test/test/test_mb.c
new file mode 100644
index 000000000..5526a0366
--- /dev/null
+++ b/test/test/test_mb.c
@@ -0,0 +1,278 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ /*
+ * This is a simple functional test for the rte_smp_mb() implementation.
+ * I.e. make sure that LOAD and STORE operations that precede the
+ * rte_smp_mb() call are globally visible across the lcores
+ * before the LOAD and STORE operations that follow it.
+ * The test uses a simple implementation of Peterson's lock algorithm
+ * for two execution units to make sure that rte_smp_mb() prevents
+ * store-load reordering from happening.
+ * Also, when executed on a single lcore, it can be used as an approximate
+ * estimation of the number of cycles a particular implementation of
+ * rte_smp_mb() will take.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_pause.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_vect.h>
+#include <rte_debug.h>
+
+#include "test.h"
+
+#define ADD_MAX 8
+#define ITER_MAX 0x1000000
+
+enum plock_use_type {
+ USE_MB,
+ USE_SMP_MB,
+ USE_NUM
+};
+
+struct plock {
+ volatile uint32_t flag[2];
+ volatile uint32_t victim;
+ enum plock_use_type utype;
+};
+
+struct plock_test {
+ struct plock lock;
+ uint32_t val;
+ uint32_t iter;
+};
+
+struct lcore_plock_test {
+ struct plock_test *pt[2];
+ uint32_t sum[2];
+ uint32_t iter;
+ uint32_t lc;
+};
+
+static inline void
+store_load_barrier(uint32_t utype)
+{
+ if (utype == USE_MB)
+ rte_mb();
+ else if (utype == USE_SMP_MB)
+ rte_smp_mb();
+ else
+ RTE_VERIFY(0);
+}
+
+static void
+plock_lock(struct plock *l, uint32_t self)
+{
+ uint32_t other;
+
+ other = self ^ 1;
+
+ l->flag[self] = 1;
+ l->victim = self;
+
+ store_load_barrier(l->utype);
+
+ while (l->flag[other] == 1 && l->victim == self)
+ rte_pause();
+}
+
+static void
+plock_unlock(struct plock *l, uint32_t self)
+{
+ rte_smp_wmb();
+ l->flag[self] = 0;
+}
+
+static void
+plock_reset(struct plock *l, enum plock_use_type utype)
+{
+ memset(l, 0, sizeof(*l));
+ l->utype = utype;
+}
+
+static void
+plock_add(struct plock_test *pt, uint32_t self, uint32_t n)
+{
+ plock_lock(&pt->lock, self);
+ pt->iter++;
+ pt->val += n;
+ plock_unlock(&pt->lock, self);
+}
+
+static int
+plock_test1_lcore(void *data)
+{
+ uint64_t tm;
+ uint32_t i, lc, ln, n;
+ struct lcore_plock_test *lpt;
+
+ lpt = data;
+ lc = rte_lcore_id();
+
+ for (ln = rte_lcore_count(); ln != 0 && lpt->lc != lc; lpt++, ln--)
+ ;
+
+ if (ln == 0) {
+ printf("%s(%u) error at init\n", __func__, lc);
+ return -1;
+ }
+
+ n = rte_rand() % ADD_MAX;
+ tm = rte_get_timer_cycles();
+
+ for (i = 0; i != lpt->iter; i++) {
+
+ plock_add(lpt->pt[0], 0, n);
+ plock_add(lpt->pt[1], 1, n);
+
+ lpt->sum[0] += n;
+ lpt->sum[1] += n;
+
+ n = (n + 1) % ADD_MAX;
+ }
+
+ tm = rte_get_timer_cycles() - tm;
+
+ printf("%s(%u): %u iterations finished, in %" PRIu64
+ " cycles, %#Lf cycles/iteration, "
+ "local sum={%u, %u}\n",
+ __func__, lc, i, tm, (long double)tm / i,
+ lpt->sum[0], lpt->sum[1]);
+ return 0;
+}
+
+static int
+plock_test(uint32_t iter, enum plock_use_type utype)
+{
+ int32_t rc;
+ uint32_t i, lc, n;
+ uint32_t *sum;
+ struct plock_test *pt;
+ struct lcore_plock_test *lpt;
+
+ n = rte_lcore_count();
+ pt = calloc(n + 1, sizeof(*pt));
+ lpt = calloc(n, sizeof(*lpt));
+ sum = calloc(n + 1, sizeof(*sum));
+
+ printf("%s(iter=%u, utype=%u) started on %u lcores\n",
+ __func__, iter, utype, n);
+
+ if (pt == NULL || lpt == NULL || sum == NULL) {
+ printf("%s: failed to allocate memory for %u lcores\n",
+ __func__, n);
+ free(pt);
+ free(lpt);
+ free(sum);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i != n + 1; i++)
+ plock_reset(&pt[i].lock, utype);
+
+ i = 0;
+ RTE_LCORE_FOREACH(lc) {
+
+ lpt[i].lc = lc;
+ lpt[i].iter = iter;
+ lpt[i].pt[0] = pt + i;
+ lpt[i].pt[1] = pt + i + 1;
+ i++;
+ }
+
+ lpt[i - 1].pt[1] = pt;
+
+ for (i = 0; i != n; i++)
+ printf("lpt[%u]={lc=%u, pt={%p, %p},};\n",
+ i, lpt[i].lc, lpt[i].pt[0], lpt[i].pt[1]);
+
+
+ rte_eal_mp_remote_launch(plock_test1_lcore, lpt, CALL_MASTER);
+ rte_eal_mp_wait_lcore();
+
+ for (i = 0; i != n; i++) {
+ sum[i] += lpt[i].sum[0];
+ sum[i + 1] += lpt[i].sum[1];
+ }
+
+ sum[0] += sum[i];
+
+ rc = 0;
+ for (i = 0; i != n; i++) {
+ printf("%s: sum[%u]=%u, pt[%u].val=%u, pt[%u].iter=%u;\n",
+ __func__, i, sum[i], i, pt[i].val, i, pt[i].iter);
+ if (sum[i] != pt[i].val || 2 * iter != pt[i].iter) {
+ printf("error: local and shared sums don't much\n");
+ rc = -1;
+ }
+ }
+
+ free(pt);
+ free(lpt);
+ free(sum);
+
+ printf("%s(utype=%u) returns %d\n", __func__, utype, rc);
+ return rc;
+}
+
+static int
+test_mb(void)
+{
+ int32_t i, ret, rc[USE_NUM];
+
+ for (i = 0; i != RTE_DIM(rc); i++)
+ rc[i] = plock_test(ITER_MAX, i);
+
+ ret = 0;
+ for (i = 0; i != RTE_DIM(rc); i++) {
+ printf("%s for utype=%d %s\n",
+ __func__, i, rc[i] == 0 ? "passed" : "failed");
+ ret |= rc[i];
+ }
+
+ return ret;
+}
+
+REGISTER_TEST_COMMAND(mb_autotest, test_mb);
--
2.13.5
* [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-01 11:12 [dpdk-dev] [PATCH 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
@ 2017-12-01 11:12 ` Konstantin Ananyev
2017-12-01 18:04 ` Stephen Hemminger
` (4 more replies)
2017-12-11 17:08 ` [dpdk-dev] [PATCH 1/2] test/test: introduce new test-case for rte_smp_mb() Bruce Richardson
1 sibling, 5 replies; 26+ messages in thread
From: Konstantin Ananyev @ 2017-12-01 11:12 UTC (permalink / raw)
To: dev, dev; +Cc: Konstantin Ananyev
On x86 it is possible to use lock-prefixed instructions to get
a similar effect to mfence.
As pointed out by the Java folks, on most modern HW that gives better
performance than using mfence:
https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
This patch adopts that technique for the rte_smp_mb() implementation.
On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
i.e. from ~110 to ~55 cycles per operation.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
.../common/include/arch/x86/rte_atomic.h | 45 +++++++++++++++++++++-
1 file changed, 43 insertions(+), 2 deletions(-)
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
index 4eac66631..07b7fa7f7 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
@@ -55,12 +55,53 @@ extern "C" {
#define rte_rmb() _mm_lfence()
-#define rte_smp_mb() rte_mb()
-
#define rte_smp_wmb() rte_compiler_barrier()
#define rte_smp_rmb() rte_compiler_barrier()
+/*
+ * From Intel Software Development Manual; Vol 3;
+ * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
+ * ...
+ * . Reads are not reordered with other reads.
+ * . Writes are not reordered with older reads.
+ * . Writes to memory are not reordered with other writes,
+ * with the following exceptions:
+ * . streaming stores (writes) executed with the non-temporal move
+ * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
+ * . string operations (see Section 8.2.4.1).
+ * ...
+ * . Reads may be reordered with older writes to different locations but not
+ * with older writes to the same location.
+ * . Reads or writes cannot be reordered with I/O instructions,
+ * locked instructions, or serializing instructions.
+ * . Reads cannot pass earlier LFENCE and MFENCE instructions.
+ * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
+ * . LFENCE instructions cannot pass earlier reads.
+ * . SFENCE instructions cannot pass earlier writes ...
+ * . MFENCE instructions cannot pass earlier reads, writes ...
+ *
+ * As pointed out by the Java folks, this makes it possible to use
+ * lock-prefixed instructions to get the same effect as mfence, and on most
+ * modern HW that gives better performance than using mfence:
+ * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ * So below we use that technique for the rte_smp_mb() implementation.
+ */
+
+#ifdef RTE_ARCH_I686
+#define RTE_SP RTE_STR(esp)
+#else
+#define RTE_SP RTE_STR(rsp)
+#endif
+
+#define RTE_MB_DUMMY_MEMP "-128(%%" RTE_SP ")"
+
+static __rte_always_inline void
+rte_smp_mb(void)
+{
+ asm volatile("lock addl $0," RTE_MB_DUMMY_MEMP "; " ::: "memory");
+}
+
#define rte_io_mb() rte_mb()
#define rte_io_wmb() rte_compiler_barrier()
--
2.13.5
* Re: [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-01 11:12 ` [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
@ 2017-12-01 18:04 ` Stephen Hemminger
2017-12-01 23:08 ` Ananyev, Konstantin
2017-12-11 17:11 ` Bruce Richardson
` (3 subsequent siblings)
4 siblings, 1 reply; 26+ messages in thread
From: Stephen Hemminger @ 2017-12-01 18:04 UTC (permalink / raw)
To: Konstantin Ananyev; +Cc: dev
On Fri, 1 Dec 2017 11:12:51 +0000
Konstantin Ananyev <konstantin.ananyev@intel.com> wrote:
> On x86 it is possible to use lock-prefixed instructions to get
> a similar effect to mfence.
> As pointed out by the Java folks, on most modern HW that gives better
> performance than using mfence:
> https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> This patch adopts that technique for the rte_smp_mb() implementation.
> On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
> i.e. from ~110 to ~55 cycles per operation.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> .../common/include/arch/x86/rte_atomic.h | 45 +++++++++++++++++++++-
> 1 file changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> index 4eac66631..07b7fa7f7 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> @@ -55,12 +55,53 @@ extern "C" {
>
> #define rte_rmb() _mm_lfence()
>
> -#define rte_smp_mb() rte_mb()
> -
> #define rte_smp_wmb() rte_compiler_barrier()
>
> #define rte_smp_rmb() rte_compiler_barrier()
>
> +/*
> + * From Intel Software Development Manual; Vol 3;
> + * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
> + * ...
> + * . Reads are not reordered with other reads.
> + * . Writes are not reordered with older reads.
> + * . Writes to memory are not reordered with other writes,
> + * with the following exceptions:
> + * . streaming stores (writes) executed with the non-temporal move
> + * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
> + * . string operations (see Section 8.2.4.1).
> + * ...
> + * . Reads may be reordered with older writes to different locations but not
> + * with older writes to the same location.
> + * . Reads or writes cannot be reordered with I/O instructions,
> + * locked instructions, or serializing instructions.
> + * . Reads cannot pass earlier LFENCE and MFENCE instructions.
> + * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
> + * . LFENCE instructions cannot pass earlier reads.
> + * . SFENCE instructions cannot pass earlier writes ...
> + * . MFENCE instructions cannot pass earlier reads, writes ...
> + *
> + * As pointed out by the Java folks, this makes it possible to use
> + * lock-prefixed instructions to get the same effect as mfence, and on most
> + * modern HW that gives better performance than using mfence:
> + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> + * So below we use that technique for the rte_smp_mb() implementation.
> + */
> +
> +#ifdef RTE_ARCH_I686
> +#define RTE_SP RTE_STR(esp)
> +#else
> +#define RTE_SP RTE_STR(rsp)
> +#endif
> +
> +#define RTE_MB_DUMMY_MEMP "-128(%%" RTE_SP ")"
> +
> +static __rte_always_inline void
> +rte_smp_mb(void)
> +{
> + asm volatile("lock addl $0," RTE_MB_DUMMY_MEMP "; " ::: "memory");
> +}
> +
> #define rte_io_mb() rte_mb()
>
> #define rte_io_wmb() rte_compiler_barrier()
The lock instruction is a stronger barrier than the compiler barrier
and has a worse performance impact. Are you sure it is necessary to use it in DPDK?
The Linux kernel has successfully used a simple compiler reordering barrier for years.
Don't confuse rte_smp_mb with the required barrier for talking to I/O devices.
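For context, the store-load case being debated here is the classic Dekker/Peterson
pattern used by the test in patch 1/2. A minimal sketch (illustrative only, not part
of either patch; the names are made up):

#include <stdint.h>
#include <rte_atomic.h>

/*
 * Two lcores, two flags: each lcore publishes its own flag, then reads the
 * other one. With a full store-load barrier in between, at least one lcore
 * must observe the other's store; with only a compiler barrier the CPU is
 * still free to hoist the LOAD above the STORE, and both lcores can read 0.
 */
static volatile uint32_t flag[2];

static inline uint32_t
publish_then_check(uint32_t self)
{
	flag[self] = 1;        /* STORE: announce ourselves */
	rte_smp_mb();          /* the barrier under discussion */
	return flag[self ^ 1]; /* LOAD: 0 on both lcores means reordering */
}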
* Re: [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-01 18:04 ` Stephen Hemminger
@ 2017-12-01 23:08 ` Ananyev, Konstantin
2018-03-08 21:15 ` Stephen Hemminger
0 siblings, 1 reply; 26+ messages in thread
From: Ananyev, Konstantin @ 2017-12-01 23:08 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: dev
Hi Stephen,
> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Friday, December 1, 2017 6:04 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
>
> On Fri, 1 Dec 2017 11:12:51 +0000
> Konstantin Ananyev <konstantin.ananyev@intel.com> wrote:
>
> > On x86 it is possible to use lock-prefixed instructions to get
> > a similar effect to mfence.
> > As pointed out by the Java folks, on most modern HW that gives better
> > performance than using mfence:
> > https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > This patch adopts that technique for the rte_smp_mb() implementation.
> > On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
> > i.e. from ~110 to ~55 cycles per operation.
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > ---
> > .../common/include/arch/x86/rte_atomic.h | 45 +++++++++++++++++++++-
> > 1 file changed, 43 insertions(+), 2 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > index 4eac66631..07b7fa7f7 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > @@ -55,12 +55,53 @@ extern "C" {
> >
> > #define rte_rmb() _mm_lfence()
> >
> > -#define rte_smp_mb() rte_mb()
> > -
> > #define rte_smp_wmb() rte_compiler_barrier()
> >
> > #define rte_smp_rmb() rte_compiler_barrier()
> >
> > +/*
> > + * From Intel Software Development Manual; Vol 3;
> > + * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
> > + * ...
> > + * . Reads are not reordered with other reads.
> > + * . Writes are not reordered with older reads.
> > + * . Writes to memory are not reordered with other writes,
> > + * with the following exceptions:
> > + * . streaming stores (writes) executed with the non-temporal move
> > + * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
> > + * . string operations (see Section 8.2.4.1).
> > + * ...
> > + * . Reads may be reordered with older writes to different locations but not
> > + * with older writes to the same location.
> > + * . Reads or writes cannot be reordered with I/O instructions,
> > + * locked instructions, or serializing instructions.
> > + * . Reads cannot pass earlier LFENCE and MFENCE instructions.
> > + * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
> > + * . LFENCE instructions cannot pass earlier reads.
> > + * . SFENCE instructions cannot pass earlier writes ...
> > + * . MFENCE instructions cannot pass earlier reads, writes ...
> > + *
> > + * As pointed out by the Java folks, this makes it possible to use
> > + * lock-prefixed instructions to get the same effect as mfence, and on most
> > + * modern HW that gives better performance than using mfence:
> > + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > + * So below we use that technique for the rte_smp_mb() implementation.
> > + */
> > +
> > +#ifdef RTE_ARCH_I686
> > +#define RTE_SP RTE_STR(esp)
> > +#else
> > +#define RTE_SP RTE_STR(rsp)
> > +#endif
> > +
> > +#define RTE_MB_DUMMY_MEMP "-128(%%" RTE_SP ")"
> > +
> > +static __rte_always_inline void
> > +rte_smp_mb(void)
> > +{
> > + asm volatile("lock addl $0," RTE_MB_DUMMY_MEMP "; " ::: "memory");
> > +}
> > +
> > #define rte_io_mb() rte_mb()
> >
> > #define rte_io_wmb() rte_compiler_barrier()
>
> The lock instruction is a stronger barrier than the compiler barrier
> and has a worse performance impact. Are you sure it is necessary to use it in DPDK?
> The Linux kernel has successfully used a simple compiler reordering barrier for years.
Where do you see a compiler barrier?
Right now for x86, rte_smp_mb()==rte_mb()==mfence.
So I am replacing mfence with 'lock add'.
As the comment above says, on most modern x86 systems it is faster,
while still preserving memory ordering.
Konstantin
>
> Don't confuse rte_smp_mb with the required barrier for talking to I/O devices.
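To make the change being described concrete, a condensed before/after sketch of the
x86 definition (64-bit variant only; the 32-bit build uses %esp instead of %rsp, and
__rte_always_inline comes from the usual EAL headers):

/* before this patch, on x86: rte_smp_mb() == rte_mb(), i.e. a full mfence */
/* #define rte_smp_mb() rte_mb() */

/* after: a lock-prefixed add to a dummy stack location; same store-load
 * ordering guarantee, but cheaper on most modern x86 CPUs
 */
static __rte_always_inline void
rte_smp_mb(void)
{
	asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
}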
* Re: [dpdk-dev] [PATCH 1/2] test/test: introduce new test-case for rte_smp_mb()
2017-12-01 11:12 [dpdk-dev] [PATCH 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
2017-12-01 11:12 ` [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
@ 2017-12-11 17:08 ` Bruce Richardson
1 sibling, 0 replies; 26+ messages in thread
From: Bruce Richardson @ 2017-12-11 17:08 UTC (permalink / raw)
To: Konstantin Ananyev; +Cc: dev
On Fri, Dec 01, 2017 at 11:12:50AM +0000, Konstantin Ananyev wrote:
> Simple functional test for rte_smp_mb() implementations.
> Also, when executed on a single lcore, it can be used as a rough
> estimation of how many cycles a particular implementation of
> rte_smp_mb() might take.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> test/test/Makefile | 1 +
> test/test/test_mb.c | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 279 insertions(+)
> create mode 100644 test/test/test_mb.c
>
First pass comment on this patch is that it really could do with some
comments explaining what is happening and why. Although it's meant to be
a "simple functional test", it doesn't appear to be so on first reading,
without comments as to how the test is meant to work and what the
expected output values are.
/Bruce
* Re: [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-01 11:12 ` [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
2017-12-01 18:04 ` Stephen Hemminger
@ 2017-12-11 17:11 ` Bruce Richardson
2017-12-11 17:30 ` Ananyev, Konstantin
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 0/2] eal/x86: Optimize rte_smp_mb() and create a new test case for it Konstantin Ananyev
` (2 subsequent siblings)
4 siblings, 1 reply; 26+ messages in thread
From: Bruce Richardson @ 2017-12-11 17:11 UTC (permalink / raw)
To: Konstantin Ananyev; +Cc: dev
On Fri, Dec 01, 2017 at 11:12:51AM +0000, Konstantin Ananyev wrote:
> On x86 it is possible to use lock-prefixed instructions to get
> a similar effect to mfence.
> As pointed out by the Java folks, on most modern HW that gives better
> performance than using mfence:
> https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> This patch adopts that technique for the rte_smp_mb() implementation.
> On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
> i.e. from ~110 to ~55 cycles per operation.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> .../common/include/arch/x86/rte_atomic.h | 45 +++++++++++++++++++++-
> 1 file changed, 43 insertions(+), 2 deletions(-)
>
<snip>
> + * As pointed out by the Java folks, this makes it possible to use
> + * lock-prefixed instructions to get the same effect as mfence, and on most
> + * modern HW that gives better performance than using mfence:
> + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> + * So below we use that technique for the rte_smp_mb() implementation.
> + */
> +
> +#ifdef RTE_ARCH_I686
> +#define RTE_SP RTE_STR(esp)
> +#else
> +#define RTE_SP RTE_STR(rsp)
> +#endif
> +
> +#define RTE_MB_DUMMY_MEMP "-128(%%" RTE_SP ")"
> +
> +static __rte_always_inline void
> +rte_smp_mb(void)
> +{
> + asm volatile("lock addl $0," RTE_MB_DUMMY_MEMP "; " ::: "memory");
> +}
Rather than #defining RTE_SP and RTE_MB_DUMMY_MEMP, why not just put the
#ifdef into the rte_smp_mb itself and have two asm volatile lines with
hard-coded register names in them? It would be shorter and I think a lot
easier to read.
/Bruce
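For reference, the refactor Bruce describes would look roughly like the shape the
function eventually takes in v2 further down the thread:

static __rte_always_inline void
rte_smp_mb(void)
{
#ifdef RTE_ARCH_I686
	asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
#else
	asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
#endif
}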
* Re: [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-11 17:11 ` Bruce Richardson
@ 2017-12-11 17:30 ` Ananyev, Konstantin
0 siblings, 0 replies; 26+ messages in thread
From: Ananyev, Konstantin @ 2017-12-11 17:30 UTC (permalink / raw)
To: Richardson, Bruce; +Cc: dev
Hi Bruce,
> -----Original Message-----
> From: Richardson, Bruce
> Sent: Monday, December 11, 2017 5:11 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
>
> On Fri, Dec 01, 2017 at 11:12:51AM +0000, Konstantin Ananyev wrote:
> > On x86 it is possible to use lock-prefixed instructions to get
> > a similar effect to mfence.
> > As pointed out by the Java folks, on most modern HW that gives better
> > performance than using mfence:
> > https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > This patch adopts that technique for the rte_smp_mb() implementation.
> > On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
> > i.e. from ~110 to ~55 cycles per operation.
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > ---
> > .../common/include/arch/x86/rte_atomic.h | 45 +++++++++++++++++++++-
> > 1 file changed, 43 insertions(+), 2 deletions(-)
> >
> <snip>
> > + * As pointed out by the Java folks, this makes it possible to use
> > + * lock-prefixed instructions to get the same effect as mfence, and on most
> > + * modern HW that gives better performance than using mfence:
> > + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > + * So below we use that technique for the rte_smp_mb() implementation.
> > + */
> > +
> > +#ifdef RTE_ARCH_I686
> > +#define RTE_SP RTE_STR(esp)
> > +#else
> > +#define RTE_SP RTE_STR(rsp)
> > +#endif
> > +
> > +#define RTE_MB_DUMMY_MEMP "-128(%%" RTE_SP ")"
> > +
> > +static __rte_always_inline void
> > +rte_smp_mb(void)
> > +{
> > + asm volatile("lock addl $0," RTE_MB_DUMMY_MEMP "; " ::: "memory");
> > +}
>
> Rather than #defining RTE_SP and RTE_MB_DUMMY_MEMP, why not just put the
> #ifdef into the rte_smp_mb itself and have two asm volatile lines with
> hard-coded register names in them? It would be shorter and I think a lot
> easier to read.
Fine by me.
Any other thoughts from anyone till I submit v2?
Konstantin
* [dpdk-dev] [PATCH v2 0/2] eal/x86: Optimize rte_smp_mb() and create a new test case for it
2017-12-01 11:12 ` [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
2017-12-01 18:04 ` Stephen Hemminger
2017-12-11 17:11 ` Bruce Richardson
@ 2017-12-18 15:34 ` Konstantin Ananyev
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 " Konstantin Ananyev
4 siblings, 0 replies; 26+ messages in thread
From: Konstantin Ananyev @ 2017-12-18 15:34 UTC (permalink / raw)
To: dev; +Cc: Konstantin Ananyev
v2 changes:
Address Bruce's code-review comments:
- get rid of macros to simplify the code
- add more comments to the test case
Konstantin Ananyev (2):
test/test: introduce new test-case for rte_smp_mb()
eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
.../common/include/arch/x86/rte_atomic.h | 44 ++-
test/test/Makefile | 1 +
test/test/test_mb.c | 315 +++++++++++++++++++++
3 files changed, 358 insertions(+), 2 deletions(-)
create mode 100644 test/test/test_mb.c
--
2.13.5
* [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb()
2017-12-01 11:12 ` [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
` (2 preceding siblings ...)
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 0/2] eal/x86: Optimize rte_smp_mb() and create a new test case for it Konstantin Ananyev
@ 2017-12-18 15:34 ` Konstantin Ananyev
2018-01-12 17:23 ` Thomas Monjalon
` (5 more replies)
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 " Konstantin Ananyev
4 siblings, 6 replies; 26+ messages in thread
From: Konstantin Ananyev @ 2017-12-18 15:34 UTC (permalink / raw)
To: dev; +Cc: Konstantin Ananyev
Simple functional test for rte_smp_mb() implementations.
Also, when executed on a single lcore, it can be used as a rough
estimation of how many cycles a particular implementation of
rte_smp_mb() might take.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
test/test/Makefile | 1 +
test/test/test_mb.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 316 insertions(+)
create mode 100644 test/test/test_mb.c
diff --git a/test/test/Makefile b/test/test/Makefile
index bb54c9808..3134283a8 100644
--- a/test/test/Makefile
+++ b/test/test/Makefile
@@ -95,6 +95,7 @@ SRCS-y += test_spinlock.c
SRCS-y += test_memory.c
SRCS-y += test_memzone.c
SRCS-y += test_bitmap.c
+SRCS-y += test_mb.c
SRCS-y += test_ring.c
SRCS-y += test_ring_perf.c
diff --git a/test/test/test_mb.c b/test/test/test_mb.c
new file mode 100644
index 000000000..52c73fb6b
--- /dev/null
+++ b/test/test/test_mb.c
@@ -0,0 +1,315 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ /*
+ * This is a simple functional test for the rte_smp_mb() implementation.
+ * I.e. make sure that LOAD and STORE operations that precede the
+ * rte_smp_mb() call are globally visible across the lcores
+ * before the LOAD and STORE operations that follow it.
+ * The test uses a simple implementation of Peterson's lock algorithm
+ * (https://en.wikipedia.org/wiki/Peterson%27s_algorithm)
+ * for two execution units to make sure that rte_smp_mb() prevents
+ * store-load reordering from happening.
+ * Also, when executed on a single lcore, it can be used as an approximate
+ * estimation of the number of cycles a particular implementation of
+ * rte_smp_mb() will take.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_pause.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_vect.h>
+#include <rte_debug.h>
+
+#include "test.h"
+
+#define ADD_MAX 8
+#define ITER_MAX 0x1000000
+
+enum plock_use_type {
+ USE_MB,
+ USE_SMP_MB,
+ USE_NUM
+};
+
+struct plock {
+ volatile uint32_t flag[2];
+ volatile uint32_t victim;
+ enum plock_use_type utype;
+};
+
+/*
+ * A lock plus the two counters it protects.
+ */
+struct plock_test {
+ struct plock lock;
+ uint32_t val;
+ uint32_t iter;
+};
+
+/*
+ * Each active lcore shares a plock_test struct with its left and right
+ * neighbours.
+ */
+struct lcore_plock_test {
+ struct plock_test *pt[2]; /* shared, lock-protected data */
+ uint32_t sum[2]; /* local copy of the shared data */
+ uint32_t iter; /* number of iterations to perform */
+ uint32_t lc; /* given lcore id */
+};
+
+static inline void
+store_load_barrier(uint32_t utype)
+{
+ if (utype == USE_MB)
+ rte_mb();
+ else if (utype == USE_SMP_MB)
+ rte_smp_mb();
+ else
+ RTE_VERIFY(0);
+}
+
+/*
+ * Peterson lock implementation.
+ */
+static void
+plock_lock(struct plock *l, uint32_t self)
+{
+ uint32_t other;
+
+ other = self ^ 1;
+
+ l->flag[self] = 1;
+ l->victim = self;
+
+ store_load_barrier(l->utype);
+
+ while (l->flag[other] == 1 && l->victim == self)
+ rte_pause();
+}
+
+static void
+plock_unlock(struct plock *l, uint32_t self)
+{
+ rte_smp_wmb();
+ l->flag[self] = 0;
+}
+
+static void
+plock_reset(struct plock *l, enum plock_use_type utype)
+{
+ memset(l, 0, sizeof(*l));
+ l->utype = utype;
+}
+
+/*
+ * grab the lock, update both counters, release the lock.
+ */
+static void
+plock_add(struct plock_test *pt, uint32_t self, uint32_t n)
+{
+ plock_lock(&pt->lock, self);
+ pt->iter++;
+ pt->val += n;
+ plock_unlock(&pt->lock, self);
+}
+
+static int
+plock_test1_lcore(void *data)
+{
+ uint64_t tm;
+ uint32_t i, lc, ln, n;
+ struct lcore_plock_test *lpt;
+
+ lpt = data;
+ lc = rte_lcore_id();
+
+ /* find lcore_plock_test struct for given lcore */
+ for (ln = rte_lcore_count(); ln != 0 && lpt->lc != lc; lpt++, ln--)
+ ;
+
+ if (ln == 0) {
+ printf("%s(%u) error at init\n", __func__, lc);
+ return -1;
+ }
+
+ n = rte_rand() % ADD_MAX;
+ tm = rte_get_timer_cycles();
+
+ /*
+ * for each iteration:
+ * - update the shared, lock-protected data in a safe manner
+ * - update local copy of the shared data
+ */
+ for (i = 0; i != lpt->iter; i++) {
+
+ plock_add(lpt->pt[0], 0, n);
+ plock_add(lpt->pt[1], 1, n);
+
+ lpt->sum[0] += n;
+ lpt->sum[1] += n;
+
+ n = (n + 1) % ADD_MAX;
+ }
+
+ tm = rte_get_timer_cycles() - tm;
+
+ printf("%s(%u): %u iterations finished, in %" PRIu64
+ " cycles, %#Lf cycles/iteration, "
+ "local sum={%u, %u}\n",
+ __func__, lc, i, tm, (long double)tm / i,
+ lpt->sum[0], lpt->sum[1]);
+ return 0;
+}
+
+/*
+ * For N active lcores we allocate N+1 plock_test structures.
+ * Each active lcore shares one plock_test structure with its
+ * left lcore neighbor and one plock_test structure with its
+ * right lcore neighbor.
+ * During the test each lcore updates data in both shared structures and
+ * its local copies. Then at validation phase we check that our shared
+ * and local data are the same.
+ */
+static int
+plock_test(uint32_t iter, enum plock_use_type utype)
+{
+ int32_t rc;
+ uint32_t i, lc, n;
+ uint32_t *sum;
+ struct plock_test *pt;
+ struct lcore_plock_test *lpt;
+
+ /* init phase, allocate and initialize shared data */
+
+ n = rte_lcore_count();
+ pt = calloc(n + 1, sizeof(*pt));
+ lpt = calloc(n, sizeof(*lpt));
+ sum = calloc(n + 1, sizeof(*sum));
+
+ printf("%s(iter=%u, utype=%u) started on %u lcores\n",
+ __func__, iter, utype, n);
+
+ if (pt == NULL || lpt == NULL || sum == NULL) {
+ printf("%s: failed to allocate memory for %u lcores\n",
+ __func__, n);
+ free(pt);
+ free(lpt);
+ free(sum);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i != n + 1; i++)
+ plock_reset(&pt[i].lock, utype);
+
+ i = 0;
+ RTE_LCORE_FOREACH(lc) {
+
+ lpt[i].lc = lc;
+ lpt[i].iter = iter;
+ lpt[i].pt[0] = pt + i;
+ lpt[i].pt[1] = pt + i + 1;
+ i++;
+ }
+
+ lpt[i - 1].pt[1] = pt;
+
+ for (i = 0; i != n; i++)
+ printf("lpt[%u]={lc=%u, pt={%p, %p},};\n",
+ i, lpt[i].lc, lpt[i].pt[0], lpt[i].pt[1]);
+
+
+ /* test phase - start and wait for completion on each active lcore */
+
+ rte_eal_mp_remote_launch(plock_test1_lcore, lpt, CALL_MASTER);
+ rte_eal_mp_wait_lcore();
+
+ /* validation phase - make sure that shared and local data match */
+
+ for (i = 0; i != n; i++) {
+ sum[i] += lpt[i].sum[0];
+ sum[i + 1] += lpt[i].sum[1];
+ }
+
+ sum[0] += sum[i];
+
+ rc = 0;
+ for (i = 0; i != n; i++) {
+ printf("%s: sum[%u]=%u, pt[%u].val=%u, pt[%u].iter=%u;\n",
+ __func__, i, sum[i], i, pt[i].val, i, pt[i].iter);
+
+ /* race condition occurred, lock doesn't work properly */
+ if (sum[i] != pt[i].val || 2 * iter != pt[i].iter) {
+ printf("error: local and shared sums don't much\n");
+ rc = -1;
+ }
+ }
+
+ free(pt);
+ free(lpt);
+ free(sum);
+
+ printf("%s(utype=%u) returns %d\n", __func__, utype, rc);
+ return rc;
+}
+
+static int
+test_mb(void)
+{
+ int32_t i, ret, rc[USE_NUM];
+
+ for (i = 0; i != RTE_DIM(rc); i++)
+ rc[i] = plock_test(ITER_MAX, i);
+
+ ret = 0;
+ for (i = 0; i != RTE_DIM(rc); i++) {
+ printf("%s for utype=%d %s\n",
+ __func__, i, rc[i] == 0 ? "passed" : "failed");
+ ret |= rc[i];
+ }
+
+ return ret;
+}
+
+REGISTER_TEST_COMMAND(mb_autotest, test_mb);
--
2.13.5
* [dpdk-dev] [PATCH v2 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-01 11:12 ` [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
` (3 preceding siblings ...)
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
@ 2017-12-18 15:34 ` Konstantin Ananyev
2017-12-18 15:46 ` Bruce Richardson
4 siblings, 1 reply; 26+ messages in thread
From: Konstantin Ananyev @ 2017-12-18 15:34 UTC (permalink / raw)
To: dev; +Cc: Konstantin Ananyev
On x86 it is possible to use lock-prefixed instructions to get
a similar effect to mfence.
As pointed out by the Java folks, on most modern HW that gives better
performance than using mfence:
https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
This patch adopts that technique for the rte_smp_mb() implementation.
On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
i.e. from ~110 to ~55 cycles per operation.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
.../common/include/arch/x86/rte_atomic.h | 44 +++++++++++++++++++++-
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
index 4eac66631..8b68ba327 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
@@ -55,12 +55,52 @@ extern "C" {
#define rte_rmb() _mm_lfence()
-#define rte_smp_mb() rte_mb()
-
#define rte_smp_wmb() rte_compiler_barrier()
#define rte_smp_rmb() rte_compiler_barrier()
+/*
+ * From Intel Software Development Manual; Vol 3;
+ * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
+ * ...
+ * . Reads are not reordered with other reads.
+ * . Writes are not reordered with older reads.
+ * . Writes to memory are not reordered with other writes,
+ * with the following exceptions:
+ * . streaming stores (writes) executed with the non-temporal move
+ * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
+ * . string operations (see Section 8.2.4.1).
+ * ...
+ * . Reads may be reordered with older writes to different locations but not
+ * with older writes to the same location.
+ * . Reads or writes cannot be reordered with I/O instructions,
+ * locked instructions, or serializing instructions.
+ * . Reads cannot pass earlier LFENCE and MFENCE instructions.
+ * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
+ * . LFENCE instructions cannot pass earlier reads.
+ * . SFENCE instructions cannot pass earlier writes ...
+ * . MFENCE instructions cannot pass earlier reads, writes ...
+ *
+ * As pointed out by the Java folks, this makes it possible to use
+ * lock-prefixed instructions to get the same effect as mfence, and on most
+ * modern HW that gives better performance than using mfence:
+ * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ * The basic idea is to use a lock-prefixed add with some dummy memory
+ * location as the destination. From their experiments, 128B (2 cache lines)
+ * below the current stack pointer looks like a good candidate.
+ * So below we use that technique for the rte_smp_mb() implementation.
+ */
+
+static __rte_always_inline void
+rte_smp_mb(void)
+{
+#ifdef RTE_ARCH_I686
+ asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
+#else
+ asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
+#endif
+}
+
#define rte_io_mb() rte_mb()
#define rte_io_wmb() rte_compiler_barrier()
--
2.13.5
* Re: [dpdk-dev] [PATCH v2 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 " Konstantin Ananyev
@ 2017-12-18 15:46 ` Bruce Richardson
0 siblings, 0 replies; 26+ messages in thread
From: Bruce Richardson @ 2017-12-18 15:46 UTC (permalink / raw)
To: Konstantin Ananyev; +Cc: dev
On Mon, Dec 18, 2017 at 03:34:13PM +0000, Konstantin Ananyev wrote:
> On x86 it is possible to use lock-prefixed instructions to get
> a similar effect to mfence.
> As pointed out by the Java folks, on most modern HW that gives better
> performance than using mfence:
> https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> This patch adopts that technique for the rte_smp_mb() implementation.
> On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
> i.e. from ~110 to ~55 cycles per operation.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
* Re: [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb()
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
@ 2018-01-12 17:23 ` Thomas Monjalon
2018-01-12 17:58 ` Ananyev, Konstantin
2018-01-13 13:54 ` Wiles, Keith
` (4 subsequent siblings)
5 siblings, 1 reply; 26+ messages in thread
From: Thomas Monjalon @ 2018-01-12 17:23 UTC (permalink / raw)
To: Konstantin Ananyev; +Cc: dev
Hi,
18/12/2017 16:34, Konstantin Ananyev:
> test/test/test_mb.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++
[...]
> +REGISTER_TEST_COMMAND(mb_autotest, test_mb);
For readability, do you agree to rename "mb" to "barrier"
in function and file name?
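For illustration only, the rename would amount to something like the following
(the registration line is not visible in the truncated v3 below, so treat the exact
names as an assumption):

/* hypothetical: test/test/test_barrier.c replacing test_mb.c */
REGISTER_TEST_COMMAND(barrier_autotest, test_barrier);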
* Re: [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb()
2018-01-12 17:23 ` Thomas Monjalon
@ 2018-01-12 17:58 ` Ananyev, Konstantin
0 siblings, 0 replies; 26+ messages in thread
From: Ananyev, Konstantin @ 2018-01-12 17:58 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: dev
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Friday, January 12, 2018 5:24 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb()
>
> Hi,
>
> 18/12/2017 16:34, Konstantin Ananyev:
> > test/test/test_mb.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> [...]
> > +REGISTER_TEST_COMMAND(mb_autotest, test_mb);
>
> For readability, do you agree to rename "mb" to "barrier"
> in function and file name?
Yes.
Konstantin
* Re: [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb()
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
2018-01-12 17:23 ` Thomas Monjalon
@ 2018-01-13 13:54 ` Wiles, Keith
2018-01-13 13:54 ` Wiles, Keith
` (3 subsequent siblings)
5 siblings, 0 replies; 26+ messages in thread
From: Wiles, Keith @ 2018-01-13 13:54 UTC (permalink / raw)
To: Ananyev, Konstantin; +Cc: dev
> On Dec 18, 2017, at 9:34 AM, Konstantin Ananyev <konstantin.ananyev@intel.com> wrote:
>
> Simple functional test for rte_smp_mb() implementations.
> Also, when executed on a single lcore, it can be used as a rough
> estimation of how many cycles a particular implementation of
> rte_smp_mb() might take.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> test/test/Makefile | 1 +
> test/test/test_mb.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 316 insertions(+)
> create mode 100644 test/test/test_mb.c
>
> diff --git a/test/test/Makefile b/test/test/Makefile
> index bb54c9808..3134283a8 100644
> --- a/test/test/Makefile
> +++ b/test/test/Makefile
> @@ -95,6 +95,7 @@ SRCS-y += test_spinlock.c
> SRCS-y += test_memory.c
> SRCS-y += test_memzone.c
> SRCS-y += test_bitmap.c
> +SRCS-y += test_mb.c
>
> SRCS-y += test_ring.c
> SRCS-y += test_ring_perf.c
> diff --git a/test/test/test_mb.c b/test/test/test_mb.c
> new file mode 100644
> index 000000000..52c73fb6b
> --- /dev/null
> +++ b/test/test/test_mb.c
> @@ -0,0 +1,315 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
SPDX header should be used, correct?
> +
> + /*
> + * This is a simple functional test for the rte_smp_mb() implementation.
> + * I.e. make sure that LOAD and STORE operations that precede the
> + * rte_smp_mb() call are globally visible across the lcores
> + * before the LOAD and STORE operations that follow it.
> + * The test uses a simple implementation of Peterson's lock algorithm
> + * (https://en.wikipedia.org/wiki/Peterson%27s_algorithm)
> + * for two execution units to make sure that rte_smp_mb() prevents
> + * store-load reordering from happening.
> + * Also, when executed on a single lcore, it can be used as an approximate
> + * estimation of the number of cycles a particular implementation of
> + * rte_smp_mb() will take.
> + */
> +
>
Regards,
Keith
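For reference, the SPDX form Keith refers to replaces the full BSD license text with
a short tag; v3 below adopts exactly this two-line header:

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */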
* Re: [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb()
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
2018-01-12 17:23 ` Thomas Monjalon
2018-01-13 13:54 ` Wiles, Keith
@ 2018-01-13 13:54 ` Wiles, Keith
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 0/2] eal/x86: Optimize rte_smp_mb() and create a new test case for it Konstantin Ananyev
` (2 subsequent siblings)
5 siblings, 0 replies; 26+ messages in thread
From: Wiles, Keith @ 2018-01-13 13:54 UTC (permalink / raw)
To: Ananyev, Konstantin; +Cc: dev
> On Dec 18, 2017, at 9:34 AM, Konstantin Ananyev <konstantin.ananyev@intel.com> wrote:
>
> Simple functional test for rte_smp_mb() implementations.
> Also, when executed on a single lcore, it can be used as a rough
> estimation of how many cycles a particular implementation of
> rte_smp_mb() might take.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> test/test/Makefile | 1 +
> test/test/test_mb.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 316 insertions(+)
> create mode 100644 test/test/test_mb.c
>
> diff --git a/test/test/Makefile b/test/test/Makefile
> index bb54c9808..3134283a8 100644
> --- a/test/test/Makefile
> +++ b/test/test/Makefile
> @@ -95,6 +95,7 @@ SRCS-y += test_spinlock.c
> SRCS-y += test_memory.c
> SRCS-y += test_memzone.c
> SRCS-y += test_bitmap.c
> +SRCS-y += test_mb.c
>
> SRCS-y += test_ring.c
> SRCS-y += test_ring_perf.c
> diff --git a/test/test/test_mb.c b/test/test/test_mb.c
> new file mode 100644
> index 000000000..52c73fb6b
> --- /dev/null
> +++ b/test/test/test_mb.c
> @@ -0,0 +1,315 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
SPDX header should be used, correct?
> +
> + /*
> + * This is a simple functional test for the rte_smp_mb() implementation.
> + * I.e. make sure that LOAD and STORE operations that precede the
> + * rte_smp_mb() call are globally visible across the lcores
> + * before the LOAD and STORE operations that follow it.
> + * The test uses a simple implementation of Peterson's lock algorithm
> + * (https://en.wikipedia.org/wiki/Peterson%27s_algorithm)
> + * for two execution units to make sure that rte_smp_mb() prevents
> + * store-load reordering from happening.
> + * Also, when executed on a single lcore, it can be used as an approximate
> + * estimation of the number of cycles a particular implementation of
> + * rte_smp_mb() will take.
> + */
> +
>
Regards,
Keith
* [dpdk-dev] [PATCH v3 0/2] eal/x86: Optimize rte_smp_mb() and create a new test case for it
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
` (2 preceding siblings ...)
2018-01-13 13:54 ` Wiles, Keith
@ 2018-01-15 15:04 ` Konstantin Ananyev
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
5 siblings, 0 replies; 26+ messages in thread
From: Konstantin Ananyev @ 2018-01-15 15:04 UTC (permalink / raw)
To: dev; +Cc: Konstantin Ananyev
v3 changes:
Address Thomas's and Keith's code-review comments:
- for readability, use "barrier" instead of "mb" in function and file naming.
- Use SPDX tag for license header.
v2 changes:
Address Bruce's code-review comments:
- get rid of macros to simplify the code
- add more comments to the test case
Konstantin Ananyev (2):
test/test: introduce new test-case for rte_smp_mb()
eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
.../common/include/arch/x86/rte_atomic.h | 44 +++-
test/test/Makefile | 1 +
test/test/test_barrier.c | 286 +++++++++++++++++++++
3 files changed, 329 insertions(+), 2 deletions(-)
create mode 100644 test/test/test_barrier.c
--
2.13.6
* [dpdk-dev] [PATCH v3 1/2] test/test: introduce new test-case for rte_smp_mb()
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
` (3 preceding siblings ...)
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 0/2] eal/x86: Optimize rte_smp_mb() and create a new test case for it Konstantin Ananyev
@ 2018-01-15 15:04 ` Konstantin Ananyev
2018-01-16 0:16 ` Thomas Monjalon
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
5 siblings, 1 reply; 26+ messages in thread
From: Konstantin Ananyev @ 2018-01-15 15:04 UTC (permalink / raw)
To: dev; +Cc: Konstantin Ananyev
Simple functional test for rte_smp_mb() implementations.
Also, when executed on a single lcore, it can be used as a rough
estimation of how many cycles a particular implementation of
rte_smp_mb() might take.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
test/test/Makefile | 1 +
test/test/test_barrier.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 287 insertions(+)
create mode 100644 test/test/test_barrier.c
diff --git a/test/test/Makefile b/test/test/Makefile
index e7818dc6e..80fb09e2e 100644
--- a/test/test/Makefile
+++ b/test/test/Makefile
@@ -67,6 +67,7 @@ SRCS-y += test_spinlock.c
SRCS-y += test_memory.c
SRCS-y += test_memzone.c
SRCS-y += test_bitmap.c
+SRCS-y += test_barrier.c
SRCS-y += test_ring.c
SRCS-y += test_ring_perf.c
diff --git a/test/test/test_barrier.c b/test/test/test_barrier.c
new file mode 100644
index 000000000..82b572c3e
--- /dev/null
+++ b/test/test/test_barrier.c
@@ -0,0 +1,286 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+ /*
+ * This is a simple functional test for the rte_smp_mb() implementation.
+ * I.e. make sure that LOAD and STORE operations that precede the
+ * rte_smp_mb() call are globally visible across the lcores
+ * before the LOAD and STORE operations that follow it.
+ * The test uses a simple implementation of Peterson's lock algorithm
+ * (https://en.wikipedia.org/wiki/Peterson%27s_algorithm)
+ * for two execution units to make sure that rte_smp_mb() prevents
+ * store-load reordering from happening.
+ * Also, when executed on a single lcore, it can be used as an approximate
+ * estimation of the number of cycles a particular implementation of
+ * rte_smp_mb() will take.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_pause.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_vect.h>
+#include <rte_debug.h>
+
+#include "test.h"
+
+#define ADD_MAX 8
+#define ITER_MAX 0x1000000
+
+enum plock_use_type {
+ USE_MB,
+ USE_SMP_MB,
+ USE_NUM
+};
+
+struct plock {
+ volatile uint32_t flag[2];
+ volatile uint32_t victim;
+ enum plock_use_type utype;
+};
+
+/*
+ * A lock plus the two counters it protects.
+ */
+struct plock_test {
+ struct plock lock;
+ uint32_t val;
+ uint32_t iter;
+};
+
+/*
+ * Each active lcore shares a plock_test struct with its left and right
+ * neighbours.
+ */
+struct lcore_plock_test {
+ struct plock_test *pt[2]; /* shared, lock-protected data */
+ uint32_t sum[2]; /* local copy of the shared data */
+ uint32_t iter; /* number of iterations to perform */
+ uint32_t lc; /* given lcore id */
+};
+
+static inline void
+store_load_barrier(uint32_t utype)
+{
+ if (utype == USE_MB)
+ rte_mb();
+ else if (utype == USE_SMP_MB)
+ rte_smp_mb();
+ else
+ RTE_VERIFY(0);
+}
+
+/*
+ * Peterson lock implementation.
+ */
+static void
+plock_lock(struct plock *l, uint32_t self)
+{
+ uint32_t other;
+
+ other = self ^ 1;
+
+ l->flag[self] = 1;
+ l->victim = self;
+
+ store_load_barrier(l->utype);
+
+ while (l->flag[other] == 1 && l->victim == self)
+ rte_pause();
+}
+
+static void
+plock_unlock(struct plock *l, uint32_t self)
+{
+ rte_smp_wmb();
+ l->flag[self] = 0;
+}
+
+static void
+plock_reset(struct plock *l, enum plock_use_type utype)
+{
+ memset(l, 0, sizeof(*l));
+ l->utype = utype;
+}
+
+/*
+ * grab the lock, update both counters, release the lock.
+ */
+static void
+plock_add(struct plock_test *pt, uint32_t self, uint32_t n)
+{
+ plock_lock(&pt->lock, self);
+ pt->iter++;
+ pt->val += n;
+ plock_unlock(&pt->lock, self);
+}
+
+static int
+plock_test1_lcore(void *data)
+{
+ uint64_t tm;
+ uint32_t i, lc, ln, n;
+ struct lcore_plock_test *lpt;
+
+ lpt = data;
+ lc = rte_lcore_id();
+
+ /* find lcore_plock_test struct for given lcore */
+ for (ln = rte_lcore_count(); ln != 0 && lpt->lc != lc; lpt++, ln--)
+ ;
+
+ if (ln == 0) {
+ printf("%s(%u) error at init\n", __func__, lc);
+ return -1;
+ }
+
+ n = rte_rand() % ADD_MAX;
+ tm = rte_get_timer_cycles();
+
+ /*
+ * for each iteration:
+ * - update the shared, lock-protected data in a safe manner
+ * - update the local copy of the shared data
+ */
+ for (i = 0; i != lpt->iter; i++) {
+
+ plock_add(lpt->pt[0], 0, n);
+ plock_add(lpt->pt[1], 1, n);
+
+ lpt->sum[0] += n;
+ lpt->sum[1] += n;
+
+ n = (n + 1) % ADD_MAX;
+ }
+
+ tm = rte_get_timer_cycles() - tm;
+
+ printf("%s(%u): %u iterations finished, in %" PRIu64
+ " cycles, %#Lf cycles/iteration, "
+ "local sum={%u, %u}\n",
+ __func__, lc, i, tm, (long double)tm / i,
+ lpt->sum[0], lpt->sum[1]);
+ return 0;
+}
+
+/*
+ * For N active lcores we allocate N+1 lcore_plock_test structures.
+ * Each active lcore shares one lcore_plock_test structure with its
+ * left lcore neighbor and one lcore_plock_test structure with its
+ * right lcore neighbor.
+ * During the test each lcore updates data in both shared structures and
+ * its local copies. Then, in the validation phase, we check that the
+ * shared and local data match.
+ */
+static int
+plock_test(uint32_t iter, enum plock_use_type utype)
+{
+ int32_t rc;
+ uint32_t i, lc, n;
+ uint32_t *sum;
+ struct plock_test *pt;
+ struct lcore_plock_test *lpt;
+
+ /* init phase, allocate and initialize shared data */
+
+ n = rte_lcore_count();
+ pt = calloc(n + 1, sizeof(*pt));
+ lpt = calloc(n, sizeof(*lpt));
+ sum = calloc(n + 1, sizeof(*sum));
+
+ printf("%s(iter=%u, utype=%u) started on %u lcores\n",
+ __func__, iter, utype, n);
+
+ if (pt == NULL || lpt == NULL || sum == NULL) {
+ printf("%s: failed to allocate memory for %u lcores\n",
+ __func__, n);
+ free(pt);
+ free(lpt);
+ free(sum);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i != n + 1; i++)
+ plock_reset(&pt[i].lock, utype);
+
+ i = 0;
+ RTE_LCORE_FOREACH(lc) {
+
+ lpt[i].lc = lc;
+ lpt[i].iter = iter;
+ lpt[i].pt[0] = pt + i;
+ lpt[i].pt[1] = pt + i + 1;
+ i++;
+ }
+
+ lpt[i - 1].pt[1] = pt;
+
+ for (i = 0; i != n; i++)
+ printf("lpt[%u]={lc=%u, pt={%p, %p},};\n",
+ i, lpt[i].lc, lpt[i].pt[0], lpt[i].pt[1]);
+
+
+ /* test phase - start and wait for completion on each active lcore */
+
+ rte_eal_mp_remote_launch(plock_test1_lcore, lpt, CALL_MASTER);
+ rte_eal_mp_wait_lcore();
+
+ /* validation phase - make sure that shared and local data match */
+
+ for (i = 0; i != n; i++) {
+ sum[i] += lpt[i].sum[0];
+ sum[i + 1] += lpt[i].sum[1];
+ }
+
+ sum[0] += sum[i];
+
+ rc = 0;
+ for (i = 0; i != n; i++) {
+ printf("%s: sum[%u]=%u, pt[%u].val=%u, pt[%u].iter=%u;\n",
+ __func__, i, sum[i], i, pt[i].val, i, pt[i].iter);
+
+ /* race condition occurred, lock doesn't work properly */
+ if (sum[i] != pt[i].val || 2 * iter != pt[i].iter) {
+ printf("error: local and shared sums don't much\n");
+ rc = -1;
+ }
+ }
+
+ free(pt);
+ free(lpt);
+ free(sum);
+
+ printf("%s(utype=%u) returns %d\n", __func__, utype, rc);
+ return rc;
+}
+
+static int
+test_barrier(void)
+{
+ int32_t i, ret, rc[USE_NUM];
+
+ for (i = 0; i != RTE_DIM(rc); i++)
+ rc[i] = plock_test(ITER_MAX, i);
+
+ ret = 0;
+ for (i = 0; i != RTE_DIM(rc); i++) {
+ printf("%s for utype=%d %s\n",
+ __func__, i, rc[i] == 0 ? "passed" : "failed");
+ ret |= rc[i];
+ }
+
+ return ret;
+}
+
+REGISTER_TEST_COMMAND(barrier_autotest, test_barrier);
--
2.13.6
^ permalink raw reply [flat|nested] 26+ messages in thread
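[For readers who want to see the store-load reordering this test targets outside of DPDK, the following is a minimal sketch of the classic store-buffering litmus test, using plain pthreads and a GCC builtin rather than any DPDK API. All names and the iteration count are illustrative assumptions, not part of the patch. On x86, without the full barrier, some rounds can end with both r0 and r1 equal to 0.]

#include <pthread.h>
#include <stdio.h>

static volatile int x, y;   /* shared flags, as in Peterson's lock */
static volatile int r0, r1; /* values observed by each thread */
static pthread_barrier_t start;

static void *thread0(void *arg)
{
	(void)arg;
	pthread_barrier_wait(&start);
	x = 1;
	/* __sync_synchronize();  uncomment to rule out the 0/0 outcome */
	r0 = y;
	return NULL;
}

static void *thread1(void *arg)
{
	(void)arg;
	pthread_barrier_wait(&start);
	y = 1;
	/* __sync_synchronize(); */
	r1 = x;
	return NULL;
}

int main(void)
{
	unsigned int i, hits = 0;

	for (i = 0; i < 100000; i++) {
		pthread_t a, b;

		x = y = r0 = r1 = 0;
		pthread_barrier_init(&start, NULL, 2);
		pthread_create(&a, NULL, thread0, NULL);
		pthread_create(&b, NULL, thread1, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		pthread_barrier_destroy(&start);
		if (r0 == 0 && r1 == 0)
			hits++;
	}
	printf("store-load reordering observed in %u of 100000 rounds\n", hits);
	return 0;
}

[Compile with something like "gcc -O2 -pthread sb_litmus.c"; the file name is an assumption.]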
* [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
` (4 preceding siblings ...)
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
@ 2018-01-15 15:04 ` Konstantin Ananyev
2018-01-15 15:09 ` Konstantin Ananyev
5 siblings, 1 reply; 26+ messages in thread
From: Konstantin Ananyev @ 2018-01-15 15:04 UTC (permalink / raw)
To: dev; +Cc: Konstantin Ananyev
On x86 it is possible to use lock-prefixed instructions to get
a similar effect to mfence.
As pointed out by the Java folks, on most modern HW that gives better
performance than using mfence:
https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
This patch adopts that technique for the rte_smp_mb() implementation.
On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
i.e. from ~110 to ~55 cycles per operation.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
.../common/include/arch/x86/rte_atomic.h | 44 +++++++++++++++++++++-
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
index 8469f97e1..9d466d94a 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
@@ -26,12 +26,52 @@ extern "C" {
#define rte_rmb() _mm_lfence()
-#define rte_smp_mb() rte_mb()
-
#define rte_smp_wmb() rte_compiler_barrier()
#define rte_smp_rmb() rte_compiler_barrier()
+/*
+ * From Intel Software Development Manual; Vol 3;
+ * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
+ * ...
+ * . Reads are not reordered with other reads.
+ * . Writes are not reordered with older reads.
+ * . Writes to memory are not reordered with other writes,
+ * with the following exceptions:
+ * . streaming stores (writes) executed with the non-temporal move
+ * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
+ * . string operations (see Section 8.2.4.1).
+ * ...
+ * . Reads may be reordered with older writes to different locations but not
+ * with older writes to the same location.
+ * . Reads or writes cannot be reordered with I/O instructions,
+ * locked instructions, or serializing instructions.
+ * . Reads cannot pass earlier LFENCE and MFENCE instructions.
+ * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
+ * . LFENCE instructions cannot pass earlier reads.
+ * . SFENCE instructions cannot pass earlier writes ...
+ * . MFENCE instructions cannot pass earlier reads, writes ...
+ *
+ * As pointed out by the Java folks, that makes it possible to use
+ * lock-prefixed instructions to get the same effect as mfence, and on
+ * most modern HW that gives better performance than using mfence:
+ * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ * The basic idea is to use a lock-prefixed add with some dummy memory
+ * location as the destination. From their experiments, 128B (2 cache
+ * lines) below the current stack pointer looks like a good candidate.
+ * So below we use that technique for the rte_smp_mb() implementation.
+ */
+
+static __rte_always_inline void
+rte_smp_mb(void)
+{
+#ifdef RTE_ARCH_I686
+ asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
+#else
+ asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
+#endif
+}
+
#define rte_io_mb() rte_mb()
#define rte_io_wmb() rte_compiler_barrier()
--
2.13.6
^ permalink raw reply [flat|nested] 26+ messages in thread
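[As a rough way to reproduce per-call cycle numbers like those quoted in the commit message on a single lcore, a measurement loop along the lines of the sketch below can be used. The function name, iteration count, and use of rte_rdtsc() are assumptions; the official numbers in this thread come from the test in patch 1/2.]

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#include <rte_cycles.h>
#include <rte_atomic.h>

/* Rough estimate of rte_smp_mb() cost on the calling lcore. */
static void
estimate_smp_mb_cycles(void)
{
	const uint32_t iter = 1u << 20;
	uint32_t i;
	uint64_t tm;

	tm = rte_rdtsc();
	for (i = 0; i != iter; i++)
		rte_smp_mb();
	tm = rte_rdtsc() - tm;

	printf("rte_smp_mb: %.2Lf cycles/call\n", (long double)tm / iter);
}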
* [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
@ 2018-01-15 15:09 ` Konstantin Ananyev
[not found] ` <8b05f533-d146-7f97-48f4-82ddcfc3613b@redhat.com>
2018-01-29 15:47 ` [dpdk-dev] " Thomas Monjalon
0 siblings, 2 replies; 26+ messages in thread
From: Konstantin Ananyev @ 2018-01-15 15:09 UTC (permalink / raw)
To: dev; +Cc: Konstantin Ananyev
On x86 it is possible to use lock-prefixed instructions to get
a similar effect to mfence.
As pointed out by the Java folks, on most modern HW that gives better
performance than using mfence:
https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
This patch adopts that technique for the rte_smp_mb() implementation.
On BDW 2.2, mb_autotest on a single lcore reports a 2X cycle reduction,
i.e. from ~110 to ~55 cycles per operation.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
.../common/include/arch/x86/rte_atomic.h | 44 +++++++++++++++++++++-
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
index 8469f97e1..9d466d94a 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
@@ -26,12 +26,52 @@ extern "C" {
#define rte_rmb() _mm_lfence()
-#define rte_smp_mb() rte_mb()
-
#define rte_smp_wmb() rte_compiler_barrier()
#define rte_smp_rmb() rte_compiler_barrier()
+/*
+ * From Intel Software Development Manual; Vol 3;
+ * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
+ * ...
+ * . Reads are not reordered with other reads.
+ * . Writes are not reordered with older reads.
+ * . Writes to memory are not reordered with other writes,
+ * with the following exceptions:
+ * . streaming stores (writes) executed with the non-temporal move
+ * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
+ * . string operations (see Section 8.2.4.1).
+ * ...
+ * . Reads may be reordered with older writes to different locations but not
+ * with older writes to the same location.
+ * . Reads or writes cannot be reordered with I/O instructions,
+ * locked instructions, or serializing instructions.
+ * . Reads cannot pass earlier LFENCE and MFENCE instructions.
+ * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
+ * . LFENCE instructions cannot pass earlier reads.
+ * . SFENCE instructions cannot pass earlier writes ...
+ * . MFENCE instructions cannot pass earlier reads, writes ...
+ *
+ * As pointed out by the Java folks, that makes it possible to use
+ * lock-prefixed instructions to get the same effect as mfence, and on
+ * most modern HW that gives better performance than using mfence:
+ * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ * The basic idea is to use a lock-prefixed add with some dummy memory
+ * location as the destination. From their experiments, 128B (2 cache
+ * lines) below the current stack pointer looks like a good candidate.
+ * So below we use that technique for the rte_smp_mb() implementation.
+ */
+
+static __rte_always_inline void
+rte_smp_mb(void)
+{
+#ifdef RTE_ARCH_I686
+ asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
+#else
+ asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
+#endif
+}
+
#define rte_io_mb() rte_mb()
#define rte_io_wmb() rte_compiler_barrier()
--
2.13.6
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [dpdk-dev] [PATCH v3 1/2] test/test: introduce new test-case for rte_smp_mb()
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
@ 2018-01-16 0:16 ` Thomas Monjalon
0 siblings, 0 replies; 26+ messages in thread
From: Thomas Monjalon @ 2018-01-16 0:16 UTC (permalink / raw)
To: Konstantin Ananyev; +Cc: dev, jerin.jacob
15/01/2018 16:04, Konstantin Ananyev:
> Simple functional test for rte_smp_mb() implementations.
> Also when executed on a single lcore could be used as rough
> estimation how many cycles particular implementation of rte_smp_mb()
> might take.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> test/test/Makefile | 1 +
> test/test/test_barrier.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++
I will add
F: test/test/test_barrier.c
in MAINTAINERS, when applying.
> +static inline void
> +store_load_barrier(uint32_t utype)
> +{
> + if (utype == USE_MB)
> + rte_mb();
> + else if (utype == USE_SMP_MB)
> + rte_smp_mb();
> + else
> + RTE_VERIFY(0);
> +}
It does not compile on ARM64.
I have sent a fix for ARM64 barrier macros:
https://dpdk.org/patch/33762
^ permalink raw reply [flat|nested] 26+ messages in thread
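[For context on the ARM64 compile failure mentioned above: the usual reason a statement-like barrier macro breaks inside an if/else chain such as store_load_barrier() is a brace-block macro without do/while(0). The snippet below is a generic illustration of that idiom only; it is not the actual ARM64 DPDK code, which is not shown in this thread, and the macro names are made up.]

/* A brace-block macro breaks if/else chains... */
#define barrier_bad()  { asm volatile("" ::: "memory"); }

/* ...because "barrier_bad();" expands to "{ ... };" and the stray ';'
 * terminates the if-statement, orphaning the following "else".
 * Wrapping the body in do { } while (0) keeps it a single statement: */
#define barrier_good() do { asm volatile("" ::: "memory"); } while (0)

/* With the do/while form, code shaped like store_load_barrier()
 * compiles cleanly:
 *
 *	if (utype == USE_MB)
 *		barrier_good();
 *	else if (utype == USE_SMP_MB)
 *		barrier_good();
 */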
* Re: [dpdk-dev] Fwd: [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
[not found] ` <8b05f533-d146-7f97-48f4-82ddcfc3613b@redhat.com>
@ 2018-01-16 1:54 ` Michael S. Tsirkin
2018-01-29 9:29 ` Ananyev, Konstantin
0 siblings, 1 reply; 26+ messages in thread
From: Michael S. Tsirkin @ 2018-01-16 1:54 UTC (permalink / raw)
To: Maxime Coquelin; +Cc: Konstantin Ananyev, dev, Bruce Richardson
On Mon, Jan 15, 2018 at 04:15:00PM +0100, Maxime Coquelin wrote:
> Hi Michael,
>
> FYI:
>
> -------- Forwarded Message --------
> Subject: [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions
> to reduce cost of rte_smp_mb()
> Date: Mon, 15 Jan 2018 15:09:31 +0000
> From: Konstantin Ananyev <konstantin.ananyev@intel.com>
> To: dev@dpdk.org
> CC: Konstantin Ananyev <konstantin.ananyev@intel.com>
>
> On x86 it is possible to use lock-prefixed instructions to get
> the similar effect as mfence.
> As pointed by Java guys, on most modern HW that gives a better
> performance than using mfence:
> https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> That patch adopts that technique for rte_smp_mb() implementation.
> On BDW 2.2 mb_autotest on single lcore reports 2X cycle reduction,
> i.e. from ~110 to ~55 cycles per operation.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
> .../common/include/arch/x86/rte_atomic.h | 44
> +++++++++++++++++++++-
> 1 file changed, 42 insertions(+), 2 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> index 8469f97e1..9d466d94a 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> @@ -26,12 +26,52 @@ extern "C" {
> #define rte_rmb() _mm_lfence()
> -#define rte_smp_mb() rte_mb()
> -
> #define rte_smp_wmb() rte_compiler_barrier()
> #define rte_smp_rmb() rte_compiler_barrier()
> +/*
> + * From Intel Software Development Manual; Vol 3;
> + * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
> + * ...
> + * . Reads are not reordered with other reads.
> + * . Writes are not reordered with older reads.
> + * . Writes to memory are not reordered with other writes,
> + * with the following exceptions:
> + * . streaming stores (writes) executed with the non-temporal move
> + * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
> + * . string operations (see Section 8.2.4.1).
> + * ...
> + * . Reads may be reordered with older writes to different locations but
> not
> + * with older writes to the same location.
> + * . Reads or writes cannot be reordered with I/O instructions,
> + * locked instructions, or serializing instructions.
> + * . Reads cannot pass earlier LFENCE and MFENCE instructions.
> + * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE
> instructions.
> + * . LFENCE instructions cannot pass earlier reads.
> + * . SFENCE instructions cannot pass earlier writes ...
> + * . MFENCE instructions cannot pass earlier reads, writes ...
> + *
> + * As pointed by Java guys, that makes possible to use lock-prefixed
> + * instructions to get the same effect as mfence and on most modern HW
> + * that gives a better perfomance then using mfence:
> + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> + * Basic idea is to use lock prefixed add with some dummy memory location
> + * as the destination. From their experiments 128B(2 cache lines) below
> + * current stack pointer looks like a good candidate.
> + * So below we use that techinque for rte_smp_mb() implementation.
> + */
> +
> +static __rte_always_inline void
> +rte_smp_mb(void)
> +{
> +#ifdef RTE_ARCH_I686
> + asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
> +#else
> + asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
> +#endif
> +}
> +
> #define rte_io_mb() rte_mb()
> #define rte_io_wmb() rte_compiler_barrier()
In my testing this appears to be suboptimal when the calling
function is large. The following seems to work better:
+static __rte_always_inline void
+rte_smp_mb(void)
+{
+#ifdef RTE_ARCH_I686
+ asm volatile("lock addl $0, -132(%%esp); " ::: "memory");
+#else
+ asm volatile("lock addl $0, -132(%%rsp); " ::: "memory");
+#endif
+}
+
The most likely reason is that offset 128 still overlaps the x86-64
red zone by 4 bytes.
Feel free to reuse, and add
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> --
> 2.13.6
^ permalink raw reply [flat|nested] 26+ messages in thread
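[To make the offset discussion concrete: under the System V x86-64 ABI, the red zone is the 128 bytes just below %rsp that a leaf function may use without moving the stack pointer, so the two variants differ only in whether the dummy 4-byte target of the locked add can alias it. A sketch follows; the function names are illustrative.]

/* Writes bytes [rsp-128, rsp-125], i.e. the lowest 4 bytes of the
 * 128-byte red zone [rsp-128, rsp-1]. */
static inline void
smp_mb_off_128(void)
{
	asm volatile("lock addl $0, -128(%%rsp)" ::: "memory");
}

/* Writes bytes [rsp-132, rsp-129], entirely below the red zone, so it
 * cannot touch data a leaf caller keeps there. */
static inline void
smp_mb_off_132(void)
{
	asm volatile("lock addl $0, -132(%%rsp)" ::: "memory");
}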
* Re: [dpdk-dev] Fwd: [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2018-01-16 1:54 ` [dpdk-dev] Fwd: " Michael S. Tsirkin
@ 2018-01-29 9:29 ` Ananyev, Konstantin
2018-01-29 17:29 ` Michael S. Tsirkin
0 siblings, 1 reply; 26+ messages in thread
From: Ananyev, Konstantin @ 2018-01-29 9:29 UTC (permalink / raw)
To: Michael S. Tsirkin, Maxime Coquelin; +Cc: dev, Richardson, Bruce
Hi Michael,
>
> On Mon, Jan 15, 2018 at 04:15:00PM +0100, Maxime Coquelin wrote:
> > Hi Michael,
> >
> > FYI:
> >
> > -------- Forwarded Message --------
> > Subject: [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions
> > to reduce cost of rte_smp_mb()
> > Date: Mon, 15 Jan 2018 15:09:31 +0000
> > From: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > To: dev@dpdk.org
> > CC: Konstantin Ananyev <konstantin.ananyev@intel.com>
> >
> > On x86 it is possible to use lock-prefixed instructions to get
> > the similar effect as mfence.
> > As pointed by Java guys, on most modern HW that gives a better
> > performance than using mfence:
> > https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > That patch adopts that technique for rte_smp_mb() implementation.
> > On BDW 2.2 mb_autotest on single lcore reports 2X cycle reduction,
> > i.e. from ~110 to ~55 cycles per operation.
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > ---
> > .../common/include/arch/x86/rte_atomic.h | 44
> > +++++++++++++++++++++-
> > 1 file changed, 42 insertions(+), 2 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > index 8469f97e1..9d466d94a 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > @@ -26,12 +26,52 @@ extern "C" {
> > #define rte_rmb() _mm_lfence()
> > -#define rte_smp_mb() rte_mb()
> > -
> > #define rte_smp_wmb() rte_compiler_barrier()
> > #define rte_smp_rmb() rte_compiler_barrier()
> > +/*
> > + * From Intel Software Development Manual; Vol 3;
> > + * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
> > + * ...
> > + * . Reads are not reordered with other reads.
> > + * . Writes are not reordered with older reads.
> > + * . Writes to memory are not reordered with other writes,
> > + * with the following exceptions:
> > + * . streaming stores (writes) executed with the non-temporal move
> > + * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
> > + * . string operations (see Section 8.2.4.1).
> > + * ...
> > + * . Reads may be reordered with older writes to different locations but
> > not
> > + * with older writes to the same location.
> > + * . Reads or writes cannot be reordered with I/O instructions,
> > + * locked instructions, or serializing instructions.
> > + * . Reads cannot pass earlier LFENCE and MFENCE instructions.
> > + * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE
> > instructions.
> > + * . LFENCE instructions cannot pass earlier reads.
> > + * . SFENCE instructions cannot pass earlier writes ...
> > + * . MFENCE instructions cannot pass earlier reads, writes ...
> > + *
> > + * As pointed by Java guys, that makes possible to use lock-prefixed
> > + * instructions to get the same effect as mfence and on most modern HW
> > + * that gives a better perfomance then using mfence:
> > + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > + * Basic idea is to use lock prefixed add with some dummy memory location
> > + * as the destination. From their experiments 128B(2 cache lines) below
> > + * current stack pointer looks like a good candidate.
> > + * So below we use that techinque for rte_smp_mb() implementation.
> > + */
> > +
> > +static __rte_always_inline void
> > +rte_smp_mb(void)
> > +{
> > +#ifdef RTE_ARCH_I686
> > + asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
> > +#else
> > + asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
> > +#endif
> > +}
> > +
> > #define rte_io_mb() rte_mb()
> > #define rte_io_wmb() rte_compiler_barrier()
>
> In my testing this appears to be suboptimal when the calling
> function is large. The following seems to work better:
>
> +static __rte_always_inline void
> +rte_smp_mb(void)
> +{
> +#ifdef RTE_ARCH_I686
> + asm volatile("lock addl $0, -132(%%esp); " ::: "memory");
> +#else
> + asm volatile("lock addl $0, -132(%%rsp); " ::: "memory");
> +#endif
> +}
> +
>
> The reason most likely is that 128 still overlaps the x86
> red zone by 4 bytes.
I tried what you suggested, but for my cases I didn't see any improvement so far.
Can you explain a bit more why you expect it to be faster?
Perhaps in some particular scenario?
Konstantin
>
> Feel free to reuse, and add
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
>
> > --
> > 2.13.6
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2018-01-15 15:09 ` Konstantin Ananyev
[not found] ` <8b05f533-d146-7f97-48f4-82ddcfc3613b@redhat.com>
@ 2018-01-29 15:47 ` Thomas Monjalon
2018-01-30 19:33 ` Stephen Hemminger
1 sibling, 1 reply; 26+ messages in thread
From: Thomas Monjalon @ 2018-01-29 15:47 UTC (permalink / raw)
To: Konstantin Ananyev; +Cc: dev
15/01/2018 16:09, Konstantin Ananyev:
> On x86 it is possible to use lock-prefixed instructions to get
> the similar effect as mfence.
> As pointed by Java guys, on most modern HW that gives a better
> performance than using mfence:
> https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> That patch adopts that technique for rte_smp_mb() implementation.
> On BDW 2.2 mb_autotest on single lcore reports 2X cycle reduction,
> i.e. from ~110 to ~55 cycles per operation.
>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Applied, thanks
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [dpdk-dev] Fwd: [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2018-01-29 9:29 ` Ananyev, Konstantin
@ 2018-01-29 17:29 ` Michael S. Tsirkin
0 siblings, 0 replies; 26+ messages in thread
From: Michael S. Tsirkin @ 2018-01-29 17:29 UTC (permalink / raw)
To: Ananyev, Konstantin; +Cc: Maxime Coquelin, dev, Richardson, Bruce
On Mon, Jan 29, 2018 at 09:29:52AM +0000, Ananyev, Konstantin wrote:
> Hi Michael,
>
> >
> > On Mon, Jan 15, 2018 at 04:15:00PM +0100, Maxime Coquelin wrote:
> > > Hi Michael,
> > >
> > > FYI:
> > >
> > > -------- Forwarded Message --------
> > > Subject: [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions
> > > to reduce cost of rte_smp_mb()
> > > Date: Mon, 15 Jan 2018 15:09:31 +0000
> > > From: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > > To: dev@dpdk.org
> > > CC: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > >
> > > On x86 it is possible to use lock-prefixed instructions to get
> > > the similar effect as mfence.
> > > As pointed by Java guys, on most modern HW that gives a better
> > > performance than using mfence:
> > > https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > > That patch adopts that technique for rte_smp_mb() implementation.
> > > On BDW 2.2 mb_autotest on single lcore reports 2X cycle reduction,
> > > i.e. from ~110 to ~55 cycles per operation.
> > >
> > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > > ---
> > > .../common/include/arch/x86/rte_atomic.h | 44
> > > +++++++++++++++++++++-
> > > 1 file changed, 42 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > > b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > > index 8469f97e1..9d466d94a 100644
> > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > > @@ -26,12 +26,52 @@ extern "C" {
> > > #define rte_rmb() _mm_lfence()
> > > -#define rte_smp_mb() rte_mb()
> > > -
> > > #define rte_smp_wmb() rte_compiler_barrier()
> > > #define rte_smp_rmb() rte_compiler_barrier()
> > > +/*
> > > + * From Intel Software Development Manual; Vol 3;
> > > + * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
> > > + * ...
> > > + * . Reads are not reordered with other reads.
> > > + * . Writes are not reordered with older reads.
> > > + * . Writes to memory are not reordered with other writes,
> > > + * with the following exceptions:
> > > + * . streaming stores (writes) executed with the non-temporal move
> > > + * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
> > > + * . string operations (see Section 8.2.4.1).
> > > + * ...
> > > + * . Reads may be reordered with older writes to different locations but
> > > not
> > > + * with older writes to the same location.
> > > + * . Reads or writes cannot be reordered with I/O instructions,
> > > + * locked instructions, or serializing instructions.
> > > + * . Reads cannot pass earlier LFENCE and MFENCE instructions.
> > > + * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE
> > > instructions.
> > > + * . LFENCE instructions cannot pass earlier reads.
> > > + * . SFENCE instructions cannot pass earlier writes ...
> > > + * . MFENCE instructions cannot pass earlier reads, writes ...
> > > + *
> > > + * As pointed by Java guys, that makes possible to use lock-prefixed
> > > + * instructions to get the same effect as mfence and on most modern HW
> > > + * that gives a better perfomance then using mfence:
> > > + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > > + * Basic idea is to use lock prefixed add with some dummy memory location
> > > + * as the destination. From their experiments 128B(2 cache lines) below
> > > + * current stack pointer looks like a good candidate.
> > > + * So below we use that techinque for rte_smp_mb() implementation.
> > > + */
> > > +
> > > +static __rte_always_inline void
> > > +rte_smp_mb(void)
> > > +{
> > > +#ifdef RTE_ARCH_I686
> > > + asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
> > > +#else
> > > + asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
> > > +#endif
> > > +}
> > > +
> > > #define rte_io_mb() rte_mb()
> > > #define rte_io_wmb() rte_compiler_barrier()
> >
> > In my testing this appears to be suboptimal when the calling
> > function is large. The following seems to work better:
> >
> > +static __rte_always_inline void
> > +rte_smp_mb(void)
> > +{
> > +#ifdef RTE_ARCH_I686
> > + asm volatile("lock addl $0, -132(%%esp); " ::: "memory");
> > +#else
> > + asm volatile("lock addl $0, -132(%%rsp); " ::: "memory");
> > +#endif
> > +}
> > +
> >
> > The reason most likely is that 128 still overlaps the x86
> > red zone by 4 bytes.
>
> I tried what you suggested but for my cases didn't see any improvement so far.
> Can you explain a bit more why do you expect it to be faster?
> Probably some particular scenario?
> Konstantin
It would depend on how much of the red zone is in use.
If the last 4 bytes of the red zone get used, you will
see a few more stalls when trying to run the locked add.
You will need to call it from a function with lots of
scratch/temporary variables for that to be the case.
> >
> > Feel free to reuse, and add
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >
> >
> > > --
> > > 2.13.6
^ permalink raw reply [flat|nested] 26+ messages in thread
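[A hypothetical caller of the kind described above might look like the sketch below; the function and variable names are assumptions, purely for illustration. Because the barrier is inlined asm rather than a call, the enclosing function can remain a leaf from the compiler's point of view, so with optimization some of its scratch state may be kept in the red zone, where a write at -128(%rsp) could land on the same bytes or cache line.]

#include <rte_atomic.h>

/* Illustrative only: with -O2, a leaf function like this may have part
 * of tmp[] placed in the red zone below %rsp. */
static int
leafish_worker(const int *in)
{
	int tmp[32];	/* 128 bytes of scratch state */
	int i, acc = 0;

	for (i = 0; i < 32; i++)
		tmp[i] = in[i] * in[i];

	rte_smp_mb();	/* inlined "lock addl $0, -128(%rsp)" */

	for (i = 0; i < 32; i++)
		acc += tmp[i];
	return acc;
}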
* Re: [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2018-01-29 15:47 ` [dpdk-dev] " Thomas Monjalon
@ 2018-01-30 19:33 ` Stephen Hemminger
0 siblings, 0 replies; 26+ messages in thread
From: Stephen Hemminger @ 2018-01-30 19:33 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: Konstantin Ananyev, dev
On Mon, 29 Jan 2018 16:47:45 +0100
Thomas Monjalon <thomas@monjalon.net> wrote:
> 15/01/2018 16:09, Konstantin Ananyev:
> > On x86 it is possible to use lock-prefixed instructions to get
> > the similar effect as mfence.
> > As pointed by Java guys, on most modern HW that gives a better
> > performance than using mfence:
> > https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > That patch adopts that technique for rte_smp_mb() implementation.
> > On BDW 2.2 mb_autotest on single lcore reports 2X cycle reduction,
> > i.e. from ~110 to ~55 cycles per operation.
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
>
> Applied, thanks
Does this change anything, given that lfence is one of the ways to block a Spectre variant?
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
2017-12-01 23:08 ` Ananyev, Konstantin
@ 2018-03-08 21:15 ` Stephen Hemminger
0 siblings, 0 replies; 26+ messages in thread
From: Stephen Hemminger @ 2018-03-08 21:15 UTC (permalink / raw)
To: Ananyev, Konstantin; +Cc: dev
On Fri, 1 Dec 2017 23:08:39 +0000
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:
> Hi Stephen,
>
> > -----Original Message-----
> > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > Sent: Friday, December 1, 2017 6:04 PM
> > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb()
> >
> > On Fri, 1 Dec 2017 11:12:51 +0000
> > Konstantin Ananyev <konstantin.ananyev@intel.com> wrote:
> >
> > > On x86 it is possible to use lock-prefixed instructions to get
> > > the similar effect as mfence.
> > > As pointed by Java guys, on most modern HW that gives a better
> > > performance than using mfence:
> > > https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > > That patch adopts that technique for rte_smp_mb() implementation.
> > > On BDW 2.2 mb_autotest on single lcore reports 2X cycle reduction,
> > > i.e. from ~110 to ~55 cycles per operation.
> > >
> > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > > ---
> > > .../common/include/arch/x86/rte_atomic.h | 45 +++++++++++++++++++++-
> > > 1 file changed, 43 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > > index 4eac66631..07b7fa7f7 100644
> > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
> > > @@ -55,12 +55,53 @@ extern "C" {
> > >
> > > #define rte_rmb() _mm_lfence()
> > >
> > > -#define rte_smp_mb() rte_mb()
> > > -
> > > #define rte_smp_wmb() rte_compiler_barrier()
> > >
> > > #define rte_smp_rmb() rte_compiler_barrier()
> > >
> > > +/*
> > > + * From Intel Software Development Manual; Vol 3;
> > > + * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
> > > + * ...
> > > + * . Reads are not reordered with other reads.
> > > + * . Writes are not reordered with older reads.
> > > + * . Writes to memory are not reordered with other writes,
> > > + * with the following exceptions:
> > > + * . streaming stores (writes) executed with the non-temporal move
> > > + * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
> > > + * . string operations (see Section 8.2.4.1).
> > > + * ...
> > > + * . Reads may be reordered with older writes to different locations but not
> > > + * with older writes to the same location.
> > > + * . Reads or writes cannot be reordered with I/O instructions,
> > > + * locked instructions, or serializing instructions.
> > > + * . Reads cannot pass earlier LFENCE and MFENCE instructions.
> > > + * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
> > > + * . LFENCE instructions cannot pass earlier reads.
> > > + * . SFENCE instructions cannot pass earlier writes ...
> > > + * . MFENCE instructions cannot pass earlier reads, writes ...
> > > + *
> > > + * As pointed by Java guys, that makes possible to use lock-prefixed
> > > + * instructions to get the same effect as mfence and on most modern HW
> > > + * that gives a better perfomarnce than using mfence:
> > > + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
> > > + * So below we use that technique for rte_smp_mb() implementation.
> > > + */
> > > +
> > > +#ifdef RTE_ARCH_I686
> > > +#define RTE_SP RTE_STR(esp)
> > > +#else
> > > +#define RTE_SP RTE_STR(rsp)
> > > +#endif
> > > +
> > > +#define RTE_MB_DUMMY_MEMP "-128(%%" RTE_SP ")"
> > > +
> > > +static __rte_always_inline void
> > > +rte_smp_mb(void)
> > > +{
> > > + asm volatile("lock addl $0," RTE_MB_DUMMY_MEMP "; " ::: "memory");
> > > +}
> > > +
> > > #define rte_io_mb() rte_mb()
> > >
> > > #define rte_io_wmb() rte_compiler_barrier()
> >
> > The lock instruction is a stronger barrier than the compiler barrier
> > and has a worse performance impact. Are you sure it is necessary to use it in DPDK?
> > The Linux kernel has successfully used a simple compiler reordering barrier for years.
>
> Where do you see a compiler barrier?
> Right now, for x86, rte_smp_mb() == rte_mb() == mfence.
> So I am replacing mfence with 'lock add'.
> As the comment above says, on most modern x86 systems it is faster
> while still preserving memory ordering.
There are cases like virtio/vhost where we could be using compiler_barrier.
The mfence to lock add conversion makes sense.
^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread
Thread overview: 26+ messages
2017-12-01 11:12 [dpdk-dev] [PATCH 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
2017-12-01 11:12 ` [dpdk-dev] [PATCH 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
2017-12-01 18:04 ` Stephen Hemminger
2017-12-01 23:08 ` Ananyev, Konstantin
2018-03-08 21:15 ` Stephen Hemminger
2017-12-11 17:11 ` Bruce Richardson
2017-12-11 17:30 ` Ananyev, Konstantin
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 0/2] eal/x86: Optimize rte_smp_mb() and create a new test case for it Konstantin Ananyev
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
2018-01-12 17:23 ` Thomas Monjalon
2018-01-12 17:58 ` Ananyev, Konstantin
2018-01-13 13:54 ` Wiles, Keith
2018-01-13 13:54 ` Wiles, Keith
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 0/2] eal/x86: Optimize rte_smp_mb() and create a new test case for it Konstantin Ananyev
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 1/2] test/test: introduce new test-case for rte_smp_mb() Konstantin Ananyev
2018-01-16 0:16 ` Thomas Monjalon
2018-01-15 15:04 ` [dpdk-dev] [PATCH v3 2/2] eal/x86: Use lock-prefixed instructions to reduce cost of rte_smp_mb() Konstantin Ananyev
2018-01-15 15:09 ` Konstantin Ananyev
[not found] ` <8b05f533-d146-7f97-48f4-82ddcfc3613b@redhat.com>
2018-01-16 1:54 ` [dpdk-dev] Fwd: " Michael S. Tsirkin
2018-01-29 9:29 ` Ananyev, Konstantin
2018-01-29 17:29 ` Michael S. Tsirkin
2018-01-29 15:47 ` [dpdk-dev] " Thomas Monjalon
2018-01-30 19:33 ` Stephen Hemminger
2017-12-18 15:34 ` [dpdk-dev] [PATCH v2 " Konstantin Ananyev
2017-12-18 15:46 ` Bruce Richardson
2017-12-11 17:08 ` [dpdk-dev] [PATCH 1/2] test/test: introduce new test-case for rte_smp_mb() Bruce Richardson