Hi Gavin,

> > > > > diff --git a/lib/librte_eal/common/include/generic/rte_ticketlock.h b/lib/librte_eal/common/include/generic/rte_ticketlock.h
> > > > > new file mode 100644
> > > > > index 0000000..d63aaaa
> > > > > --- /dev/null
> > > > > +++ b/lib/librte_eal/common/include/generic/rte_ticketlock.h
> > > > > @@ -0,0 +1,308 @@
> > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > + * Copyright(c) 2019 Arm Limited
> > > > > + */
> > > > > +
> > > > > +#ifndef _RTE_TICKETLOCK_H_
> > > > > +#define _RTE_TICKETLOCK_H_
> > > > > +
> > > > > +/**
> > > > > + * @file
> > > > > + *
> > > > > + * RTE ticket locks
> > > > > + *
> > > > > + * This file defines an API for ticket locks, which give each waiting
> > > > > + * thread a ticket and take the lock one by one, first come, first
> > > > > + * serviced.
> > > > > + *
> > > > > + * All locks must be initialised before use, and only initialised once.
> > > > > + *
> > > > > + */
> > > > > +
> > > > > +#ifdef __cplusplus
> > > > > +extern "C" {
> > > > > +#endif
> > > > > +
> > > > > +#include <rte_common.h>
> > > > > +#include <rte_lcore.h>
> > > > > +#include <rte_pause.h>
> > > > > +
> > > > > +/**
> > > > > + * The rte_ticketlock_t type.
> > > > > + */
> > > > > +typedef struct {
> > > > > +	uint16_t current;
> > > > > +	uint16_t next;
> > > > > +} rte_ticketlock_t;
> > > > > +
> > > > > +/**
> > > > > + * A static ticketlock initializer.
> > > > > + */
> > > > > +#define RTE_TICKETLOCK_INITIALIZER { 0 }
> > > > > +
> > > > > +/**
> > > > > + * Initialize the ticketlock to an unlocked state.
> > > > > + *
> > > > > + * @param tl
> > > > > + *   A pointer to the ticketlock.
> > > > > + */
> > > > > +static inline __rte_experimental void
> > > > > +rte_ticketlock_init(rte_ticketlock_t *tl)
> > > > > +{
> > > > > +	__atomic_store_n(&tl->current, 0, __ATOMIC_RELAXED);
> > > > > +	__atomic_store_n(&tl->next, 0, __ATOMIC_RELAXED);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * Take the ticketlock.
> > > > > + *
> > > > > + * @param tl
> > > > > + *   A pointer to the ticketlock.
> > > > > + */
> > > > > +static inline __rte_experimental void
> > > > > +rte_ticketlock_lock(rte_ticketlock_t *tl)
> > > > > +{
> > > > > +	uint16_t me = __atomic_fetch_add(&tl->next, 1, __ATOMIC_RELAXED);
> > > > > +	while (__atomic_load_n(&tl->current, __ATOMIC_ACQUIRE) != me)
> > > > > +		rte_pause();
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * Release the ticketlock.
> > > > > + *
> > > > > + * @param tl
> > > > > + *   A pointer to the ticketlock.
> > > > > + */
> > > > > +static inline __rte_experimental void
> > > > > +rte_ticketlock_unlock(rte_ticketlock_t *tl)
> > > > > +{
> > > > > +	uint16_t i = __atomic_load_n(&tl->current, __ATOMIC_RELAXED);
> > > > > +	__atomic_store_n(&tl->current, i+1, __ATOMIC_RELEASE);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * Try to take the lock.
> > > > > + *
> > > > > + * @param tl
> > > > > + *   A pointer to the ticketlock.
> > > > > + * @return
> > > > > + *   1 if the lock is successfully taken; 0 otherwise.
> > > > > + */
> > > > > +static inline __rte_experimental int
> > > > > +rte_ticketlock_trylock(rte_ticketlock_t *tl)
> > > > > +{
> > > > > +	uint16_t next = __atomic_load_n(&tl->next, __ATOMIC_RELAXED);
> > > > > +	uint16_t cur = __atomic_load_n(&tl->current, __ATOMIC_RELAXED);
> > > > > +	if (next == cur) {
> > > >
> > > > Probably a naïve one:
> > > > Suppose next==cur==1 here; then this thread experiences a really long
> > > > context switch,
> > >
> > > By saying context switch, do you mean that, running to here, it is out of CPU time
> > > and starving for CPU?
> >
> > Yes.
> >
> > > > so by the time it continues its execution the tl->next value will have wrapped
> > > > around and will be 1 again, with tl->current==0 (lock held).
> > > > I suppose this function will set tl->next=2 and will return success?
> > >
> > > If this thread was swapped out and another thread took/attempted to take
> > > the lock, yes, tl->next == 2 here,
> > > but as next == 1 is unchanged, it would not return success.
> >
> > I am not talking about the situation when tl->next==2 and tl->current==1 (just one
> > lock() executed by a different thread).
> > I am talking about the situation when this thread was off the CPU for a significant
> > number of cycles, and in that period tl->next and tl->current wrapped around
> > (they both reached UINT16_MAX, then 0), i.e. UINT16_MAX lock/unlock pairs
> > were executed while this thread was away from the CPU.
> > After that, another thread just did a successful lock(), so tl->next==1 and
> > tl->current==0.
> > Now this thread wakes up and continues with:
> > __atomic_compare_exchange_n(&tl->next, &next, next+1, ...)
> > As both tl->next==1 and next==1, it will succeed.
> > So we have 2 threads assuming they grabbed the lock successfully.
> > Konstantin
>
> Now I understand your point, but I am not sure whether it is a rare or even an
> impossible case that this thread stalls for CPU while, during this time, the other
> threads take the lock 2^16 times, wrapping the counters.

I agree it should be very rare, but I am not sure it is impossible.
Let's say a thread is doing lock/unlock in a loop, with one iteration taking ~100 cycles.
Then the counters would wrap around in ~6.5M cycles (~3ms on modern CPUs).
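To make the failing interleaving concrete, below is a minimal stand-alone sketch
(hypothetical demo code, not part of the patch: the take()/release() helpers and the
hand-unrolled trylock steps only mimic the v1 logic above, with thread A's preemption
simulated by hand):

/* Demonstrates two threads both "owning" the v1 ticketlock after a
 * 16-bit wrap-around. Build with: gcc -Wall -o tkt_demo tkt_demo.c */
#include <stdint.h>
#include <stdio.h>

struct tkt { uint16_t current; uint16_t next; };

static void take(struct tkt *t)    { t->next++; }    /* lock(): grab a ticket */
static void release(struct tkt *t) { t->current++; } /* unlock() */

int main(void)
{
	struct tkt tl = { .current = 1, .next = 1 };	/* lock is free */

	/* Thread A enters trylock and loads both counters ... */
	uint16_t a_next = tl.next;	/* 1 */
	uint16_t a_cur = tl.current;	/* 1 */
	/* ... and is preempted here, before the CAS on tl->next. */

	/* Meanwhile, UINT16_MAX lock/unlock pairs wrap both counters
	 * back to exactly the values A observed ... */
	for (uint32_t i = 0; i < UINT16_MAX; i++) {
		take(&tl);
		release(&tl);
	}
	/* ... and then thread B takes (and keeps) the lock. */
	take(&tl);	/* now tl.next == 1, tl.current == 0: held by B */

	/* A wakes up: its CAS compares tl.next (1) against a_next (1),
	 * succeeds, and A also believes it owns the lock. */
	if (a_next == a_cur && tl.next == a_next) {
		tl.next = (uint16_t)(a_next + 1);	/* the CAS, simulated */
		printf("A acquired a lock already held by B (next=%u, current=%u)\n",
			tl.next, tl.current);
	}
	return 0;
}

Running it should print the double acquisition, matching the scenario above.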
> Anyway, I have made a patch, currently in internal review, to fix this issue. The basic
> idea is to compare not only next but also current, and update next (+1, taking the
> lock) only if both of them are unchanged (or have wrapped around with the lock
> released).
> I will submit the patch after the internal review is approved. Please let me know if
> you have more comments.

Ok, thanks
Konstantin
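P.S. Just to illustrate the direction (a rough sketch of the idea you describe, not
your actual patch): if the two 16-bit counters are folded into one 32-bit word, a
single compare-exchange can check and update both at once, so the CAS fails whenever
either field has changed since it was read:

/* Sketch only; assumes the two 16-bit fields are combined in a union
 * so that one 32-bit CAS covers both of them. */
#include <stdint.h>

typedef union {
	uint32_t tickets;
	struct {
		uint16_t current;
		uint16_t next;
	} s;
} rte_ticketlock_t;

static inline int
rte_ticketlock_trylock(rte_ticketlock_t *tl)
{
	rte_ticketlock_t old, new;

	old.tickets = __atomic_load_n(&tl->tickets, __ATOMIC_RELAXED);
	if (old.s.next != old.s.current)
		return 0;	/* lock is held by someone else */
	new.tickets = old.tickets;
	new.s.next++;
	/* The 32-bit CAS fails if *either* field changed since the load,
	 * which closes the wrap-around window discussed above. */
	return __atomic_compare_exchange_n(&tl->tickets, &old.tickets,
		new.tickets, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

(Returning 0 instead of retrying on CAS failure keeps trylock non-blocking; a
lost race just means someone else touched the lock in the meantime.)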