In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8 seconds after pcap_init is called when using a TSC with a frequency of 2.5GHz. To avoid the overflow, reread the time and TSC once delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there is no overflow if there is a several second gap between calls to pcapng_tsc_to_ns() the actual check to reread the clock is: delta > ((1ULL << 63) / NSEC_PER_SEC) Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files") Cc: stable@dpdk.org Signed-off-by: Quentin Armitage <quentin@armitage.org.uk> --- lib/pcapng/rte_pcapng.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c index 90b2f5bc69..7770be725f 100644 --- a/lib/pcapng/rte_pcapng.c +++ b/lib/pcapng/rte_pcapng.c @@ -34,7 +34,7 @@ struct rte_pcapng { }; /* For converting TSC cycles to PCAPNG ns format */ -struct pcapng_time { +static struct pcapng_time { uint64_t ns; uint64_t cycles; } pcapng_time; @@ -53,7 +53,21 @@ static uint64_t pcapng_tsc_to_ns(uint64_t cycles) { uint64_t delta; + /* With a TSC frequency of 2.5GHz, delta * NSEC_PER_SEC will + * wrap in under 8 seconds. Once half that time has elapsed + * reread the system clock and TSC to ensure wrapping does not + * occur. + */ delta = cycles - pcapng_time.cycles; + if (delta > ((1ULL << 63) / NSEC_PER_SEC)) { + pcapng_init(); + if (cycles > pcapng_time.cycles) + delta = cycles - pcapng_time.cycles; + else { + delta = pcapng_time.cycles - cycles; + return pcapng_time.ns - (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); + } + } return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); } -- 2.34.1
On Sat, 7 May 2022 17:12:36 +0100
Quentin Armitage <quentin@armitage.org.uk> wrote:
> In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8
> seconds after pcap_init is called when using a TSC with a frequency
> of 2.5GHz.
>
> To avoid the overflow, reread the time and TSC once
> delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there
> is no overflow if there is a several second gap between calls to
> pcapng_tsc_to_ns() the actual check to reread the clock is:
> delta > ((1ULL << 63) / NSEC_PER_SEC)
>
> Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files")
> Cc: stable@dpdk.org
>
> Signed-off-by: Quentin Armitage <quentin@armitage.org.uk>
> ---
> lib/pcapng/rte_pcapng.c | 15 ++++++++++++++-
> 1 file changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
> index 90b2f5bc69..7770be725f 100644
> --- a/lib/pcapng/rte_pcapng.c
> +++ b/lib/pcapng/rte_pcapng.c
> @@ -34,7 +34,7 @@ struct rte_pcapng {
> };
>
> /* For converting TSC cycles to PCAPNG ns format */
> -struct pcapng_time {
> +static struct pcapng_time {
> uint64_t ns;
> uint64_t cycles;
> } pcapng_time;
> @@ -53,7 +53,21 @@ static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
> {
> uint64_t delta;
>
> + /* With a TSC frequency of 2.5GHz, delta * NSEC_PER_SEC will
> + * wrap in under 8 seconds. Once half that time has elapsed
> + * reread the system clock and TSC to ensure wrapping does not
> + * occur.
> + */
> delta = cycles - pcapng_time.cycles;
> + if (delta > ((1ULL << 63) / NSEC_PER_SEC)) {
> + pcapng_init();
> + if (cycles > pcapng_time.cycles)
> + delta = cycles - pcapng_time.cycles;
> + else {
> + delta = pcapng_time.cycles - cycles;
> + return pcapng_time.ns - (delta * NSEC_PER_SEC) / rte_get_tsc_hz();
> + }
> + }
> return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz();
> }
>
Can't this be fixed by scaling better? Calling pcapng_init in the fast path would
cause a system call; that's bad.
On Sat, 7 May 2022 17:12:36 +0100 Quentin Armitage <quentin@armitage.org.uk> wrote: > In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8 > seconds after pcap_init is called when using a TSC with a frequency > of 2.5GHz. > > To avoid the overflow, reread the time and TSC once > delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there > is no overflow if there is a several second gap between calls to > pcapng_tsc_to_ns() the actual check to reread the clock is: > delta > ((1ULL << 63) / NSEC_PER_SEC) > > Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files") > Cc: stable@dpdk.org > > Signed-off-by: Quentin Armitage <quentin@armitage.org.uk> What about something like this instead. diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c index 90b2f5bc6905..c5534301bf2c 100644 --- a/lib/pcapng/rte_pcapng.c +++ b/lib/pcapng/rte_pcapng.c @@ -19,6 +19,7 @@ #include <rte_ether.h> #include <rte_mbuf.h> #include <rte_pcapng.h> +#include <rte_reciprocal.h> #include <rte_time.h> #include "pcapng_proto.h" @@ -34,27 +35,39 @@ struct rte_pcapng { }; /* For converting TSC cycles to PCAPNG ns format */ -struct pcapng_time { +#define TICK_SCALE 16u +static struct { uint64_t ns; uint64_t cycles; + struct rte_reciprocal_u64 inverse; } pcapng_time; RTE_INIT(pcapng_init) { struct timespec ts; + uint64_t scale_tick_per_ns; pcapng_time.cycles = rte_get_tsc_cycles(); clock_gettime(CLOCK_REALTIME, &ts); pcapng_time.ns = rte_timespec_to_ns(&ts); + + scale_tick_per_ns = (rte_get_tsc_hz() * TICK_SCALE) / NSEC_PER_SEC; + pcapng_time.inverse = rte_reciprocal_value_u64(scale_tick_per_ns); } /* PCAPNG timestamps are in nanoseconds */ static uint64_t pcapng_tsc_to_ns(uint64_t cycles) { - uint64_t delta; + uint64_t delta, elapsed; delta = cycles - pcapng_time.cycles; - return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); + + /* Compute elapsed time in nanoseconds scaled by TICK_SCALE + * since the start of the capture. 
+ * With scale of 4 this will roll over in 36 years. + */ + elapsed = rte_reciprocal_divide_u64(delta, &pcapng_time.inverse); + return pcapng_time.ns + elapsed / TICK_SCALE; } /* length of option including padding */
On Wed, 2022-05-11 at 09:46 -0700, Stephen Hemminger wrote: > On Sat, 7 May 2022 17:12:36 +0100 > Quentin Armitage <quentin@armitage.org.uk> wrote: > > > In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8 > > seconds after pcap_init is called when using a TSC with a frequency > > of 2.5GHz. > > > > To avoid the overflow, reread the time and TSC once > > delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there > > is no overflow if there is a several second gap between calls to > > pcapng_tsc_to_ns() the actual check to reread the clock is: > > delta > ((1ULL << 63) / NSEC_PER_SEC) > > > > Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files") > > Cc: stable@dpdk.org > > > > Signed-off-by: Quentin Armitage <quentin@armitage.org.uk> > > What about something like this instead. > > diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c > index 90b2f5bc6905..c5534301bf2c 100644 > --- a/lib/pcapng/rte_pcapng.c > +++ b/lib/pcapng/rte_pcapng.c > @@ -19,6 +19,7 @@ > #include <rte_ether.h> > #include <rte_mbuf.h> > #include <rte_pcapng.h> > +#include <rte_reciprocal.h> > #include <rte_time.h> > > #include "pcapng_proto.h" > @@ -34,27 +35,39 @@ struct rte_pcapng { > }; > > /* For converting TSC cycles to PCAPNG ns format */ > -struct pcapng_time { > +#define TICK_SCALE 16u > +static struct { > uint64_t ns; > uint64_t cycles; > + struct rte_reciprocal_u64 inverse; > } pcapng_time; > > RTE_INIT(pcapng_init) > { > struct timespec ts; > + uint64_t scale_tick_per_ns; > > pcapng_time.cycles = rte_get_tsc_cycles(); > clock_gettime(CLOCK_REALTIME, &ts); > pcapng_time.ns = rte_timespec_to_ns(&ts); > + > + scale_tick_per_ns = (rte_get_tsc_hz() * TICK_SCALE) / NSEC_PER_SEC; > + pcapng_time.inverse = rte_reciprocal_value_u64(scale_tick_per_ns); > } > > /* PCAPNG timestamps are in nanoseconds */ > static uint64_t pcapng_tsc_to_ns(uint64_t cycles) > { > - uint64_t delta; > + uint64_t delta, elapsed; > > delta = cycles - pcapng_time.cycles; > - 
return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); > + > + /* Compute elapsed time in nanoseconds scaled by TICK_SCALE > + * since the start of the capture. > + * With scale of 4 this will roll over in 36 years. > + */ > + elapsed = rte_reciprocal_divide_u64(delta, &pcapng_time.inverse); > + return pcapng_time.ns + elapsed / TICK_SCALE; > } > > /* length of option including padding */ > The final statement of pcapng_tsc_to_ns() should be: return pcapng_time.ns + elapsed * TICK_SCALE; There is also a problem that rte_get_tsc_hz() returns eal_tsc_resolution_hz, but this is not initialized until rte_eal_init() is called, so rte_get_tsc_hz() cannot be called from a constructor function. While both of the above problems can easily be solved, I think there is a problem with accuracy with this approach. With a 3GHz clock, scale_tick_per_ns would be 48. For other clock speeds there can be a truncation in the calculation. With a 3.3GHz clock, scale_tick_per_ns will be truncated from 52.8 to 52, resulting in a 1.5% or so error in the time returned by pcapng_tsc_to_ns() (a 2.3GHz clock results in a 2.2% error). Increasing TICK_SCALE reduces the %age error, but also reduces the time before overflow occurs. If the approach in the following patch is considered to be acceptable, I would be very happy to submit an updated patch. The one concern I have about the patch is introducing a new constructor priority, RTE_PRIORITY_TIMER, which may be considered to be inappropriate. If it is inappropriate, then the simplest alternative would be to introduce a new function rte_tsc_get_hz_init() which calls set_tsc_freq() if eal_tsc_resolution_hz has not been initialized (alternatively rte_get_tsc_hz() could be modified to make the check, but that then produces an overhead every time the function is called). 
diff --git a/lib/eal/common/eal_common_timer.c b/lib/eal/common/eal_common_timer.c index 5686a5102b..cb3fa1e240 100644 --- a/lib/eal/common/eal_common_timer.c +++ b/lib/eal/common/eal_common_timer.c @@ -54,6 +54,9 @@ set_tsc_freq(void) struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; uint64_t freq; + if (eal_tsc_resolution_hz) + return; + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { /* * Just use the primary process calculated TSC rate in any @@ -86,3 +89,8 @@ RTE_INIT(rte_timer_init) /* set rte_delay_us_block as a delay function */ rte_delay_us_callback_register(rte_delay_us_block); } + +RTE_INIT_PRIO(rte_tsc_init, TIMER) +{ + set_tsc_freq(); +} diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h index 67587025ab..a0d64ff4f2 100644 --- a/lib/eal/include/rte_common.h +++ b/lib/eal/include/rte_common.h @@ -161,6 +161,7 @@ typedef uint16_t unaligned_uint16_t; #define RTE_PRIORITY_LOG 101 #define RTE_PRIORITY_BUS 110 +#define RTE_PRIORITY_TIMER 115 #define RTE_PRIORITY_CLASS 120 #define RTE_PRIORITY_LAST 65535 diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c index 90b2f5bc69..09d42bbc9a 100644 --- a/lib/pcapng/rte_pcapng.c +++ b/lib/pcapng/rte_pcapng.c @@ -19,6 +19,7 @@ #include <rte_ether.h> #include <rte_mbuf.h> #include <rte_pcapng.h> +#include <rte_reciprocal.h> #include <rte_time.h> #include "pcapng_proto.h" @@ -34,9 +35,11 @@ struct rte_pcapng { }; /* For converting TSC cycles to PCAPNG ns format */ -struct pcapng_time { +static struct { uint64_t ns; uint64_t cycles; + uint64_t tsc_hz; + struct rte_reciprocal_u64 tsc_hz_inverse; } pcapng_time; RTE_INIT(pcapng_init) @@ -45,16 +48,45 @@ RTE_INIT(pcapng_init) pcapng_time.cycles = rte_get_tsc_cycles(); clock_gettime(CLOCK_REALTIME, &ts); + pcapng_time.cycles = (pcapng_time.cycles + rte_get_tsc_cycles()) / 2; pcapng_time.ns = rte_timespec_to_ns(&ts); + + pcapng_time.tsc_hz = rte_get_tsc_hz(); + pcapng_time.tsc_hz_inverse = 
rte_reciprocal_value_u64(pcapng_time.tsc_hz); } /* PCAPNG timestamps are in nanoseconds */ static uint64_t pcapng_tsc_to_ns(uint64_t cycles) { - uint64_t delta; - + uint64_t delta, secs; + + /* In essence the calculation is: + * delta = (cycles - pcapng_time.cycles) * NSEC_PRE_SEC / rte_get_tsc_hz() + * but this overflows within 4 to 8 seconds depending on TSC frequency. + * Instead, if delta >= pcapng_time.tsc_hz: + * Increase pcapng_time.ns and pcapng_time.cycles by the number of + * whole seconds in delta and reduce delta accordingly. + * delta will therefore always lie in the interval [0, pcapng_time.tsc_hz), + * which will not overflow when multiplied by NSEC_PER_SEC provided the + * TSC frequency < approx 18.4GHz. + * + * Currently all TSCs operate below 5GHz. + */ delta = cycles - pcapng_time.cycles; - return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); + if (unlikely(delta >= pcapng_time.tsc_hz)) { + if (likely(delta < pcapng_time.tsc_hz * 2)) { + delta -= pcapng_time.tsc_hz; + pcapng_time.cycles += pcapng_time.tsc_hz; + pcapng_time.ns += NSEC_PER_SEC; + } else { + secs = rte_reciprocal_divide_u64(delta, &pcapng_time.tsc_hz_inverse); + delta -= secs * pcapng_time.tsc_hz; + pcapng_time.cycles += secs * pcapng_time.tsc_hz; + pcapng_time.ns += secs * NSEC_PER_SEC; + } + } + + return pcapng_time.ns + rte_reciprocal_divide_u64(delta * NSEC_PER_SEC, &pcapng_time.tsc_hz_inverse); } /* length of option including padding */
On Sat, 14 May 2022 18:14:48 +0100
Quentin Armitage <quentin@armitage.org.uk> wrote:
> The final statement of pcapng_tsc_to_ns() should be:
> return pcapng_time.ns + elapsed * TICK_SCALE;
>
> There is also a problem that rte_get_tsc_hz() returns eal_tsc_resolution_hz, but
> this is not initialized until rte_eal_init() is called, so rte_get_tsc_hz()
> cannot be called from a constructor function.
Better to do initialization on first use then.
It would be safer than the constructor.
In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8 seconds after pcap_init is called when using a TSC with a frequency of 2.5GHz. To avoid the overflow, update the saved time and TSC value once delta >= tsc_hz. Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files") Cc: stable@dpdk.org Signed-off-by: Quentin Armitage <quentin@armitage.org.uk> --- v2: - Don't call clock_gettime() in fast path - Update pcapng_time.ns and pcapng_time.cycles to ensure delta < tsc_hz - Stop using constructor to initialise pcapng_time.tsc_hz since it is not initialised until rte_eal_init() is called - use mean value of TSC before and after call to clock_gettime() - only call rte_get_tsc_hz() once - use rte_reciprocal functions instead of division lib/pcapng/rte_pcapng.c | 47 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c index 90b2f5bc69..06ad712bd1 100644 --- a/lib/pcapng/rte_pcapng.c +++ b/lib/pcapng/rte_pcapng.c @@ -19,6 +19,7 @@ #include <rte_ether.h> #include <rte_mbuf.h> #include <rte_pcapng.h> +#include <rte_reciprocal.h> #include <rte_time.h> #include "pcapng_proto.h" @@ -34,27 +35,63 @@ struct rte_pcapng { }; /* For converting TSC cycles to PCAPNG ns format */ -struct pcapng_time { +static struct pcapng_time { uint64_t ns; uint64_t cycles; + uint64_t tsc_hz; + struct rte_reciprocal_u64 tsc_hz_inverse; } pcapng_time; -RTE_INIT(pcapng_init) +static inline void +pcapng_init(void) { struct timespec ts; pcapng_time.cycles = rte_get_tsc_cycles(); clock_gettime(CLOCK_REALTIME, &ts); + pcapng_time.cycles = (pcapng_time.cycles + rte_get_tsc_cycles()) / 2; pcapng_time.ns = rte_timespec_to_ns(&ts); + + pcapng_time.tsc_hz = rte_get_tsc_hz(); + pcapng_time.tsc_hz_inverse = rte_reciprocal_value_u64(pcapng_time.tsc_hz); } /* PCAPNG timestamps are in nanoseconds */ static uint64_t pcapng_tsc_to_ns(uint64_t cycles) { - uint64_t delta; - + uint64_t delta, secs; 
+ + if (!pcapng_time.tsc_hz) + pcapng_init(); + + /* In essence the calculation is: + * delta = (cycles - pcapng_time.cycles) * NSEC_PRE_SEC / rte_get_tsc_hz() + * but this overflows within 4 to 8 seconds depending on TSC frequency. + * Instead, if delta >= pcapng_time.tsc_hz: + * Increase pcapng_time.ns and pcapng_time.cycles by the number of + * whole seconds in delta and reduce delta accordingly. + * delta will therefore always lie in the interval [0, pcapng_time.tsc_hz), + * which will not overflow when multiplied by NSEC_PER_SEC provided the + * TSC frequency < approx 18.4GHz. + * + * Currently all TSCs operate below 5GHz. + */ delta = cycles - pcapng_time.cycles; - return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz(); + if (unlikely(delta >= pcapng_time.tsc_hz)) { + if (likely(delta < pcapng_time.tsc_hz * 2)) { + delta -= pcapng_time.tsc_hz; + pcapng_time.cycles += pcapng_time.tsc_hz; + pcapng_time.ns += NSEC_PER_SEC; + } else { + secs = rte_reciprocal_divide_u64(delta, &pcapng_time.tsc_hz_inverse); + delta -= secs * pcapng_time.tsc_hz; + pcapng_time.cycles += secs * pcapng_time.tsc_hz; + pcapng_time.ns += secs * NSEC_PER_SEC; + } + } + + return pcapng_time.ns + rte_reciprocal_divide_u64(delta * NSEC_PER_SEC, + &pcapng_time.tsc_hz_inverse); } /* length of option including padding */ -- 2.34.3
On Tue, 17 May 2022 11:01:15 +0100
Quentin Armitage <quentin@armitage.org.uk> wrote:
> In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8
> seconds after pcap_init is called when using a TSC with a frequency
> of 2.5GHz.
>
> To avoid the overflow, update the saved time and TSC value once
> delta >= tsc_hz.
>
> Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files")
> Cc: stable@dpdk.org
>
> Signed-off-by: Quentin Armitage <quentin@armitage.org.uk>
Thanks for fixing this.
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
On Tue, 17 May 2022 11:01:15 +0100
Quentin Armitage <quentin@armitage.org.uk> wrote:
> In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8
> seconds after pcap_init is called when using a TSC with a frequency
> of 2.5GHz.
>
> To avoid the overflow, update the saved time and TSC value once
> delta >= tsc_hz.
>
> Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files")
> Cc: stable@dpdk.org
>
> Signed-off-by: Quentin Armitage <quentin@armitage.org.uk>
> ---
> v2:
> - Don't call clock_gettime() in fast path
> - Update pcapng_time.ns and pcapng_time.cycles to ensure delta < tsc_hz
> - Stop using constructor to initialise pcapng_time.tsc_hz since
> it is not initialised until rte_eal_init() is called
> - use mean value of TSC before and after call to clock_gettime()
> - only call rte_get_tsc_hz() once
> - use rte_reciprocal functions instead of division
>
> lib/pcapng/rte_pcapng.c | 47 ++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 42 insertions(+), 5 deletions(-)
Side note: I looked at an alternative using the timestamp offset option described in the
PCAPNG IETF standard.
if_tsoffset:
The if_tsoffset option is a 64-bit signed integer value that
specifies an offset (in seconds) that must be added to the timestamp of
each packet to obtain the absolute timestamp of a packet. If the option
is missing, the timestamps stored in the packet MUST be considered
absolute timestamps. The time zone of the offset can be specified with
the option if_tzone.
But it is not supported even by current Wireshark, so it is not useful.
17/05/2022 17:15, Stephen Hemminger:
> On Tue, 17 May 2022 11:01:15 +0100
> Quentin Armitage <quentin@armitage.org.uk> wrote:
>
> > In pcap_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8
> > seconds after pcap_init is called when using a TSC with a frequency
> > of 2.5GHz.
> >
> > To avoid the overflow, update the saved time and TSC value once
> > delta >= tsc_hz.
> >
> > Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Quentin Armitage <quentin@armitage.org.uk>
>
> Thanks for fixing this.
>
> Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Applied, thanks.