From: Quentin Armitage <quentin@armitage.org.uk>
To: Stephen Hemminger <stephen@networkplumber.org>
Cc: Reshma Pattan <reshma.pattan@intel.com>,
Ray Kinsella <mdr@ashroe.eu>,
dev@dpdk.org, stable@dpdk.org
Subject: Re: [PATCH] libpcapng: fix timestamp wrapping in output files
Date: Sat, 14 May 2022 18:14:48 +0100 [thread overview]
Message-ID: <45bdbfad3857848faa2edc185db1ac639a901b23.camel@armitage.org.uk> (raw)
In-Reply-To: <20220511094655.4f885c84@hermes.local>
On Wed, 2022-05-11 at 09:46 -0700, Stephen Hemminger wrote:
> On Sat, 7 May 2022 17:12:36 +0100
> Quentin Armitage <quentin@armitage.org.uk> wrote:
>
> > In pcapng_tsc_to_ns(), delta * NSEC_PER_SEC will overflow approx 8
> > seconds after pcapng_init is called when using a TSC with a frequency
> > of 2.5GHz.
> >
> > To avoid the overflow, reread the time and TSC once
> > delta * NSEC_PER_SEC > (1 << 63). In order to ensure that there
> > is no overflow if there is a several second gap between calls to
> > pcapng_tsc_to_ns() the actual check to reread the clock is:
> > delta > ((1ULL << 63) / NSEC_PER_SEC)
> >
> > Fixes: 8d23ce8f5ee ("pcapng: add new library for writing pcapng files")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Quentin Armitage <quentin@armitage.org.uk>
>
> What about something like this instead.
>
> diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
> index 90b2f5bc6905..c5534301bf2c 100644
> --- a/lib/pcapng/rte_pcapng.c
> +++ b/lib/pcapng/rte_pcapng.c
> @@ -19,6 +19,7 @@
> #include <rte_ether.h>
> #include <rte_mbuf.h>
> #include <rte_pcapng.h>
> +#include <rte_reciprocal.h>
> #include <rte_time.h>
>
> #include "pcapng_proto.h"
> @@ -34,27 +35,39 @@ struct rte_pcapng {
> };
>
> /* For converting TSC cycles to PCAPNG ns format */
> -struct pcapng_time {
> +#define TICK_SCALE 16u
> +static struct {
> uint64_t ns;
> uint64_t cycles;
> + struct rte_reciprocal_u64 inverse;
> } pcapng_time;
>
> RTE_INIT(pcapng_init)
> {
> struct timespec ts;
> + uint64_t scale_tick_per_ns;
>
> pcapng_time.cycles = rte_get_tsc_cycles();
> clock_gettime(CLOCK_REALTIME, &ts);
> pcapng_time.ns = rte_timespec_to_ns(&ts);
> +
> + scale_tick_per_ns = (rte_get_tsc_hz() * TICK_SCALE) / NSEC_PER_SEC;
> + pcapng_time.inverse = rte_reciprocal_value_u64(scale_tick_per_ns);
> }
>
> /* PCAPNG timestamps are in nanoseconds */
> static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
> {
> - uint64_t delta;
> + uint64_t delta, elapsed;
>
> delta = cycles - pcapng_time.cycles;
> - return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz();
> +
> + /* Compute elapsed time in nanoseconds scaled by TICK_SCALE
> + * since the start of the capture.
> + * With scale of 4 this will roll over in 36 years.
> + */
> + elapsed = rte_reciprocal_divide_u64(delta, &pcapng_time.inverse);
> + return pcapng_time.ns + elapsed / TICK_SCALE;
> }
>
> /* length of option including padding */
>
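The final statement of pcapng_tsc_to_ns() should be:
return pcapng_time.ns + elapsed * TICK_SCALE;
This is because scale_tick_per_ns is the number of ticks per ns multiplied by
TICK_SCALE, so dividing delta by it yields the elapsed ns divided by TICK_SCALE;
the result must therefore be multiplied, not divided, by TICK_SCALE.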
There is also a problem that rte_get_tsc_hz() returns eal_tsc_resolution_hz, but
this is not initialized until rte_eal_init() is called, so rte_get_tsc_hz()
cannot be called from a constructor function.
While both of the above problems can easily be solved, I think this approach
has an accuracy problem. With a 3GHz clock, scale_tick_per_ns would be exactly
48, but for other clock speeds the integer division truncates the result. With
a 3.3GHz clock, scale_tick_per_ns will be truncated from 52.8 to 52, resulting
in an error of about 1.5% in the time returned by pcapng_tsc_to_ns() (a 2.3GHz
clock results in a 2.2% error). Increasing TICK_SCALE reduces the percentage
error, but also reduces the time before overflow occurs.
If the approach in the following patch is considered to be acceptable, I would
be very happy to submit an updated patch. The one concern I have about the patch
is introducing a new constructor priority, RTE_PRIORITY_TIMER, which may be
considered to be inappropriate. If it is inappropriate, then the simplest
alternative would be to introduce a new function rte_tsc_get_hz_init() which
calls set_tsc_freq() if eal_tsc_resolution_hz has not been initialized
(alternatively rte_get_tsc_hz() could be modified to make the check, but that
then produces an overhead every time the function is called).
diff --git a/lib/eal/common/eal_common_timer.c
b/lib/eal/common/eal_common_timer.c
index 5686a5102b..cb3fa1e240 100644
--- a/lib/eal/common/eal_common_timer.c
+++ b/lib/eal/common/eal_common_timer.c
@@ -54,6 +54,9 @@ set_tsc_freq(void)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
uint64_t freq;
+ if (eal_tsc_resolution_hz)
+ return;
+
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
/*
* Just use the primary process calculated TSC rate in any
@@ -86,3 +89,8 @@ RTE_INIT(rte_timer_init)
/* set rte_delay_us_block as a delay function */
rte_delay_us_callback_register(rte_delay_us_block);
}
+
+RTE_INIT_PRIO(rte_tsc_init, TIMER)
+{
+ set_tsc_freq();
+}
diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h
index 67587025ab..a0d64ff4f2 100644
--- a/lib/eal/include/rte_common.h
+++ b/lib/eal/include/rte_common.h
@@ -161,6 +161,7 @@ typedef uint16_t unaligned_uint16_t;
#define RTE_PRIORITY_LOG 101
#define RTE_PRIORITY_BUS 110
+#define RTE_PRIORITY_TIMER 115
#define RTE_PRIORITY_CLASS 120
#define RTE_PRIORITY_LAST 65535
diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
index 90b2f5bc69..09d42bbc9a 100644
--- a/lib/pcapng/rte_pcapng.c
+++ b/lib/pcapng/rte_pcapng.c
@@ -19,6 +19,7 @@
#include <rte_ether.h>
#include <rte_mbuf.h>
#include <rte_pcapng.h>
+#include <rte_reciprocal.h>
#include <rte_time.h>
#include "pcapng_proto.h"
@@ -34,9 +35,11 @@ struct rte_pcapng {
};
/* For converting TSC cycles to PCAPNG ns format */
-struct pcapng_time {
+static struct {
uint64_t ns;
uint64_t cycles;
+ uint64_t tsc_hz;
+ struct rte_reciprocal_u64 tsc_hz_inverse;
} pcapng_time;
RTE_INIT(pcapng_init)
@@ -45,16 +48,45 @@ RTE_INIT(pcapng_init)
pcapng_time.cycles = rte_get_tsc_cycles();
clock_gettime(CLOCK_REALTIME, &ts);
+ pcapng_time.cycles = (pcapng_time.cycles + rte_get_tsc_cycles()) / 2;
pcapng_time.ns = rte_timespec_to_ns(&ts);
+
+ pcapng_time.tsc_hz = rte_get_tsc_hz();
+ pcapng_time.tsc_hz_inverse =
rte_reciprocal_value_u64(pcapng_time.tsc_hz);
}
/* PCAPNG timestamps are in nanoseconds */
static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
{
- uint64_t delta;
-
+ uint64_t delta, secs;
+
+ /* In essence the calculation is:
+ * delta = (cycles - pcapng_time.cycles) * NSEC_PER_SEC /
rte_get_tsc_hz()
+ * but this overflows within 4 to 8 seconds depending on TSC frequency.
+ * Instead, if delta >= pcapng_time.tsc_hz:
+ * Increase pcapng_time.ns and pcapng_time.cycles by the number of
+ * whole seconds in delta and reduce delta accordingly.
+ * delta will therefore always lie in the interval [0,
pcapng_time.tsc_hz),
+ * which will not overflow when multiplied by NSEC_PER_SEC provided the
+ * TSC frequency < approx 18.4GHz.
+ *
+ * Currently all TSCs operate below 5GHz.
+ */
delta = cycles - pcapng_time.cycles;
- return pcapng_time.ns + (delta * NSEC_PER_SEC) / rte_get_tsc_hz();
+ if (unlikely(delta >= pcapng_time.tsc_hz)) {
+ if (likely(delta < pcapng_time.tsc_hz * 2)) {
+ delta -= pcapng_time.tsc_hz;
+ pcapng_time.cycles += pcapng_time.tsc_hz;
+ pcapng_time.ns += NSEC_PER_SEC;
+ } else {
+ secs = rte_reciprocal_divide_u64(delta,
&pcapng_time.tsc_hz_inverse);
+ delta -= secs * pcapng_time.tsc_hz;
+ pcapng_time.cycles += secs * pcapng_time.tsc_hz;
+ pcapng_time.ns += secs * NSEC_PER_SEC;
+ }
+ }
+
+ return pcapng_time.ns + rte_reciprocal_divide_u64(delta * NSEC_PER_SEC,
&pcapng_time.tsc_hz_inverse);
}
/* length of option including padding */
next prev parent reply other threads:[~2022-05-14 17:15 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-05-07 16:12 Quentin Armitage
2022-05-11 16:08 ` Stephen Hemminger
2022-05-11 16:46 ` Stephen Hemminger
2022-05-14 17:14 ` Quentin Armitage [this message]
2022-05-16 13:26 ` Stephen Hemminger
2022-05-17 10:01 ` [PATCH v2] " Quentin Armitage
2022-05-17 15:15 ` Stephen Hemminger
2022-06-01 14:38 ` Thomas Monjalon
2022-05-17 21:04 ` Stephen Hemminger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=45bdbfad3857848faa2edc185db1ac639a901b23.camel@armitage.org.uk \
--to=quentin@armitage.org.uk \
--cc=dev@dpdk.org \
--cc=mdr@ashroe.eu \
--cc=reshma.pattan@intel.com \
--cc=stable@dpdk.org \
--cc=stephen@networkplumber.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).