DPDK patches and discussions
 help / color / mirror / Atom feed
* [RFC] net/af_packet: make stats reset reliable
@ 2024-04-25 17:46 Ferruh Yigit
  2024-04-26 11:33 ` Morten Brørup
                   ` (3 more replies)
  0 siblings, 4 replies; 42+ messages in thread
From: Ferruh Yigit @ 2024-04-25 17:46 UTC (permalink / raw)
  To: John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

For stats reset, use an offset instead of zeroing out actual stats values,
get_stats() displays diff between stats and offset.
This way stats only updated in datapath and offset only updated in stats
reset function. This makes stats reset function more reliable.

As stats only written by single thread, we can remove 'volatile' qualifier
which should improve the performance in datapath.

Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
---
Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Morten Brørup <mb@smartsharesystems.com>

This update triggered by mail list discussion [1].

[1]
https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
---
 drivers/net/af_packet/rte_eth_af_packet.c | 69 +++++++++++++++--------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 397a32db5886..2061cdab4997 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -51,8 +51,10 @@ struct pkt_rx_queue {
 	uint16_t in_port;
 	uint8_t vlan_strip;
 
-	volatile unsigned long rx_pkts;
-	volatile unsigned long rx_bytes;
+	uint64_t rx_pkts;
+	uint64_t rx_bytes;
+	uint64_t rx_pkts_offset;
+	uint64_t rx_bytes_offset;
 };
 
 struct pkt_tx_queue {
@@ -64,9 +66,12 @@ struct pkt_tx_queue {
 	unsigned int framecount;
 	unsigned int framenum;
 
-	volatile unsigned long tx_pkts;
-	volatile unsigned long err_pkts;
-	volatile unsigned long tx_bytes;
+	uint64_t tx_pkts;
+	uint64_t err_pkts;
+	uint64_t tx_bytes;
+	uint64_t tx_pkts_offset;
+	uint64_t err_pkts_offset;
+	uint64_t tx_bytes_offset;
 };
 
 struct pmd_internals {
@@ -385,8 +390,18 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	return 0;
 }
 
+
+static uint64_t
+stats_get_diff(uint64_t stats, uint64_t offset)
+{
+	if (stats >= offset)
+		return stats - offset;
+	/* unlikely wraparound case */
+	return UINT64_MAX + stats - offset;
+}
+
 static int
-eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 {
 	unsigned i, imax;
 	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
@@ -396,27 +411,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
 	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
 	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
 	for (i = 0; i < imax; i++) {
-		igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
-		igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
-		rx_total += igb_stats->q_ipackets[i];
-		rx_bytes_total += igb_stats->q_ibytes[i];
+		struct pkt_rx_queue *rxq = &internal->rx_queue[i];
+		stats->q_ipackets[i] = stats_get_diff(rxq->rx_pkts, rxq->rx_pkts_offset);
+		stats->q_ibytes[i] = stats_get_diff(rxq->rx_bytes, rxq->rx_bytes_offset);
+		rx_total += stats->q_ipackets[i];
+		rx_bytes_total += stats->q_ibytes[i];
 	}
 
 	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
 	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
 	for (i = 0; i < imax; i++) {
-		igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
-		igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
-		tx_total += igb_stats->q_opackets[i];
-		tx_err_total += internal->tx_queue[i].err_pkts;
-		tx_bytes_total += igb_stats->q_obytes[i];
+		struct pkt_tx_queue *txq = &internal->tx_queue[i];
+		stats->q_opackets[i] = stats_get_diff(txq->tx_pkts, txq->tx_pkts_offset);
+		stats->q_obytes[i] = stats_get_diff(txq->tx_bytes, txq->tx_bytes_offset);
+		tx_total += stats->q_opackets[i];
+		tx_err_total += stats_get_diff(txq->err_pkts, txq->err_pkts_offset);
+		tx_bytes_total += stats->q_obytes[i];
 	}
 
-	igb_stats->ipackets = rx_total;
-	igb_stats->ibytes = rx_bytes_total;
-	igb_stats->opackets = tx_total;
-	igb_stats->oerrors = tx_err_total;
-	igb_stats->obytes = tx_bytes_total;
+	stats->ipackets = rx_total;
+	stats->ibytes = rx_bytes_total;
+	stats->opackets = tx_total;
+	stats->oerrors = tx_err_total;
+	stats->obytes = tx_bytes_total;
 	return 0;
 }
 
@@ -427,14 +444,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
 	struct pmd_internals *internal = dev->data->dev_private;
 
 	for (i = 0; i < internal->nb_queues; i++) {
-		internal->rx_queue[i].rx_pkts = 0;
-		internal->rx_queue[i].rx_bytes = 0;
+		struct pkt_rx_queue *rxq = &internal->rx_queue[i];
+		rxq->rx_pkts_offset = rxq->rx_pkts;
+		rxq->rx_bytes_offset = rxq->rx_bytes;
 	}
 
 	for (i = 0; i < internal->nb_queues; i++) {
-		internal->tx_queue[i].tx_pkts = 0;
-		internal->tx_queue[i].err_pkts = 0;
-		internal->tx_queue[i].tx_bytes = 0;
+		struct pkt_tx_queue *txq = &internal->tx_queue[i];
+		txq->tx_pkts_offset = txq->tx_pkts;
+		txq->err_pkts_offset = txq->err_pkts;
+		txq->tx_bytes_offset = txq->tx_bytes;
 	}
 
 	return 0;
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [RFC] net/af_packet: make stats reset reliable
  2024-04-25 17:46 [RFC] net/af_packet: make stats reset reliable Ferruh Yigit
@ 2024-04-26 11:33 ` Morten Brørup
  2024-04-26 13:37   ` Ferruh Yigit
  2024-04-28 15:42   ` Mattias Rönnblom
  2024-04-26 14:38 ` [RFC v2] " Ferruh Yigit
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 42+ messages in thread
From: Morten Brørup @ 2024-04-26 11:33 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

> +static uint64_t
> +stats_get_diff(uint64_t stats, uint64_t offset)
> +{
> +	if (stats >= offset)
> +		return stats - offset;
> +	/* unlikely wraparound case */
> +	return UINT64_MAX + stats - offset;

The numbers are unsigned, so wrapping comes for free.

Remove the comparison and always return stats - offset.

Using uint8_t for easier explanation, if offset is 255 and stats is 0, then the diff should be 1.
Returning stats - offset:
stats - offset = 0 - 255 = 0 - 0xFF = 1.

Returning UINT8_MAX + stats - offset is wrong:
UINT8_MAX + stats - offset = 255 - 0 - 255 = 0.

Besides that, it looks good to me.


While reviewing, I came across the rx_mbuf_alloc_failed counter in the rte_eth_dev_data structure:
https://elixir.bootlin.com/dpdk/v24.03/source/lib/ethdev/rte_ethdev.c#L3145
https://elixir.bootlin.com/dpdk/v24.03/source/lib/ethdev/ethdev_driver.h#L127

Doesn't it have the same problem?


BTW, the af_packet PMD doesn't increase the rx_mbuf_alloc_failed counter on mbuf allocation failures. But that's a separate bug.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC] net/af_packet: make stats reset reliable
  2024-04-26 11:33 ` Morten Brørup
@ 2024-04-26 13:37   ` Ferruh Yigit
  2024-04-26 14:56     ` Morten Brørup
  2024-04-28 15:42   ` Mattias Rönnblom
  1 sibling, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2024-04-26 13:37 UTC (permalink / raw)
  To: Morten Brørup, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

On 4/26/2024 12:33 PM, Morten Brørup wrote:
>> +static uint64_t
>> +stats_get_diff(uint64_t stats, uint64_t offset)
>> +{
>> +	if (stats >= offset)
>> +		return stats - offset;
>> +	/* unlikely wraparound case */
>> +	return UINT64_MAX + stats - offset;
> 
> The numbers are unsigned, so wrapping comes for free.
> 
> Remove the comparison and always return stats - offset.
> 
> Using uint8_t for easier explanation, if offset is 255 and stats is 0, then the diff should be 1.
> Returning stats - offset:
> stats - offset = 0 - 255 = 0 - 0xFF = 1.
> 
> Returning UINT8_MAX + stats - offset is wrong:
> UINT8_MAX + stats - offset = 255 - 0 - 255 = 0.
> 
> Besides that, it looks good to me.
> 

Yes, it is wrong, and thanks for removing comparison tip.

But thinking twice, taking wrapping into account for a uint64_t variable
can be being too cautious anyway. I will remove it completely.

> 
> While reviewing, I came across the rx_mbuf_alloc_failed counter in the rte_eth_dev_data structure:
> https://elixir.bootlin.com/dpdk/v24.03/source/lib/ethdev/rte_ethdev.c#L3145
> https://elixir.bootlin.com/dpdk/v24.03/source/lib/ethdev/ethdev_driver.h#L127
> 
> Doesn't it have the same problem?
> 

stats reset problem? af_packet is not collecting 'rx_mbuf_alloc_failed',
so nothing to do there for af_packet.

> 
> BTW, the af_packet PMD doesn't increase the rx_mbuf_alloc_failed counter on mbuf allocation failures. But that's a separate bug.
> 

Yes it is missing, but I wouldn't call it a bug, just one of the stats
is missing. And yes this can be handled separately if required.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [RFC v2] net/af_packet: make stats reset reliable
  2024-04-25 17:46 [RFC] net/af_packet: make stats reset reliable Ferruh Yigit
  2024-04-26 11:33 ` Morten Brørup
@ 2024-04-26 14:38 ` Ferruh Yigit
  2024-04-26 14:47   ` Morten Brørup
  2024-04-28 15:11   ` Mattias Rönnblom
  2024-04-26 21:28 ` [RFC] " Patrick Robb
  2024-05-03 15:45 ` [RFC v3] " Ferruh Yigit
  3 siblings, 2 replies; 42+ messages in thread
From: Ferruh Yigit @ 2024-04-26 14:38 UTC (permalink / raw)
  To: John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

For stats reset, use an offset instead of zeroing out actual stats values,
get_stats() displays diff between stats and offset.
This way stats only updated in datapath and offset only updated in stats
reset function. This makes stats reset function more reliable.

As stats only written by single thread, we can remove 'volatile' qualifier
which should improve the performance in datapath.

Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
---
Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Morten Brørup <mb@smartsharesystems.com>

This update triggered by mail list discussion [1].

[1]
https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/

v2:
* Remove wrapping check for stats
---
 drivers/net/af_packet/rte_eth_af_packet.c | 66 ++++++++++++++---------
 1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 397a32db5886..10c8e1e50139 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -51,8 +51,10 @@ struct pkt_rx_queue {
 	uint16_t in_port;
 	uint8_t vlan_strip;
 
-	volatile unsigned long rx_pkts;
-	volatile unsigned long rx_bytes;
+	uint64_t rx_pkts;
+	uint64_t rx_bytes;
+	uint64_t rx_pkts_offset;
+	uint64_t rx_bytes_offset;
 };
 
 struct pkt_tx_queue {
@@ -64,9 +66,12 @@ struct pkt_tx_queue {
 	unsigned int framecount;
 	unsigned int framenum;
 
-	volatile unsigned long tx_pkts;
-	volatile unsigned long err_pkts;
-	volatile unsigned long tx_bytes;
+	uint64_t tx_pkts;
+	uint64_t err_pkts;
+	uint64_t tx_bytes;
+	uint64_t tx_pkts_offset;
+	uint64_t err_pkts_offset;
+	uint64_t tx_bytes_offset;
 };
 
 struct pmd_internals {
@@ -385,8 +390,15 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	return 0;
 }
 
+
+static uint64_t
+stats_get_diff(uint64_t stats, uint64_t offset)
+{
+	return stats - offset;
+}
+
 static int
-eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 {
 	unsigned i, imax;
 	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
@@ -396,27 +408,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
 	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
 	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
 	for (i = 0; i < imax; i++) {
-		igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
-		igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
-		rx_total += igb_stats->q_ipackets[i];
-		rx_bytes_total += igb_stats->q_ibytes[i];
+		struct pkt_rx_queue *rxq = &internal->rx_queue[i];
+		stats->q_ipackets[i] = stats_get_diff(rxq->rx_pkts, rxq->rx_pkts_offset);
+		stats->q_ibytes[i] = stats_get_diff(rxq->rx_bytes, rxq->rx_bytes_offset);
+		rx_total += stats->q_ipackets[i];
+		rx_bytes_total += stats->q_ibytes[i];
 	}
 
 	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
 	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
 	for (i = 0; i < imax; i++) {
-		igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
-		igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
-		tx_total += igb_stats->q_opackets[i];
-		tx_err_total += internal->tx_queue[i].err_pkts;
-		tx_bytes_total += igb_stats->q_obytes[i];
+		struct pkt_tx_queue *txq = &internal->tx_queue[i];
+		stats->q_opackets[i] = stats_get_diff(txq->tx_pkts, txq->tx_pkts_offset);
+		stats->q_obytes[i] = stats_get_diff(txq->tx_bytes, txq->tx_bytes_offset);
+		tx_total += stats->q_opackets[i];
+		tx_err_total += stats_get_diff(txq->err_pkts, txq->err_pkts_offset);
+		tx_bytes_total += stats->q_obytes[i];
 	}
 
-	igb_stats->ipackets = rx_total;
-	igb_stats->ibytes = rx_bytes_total;
-	igb_stats->opackets = tx_total;
-	igb_stats->oerrors = tx_err_total;
-	igb_stats->obytes = tx_bytes_total;
+	stats->ipackets = rx_total;
+	stats->ibytes = rx_bytes_total;
+	stats->opackets = tx_total;
+	stats->oerrors = tx_err_total;
+	stats->obytes = tx_bytes_total;
 	return 0;
 }
 
@@ -427,14 +441,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
 	struct pmd_internals *internal = dev->data->dev_private;
 
 	for (i = 0; i < internal->nb_queues; i++) {
-		internal->rx_queue[i].rx_pkts = 0;
-		internal->rx_queue[i].rx_bytes = 0;
+		struct pkt_rx_queue *rxq = &internal->rx_queue[i];
+		rxq->rx_pkts_offset = rxq->rx_pkts;
+		rxq->rx_bytes_offset = rxq->rx_bytes;
 	}
 
 	for (i = 0; i < internal->nb_queues; i++) {
-		internal->tx_queue[i].tx_pkts = 0;
-		internal->tx_queue[i].err_pkts = 0;
-		internal->tx_queue[i].tx_bytes = 0;
+		struct pkt_tx_queue *txq = &internal->tx_queue[i];
+		txq->tx_pkts_offset = txq->tx_pkts;
+		txq->err_pkts_offset = txq->err_pkts;
+		txq->tx_bytes_offset = txq->tx_bytes;
 	}
 
 	return 0;
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [RFC v2] net/af_packet: make stats reset reliable
  2024-04-26 14:38 ` [RFC v2] " Ferruh Yigit
@ 2024-04-26 14:47   ` Morten Brørup
  2024-04-28 15:11   ` Mattias Rönnblom
  1 sibling, 0 replies; 42+ messages in thread
From: Morten Brørup @ 2024-04-26 14:47 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

> +static uint64_t
> +stats_get_diff(uint64_t stats, uint64_t offset)
> +{
> +	return stats - offset;
> +}

No need for this function; just subtract inline instead.

With the suggested change,
Reviewed-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [RFC] net/af_packet: make stats reset reliable
  2024-04-26 13:37   ` Ferruh Yigit
@ 2024-04-26 14:56     ` Morten Brørup
  0 siblings, 0 replies; 42+ messages in thread
From: Morten Brørup @ 2024-04-26 14:56 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

> From: Ferruh Yigit [mailto:ferruh.yigit@amd.com]
> Sent: Friday, 26 April 2024 15.38
> 
> On 4/26/2024 12:33 PM, Morten Brørup wrote:

[...]

> > While reviewing, I came across the rx_mbuf_alloc_failed counter in the
> rte_eth_dev_data structure:
> > https://elixir.bootlin.com/dpdk/v24.03/source/lib/ethdev/rte_ethdev.c#L3145
> >
> https://elixir.bootlin.com/dpdk/v24.03/source/lib/ethdev/ethdev_driver.h#L127
> >
> > Doesn't it have the same problem?
> >
> 
> stats reset problem? af_packet is not collecting 'rx_mbuf_alloc_failed',
> so nothing to do there for af_packet.

Agreed, not related to af_packet or this patch.

I'm just wondering if a similar or other patch should be applied to rx_mbuf_alloc_failed in the ethdev layer.
rx_mbuf_alloc_failed is shared by lcores, so perhaps it should be atomic, or atomically incremented by drivers using it.

> 
> >
> > BTW, the af_packet PMD doesn't increase the rx_mbuf_alloc_failed counter on
> mbuf allocation failures. But that's a separate bug.
> >
> 
> Yes it is missing, but I wouldn't call it a bug, just one of the stats
> is missing. And yes this can be handled separately if required.

OK, then just some stats missing, not a bug.
Quite useful stats for debugging a production system, if mbuf allocations fail.
But still, not related to this patch.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC] net/af_packet: make stats reset reliable
  2024-04-25 17:46 [RFC] net/af_packet: make stats reset reliable Ferruh Yigit
  2024-04-26 11:33 ` Morten Brørup
  2024-04-26 14:38 ` [RFC v2] " Ferruh Yigit
@ 2024-04-26 21:28 ` Patrick Robb
  2024-05-03 15:45 ` [RFC v3] " Ferruh Yigit
  3 siblings, 0 replies; 42+ messages in thread
From: Patrick Robb @ 2024-04-26 21:28 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom,
	Stephen Hemminger, Morten Brørup

[-- Attachment #1: Type: text/plain, Size: 296 bytes --]

Recheck-request: iol-compile-amd64-testing

The DPDK Community Lab updated to the latest Alpine image yesterday, which
resulted in all Alpine builds failing. The failure is unrelated to your
patch, and this recheck should remove the fail on Patchwork, as we have
disabled Alpine testing for now.

[-- Attachment #2: Type: text/html, Size: 361 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-04-26 14:38 ` [RFC v2] " Ferruh Yigit
  2024-04-26 14:47   ` Morten Brørup
@ 2024-04-28 15:11   ` Mattias Rönnblom
  2024-05-01 16:19     ` Ferruh Yigit
  2024-05-07  7:23     ` Mattias Rönnblom
  1 sibling, 2 replies; 42+ messages in thread
From: Mattias Rönnblom @ 2024-04-28 15:11 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

On 2024-04-26 16:38, Ferruh Yigit wrote:
> For stats reset, use an offset instead of zeroing out actual stats values,
> get_stats() displays diff between stats and offset.
> This way stats only updated in datapath and offset only updated in stats
> reset function. This makes stats reset function more reliable.
> 
> As stats only written by single thread, we can remove 'volatile' qualifier
> which should improve the performance in datapath.
> 

volatile wouldn't help you if you had multiple writers, so that can't be 
the reason for its removal. It would be more accurate to say it should 
be replaced with atomic updates. If you don't use volatile and don't use 
atomics, you have to consider if the compiler can reach the conclusion 
that it does not need to store the counter value for future use *for 
that thread*. Since otherwise, I don't think the store actually needs to 
occur. Since DPDK statistics tend to work, it's pretty obvious that 
current compilers tend not to reach this conclusion.

If this should be done 100% properly, the update operation should be a 
non-atomic load, non-atomic add, and an atomic store. Similarly, for the 
reset, the offset store should be atomic.

Considered the state of the rest of the DPDK code base, I think a 
non-atomic, non-volatile solution is also fine.

(That said, I think we're better off just deprecating stats reset 
altogether, and returning -ENOTSUP here meanwhile.)

> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
> ---
> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Cc: Morten Brørup <mb@smartsharesystems.com>
> 
> This update triggered by mail list discussion [1].
> 
> [1]
> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
> 
> v2:
> * Remove wrapping check for stats
> ---
>   drivers/net/af_packet/rte_eth_af_packet.c | 66 ++++++++++++++---------
>   1 file changed, 41 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
> index 397a32db5886..10c8e1e50139 100644
> --- a/drivers/net/af_packet/rte_eth_af_packet.c
> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
> @@ -51,8 +51,10 @@ struct pkt_rx_queue {
>   	uint16_t in_port;
>   	uint8_t vlan_strip;
>   
> -	volatile unsigned long rx_pkts;
> -	volatile unsigned long rx_bytes;
> +	uint64_t rx_pkts;
> +	uint64_t rx_bytes;
> +	uint64_t rx_pkts_offset;
> +	uint64_t rx_bytes_offset;

I suggest you introduce a separate struct for reset-able counters. It'll 
make things cleaner, and you can sneak in atomics without too much 
atomics-related bloat.

struct counter
{
	uint64_t count;
	uint64_t offset;
};

/../
	struct counter rx_pkts;
	struct counter rx_bytes;
/../

static uint64_t
counter_value(struct counter *counter)
{
	uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
	uint64_t offset = __atomic_load_n(&counter->offset, __ATOMIC_RELAXED);

	return count + offset;
}

static void
counter_reset(struct counter *counter)
{
	uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);

	__atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
}

static void
counter_add(struct counter *counter, uint64_t operand)
{
	__atomic_store_n(&counter->count, counter->count + operand, 
__ATOMIC_RELAXED);
}

You'd have to port this to <rte_stdatomic.h> calls, which prevents 
non-atomic loads from RTE_ATOMIC()s. The non-atomic reads above must be 
replaced with explicit relaxed non-atomic load. Otherwise, if you just 
use "counter->count", that would be an atomic load with sequential 
consistency memory order on C11 atomics-based builds, which would result 
in a barrier, at least on weakly ordered machines (e.g., ARM).

I would still use a struct and some helper-functions even for the less 
ambitious, non-atomic variant.

The only drawback of using GCC built-ins type atomics here, versus an 
atomic- and volatile-free approach, is that current compilers seems to 
refuse merging atomic stores. It's beyond me why this is the case. If 
you store to a variable twice in quick succession, it'll be two store 
machine instructions, even in cases where the compiler *knows* the value 
is identical. So volatile, even though you didn't ask for it. Weird.

So if you have a loop, you may want to make an "counter_add()" in the 
end from a temporary, to get the final 0.001% of performance.

If the tech board thinks MT-safe reset-able software-manage statistics 
is the future (as opposed to dropping reset support, for example), I 
think this stuff should go into a separate header file, so other PMDs 
can reuse it. Maybe out of scope for this patch.

>   };
>   
>   struct pkt_tx_queue {
> @@ -64,9 +66,12 @@ struct pkt_tx_queue {
>   	unsigned int framecount;
>   	unsigned int framenum;
>   
> -	volatile unsigned long tx_pkts;
> -	volatile unsigned long err_pkts;
> -	volatile unsigned long tx_bytes;
> +	uint64_t tx_pkts;
> +	uint64_t err_pkts;
> +	uint64_t tx_bytes;
> +	uint64_t tx_pkts_offset;
> +	uint64_t err_pkts_offset;
> +	uint64_t tx_bytes_offset;
>   };
>   
>   struct pmd_internals {
> @@ -385,8 +390,15 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
>   	return 0;
>   }
>   
> +
> +static uint64_t
> +stats_get_diff(uint64_t stats, uint64_t offset)
> +{
> +	return stats - offset;
> +}
> +
>   static int
> -eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
>   {
>   	unsigned i, imax;
>   	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
> @@ -396,27 +408,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
>   	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>   	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>   	for (i = 0; i < imax; i++) {
> -		igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
> -		igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
> -		rx_total += igb_stats->q_ipackets[i];
> -		rx_bytes_total += igb_stats->q_ibytes[i];
> +		struct pkt_rx_queue *rxq = &internal->rx_queue[i];
> +		stats->q_ipackets[i] = stats_get_diff(rxq->rx_pkts, rxq->rx_pkts_offset);
> +		stats->q_ibytes[i] = stats_get_diff(rxq->rx_bytes, rxq->rx_bytes_offset);
> +		rx_total += stats->q_ipackets[i];
> +		rx_bytes_total += stats->q_ibytes[i];
>   	}
>   
>   	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>   	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>   	for (i = 0; i < imax; i++) {
> -		igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
> -		igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
> -		tx_total += igb_stats->q_opackets[i];
> -		tx_err_total += internal->tx_queue[i].err_pkts;
> -		tx_bytes_total += igb_stats->q_obytes[i];
> +		struct pkt_tx_queue *txq = &internal->tx_queue[i];
> +		stats->q_opackets[i] = stats_get_diff(txq->tx_pkts, txq->tx_pkts_offset);
> +		stats->q_obytes[i] = stats_get_diff(txq->tx_bytes, txq->tx_bytes_offset);
> +		tx_total += stats->q_opackets[i];
> +		tx_err_total += stats_get_diff(txq->err_pkts, txq->err_pkts_offset);
> +		tx_bytes_total += stats->q_obytes[i];
>   	}
>   
> -	igb_stats->ipackets = rx_total;
> -	igb_stats->ibytes = rx_bytes_total;
> -	igb_stats->opackets = tx_total;
> -	igb_stats->oerrors = tx_err_total;
> -	igb_stats->obytes = tx_bytes_total;
> +	stats->ipackets = rx_total;
> +	stats->ibytes = rx_bytes_total;
> +	stats->opackets = tx_total;
> +	stats->oerrors = tx_err_total;
> +	stats->obytes = tx_bytes_total;
>   	return 0;
>   }
>   
> @@ -427,14 +441,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
>   	struct pmd_internals *internal = dev->data->dev_private;
>   
>   	for (i = 0; i < internal->nb_queues; i++) {
> -		internal->rx_queue[i].rx_pkts = 0;
> -		internal->rx_queue[i].rx_bytes = 0;
> +		struct pkt_rx_queue *rxq = &internal->rx_queue[i];
> +		rxq->rx_pkts_offset = rxq->rx_pkts;
> +		rxq->rx_bytes_offset = rxq->rx_bytes;
>   	}
>   
>   	for (i = 0; i < internal->nb_queues; i++) {
> -		internal->tx_queue[i].tx_pkts = 0;
> -		internal->tx_queue[i].err_pkts = 0;
> -		internal->tx_queue[i].tx_bytes = 0;
> +		struct pkt_tx_queue *txq = &internal->tx_queue[i];
> +		txq->tx_pkts_offset = txq->tx_pkts;
> +		txq->err_pkts_offset = txq->err_pkts;
> +		txq->tx_bytes_offset = txq->tx_bytes;
>   	}
>   
>   	return 0;

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC] net/af_packet: make stats reset reliable
  2024-04-26 11:33 ` Morten Brørup
  2024-04-26 13:37   ` Ferruh Yigit
@ 2024-04-28 15:42   ` Mattias Rönnblom
  1 sibling, 0 replies; 42+ messages in thread
From: Mattias Rönnblom @ 2024-04-28 15:42 UTC (permalink / raw)
  To: Morten Brørup, Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

On 2024-04-26 13:33, Morten Brørup wrote:
>> +static uint64_t
>> +stats_get_diff(uint64_t stats, uint64_t offset)
>> +{
>> +	if (stats >= offset)
>> +		return stats - offset;
>> +	/* unlikely wraparound case */
>> +	return UINT64_MAX + stats - offset;
> 
> The numbers are unsigned, so wrapping comes for free.
> 

With 64-bit counters, will they ever wrap? If you constantly run 100 
Gbps it'll take > 1000 years before the byte counter wrap.

> Remove the comparison and always return stats - offset.
> 
> Using uint8_t for easier explanation, if offset is 255 and stats is 0, then the diff should be 1.
> Returning stats - offset:
> stats - offset = 0 - 255 = 0 - 0xFF = 1.
> 
> Returning UINT8_MAX + stats - offset is wrong:
> UINT8_MAX + stats - offset = 255 - 0 - 255 = 0.
> 
> Besides that, it looks good to me.
> 
> 
> While reviewing, I came across the rx_mbuf_alloc_failed counter in the rte_eth_dev_data structure:
> https://elixir.bootlin.com/dpdk/v24.03/source/lib/ethdev/rte_ethdev.c#L3145
> https://elixir.bootlin.com/dpdk/v24.03/source/lib/ethdev/ethdev_driver.h#L127
> 
> Doesn't it have the same problem?
> 
> 
> BTW, the af_packet PMD doesn't increase the rx_mbuf_alloc_failed counter on mbuf allocation failures. But that's a separate bug.
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-04-28 15:11   ` Mattias Rönnblom
@ 2024-05-01 16:19     ` Ferruh Yigit
  2024-05-02  5:51       ` Mattias Rönnblom
  2024-05-07  7:23     ` Mattias Rönnblom
  1 sibling, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-01 16:19 UTC (permalink / raw)
  To: Mattias Rönnblom, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

On 4/28/2024 4:11 PM, Mattias Rönnblom wrote:
> On 2024-04-26 16:38, Ferruh Yigit wrote:
>> For stats reset, use an offset instead of zeroing out actual stats
>> values,
>> get_stats() displays diff between stats and offset.
>> This way stats only updated in datapath and offset only updated in stats
>> reset function. This makes stats reset function more reliable.
>>
>> As stats only written by single thread, we can remove 'volatile'
>> qualifier
>> which should improve the performance in datapath.
>>
> 
> volatile wouldn't help you if you had multiple writers, so that can't be
> the reason for its removal. It would be more accurate to say it should
> be replaced with atomic updates. If you don't use volatile and don't use
> atomics, you have to consider if the compiler can reach the conclusion
> that it does not need to store the counter value for future use *for
> that thread*. Since otherwise, I don't think the store actually needs to
> occur. Since DPDK statistics tend to work, it's pretty obvious that
> current compilers tend not to reach this conclusion.
> 

Thanks Mattias for clarifying why we need volatile or atomics even with
single writer.

> If this should be done 100% properly, the update operation should be a
> non-atomic load, non-atomic add, and an atomic store. Similarly, for the
> reset, the offset store should be atomic.
> 

ack

> Considered the state of the rest of the DPDK code base, I think a
> non-atomic, non-volatile solution is also fine.
> 

Yes, this seems working practically but I guess better to follow above
suggestion.

> (That said, I think we're better off just deprecating stats reset
> altogether, and returning -ENOTSUP here meanwhile.)
> 

As long as reset is reliable (here I mean it reset stats in every call)
and doesn't impact datapath performance, I am for to continue with it.
Returning non supported won't bring more benefit to users.

>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>> ---
>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>
>> This update triggered by mail list discussion [1].
>>
>> [1]
>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
>>
>> v2:
>> * Remove wrapping check for stats
>> ---
>>   drivers/net/af_packet/rte_eth_af_packet.c | 66 ++++++++++++++---------
>>   1 file changed, 41 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c
>> b/drivers/net/af_packet/rte_eth_af_packet.c
>> index 397a32db5886..10c8e1e50139 100644
>> --- a/drivers/net/af_packet/rte_eth_af_packet.c
>> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
>> @@ -51,8 +51,10 @@ struct pkt_rx_queue {
>>       uint16_t in_port;
>>       uint8_t vlan_strip;
>>   -    volatile unsigned long rx_pkts;
>> -    volatile unsigned long rx_bytes;
>> +    uint64_t rx_pkts;
>> +    uint64_t rx_bytes;
>> +    uint64_t rx_pkts_offset;
>> +    uint64_t rx_bytes_offset;
> 
> I suggest you introduce a separate struct for reset-able counters. It'll
> make things cleaner, and you can sneak in atomics without too much
> atomics-related bloat.
> 
> struct counter
> {
>     uint64_t count;
>     uint64_t offset;
> };
> 
> /../
>     struct counter rx_pkts;
>     struct counter rx_bytes;
> /../
> 
> static uint64_t
> counter_value(struct counter *counter)
> {
>     uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
>     uint64_t offset = __atomic_load_n(&counter->offset, __ATOMIC_RELAXED);
> 
>     return count + offset;
> }
> 
> static void
> counter_reset(struct counter *counter)
> {
>     uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
> 
>     __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
> }
> 
> static void
> counter_add(struct counter *counter, uint64_t operand)
> {
>     __atomic_store_n(&counter->count, counter->count + operand,
> __ATOMIC_RELAXED);
> }
> 

Ack for separate struct for reset-able counters.

> You'd have to port this to <rte_stdatomic.h> calls, which prevents
> non-atomic loads from RTE_ATOMIC()s. The non-atomic reads above must be
> replaced with explicit relaxed non-atomic load. Otherwise, if you just
> use "counter->count", that would be an atomic load with sequential
> consistency memory order on C11 atomics-based builds, which would result
> in a barrier, at least on weakly ordered machines (e.g., ARM).
> 

I am not sure I understand above.
As load and add will be non-atomic, why not access them directly, like:
`uint64_t count = counter->count;`

So my understanding is, remove `volatile`, load and add without atomics,
and only use relaxed ordered atomics for store (to ensure value in
register stored to memory).

I will send a new version of RFC with above understanding.

> I would still use a struct and some helper-functions even for the less
> ambitious, non-atomic variant.
> 
> The only drawback of using GCC built-ins type atomics here, versus an
> atomic- and volatile-free approach, is that current compilers seems to
> refuse merging atomic stores. It's beyond me why this is the case. If
> you store to a variable twice in quick succession, it'll be two store
> machine instructions, even in cases where the compiler *knows* the value
> is identical. So volatile, even though you didn't ask for it. Weird.
> 
> So if you have a loop, you may want to make an "counter_add()" in the
> end from a temporary, to get the final 0.001% of performance.
> 

ack

I can't really say which one of the following is better (because of
store in empty poll), but I will keep it as it is (b.):

a.
for (i < nb_pkt) {
	stats =+ 1;
}


b.
for (i < nb_pkt) {
	tmp =+ 1;
}
stats += tmp;


c.
for (i < nb_pkt) {
	tmp =+ 1;
}
if (tmp)
	stats += tmp;



> If the tech board thinks MT-safe reset-able software-manage statistics
> is the future (as opposed to dropping reset support, for example), I
> think this stuff should go into a separate header file, so other PMDs
> can reuse it. Maybe out of scope for this patch.
> 

I don't think we need MT-safe reset, the patch is already out to
document current status.
For HW stats reset is already reliable and for SW drives offset based
approach can make is reliable.

Unless you explicitly asked for it, I don't think this is in the agenda
of the techboard.


>>   };
>>     struct pkt_tx_queue {
>> @@ -64,9 +66,12 @@ struct pkt_tx_queue {
>>       unsigned int framecount;
>>       unsigned int framenum;
>>   -    volatile unsigned long tx_pkts;
>> -    volatile unsigned long err_pkts;
>> -    volatile unsigned long tx_bytes;
>> +    uint64_t tx_pkts;
>> +    uint64_t err_pkts;
>> +    uint64_t tx_bytes;
>> +    uint64_t tx_pkts_offset;
>> +    uint64_t err_pkts_offset;
>> +    uint64_t tx_bytes_offset;
>>   };
>>     struct pmd_internals {
>> @@ -385,8 +390,15 @@ eth_dev_info(struct rte_eth_dev *dev, struct
>> rte_eth_dev_info *dev_info)
>>       return 0;
>>   }
>>   +
>> +static uint64_t
>> +stats_get_diff(uint64_t stats, uint64_t offset)
>> +{
>> +    return stats - offset;
>> +}
>> +
>>   static int
>> -eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
>> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
>>   {
>>       unsigned i, imax;
>>       unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
>> @@ -396,27 +408,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct
>> rte_eth_stats *igb_stats)
>>       imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>               internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>       for (i = 0; i < imax; i++) {
>> -        igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
>> -        igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
>> -        rx_total += igb_stats->q_ipackets[i];
>> -        rx_bytes_total += igb_stats->q_ibytes[i];
>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>> +        stats->q_ipackets[i] = stats_get_diff(rxq->rx_pkts,
>> rxq->rx_pkts_offset);
>> +        stats->q_ibytes[i] = stats_get_diff(rxq->rx_bytes,
>> rxq->rx_bytes_offset);
>> +        rx_total += stats->q_ipackets[i];
>> +        rx_bytes_total += stats->q_ibytes[i];
>>       }
>>         imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>               internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>       for (i = 0; i < imax; i++) {
>> -        igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
>> -        igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
>> -        tx_total += igb_stats->q_opackets[i];
>> -        tx_err_total += internal->tx_queue[i].err_pkts;
>> -        tx_bytes_total += igb_stats->q_obytes[i];
>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>> +        stats->q_opackets[i] = stats_get_diff(txq->tx_pkts,
>> txq->tx_pkts_offset);
>> +        stats->q_obytes[i] = stats_get_diff(txq->tx_bytes,
>> txq->tx_bytes_offset);
>> +        tx_total += stats->q_opackets[i];
>> +        tx_err_total += stats_get_diff(txq->err_pkts,
>> txq->err_pkts_offset);
>> +        tx_bytes_total += stats->q_obytes[i];
>>       }
>>   -    igb_stats->ipackets = rx_total;
>> -    igb_stats->ibytes = rx_bytes_total;
>> -    igb_stats->opackets = tx_total;
>> -    igb_stats->oerrors = tx_err_total;
>> -    igb_stats->obytes = tx_bytes_total;
>> +    stats->ipackets = rx_total;
>> +    stats->ibytes = rx_bytes_total;
>> +    stats->opackets = tx_total;
>> +    stats->oerrors = tx_err_total;
>> +    stats->obytes = tx_bytes_total;
>>       return 0;
>>   }
>>   @@ -427,14 +441,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
>>       struct pmd_internals *internal = dev->data->dev_private;
>>         for (i = 0; i < internal->nb_queues; i++) {
>> -        internal->rx_queue[i].rx_pkts = 0;
>> -        internal->rx_queue[i].rx_bytes = 0;
>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>> +        rxq->rx_pkts_offset = rxq->rx_pkts;
>> +        rxq->rx_bytes_offset = rxq->rx_bytes;
>>       }
>>         for (i = 0; i < internal->nb_queues; i++) {
>> -        internal->tx_queue[i].tx_pkts = 0;
>> -        internal->tx_queue[i].err_pkts = 0;
>> -        internal->tx_queue[i].tx_bytes = 0;
>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>> +        txq->tx_pkts_offset = txq->tx_pkts;
>> +        txq->err_pkts_offset = txq->err_pkts;
>> +        txq->tx_bytes_offset = txq->tx_bytes;
>>       }
>>         return 0;


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-01 16:19     ` Ferruh Yigit
@ 2024-05-02  5:51       ` Mattias Rönnblom
  2024-05-02 14:22         ` Ferruh Yigit
  0 siblings, 1 reply; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-02  5:51 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

On 2024-05-01 18:19, Ferruh Yigit wrote:
> On 4/28/2024 4:11 PM, Mattias Rönnblom wrote:
>> On 2024-04-26 16:38, Ferruh Yigit wrote:
>>> For stats reset, use an offset instead of zeroing out actual stats
>>> values,
>>> get_stats() displays diff between stats and offset.
>>> This way stats only updated in datapath and offset only updated in stats
>>> reset function. This makes stats reset function more reliable.
>>>
>>> As stats only written by single thread, we can remove 'volatile'
>>> qualifier
>>> which should improve the performance in datapath.
>>>
>>
>> volatile wouldn't help you if you had multiple writers, so that can't be
>> the reason for its removal. It would be more accurate to say it should
>> be replaced with atomic updates. If you don't use volatile and don't use
>> atomics, you have to consider if the compiler can reach the conclusion
>> that it does not need to store the counter value for future use *for
>> that thread*. Since otherwise, I don't think the store actually needs to
>> occur. Since DPDK statistics tend to work, it's pretty obvious that
>> current compilers tend not to reach this conclusion.
>>
> 
> Thanks Mattias for clarifying why we need volatile or atomics even with
> single writer.
> 
>> If this should be done 100% properly, the update operation should be a
>> non-atomic load, non-atomic add, and an atomic store. Similarly, for the
>> reset, the offset store should be atomic.
>>
> 
> ack
> 
>> Considered the state of the rest of the DPDK code base, I think a
>> non-atomic, non-volatile solution is also fine.
>>
> 
> Yes, this seems working practically but I guess better to follow above
> suggestion.
> 
>> (That said, I think we're better off just deprecating stats reset
>> altogether, and returning -ENOTSUP here meanwhile.)
>>
> 
> As long as reset is reliable (here I mean it reset stats in every call)
> and doesn't impact datapath performance, I am for to continue with it.
> Returning non supported won't bring more benefit to users.
> 
>>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>>> ---
>>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>>
>>> This update triggered by mail list discussion [1].
>>>
>>> [1]
>>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
>>>
>>> v2:
>>> * Remove wrapping check for stats
>>> ---
>>>    drivers/net/af_packet/rte_eth_af_packet.c | 66 ++++++++++++++---------
>>>    1 file changed, 41 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c
>>> b/drivers/net/af_packet/rte_eth_af_packet.c
>>> index 397a32db5886..10c8e1e50139 100644
>>> --- a/drivers/net/af_packet/rte_eth_af_packet.c
>>> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
>>> @@ -51,8 +51,10 @@ struct pkt_rx_queue {
>>>        uint16_t in_port;
>>>        uint8_t vlan_strip;
>>>    -    volatile unsigned long rx_pkts;
>>> -    volatile unsigned long rx_bytes;
>>> +    uint64_t rx_pkts;
>>> +    uint64_t rx_bytes;
>>> +    uint64_t rx_pkts_offset;
>>> +    uint64_t rx_bytes_offset;
>>
>> I suggest you introduce a separate struct for reset-able counters. It'll
>> make things cleaner, and you can sneak in atomics without too much
>> atomics-related bloat.
>>
>> struct counter
>> {
>>      uint64_t count;
>>      uint64_t offset;
>> };
>>
>> /../
>>      struct counter rx_pkts;
>>      struct counter rx_bytes;
>> /../
>>
>> static uint64_t
>> counter_value(struct counter *counter)
>> {
>>      uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
>>      uint64_t offset = __atomic_load_n(&counter->offset, __ATOMIC_RELAXED);
>>
>>      return count + offset;
>> }
>>
>> static void
>> counter_reset(struct counter *counter)
>> {
>>      uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
>>
>>      __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
>> }
>>
>> static void
>> counter_add(struct counter *counter, uint64_t operand)
>> {
>>      __atomic_store_n(&counter->count, counter->count + operand,
>> __ATOMIC_RELAXED);
>> }
>>
> 
> Ack for separate struct for reset-able counters.
> 
>> You'd have to port this to <rte_stdatomic.h> calls, which prevents
>> non-atomic loads from RTE_ATOMIC()s. The non-atomic reads above must be
>> replaced with explicit relaxed non-atomic load. Otherwise, if you just
>> use "counter->count", that would be an atomic load with sequential
>> consistency memory order on C11 atomics-based builds, which would result
>> in a barrier, at least on weakly ordered machines (e.g., ARM).
>>
> 
> I am not sure I understand above.
> As load and add will be non-atomic, why not access them directly, like:
> `uint64_t count = counter->count;`
> 

In case count is _Atomic (i.e., on enable_stdatomic=true builds), "count 
= counter->count" will imply a memory barrier. On x86_64, I think it 
will "only" be a compiler barrier (i.e., preventing optimization). On 
weakly ordered machines, it will result in a barrier-instruction (or an 
instruction-which-is-also-a-barrier, like in the example below).

include <stdatomic.h>

int relaxed_load(_Atomic int *p)
{
     atomic_load_explicit(p, memory_order_relaxed);
}

int direct_load(_Atomic int *p)
{
     return *p;
}

GCC 13.2 ARM64 ->

relaxed_load:
         ldr     w0, [x0]
         ret
direct_load:
         ldar    w0, [x0]
         ret

> So my understanding is, remove `volatile`, load and add without atomics,
> and only use relaxed ordered atomics for store (to ensure value in
> register stored to memory).
> 

Yes, that would be the best option, would the DPDK atomics API allow its 
implementation - but it doesn't. At least not if you care about what 
happens in enable_stdatomic=true builds.

The second-best option is to use a rte_memory_order_relaxed atomic load, 
a regular non-atomic add, and a relaxed atomic store.

> I will send a new version of RFC with above understanding.
> 
>> I would still use a struct and some helper-functions even for the less
>> ambitious, non-atomic variant.
>>
>> The only drawback of using GCC built-ins type atomics here, versus an
>> atomic- and volatile-free approach, is that current compilers seems to
>> refuse merging atomic stores. It's beyond me why this is the case. If
>> you store to a variable twice in quick succession, it'll be two store
>> machine instructions, even in cases where the compiler *knows* the value
>> is identical. So volatile, even though you didn't ask for it. Weird.
>>
>> So if you have a loop, you may want to make an "counter_add()" in the
>> end from a temporary, to get the final 0.001% of performance.
>>
> 
> ack
> 
> I can't really say which one of the following is better (because of
> store in empty poll), but I will keep it as it is (b.):
> 
> a.
> for (i < nb_pkt) {
> 	stats =+ 1;
> }
> 
> 
> b.
> for (i < nb_pkt) {
> 	tmp =+ 1;
> }
> stats += tmp;
> 
> 
> c.
> for (i < nb_pkt) {
> 	tmp =+ 1;
> }
> if (tmp)
> 	stats += tmp;
> 
> 
> 
>> If the tech board thinks MT-safe reset-able software-manage statistics
>> is the future (as opposed to dropping reset support, for example), I
>> think this stuff should go into a separate header file, so other PMDs
>> can reuse it. Maybe out of scope for this patch.
>>
> 
> I don't think we need MT-safe reset, the patch is already out to
> document current status.

Well, what you are working on is a MT-safe reset, in the sense it allows 
for one (1) resetting thread properly synchronize with multiple 
concurrent counter-updating threads.

It's not going to be completely MT safe, since you can't have two 
threads calling the reset function in parallel.

Any change to the API should make this clear.

> For HW stats reset is already reliable and for SW drives offset based
> approach can make is reliable.
> 
> Unless you explicitly asked for it, I don't think this is in the agenda
> of the techboard.
> 
> 
>>>    };
>>>      struct pkt_tx_queue {
>>> @@ -64,9 +66,12 @@ struct pkt_tx_queue {
>>>        unsigned int framecount;
>>>        unsigned int framenum;
>>>    -    volatile unsigned long tx_pkts;
>>> -    volatile unsigned long err_pkts;
>>> -    volatile unsigned long tx_bytes;
>>> +    uint64_t tx_pkts;
>>> +    uint64_t err_pkts;
>>> +    uint64_t tx_bytes;
>>> +    uint64_t tx_pkts_offset;
>>> +    uint64_t err_pkts_offset;
>>> +    uint64_t tx_bytes_offset;
>>>    };
>>>      struct pmd_internals {
>>> @@ -385,8 +390,15 @@ eth_dev_info(struct rte_eth_dev *dev, struct
>>> rte_eth_dev_info *dev_info)
>>>        return 0;
>>>    }
>>>    +
>>> +static uint64_t
>>> +stats_get_diff(uint64_t stats, uint64_t offset)
>>> +{
>>> +    return stats - offset;
>>> +}
>>> +
>>>    static int
>>> -eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
>>> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
>>>    {
>>>        unsigned i, imax;
>>>        unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
>>> @@ -396,27 +408,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct
>>> rte_eth_stats *igb_stats)
>>>        imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>>                internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>>        for (i = 0; i < imax; i++) {
>>> -        igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
>>> -        igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
>>> -        rx_total += igb_stats->q_ipackets[i];
>>> -        rx_bytes_total += igb_stats->q_ibytes[i];
>>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>>> +        stats->q_ipackets[i] = stats_get_diff(rxq->rx_pkts,
>>> rxq->rx_pkts_offset);
>>> +        stats->q_ibytes[i] = stats_get_diff(rxq->rx_bytes,
>>> rxq->rx_bytes_offset);
>>> +        rx_total += stats->q_ipackets[i];
>>> +        rx_bytes_total += stats->q_ibytes[i];
>>>        }
>>>          imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>>                internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>>        for (i = 0; i < imax; i++) {
>>> -        igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
>>> -        igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
>>> -        tx_total += igb_stats->q_opackets[i];
>>> -        tx_err_total += internal->tx_queue[i].err_pkts;
>>> -        tx_bytes_total += igb_stats->q_obytes[i];
>>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>>> +        stats->q_opackets[i] = stats_get_diff(txq->tx_pkts,
>>> txq->tx_pkts_offset);
>>> +        stats->q_obytes[i] = stats_get_diff(txq->tx_bytes,
>>> txq->tx_bytes_offset);
>>> +        tx_total += stats->q_opackets[i];
>>> +        tx_err_total += stats_get_diff(txq->err_pkts,
>>> txq->err_pkts_offset);
>>> +        tx_bytes_total += stats->q_obytes[i];
>>>        }
>>>    -    igb_stats->ipackets = rx_total;
>>> -    igb_stats->ibytes = rx_bytes_total;
>>> -    igb_stats->opackets = tx_total;
>>> -    igb_stats->oerrors = tx_err_total;
>>> -    igb_stats->obytes = tx_bytes_total;
>>> +    stats->ipackets = rx_total;
>>> +    stats->ibytes = rx_bytes_total;
>>> +    stats->opackets = tx_total;
>>> +    stats->oerrors = tx_err_total;
>>> +    stats->obytes = tx_bytes_total;
>>>        return 0;
>>>    }
>>>    @@ -427,14 +441,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
>>>        struct pmd_internals *internal = dev->data->dev_private;
>>>          for (i = 0; i < internal->nb_queues; i++) {
>>> -        internal->rx_queue[i].rx_pkts = 0;
>>> -        internal->rx_queue[i].rx_bytes = 0;
>>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>>> +        rxq->rx_pkts_offset = rxq->rx_pkts;
>>> +        rxq->rx_bytes_offset = rxq->rx_bytes;
>>>        }
>>>          for (i = 0; i < internal->nb_queues; i++) {
>>> -        internal->tx_queue[i].tx_pkts = 0;
>>> -        internal->tx_queue[i].err_pkts = 0;
>>> -        internal->tx_queue[i].tx_bytes = 0;
>>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>>> +        txq->tx_pkts_offset = txq->tx_pkts;
>>> +        txq->err_pkts_offset = txq->err_pkts;
>>> +        txq->tx_bytes_offset = txq->tx_bytes;
>>>        }
>>>          return 0;
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-02  5:51       ` Mattias Rönnblom
@ 2024-05-02 14:22         ` Ferruh Yigit
  2024-05-02 15:59           ` Stephen Hemminger
  2024-05-02 17:37           ` Mattias Rönnblom
  0 siblings, 2 replies; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-02 14:22 UTC (permalink / raw)
  To: Mattias Rönnblom, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

On 5/2/2024 6:51 AM, Mattias Rönnblom wrote:
> On 2024-05-01 18:19, Ferruh Yigit wrote:
>> On 4/28/2024 4:11 PM, Mattias Rönnblom wrote:
>>> On 2024-04-26 16:38, Ferruh Yigit wrote:
>>>> For stats reset, use an offset instead of zeroing out actual stats
>>>> values,
>>>> get_stats() displays diff between stats and offset.
>>>> This way stats only updated in datapath and offset only updated in
>>>> stats
>>>> reset function. This makes stats reset function more reliable.
>>>>
>>>> As stats only written by single thread, we can remove 'volatile'
>>>> qualifier
>>>> which should improve the performance in datapath.
>>>>
>>>
>>> volatile wouldn't help you if you had multiple writers, so that can't be
>>> the reason for its removal. It would be more accurate to say it should
>>> be replaced with atomic updates. If you don't use volatile and don't use
>>> atomics, you have to consider if the compiler can reach the conclusion
>>> that it does not need to store the counter value for future use *for
>>> that thread*. Since otherwise, I don't think the store actually needs to
>>> occur. Since DPDK statistics tend to work, it's pretty obvious that
>>> current compilers tend not to reach this conclusion.
>>>
>>
>> Thanks Mattias for clarifying why we need volatile or atomics even with
>> single writer.
>>
>>> If this should be done 100% properly, the update operation should be a
>>> non-atomic load, non-atomic add, and an atomic store. Similarly, for the
>>> reset, the offset store should be atomic.
>>>
>>
>> ack
>>
>>> Considered the state of the rest of the DPDK code base, I think a
>>> non-atomic, non-volatile solution is also fine.
>>>
>>
>> Yes, this seems working practically but I guess better to follow above
>> suggestion.
>>
>>> (That said, I think we're better off just deprecating stats reset
>>> altogether, and returning -ENOTSUP here meanwhile.)
>>>
>>
>> As long as reset is reliable (here I mean it reset stats in every call)
>> and doesn't impact datapath performance, I am for to continue with it.
>> Returning non supported won't bring more benefit to users.
>>
>>>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>>>> ---
>>>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>>>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>>>
>>>> This update triggered by mail list discussion [1].
>>>>
>>>> [1]
>>>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
>>>>
>>>> v2:
>>>> * Remove wrapping check for stats
>>>> ---
>>>>    drivers/net/af_packet/rte_eth_af_packet.c | 66
>>>> ++++++++++++++---------
>>>>    1 file changed, 41 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c
>>>> b/drivers/net/af_packet/rte_eth_af_packet.c
>>>> index 397a32db5886..10c8e1e50139 100644
>>>> --- a/drivers/net/af_packet/rte_eth_af_packet.c
>>>> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
>>>> @@ -51,8 +51,10 @@ struct pkt_rx_queue {
>>>>        uint16_t in_port;
>>>>        uint8_t vlan_strip;
>>>>    -    volatile unsigned long rx_pkts;
>>>> -    volatile unsigned long rx_bytes;
>>>> +    uint64_t rx_pkts;
>>>> +    uint64_t rx_bytes;
>>>> +    uint64_t rx_pkts_offset;
>>>> +    uint64_t rx_bytes_offset;
>>>
>>> I suggest you introduce a separate struct for reset-able counters. It'll
>>> make things cleaner, and you can sneak in atomics without too much
>>> atomics-related bloat.
>>>
>>> struct counter
>>> {
>>>      uint64_t count;
>>>      uint64_t offset;
>>> };
>>>
>>> /../
>>>      struct counter rx_pkts;
>>>      struct counter rx_bytes;
>>> /../
>>>
>>> static uint64_t
>>> counter_value(struct counter *counter)
>>> {
>>>      uint64_t count = __atomic_load_n(&counter->count,
>>> __ATOMIC_RELAXED);
>>>      uint64_t offset = __atomic_load_n(&counter->offset,
>>> __ATOMIC_RELAXED);
>>>
>>>      return count + offset;
>>> }
>>>
>>> static void
>>> counter_reset(struct counter *counter)
>>> {
>>>      uint64_t count = __atomic_load_n(&counter->count,
>>> __ATOMIC_RELAXED);
>>>
>>>      __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
>>> }
>>>
>>> static void
>>> counter_add(struct counter *counter, uint64_t operand)
>>> {
>>>      __atomic_store_n(&counter->count, counter->count + operand,
>>> __ATOMIC_RELAXED);
>>> }
>>>
>>
>> Ack for separate struct for reset-able counters.
>>
>>> You'd have to port this to <rte_stdatomic.h> calls, which prevents
>>> non-atomic loads from RTE_ATOMIC()s. The non-atomic reads above must be
>>> replaced with explicit relaxed non-atomic load. Otherwise, if you just
>>> use "counter->count", that would be an atomic load with sequential
>>> consistency memory order on C11 atomics-based builds, which would result
>>> in a barrier, at least on weakly ordered machines (e.g., ARM).
>>>
>>
>> I am not sure I understand above.
>> As load and add will be non-atomic, why not access them directly, like:
>> `uint64_t count = counter->count;`
>>
> 
> In case count is _Atomic (i.e., on enable_stdatomic=true builds), "count
> = counter->count" will imply a memory barrier. On x86_64, I think it
> will "only" be a compiler barrier (i.e., preventing optimization). On
> weakly ordered machines, it will result in a barrier-instruction (or an
> instruction-which-is-also-a-barrier, like in the example below).
> 
> include <stdatomic.h>
> 
> int relaxed_load(_Atomic int *p)
> {
>     atomic_load_explicit(p, memory_order_relaxed);
> }
> 
> int direct_load(_Atomic int *p)
> {
>     return *p;
> }
> 
> GCC 13.2 ARM64 ->
> 
> relaxed_load:
>         ldr     w0, [x0]
>         ret
> direct_load:
>         ldar    w0, [x0]
>         ret
> 
>


Do we need to declare count as '_Atomic', I wasn't planning to make
variable _Atomic. This way assignment won't introduce any memory barrier.


>> So my understanding is, remove `volatile`, load and add without atomics,
>> and only use relaxed ordered atomics for store (to ensure value in
>> register stored to memory).
>>
> 
> Yes, that would be the best option, would the DPDK atomics API allow its
> implementation - but it doesn't. At least not if you care about what
> happens in enable_stdatomic=true builds.
> 
> The second-best option is to use a rte_memory_order_relaxed atomic load,
> a regular non-atomic add, and a relaxed atomic store.
> 
>> I will send a new version of RFC with above understanding.
>>
>>> I would still use a struct and some helper-functions even for the less
>>> ambitious, non-atomic variant.
>>>
>>> The only drawback of using GCC built-ins type atomics here, versus an
>>> atomic- and volatile-free approach, is that current compilers seems to
>>> refuse merging atomic stores. It's beyond me why this is the case. If
>>> you store to a variable twice in quick succession, it'll be two store
>>> machine instructions, even in cases where the compiler *knows* the value
>>> is identical. So volatile, even though you didn't ask for it. Weird.
>>>
>>> So if you have a loop, you may want to make an "counter_add()" in the
>>> end from a temporary, to get the final 0.001% of performance.
>>>
>>
>> ack
>>
>> I can't really say which one of the following is better (because of
>> store in empty poll), but I will keep it as it is (b.):
>>
>> a.
>> for (i < nb_pkt) {
>>     stats =+ 1;
>> }
>>
>>
>> b.
>> for (i < nb_pkt) {
>>     tmp =+ 1;
>> }
>> stats += tmp;
>>
>>
>> c.
>> for (i < nb_pkt) {
>>     tmp =+ 1;
>> }
>> if (tmp)
>>     stats += tmp;
>>
>>
>>
>>> If the tech board thinks MT-safe reset-able software-manage statistics
>>> is the future (as opposed to dropping reset support, for example), I
>>> think this stuff should go into a separate header file, so other PMDs
>>> can reuse it. Maybe out of scope for this patch.
>>>
>>
>> I don't think we need MT-safe reset, the patch is already out to
>> document current status.
> 
> Well, what you are working on is a MT-safe reset, in the sense it allows
> for one (1) resetting thread properly synchronize with multiple
> concurrent counter-updating threads.
> 
> It's not going to be completely MT safe, since you can't have two
> threads calling the reset function in parallel.
> 

This is what I meant with "MT-safe reset", so multiple threads not
allowed to call stats reset in parallel.

And for multiple concurrent counter-updating threads case, suggestion is
to stop forwarding.

Above two are the update I added to 'rte_eth_stats_reset()' API, and I
believe we can continue with this restriction for the API.

> Any change to the API should make this clear.
> 
>> For HW stats reset is already reliable and for SW drives offset based
>> approach can make is reliable.
>>
>> Unless you explicitly asked for it, I don't think this is in the agenda
>> of the techboard.
>>
>>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-02 14:22         ` Ferruh Yigit
@ 2024-05-02 15:59           ` Stephen Hemminger
  2024-05-02 18:20             ` Ferruh Yigit
  2024-05-02 17:37           ` Mattias Rönnblom
  1 sibling, 1 reply; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-02 15:59 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Mattias Rönnblom, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom, Morten Brørup

On Thu, 2 May 2024 15:22:35 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> > 
> > It's not going to be completely MT safe, since you can't have two
> > threads calling the reset function in parallel.
> >   
> 
> This is what I meant with "MT-safe reset", so multiple threads not
> allowed to call stats reset in parallel.
> 
> And for multiple concurrent counter-updating threads case, suggestion is
> to stop forwarding.
> 
> Above two are the update I added to 'rte_eth_stats_reset()' API, and

I can't see the point of full MT-safe reset. No other driver does it.
The ethdev control side api's are not MT-safe and making them truly
MT-safe would add additional ref count and locking.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-02 14:22         ` Ferruh Yigit
  2024-05-02 15:59           ` Stephen Hemminger
@ 2024-05-02 17:37           ` Mattias Rönnblom
  2024-05-02 18:26             ` Stephen Hemminger
  1 sibling, 1 reply; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-02 17:37 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

On 2024-05-02 16:22, Ferruh Yigit wrote:
> On 5/2/2024 6:51 AM, Mattias Rönnblom wrote:
>> On 2024-05-01 18:19, Ferruh Yigit wrote:
>>> On 4/28/2024 4:11 PM, Mattias Rönnblom wrote:
>>>> On 2024-04-26 16:38, Ferruh Yigit wrote:
>>>>> For stats reset, use an offset instead of zeroing out actual stats
>>>>> values,
>>>>> get_stats() displays diff between stats and offset.
>>>>> This way stats only updated in datapath and offset only updated in
>>>>> stats
>>>>> reset function. This makes stats reset function more reliable.
>>>>>
>>>>> As stats only written by single thread, we can remove 'volatile'
>>>>> qualifier
>>>>> which should improve the performance in datapath.
>>>>>
>>>>
>>>> volatile wouldn't help you if you had multiple writers, so that can't be
>>>> the reason for its removal. It would be more accurate to say it should
>>>> be replaced with atomic updates. If you don't use volatile and don't use
>>>> atomics, you have to consider if the compiler can reach the conclusion
>>>> that it does not need to store the counter value for future use *for
>>>> that thread*. Since otherwise, I don't think the store actually needs to
>>>> occur. Since DPDK statistics tend to work, it's pretty obvious that
>>>> current compilers tend not to reach this conclusion.
>>>>
>>>
>>> Thanks Mattias for clarifying why we need volatile or atomics even with
>>> single writer.
>>>
>>>> If this should be done 100% properly, the update operation should be a
>>>> non-atomic load, non-atomic add, and an atomic store. Similarly, for the
>>>> reset, the offset store should be atomic.
>>>>
>>>
>>> ack
>>>
>>>> Considered the state of the rest of the DPDK code base, I think a
>>>> non-atomic, non-volatile solution is also fine.
>>>>
>>>
>>> Yes, this seems working practically but I guess better to follow above
>>> suggestion.
>>>
>>>> (That said, I think we're better off just deprecating stats reset
>>>> altogether, and returning -ENOTSUP here meanwhile.)
>>>>
>>>
>>> As long as reset is reliable (here I mean it reset stats in every call)
>>> and doesn't impact datapath performance, I am for to continue with it.
>>> Returning non supported won't bring more benefit to users.
>>>
>>>>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>>>>> ---
>>>>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>>>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>>>>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>>>>
>>>>> This update triggered by mail list discussion [1].
>>>>>
>>>>> [1]
>>>>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
>>>>>
>>>>> v2:
>>>>> * Remove wrapping check for stats
>>>>> ---
>>>>>     drivers/net/af_packet/rte_eth_af_packet.c | 66
>>>>> ++++++++++++++---------
>>>>>     1 file changed, 41 insertions(+), 25 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c
>>>>> b/drivers/net/af_packet/rte_eth_af_packet.c
>>>>> index 397a32db5886..10c8e1e50139 100644
>>>>> --- a/drivers/net/af_packet/rte_eth_af_packet.c
>>>>> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
>>>>> @@ -51,8 +51,10 @@ struct pkt_rx_queue {
>>>>>         uint16_t in_port;
>>>>>         uint8_t vlan_strip;
>>>>>     -    volatile unsigned long rx_pkts;
>>>>> -    volatile unsigned long rx_bytes;
>>>>> +    uint64_t rx_pkts;
>>>>> +    uint64_t rx_bytes;
>>>>> +    uint64_t rx_pkts_offset;
>>>>> +    uint64_t rx_bytes_offset;
>>>>
>>>> I suggest you introduce a separate struct for reset-able counters. It'll
>>>> make things cleaner, and you can sneak in atomics without too much
>>>> atomics-related bloat.
>>>>
>>>> struct counter
>>>> {
>>>>       uint64_t count;
>>>>       uint64_t offset;
>>>> };
>>>>
>>>> /../
>>>>       struct counter rx_pkts;
>>>>       struct counter rx_bytes;
>>>> /../
>>>>
>>>> static uint64_t
>>>> counter_value(struct counter *counter)
>>>> {
>>>>       uint64_t count = __atomic_load_n(&counter->count,
>>>> __ATOMIC_RELAXED);
>>>>       uint64_t offset = __atomic_load_n(&counter->offset,
>>>> __ATOMIC_RELAXED);
>>>>
>>>>       return count + offset;
>>>> }
>>>>
>>>> static void
>>>> counter_reset(struct counter *counter)
>>>> {
>>>>       uint64_t count = __atomic_load_n(&counter->count,
>>>> __ATOMIC_RELAXED);
>>>>
>>>>       __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
>>>> }
>>>>
>>>> static void
>>>> counter_add(struct counter *counter, uint64_t operand)
>>>> {
>>>>       __atomic_store_n(&counter->count, counter->count + operand,
>>>> __ATOMIC_RELAXED);
>>>> }
>>>>
>>>
>>> Ack for separate struct for reset-able counters.
>>>
>>>> You'd have to port this to <rte_stdatomic.h> calls, which prevents
>>>> non-atomic loads from RTE_ATOMIC()s. The non-atomic reads above must be
>>>> replaced with explicit relaxed non-atomic load. Otherwise, if you just
>>>> use "counter->count", that would be an atomic load with sequential
>>>> consistency memory order on C11 atomics-based builds, which would result
>>>> in a barrier, at least on weakly ordered machines (e.g., ARM).
>>>>
>>>
>>> I am not sure I understand above.
>>> As load and add will be non-atomic, why not access them directly, like:
>>> `uint64_t count = counter->count;`
>>>
>>
>> In case count is _Atomic (i.e., on enable_stdatomic=true builds), "count
>> = counter->count" will imply a memory barrier. On x86_64, I think it
>> will "only" be a compiler barrier (i.e., preventing optimization). On
>> weakly ordered machines, it will result in a barrier-instruction (or an
>> instruction-which-is-also-a-barrier, like in the example below).
>>
>> include <stdatomic.h>
>>
>> int relaxed_load(_Atomic int *p)
>> {
>>      atomic_load_explicit(p, memory_order_relaxed);
>> }
>>
>> int direct_load(_Atomic int *p)
>> {
>>      return *p;
>> }
>>
>> GCC 13.2 ARM64 ->
>>
>> relaxed_load:
>>          ldr     w0, [x0]
>>          ret
>> direct_load:
>>          ldar    w0, [x0]
>>          ret
>>
>>
> 
> 
> Do we need to declare count as '_Atomic', I wasn't planning to make
> variable _Atomic. This way assignment won't introduce any memory barrier.
> 

To use atomics in DPDK, the current requirements seems to be to use 
RTE_ATOMIC(). That macro expands to _Atomic in enable_stdatomic=true 
builds, and nothing otherwise.

Carefully crafted code using atomics will achieved the same performance 
and be more correct than the non-atomic variant. However, in practice, I 
think the non-atomic variant is very likely to produce the desired results.

> 
>>> So my understanding is, remove `volatile`, load and add without atomics,
>>> and only use relaxed ordered atomics for store (to ensure value in
>>> register stored to memory).
>>>
>>
>> Yes, that would be the best option, would the DPDK atomics API allow its
>> implementation - but it doesn't. At least not if you care about what
>> happens in enable_stdatomic=true builds.
>>
>> The second-best option is to use a rte_memory_order_relaxed atomic load,
>> a regular non-atomic add, and a relaxed atomic store.
>>
>>> I will send a new version of RFC with above understanding.
>>>
>>>> I would still use a struct and some helper-functions even for the less
>>>> ambitious, non-atomic variant.
>>>>
>>>> The only drawback of using GCC built-ins type atomics here, versus an
>>>> atomic- and volatile-free approach, is that current compilers seems to
>>>> refuse merging atomic stores. It's beyond me why this is the case. If
>>>> you store to a variable twice in quick succession, it'll be two store
>>>> machine instructions, even in cases where the compiler *knows* the value
>>>> is identical. So volatile, even though you didn't ask for it. Weird.
>>>>
>>>> So if you have a loop, you may want to make an "counter_add()" in the
>>>> end from a temporary, to get the final 0.001% of performance.
>>>>
>>>
>>> ack
>>>
>>> I can't really say which one of the following is better (because of
>>> store in empty poll), but I will keep it as it is (b.):
>>>
>>> a.
>>> for (i < nb_pkt) {
>>>      stats =+ 1;
>>> }
>>>
>>>
>>> b.
>>> for (i < nb_pkt) {
>>>      tmp =+ 1;
>>> }
>>> stats += tmp;
>>>
>>>
>>> c.
>>> for (i < nb_pkt) {
>>>      tmp =+ 1;
>>> }
>>> if (tmp)
>>>      stats += tmp;
>>>
>>>
>>>
>>>> If the tech board thinks MT-safe reset-able software-manage statistics
>>>> is the future (as opposed to dropping reset support, for example), I
>>>> think this stuff should go into a separate header file, so other PMDs
>>>> can reuse it. Maybe out of scope for this patch.
>>>>
>>>
>>> I don't think we need MT-safe reset, the patch is already out to
>>> document current status.
>>
>> Well, what you are working on is a MT-safe reset, in the sense it allows
>> for one (1) resetting thread properly synchronize with multiple
>> concurrent counter-updating threads.
>>
>> It's not going to be completely MT safe, since you can't have two
>> threads calling the reset function in parallel.
>>
> 
> This is what I meant with "MT-safe reset", so multiple threads not
> allowed to call stats reset in parallel.
> 
> And for multiple concurrent counter-updating threads case, suggestion is
> to stop forwarding.
> 
> Above two are the update I added to 'rte_eth_stats_reset()' API, and I
> believe we can continue with this restriction for the API.
> 
>> Any change to the API should make this clear.
>>
>>> For HW stats reset is already reliable and for SW drives offset based
>>> approach can make is reliable.
>>>
>>> Unless you explicitly asked for it, I don't think this is in the agenda
>>> of the techboard.
>>>
>>>
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-02 15:59           ` Stephen Hemminger
@ 2024-05-02 18:20             ` Ferruh Yigit
  0 siblings, 0 replies; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-02 18:20 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Mattias Rönnblom, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom, Morten Brørup

On 5/2/2024 4:59 PM, Stephen Hemminger wrote:
> On Thu, 2 May 2024 15:22:35 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>>>
>>> It's not going to be completely MT safe, since you can't have two
>>> threads calling the reset function in parallel.
>>>   
>>
>> This is what I meant with "MT-safe reset", so multiple threads not
>> allowed to call stats reset in parallel.
>>
>> And for multiple concurrent counter-updating threads case, suggestion is
>> to stop forwarding.
>>
>> Above two are the update I added to 'rte_eth_stats_reset()' API, and
> 
> I can't see the point of full MT-safe reset. No other driver does it.
> The ethdev control side api's are not MT-safe and making them truly
> MT-safe would add additional ref count and locking.
>

Agree, that is why clarified this in ethdev API:
https:/dpdk.org/patch/139681


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-02 17:37           ` Mattias Rönnblom
@ 2024-05-02 18:26             ` Stephen Hemminger
  2024-05-02 21:26               ` Mattias Rönnblom
  0 siblings, 1 reply; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-02 18:26 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Ferruh Yigit, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom, Morten Brørup

On Thu, 2 May 2024 19:37:28 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> > 
> > Do we need to declare count as '_Atomic', I wasn't planning to make
> > variable _Atomic. This way assignment won't introduce any memory barrier.
> >   
> 
> To use atomics in DPDK, the current requirements seems to be to use 
> RTE_ATOMIC(). That macro expands to _Atomic in enable_stdatomic=true 
> builds, and nothing otherwise.
> 
> Carefully crafted code using atomics will achieved the same performance 
> and be more correct than the non-atomic variant. However, in practice, I 
> think the non-atomic variant is very likely to produce the desired results.

You are confusing atomic usage for thread safety, with the necessity
of compiler barriers. 

Stats should not be volatile.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-02 18:26             ` Stephen Hemminger
@ 2024-05-02 21:26               ` Mattias Rönnblom
  2024-05-02 21:46                 ` Stephen Hemminger
  0 siblings, 1 reply; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-02 21:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ferruh Yigit, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom, Morten Brørup

On 2024-05-02 20:26, Stephen Hemminger wrote:
> On Thu, 2 May 2024 19:37:28 +0200
> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> 
>>>
>>> Do we need to declare count as '_Atomic', I wasn't planning to make
>>> variable _Atomic. This way assignment won't introduce any memory barrier.
>>>    
>>
>> To use atomics in DPDK, the current requirements seems to be to use
>> RTE_ATOMIC(). That macro expands to _Atomic in enable_stdatomic=true
>> builds, and nothing otherwise.
>>
>> Carefully crafted code using atomics will achieved the same performance
>> and be more correct than the non-atomic variant. However, in practice, I
>> think the non-atomic variant is very likely to produce the desired results.
> 
> You are confusing atomic usage for thread safety, with the necessity
> of compiler barriers.
> 

Are you suggesting that program-level C11 atomic stores risk being 
delayed, indefinitely? I could only find a draft version of the 
standard, but there 7.17.3 says "Implementations should make atomic 
stores visible to atomic loads within a reasonable amount of time."

An atomic relaxed store will be much cheaper than a compiler barrier.

> Stats should not be volatile.

Sure, and I also don't think compiler barriers should be needed.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-02 21:26               ` Mattias Rönnblom
@ 2024-05-02 21:46                 ` Stephen Hemminger
  0 siblings, 0 replies; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-02 21:46 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Ferruh Yigit, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom, Morten Brørup

On Thu, 2 May 2024 23:26:31 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> > 
> > You are confusing atomic usage for thread safety, with the necessity
> > of compiler barriers.
> >   
> 
> Are you suggesting that program-level C11 atomic stores risk being 
> delayed, indefinitely? I could only find a draft version of the 
> standard, but there 7.17.3 says "Implementations should make atomic 
> stores visible to atomic loads within a reasonable amount of time."
> 
> An atomic relaxed store will be much cheaper than a compiler barrier.

There is a confusion between language designers C11 and system implementer and
CPU design. The language people confuse compiler with hardware in standards.
Because of course the compiler knows all (not).

Read the extended discussion on memory models in Linux kernel documentation.


https://www.kernel.org/doc/html/latest/core-api/wrappers/memory-barriers.html

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [RFC v3] net/af_packet: make stats reset reliable
  2024-04-25 17:46 [RFC] net/af_packet: make stats reset reliable Ferruh Yigit
                   ` (2 preceding siblings ...)
  2024-04-26 21:28 ` [RFC] " Patrick Robb
@ 2024-05-03 15:45 ` Ferruh Yigit
  2024-05-03 22:00   ` Stephen Hemminger
  2024-05-07 15:27   ` Morten Brørup
  3 siblings, 2 replies; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-03 15:45 UTC (permalink / raw)
  To: John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

For stats reset, use an offset instead of zeroing out actual stats values,
get_stats() displays diff between stats and offset.
This way stats only updated in datapath and offset only updated in stats
reset function. This makes stats reset function more reliable.

As stats only written by single thread, we can remove 'volatile' qualifier
which should improve the performance in datapath.

While updating around, 'igb_stats' parameter renamed as 'stats'.

Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
---
Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Morten Brørup <mb@smartsharesystems.com>

This update triggered by mail list discussion [1].

[1]
https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/

v2:
* Remove wrapping check for stats

v3:
* counter and offset put into same struct per stats
* Use atomic load / store for stats values
---
 drivers/net/af_packet/rte_eth_af_packet.c | 98 ++++++++++++++++-------
 1 file changed, 68 insertions(+), 30 deletions(-)

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 6b7b16f3486d..ebef1cb06450 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -6,6 +6,7 @@
  * All rights reserved.
  */
 
+#include <rte_atomic.h>
 #include <rte_common.h>
 #include <rte_string_fns.h>
 #include <rte_mbuf.h>
@@ -40,6 +41,11 @@
 #define DFLT_FRAME_SIZE		(1 << 11)
 #define DFLT_FRAME_COUNT	(1 << 9)
 
+struct stats {
+	uint64_t counter;
+	uint64_t offset;
+};
+
 struct __rte_cache_aligned pkt_rx_queue {
 	int sockfd;
 
@@ -52,8 +58,8 @@ struct __rte_cache_aligned pkt_rx_queue {
 	uint16_t in_port;
 	uint8_t vlan_strip;
 
-	volatile unsigned long rx_pkts;
-	volatile unsigned long rx_bytes;
+	struct stats rx_pkts;
+	struct stats rx_bytes;
 };
 
 struct __rte_cache_aligned pkt_tx_queue {
@@ -65,9 +71,9 @@ struct __rte_cache_aligned pkt_tx_queue {
 	unsigned int framecount;
 	unsigned int framenum;
 
-	volatile unsigned long tx_pkts;
-	volatile unsigned long err_pkts;
-	volatile unsigned long tx_bytes;
+	struct stats tx_pkts;
+	struct stats err_pkts;
+	struct stats tx_bytes;
 };
 
 struct pmd_internals {
@@ -111,6 +117,34 @@ RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE);
 	rte_log(RTE_LOG_ ## level, af_packet_logtype, \
 		"%s(): " fmt ":%s\n", __func__, ##args, strerror(errno))
 
+static inline uint64_t
+stats_get(struct stats *s)
+{
+	uint64_t counter = rte_atomic_load_explicit(&s->counter,
+			rte_memory_order_relaxed);
+	uint64_t offset = rte_atomic_load_explicit(&s->offset,
+			rte_memory_order_relaxed);
+	return counter - offset;
+}
+
+static inline void
+stats_add(struct stats *s, uint16_t n)
+{
+	uint64_t counter = s->counter;
+	counter += n;
+	rte_atomic_store_explicit(&s->counter, counter,
+			rte_memory_order_relaxed);
+}
+
+static inline void
+stats_reset(struct stats *s)
+{
+	uint64_t counter = rte_atomic_load_explicit(&s->counter,
+			rte_memory_order_relaxed);
+	rte_atomic_store_explicit(&s->offset, counter,
+			rte_memory_order_relaxed);
+}
+
 static uint16_t
 eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
@@ -169,8 +203,8 @@ eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		num_rx_bytes += mbuf->pkt_len;
 	}
 	pkt_q->framenum = framenum;
-	pkt_q->rx_pkts += num_rx;
-	pkt_q->rx_bytes += num_rx_bytes;
+	stats_add(&pkt_q->rx_pkts, num_rx);
+	stats_add(&pkt_q->rx_bytes, num_rx_bytes);
 	return num_rx;
 }
 
@@ -305,9 +339,9 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	}
 
 	pkt_q->framenum = framenum;
-	pkt_q->tx_pkts += num_tx;
-	pkt_q->err_pkts += i - num_tx;
-	pkt_q->tx_bytes += num_tx_bytes;
+	stats_add(&pkt_q->tx_pkts, num_tx);
+	stats_add(&pkt_q->err_pkts, i - num_tx);
+	stats_add(&pkt_q->tx_bytes, num_tx_bytes);
 	return i;
 }
 
@@ -387,7 +421,7 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 }
 
 static int
-eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 {
 	unsigned i, imax;
 	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
@@ -397,27 +431,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
 	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
 	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
 	for (i = 0; i < imax; i++) {
-		igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
-		igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
-		rx_total += igb_stats->q_ipackets[i];
-		rx_bytes_total += igb_stats->q_ibytes[i];
+		struct pkt_rx_queue *rxq = &internal->rx_queue[i];
+		stats->q_ipackets[i] = stats_get(&rxq->rx_pkts);
+		stats->q_ibytes[i] = stats_get(&rxq->rx_bytes);
+		rx_total += stats->q_ipackets[i];
+		rx_bytes_total += stats->q_ibytes[i];
 	}
 
 	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
 	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
 	for (i = 0; i < imax; i++) {
-		igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
-		igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
-		tx_total += igb_stats->q_opackets[i];
-		tx_err_total += internal->tx_queue[i].err_pkts;
-		tx_bytes_total += igb_stats->q_obytes[i];
+		struct pkt_tx_queue *txq = &internal->tx_queue[i];
+		stats->q_opackets[i] = stats_get(&txq->tx_pkts);
+		stats->q_obytes[i] = stats_get(&txq->tx_bytes);
+		tx_total += stats->q_opackets[i];
+		tx_err_total += stats_get(&txq->err_pkts);
+		tx_bytes_total += stats->q_obytes[i];
 	}
 
-	igb_stats->ipackets = rx_total;
-	igb_stats->ibytes = rx_bytes_total;
-	igb_stats->opackets = tx_total;
-	igb_stats->oerrors = tx_err_total;
-	igb_stats->obytes = tx_bytes_total;
+	stats->ipackets = rx_total;
+	stats->ibytes = rx_bytes_total;
+	stats->opackets = tx_total;
+	stats->oerrors = tx_err_total;
+	stats->obytes = tx_bytes_total;
 	return 0;
 }
 
@@ -428,14 +464,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
 	struct pmd_internals *internal = dev->data->dev_private;
 
 	for (i = 0; i < internal->nb_queues; i++) {
-		internal->rx_queue[i].rx_pkts = 0;
-		internal->rx_queue[i].rx_bytes = 0;
+		struct pkt_rx_queue *rxq = &internal->rx_queue[i];
+		stats_reset(&rxq->rx_pkts);
+		stats_reset(&rxq->rx_bytes);
 	}
 
 	for (i = 0; i < internal->nb_queues; i++) {
-		internal->tx_queue[i].tx_pkts = 0;
-		internal->tx_queue[i].err_pkts = 0;
-		internal->tx_queue[i].tx_bytes = 0;
+		struct pkt_tx_queue *txq = &internal->tx_queue[i];
+		stats_reset(&txq->tx_pkts);
+		stats_reset(&txq->err_pkts);
+		stats_reset(&txq->tx_bytes);
 	}
 
 	return 0;
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-03 15:45 ` [RFC v3] " Ferruh Yigit
@ 2024-05-03 22:00   ` Stephen Hemminger
  2024-05-07 13:48     ` Ferruh Yigit
  2024-05-08  7:19     ` Mattias Rönnblom
  2024-05-07 15:27   ` Morten Brørup
  1 sibling, 2 replies; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-03 22:00 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom,
	Morten Brørup

On Fri, 3 May 2024 16:45:47 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> For stats reset, use an offset instead of zeroing out actual stats values,
> get_stats() displays diff between stats and offset.
> This way stats only updated in datapath and offset only updated in stats
> reset function. This makes stats reset function more reliable.
> 
> As stats only written by single thread, we can remove 'volatile' qualifier
> which should improve the performance in datapath.
> 
> While updating around, 'igb_stats' parameter renamed as 'stats'.
> 
> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
> ---
> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Cc: Morten Brørup <mb@smartsharesystems.com>
> 
> This update triggered by mail list discussion [1].
> 
> [1]
> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/


NAK

I did not hear a good argument why atomic or volatile was necessary in the first place.
Why?

Why is this driver special (a snowflake) compared to all the other drivers doing software
statistics (tap, virtio, xdp, ring, memif, netvsc, vmware)?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-04-28 15:11   ` Mattias Rönnblom
  2024-05-01 16:19     ` Ferruh Yigit
@ 2024-05-07  7:23     ` Mattias Rönnblom
  2024-05-07 13:49       ` Ferruh Yigit
  2024-05-07 19:19       ` Morten Brørup
  1 sibling, 2 replies; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-07  7:23 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

On 2024-04-28 17:11, Mattias Rönnblom wrote:
> On 2024-04-26 16:38, Ferruh Yigit wrote:
>> For stats reset, use an offset instead of zeroing out actual stats 
>> values,
>> get_stats() displays diff between stats and offset.
>> This way stats only updated in datapath and offset only updated in stats
>> reset function. This makes stats reset function more reliable.
>>
>> As stats only written by single thread, we can remove 'volatile' 
>> qualifier
>> which should improve the performance in datapath.
>>
> 
> volatile wouldn't help you if you had multiple writers, so that can't be 
> the reason for its removal. It would be more accurate to say it should 
> be replaced with atomic updates. If you don't use volatile and don't use 
> atomics, you have to consider if the compiler can reach the conclusion 
> that it does not need to store the counter value for future use *for 
> that thread*. Since otherwise, I don't think the store actually needs to 
> occur. Since DPDK statistics tend to work, it's pretty obvious that 
> current compilers tend not to reach this conclusion.
> 
> If this should be done 100% properly, the update operation should be a 
> non-atomic load, non-atomic add, and an atomic store. Similarly, for the 
> reset, the offset store should be atomic.
> 
> Considered the state of the rest of the DPDK code base, I think a 
> non-atomic, non-volatile solution is also fine.
> 
> (That said, I think we're better off just deprecating stats reset 
> altogether, and returning -ENOTSUP here meanwhile.)
> 
>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>> ---
>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>
>> This update triggered by mail list discussion [1].
>>
>> [1]
>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
>>
>> v2:
>> * Remove wrapping check for stats
>> ---
>>   drivers/net/af_packet/rte_eth_af_packet.c | 66 ++++++++++++++---------
>>   1 file changed, 41 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c 
>> b/drivers/net/af_packet/rte_eth_af_packet.c
>> index 397a32db5886..10c8e1e50139 100644
>> --- a/drivers/net/af_packet/rte_eth_af_packet.c
>> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
>> @@ -51,8 +51,10 @@ struct pkt_rx_queue {
>>       uint16_t in_port;
>>       uint8_t vlan_strip;
>> -    volatile unsigned long rx_pkts;
>> -    volatile unsigned long rx_bytes;
>> +    uint64_t rx_pkts;
>> +    uint64_t rx_bytes;
>> +    uint64_t rx_pkts_offset;
>> +    uint64_t rx_bytes_offset;
> 
> I suggest you introduce a separate struct for reset-able counters. It'll 
> make things cleaner, and you can sneak in atomics without too much 
> atomics-related bloat.
> 
> struct counter
> {
>      uint64_t count;
>      uint64_t offset;
> };
> 
> /../
>      struct counter rx_pkts;
>      struct counter rx_bytes;
> /../
> 
> static uint64_t
> counter_value(struct counter *counter)
> {
>      uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
>      uint64_t offset = __atomic_load_n(&counter->offset, __ATOMIC_RELAXED);
> 

Since the count and the offset are written to independently, without any 
ordering restrictions, an update and a reset in quick succession may 
cause the offset store to be globally visible before the new count. In 
such a scenario, a reader could see an offset > count.

Thus, unless I'm missing something, one should add a

if (unlikely(offset > count))
	return 0;

here. With the appropriate comment explaining why this might be.

Another approach would be to think about what memory barriers may be 
required to make sure one sees the count update before the offset 
update, but, intuitively, that seems like both more complex and more 
costly (performance-wise).

>      return count + offset;
> }
> 
> static void
> counter_reset(struct counter *counter)
> {
>      uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
> 
>      __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
> }
> 
> static void
> counter_add(struct counter *counter, uint64_t operand)
> {
>      __atomic_store_n(&counter->count, counter->count + operand, 
> __ATOMIC_RELAXED);
> }
> 
> You'd have to port this to <rte_stdatomic.h> calls, which prevents 
> non-atomic loads from RTE_ATOMIC()s. The non-atomic reads above must be 
> replaced with explicit relaxed non-atomic load. Otherwise, if you just 
> use "counter->count", that would be an atomic load with sequential 
> consistency memory order on C11 atomics-based builds, which would result 
> in a barrier, at least on weakly ordered machines (e.g., ARM).
> 
> I would still use a struct and some helper-functions even for the less 
> ambitious, non-atomic variant.
> 
> The only drawback of using GCC built-ins type atomics here, versus an 
> atomic- and volatile-free approach, is that current compilers seems to 
> refuse merging atomic stores. It's beyond me why this is the case. If 
> you store to a variable twice in quick succession, it'll be two store 
> machine instructions, even in cases where the compiler *knows* the value 
> is identical. So volatile, even though you didn't ask for it. Weird.
> 
> So if you have a loop, you may want to make an "counter_add()" in the 
> end from a temporary, to get the final 0.001% of performance.
> 
> If the tech board thinks MT-safe reset-able software-manage statistics 
> is the future (as opposed to dropping reset support, for example), I 
> think this stuff should go into a separate header file, so other PMDs 
> can reuse it. Maybe out of scope for this patch.
> 
>>   };
>>   struct pkt_tx_queue {
>> @@ -64,9 +66,12 @@ struct pkt_tx_queue {
>>       unsigned int framecount;
>>       unsigned int framenum;
>> -    volatile unsigned long tx_pkts;
>> -    volatile unsigned long err_pkts;
>> -    volatile unsigned long tx_bytes;
>> +    uint64_t tx_pkts;
>> +    uint64_t err_pkts;
>> +    uint64_t tx_bytes;
>> +    uint64_t tx_pkts_offset;
>> +    uint64_t err_pkts_offset;
>> +    uint64_t tx_bytes_offset;
>>   };
>>   struct pmd_internals {
>> @@ -385,8 +390,15 @@ eth_dev_info(struct rte_eth_dev *dev, struct 
>> rte_eth_dev_info *dev_info)
>>       return 0;
>>   }
>> +
>> +static uint64_t
>> +stats_get_diff(uint64_t stats, uint64_t offset)
>> +{
>> +    return stats - offset;
>> +}
>> +
>>   static int
>> -eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
>> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
>>   {
>>       unsigned i, imax;
>>       unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
>> @@ -396,27 +408,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct 
>> rte_eth_stats *igb_stats)
>>       imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>               internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>       for (i = 0; i < imax; i++) {
>> -        igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
>> -        igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
>> -        rx_total += igb_stats->q_ipackets[i];
>> -        rx_bytes_total += igb_stats->q_ibytes[i];
>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>> +        stats->q_ipackets[i] = stats_get_diff(rxq->rx_pkts, 
>> rxq->rx_pkts_offset);
>> +        stats->q_ibytes[i] = stats_get_diff(rxq->rx_bytes, 
>> rxq->rx_bytes_offset);
>> +        rx_total += stats->q_ipackets[i];
>> +        rx_bytes_total += stats->q_ibytes[i];
>>       }
>>       imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>               internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>       for (i = 0; i < imax; i++) {
>> -        igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
>> -        igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
>> -        tx_total += igb_stats->q_opackets[i];
>> -        tx_err_total += internal->tx_queue[i].err_pkts;
>> -        tx_bytes_total += igb_stats->q_obytes[i];
>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>> +        stats->q_opackets[i] = stats_get_diff(txq->tx_pkts, 
>> txq->tx_pkts_offset);
>> +        stats->q_obytes[i] = stats_get_diff(txq->tx_bytes, 
>> txq->tx_bytes_offset);
>> +        tx_total += stats->q_opackets[i];
>> +        tx_err_total += stats_get_diff(txq->err_pkts, 
>> txq->err_pkts_offset);
>> +        tx_bytes_total += stats->q_obytes[i];
>>       }
>> -    igb_stats->ipackets = rx_total;
>> -    igb_stats->ibytes = rx_bytes_total;
>> -    igb_stats->opackets = tx_total;
>> -    igb_stats->oerrors = tx_err_total;
>> -    igb_stats->obytes = tx_bytes_total;
>> +    stats->ipackets = rx_total;
>> +    stats->ibytes = rx_bytes_total;
>> +    stats->opackets = tx_total;
>> +    stats->oerrors = tx_err_total;
>> +    stats->obytes = tx_bytes_total;
>>       return 0;
>>   }
>> @@ -427,14 +441,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
>>       struct pmd_internals *internal = dev->data->dev_private;
>>       for (i = 0; i < internal->nb_queues; i++) {
>> -        internal->rx_queue[i].rx_pkts = 0;
>> -        internal->rx_queue[i].rx_bytes = 0;
>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>> +        rxq->rx_pkts_offset = rxq->rx_pkts;
>> +        rxq->rx_bytes_offset = rxq->rx_bytes;
>>       }
>>       for (i = 0; i < internal->nb_queues; i++) {
>> -        internal->tx_queue[i].tx_pkts = 0;
>> -        internal->tx_queue[i].err_pkts = 0;
>> -        internal->tx_queue[i].tx_bytes = 0;
>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>> +        txq->tx_pkts_offset = txq->tx_pkts;
>> +        txq->err_pkts_offset = txq->err_pkts;
>> +        txq->tx_bytes_offset = txq->tx_bytes;
>>       }
>>       return 0;

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-03 22:00   ` Stephen Hemminger
@ 2024-05-07 13:48     ` Ferruh Yigit
  2024-05-07 14:52       ` Stephen Hemminger
  2024-05-08  7:19     ` Mattias Rönnblom
  1 sibling, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-07 13:48 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom,
	Morten Brørup, Honnappa Nagarahalli

On 5/3/2024 11:00 PM, Stephen Hemminger wrote:
> On Fri, 3 May 2024 16:45:47 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>> For stats reset, use an offset instead of zeroing out actual stats values,
>> get_stats() displays diff between stats and offset.
>> This way stats only updated in datapath and offset only updated in stats
>> reset function. This makes stats reset function more reliable.
>>
>> As stats only written by single thread, we can remove 'volatile' qualifier
>> which should improve the performance in datapath.
>>
>> While updating around, 'igb_stats' parameter renamed as 'stats'.
>>
>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>> ---
>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>
>> This update triggered by mail list discussion [1].
>>
>> [1]
>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
> 
> 
> NAK
> 
> I did not hear a good argument why atomic or volatile was necessary in the first place.
> Why?
> 

Sure, the patch is done as RFC intentionally to discuss the approach.

Agree that volatile and atomics (fetch + add + store) is not required
for thread synchronization, as only one CPU updates stats.
Even this understanding is important because there are PMDs using full
atomics for stats update, like null PMD [1], this will help up clear them.


And there is a case, stats reset and stats updated in different threads
simultaneously, for this 'volatile' is not sufficient anyway and full
atomics is required. As this will cause performance impact we are
already saying stats update and reset can't happen at the same time [2].
With this update volatile and atomics are not required for this case too.
(Also using offset to increase stats reset reliability.)


In this patch volatile replaced with atomic load and atomic store (not
atomic fetch and add), to ensure that stats stored to memory and not
kept in device registers only.
With volatile, it is guaranteed that updated stats stored back to
memory, but without volatile and atomics I am not sure if this is
guaranteed. Practically I can see this working, but theoretically not
sure. This is similar concern with change in your patch that is casting
to volatile to ensure value read from memory [3].

Expectation is, only atomics load and store will have smaller
performance impact than volatile, ensuring memory load and store when
needed.


[1]
https://git.dpdk.org/dpdk/tree/drivers/net/null/rte_eth_null.c?h=v24.03#n105

[2]
https://patches.dpdk.org/project/dpdk/patch/20240425165308.1078454-1-ferruh.yigit@amd.com/

[3]
https://inbox.dpdk.org/dev/20240430154129.7347-1-stephen@networkplumber.org/
`#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var))))`



> Why is this driver special (a snowflake) compared to all the other drivers doing software
> statistics (tap, virtio, xdp, ring, memif, netvsc, vmware)?
>

Nothing special at all, only discussion started based on af_packet
implementation. If we give a decision based on this RFC, same logic can
be followed with existing or new software PMDs.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07  7:23     ` Mattias Rönnblom
@ 2024-05-07 13:49       ` Ferruh Yigit
  2024-05-07 14:51         ` Stephen Hemminger
  2024-05-08  6:25         ` Mattias Rönnblom
  2024-05-07 19:19       ` Morten Brørup
  1 sibling, 2 replies; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-07 13:49 UTC (permalink / raw)
  To: Mattias Rönnblom, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

On 5/7/2024 8:23 AM, Mattias Rönnblom wrote:
> On 2024-04-28 17:11, Mattias Rönnblom wrote:
>> On 2024-04-26 16:38, Ferruh Yigit wrote:
>>> For stats reset, use an offset instead of zeroing out actual stats
>>> values,
>>> get_stats() displays diff between stats and offset.
>>> This way stats only updated in datapath and offset only updated in stats
>>> reset function. This makes stats reset function more reliable.
>>>
>>> As stats only written by single thread, we can remove 'volatile'
>>> qualifier
>>> which should improve the performance in datapath.
>>>
>>
>> volatile wouldn't help you if you had multiple writers, so that can't
>> be the reason for its removal. It would be more accurate to say it
>> should be replaced with atomic updates. If you don't use volatile and
>> don't use atomics, you have to consider if the compiler can reach the
>> conclusion that it does not need to store the counter value for future
>> use *for that thread*. Since otherwise, I don't think the store
>> actually needs to occur. Since DPDK statistics tend to work, it's
>> pretty obvious that current compilers tend not to reach this conclusion.
>>
>> If this should be done 100% properly, the update operation should be a
>> non-atomic load, non-atomic add, and an atomic store. Similarly, for
>> the reset, the offset store should be atomic.
>>
>> Considered the state of the rest of the DPDK code base, I think a
>> non-atomic, non-volatile solution is also fine.
>>
>> (That said, I think we're better off just deprecating stats reset
>> altogether, and returning -ENOTSUP here meanwhile.)
>>
>>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>>> ---
>>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>>
>>> This update triggered by mail list discussion [1].
>>>
>>> [1]
>>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
>>>
>>> v2:
>>> * Remove wrapping check for stats
>>> ---
>>>   drivers/net/af_packet/rte_eth_af_packet.c | 66 ++++++++++++++---------
>>>   1 file changed, 41 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c
>>> b/drivers/net/af_packet/rte_eth_af_packet.c
>>> index 397a32db5886..10c8e1e50139 100644
>>> --- a/drivers/net/af_packet/rte_eth_af_packet.c
>>> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
>>> @@ -51,8 +51,10 @@ struct pkt_rx_queue {
>>>       uint16_t in_port;
>>>       uint8_t vlan_strip;
>>> -    volatile unsigned long rx_pkts;
>>> -    volatile unsigned long rx_bytes;
>>> +    uint64_t rx_pkts;
>>> +    uint64_t rx_bytes;
>>> +    uint64_t rx_pkts_offset;
>>> +    uint64_t rx_bytes_offset;
>>
>> I suggest you introduce a separate struct for reset-able counters.
>> It'll make things cleaner, and you can sneak in atomics without too
>> much atomics-related bloat.
>>
>> struct counter
>> {
>>      uint64_t count;
>>      uint64_t offset;
>> };
>>
>> /../
>>      struct counter rx_pkts;
>>      struct counter rx_bytes;
>> /../
>>
>> static uint64_t
>> counter_value(struct counter *counter)
>> {
>>      uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
>>      uint64_t offset = __atomic_load_n(&counter->offset,
>> __ATOMIC_RELAXED);
>>
> 
> Since the count and the offset are written to independently, without any
> ordering restrictions, an update and a reset in quick succession may
> cause the offset store to be globally visible before the new count. In
> such a scenario, a reader could see an offset > count.
> 
> Thus, unless I'm missing something, one should add a
> 
> if (unlikely(offset > count))
>     return 0;
> 
> here. With the appropriate comment explaining why this might be.
> 
> Another approach would be to think about what memory barriers may be
> required to make sure one sees the count update before the offset
> update, but, intuitively, that seems like both more complex and more
> costly (performance-wise).
> 

We are going with lazy alternative and requesting to stop forwarding
before stats reset, this should prevent 'count' and 'offset' being
updated simultaneously.


>>      return count + offset;
>> }
>>
>> static void
>> counter_reset(struct counter *counter)
>> {
>>      uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
>>
>>      __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
>> }
>>
>> static void
>> counter_add(struct counter *counter, uint64_t operand)
>> {
>>      __atomic_store_n(&counter->count, counter->count + operand,
>> __ATOMIC_RELAXED);
>> }
>>
>> You'd have to port this to <rte_stdatomic.h> calls, which prevents
>> non-atomic loads from RTE_ATOMIC()s. The non-atomic reads above must
>> be replaced with explicit relaxed non-atomic load. Otherwise, if you
>> just use "counter->count", that would be an atomic load with
>> sequential consistency memory order on C11 atomics-based builds, which
>> would result in a barrier, at least on weakly ordered machines (e.g.,
>> ARM).
>>
>> I would still use a struct and some helper-functions even for the less
>> ambitious, non-atomic variant.
>>
>> The only drawback of using GCC built-ins type atomics here, versus an
>> atomic- and volatile-free approach, is that current compilers seems to
>> refuse merging atomic stores. It's beyond me why this is the case. If
>> you store to a variable twice in quick succession, it'll be two store
>> machine instructions, even in cases where the compiler *knows* the
>> value is identical. So volatile, even though you didn't ask for it.
>> Weird.
>>
>> So if you have a loop, you may want to make an "counter_add()" in the
>> end from a temporary, to get the final 0.001% of performance.
>>
>> If the tech board thinks MT-safe reset-able software-manage statistics
>> is the future (as opposed to dropping reset support, for example), I
>> think this stuff should go into a separate header file, so other PMDs
>> can reuse it. Maybe out of scope for this patch.
>>
>>>   };
>>>   struct pkt_tx_queue {
>>> @@ -64,9 +66,12 @@ struct pkt_tx_queue {
>>>       unsigned int framecount;
>>>       unsigned int framenum;
>>> -    volatile unsigned long tx_pkts;
>>> -    volatile unsigned long err_pkts;
>>> -    volatile unsigned long tx_bytes;
>>> +    uint64_t tx_pkts;
>>> +    uint64_t err_pkts;
>>> +    uint64_t tx_bytes;
>>> +    uint64_t tx_pkts_offset;
>>> +    uint64_t err_pkts_offset;
>>> +    uint64_t tx_bytes_offset;
>>>   };
>>>   struct pmd_internals {
>>> @@ -385,8 +390,15 @@ eth_dev_info(struct rte_eth_dev *dev, struct
>>> rte_eth_dev_info *dev_info)
>>>       return 0;
>>>   }
>>> +
>>> +static uint64_t
>>> +stats_get_diff(uint64_t stats, uint64_t offset)
>>> +{
>>> +    return stats - offset;
>>> +}
>>> +
>>>   static int
>>> -eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
>>> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
>>>   {
>>>       unsigned i, imax;
>>>       unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
>>> @@ -396,27 +408,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct
>>> rte_eth_stats *igb_stats)
>>>       imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>>               internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>>       for (i = 0; i < imax; i++) {
>>> -        igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
>>> -        igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
>>> -        rx_total += igb_stats->q_ipackets[i];
>>> -        rx_bytes_total += igb_stats->q_ibytes[i];
>>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>>> +        stats->q_ipackets[i] = stats_get_diff(rxq->rx_pkts,
>>> rxq->rx_pkts_offset);
>>> +        stats->q_ibytes[i] = stats_get_diff(rxq->rx_bytes,
>>> rxq->rx_bytes_offset);
>>> +        rx_total += stats->q_ipackets[i];
>>> +        rx_bytes_total += stats->q_ibytes[i];
>>>       }
>>>       imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>>               internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>>       for (i = 0; i < imax; i++) {
>>> -        igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
>>> -        igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
>>> -        tx_total += igb_stats->q_opackets[i];
>>> -        tx_err_total += internal->tx_queue[i].err_pkts;
>>> -        tx_bytes_total += igb_stats->q_obytes[i];
>>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>>> +        stats->q_opackets[i] = stats_get_diff(txq->tx_pkts,
>>> txq->tx_pkts_offset);
>>> +        stats->q_obytes[i] = stats_get_diff(txq->tx_bytes,
>>> txq->tx_bytes_offset);
>>> +        tx_total += stats->q_opackets[i];
>>> +        tx_err_total += stats_get_diff(txq->err_pkts,
>>> txq->err_pkts_offset);
>>> +        tx_bytes_total += stats->q_obytes[i];
>>>       }
>>> -    igb_stats->ipackets = rx_total;
>>> -    igb_stats->ibytes = rx_bytes_total;
>>> -    igb_stats->opackets = tx_total;
>>> -    igb_stats->oerrors = tx_err_total;
>>> -    igb_stats->obytes = tx_bytes_total;
>>> +    stats->ipackets = rx_total;
>>> +    stats->ibytes = rx_bytes_total;
>>> +    stats->opackets = tx_total;
>>> +    stats->oerrors = tx_err_total;
>>> +    stats->obytes = tx_bytes_total;
>>>       return 0;
>>>   }
>>> @@ -427,14 +441,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
>>>       struct pmd_internals *internal = dev->data->dev_private;
>>>       for (i = 0; i < internal->nb_queues; i++) {
>>> -        internal->rx_queue[i].rx_pkts = 0;
>>> -        internal->rx_queue[i].rx_bytes = 0;
>>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>>> +        rxq->rx_pkts_offset = rxq->rx_pkts;
>>> +        rxq->rx_bytes_offset = rxq->rx_bytes;
>>>       }
>>>       for (i = 0; i < internal->nb_queues; i++) {
>>> -        internal->tx_queue[i].tx_pkts = 0;
>>> -        internal->tx_queue[i].err_pkts = 0;
>>> -        internal->tx_queue[i].tx_bytes = 0;
>>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>>> +        txq->tx_pkts_offset = txq->tx_pkts;
>>> +        txq->err_pkts_offset = txq->err_pkts;
>>> +        txq->tx_bytes_offset = txq->tx_bytes;
>>>       }
>>>       return 0;


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07 13:49       ` Ferruh Yigit
@ 2024-05-07 14:51         ` Stephen Hemminger
  2024-05-07 16:00           ` Morten Brørup
  2024-05-08  6:28           ` Mattias Rönnblom
  2024-05-08  6:25         ` Mattias Rönnblom
  1 sibling, 2 replies; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-07 14:51 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Mattias Rönnblom, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom, Morten Brørup

On Tue, 7 May 2024 14:49:19 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> On 5/7/2024 8:23 AM, Mattias Rönnblom wrote:
> > On 2024-04-28 17:11, Mattias Rönnblom wrote:  
> >> On 2024-04-26 16:38, Ferruh Yigit wrote:  
> >>> For stats reset, use an offset instead of zeroing out actual stats
> >>> values,
> >>> get_stats() displays diff between stats and offset.
> >>> This way stats only updated in datapath and offset only updated in stats
> >>> reset function. This makes stats reset function more reliable.
> >>>
> >>> As stats only written by single thread, we can remove 'volatile'
> >>> qualifier
> >>> which should improve the performance in datapath.
> >>>  
> >>
> >> volatile wouldn't help you if you had multiple writers, so that can't
> >> be the reason for its removal. It would be more accurate to say it
> >> should be replaced with atomic updates. If you don't use volatile and
> >> don't use atomics, you have to consider if the compiler can reach the
> >> conclusion that it does not need to store the counter value for future
> >> use *for that thread*. Since otherwise, I don't think the store
> >> actually needs to occur. Since DPDK statistics tend to work, it's
> >> pretty obvious that current compilers tend not to reach this conclusion.
> >>
> >> If this should be done 100% properly, the update operation should be a
> >> non-atomic load, non-atomic add, and an atomic store. Similarly, for
> >> the reset, the offset store should be atomic.
> >>
> >> Considered the state of the rest of the DPDK code base, I think a
> >> non-atomic, non-volatile solution is also fine.
> >>
> >> (That said, I think we're better off just deprecating stats reset
> >> altogether, and returning -ENOTSUP here meanwhile.)
> >>  
> >>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
> >>> ---
> >>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> >>> Cc: Stephen Hemminger <stephen@networkplumber.org>
> >>> Cc: Morten Brørup <mb@smartsharesystems.com>
> >>>
> >>> This update triggered by mail list discussion [1].
> >>>
> >>> [1]
> >>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/

I would prefer that the SW statistics be handled generically by ethdev
layers and used by all such drivers.

The most complete version of SW stats now is in the virtio driver.
If reset needs to be reliable (debatable), then it needs to be done without
atomics.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-07 13:48     ` Ferruh Yigit
@ 2024-05-07 14:52       ` Stephen Hemminger
  2024-05-07 17:27         ` Ferruh Yigit
  0 siblings, 1 reply; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-07 14:52 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom,
	Morten Brørup, Honnappa Nagarahalli

On Tue, 7 May 2024 14:48:51 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> On 5/3/2024 11:00 PM, Stephen Hemminger wrote:
> > On Fri, 3 May 2024 16:45:47 +0100
> > Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> >   
> >> For stats reset, use an offset instead of zeroing out actual stats values,
> >> get_stats() displays diff between stats and offset.
> >> This way stats only updated in datapath and offset only updated in stats
> >> reset function. This makes stats reset function more reliable.
> >>
> >> As stats only written by single thread, we can remove 'volatile' qualifier
> >> which should improve the performance in datapath.
> >>
> >> While updating around, 'igb_stats' parameter renamed as 'stats'.
> >>
> >> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
> >> ---
> >> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> >> Cc: Stephen Hemminger <stephen@networkplumber.org>
> >> Cc: Morten Brørup <mb@smartsharesystems.com>
> >>
> >> This update triggered by mail list discussion [1].
> >>
> >> [1]
> >> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/  
> > 
> > 
> > NAK
> > 
> > I did not hear a good argument why atomic or volatile was necessary in the first place.
> > Why?
> >   
> 
> Sure, the patch is done as RFC intentionally to discuss the approach.
> 
> Agree that volatile and atomics (fetch + add + store) is not required
> for thread synchronization, as only one CPU updates stats.
> Even this understanding is important because there are PMDs using full
> atomics for stats update, like null PMD [1], this will help up clear them.
> 
> 
> And there is a case, stats reset and stats updated in different threads
> simultaneously, for this 'volatile' is not sufficient anyway and full
> atomics is required. As this will cause performance impact we are
> already saying stats update and reset can't happen at the same time [2].
> With this update volatile and atomics are not required for this case too.
> (Also using offset to increase stats reset reliability.)
> 
> 
> In this patch volatile replaced with atomic load and atomic store (not
> atomic fetch and add), to ensure that stats stored to memory and not
> kept in device registers only.
> With volatile, it is guaranteed that updated stats stored back to
> memory, but without volatile and atomics I am not sure if this is
> guaranteed. Practically I can see this working, but theoretically not
> sure. This is similar concern with change in your patch that is casting
> to volatile to ensure value read from memory [3].
> 
> Expectation is, only atomics load and store will have smaller
> performance impact than volatile, ensuring memory load and store when
> needed.

The device register worry, can just be handled with a compiler barrier.
Does not need the stronger guarantee of atomic or volatile.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-03 15:45 ` [RFC v3] " Ferruh Yigit
  2024-05-03 22:00   ` Stephen Hemminger
@ 2024-05-07 15:27   ` Morten Brørup
  2024-05-07 17:40     ` Ferruh Yigit
  1 sibling, 1 reply; 42+ messages in thread
From: Morten Brørup @ 2024-05-07 15:27 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

> From: Ferruh Yigit [mailto:ferruh.yigit@amd.com]
> Sent: Friday, 3 May 2024 17.46
> 
> For stats reset, use an offset instead of zeroing out actual stats values,
> get_stats() displays diff between stats and offset.
> This way stats only updated in datapath and offset only updated in stats
> reset function. This makes stats reset function more reliable.
> 
> As stats only written by single thread, we can remove 'volatile' qualifier
> which should improve the performance in datapath.
> 
> While updating around, 'igb_stats' parameter renamed as 'stats'.
> 
> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
> ---
> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Cc: Morten Brørup <mb@smartsharesystems.com>
> 
> This update triggered by mail list discussion [1].
> 
> [1]
> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-
> 5f4dd3969f99@lysator.liu.se/
> 
> v2:
> * Remove wrapping check for stats
> 
> v3:
> * counter and offset put into same struct per stats
> * Use atomic load / store for stats values
> ---

Note: My comments below relate to software PMDs only.

Design for the following invariants:
1. "counter" may increase at any time. (So stopping forwarding is not required.)
2. "counter" may not decrease.
3. "offset" is always <= "counter".

So:

Stats_get() must read "offset" before "counter"; if "counter" races to increase in the mean time, it doesn't hurt. Stats_get() is a relatively "cold" function, so barriers etc. are acceptable.

Assuming that stats_add() lazy-writes "counter"; if stats_get() reads an old value, its result will be slightly off, but not negative.

Similarly for stats_reset(), which obviously reads "counter" before writing "offset"; if "counter" races to increase in the mean time, the too low "offset" will not cause negative stats from stats_get().


And a requested change for performance:

> +struct stats {
> +	uint64_t counter;
> +	uint64_t offset;
> +};

The "offset" is cold.
Stats_add(), which is the only hot function, only touches "counter".

Instead of having a struct with {counter, offset}, I strongly prefer having them separate.
E.g. as a struct defining the set of statistics (e.g. pkts, bytes, errors), instantiated once for the counters (in a hot zone of the device data structure) and once for the offsets (in a cold zone of the device data structure).
There could be variants of this "set of statistics" struct, e.g. one for RX and a different one for TX. (Each variant would be instantiated twice, once for counters, and once for offsets.)


^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07 14:51         ` Stephen Hemminger
@ 2024-05-07 16:00           ` Morten Brørup
  2024-05-07 16:54             ` Ferruh Yigit
  2024-05-08  7:48             ` Mattias Rönnblom
  2024-05-08  6:28           ` Mattias Rönnblom
  1 sibling, 2 replies; 42+ messages in thread
From: Morten Brørup @ 2024-05-07 16:00 UTC (permalink / raw)
  To: Stephen Hemminger, Ferruh Yigit
  Cc: Mattias Rönnblom, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom

> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Tuesday, 7 May 2024 16.51

> I would prefer that the SW statistics be handled generically by ethdev
> layers and used by all such drivers.

I agree.

Please note that maintaining counters in the ethdev layer might cause more cache misses than maintaining them in the hot parts of the individual drivers' data structures, so it's not all that simple. ;-)

Until then, let's find a short term solution, viable to implement across all software NIC drivers without API/ABI breakage.

> 
> The most complete version of SW stats now is in the virtio driver.

It looks like the virtio PMD maintains the counters; they are not retrieved from the host.

Considering a DPDK application running as a virtual machine (guest) on a host server...

If the host is unable to put a packet onto the guest's virtio RX queue - like when a HW NIC is out of RX descriptors - is it counted somewhere visible to the guest?

Similarly, if the guest is unable to put a packet onto its virtio TX queue, is it counted somewhere visible to the host?

> If reset needs to be reliable (debatable), then it needs to be done without
> atomics.

Let's modify that slightly: Without performance degradation in the fast path.
I'm not sure that all atomic operations are slow.
But you are right that it needs to be done without _Atomic counters; they seem to be slow.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07 16:00           ` Morten Brørup
@ 2024-05-07 16:54             ` Ferruh Yigit
  2024-05-07 18:47               ` Stephen Hemminger
  2024-05-08  7:48             ` Mattias Rönnblom
  1 sibling, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-07 16:54 UTC (permalink / raw)
  To: Morten Brørup, Stephen Hemminger
  Cc: Mattias Rönnblom, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom

On 5/7/2024 5:00 PM, Morten Brørup wrote:
>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
>> Sent: Tuesday, 7 May 2024 16.51
> 
>> I would prefer that the SW statistics be handled generically by ethdev
>> layers and used by all such drivers.
> 
> I agree.
> 
> Please note that maintaining counters in the ethdev layer might cause more cache misses than maintaining them in the hot parts of the individual drivers' data structures, so it's not all that simple. ;-)
> 
> Until then, let's find a short term solution, viable to implement across all software NIC drivers without API/ABI breakage.
> 

I am against ehtdev layer being aware of SW drivers and behave
differently for them.
This is dev_ops and can be managed per driver. We can add helper
functions for drivers if there is a common pattern.

>>
>> The most complete version of SW stats now is in the virtio driver.
> 
> It looks like the virtio PMD maintains the counters; they are not retrieved from the host.
> 
> Considering a DPDK application running as a virtual machine (guest) on a host server...
> 
> If the host is unable to put a packet onto the guest's virtio RX queue - like when a HW NIC is out of RX descriptors - is it counted somewhere visible to the guest?
> 
> Similarly, if the guest is unable to put a packet onto its virtio TX queue, is it counted somewhere visible to the host?
> 
>> If reset needs to be reliable (debatable), then it needs to be done without
>> atomics.
> 
> Let's modify that slightly: Without performance degradation in the fast path.
> I'm not sure that all atomic operations are slow.
> But you are right that it needs to be done without _Atomic counters; they seem to be slow.
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-07 14:52       ` Stephen Hemminger
@ 2024-05-07 17:27         ` Ferruh Yigit
  0 siblings, 0 replies; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-07 17:27 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom,
	Morten Brørup, Honnappa Nagarahalli

On 5/7/2024 3:52 PM, Stephen Hemminger wrote:
> On Tue, 7 May 2024 14:48:51 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>> On 5/3/2024 11:00 PM, Stephen Hemminger wrote:
>>> On Fri, 3 May 2024 16:45:47 +0100
>>> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
>>>   
>>>> For stats reset, use an offset instead of zeroing out actual stats values,
>>>> get_stats() displays diff between stats and offset.
>>>> This way stats only updated in datapath and offset only updated in stats
>>>> reset function. This makes stats reset function more reliable.
>>>>
>>>> As stats only written by single thread, we can remove 'volatile' qualifier
>>>> which should improve the performance in datapath.
>>>>
>>>> While updating around, 'igb_stats' parameter renamed as 'stats'.
>>>>
>>>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>>>> ---
>>>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>>>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>>>
>>>> This update triggered by mail list discussion [1].
>>>>
>>>> [1]
>>>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/  
>>>
>>>
>>> NAK
>>>
>>> I did not hear a good argument why atomic or volatile was necessary in the first place.
>>> Why?
>>>   
>>
>> Sure, the patch is done as RFC intentionally to discuss the approach.
>>
>> Agree that volatile and atomics (fetch + add + store) is not required
>> for thread synchronization, as only one CPU updates stats.
>> Even this understanding is important because there are PMDs using full
>> atomics for stats update, like null PMD [1], this will help up clear them.
>>
>>
>> And there is a case, stats reset and stats updated in different threads
>> simultaneously, for this 'volatile' is not sufficient anyway and full
>> atomics is required. As this will cause performance impact we are
>> already saying stats update and reset can't happen at the same time [2].
>> With this update volatile and atomics are not required for this case too.
>> (Also using offset to increase stats reset reliability.)
>>
>>
>> In this patch volatile replaced with atomic load and atomic store (not
>> atomic fetch and add), to ensure that stats stored to memory and not
>> kept in device registers only.
>> With volatile, it is guaranteed that updated stats stored back to
>> memory, but without volatile and atomics I am not sure if this is
>> guaranteed. Practically I can see this working, but theoretically not
>> sure. This is similar concern with change in your patch that is casting
>> to volatile to ensure value read from memory [3].
>>
>> Expectation is, only atomics load and store will have smaller
>> performance impact than volatile, ensuring memory load and store when
>> needed.
> 
> The device register worry, can just be handled with a compiler barrier.
> Does not need the stronger guarantee of atomic or volatile.
>

Based on Morten's email, counter being stored late to memory may not be
an issue, so may not need even compiler barrier, let me check again.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-07 15:27   ` Morten Brørup
@ 2024-05-07 17:40     ` Ferruh Yigit
  0 siblings, 0 replies; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-07 17:40 UTC (permalink / raw)
  To: Morten Brørup, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

On 5/7/2024 4:27 PM, Morten Brørup wrote:
>> From: Ferruh Yigit [mailto:ferruh.yigit@amd.com]
>> Sent: Friday, 3 May 2024 17.46
>>
>> For stats reset, use an offset instead of zeroing out actual stats values,
>> get_stats() displays diff between stats and offset.
>> This way stats only updated in datapath and offset only updated in stats
>> reset function. This makes stats reset function more reliable.
>>
>> As stats only written by single thread, we can remove 'volatile' qualifier
>> which should improve the performance in datapath.
>>
>> While updating around, 'igb_stats' parameter renamed as 'stats'.
>>
>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>> ---
>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>
>> This update triggered by mail list discussion [1].
>>
>> [1]
>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-
>> 5f4dd3969f99@lysator.liu.se/
>>
>> v2:
>> * Remove wrapping check for stats
>>
>> v3:
>> * counter and offset put into same struct per stats
>> * Use atomic load / store for stats values
>> ---
> 
> Note: My comments below relate to software PMDs only.
> 
> Design for the following invariants:
> 1. "counter" may increase at any time. (So stopping forwarding is not required.)
>

Mattias mentioned a case [1] that may end up 'offset > count', for being
safe side we may start with restriction.

[1]
https://inbox.dpdk.org/dev/20240425174617.2126159-1-ferruh.yigit@amd.com/T/#m29cd179228c164181d2bb7dea716dee6e91ab169

> 2. "counter" may not decrease.
> 3. "offset" is always <= "counter".
> 
> So:
> 
> Stats_get() must read "offset" before "counter"; if "counter" races to increase in the mean time, it doesn't hurt. Stats_get() is a relatively "cold" function, so barriers etc. are acceptable.
> 
> Assuming that stats_add() lazy-writes "counter"; if stats_get() reads an old value, its result will be slightly off, but not negative.
> 
> Similarly for stats_reset(), which obviously reads "counter" before writing "offset"; if "counter" races to increase in the mean time, the too low "offset" will not cause negative stats from stats_get().
> 

ack on above items.

> 
> And a requested change for performance:
> 
>> +struct stats {
>> +	uint64_t counter;
>> +	uint64_t offset;
>> +};
> 
> The "offset" is cold.
> Stats_add(), which is the only hot function, only touches "counter".
> 
> Instead of having a struct with {counter, offset}, I strongly prefer having them separate.
> E.g. as a struct defining the set of statistics (e.g. pkts, bytes, errors), instantiated once for the counters (in a hot zone of the device data structure) and once for the offsets (in a cold zone of the device data structure).
> There could be variants of this "set of statistics" struct, e.g. one for RX and a different one for TX. (Each variant would be instantiated twice, once for counters, and once for offsets.)
> 

Although having them together was logical, good point from performance
perspective, let me work on it.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07 16:54             ` Ferruh Yigit
@ 2024-05-07 18:47               ` Stephen Hemminger
  0 siblings, 0 replies; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-07 18:47 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Morten Brørup, Mattias Rönnblom, John W. Linville,
	Thomas Monjalon, dev, Mattias Rönnblom

On Tue, 7 May 2024 17:54:18 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> On 5/7/2024 5:00 PM, Morten Brørup wrote:
> >> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> >> Sent: Tuesday, 7 May 2024 16.51  
> >   
> >> I would prefer that the SW statistics be handled generically by ethdev
> >> layers and used by all such drivers.  
> > 
> > I agree.
> > 
> > Please note that maintaining counters in the ethdev layer might cause more cache misses than maintaining them in the hot parts of the individual drivers' data structures, so it's not all that simple. ;-)
> > 
> > Until then, let's find a short term solution, viable to implement across all software NIC drivers without API/ABI breakage.
> >   
> 
> I am against ehtdev layer being aware of SW drivers and behave
> differently for them.
> This is dev_ops and can be managed per driver. We can add helper
> functions for drivers if there is a common pattern.

It is more about having a set of helper routines for SW only drivers.
I have something in progress for this.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07  7:23     ` Mattias Rönnblom
  2024-05-07 13:49       ` Ferruh Yigit
@ 2024-05-07 19:19       ` Morten Brørup
  2024-05-08  6:34         ` Mattias Rönnblom
  1 sibling, 1 reply; 42+ messages in thread
From: Morten Brørup @ 2024-05-07 19:19 UTC (permalink / raw)
  To: Mattias Rönnblom, Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Tuesday, 7 May 2024 09.24
> 
> On 2024-04-28 17:11, Mattias Rönnblom wrote:
> > On 2024-04-26 16:38, Ferruh Yigit wrote:

[...]

> > static uint64_t
> > counter_value(struct counter *counter)
> > {
> >      uint64_t count = __atomic_load_n(&counter->count,
> __ATOMIC_RELAXED);
> >      uint64_t offset = __atomic_load_n(&counter->offset,
> __ATOMIC_RELAXED);
> >
> 
> Since the count and the offset are written to independently, without any
> ordering restrictions, an update and a reset in quick succession may
> cause the offset store to be globally visible before the new count.

Good catch.
This may happen when a thread calls stats_add() and then the same thread calls stats_reset().

> In such a scenario, a reader could see an offset > count.
> 
> Thus, unless I'm missing something, one should add a
> 
> if (unlikely(offset > count))
> 	return 0;
> 
> here. With the appropriate comment explaining why this might be.
> 
> Another approach would be to think about what memory barriers may be
> required to make sure one sees the count update before the offset
> update, but, intuitively, that seems like both more complex and more
> costly (performance-wise).

I think it can be done without affecting stats_add(), by using "offset" with Release-Consume ordering:
 - stats_reset() must write "offset" with memory_order_release, so "counter" cannot be visible after it, and
 - stats_get() must read "offset" with memory_order_consume, so no reads or writes in the current thread dependent on "offset" can be reordered before this load, and writes to "counter" (a data-dependent variable) in other threads that release "offset" are visible in the current thread.

> 
> >      return count + offset;
> > }
> >
> > static void
> > counter_reset(struct counter *counter)
> > {
> >      uint64_t count = __atomic_load_n(&counter->count,
> __ATOMIC_RELAXED);
> >
> >      __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
> > }
> >
> > static void
> > counter_add(struct counter *counter, uint64_t operand)
> > {
> >      __atomic_store_n(&counter->count, counter->count + operand,
> > __ATOMIC_RELAXED);
> > }


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07 13:49       ` Ferruh Yigit
  2024-05-07 14:51         ` Stephen Hemminger
@ 2024-05-08  6:25         ` Mattias Rönnblom
  1 sibling, 0 replies; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-08  6:25 UTC (permalink / raw)
  To: Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger,
	Morten Brørup

On 2024-05-07 15:49, Ferruh Yigit wrote:
> On 5/7/2024 8:23 AM, Mattias Rönnblom wrote:
>> On 2024-04-28 17:11, Mattias Rönnblom wrote:
>>> On 2024-04-26 16:38, Ferruh Yigit wrote:
>>>> For stats reset, use an offset instead of zeroing out actual stats
>>>> values,
>>>> get_stats() displays diff between stats and offset.
>>>> This way stats only updated in datapath and offset only updated in stats
>>>> reset function. This makes stats reset function more reliable.
>>>>
>>>> As stats only written by single thread, we can remove 'volatile'
>>>> qualifier
>>>> which should improve the performance in datapath.
>>>>
>>>
>>> volatile wouldn't help you if you had multiple writers, so that can't
>>> be the reason for its removal. It would be more accurate to say it
>>> should be replaced with atomic updates. If you don't use volatile and
>>> don't use atomics, you have to consider if the compiler can reach the
>>> conclusion that it does not need to store the counter value for future
>>> use *for that thread*. Since otherwise, I don't think the store
>>> actually needs to occur. Since DPDK statistics tend to work, it's
>>> pretty obvious that current compilers tend not to reach this conclusion.
>>>
>>> If this should be done 100% properly, the update operation should be a
>>> non-atomic load, non-atomic add, and an atomic store. Similarly, for
>>> the reset, the offset store should be atomic.
>>>
>>> Considered the state of the rest of the DPDK code base, I think a
>>> non-atomic, non-volatile solution is also fine.
>>>
>>> (That said, I think we're better off just deprecating stats reset
>>> altogether, and returning -ENOTSUP here meanwhile.)
>>>
>>>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>>>> ---
>>>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>>>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>>>
>>>> This update triggered by mail list discussion [1].
>>>>
>>>> [1]
>>>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
>>>>
>>>> v2:
>>>> * Remove wrapping check for stats
>>>> ---
>>>>    drivers/net/af_packet/rte_eth_af_packet.c | 66 ++++++++++++++---------
>>>>    1 file changed, 41 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c
>>>> b/drivers/net/af_packet/rte_eth_af_packet.c
>>>> index 397a32db5886..10c8e1e50139 100644
>>>> --- a/drivers/net/af_packet/rte_eth_af_packet.c
>>>> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
>>>> @@ -51,8 +51,10 @@ struct pkt_rx_queue {
>>>>        uint16_t in_port;
>>>>        uint8_t vlan_strip;
>>>> -    volatile unsigned long rx_pkts;
>>>> -    volatile unsigned long rx_bytes;
>>>> +    uint64_t rx_pkts;
>>>> +    uint64_t rx_bytes;
>>>> +    uint64_t rx_pkts_offset;
>>>> +    uint64_t rx_bytes_offset;
>>>
>>> I suggest you introduce a separate struct for reset-able counters.
>>> It'll make things cleaner, and you can sneak in atomics without too
>>> much atomics-related bloat.
>>>
>>> struct counter
>>> {
>>>       uint64_t count;
>>>       uint64_t offset;
>>> };
>>>
>>> /../
>>>       struct counter rx_pkts;
>>>       struct counter rx_bytes;
>>> /../
>>>
>>> static uint64_t
>>> counter_value(struct counter *counter)
>>> {
>>>       uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
>>>       uint64_t offset = __atomic_load_n(&counter->offset,
>>> __ATOMIC_RELAXED);
>>>
>>
>> Since the count and the offset are written to independently, without any
>> ordering restrictions, an update and a reset in quick succession may
>> cause the offset store to be globally visible before the new count. In
>> such a scenario, a reader could see an offset > count.
>>
>> Thus, unless I'm missing something, one should add a
>>
>> if (unlikely(offset > count))
>>      return 0;
>>
>> here. With the appropriate comment explaining why this might be.
>>
>> Another approach would be to think about what memory barriers may be
>> required to make sure one sees the count update before the offset
>> update, but, intuitively, that seems like both more complex and more
>> costly (performance-wise).
>>
> 
> We are going with lazy alternative and requesting to stop forwarding
> before stats reset, this should prevent 'count' and 'offset' being
> updated simultaneously.
> 
> 

In that case, 'offset' is not needed.

>>>       return count + offset;
>>> }
>>>
>>> static void
>>> counter_reset(struct counter *counter)
>>> {
>>>       uint64_t count = __atomic_load_n(&counter->count, __ATOMIC_RELAXED);
>>>
>>>       __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
>>> }
>>>
>>> static void
>>> counter_add(struct counter *counter, uint64_t operand)
>>> {
>>>       __atomic_store_n(&counter->count, counter->count + operand,
>>> __ATOMIC_RELAXED);
>>> }
>>>
>>> You'd have to port this to <rte_stdatomic.h> calls, which prevents
>>> non-atomic loads from RTE_ATOMIC()s. The non-atomic reads above must
>>> be replaced with explicit relaxed non-atomic load. Otherwise, if you
>>> just use "counter->count", that would be an atomic load with
>>> sequential consistency memory order on C11 atomics-based builds, which
>>> would result in a barrier, at least on weakly ordered machines (e.g.,
>>> ARM).
>>>
>>> I would still use a struct and some helper-functions even for the less
>>> ambitious, non-atomic variant.
>>>
>>> The only drawback of using GCC built-ins type atomics here, versus an
>>> atomic- and volatile-free approach, is that current compilers seems to
>>> refuse merging atomic stores. It's beyond me why this is the case. If
>>> you store to a variable twice in quick succession, it'll be two store
>>> machine instructions, even in cases where the compiler *knows* the
>>> value is identical. So volatile, even though you didn't ask for it.
>>> Weird.
>>>
>>> So if you have a loop, you may want to make an "counter_add()" in the
>>> end from a temporary, to get the final 0.001% of performance.
>>>
>>> If the tech board thinks MT-safe reset-able software-manage statistics
>>> is the future (as opposed to dropping reset support, for example), I
>>> think this stuff should go into a separate header file, so other PMDs
>>> can reuse it. Maybe out of scope for this patch.
>>>
>>>>    };
>>>>    struct pkt_tx_queue {
>>>> @@ -64,9 +66,12 @@ struct pkt_tx_queue {
>>>>        unsigned int framecount;
>>>>        unsigned int framenum;
>>>> -    volatile unsigned long tx_pkts;
>>>> -    volatile unsigned long err_pkts;
>>>> -    volatile unsigned long tx_bytes;
>>>> +    uint64_t tx_pkts;
>>>> +    uint64_t err_pkts;
>>>> +    uint64_t tx_bytes;
>>>> +    uint64_t tx_pkts_offset;
>>>> +    uint64_t err_pkts_offset;
>>>> +    uint64_t tx_bytes_offset;
>>>>    };
>>>>    struct pmd_internals {
>>>> @@ -385,8 +390,15 @@ eth_dev_info(struct rte_eth_dev *dev, struct
>>>> rte_eth_dev_info *dev_info)
>>>>        return 0;
>>>>    }
>>>> +
>>>> +static uint64_t
>>>> +stats_get_diff(uint64_t stats, uint64_t offset)
>>>> +{
>>>> +    return stats - offset;
>>>> +}
>>>> +
>>>>    static int
>>>> -eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
>>>> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
>>>>    {
>>>>        unsigned i, imax;
>>>>        unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
>>>> @@ -396,27 +408,29 @@ eth_stats_get(struct rte_eth_dev *dev, struct
>>>> rte_eth_stats *igb_stats)
>>>>        imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>>>                internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>>>        for (i = 0; i < imax; i++) {
>>>> -        igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
>>>> -        igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
>>>> -        rx_total += igb_stats->q_ipackets[i];
>>>> -        rx_bytes_total += igb_stats->q_ibytes[i];
>>>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>>>> +        stats->q_ipackets[i] = stats_get_diff(rxq->rx_pkts,
>>>> rxq->rx_pkts_offset);
>>>> +        stats->q_ibytes[i] = stats_get_diff(rxq->rx_bytes,
>>>> rxq->rx_bytes_offset);
>>>> +        rx_total += stats->q_ipackets[i];
>>>> +        rx_bytes_total += stats->q_ibytes[i];
>>>>        }
>>>>        imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
>>>>                internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
>>>>        for (i = 0; i < imax; i++) {
>>>> -        igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
>>>> -        igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
>>>> -        tx_total += igb_stats->q_opackets[i];
>>>> -        tx_err_total += internal->tx_queue[i].err_pkts;
>>>> -        tx_bytes_total += igb_stats->q_obytes[i];
>>>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>>>> +        stats->q_opackets[i] = stats_get_diff(txq->tx_pkts,
>>>> txq->tx_pkts_offset);
>>>> +        stats->q_obytes[i] = stats_get_diff(txq->tx_bytes,
>>>> txq->tx_bytes_offset);
>>>> +        tx_total += stats->q_opackets[i];
>>>> +        tx_err_total += stats_get_diff(txq->err_pkts,
>>>> txq->err_pkts_offset);
>>>> +        tx_bytes_total += stats->q_obytes[i];
>>>>        }
>>>> -    igb_stats->ipackets = rx_total;
>>>> -    igb_stats->ibytes = rx_bytes_total;
>>>> -    igb_stats->opackets = tx_total;
>>>> -    igb_stats->oerrors = tx_err_total;
>>>> -    igb_stats->obytes = tx_bytes_total;
>>>> +    stats->ipackets = rx_total;
>>>> +    stats->ibytes = rx_bytes_total;
>>>> +    stats->opackets = tx_total;
>>>> +    stats->oerrors = tx_err_total;
>>>> +    stats->obytes = tx_bytes_total;
>>>>        return 0;
>>>>    }
>>>> @@ -427,14 +441,16 @@ eth_stats_reset(struct rte_eth_dev *dev)
>>>>        struct pmd_internals *internal = dev->data->dev_private;
>>>>        for (i = 0; i < internal->nb_queues; i++) {
>>>> -        internal->rx_queue[i].rx_pkts = 0;
>>>> -        internal->rx_queue[i].rx_bytes = 0;
>>>> +        struct pkt_rx_queue *rxq = &internal->rx_queue[i];
>>>> +        rxq->rx_pkts_offset = rxq->rx_pkts;
>>>> +        rxq->rx_bytes_offset = rxq->rx_bytes;
>>>>        }
>>>>        for (i = 0; i < internal->nb_queues; i++) {
>>>> -        internal->tx_queue[i].tx_pkts = 0;
>>>> -        internal->tx_queue[i].err_pkts = 0;
>>>> -        internal->tx_queue[i].tx_bytes = 0;
>>>> +        struct pkt_tx_queue *txq = &internal->tx_queue[i];
>>>> +        txq->tx_pkts_offset = txq->tx_pkts;
>>>> +        txq->err_pkts_offset = txq->err_pkts;
>>>> +        txq->tx_bytes_offset = txq->tx_bytes;
>>>>        }
>>>>        return 0;
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07 14:51         ` Stephen Hemminger
  2024-05-07 16:00           ` Morten Brørup
@ 2024-05-08  6:28           ` Mattias Rönnblom
  1 sibling, 0 replies; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-08  6:28 UTC (permalink / raw)
  To: Stephen Hemminger, Ferruh Yigit
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom,
	Morten Brørup

On 2024-05-07 16:51, Stephen Hemminger wrote:
> On Tue, 7 May 2024 14:49:19 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>> On 5/7/2024 8:23 AM, Mattias Rönnblom wrote:
>>> On 2024-04-28 17:11, Mattias Rönnblom wrote:
>>>> On 2024-04-26 16:38, Ferruh Yigit wrote:
>>>>> For stats reset, use an offset instead of zeroing out actual stats
>>>>> values,
>>>>> get_stats() displays diff between stats and offset.
>>>>> This way stats only updated in datapath and offset only updated in stats
>>>>> reset function. This makes stats reset function more reliable.
>>>>>
>>>>> As stats only written by single thread, we can remove 'volatile'
>>>>> qualifier
>>>>> which should improve the performance in datapath.
>>>>>   
>>>>
>>>> volatile wouldn't help you if you had multiple writers, so that can't
>>>> be the reason for its removal. It would be more accurate to say it
>>>> should be replaced with atomic updates. If you don't use volatile and
>>>> don't use atomics, you have to consider if the compiler can reach the
>>>> conclusion that it does not need to store the counter value for future
>>>> use *for that thread*. Since otherwise, I don't think the store
>>>> actually needs to occur. Since DPDK statistics tend to work, it's
>>>> pretty obvious that current compilers tend not to reach this conclusion.
>>>>
>>>> If this should be done 100% properly, the update operation should be a
>>>> non-atomic load, non-atomic add, and an atomic store. Similarly, for
>>>> the reset, the offset store should be atomic.
>>>>
>>>> Considered the state of the rest of the DPDK code base, I think a
>>>> non-atomic, non-volatile solution is also fine.
>>>>
>>>> (That said, I think we're better off just deprecating stats reset
>>>> altogether, and returning -ENOTSUP here meanwhile.)
>>>>   
>>>>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>>>>> ---
>>>>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>>>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>>>>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>>>>
>>>>> This update triggered by mail list discussion [1].
>>>>>
>>>>> [1]
>>>>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
> 
> I would prefer that the SW statistics be handled generically by ethdev
> layers and used by all such drivers.
> 
> The most complete version of SW stats now is in the virtio driver.
> If reset needs to be reliable (debatable), then it needs to be done without
> atomics.

Why it needs to be done without atomics? Whatever that means.

In what sense should they be unreliable, needs to be documented.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07 19:19       ` Morten Brørup
@ 2024-05-08  6:34         ` Mattias Rönnblom
  2024-05-08  7:10           ` Morten Brørup
  0 siblings, 1 reply; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-08  6:34 UTC (permalink / raw)
  To: Morten Brørup, Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

On 2024-05-07 21:19, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Tuesday, 7 May 2024 09.24
>>
>> On 2024-04-28 17:11, Mattias Rönnblom wrote:
>>> On 2024-04-26 16:38, Ferruh Yigit wrote:
> 
> [...]
> 
>>> static uint64_t
>>> counter_value(struct counter *counter)
>>> {
>>>       uint64_t count = __atomic_load_n(&counter->count,
>> __ATOMIC_RELAXED);
>>>       uint64_t offset = __atomic_load_n(&counter->offset,
>> __ATOMIC_RELAXED);
>>>
>>
>> Since the count and the offset are written to independently, without any
>> ordering restrictions, an update and a reset in quick succession may
>> cause the offset store to be globally visible before the new count.
> 
> Good catch.
> This may happen when a thread calls stats_add() and then the same thread calls stats_reset().
> 
>> In such a scenario, a reader could see an offset > count.
>>
>> Thus, unless I'm missing something, one should add a
>>
>> if (unlikely(offset > count))
>> 	return 0;
>>
>> here. With the appropriate comment explaining why this might be.
>>
>> Another approach would be to think about what memory barriers may be
>> required to make sure one sees the count update before the offset
>> update, but, intuitively, that seems like both more complex and more
>> costly (performance-wise).
> 
> I think it can be done without affecting stats_add(), by using "offset" with Release-Consume ordering:
>   - stats_reset() must write "offset" with memory_order_release, so "counter" cannot be visible after it, and
>   - stats_get() must read "offset" with memory_order_consume, so no reads or writes in the current thread dependent on "offset" can be reordered before this load, and writes to "counter" (a data-dependent variable) in other threads that release "offset" are visible in the current thread.
> 

That was the kind of complexity I was thinking about. Those barriers 
come with a non-zero cost, both with different instructions being used 
and compiler optimizations being prevented.

>>
>>>       return count + offset;
>>> }
>>>
>>> static void
>>> counter_reset(struct counter *counter)
>>> {
>>>       uint64_t count = __atomic_load_n(&counter->count,
>> __ATOMIC_RELAXED);
>>>
>>>       __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
>>> }
>>>
>>> static void
>>> counter_add(struct counter *counter, uint64_t operand)
>>> {
>>>       __atomic_store_n(&counter->count, counter->count + operand,
>>> __ATOMIC_RELAXED);
>>> }
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-08  6:34         ` Mattias Rönnblom
@ 2024-05-08  7:10           ` Morten Brørup
  2024-05-08  7:23             ` Mattias Rönnblom
  0 siblings, 1 reply; 42+ messages in thread
From: Morten Brørup @ 2024-05-08  7:10 UTC (permalink / raw)
  To: Mattias Rönnblom, Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Wednesday, 8 May 2024 08.35
> 
> On 2024-05-07 21:19, Morten Brørup wrote:
> >> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >> Sent: Tuesday, 7 May 2024 09.24
> >>
> >> On 2024-04-28 17:11, Mattias Rönnblom wrote:
> >>> On 2024-04-26 16:38, Ferruh Yigit wrote:
> >
> > [...]
> >
> >>> static uint64_t
> >>> counter_value(struct counter *counter)
> >>> {
> >>>       uint64_t count = __atomic_load_n(&counter->count,
> >> __ATOMIC_RELAXED);
> >>>       uint64_t offset = __atomic_load_n(&counter->offset,
> >> __ATOMIC_RELAXED);
> >>>
> >>
> >> Since the count and the offset are written to independently, without any
> >> ordering restrictions, an update and a reset in quick succession may
> >> cause the offset store to be globally visible before the new count.
> >
> > Good catch.
> > This may happen when a thread calls stats_add() and then the same thread
> calls stats_reset().
> >
> >> In such a scenario, a reader could see an offset > count.
> >>
> >> Thus, unless I'm missing something, one should add a
> >>
> >> if (unlikely(offset > count))
> >> 	return 0;
> >>
> >> here. With the appropriate comment explaining why this might be.
> >>
> >> Another approach would be to think about what memory barriers may be
> >> required to make sure one sees the count update before the offset
> >> update, but, intuitively, that seems like both more complex and more
> >> costly (performance-wise).
> >
> > I think it can be done without affecting stats_add(), by using "offset" with
> Release-Consume ordering:
> >   - stats_reset() must write "offset" with memory_order_release, so
> "counter" cannot be visible after it, and
> >   - stats_get() must read "offset" with memory_order_consume, so no reads or
> writes in the current thread dependent on "offset" can be reordered before
> this load, and writes to "counter" (a data-dependent variable) in other
> threads that release "offset" are visible in the current thread.
> >
> 
> That was the kind of complexity I was thinking about. Those barriers
> come with a non-zero cost, both with different instructions being used
> and compiler optimizations being prevented.

Yep, you mentioned that there might be a more complex alternative, so I decided to explore it. :-)

This approach doesn't impose any new requirements on stats_add(), so the data plane performance is not affected.

For per-thread counters, stats_add() can store "counter" using memory_order_relaxed. Or, if the architecture prevents tearing of 64-bit variables, using volatile.

Counters shared by multiple threads must be atomically incremented using rte_atomic_fetch_add_explicit() with memory_order_relaxed.

> 
> >>
> >>>       return count + offset;
> >>> }
> >>>
> >>> static void
> >>> counter_reset(struct counter *counter)
> >>> {
> >>>       uint64_t count = __atomic_load_n(&counter->count,
> >> __ATOMIC_RELAXED);
> >>>
> >>>       __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
> >>> }
> >>>
> >>> static void
> >>> counter_add(struct counter *counter, uint64_t operand)
> >>> {
> >>>       __atomic_store_n(&counter->count, counter->count + operand,
> >>> __ATOMIC_RELAXED);
> >>> }
> >

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-03 22:00   ` Stephen Hemminger
  2024-05-07 13:48     ` Ferruh Yigit
@ 2024-05-08  7:19     ` Mattias Rönnblom
  2024-05-08 15:23       ` Stephen Hemminger
  1 sibling, 1 reply; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-08  7:19 UTC (permalink / raw)
  To: Stephen Hemminger, Ferruh Yigit
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom,
	Morten Brørup

On 2024-05-04 00:00, Stephen Hemminger wrote:
> On Fri, 3 May 2024 16:45:47 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>> For stats reset, use an offset instead of zeroing out actual stats values,
>> get_stats() displays diff between stats and offset.
>> This way stats only updated in datapath and offset only updated in stats
>> reset function. This makes stats reset function more reliable.
>>
>> As stats only written by single thread, we can remove 'volatile' qualifier
>> which should improve the performance in datapath.
>>
>> While updating around, 'igb_stats' parameter renamed as 'stats'.
>>
>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>> ---
>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>
>> This update triggered by mail list discussion [1].
>>
>> [1]
>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/
> 
> 
> NAK
> 
> I did not hear a good argument why atomic or volatile was necessary in the first place.
> Why?
> 

On the reader side, loads should be atomic.
On the writer side, stores should be atomic.

Updates (stores) should actually occur in a timely manner. The complete 
read-modify-write cycle need not be atomic, since we only have a single 
writer. All this for the per-lcore counter case.

If load or store tearing occurs, the counter values may occasionally 
take totally bogus values. I think that should be avoided. Especially 
since it will likely come at a very reasonable cost.

 From what it seems to me, load or store tearing may well occur. GCC may 
generate two 32-bit stores for a program-level 64-bit store on 32-bit 
x86. If you have constant and immediate-data store instructions, 
constant writes may also be end up teared. The kernel documentation has 
some example of this. Add LTO, it's not necessarily going to be all that 
clear what is storing-a-constant and what is not.

Maybe you care a little less if statistics are occasionally broken, or 
some transient, inconsistent state, but generally they should work, and 
they should never have some totally bogus values. So, statistics aren't 
snow flakes, mostly just business as usual.

We can't both have a culture that promotes C11-style parallel 
programming, or, at the extreme, push the C11 APIs as-is, and the say 
"and btw you don't have to care about the standard when it comes to 
statistics".

We could adopt the Linux kernel's rules, programming model, and APIs 
(ignoring legal issues). That would be very old school, maybe somewhat 
over-engineered for our purpose, include a fair amount of inline 
assembler, and also and may well depend on GCC or GCC-like compilers, 
just like what I believe the kernel does.

We could use something in-between, heavily inspired by C11 but still 
with an opportunity to work around compiler issues, library issues, and
extend the API for our use case.

I agree we shouldn't have to mark statistics _Atomic, or RTE_ATOMIC(), 
rte_atomic64_t, or rte_sometimes_atomic_and_sometimes_not64_t. Just 
keeping the usual C integer types seems like a better option to me.

> Why is this driver special (a snowflake) compared to all the other drivers doing software
> statistics (tap, virtio, xdp, ring, memif, netvsc, vmware)?

If a broken piece of code has been copied around, one place is going to 
be the first to be fixed.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-08  7:10           ` Morten Brørup
@ 2024-05-08  7:23             ` Mattias Rönnblom
  0 siblings, 0 replies; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-08  7:23 UTC (permalink / raw)
  To: Morten Brørup, Ferruh Yigit, John W. Linville
  Cc: Thomas Monjalon, dev, Mattias Rönnblom, Stephen Hemminger

On 2024-05-08 09:10, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Wednesday, 8 May 2024 08.35
>>
>> On 2024-05-07 21:19, Morten Brørup wrote:
>>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>>>> Sent: Tuesday, 7 May 2024 09.24
>>>>
>>>> On 2024-04-28 17:11, Mattias Rönnblom wrote:
>>>>> On 2024-04-26 16:38, Ferruh Yigit wrote:
>>>
>>> [...]
>>>
>>>>> static uint64_t
>>>>> counter_value(struct counter *counter)
>>>>> {
>>>>>        uint64_t count = __atomic_load_n(&counter->count,
>>>> __ATOMIC_RELAXED);
>>>>>        uint64_t offset = __atomic_load_n(&counter->offset,
>>>> __ATOMIC_RELAXED);
>>>>>
>>>>
>>>> Since the count and the offset are written to independently, without any
>>>> ordering restrictions, an update and a reset in quick succession may
>>>> cause the offset store to be globally visible before the new count.
>>>
>>> Good catch.
>>> This may happen when a thread calls stats_add() and then the same thread
>> calls stats_reset().
>>>
>>>> In such a scenario, a reader could see an offset > count.
>>>>
>>>> Thus, unless I'm missing something, one should add a
>>>>
>>>> if (unlikely(offset > count))
>>>> 	return 0;
>>>>
>>>> here. With the appropriate comment explaining why this might be.
>>>>
>>>> Another approach would be to think about what memory barriers may be
>>>> required to make sure one sees the count update before the offset
>>>> update, but, intuitively, that seems like both more complex and more
>>>> costly (performance-wise).
>>>
>>> I think it can be done without affecting stats_add(), by using "offset" with
>> Release-Consume ordering:
>>>    - stats_reset() must write "offset" with memory_order_release, so
>> "counter" cannot be visible after it, and
>>>    - stats_get() must read "offset" with memory_order_consume, so no reads or
>> writes in the current thread dependent on "offset" can be reordered before
>> this load, and writes to "counter" (a data-dependent variable) in other
>> threads that release "offset" are visible in the current thread.
>>>
>>
>> That was the kind of complexity I was thinking about. Those barriers
>> come with a non-zero cost, both with different instructions being used
>> and compiler optimizations being prevented.
> 
> Yep, you mentioned that there might be a more complex alternative, so I decided to explore it. :-)
> 
> This approach doesn't impose any new requirements on stats_add(), so the data plane performance is not affected.
> 

OK, I get it now. That's a good point. In my thought experiment, I had a 
thread both updating and resetting the counter, which should be allowed, 
but you could have barriers sit only in the reset routine.

> For per-thread counters, stats_add() can store "counter" using memory_order_relaxed. Or, if the architecture prevents tearing of 64-bit variables, using volatile.
> 
> Counters shared by multiple threads must be atomically incremented using rte_atomic_fetch_add_explicit() with memory_order_relaxed.
> 
>>
>>>>
>>>>>        return count + offset;
>>>>> }
>>>>>
>>>>> static void
>>>>> counter_reset(struct counter *counter)
>>>>> {
>>>>>        uint64_t count = __atomic_load_n(&counter->count,
>>>> __ATOMIC_RELAXED);
>>>>>
>>>>>        __atomic_store_n(&counter->offset, count, __ATOMIC_RELAXED);
>>>>> }
>>>>>
>>>>> static void
>>>>> counter_add(struct counter *counter, uint64_t operand)
>>>>> {
>>>>>        __atomic_store_n(&counter->count, counter->count + operand,
>>>>> __ATOMIC_RELAXED);
>>>>> }
>>>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v2] net/af_packet: make stats reset reliable
  2024-05-07 16:00           ` Morten Brørup
  2024-05-07 16:54             ` Ferruh Yigit
@ 2024-05-08  7:48             ` Mattias Rönnblom
  1 sibling, 0 replies; 42+ messages in thread
From: Mattias Rönnblom @ 2024-05-08  7:48 UTC (permalink / raw)
  To: Morten Brørup, Stephen Hemminger, Ferruh Yigit
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom

On 2024-05-07 18:00, Morten Brørup wrote:
>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
>> Sent: Tuesday, 7 May 2024 16.51
> 
>> I would prefer that the SW statistics be handled generically by ethdev
>> layers and used by all such drivers.
> 
> I agree.
> 
> Please note that maintaining counters in the ethdev layer might cause more cache misses than maintaining them in the hot parts of the individual drivers' data structures, so it's not all that simple. ;-)
> 
> Until then, let's find a short term solution, viable to implement across all software NIC drivers without API/ABI breakage.
> 
>>
>> The most complete version of SW stats now is in the virtio driver.
> 
> It looks like the virtio PMD maintains the counters; they are not retrieved from the host.
> 
> Considering a DPDK application running as a virtual machine (guest) on a host server...
> 
> If the host is unable to put a packet onto the guest's virtio RX queue - like when a HW NIC is out of RX descriptors - is it counted somewhere visible to the guest?
> 
> Similarly, if the guest is unable to put a packet onto its virtio TX queue, is it counted somewhere visible to the host?
> 
>> If reset needs to be reliable (debatable), then it needs to be done without
>> atomics.
> 
> Let's modify that slightly: Without performance degradation in the fast path.
> I'm not sure that all atomic operations are slow.

Relaxed atomic loads from and stores to naturally aligned addresses are 
for free on ARM and x86_64 up to at least 64 bits.

"For free" is not entirely true, since both C11 relaxed stores and 
stores through volatile may prevent vectorization in GCC. I don't see 
why, but in practice that seems to be the case. That is very much a 
corner case.

Also, as mentioned before, C11 atomic store effectively has volatile 
semantics, which in turn may prevent some compiler optimizations.

On 32-bit x86, 64-bit atomic stores use xmm registers, but those are 
going to be used anyway, since you'll have a 64-bit add.

> But you are right that it needs to be done without _Atomic counters; they seem to be slow.
> 

_Atomic is not slower than atomics without _Atomic, when you actually 
need atomic operations.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-08  7:19     ` Mattias Rönnblom
@ 2024-05-08 15:23       ` Stephen Hemminger
  2024-05-08 19:48         ` Ferruh Yigit
  0 siblings, 1 reply; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-08 15:23 UTC (permalink / raw)
  To: Mattias Rönnblom
  Cc: Ferruh Yigit, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom, Morten Brørup

On Wed, 8 May 2024 09:19:02 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> On 2024-05-04 00:00, Stephen Hemminger wrote:
> > On Fri, 3 May 2024 16:45:47 +0100
> > Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> >   
> >> For stats reset, use an offset instead of zeroing out actual stats values,
> >> get_stats() displays diff between stats and offset.
> >> This way stats only updated in datapath and offset only updated in stats
> >> reset function. This makes stats reset function more reliable.
> >>
> >> As stats only written by single thread, we can remove 'volatile' qualifier
> >> which should improve the performance in datapath.
> >>
> >> While updating around, 'igb_stats' parameter renamed as 'stats'.
> >>
> >> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
> >> ---
> >> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> >> Cc: Stephen Hemminger <stephen@networkplumber.org>
> >> Cc: Morten Brørup <mb@smartsharesystems.com>
> >>
> >> This update triggered by mail list discussion [1].
> >>
> >> [1]
> >> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/  
> > 
> > 
> > NAK
> > 
> > I did not hear a good argument why atomic or volatile was necessary in the first place.
> > Why?
> >   
> 
> On the reader side, loads should be atomic.
> On the writer side, stores should be atomic.
> 
> Updates (stores) should actually occur in a timely manner. The complete 
> read-modify-write cycle need not be atomic, since we only have a single 
> writer. All this for the per-lcore counter case.
> 
> If load or store tearing occurs, the counter values may occasionally 
> take totally bogus values. I think that should be avoided. Especially 
> since it will likely come at a very reasonable cost.
> 
>  From what it seems to me, load or store tearing may well occur. GCC may 
> generate two 32-bit stores for a program-level 64-bit store on 32-bit 
> x86. If you have constant and immediate-data store instructions, 
> constant writes may also be end up teared. The kernel documentation has 
> some example of this. Add LTO, it's not necessarily going to be all that 
> clear what is storing-a-constant and what is not.
> 
> Maybe you care a little less if statistics are occasionally broken, or 
> some transient, inconsistent state, but generally they should work, and 
> they should never have some totally bogus values. So, statistics aren't 
> snow flakes, mostly just business as usual.
> 
> We can't both have a culture that promotes C11-style parallel 
> programming, or, at the extreme, push the C11 APIs as-is, and the say 
> "and btw you don't have to care about the standard when it comes to 
> statistics".
> 
> We could adopt the Linux kernel's rules, programming model, and APIs 
> (ignoring legal issues). That would be very old school, maybe somewhat 
> over-engineered for our purpose, include a fair amount of inline 
> assembler, and also and may well depend on GCC or GCC-like compilers, 
> just like what I believe the kernel does.
> 
> We could use something in-between, heavily inspired by C11 but still 
> with an opportunity to work around compiler issues, library issues, and
> extend the API for our use case.
> 
> I agree we shouldn't have to mark statistics _Atomic, or RTE_ATOMIC(), 
> rte_atomic64_t, or rte_sometimes_atomic_and_sometimes_not64_t. Just 
> keeping the usual C integer types seems like a better option to me.
> 
> > Why is this driver special (a snowflake) compared to all the other drivers doing software
> > statistics (tap, virtio, xdp, ring, memif, netvsc, vmware)?  
> 
> If a broken piece of code has been copied around, one place is going to 
> be the first to be fixed.


I dislike when any driver does something completely different than valid precedent.
No other driver in DPDK, Vpp, FreeBSD, Linux (and probably Windows) uses atomic for
updating statistics. We even got performance benefit at MS from removing atomic
increment of staistics in internal layers.

The idea of load tearing is crazy talk of integral types. It would break so many things.
It is the kind of stupid compiler thing that would send Linus on a rant and get
the GCC compiler writers in trouble. 

The DPDK has always favored performance over strict safety guard rails everywhere.
Switching to making every statistic an atomic operation is not in the spirit of
what is required. There is no strict guarantee necessary here.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-08 15:23       ` Stephen Hemminger
@ 2024-05-08 19:48         ` Ferruh Yigit
  2024-05-08 20:54           ` Stephen Hemminger
  0 siblings, 1 reply; 42+ messages in thread
From: Ferruh Yigit @ 2024-05-08 19:48 UTC (permalink / raw)
  To: Stephen Hemminger, Mattias Rönnblom
  Cc: John W. Linville, Thomas Monjalon, dev, Mattias Rönnblom,
	Morten Brørup

On 5/8/2024 4:23 PM, Stephen Hemminger wrote:
> On Wed, 8 May 2024 09:19:02 +0200
> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> 
>> On 2024-05-04 00:00, Stephen Hemminger wrote:
>>> On Fri, 3 May 2024 16:45:47 +0100
>>> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
>>>   
>>>> For stats reset, use an offset instead of zeroing out actual stats values,
>>>> get_stats() displays diff between stats and offset.
>>>> This way stats only updated in datapath and offset only updated in stats
>>>> reset function. This makes stats reset function more reliable.
>>>>
>>>> As stats only written by single thread, we can remove 'volatile' qualifier
>>>> which should improve the performance in datapath.
>>>>
>>>> While updating around, 'igb_stats' parameter renamed as 'stats'.
>>>>
>>>> Signed-off-by: Ferruh Yigit <ferruh.yigit@amd.com>
>>>> ---
>>>> Cc: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>>>> Cc: Stephen Hemminger <stephen@networkplumber.org>
>>>> Cc: Morten Brørup <mb@smartsharesystems.com>
>>>>
>>>> This update triggered by mail list discussion [1].
>>>>
>>>> [1]
>>>> https://inbox.dpdk.org/dev/3b2cf48e-2293-4226-b6cd-5f4dd3969f99@lysator.liu.se/  
>>>
>>>
>>> NAK
>>>
>>> I did not hear a good argument why atomic or volatile was necessary in the first place.
>>> Why?
>>>   
>>
>> On the reader side, loads should be atomic.
>> On the writer side, stores should be atomic.
>>
>> Updates (stores) should actually occur in a timely manner. The complete 
>> read-modify-write cycle need not be atomic, since we only have a single 
>> writer. All this for the per-lcore counter case.
>>
>> If load or store tearing occurs, the counter values may occasionally 
>> take totally bogus values. I think that should be avoided. Especially 
>> since it will likely come at a very reasonable cost.
>>
>>  From what it seems to me, load or store tearing may well occur. GCC may 
>> generate two 32-bit stores for a program-level 64-bit store on 32-bit 
>> x86. If you have constant and immediate-data store instructions, 
>> constant writes may also be end up teared. The kernel documentation has 
>> some example of this. Add LTO, it's not necessarily going to be all that 
>> clear what is storing-a-constant and what is not.
>>
>> Maybe you care a little less if statistics are occasionally broken, or 
>> some transient, inconsistent state, but generally they should work, and 
>> they should never have some totally bogus values. So, statistics aren't 
>> snow flakes, mostly just business as usual.
>>
>> We can't both have a culture that promotes C11-style parallel 
>> programming, or, at the extreme, push the C11 APIs as-is, and the say 
>> "and btw you don't have to care about the standard when it comes to 
>> statistics".
>>
>> We could adopt the Linux kernel's rules, programming model, and APIs 
>> (ignoring legal issues). That would be very old school, maybe somewhat 
>> over-engineered for our purpose, include a fair amount of inline 
>> assembler, and also and may well depend on GCC or GCC-like compilers, 
>> just like what I believe the kernel does.
>>
>> We could use something in-between, heavily inspired by C11 but still 
>> with an opportunity to work around compiler issues, library issues, and
>> extend the API for our use case.
>>
>> I agree we shouldn't have to mark statistics _Atomic, or RTE_ATOMIC(), 
>> rte_atomic64_t, or rte_sometimes_atomic_and_sometimes_not64_t. Just 
>> keeping the usual C integer types seems like a better option to me.
>>
>>> Why is this driver special (a snowflake) compared to all the other drivers doing software
>>> statistics (tap, virtio, xdp, ring, memif, netvsc, vmware)?  
>>
>> If a broken piece of code has been copied around, one place is going to 
>> be the first to be fixed.
> 
> 
> I dislike when any driver does something completely different than valid precedent.
> No other driver in DPDK, Vpp, FreeBSD, Linux (and probably Windows) uses atomic for
> updating statistics. We even got performance benefit at MS from removing atomic
> increment of staistics in internal layers.
> 
> The idea of load tearing is crazy talk of integral types. It would break so many things.
> It is the kind of stupid compiler thing that would send Linus on a rant and get
> the GCC compiler writers in trouble. 
> 
> The DPDK has always favored performance over strict safety guard rails everywhere.
> Switching to making every statistic an atomic operation is not in the spirit of
> what is required. There is no strict guarantee necessary here.
> 

I kind of agree with Stephen.

Thanks Mattias, Morten & Stephen, it was informative discussion. But for
*SW drivers* stats update and reset is not core functionality and I
think we can be OK to get hit on corner cases, instead of
over-engineering or making code more complex.

I am for putting priority as following (from high to low):
- Datapath performance
- Stats get accuracy
- Stats reset accuracy

With the restriction that stat reset requires forwarding to stop, we can
even drop 'offset' logic.
And I am not sure if it is a real requirement that stats reset should be
supported during forwarding, although I can see it is convenient.
If we get this requirement in the future, we can focus on a solution.


As action,
I am planning to send a new version of this RFC that only removes the
'volatile' qualifier.
In next step we can remove atomic updates and volatile stat counters
from more SW drivers.

Thanks,
ferruh


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC v3] net/af_packet: make stats reset reliable
  2024-05-08 19:48         ` Ferruh Yigit
@ 2024-05-08 20:54           ` Stephen Hemminger
  0 siblings, 0 replies; 42+ messages in thread
From: Stephen Hemminger @ 2024-05-08 20:54 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Mattias Rönnblom, John W. Linville, Thomas Monjalon, dev,
	Mattias Rönnblom, Morten Brørup

On Wed, 8 May 2024 20:48:06 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> > 
> > The idea of load tearing is crazy talk of integral types. It would break so many things.
> > It is the kind of stupid compiler thing that would send Linus on a rant and get
> > the GCC compiler writers in trouble. 
> > 
> > The DPDK has always favored performance over strict safety guard rails everywhere.
> > Switching to making every statistic an atomic operation is not in the spirit of
> > what is required. There is no strict guarantee necessary here.
> >   
> 
> I kind of agree with Stephen.
> 
> Thanks Mattias, Morten & Stephen, it was informative discussion. But for
> *SW drivers* stats update and reset is not core functionality and I
> think we can be OK to get hit on corner cases, instead of
> over-engineering or making code more complex.


I forgot the case of 64 bit values on 32 bit platforms!
Mostly because haven't cared about 32 bit for years...

The Linux kernel uses some wrappers to handle this.
On 64 bit platforms they become noop.
On 32 bit platform, they are protected by a seqlock and updates are
wrapped by the sequence count.

If we go this way, then doing similar Noop on 64 bit and atomic or seqlock
on 32 bit should be done, but in common helper.

Looking inside FreeBSD, it looks like that has changed over the years as well.

	if_inc_counter
		counter_u64_add
			atomic_add_64
But the counters are always per-cpu in this case. So although it does use
locked operation, will always be uncontended.
				

PS: Does DPDK still actually support 32 bit on x86? Can it be dropped this cycle?

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2024-05-08 20:54 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-25 17:46 [RFC] net/af_packet: make stats reset reliable Ferruh Yigit
2024-04-26 11:33 ` Morten Brørup
2024-04-26 13:37   ` Ferruh Yigit
2024-04-26 14:56     ` Morten Brørup
2024-04-28 15:42   ` Mattias Rönnblom
2024-04-26 14:38 ` [RFC v2] " Ferruh Yigit
2024-04-26 14:47   ` Morten Brørup
2024-04-28 15:11   ` Mattias Rönnblom
2024-05-01 16:19     ` Ferruh Yigit
2024-05-02  5:51       ` Mattias Rönnblom
2024-05-02 14:22         ` Ferruh Yigit
2024-05-02 15:59           ` Stephen Hemminger
2024-05-02 18:20             ` Ferruh Yigit
2024-05-02 17:37           ` Mattias Rönnblom
2024-05-02 18:26             ` Stephen Hemminger
2024-05-02 21:26               ` Mattias Rönnblom
2024-05-02 21:46                 ` Stephen Hemminger
2024-05-07  7:23     ` Mattias Rönnblom
2024-05-07 13:49       ` Ferruh Yigit
2024-05-07 14:51         ` Stephen Hemminger
2024-05-07 16:00           ` Morten Brørup
2024-05-07 16:54             ` Ferruh Yigit
2024-05-07 18:47               ` Stephen Hemminger
2024-05-08  7:48             ` Mattias Rönnblom
2024-05-08  6:28           ` Mattias Rönnblom
2024-05-08  6:25         ` Mattias Rönnblom
2024-05-07 19:19       ` Morten Brørup
2024-05-08  6:34         ` Mattias Rönnblom
2024-05-08  7:10           ` Morten Brørup
2024-05-08  7:23             ` Mattias Rönnblom
2024-04-26 21:28 ` [RFC] " Patrick Robb
2024-05-03 15:45 ` [RFC v3] " Ferruh Yigit
2024-05-03 22:00   ` Stephen Hemminger
2024-05-07 13:48     ` Ferruh Yigit
2024-05-07 14:52       ` Stephen Hemminger
2024-05-07 17:27         ` Ferruh Yigit
2024-05-08  7:19     ` Mattias Rönnblom
2024-05-08 15:23       ` Stephen Hemminger
2024-05-08 19:48         ` Ferruh Yigit
2024-05-08 20:54           ` Stephen Hemminger
2024-05-07 15:27   ` Morten Brørup
2024-05-07 17:40     ` Ferruh Yigit

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).