DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
@ 2019-06-26 12:48 Viacheslav Ovsiienko
  2019-06-26 12:57 ` Bruce Richardson
  2020-03-19 13:50 ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size Viacheslav Ovsiienko
  0 siblings, 2 replies; 29+ messages in thread
From: Viacheslav Ovsiienko @ 2019-06-26 12:48 UTC (permalink / raw)
  To: dev; +Cc: bernard.iremonger, ferruh.yigit

There is the testpmd configuration option called
RTE_TEST_PMD_RECORD_CORE_CYCLES, if this one is turned on
the testpmd application measures the CPU clocks spent
within forwarding loop. This time is the sum of execution
times of rte_eth_rx_burst(), rte_eth_tx_burst(), rte_delay_us(),
 rte_pktmbuf_free() and so on, depending on fwd mode set.

While debugging and performance optimization of datapath
burst routines tt would be useful to see the pure execution
times of these ones. It is proposed to add separated profiling
options:

CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
    enables gathering profiling data for transmit datapath,
    ticks spent within rte_eth_tx_burst() routine

CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
    enables gathering profiling data for receive datapath,
    ticks spent within rte_eth_rx_burst() routine

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 RFC: http://patches.dpdk.org/patch/53704/
---
 app/test-pmd/csumonly.c                 | 25 ++++++++++-----------
 app/test-pmd/flowgen.c                  | 25 +++++++++++----------
 app/test-pmd/icmpecho.c                 | 26 ++++++++++-----------
 app/test-pmd/iofwd.c                    | 24 ++++++++++----------
 app/test-pmd/macfwd.c                   | 24 +++++++++++---------
 app/test-pmd/macswap.c                  | 26 +++++++++++----------
 app/test-pmd/rxonly.c                   | 17 +++++---------
 app/test-pmd/softnicfwd.c               | 24 ++++++++++----------
 app/test-pmd/testpmd.c                  | 32 ++++++++++++++++++++++++++
 app/test-pmd/testpmd.h                  | 40 +++++++++++++++++++++++++++++++++
 app/test-pmd/txonly.c                   | 23 +++++++++----------
 config/common_base                      |  2 ++
 doc/guides/testpmd_app_ug/build_app.rst | 17 ++++++++++++++
 13 files changed, 197 insertions(+), 108 deletions(-)

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index e1cb7fb..e449ae2 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -717,19 +717,19 @@ struct simple_gre_hdr {
 	uint16_t nb_segments = 0;
 	int ret;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
 #endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/* receive a burst of packet */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -989,8 +989,10 @@ struct simple_gre_hdr {
 		printf("Preparing packet burst to transmit failed: %s\n",
 				rte_strerror(rte_errno));
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
 			nb_prep);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
 	/*
 	 * Retry if necessary
@@ -999,8 +1001,10 @@ struct simple_gre_hdr {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -1017,12 +1021,7 @@ struct simple_gre_hdr {
 			rte_pktmbuf_free(tx_pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine csum_fwd_engine = {
diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index ade6fe5..2b98637 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -130,20 +130,21 @@
 	uint16_t i;
 	uint32_t retry;
 	uint64_t tx_offloads;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
 	static int next_flow = 0;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
+#endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/* Receive a burst of packets and discard them. */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	fs->rx_packets += nb_rx;
 
 	for (i = 0; i < nb_rx; i++)
@@ -212,7 +213,9 @@
 		next_flow = (next_flow + 1) % cfg_n_flows;
 	}
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -220,8 +223,10 @@
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -239,11 +244,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_pkt);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine flow_gen_engine = {
diff --git a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c
index 2d359c9..f2a6195 100644
--- a/app/test-pmd/icmpecho.c
+++ b/app/test-pmd/icmpecho.c
@@ -293,21 +293,22 @@
 	uint32_t cksum;
 	uint8_t  i;
 	int l2_len;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
+#endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*
 	 * First, receive a burst of packets.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -492,8 +493,10 @@
 
 	/* Send back ICMP echo replies, if any. */
 	if (nb_replies > 0) {
+		TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 		nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
 					 nb_replies);
+		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		/*
 		 * Retry if necessary
 		 */
@@ -502,10 +505,12 @@
 			while (nb_tx < nb_replies &&
 					retry++ < burst_tx_retry_num) {
 				rte_delay_us(burst_tx_delay_time);
+				TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 				nb_tx += rte_eth_tx_burst(fs->tx_port,
 						fs->tx_queue,
 						&pkts_burst[nb_tx],
 						nb_replies - nb_tx);
+				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 			}
 		}
 		fs->tx_packets += nb_tx;
@@ -519,12 +524,7 @@
 			} while (++nb_tx < nb_replies);
 		}
 	}
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine icmp_echo_engine = {
diff --git a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c
index 9dce76e..dc66a88 100644
--- a/app/test-pmd/iofwd.c
+++ b/app/test-pmd/iofwd.c
@@ -51,21 +51,21 @@
 	uint16_t nb_tx;
 	uint32_t retry;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
 #endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 	fs->rx_packets += nb_rx;
@@ -73,8 +73,10 @@
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -82,8 +84,10 @@
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -96,11 +100,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine io_fwd_engine = {
diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c
index d2ebb11..58168b6 100644
--- a/app/test-pmd/macfwd.c
+++ b/app/test-pmd/macfwd.c
@@ -56,21 +56,23 @@
 	uint16_t i;
 	uint64_t ol_flags = 0;
 	uint64_t tx_offloads;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
 #endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -103,7 +105,9 @@
 		mb->vlan_tci = txp->tx_vlan_id;
 		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
 	}
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -111,8 +115,10 @@
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 
@@ -126,11 +132,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_fwd_engine = {
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 71af916..b22acdb 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -86,21 +86,22 @@
 	uint16_t nb_rx;
 	uint16_t nb_tx;
 	uint32_t retry;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
+#endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -112,7 +113,10 @@
 
 	do_macswap(pkts_burst, nb_rx, txp);
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+
 	/*
 	 * Retry if necessary
 	 */
@@ -120,8 +124,10 @@
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -134,11 +140,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_swap_engine = {
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 5c65fc4..d1da357 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -50,19 +50,18 @@
 	uint16_t nb_rx;
 	uint16_t i;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*
 	 * Receive a burst of packets.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -73,11 +72,7 @@
 	for (i = 0; i < nb_rx; i++)
 		rte_pktmbuf_free(pkts_burst[i]);
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine rx_only_engine = {
diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c
index 94e6669..9b2b0e6 100644
--- a/app/test-pmd/softnicfwd.c
+++ b/app/test-pmd/softnicfwd.c
@@ -87,35 +87,39 @@ struct tm_hierarchy {
 	uint16_t nb_tx;
 	uint32_t retry;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
 #endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 	/*  Packets Receive */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	fs->rx_packets += nb_rx;
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
 	/* Retry if necessary */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -130,11 +134,7 @@ struct tm_hierarchy {
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 0dd47b3..b9cdc2e 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1506,6 +1506,12 @@ struct extmem_param {
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t fwd_cycles = 0;
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+	uint64_t rx_cycles = 0;
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+	uint64_t tx_cycles = 0;
+#endif
 	uint64_t total_recv = 0;
 	uint64_t total_xmit = 0;
 	struct rte_port *port;
@@ -1536,6 +1542,12 @@ struct extmem_param {
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 		fwd_cycles += fs->core_cycles;
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+		rx_cycles += fs->core_rx_cycles;
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+		tx_cycles += fs->core_tx_cycles;
+#endif
 	}
 	for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
 		uint8_t j;
@@ -1671,6 +1683,20 @@ struct extmem_param {
 		       (unsigned int)(fwd_cycles / total_recv),
 		       fwd_cycles, total_recv);
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+	if (total_recv > 0)
+		printf("\n  rx CPU cycles/packet=%u (total cycles="
+		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
+		       (unsigned int)(rx_cycles / total_recv),
+		       rx_cycles, total_recv);
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+	if (total_xmit > 0)
+		printf("\n  tx CPU cycles/packet=%u (total cycles="
+		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
+		       (unsigned int)(tx_cycles / total_xmit),
+		       tx_cycles, total_xmit);
+#endif
 }
 
 void
@@ -1701,6 +1727,12 @@ struct extmem_param {
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 		fs->core_cycles = 0;
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+		fs->core_rx_cycles = 0;
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+		fs->core_tx_cycles = 0;
+#endif
 	}
 }
 
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index e3a6f7c..d4890d4 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -130,12 +130,52 @@ struct fwd_stream {
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t     core_cycles; /**< used for RX and TX processing */
 #endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+	uint64_t     core_tx_cycles; /**< used for tx_burst processing */
+#endif
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+	uint64_t     core_rx_cycles; /**< used for rx_burst processing */
+#endif
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	struct pkt_burst_stats rx_burst_stats;
 	struct pkt_burst_stats tx_burst_stats;
 #endif
 };
 
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+#define TEST_PMD_CORE_CYC_TX_START(a) {a = rte_rdtsc(); }
+#else
+#define TEST_PMD_CORE_CYC_TX_START(a)
+#endif
+
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES) || \
+	defined(RTE_TEST_PMD_RECORD_CORE_RX_CYCLES)
+#define TEST_PMD_CORE_CYC_RX_START(a) {a = rte_rdtsc(); }
+#else
+#define TEST_PMD_CORE_CYC_RX_START(a)
+#endif
+
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s) \
+{uint64_t end_tsc = rte_rdtsc(); fs->core_cycles += end_tsc - (s); }
+#else
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s)
+#endif
+
+#ifdef RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
+{uint64_t end_tsc = rte_rdtsc(); fs->core_tx_cycles += end_tsc - (s); }
+#else
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s)
+#endif
+
+#ifdef RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
+{uint64_t end_tsc = rte_rdtsc(); fs->core_rx_cycles += end_tsc - (s); }
+#else
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s)
+#endif
+
 /** Descriptor for a single flow. */
 struct port_flow {
 	struct port_flow *next; /**< Next flow in list. */
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 10d641a..f08dffe 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -241,16 +241,16 @@
 	uint32_t retry;
 	uint64_t ol_flags = 0;
 	uint64_t tx_offloads;
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+#if defined(RTE_TEST_PMD_RECORD_CORE_TX_CYCLES)
+	uint64_t start_tx_tsc;
+#endif
+#if defined(RTE_TEST_PMD_RECORD_CORE_CYCLES)
+	uint64_t start_rx_tsc;
 #endif
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 #endif
-
 	mbp = current_fwd_lcore()->mbp;
 	txp = &ports[fs->tx_port];
 	tx_offloads = txp->dev_conf.txmode.offloads;
@@ -302,7 +302,9 @@
 	if (nb_pkt == 0)
 		return;
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -310,8 +312,10 @@
 		retry = 0;
 		while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_pkt - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -334,12 +338,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_pkt);
 	}
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
diff --git a/config/common_base b/config/common_base
index 5cb73a7..5ab189d 100644
--- a/config/common_base
+++ b/config/common_base
@@ -1003,6 +1003,8 @@ CONFIG_RTE_PROC_INFO=n
 #
 CONFIG_RTE_TEST_PMD=y
 CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=n
+CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES=n
+CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES=n
 CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=n
 
 #
diff --git a/doc/guides/testpmd_app_ug/build_app.rst b/doc/guides/testpmd_app_ug/build_app.rst
index d1ca9f3..661f0c6 100644
--- a/doc/guides/testpmd_app_ug/build_app.rst
+++ b/doc/guides/testpmd_app_ug/build_app.rst
@@ -21,6 +21,23 @@ The basic compilation steps are:
 
         export RTE_TARGET=x86_64-native-linux-gcc
 
+#.  Edit the desired conditional options in $RTE_SDK/config/common_base (optional):
+
+    *  ``CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES``
+
+       Enables gathering profiling data for transmit datapath,
+       counts the ticks spent within rte_eth_tx_burst() routine.
+
+    *  ``CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES``
+
+       Enables gathering profiling data for receive datapath,
+       counts ticks spent within rte_eth_rx_burst() routine.
+
+    *  ``CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES``
+
+       Enables gathering profiling data for forwarding routine
+       in general.
+
 #.  Build the application:
 
     .. code-block:: console
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-06-26 12:48 [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines Viacheslav Ovsiienko
@ 2019-06-26 12:57 ` Bruce Richardson
  2019-06-26 13:19   ` Slava Ovsiienko
  2020-03-19 13:50 ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size Viacheslav Ovsiienko
  1 sibling, 1 reply; 29+ messages in thread
From: Bruce Richardson @ 2019-06-26 12:57 UTC (permalink / raw)
  To: Viacheslav Ovsiienko; +Cc: dev, bernard.iremonger, ferruh.yigit

On Wed, Jun 26, 2019 at 12:48:37PM +0000, Viacheslav Ovsiienko wrote:
> There is the testpmd configuration option called
> RTE_TEST_PMD_RECORD_CORE_CYCLES, if this one is turned on
> the testpmd application measures the CPU clocks spent
> within forwarding loop. This time is the sum of execution
> times of rte_eth_rx_burst(), rte_eth_tx_burst(), rte_delay_us(),
>  rte_pktmbuf_free() and so on, depending on fwd mode set.
> 
> While debugging and performance optimization of datapath
> burst routines tt would be useful to see the pure execution
> times of these ones. It is proposed to add separated profiling
> options:
> 
> CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
>     enables gathering profiling data for transmit datapath,
>     ticks spent within rte_eth_tx_burst() routine
> 
> CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
>     enables gathering profiling data for receive datapath,
>     ticks spent within rte_eth_rx_burst() routine
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  RFC: http://patches.dpdk.org/patch/53704/
> ---

Out of interest, did you try making these runtime rather than build-time
options, and see if it makes any perf difference? Given the fact that we
would have just two predictable branches per burst of packets, I'd expect
the impact to me pretty minimal, if measurable at all.

/Bruce

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-06-26 12:57 ` Bruce Richardson
@ 2019-06-26 13:19   ` Slava Ovsiienko
  2019-06-26 13:21     ` Bruce Richardson
  0 siblings, 1 reply; 29+ messages in thread
From: Slava Ovsiienko @ 2019-06-26 13:19 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev, bernard.iremonger, ferruh.yigit

Hi, Bruce

Do you mean using "if (core_rx_cycle_enabled) {...}" instead of #ifdef ?

No, I did not try runtime control settings.
Instead I compared performance with all RECORD_CORE_XX_CYCLES options enabled/disabled builds
and have seen the ~1-2% performance difference on my setups (mainly fwd txonly with retry).
So, ticks measuring is not free.

With best regards,
Slava

> -----Original Message-----
> From: Bruce Richardson <bruce.richardson@intel.com>
> Sent: Wednesday, June 26, 2019 15:58
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: dev@dpdk.org; bernard.iremonger@intel.com; ferruh.yigit@intel.com
> Subject: Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst
> routines
> 
> On Wed, Jun 26, 2019 at 12:48:37PM +0000, Viacheslav Ovsiienko wrote:
> > There is the testpmd configuration option called
> > RTE_TEST_PMD_RECORD_CORE_CYCLES, if this one is turned on the
> testpmd
> > application measures the CPU clocks spent within forwarding loop. This
> > time is the sum of execution times of rte_eth_rx_burst(),
> > rte_eth_tx_burst(), rte_delay_us(),
> >  rte_pktmbuf_free() and so on, depending on fwd mode set.
> >
> > While debugging and performance optimization of datapath burst
> > routines tt would be useful to see the pure execution times of these
> > ones. It is proposed to add separated profiling
> > options:
> >
> > CONFIG_RTE_TEST_PMD_RECORD_CORE_TX_CYCLES
> >     enables gathering profiling data for transmit datapath,
> >     ticks spent within rte_eth_tx_burst() routine
> >
> > CONFIG_RTE_TEST_PMD_RECORD_CORE_RX_CYCLES
> >     enables gathering profiling data for receive datapath,
> >     ticks spent within rte_eth_rx_burst() routine
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> > ---
> >  RFC:
> >
> https://eur03.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatch
> >
> es.dpdk.org%2Fpatch%2F53704%2F&amp;data=02%7C01%7Cviacheslavo%4
> 0mellan
> >
> ox.com%7Cbe154ad6d7b3460006ed08d6fa35de9c%7Ca652971c7d2e4d9ba
> 6a4d14925
> >
> 6f461b%7C0%7C0%7C636971506606185633&amp;sdata=Y6UPuGwiYEeYrv3
> 9Sg3Wq9E2
> > GIBjyVa4mw31Et6FXKE%3D&amp;reserved=0
> > ---
> 
> Out of interest, did you try making these runtime rather than build-time
> options, and see if it makes any perf difference? Given the fact that we
> would have just two predictable branches per burst of packets, I'd expect the
> impact to me pretty minimal, if measurable at all.
> 
> /Bruce

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-06-26 13:19   ` Slava Ovsiienko
@ 2019-06-26 13:21     ` Bruce Richardson
  2019-06-27  4:48       ` Slava Ovsiienko
  0 siblings, 1 reply; 29+ messages in thread
From: Bruce Richardson @ 2019-06-26 13:21 UTC (permalink / raw)
  To: Slava Ovsiienko; +Cc: dev, bernard.iremonger, ferruh.yigit

On Wed, Jun 26, 2019 at 01:19:24PM +0000, Slava Ovsiienko wrote:
> Hi, Bruce
> 
> Do you mean using "if (core_rx_cycle_enabled) {...}" instead of #ifdef ?
> 
> No, I did not try runtime control settings.
> Instead I compared performance with all RECORD_CORE_XX_CYCLES options enabled/disabled builds
> and have seen the ~1-2% performance difference on my setups (mainly fwd txonly with retry).
> So, ticks measuring is not free.
> 
> With best regards,
> Slava
> 
Yes, I realise that measuring ticks is going to have a performance impact.
However, what I was referring to was exactly the former - using an "if"
rather than an "ifdef". I would hope with ticks disable using this option
shows no perf impact, and we can reduce the use of build-time configs.

/Bruce

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-06-26 13:21     ` Bruce Richardson
@ 2019-06-27  4:48       ` Slava Ovsiienko
  2019-06-28 13:45         ` Iremonger, Bernard
  0 siblings, 1 reply; 29+ messages in thread
From: Slava Ovsiienko @ 2019-06-27  4:48 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev, bernard.iremonger, ferruh.yigit

OK, what do you think about this:

#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 if (record_cycle & RECORD_TX_CORE_CYCLES) {
   .. do measurement stuff ..
 }
#endif

+ add some new command to config in runtime: "set record_cycle 3"

We keep existing RTE_TEST_PMD_RECORD_CORE_CYCLES, do not introduce
new build-time configs and get some new runtime configuring.

WBR,
Slava

> -----Original Message-----
> From: Bruce Richardson <bruce.richardson@intel.com>
> Sent: Wednesday, June 26, 2019 16:21
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: dev@dpdk.org; bernard.iremonger@intel.com; ferruh.yigit@intel.com
> Subject: Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst
> routines
> 
> On Wed, Jun 26, 2019 at 01:19:24PM +0000, Slava Ovsiienko wrote:
> > Hi, Bruce
> >
> > Do you mean using "if (core_rx_cycle_enabled) {...}" instead of #ifdef ?
> >
> > No, I did not try runtime control settings.
> > Instead I compared performance with all RECORD_CORE_XX_CYCLES
> options
> > enabled/disabled builds and have seen the ~1-2% performance difference
> on my setups (mainly fwd txonly with retry).
> > So, ticks measuring is not free.
> >
> > With best regards,
> > Slava
> >
> Yes, I realise that measuring ticks is going to have a performance impact.
> However, what I was referring to was exactly the former - using an "if"
> rather than an "ifdef". I would hope with ticks disable using this option shows
> no perf impact, and we can reduce the use of build-time configs.
> 
> /Bruce

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-06-27  4:48       ` Slava Ovsiienko
@ 2019-06-28 13:45         ` Iremonger, Bernard
  2019-06-28 14:20           ` Bruce Richardson
  0 siblings, 1 reply; 29+ messages in thread
From: Iremonger, Bernard @ 2019-06-28 13:45 UTC (permalink / raw)
  To: Slava Ovsiienko, Richardson, Bruce; +Cc: dev, Yigit, Ferruh

Hi Bruce, Slava,

> -----Original Message-----
> From: Slava Ovsiienko [mailto:viacheslavo@mellanox.com]
> Sent: Thursday, June 27, 2019 5:49 AM
> To: Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Iremonger, Bernard <bernard.iremonger@intel.com>;
> Yigit, Ferruh <ferruh.yigit@intel.com>
> Subject: RE: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst
> routines
> 
> OK, what do you think about this:
> 
> #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>  if (record_cycle & RECORD_TX_CORE_CYCLES) {
>    .. do measurement stuff ..
>  }
> #endif
> 
> + add some new command to config in runtime: "set record_cycle 3"
> 
> We keep existing RTE_TEST_PMD_RECORD_CORE_CYCLES, do not introduce
> new build-time configs and get some new runtime configuring.
> 
> WBR,
> Slava
> 
> > -----Original Message-----
> > From: Bruce Richardson <bruce.richardson@intel.com>
> > Sent: Wednesday, June 26, 2019 16:21
> > To: Slava Ovsiienko <viacheslavo@mellanox.com>
> > Cc: dev@dpdk.org; bernard.iremonger@intel.com; ferruh.yigit@intel.com
> > Subject: Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx
> > burst routines
> >
> > On Wed, Jun 26, 2019 at 01:19:24PM +0000, Slava Ovsiienko wrote:
> > > Hi, Bruce
> > >
> > > Do you mean using "if (core_rx_cycle_enabled) {...}" instead of #ifdef ?
> > >
> > > No, I did not try runtime control settings.
> > > Instead I compared performance with all RECORD_CORE_XX_CYCLES
> > options
> > > enabled/disabled builds and have seen the ~1-2% performance
> > > difference
> > on my setups (mainly fwd txonly with retry).
> > > So, ticks measuring is not free.
> > >
> > > With best regards,
> > > Slava
> > >
> > Yes, I realise that measuring ticks is going to have a performance impact.
> > However, what I was referring to was exactly the former - using an "if"
> > rather than an "ifdef". I would hope with ticks disable using this
> > option shows no perf impact, and we can reduce the use of build-time
> configs.
> >
> > /Bruce

Given that  RTE_TEST_PMD_RECORD_CORE_CYCLES is already in the config file.
I think it is better to be consistent and add the new RECORD macros there.

Would it be reasonable to have runtime settings available as well?

Regards,

Bernard



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-06-28 13:45         ` Iremonger, Bernard
@ 2019-06-28 14:20           ` Bruce Richardson
  2019-07-01  4:57             ` Slava Ovsiienko
  0 siblings, 1 reply; 29+ messages in thread
From: Bruce Richardson @ 2019-06-28 14:20 UTC (permalink / raw)
  To: Iremonger, Bernard; +Cc: Slava Ovsiienko, dev, Yigit, Ferruh

On Fri, Jun 28, 2019 at 02:45:13PM +0100, Iremonger, Bernard wrote:
> Hi Bruce, Slava,
> 
> > -----Original Message-----
> > From: Slava Ovsiienko [mailto:viacheslavo@mellanox.com]
> > Sent: Thursday, June 27, 2019 5:49 AM
> > To: Richardson, Bruce <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org; Iremonger, Bernard <bernard.iremonger@intel.com>;
> > Yigit, Ferruh <ferruh.yigit@intel.com>
> > Subject: RE: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst
> > routines
> > 
> > OK, what do you think about this:
> > 
> > #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> >  if (record_cycle & RECORD_TX_CORE_CYCLES) {
> >    .. do measurement stuff ..
> >  }
> > #endif
> > 
> > + add some new command to config in runtime: "set record_cycle 3"
> > 
> > We keep existing RTE_TEST_PMD_RECORD_CORE_CYCLES, do not introduce
> > new build-time configs and get some new runtime configuring.
> > 
> > WBR,
> > Slava
> > 
> > > -----Original Message-----
> > > From: Bruce Richardson <bruce.richardson@intel.com>
> > > Sent: Wednesday, June 26, 2019 16:21
> > > To: Slava Ovsiienko <viacheslavo@mellanox.com>
> > > Cc: dev@dpdk.org; bernard.iremonger@intel.com; ferruh.yigit@intel.com
> > > Subject: Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx
> > > burst routines
> > >
> > > On Wed, Jun 26, 2019 at 01:19:24PM +0000, Slava Ovsiienko wrote:
> > > > Hi, Bruce
> > > >
> > > > Do you mean using "if (core_rx_cycle_enabled) {...}" instead of #ifdef ?
> > > >
> > > > No, I did not try runtime control settings.
> > > > Instead I compared performance with all RECORD_CORE_XX_CYCLES
> > > options
> > > > enabled/disabled builds and have seen the ~1-2% performance
> > > > difference
> > > on my setups (mainly fwd txonly with retry).
> > > > So, ticks measuring is not free.
> > > >
> > > > With best regards,
> > > > Slava
> > > >
> > > Yes, I realise that measuring ticks is going to have a performance impact.
> > > However, what I was referring to was exactly the former - using an "if"
> > > rather than an "ifdef". I would hope with ticks disable using this
> > > option shows no perf impact, and we can reduce the use of build-time
> > configs.
> > >
> > > /Bruce
> 
> Given that  RTE_TEST_PMD_RECORD_CORE_CYCLES is already in the config file.
> I think it is better to be consistent and add the new RECORD macros there.
> 
> Would it be reasonable to have runtime settings available as well?
> 
That configuration option is only present right now for the make builds, so
I'd like to see it replaced with a runtime option rather than see about
adding more config options to the meson build. The first step should be to
avoid adding more config options and just look to use dynamic ones. Ideally
the existing build option should be replaced at the same time.

/Bruce

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-06-28 14:20           ` Bruce Richardson
@ 2019-07-01  4:57             ` Slava Ovsiienko
  2019-07-01  8:15               ` Bruce Richardson
  0 siblings, 1 reply; 29+ messages in thread
From: Slava Ovsiienko @ 2019-07-01  4:57 UTC (permalink / raw)
  To: Bruce Richardson, Iremonger, Bernard; +Cc: dev, Yigit, Ferruh

I think we should compromise: keep existing RTE_TEST_PMD_RECORD_CORE_CYCLES
and extend with runtime switch under this build-time option:

#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
if (record_tx)
  .. gather tx related stats...
if (record_rx)
  .. gather rx related stats...
#endif

This is very specific feature, it is needed while debugging and testing datapath
routines, and It seems this feature with appropriate overhead should not be always enabled.
existing build-time configuration options looks OK as for me. 

Bruce, if proposed runtime extension is acceptable - I will update the patch.

WBR,
Slava

> -----Original Message-----
> From: Bruce Richardson <bruce.richardson@intel.com>
> Sent: Friday, June 28, 2019 17:20
> To: Iremonger, Bernard <bernard.iremonger@intel.com>
> Cc: Slava Ovsiienko <viacheslavo@mellanox.com>; dev@dpdk.org; Yigit,
> Ferruh <ferruh.yigit@intel.com>
> Subject: Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst
> routines
> 
> On Fri, Jun 28, 2019 at 02:45:13PM +0100, Iremonger, Bernard wrote:
> > Hi Bruce, Slava,
> >
> > > -----Original Message-----
> > > From: Slava Ovsiienko [mailto:viacheslavo@mellanox.com]
> > > Sent: Thursday, June 27, 2019 5:49 AM
> > > To: Richardson, Bruce <bruce.richardson@intel.com>
> > > Cc: dev@dpdk.org; Iremonger, Bernard <bernard.iremonger@intel.com>;
> > > Yigit, Ferruh <ferruh.yigit@intel.com>
> > > Subject: RE: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx
> > > burst routines
> > >
> > > OK, what do you think about this:
> > >
> > > #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES  if (record_cycle &
> > > RECORD_TX_CORE_CYCLES) {
> > >    .. do measurement stuff ..
> > >  }
> > > #endif
> > >
> > > + add some new command to config in runtime: "set record_cycle 3"
> > >
> > > We keep existing RTE_TEST_PMD_RECORD_CORE_CYCLES, do not
> introduce
> > > new build-time configs and get some new runtime configuring.
> > >
> > > WBR,
> > > Slava
> > >
> > > > -----Original Message-----
> > > > From: Bruce Richardson <bruce.richardson@intel.com>
> > > > Sent: Wednesday, June 26, 2019 16:21
> > > > To: Slava Ovsiienko <viacheslavo@mellanox.com>
> > > > Cc: dev@dpdk.org; bernard.iremonger@intel.com;
> > > > ferruh.yigit@intel.com
> > > > Subject: Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for
> > > > Rx/Tx burst routines
> > > >
> > > > On Wed, Jun 26, 2019 at 01:19:24PM +0000, Slava Ovsiienko wrote:
> > > > > Hi, Bruce
> > > > >
> > > > > Do you mean using "if (core_rx_cycle_enabled) {...}" instead of #ifdef
> ?
> > > > >
> > > > > No, I did not try runtime control settings.
> > > > > Instead I compared performance with all RECORD_CORE_XX_CYCLES
> > > > options
> > > > > enabled/disabled builds and have seen the ~1-2% performance
> > > > > difference
> > > > on my setups (mainly fwd txonly with retry).
> > > > > So, ticks measuring is not free.
> > > > >
> > > > > With best regards,
> > > > > Slava
> > > > >
> > > > Yes, I realise that measuring ticks is going to have a performance
> impact.
> > > > However, what I was referring to was exactly the former - using an "if"
> > > > rather than an "ifdef". I would hope with ticks disable using this
> > > > option shows no perf impact, and we can reduce the use of
> > > > build-time
> > > configs.
> > > >
> > > > /Bruce
> >
> > Given that  RTE_TEST_PMD_RECORD_CORE_CYCLES is already in the
> config file.
> > I think it is better to be consistent and add the new RECORD macros there.
> >
> > Would it be reasonable to have runtime settings available as well?
> >
> That configuration option is only present right now for the make builds, so I'd
> like to see it replaced with a runtime option rather than see about adding
> more config options to the meson build. The first step should be to avoid
> adding more config options and just look to use dynamic ones. Ideally the
> existing build option should be replaced at the same time.
> 
> /Bruce

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-07-01  4:57             ` Slava Ovsiienko
@ 2019-07-01  8:15               ` Bruce Richardson
  2019-09-30 12:32                 ` Yigit, Ferruh
  0 siblings, 1 reply; 29+ messages in thread
From: Bruce Richardson @ 2019-07-01  8:15 UTC (permalink / raw)
  To: Slava Ovsiienko; +Cc: Iremonger, Bernard, dev, Yigit, Ferruh

On Mon, Jul 01, 2019 at 04:57:30AM +0000, Slava Ovsiienko wrote:
> I think we should compromise: keep existing RTE_TEST_PMD_RECORD_CORE_CYCLES
> and extend with runtime switch under this build-time option:
> 
> #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> if (record_tx)
>   .. gather tx related stats...
> if (record_rx)
>   .. gather rx related stats...
> #endif
> 
> This is very specific feature, it is needed while debugging and testing datapath
> routines, and It seems this feature with appropriate overhead should not be always enabled.
> existing build-time configuration options looks OK as for me. 
> 
> Bruce, if proposed runtime extension is acceptable - I will update the patch.
> 
Ok for me.

Thanks,
/Bruce

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines
  2019-07-01  8:15               ` Bruce Richardson
@ 2019-09-30 12:32                 ` Yigit, Ferruh
  0 siblings, 0 replies; 29+ messages in thread
From: Yigit, Ferruh @ 2019-09-30 12:32 UTC (permalink / raw)
  To: Bruce Richardson, Slava Ovsiienko; +Cc: Iremonger, Bernard, dev, Yigit, Ferruh

On 7/1/2019 9:15 AM, Bruce Richardson wrote:
> On Mon, Jul 01, 2019 at 04:57:30AM +0000, Slava Ovsiienko wrote:
>> I think we should compromise: keep existing RTE_TEST_PMD_RECORD_CORE_CYCLES
>> and extend with runtime switch under this build-time option:
>>
>> #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>> if (record_tx)
>>   .. gather tx related stats...
>> if (record_rx)
>>   .. gather rx related stats...
>> #endif
>>
>> This is very specific feature, it is needed while debugging and testing datapath
>> routines, and It seems this feature with appropriate overhead should not be always enabled.
>> existing build-time configuration options looks OK as for me. 

+1, if we will enable this I am for having compile time config options.

Only a concern about the implementation, 'RTE_TEST_PMD_RECORD_CORE_CYCLES' and
'RTE_TEST_PMD_RECORD_CORE_RX_CYCLES' are both using same variable, like
'start_rx_tsc', is there an assumption that both won't be enabled at same time?
I think better to able to enable CORE_CYCLES, RX_CYCLES and TX_CYCLES separately
/ independently.

Another think to consider, for long term - not for this patch, to move introduce
RX/TX ifdefs to ethdev Rx/Tx functions so that all applications can use them,
not just testpmd.

>>
>> Bruce, if proposed runtime extension is acceptable - I will update the patch.
>>
> Ok for me.>
> Thanks,
> /Bruce
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size
  2019-06-26 12:48 [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines Viacheslav Ovsiienko
  2019-06-26 12:57 ` Bruce Richardson
@ 2020-03-19 13:50 ` Viacheslav Ovsiienko
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command Viacheslav Ovsiienko
                     ` (3 more replies)
  1 sibling, 4 replies; 29+ messages in thread
From: Viacheslav Ovsiienko @ 2020-03-19 13:50 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, thomas, bernard.iremonger

There is the CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES configuration
parameter enabling the lightweight profiler for the forwarding
routines that provides the time spent in the routines and estimates
the CPU cycles required to process one packet.

It would be good to have separated data for the Rx and Tx directions.
Beside this, the performance depends on the actual burst size, the profiling
data per burst size are meaningful and would help detect the performance
anomalies.

To control this profiling statistics the new testpmd command is introduced:

  set fwdprof (flags)

This command controls which profiling statistics is being gathered
in runtime:

- bit 0 - enables profiling the entire forward routine, counts the ticks
          spent in the forwarding routine, is set by default. Provides
	  the same data as previous implementation.

- bit 1 - enables gathering the profiling data for the transmit datapath,
          counts the ticks spent within rte_eth_tx_burst() routine,
          is cleared by default, extends the existing statistics.

- bit 2 - enables gathering the profiling data for the receive datapath,
          counts the ticks spent within	rte_eth_rx_burst() routine,
	  is cleared by default, extends the existing statistics.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

---
v2: - run time flags instead of compile time configuration
    - detailed statistics per burst size

v1: http://patches.dpdk.org/patch/55407/

Viacheslav Ovsiienko (3):
  app/testpmd: add profiling flags set command
  app/testpmd: gather Rx and Tx routines profiling
  app/testpmd: qualify profiling statistics on burst size

 app/test-pmd/cmdline.c                      | 15 ++++++++
 app/test-pmd/config.c                       | 10 +++++
 app/test-pmd/csumonly.c                     | 26 ++++++-------
 app/test-pmd/flowgen.c                      | 26 ++++++-------
 app/test-pmd/icmpecho.c                     | 27 +++++++-------
 app/test-pmd/iofwd.c                        | 26 ++++++-------
 app/test-pmd/macfwd.c                       | 26 ++++++-------
 app/test-pmd/macswap.c                      | 26 ++++++-------
 app/test-pmd/rxonly.c                       | 14 ++-----
 app/test-pmd/softnicfwd.c                   | 26 ++++++-------
 app/test-pmd/testpmd.c                      | 52 ++++++++++++++++++++++----
 app/test-pmd/testpmd.h                      | 58 ++++++++++++++++++++++++++++-
 app/test-pmd/txonly.c                       | 25 ++++++-------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 22 +++++++++++
 14 files changed, 256 insertions(+), 123 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command
  2020-03-19 13:50 ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size Viacheslav Ovsiienko
@ 2020-03-19 13:50   ` Viacheslav Ovsiienko
  2020-04-02 11:15     ` Thomas Monjalon
  2020-04-09 11:56     ` Ferruh Yigit
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx routines profiling Viacheslav Ovsiienko
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 29+ messages in thread
From: Viacheslav Ovsiienko @ 2020-03-19 13:50 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, thomas, bernard.iremonger

This commit is preparation step before adding the separated profiling
of Rx and Tx burst routines. The new testpmd command is introduced:

  set fwdprof (flags)

This command controls which profiling statistics is being gathered
in runtime:

- bit 0 - enables profiling the entire forward routine, counts the ticks
          spent in the forwarding routine, is set by default. Provides the
	  same data as previous implementation.

- bit 1 - enables gathering the profiling data for the transmit datapath,
          counts the ticks spent within rte_eth_tx_burst() routine,
          is cleared by default, extends the existing statistics.

- bit 2 - enables gathering the profiling data for the receive datapath,
          counts the ticks spent within rte_eth_rx_burst() routine,
          is cleared by default, extends the existing statistics.

The new counters for the cycles spent into rx/tx burst
routines are introduced. The feature is engaged only if
CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES configured to 'Y'.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 app/test-pmd/cmdline.c                      | 15 +++++++++++++++
 app/test-pmd/config.c                       | 10 ++++++++++
 app/test-pmd/testpmd.c                      |  3 +++
 app/test-pmd/testpmd.h                      |  8 ++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 22 ++++++++++++++++++++++
 5 files changed, 58 insertions(+)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index a037a55..74dbb29 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -263,6 +263,9 @@ static void cmd_help_long_parsed(void *parsed_result,
 			"set verbose (level)\n"
 			"    Set the debug verbosity level X.\n\n"
 
+			"set fwdprof (flags)\n"
+			"    Set the flags to profile the forwarding.\n\n"
+
 			"set log global|(type) (level)\n"
 			"    Set the log level.\n\n"
 
@@ -3743,20 +3746,32 @@ static void cmd_set_parsed(void *parsed_result,
 		set_nb_pkt_per_burst(res->value);
 	else if (!strcmp(res->what, "verbose"))
 		set_verbose_level(res->value);
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+	else if (!strcmp(res->what, "fwdprof"))
+		set_fwdprof_flags(res->value);
+#endif
 }
 
 cmdline_parse_token_string_t cmd_set_set =
 	TOKEN_STRING_INITIALIZER(struct cmd_set_result, set, "set");
 cmdline_parse_token_string_t cmd_set_what =
 	TOKEN_STRING_INITIALIZER(struct cmd_set_result, what,
+#ifndef RTE_TEST_PMD_RECORD_CORE_CYCLES
 				 "nbport#nbcore#burst#verbose");
+#else
+				 "nbport#nbcore#burst#verbose#fwdprof");
+#endif
 cmdline_parse_token_num_t cmd_set_value =
 	TOKEN_NUM_INITIALIZER(struct cmd_set_result, value, UINT16);
 
 cmdline_parse_inst_t cmd_set_numbers = {
 	.f = cmd_set_parsed,
 	.data = NULL,
+#ifndef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	.help_str = "set nbport|nbcore|burst|verbose <value>",
+#else
+	.help_str = "set nbport|nbcore|burst|verbose|fwdprof <value>",
+#endif
 	.tokens = {
 		(void *)&cmd_set_set,
 		(void *)&cmd_set_what,
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 8cf84cc..1d86250 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -3151,6 +3151,16 @@ struct igb_ring_desc_16_bytes {
 	configure_rxtx_dump_callbacks(verbose_level);
 }
 
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+void
+set_fwdprof_flags(uint16_t pf_flags)
+{
+	printf("Change forward profiling flags from %u to %u\n",
+	       (unsigned int) fwdprof_flags, (unsigned int) pf_flags);
+	fwdprof_flags = pf_flags;
+}
+#endif
+
 void
 vlan_extend_set(portid_t port_id, int on)
 {
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 035836a..c93fa35 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -81,6 +81,9 @@
 #define EXTBUF_ZONE_SIZE RTE_PGSIZE_2M
 
 uint16_t verbose_level = 0; /**< Silent by default. */
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+uint16_t fwdprof_flags = RECORD_CORE_CYCLES_FWD; /**< Fwd only by default. */
+#endif
 int testpmd_logtype; /**< Log type for testpmd logs */
 
 /* use master core for command line ? */
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 7a7c73f..466e611 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -321,8 +321,15 @@ struct queue_stats_mappings {
 
 extern uint8_t xstats_hide_zero; /**< Hide zero values for xstats display */
 
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+#define RECORD_CORE_CYCLES_FWD (1<<0)
+#define RECORD_CORE_CYCLES_RX (1<<1)
+#define RECORD_CORE_CYCLES_TX (1<<2)
+#endif
+
 /* globals used for configuration */
 extern uint16_t verbose_level; /**< Drives messages being displayed, if any. */
+extern uint16_t fwdprof_flags; /**< Controls the profiling statistics. */
 extern int testpmd_logtype; /**< Log type for testpmd logs */
 extern uint8_t  interactive;
 extern uint8_t  auto_start;
@@ -787,6 +794,7 @@ void vlan_tpid_set(portid_t port_id, enum rte_vlan_type vlan_type,
 void set_xstats_hide_zero(uint8_t on_off);
 
 void set_verbose_level(uint16_t vb_level);
+void set_fwdprof_flags(uint16_t pf_flags);
 void set_tx_pkt_segments(unsigned *seg_lengths, unsigned nb_segs);
 void show_tx_pkt_segments(void);
 void set_tx_pkt_split(const char *name);
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 5bb12a5..b6ec783 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -647,6 +647,28 @@ Regexes can also be used for type. To change log level of user1, user2 and user3
 
 	testpmd> set log user[1-3] (level)
 
+set fwdprof
+~~~~~~~~~~~
+
+Set the flags controlling the datapath profiling statistics::
+
+   testpmd> set fwdprof (flags)
+
+This command is available only if ``CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES`` is
+configured to Y, enabling the profiling code generation.
+
+The bitmask flag controls the gathering profiling statistics for datapath:
+
+* ``bit 0`` enables gathering the profiling data for the entire
+  forwarding routine, counts the ticks spent in the forwarding routine,
+  is set by default.
+* ``bit 1`` enables gathering the profiling data for the transmit datapath,
+  counts the ticks spent within rte_eth_tx_burst() routine, is cleared by
+  default.
+* ``bit 2`` enables gathering the profiling data for the receive datapath,
+  counts the ticks spent within rte_eth_rx_burst() routine, is cleared by
+  default.
+
 set nbport
 ~~~~~~~~~~
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx routines profiling
  2020-03-19 13:50 ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size Viacheslav Ovsiienko
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command Viacheslav Ovsiienko
@ 2020-03-19 13:50   ` Viacheslav Ovsiienko
  2020-04-02 11:20     ` Thomas Monjalon
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size Viacheslav Ovsiienko
  2020-04-02 11:13   ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data " Thomas Monjalon
  3 siblings, 1 reply; 29+ messages in thread
From: Viacheslav Ovsiienko @ 2020-03-19 13:50 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, thomas, bernard.iremonger

This patch counts the tick spent in rx-tx_burst routines in
dedicated counters and displays the gathered profiling statistics.

The feature is engaged only if CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES
configured as 'Y'. The "set fwdprof (flags)" command can be used
to select what counters should be involved.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 app/test-pmd/csumonly.c   | 21 +++++++++------------
 app/test-pmd/flowgen.c    | 21 +++++++++------------
 app/test-pmd/icmpecho.c   | 21 +++++++++------------
 app/test-pmd/iofwd.c      | 21 +++++++++------------
 app/test-pmd/macfwd.c     | 21 +++++++++------------
 app/test-pmd/macswap.c    | 21 +++++++++------------
 app/test-pmd/rxonly.c     | 14 ++++----------
 app/test-pmd/softnicfwd.c | 21 +++++++++------------
 app/test-pmd/testpmd.c    | 18 +++++++++++++++++-
 app/test-pmd/testpmd.h    | 34 ++++++++++++++++++++++++++++++++--
 app/test-pmd/txonly.c     | 20 ++++++++------------
 11 files changed, 124 insertions(+), 109 deletions(-)

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 25091de..4104737 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -789,18 +789,15 @@ struct simple_gre_hdr {
 	int ret;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/* receive a burst of packet */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -1067,8 +1064,10 @@ struct simple_gre_hdr {
 		printf("Preparing packet burst to transmit failed: %s\n",
 				rte_strerror(rte_errno));
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
 			nb_prep);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
 	/*
 	 * Retry if necessary
@@ -1077,8 +1076,10 @@ struct simple_gre_hdr {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -1096,11 +1097,7 @@ struct simple_gre_hdr {
 		} while (++nb_tx < nb_rx);
 	}
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine csum_fwd_engine = {
diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 4bd351e..51e87b0 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -98,19 +98,16 @@
 	uint32_t retry;
 	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 	static int next_flow = 0;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
-#endif
-
 	/* Receive a burst of packets and discard them. */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	fs->rx_packets += nb_rx;
 
 	for (i = 0; i < nb_rx; i++)
@@ -180,7 +177,9 @@
 		next_flow = (next_flow + 1) % cfg_n_flows;
 	}
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -188,8 +187,10 @@
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -207,11 +208,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_pkt);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine flow_gen_engine = {
diff --git a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c
index 65aece16..8843183 100644
--- a/app/test-pmd/icmpecho.c
+++ b/app/test-pmd/icmpecho.c
@@ -294,20 +294,17 @@
 	uint8_t  i;
 	int l2_len;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*
 	 * First, receive a burst of packets.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -492,8 +489,10 @@
 
 	/* Send back ICMP echo replies, if any. */
 	if (nb_replies > 0) {
+		TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 		nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
 					 nb_replies);
+		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		/*
 		 * Retry if necessary
 		 */
@@ -502,10 +501,12 @@
 			while (nb_tx < nb_replies &&
 					retry++ < burst_tx_retry_num) {
 				rte_delay_us(burst_tx_delay_time);
+				TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 				nb_tx += rte_eth_tx_burst(fs->tx_port,
 						fs->tx_queue,
 						&pkts_burst[nb_tx],
 						nb_replies - nb_tx);
+				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 			}
 		}
 		fs->tx_packets += nb_tx;
@@ -520,11 +521,7 @@
 		}
 	}
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine icmp_echo_engine = {
diff --git a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c
index 9dce76e..9ff6531 100644
--- a/app/test-pmd/iofwd.c
+++ b/app/test-pmd/iofwd.c
@@ -52,20 +52,17 @@
 	uint32_t retry;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 	fs->rx_packets += nb_rx;
@@ -73,8 +70,10 @@
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -82,8 +81,10 @@
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -96,11 +97,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine io_fwd_engine = {
diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c
index d2ebb11..f4a213e 100644
--- a/app/test-pmd/macfwd.c
+++ b/app/test-pmd/macfwd.c
@@ -57,20 +57,17 @@
 	uint64_t ol_flags = 0;
 	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_tx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -103,7 +100,9 @@
 		mb->vlan_tci = txp->tx_vlan_id;
 		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
 	}
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -111,8 +110,10 @@
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 
@@ -126,11 +127,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_fwd_engine = {
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 8428c26..5cb3133 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -58,20 +58,17 @@
 	uint16_t nb_tx;
 	uint32_t retry;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -83,7 +80,9 @@
 
 	do_macswap(pkts_burst, nb_rx, txp);
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -91,8 +90,10 @@
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -105,11 +106,7 @@
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_swap_engine = {
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 5c65fc4..2820d7f 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -51,18 +51,16 @@
 	uint16_t i;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
 #endif
 
 	/*
 	 * Receive a burst of packets.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -73,11 +71,7 @@
 	for (i = 0; i < nb_rx; i++)
 		rte_pktmbuf_free(pkts_burst[i]);
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine rx_only_engine = {
diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c
index e9d4373..b78f2ce 100644
--- a/app/test-pmd/softnicfwd.c
+++ b/app/test-pmd/softnicfwd.c
@@ -88,34 +88,35 @@ struct tm_hierarchy {
 	uint32_t retry;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*  Packets Receive */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	fs->rx_packets += nb_rx;
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
 	/* Retry if necessary */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -130,11 +131,7 @@ struct tm_hierarchy {
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index c93fa35..b195880 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1625,6 +1625,8 @@ struct extmem_param {
 	struct rte_eth_stats stats;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t fwd_cycles = 0;
+	uint64_t rx_cycles = 0;
+	uint64_t tx_cycles = 0;
 #endif
 	uint64_t total_recv = 0;
 	uint64_t total_xmit = 0;
@@ -1655,6 +1657,8 @@ struct extmem_param {
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 		fwd_cycles += fs->core_cycles;
+		rx_cycles += fs->core_rx_cycles;
+		tx_cycles += fs->core_tx_cycles;
 #endif
 	}
 	for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
@@ -1785,11 +1789,21 @@ struct extmem_param {
 	       "%s\n",
 	       acc_stats_border, acc_stats_border);
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	if (total_recv > 0)
+	if (fwdprof_flags & RECORD_CORE_CYCLES_FWD && total_recv > 0)
 		printf("\n  CPU cycles/packet=%u (total cycles="
 		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
 		       (unsigned int)(fwd_cycles / total_recv),
 		       fwd_cycles, total_recv);
+	if (fwdprof_flags & RECORD_CORE_CYCLES_RX && total_recv > 0)
+		printf("\n  rx CPU cycles/packet=%u (total cycles="
+		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
+		       (unsigned int)(rx_cycles / total_recv),
+		       rx_cycles, total_recv);
+	if (fwdprof_flags & RECORD_CORE_CYCLES_TX && total_xmit > 0)
+		printf("\n  tx CPU cycles/packet=%u (total cycles="
+		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
+		       (unsigned int)(tx_cycles / total_xmit),
+		       tx_cycles, total_xmit);
 #endif
 }
 
@@ -1820,6 +1834,8 @@ struct extmem_param {
 #endif
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 		fs->core_cycles = 0;
+		fs->core_rx_cycles = 0;
+		fs->core_tx_cycles = 0;
 #endif
 	}
 }
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 466e611..6177a50 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -136,7 +136,9 @@ struct fwd_stream {
 	/**< received packets has bad outer l4 checksum */
 	unsigned int gro_times;	/**< GRO operation times */
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t     core_cycles; /**< used for RX and TX processing */
+	uint64_t core_cycles; /**< used for RX and TX processing */
+	uint64_t core_tx_cycles; /**< used for tx_burst processing */
+	uint64_t core_rx_cycles; /**< used for rx_burst processing */
 #endif
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	struct pkt_burst_stats rx_burst_stats;
@@ -325,7 +327,35 @@ struct queue_stats_mappings {
 #define RECORD_CORE_CYCLES_FWD (1<<0)
 #define RECORD_CORE_CYCLES_RX (1<<1)
 #define RECORD_CORE_CYCLES_TX (1<<2)
-#endif
+
+/* Macros to gather profiling statistics. */
+#define TEST_PMD_CORE_CYC_TX_START(a) \
+{if (fwdprof_flags & RECORD_CORE_CYCLES_TX) a = rte_rdtsc(); }
+
+#define TEST_PMD_CORE_CYC_RX_START(a) \
+{if (fwdprof_flags & (RECORD_CORE_CYCLES_FWD | \
+		       RECORD_CORE_CYCLES_RX)) a = rte_rdtsc(); }
+
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s) \
+{if (fwdprof_flags & RECORD_CORE_CYCLES_FWD) \
+{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_cycles += tsc; } }
+
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
+{if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
+{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; } }
+
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
+{if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
+{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; } }
+
+#else
+/* No profiling statistics is configured. */
+#define TEST_PMD_CORE_CYC_TX_START(a)
+#define TEST_PMD_CORE_CYC_RX_START(a)
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s)
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s)
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s)
+#endif /* RTE_TEST_PMD_RECORD_CORE_CYCLES */
 
 /* globals used for configuration */
 extern uint16_t verbose_level; /**< Drives messages being displayed, if any. */
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 8a1989f..8ff7410 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -241,15 +241,11 @@
 	uint64_t ol_flags = 0;
 	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	mbp = current_fwd_lcore()->mbp;
 	txp = &ports[fs->tx_port];
 	tx_offloads = txp->dev_conf.txmode.offloads;
@@ -301,7 +297,9 @@
 	if (nb_pkt == 0)
 		return;
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -309,8 +307,10 @@
 		retry = 0;
 		while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_pkt - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -334,11 +334,7 @@
 		} while (++nb_tx < nb_pkt);
 	}
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size
  2020-03-19 13:50 ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size Viacheslav Ovsiienko
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command Viacheslav Ovsiienko
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx routines profiling Viacheslav Ovsiienko
@ 2020-03-19 13:50   ` Viacheslav Ovsiienko
  2020-03-20  6:13     ` Jerin Jacob
                       ` (3 more replies)
  2020-04-02 11:13   ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data " Thomas Monjalon
  3 siblings, 4 replies; 29+ messages in thread
From: Viacheslav Ovsiienko @ 2020-03-19 13:50 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, thomas, bernard.iremonger

The execution time of rx/tx burst routine depends on the burst size.
It would be meaningful to research this dependency, the patch
provides an extra profiling data per rx/tx burst size.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 app/test-pmd/csumonly.c   | 11 +++++++----
 app/test-pmd/flowgen.c    | 11 +++++++----
 app/test-pmd/icmpecho.c   | 12 ++++++++----
 app/test-pmd/iofwd.c      | 11 +++++++----
 app/test-pmd/macfwd.c     | 11 +++++++----
 app/test-pmd/macswap.c    | 11 +++++++----
 app/test-pmd/rxonly.c     |  2 +-
 app/test-pmd/softnicfwd.c | 11 +++++++----
 app/test-pmd/testpmd.c    | 31 ++++++++++++++++++++++++-------
 app/test-pmd/testpmd.h    | 26 ++++++++++++++++++++++----
 app/test-pmd/txonly.c     |  9 ++++++---
 11 files changed, 103 insertions(+), 43 deletions(-)

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 4104737..c966892 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -797,7 +797,7 @@ struct simple_gre_hdr {
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -1067,7 +1067,7 @@ struct simple_gre_hdr {
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
 			nb_prep);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 
 	/*
 	 * Retry if necessary
@@ -1075,11 +1075,14 @@ struct simple_gre_hdr {
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 51e87b0..9189e7b 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -107,7 +107,7 @@
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	fs->rx_packets += nb_rx;
 
 	for (i = 0; i < nb_rx; i++)
@@ -179,18 +179,21 @@
 
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c
index 8843183..0c8a8af 100644
--- a/app/test-pmd/icmpecho.c
+++ b/app/test-pmd/icmpecho.c
@@ -304,7 +304,7 @@
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -492,7 +492,7 @@
 		TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 		nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
 					 nb_replies);
-		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 		/*
 		 * Retry if necessary
 		 */
@@ -500,13 +500,17 @@
 			retry = 0;
 			while (nb_tx < nb_replies &&
 					retry++ < burst_tx_retry_num) {
+				uint16_t nb_rt;
+
 				rte_delay_us(burst_tx_delay_time);
 				TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-				nb_tx += rte_eth_tx_burst(fs->tx_port,
+				nb_rt = rte_eth_tx_burst(fs->tx_port,
 						fs->tx_queue,
 						&pkts_burst[nb_tx],
 						nb_replies - nb_tx);
-				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+				nb_tx += nb_rt;
+				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc,
+							 nb_rt);
 			}
 		}
 		fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c
index 9ff6531..b05ed02 100644
--- a/app/test-pmd/iofwd.c
+++ b/app/test-pmd/iofwd.c
@@ -62,7 +62,7 @@
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 	fs->rx_packets += nb_rx;
@@ -73,18 +73,21 @@
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c
index f4a213e..a4aae0c 100644
--- a/app/test-pmd/macfwd.c
+++ b/app/test-pmd/macfwd.c
@@ -67,7 +67,7 @@
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_tx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -102,18 +102,21 @@
 	}
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 5cb3133..57628ba 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -68,7 +68,7 @@
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -82,18 +82,21 @@
 
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 2820d7f..ee79e7b 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -60,7 +60,7 @@
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	if (unlikely(nb_rx == 0))
 		return;
 
diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c
index b78f2ce..793677d 100644
--- a/app/test-pmd/softnicfwd.c
+++ b/app/test-pmd/softnicfwd.c
@@ -96,7 +96,7 @@ struct tm_hierarchy {
 	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
-	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
 	fs->rx_packets += nb_rx;
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -106,17 +106,20 @@ struct tm_hierarchy {
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 
 	/* Retry if necessary */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index b195880..1d4b55b 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1515,7 +1515,7 @@ struct extmem_param {
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 static void
-pkt_burst_stats_display(const char *rx_tx, struct pkt_burst_stats *pbs)
+pkt_burst_stats_display(int nrx_tx, struct pkt_burst_stats *pbs)
 {
 	unsigned int total_burst;
 	unsigned int nb_burst;
@@ -1549,8 +1549,8 @@ struct extmem_param {
 	if (total_burst == 0)
 		return;
 	burst_percent[0] = (burst_stats[0] * 100) / total_burst;
-	printf("  %s-bursts : %u [%d%% of %d pkts", rx_tx, total_burst,
-	       burst_percent[0], (int) pktnb_stats[0]);
+	printf("  %s-bursts : %u [%d%% of %d pkts", nrx_tx ? "TX" : "RX",
+	       total_burst, burst_percent[0], (int) pktnb_stats[0]);
 	if (burst_stats[0] == total_burst) {
 		printf("]\n");
 		return;
@@ -1568,6 +1568,23 @@ struct extmem_param {
 	}
 	printf(" + %d%% of %d pkts + %d%% of others]\n",
 	       burst_percent[1], (int) pktnb_stats[1], burst_percent[2]);
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+	if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
+				      : RECORD_CORE_CYCLES_RX)))
+		return;
+	for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
+		nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
+				  : pbs->pkt_burst_spread[nb_pkt];
+		if (nb_burst == 0)
+			continue;
+		printf("  CPU cycles/%u packet burst=%u (total cycles="
+		       "%"PRIu64" / total %s bursts=%u)\n",
+		       (unsigned int)nb_pkt,
+		       (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
+		       pbs->pkt_ticks_spread[nb_pkt],
+		       nrx_tx ? "TX" : "RX", nb_burst);
+	}
+#endif
 }
 #endif /* RTE_TEST_PMD_RECORD_BURST_STATS */
 
@@ -1601,8 +1618,8 @@ struct extmem_param {
 	}
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
-	pkt_burst_stats_display("RX", &fs->rx_burst_stats);
-	pkt_burst_stats_display("TX", &fs->tx_burst_stats);
+	pkt_burst_stats_display(false, &fs->rx_burst_stats);
+	pkt_burst_stats_display(true, &fs->tx_burst_stats);
 #endif
 }
 
@@ -1742,10 +1759,10 @@ struct extmem_param {
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 		if (ports_stats[pt_id].rx_stream)
-			pkt_burst_stats_display("RX",
+			pkt_burst_stats_display(false,
 				&ports_stats[pt_id].rx_stream->rx_burst_stats);
 		if (ports_stats[pt_id].tx_stream)
-			pkt_burst_stats_display("TX",
+			pkt_burst_stats_display(true,
 				&ports_stats[pt_id].tx_stream->tx_burst_stats);
 #endif
 
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 6177a50..90eb0ef 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -89,6 +89,10 @@ enum {
  */
 struct pkt_burst_stats {
 	unsigned int pkt_burst_spread[MAX_PKT_BURST];
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+	unsigned int pkt_retry_spread[MAX_PKT_BURST];
+	uint64_t pkt_ticks_spread[MAX_PKT_BURST];
+#endif
 };
 #endif
 
@@ -340,21 +344,35 @@ struct queue_stats_mappings {
 {if (fwdprof_flags & RECORD_CORE_CYCLES_FWD) \
 {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_cycles += tsc; } }
 
-#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
+#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
+{if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
+{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; \
+fs->tx_burst_stats.pkt_ticks_spread[np] += tsc; \
+fs->tx_burst_stats.pkt_retry_spread[np]++; } }
+
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
+{if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
+{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; \
+fs->rx_burst_stats.pkt_ticks_spread[np] += tsc; } }
+
+#else /* RTE_TEST_PMD_RECORD_BURST_STATS */
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
 {if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
 {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; } }
 
-#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
 {if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
 {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; } }
+#endif /* RTE_TEST_PMD_RECORD_BURST_STATS */
 
 #else
 /* No profiling statistics is configured. */
 #define TEST_PMD_CORE_CYC_TX_START(a)
 #define TEST_PMD_CORE_CYC_RX_START(a)
 #define TEST_PMD_CORE_CYC_FWD_ADD(fs, s)
-#define TEST_PMD_CORE_CYC_TX_ADD(fs, s)
-#define TEST_PMD_CORE_CYC_RX_ADD(fs, s)
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np)
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np)
 #endif /* RTE_TEST_PMD_RECORD_CORE_CYCLES */
 
 /* globals used for configuration */
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 8ff7410..593044e 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -299,18 +299,21 @@
 
 	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
-	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
 	/*
 	 * Retry if necessary
 	 */
 	if (unlikely(nb_tx < nb_pkt) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
+			uint16_t nb_rt;
+
 			rte_delay_us(burst_tx_delay_time);
 			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
-			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_pkt - nb_tx);
-			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
+			nb_tx += nb_rt;
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
 		}
 	}
 	fs->tx_packets += nb_tx;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size Viacheslav Ovsiienko
@ 2020-03-20  6:13     ` Jerin Jacob
  2020-04-09 11:46       ` Ferruh Yigit
  2020-03-20 16:03     ` Andrzej Ostruszka
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 29+ messages in thread
From: Jerin Jacob @ 2020-03-20  6:13 UTC (permalink / raw)
  To: Viacheslav Ovsiienko
  Cc: dpdk-dev, Ferruh Yigit, Thomas Monjalon, Bernard Iremonger

On Thu, Mar 19, 2020 at 7:21 PM Viacheslav Ovsiienko
<viacheslavo@mellanox.com> wrote:
>
> The execution time of rx/tx burst routine depends on the burst size.
> It would be meaningful to research this dependency, the patch
> provides an extra profiling data per rx/tx burst size.
>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +       if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
> +                                     : RECORD_CORE_CYCLES_RX)))
> +               return;
> +       for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
> +               nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
> +                                 : pbs->pkt_burst_spread[nb_pkt];
> +               if (nb_burst == 0)
> +                       continue;
> +               printf("  CPU cycles/%u packet burst=%u (total cycles="
> +                      "%"PRIu64" / total %s bursts=%u)\n",
> +                      (unsigned int)nb_pkt,
> +                      (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
> +                      pbs->pkt_ticks_spread[nb_pkt],
> +                      nrx_tx ? "TX" : "RX", nb_burst);
> +       }
> +#endif


# Thanks for this feature, IMO, It worth to mention in release notes.

# I see a lot of code get added under RTE_TEST_PMD_RECORD_CORE_CYCLES.
I agree that it should be under RTE_TEST_PMD_RECORD_CORE_CYCLES. Since
this flag is not
by default, there is a risk of compilation issue when this flag is get enabled.
IMO, it is better to move to _if (0)_ semantics to enable
- performance when compiler opt-out the code.
- It forces the compiler to check the compilation errors irrespective
of the RTE_TEST_PMD_RECORD_CORE_CYCLES state.

Something like below,

static __rte_always_inline int
testpmd_has_stats_feature(void)
{
#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
        return RTE_TEST_PMD_RECORD_CORE_CYCLES ;
#else
        return 0;
#endif
}


if (testpmd_has_stats_feature()) {

}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size Viacheslav Ovsiienko
  2020-03-20  6:13     ` Jerin Jacob
@ 2020-03-20 16:03     ` Andrzej Ostruszka
  2020-04-02 11:21     ` Thomas Monjalon
  2020-04-09 12:03     ` Ferruh Yigit
  3 siblings, 0 replies; 29+ messages in thread
From: Andrzej Ostruszka @ 2020-03-20 16:03 UTC (permalink / raw)
  To: Viacheslav Ovsiienko, dev; +Cc: ferruh.yigit, thomas, bernard.iremonger

Viacheslav, thanks for the patches.  I'll comment here on the whole
patch set.  This patch set has many similar changes so I'll focus on the
first (csumonly) and the comment will apply to all remaining - OK?

On 3/19/20 2:50 PM, Viacheslav Ovsiienko wrote:
> The execution time of rx/tx burst routine depends on the burst size.
> It would be meaningful to research this dependency, the patch
> provides an extra profiling data per rx/tx burst size.
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  app/test-pmd/csumonly.c   | 11 +++++++----
>  app/test-pmd/flowgen.c    | 11 +++++++----
>  app/test-pmd/icmpecho.c   | 12 ++++++++----
>  app/test-pmd/iofwd.c      | 11 +++++++----
>  app/test-pmd/macfwd.c     | 11 +++++++----
>  app/test-pmd/macswap.c    | 11 +++++++----
>  app/test-pmd/rxonly.c     |  2 +-
>  app/test-pmd/softnicfwd.c | 11 +++++++----
>  app/test-pmd/testpmd.c    | 31 ++++++++++++++++++++++++-------
>  app/test-pmd/testpmd.h    | 26 ++++++++++++++++++++++----
>  app/test-pmd/txonly.c     |  9 ++++++---
>  11 files changed, 103 insertions(+), 43 deletions(-)
> 
> diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
> index 4104737..c966892 100644
> --- a/app/test-pmd/csumonly.c
> +++ b/app/test-pmd/csumonly.c
> @@ -797,7 +797,7 @@ struct simple_gre_hdr {
>  	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
>  	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
>  				 nb_pkt_per_burst);
> -	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
> +	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc, nb_rx);
>  	if (unlikely(nb_rx == 0))
>  		return;
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> @@ -1067,7 +1067,7 @@ struct simple_gre_hdr {
>  	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
>  			nb_prep);
> -	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> +	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_tx);
>  
>  	/*
>  	 * Retry if necessary
> @@ -1075,11 +1075,14 @@ struct simple_gre_hdr {
>  	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
>  		retry = 0;
>  		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
> +			uint16_t nb_rt;
> +
>  			rte_delay_us(burst_tx_delay_time);
>  			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
> -			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
> +			nb_rt = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
>  					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
> -			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
> +			nb_tx += nb_rt;
> +			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc, nb_rt);
>  		}
>  	}
>  	fs->tx_packets += nb_tx;

Below this line there is (not visible in patch) code (I'll name it as
"total tx bump" in order to refer to it below):

#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
	fs->tx_burst_stats.pkt_burst_spread[nb_tx]++;
#endif

So when the BURST_STATS is enabled you will have:

struct pkt_burst_stats {
	unsigned int pkt_burst_spread[MAX_PKT_BURST];
	unsigned int pkt_retry_spread[MAX_PKT_BURST];
	uint64_t pkt_ticks_spread[MAX_PKT_BURST];
};

* pkt_ticks_spread will contain ticks per every tx_burst attempt
  (spreaded by the number of packets sent),
* pkt_retry_spread will contain number of tx_burst attempts (spreaded
  by the number of packets sent),
* pkt_burst_spread will have logged +1 for each each bucket that is a
  sum of all packets sent in given rx/tx loop iteration

What the last item actually gives the user that cannot be infered from
the middle one?  If I want to have total number of packets sent I can
sum: slot*spread[slot] over slots.  This should give the same result
when done over burst_spread and retry_spread.  I might be missing
something here so could you elaborate - to me that "total tx bump" looks
like an artificial stat?

If there is actually nothing substantial there, then I'd suggest to
remove pkt_retry_spread (that name actually is also wrong for the sunny
day scenario when there are no retransmissions) and use pkt_burst_spread
for that purpose.  AFAIU the 'retry' member is not used at all for the
RX path - correct?

So my suggestion would be to:
- keep only pkt_burst_spread and pkt_ticks_spread
- use them properly per every rx/tx_burst attempt
- skip the "total tx bump" code

That way RX/TX stats would be symmetrical and reflect the actual
rx/tx_burst attempts.

[...]
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index b195880..1d4b55b 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -1515,7 +1515,7 @@ struct extmem_param {
>  
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
>  static void
> -pkt_burst_stats_display(const char *rx_tx, struct pkt_burst_stats *pbs)
> +pkt_burst_stats_display(int nrx_tx, struct pkt_burst_stats *pbs)

Maybe another name for 'nrx_tx'?  Like 'is_tx' or 'tx_stats'?  And maybe
bool?

>  {
>  	unsigned int total_burst;
>  	unsigned int nb_burst;
> @@ -1549,8 +1549,8 @@ struct extmem_param {
>  	if (total_burst == 0)
>  		return;
>  	burst_percent[0] = (burst_stats[0] * 100) / total_burst;
> -	printf("  %s-bursts : %u [%d%% of %d pkts", rx_tx, total_burst,
> -	       burst_percent[0], (int) pktnb_stats[0]);
> +	printf("  %s-bursts : %u [%d%% of %d pkts", nrx_tx ? "TX" : "RX",
> +	       total_burst, burst_percent[0], (int) pktnb_stats[0]);
>  	if (burst_stats[0] == total_burst) {
>  		printf("]\n");
>  		return;
> @@ -1568,6 +1568,23 @@ struct extmem_param {
>  	}
>  	printf(" + %d%% of %d pkts + %d%% of others]\n",
>  	       burst_percent[1], (int) pktnb_stats[1], burst_percent[2]);
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +	if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
> +				      : RECORD_CORE_CYCLES_RX)))
> +		return;
> +	for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
> +		nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
> +				  : pbs->pkt_burst_spread[nb_pkt];

So you actually don't use pkt_burst_spread for TX.  Then just follow my
suggestion above and remove 'retry' and use pkt_burst_spread as you use
'retry' now.

> +		if (nb_burst == 0)
> +			continue;
> +		printf("  CPU cycles/%u packet burst=%u (total cycles="
> +		       "%"PRIu64" / total %s bursts=%u)\n",
> +		       (unsigned int)nb_pkt,
> +		       (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
> +		       pbs->pkt_ticks_spread[nb_pkt],
> +		       nrx_tx ? "TX" : "RX", nb_burst);
> +	}
> +#endif
>  }
>  #endif /* RTE_TEST_PMD_RECORD_BURST_STATS */
[...]
> --- a/app/test-pmd/testpmd.h
> +++ b/app/test-pmd/testpmd.h
> @@ -89,6 +89,10 @@ enum {
>   */
>  struct pkt_burst_stats {
>  	unsigned int pkt_burst_spread[MAX_PKT_BURST];
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +	unsigned int pkt_retry_spread[MAX_PKT_BURST];
> +	uint64_t pkt_ticks_spread[MAX_PKT_BURST];
> +#endif
>  };
>  #endif
>  
> @@ -340,21 +344,35 @@ struct queue_stats_mappings {
>  {if (fwdprof_flags & RECORD_CORE_CYCLES_FWD) \
>  {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_cycles += tsc; } }

General comment to the macros below - such macros are usually defined
with the "traditional":
	do { ... } while (0)
structure.  That way they behave better when used in conditional statements:
	if (cond)
		MACRO(args);
	else
		...
With the definition below the "else" would produce compiler error.

I know that they are not used that way, but I'd say it would be cleaner
to do that way anyway :).

> -#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
> +#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> +#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
> +{if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
> +{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; \
> +fs->tx_burst_stats.pkt_ticks_spread[np] += tsc; \
> +fs->tx_burst_stats.pkt_retry_spread[np]++; } }
> +
> +#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
> +{if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
> +{uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; \
> +fs->rx_burst_stats.pkt_ticks_spread[np] += tsc; } }
> +
> +#else /* RTE_TEST_PMD_RECORD_BURST_STATS */
> +#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
>  {if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
>  {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_tx_cycles += tsc; } }
>  
> -#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
> +#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
>  {if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
>  {uint64_t tsc = rte_rdtsc(); tsc -= (s); fs->core_rx_cycles += tsc; } }
> +#endif /* RTE_TEST_PMD_RECORD_BURST_STATS */

In addition to the above "do/while(0)" comment maybe you could merge
these macros above like (not tested):

#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
#define TEST_PMD_BURST_STATS_ADD(stats, np, tsc) \
	stats.pkt_ticks_spread[np] += tsc; \
	stats.pkt_burst_spread[np]++
#else
#define TEST_PMD_BURST_STATS_ADD(stats, np, tsc)
#endif

#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np) \
	do { if (fwdprof_flags & RECORD_CORE_CYCLES_TX) { \
		uint64_t tsc = rte_rdtsc(); tsc -= (s); \
		fs->core_tx_cycles += tsc; \
		TEST_PMD_BURST_STATS_ADD(fs->tx_burst_stats, np, tsc); \
	}} while(0)

#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np) \
	do { if (fwdprof_flags & RECORD_CORE_CYCLES_RX) { \
		uint64_t tsc = rte_rdtsc(); tsc -= (s); \
		fs->core_rx_cycles += tsc; \
		TEST_PMD_BURST_STATS_ADD(fs->rx_burst_stats, np, tsc); \
	}} while(0)

That way handling of RX and TX would be very symmetrical - you'd just
call TX_ADD/RX_ADD where needed and you'd remove both "total TX bump"
and "total RX bump" code.

Note however that accepting this suggestion and removal of "total RX
bump" would also be a behaviour change - since currently when no packet
is read nothing happens (apart from the tics updated) whereas with the
proposed changes (and enabled burst stats) you'd be bumping
rx_burst_stats.pkt_burst_spread[0].  I'd say that would be change for
good since this would allow to have a glimpse how many iterations are idle.

>  #else
>  /* No profiling statistics is configured. */
>  #define TEST_PMD_CORE_CYC_TX_START(a)
>  #define TEST_PMD_CORE_CYC_RX_START(a)
>  #define TEST_PMD_CORE_CYC_FWD_ADD(fs, s)
> -#define TEST_PMD_CORE_CYC_TX_ADD(fs, s)
> -#define TEST_PMD_CORE_CYC_RX_ADD(fs, s)
> +#define TEST_PMD_CORE_CYC_TX_ADD(fs, s, np)
> +#define TEST_PMD_CORE_CYC_RX_ADD(fs, s, np)
>  #endif /* RTE_TEST_PMD_RECORD_CORE_CYCLES */
[...]

With regards
Andrzej Ostruszka

Reviewed-by: Andrzej Ostruszka <aostruszka@marvell.com>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size
  2020-03-19 13:50 ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size Viacheslav Ovsiienko
                     ` (2 preceding siblings ...)
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size Viacheslav Ovsiienko
@ 2020-04-02 11:13   ` Thomas Monjalon
  3 siblings, 0 replies; 29+ messages in thread
From: Thomas Monjalon @ 2020-04-02 11:13 UTC (permalink / raw)
  To: Viacheslav Ovsiienko; +Cc: dev, ferruh.yigit, bernard.iremonger

19/03/2020 14:50, Viacheslav Ovsiienko:
> There is the CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES configuration
> parameter enabling the lightweight profiler for the forwarding
> routines that provides the time spent in the routines and estimates
> the CPU cycles required to process one packet.
> 
> It would be good to have separated data for the Rx and Tx directions.
> Beside this, the performance depends on the actual burst size, the profiling
> data per burst size are meaningful and would help detect the performance
> anomalies.
> 
> To control this profiling statistics the new testpmd command is introduced:
> 
>   set fwdprof (flags)
> 
> This command controls which profiling statistics is being gathered
> in runtime:
> 
> - bit 0 - enables profiling the entire forward routine, counts the ticks
>           spent in the forwarding routine, is set by default. Provides
> 	  the same data as previous implementation.
> 
> - bit 1 - enables gathering the profiling data for the transmit datapath,
>           counts the ticks spent within rte_eth_tx_burst() routine,
>           is cleared by default, extends the existing statistics.
> 
> - bit 2 - enables gathering the profiling data for the receive datapath,
>           counts the ticks spent within	rte_eth_rx_burst() routine,
> 	  is cleared by default, extends the existing statistics.

Please can you rename bit 1 to Rx, and bit 2 to Tx,
and implement configuration with text parsing instead of obscure bitflags?
It would be more user-friendly, thanks.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command Viacheslav Ovsiienko
@ 2020-04-02 11:15     ` Thomas Monjalon
  2020-04-09 11:56     ` Ferruh Yigit
  1 sibling, 0 replies; 29+ messages in thread
From: Thomas Monjalon @ 2020-04-02 11:15 UTC (permalink / raw)
  To: Viacheslav Ovsiienko; +Cc: dev, ferruh.yigit, bernard.iremonger

19/03/2020 14:50, Viacheslav Ovsiienko:
> +set fwdprof
> +~~~~~~~~~~~
> +
> +Set the flags controlling the datapath profiling statistics::
> +
> +   testpmd> set fwdprof (flags)
> +
> +This command is available only if ``CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES`` is
> +configured to Y, enabling the profiling code generation.
> +
> +The bitmask flag controls the gathering profiling statistics for datapath:
> +
> +* ``bit 0`` enables gathering the profiling data for the entire
> +  forwarding routine, counts the ticks spent in the forwarding routine,
> +  is set by default.
> +* ``bit 1`` enables gathering the profiling data for the transmit datapath,
> +  counts the ticks spent within rte_eth_tx_burst() routine, is cleared by
> +  default.
> +* ``bit 2`` enables gathering the profiling data for the receive datapath,
> +  counts the ticks spent within rte_eth_rx_burst() routine, is cleared by
> +  default.

As commented in the cover letter, please replace flags with Rx/Tx text tokens.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx routines profiling
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx routines profiling Viacheslav Ovsiienko
@ 2020-04-02 11:20     ` Thomas Monjalon
  2020-04-02 11:23       ` Slava Ovsiienko
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Monjalon @ 2020-04-02 11:20 UTC (permalink / raw)
  To: Viacheslav Ovsiienko; +Cc: dev, ferruh.yigit, bernard.iremonger

19/03/2020 14:50, Viacheslav Ovsiienko:
> +	if (fwdprof_flags & RECORD_CORE_CYCLES_RX && total_recv > 0)
> +		printf("\n  rx CPU cycles/packet=%u (total cycles="
> +		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
> +		       (unsigned int)(rx_cycles / total_recv),
> +		       rx_cycles, total_recv);
> +	if (fwdprof_flags & RECORD_CORE_CYCLES_TX && total_xmit > 0)
> +		printf("\n  tx CPU cycles/packet=%u (total cycles="
> +		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
> +		       (unsigned int)(tx_cycles / total_xmit),
> +		       tx_cycles, total_xmit);

This is the "UI", so I think it deserves a cautious review.

Instead of "=" without space, I think ": " is easier to read.

Please use "Rx" and "Tx" instead of lowercase ones.

"CPU cycles/packet" is hard to read. I would prefer either
	"cycles/packet" without CPU
or
	"cycles per packet"

I will continue some UI comments in next patch.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size Viacheslav Ovsiienko
  2020-03-20  6:13     ` Jerin Jacob
  2020-03-20 16:03     ` Andrzej Ostruszka
@ 2020-04-02 11:21     ` Thomas Monjalon
  2020-04-09 12:03     ` Ferruh Yigit
  3 siblings, 0 replies; 29+ messages in thread
From: Thomas Monjalon @ 2020-04-02 11:21 UTC (permalink / raw)
  To: Viacheslav Ovsiienko; +Cc: dev, ferruh.yigit, bernard.iremonger

19/03/2020 14:50, Viacheslav Ovsiienko:
> +		printf("  CPU cycles/%u packet burst=%u (total cycles="
> +		       "%"PRIu64" / total %s bursts=%u)\n",

You can make it simpler by removing some words:
	CPU cycles -> cycles
	packet burst -> burst



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx routines profiling
  2020-04-02 11:20     ` Thomas Monjalon
@ 2020-04-02 11:23       ` Slava Ovsiienko
  0 siblings, 0 replies; 29+ messages in thread
From: Slava Ovsiienko @ 2020-04-02 11:23 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, ferruh.yigit, bernard.iremonger

Hi,

Thanks to all for reviewing and comments, will address ones in v3.

With best regards,
Slava

> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Thursday, April 2, 2020 14:20
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: dev@dpdk.org; ferruh.yigit@intel.com; bernard.iremonger@intel.com
> Subject: Re: [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx
> routines profiling
> 
> 19/03/2020 14:50, Viacheslav Ovsiienko:
> > +	if (fwdprof_flags & RECORD_CORE_CYCLES_RX && total_recv > 0)
> > +		printf("\n  rx CPU cycles/packet=%u (total cycles="
> > +		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
> > +		       (unsigned int)(rx_cycles / total_recv),
> > +		       rx_cycles, total_recv);
> > +	if (fwdprof_flags & RECORD_CORE_CYCLES_TX && total_xmit > 0)
> > +		printf("\n  tx CPU cycles/packet=%u (total cycles="
> > +		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
> > +		       (unsigned int)(tx_cycles / total_xmit),
> > +		       tx_cycles, total_xmit);
> 
> This is the "UI", so I think it deserves a cautious review.
> 
> Instead of "=" without space, I think ": " is easier to read.
> 
> Please use "Rx" and "Tx" instead of lowercase ones.
> 
> "CPU cycles/packet" is hard to read. I would prefer either
> 	"cycles/packet" without CPU
> or
> 	"cycles per packet"
> 
> I will continue some UI comments in next patch.
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size
  2020-03-20  6:13     ` Jerin Jacob
@ 2020-04-09 11:46       ` Ferruh Yigit
  2020-04-09 12:49         ` Jerin Jacob
  0 siblings, 1 reply; 29+ messages in thread
From: Ferruh Yigit @ 2020-04-09 11:46 UTC (permalink / raw)
  To: Jerin Jacob, Viacheslav Ovsiienko
  Cc: dpdk-dev, Thomas Monjalon, Bernard Iremonger

On 3/20/2020 6:13 AM, Jerin Jacob wrote:
> On Thu, Mar 19, 2020 at 7:21 PM Viacheslav Ovsiienko
> <viacheslavo@mellanox.com> wrote:
>>
>> The execution time of rx/tx burst routine depends on the burst size.
>> It would be meaningful to research this dependency, the patch
>> provides an extra profiling data per rx/tx burst size.
>>
>> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> 
>> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>> +       if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
>> +                                     : RECORD_CORE_CYCLES_RX)))
>> +               return;
>> +       for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
>> +               nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
>> +                                 : pbs->pkt_burst_spread[nb_pkt];
>> +               if (nb_burst == 0)
>> +                       continue;
>> +               printf("  CPU cycles/%u packet burst=%u (total cycles="
>> +                      "%"PRIu64" / total %s bursts=%u)\n",
>> +                      (unsigned int)nb_pkt,
>> +                      (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
>> +                      pbs->pkt_ticks_spread[nb_pkt],
>> +                      nrx_tx ? "TX" : "RX", nb_burst);
>> +       }
>> +#endif
> 
> 
> # Thanks for this feature, IMO, It worth to mention in release notes.
> 
> # I see a lot of code get added under RTE_TEST_PMD_RECORD_CORE_CYCLES.
> I agree that it should be under RTE_TEST_PMD_RECORD_CORE_CYCLES. Since
> this flag is not
> by default, there is a risk of compilation issue when this flag is get enabled.
> IMO, it is better to move to _if (0)_ semantics to enable
> - performance when compiler opt-out the code.
> - It forces the compiler to check the compilation errors irrespective
> of the RTE_TEST_PMD_RECORD_CORE_CYCLES state.
> 
> Something like below,
> 
> static __rte_always_inline int
> testpmd_has_stats_feature(void)
> {
> #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>         return RTE_TEST_PMD_RECORD_CORE_CYCLES ;
> #else
>         return 0;
> #endif
> }
> 
> 
> if (testpmd_has_stats_feature()) {
> 
> }
> 

Hi Jerin,

In this usage, compiler will removed the "if (0) { }" block, right?
I think this is good idea, we can use it other places too, including this one.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command Viacheslav Ovsiienko
  2020-04-02 11:15     ` Thomas Monjalon
@ 2020-04-09 11:56     ` Ferruh Yigit
  2020-04-13  7:56       ` Slava Ovsiienko
  1 sibling, 1 reply; 29+ messages in thread
From: Ferruh Yigit @ 2020-04-09 11:56 UTC (permalink / raw)
  To: Viacheslav Ovsiienko, dev; +Cc: thomas, bernard.iremonger

On 3/19/2020 1:50 PM, Viacheslav Ovsiienko wrote:
> This commit is preparation step before adding the separated profiling
> of Rx and Tx burst routines. The new testpmd command is introduced:
> 
>   set fwdprof (flags)
> 
> This command controls which profiling statistics is being gathered
> in runtime:
> 
> - bit 0 - enables profiling the entire forward routine, counts the ticks
>           spent in the forwarding routine, is set by default. Provides the
> 	  same data as previous implementation.
> 
> - bit 1 - enables gathering the profiling data for the transmit datapath,
>           counts the ticks spent within rte_eth_tx_burst() routine,
>           is cleared by default, extends the existing statistics.
> 
> - bit 2 - enables gathering the profiling data for the receive datapath,
>           counts the ticks spent within rte_eth_rx_burst() routine,
>           is cleared by default, extends the existing statistics.
> 
> The new counters for the cycles spent into rx/tx burst
> routines are introduced. The feature is engaged only if
> CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES configured to 'Y'.

Hi Slava,

Thanks for improving the testpmd performance measuring, unfortunately these
features are not documented at all, unless I miss it, and now you are improving
it but still there is no documentation.

Would you mind adding a section for performance measures, document how to enable
and use it, and how to read the output?

> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

<...>

> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +void
> +set_fwdprof_flags(uint16_t pf_flags)
> +{
> +	printf("Change forward profiling flags from %u to %u\n",
> +	       (unsigned int) fwdprof_flags, (unsigned int) pf_flags);

To reduce the 'RTE_TEST_PMD_RECORD_CORE_CYCLES' define usage, what do you think
have it only in this function, if it is defined work as expected and if not
print an error and don't update anything?

<...>

> @@ -647,6 +647,28 @@ Regexes can also be used for type. To change log level of user1, user2 and user3
>  
>  	testpmd> set log user[1-3] (level)
>  
> +set fwdprof
> +~~~~~~~~~~~
> +
> +Set the flags controlling the datapath profiling statistics::
> +
> +   testpmd> set fwdprof (flags)

What do you think a little longer command 'fwdprofile", which is more clear I think.

And what about having another command to show existing value, right now it is
'set' only?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size
  2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size Viacheslav Ovsiienko
                       ` (2 preceding siblings ...)
  2020-04-02 11:21     ` Thomas Monjalon
@ 2020-04-09 12:03     ` Ferruh Yigit
  2020-04-09 12:09       ` Thomas Monjalon
  3 siblings, 1 reply; 29+ messages in thread
From: Ferruh Yigit @ 2020-04-09 12:03 UTC (permalink / raw)
  To: Viacheslav Ovsiienko, dev; +Cc: thomas, bernard.iremonger

On 3/19/2020 1:50 PM, Viacheslav Ovsiienko wrote:
> The execution time of rx/tx burst routine depends on the burst size.
> It would be meaningful to research this dependency, the patch
> provides an extra profiling data per rx/tx burst size.
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

<...>

> @@ -1568,6 +1568,23 @@ struct extmem_param {
>  	}
>  	printf(" + %d%% of %d pkts + %d%% of others]\n",
>  	       burst_percent[1], (int) pktnb_stats[1], burst_percent[2]);
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +	if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
> +				      : RECORD_CORE_CYCLES_RX)))
> +		return;
> +	for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
> +		nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
> +				  : pbs->pkt_burst_spread[nb_pkt];
> +		if (nb_burst == 0)
> +			continue;

Why "nb_burst == 0" excluded, wouldn't be interesting to see number of calls
with 0 Rx/Tx and cycles spent there?

> +		printf("  CPU cycles/%u packet burst=%u (total cycles="
> +		       "%"PRIu64" / total %s bursts=%u)\n",
> +		       (unsigned int)nb_pkt,
> +		       (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
> +		       pbs->pkt_ticks_spread[nb_pkt],
> +		       nrx_tx ? "TX" : "RX", nb_burst);
> +	}
> +#endif
>  }
>  #endif /* RTE_TEST_PMD_RECORD_BURST_STATS */
>  
> @@ -1601,8 +1618,8 @@ struct extmem_param {
>  	}
>  
>  #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
> -	pkt_burst_stats_display("RX", &fs->rx_burst_stats);
> -	pkt_burst_stats_display("TX", &fs->tx_burst_stats);
> +	pkt_burst_stats_display(false, &fs->rx_burst_stats);
> +	pkt_burst_stats_display(true, &fs->tx_burst_stats);

Instead of this true/false, I would be OK to some duplication and have explicit
Rx/Tx, I believe it would be more clear that way, but no strong opinion on it.

<...>

> +++ b/app/test-pmd/testpmd.h
> @@ -89,6 +89,10 @@ enum {
>   */
>  struct pkt_burst_stats {
>  	unsigned int pkt_burst_spread[MAX_PKT_BURST];
> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> +	unsigned int pkt_retry_spread[MAX_PKT_BURST];

Isn't this keep the value of all Tx burst count, why named as 'retry'?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size
  2020-04-09 12:03     ` Ferruh Yigit
@ 2020-04-09 12:09       ` Thomas Monjalon
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Monjalon @ 2020-04-09 12:09 UTC (permalink / raw)
  To: Viacheslav Ovsiienko, Ferruh Yigit; +Cc: dev, bernard.iremonger

09/04/2020 14:03, Ferruh Yigit:
> On 3/19/2020 1:50 PM, Viacheslav Ovsiienko wrote:
> > -	pkt_burst_stats_display("RX", &fs->rx_burst_stats);
> > -	pkt_burst_stats_display("TX", &fs->tx_burst_stats);
> > +	pkt_burst_stats_display(false, &fs->rx_burst_stats);
> > +	pkt_burst_stats_display(true, &fs->tx_burst_stats);
> 
> Instead of this true/false, I would be OK to some duplication and have explicit
> Rx/Tx, I believe it would be more clear that way, but no strong opinion on it.

I have a strong opinion here: Rx/Tx is not a boolean.
If not a string, it should be a constant (#define).




^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size
  2020-04-09 11:46       ` Ferruh Yigit
@ 2020-04-09 12:49         ` Jerin Jacob
  0 siblings, 0 replies; 29+ messages in thread
From: Jerin Jacob @ 2020-04-09 12:49 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Viacheslav Ovsiienko, dpdk-dev, Thomas Monjalon, Bernard Iremonger

On Thu, Apr 9, 2020 at 5:16 PM Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>
> On 3/20/2020 6:13 AM, Jerin Jacob wrote:
> > On Thu, Mar 19, 2020 at 7:21 PM Viacheslav Ovsiienko
> > <viacheslavo@mellanox.com> wrote:
> >>
> >> The execution time of rx/tx burst routine depends on the burst size.
> >> It would be meaningful to research this dependency, the patch
> >> provides an extra profiling data per rx/tx burst size.
> >>
> >> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> >
> >> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> >> +       if (!(fwdprof_flags & (nrx_tx ? RECORD_CORE_CYCLES_TX
> >> +                                     : RECORD_CORE_CYCLES_RX)))
> >> +               return;
> >> +       for (nb_pkt = 0; nb_pkt < MAX_PKT_BURST; nb_pkt++) {
> >> +               nb_burst = nrx_tx ? pbs->pkt_retry_spread[nb_pkt]
> >> +                                 : pbs->pkt_burst_spread[nb_pkt];
> >> +               if (nb_burst == 0)
> >> +                       continue;
> >> +               printf("  CPU cycles/%u packet burst=%u (total cycles="
> >> +                      "%"PRIu64" / total %s bursts=%u)\n",
> >> +                      (unsigned int)nb_pkt,
> >> +                      (unsigned int)(pbs->pkt_ticks_spread[nb_pkt] / nb_burst),
> >> +                      pbs->pkt_ticks_spread[nb_pkt],
> >> +                      nrx_tx ? "TX" : "RX", nb_burst);
> >> +       }
> >> +#endif
> >
> >
> > # Thanks for this feature, IMO, It worth to mention in release notes.
> >
> > # I see a lot of code get added under RTE_TEST_PMD_RECORD_CORE_CYCLES.
> > I agree that it should be under RTE_TEST_PMD_RECORD_CORE_CYCLES. Since
> > this flag is not
> > by default, there is a risk of compilation issue when this flag is get enabled.
> > IMO, it is better to move to _if (0)_ semantics to enable
> > - performance when compiler opt-out the code.
> > - It forces the compiler to check the compilation errors irrespective
> > of the RTE_TEST_PMD_RECORD_CORE_CYCLES state.
> >
> > Something like below,
> >
> > static __rte_always_inline int
> > testpmd_has_stats_feature(void)
> > {
> > #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> >         return RTE_TEST_PMD_RECORD_CORE_CYCLES ;
> > #else
> >         return 0;
> > #endif
> > }
> >
> >
> > if (testpmd_has_stats_feature()) {
> >
> > }
> >
>
> Hi Jerin,

Hi Ferruh,

> In this usage, compiler will removed the "if (0) { }" block, right?

Yes.

> I think this is good idea, we can use it other places too, including this one.

Yes.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command
  2020-04-09 11:56     ` Ferruh Yigit
@ 2020-04-13  7:56       ` Slava Ovsiienko
  2020-04-13 12:23         ` Thomas Monjalon
  2020-04-14  9:07         ` Ferruh Yigit
  0 siblings, 2 replies; 29+ messages in thread
From: Slava Ovsiienko @ 2020-04-13  7:56 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: Thomas Monjalon, bernard.iremonger

Hi, Ferruh

> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Thursday, April 9, 2020 14:56
> To: Slava Ovsiienko <viacheslavo@mellanox.com>; dev@dpdk.org
> Cc: Thomas Monjalon <thomas@monjalon.net>;
> bernard.iremonger@intel.com
> Subject: Re: [PATCH v2 1/3] app/testpmd: add profiling flags set command
> 
> On 3/19/2020 1:50 PM, Viacheslav Ovsiienko wrote:
> > This commit is preparation step before adding the separated profiling
> > of Rx and Tx burst routines. The new testpmd command is introduced:
> >
> >   set fwdprof (flags)
> >
> > This command controls which profiling statistics is being gathered in
> > runtime:
> >
> > - bit 0 - enables profiling the entire forward routine, counts the ticks
> >           spent in the forwarding routine, is set by default. Provides the
> > 	  same data as previous implementation.
> >
> > - bit 1 - enables gathering the profiling data for the transmit datapath,
> >           counts the ticks spent within rte_eth_tx_burst() routine,
> >           is cleared by default, extends the existing statistics.
> >
> > - bit 2 - enables gathering the profiling data for the receive datapath,
> >           counts the ticks spent within rte_eth_rx_burst() routine,
> >           is cleared by default, extends the existing statistics.
> >
> > The new counters for the cycles spent into rx/tx burst routines are
> > introduced. The feature is engaged only if
> > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES configured to 'Y'.
> 
> Hi Slava,
> 
> Thanks for improving the testpmd performance measuring, unfortunately
> these features are not documented at all, unless I miss it, and now you are
> improving it but still there is no documentation.
> 
> Would you mind adding a section for performance measures, document how
> to enable and use it, and how to read the output?

OK, I'll update the documentation either and explain the new extended stats.

> 
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> 
> <...>
> 
> > +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES void
> > +set_fwdprof_flags(uint16_t pf_flags) {
> > +	printf("Change forward profiling flags from %u to %u\n",
> > +	       (unsigned int) fwdprof_flags, (unsigned int) pf_flags);
> 
> To reduce the 'RTE_TEST_PMD_RECORD_CORE_CYCLES' define usage, what do
> you think have it only in this function, if it is defined work as expected and if
> not print an error and don't update anything?
Agree, it would simplify, going to fix.

> 
> <...>
> 
> > @@ -647,6 +647,28 @@ Regexes can also be used for type. To change log
> > level of user1, user2 and user3
> >
> >  	testpmd> set log user[1-3] (level)
> >
> > +set fwdprof
> > +~~~~~~~~~~~
> > +
> > +Set the flags controlling the datapath profiling statistics::
> > +
> > +   testpmd> set fwdprof (flags)
> 
> What do you think a little longer command 'fwdprofile", which is more clear I
> think.
Yes, it is clearer, but longer to type 😊
If you insist on - I will extend, please, let me know your final opinion.

> 
> And what about having another command to show existing value, right now it
> is 'set' only?
Do you mean it would be good to add "show fwdpro/fwdprofile" ?

With best regards, Slava


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command
  2020-04-13  7:56       ` Slava Ovsiienko
@ 2020-04-13 12:23         ` Thomas Monjalon
  2020-04-14  9:07         ` Ferruh Yigit
  1 sibling, 0 replies; 29+ messages in thread
From: Thomas Monjalon @ 2020-04-13 12:23 UTC (permalink / raw)
  To: Ferruh Yigit, Slava Ovsiienko; +Cc: dev, bernard.iremonger

13/04/2020 09:56, Slava Ovsiienko:
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> > > +set fwdprof
> > > +~~~~~~~~~~~
> > > +
> > > +Set the flags controlling the datapath profiling statistics::
> > > +
> > > +   testpmd> set fwdprof (flags)
> > 
> > What do you think a little longer command 'fwdprofile", which is more clear I
> > think.
> Yes, it is clearer, but longer to type 😊
> If you insist on - I will extend, please, let me know your final opinion.

I agree with Ferruh in general and especially about "fwdprof" not nice to read.
What about fwd-profiling?
I would be also OK with fwd-prof.

Is it common to use hyphen sign in testpmd commands?



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command
  2020-04-13  7:56       ` Slava Ovsiienko
  2020-04-13 12:23         ` Thomas Monjalon
@ 2020-04-14  9:07         ` Ferruh Yigit
  1 sibling, 0 replies; 29+ messages in thread
From: Ferruh Yigit @ 2020-04-14  9:07 UTC (permalink / raw)
  To: Slava Ovsiienko, dev; +Cc: Thomas Monjalon, bernard.iremonger

On 4/13/2020 8:56 AM, Slava Ovsiienko wrote:
> Hi, Ferruh
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@intel.com>
>> Sent: Thursday, April 9, 2020 14:56
>> To: Slava Ovsiienko <viacheslavo@mellanox.com>; dev@dpdk.org
>> Cc: Thomas Monjalon <thomas@monjalon.net>;
>> bernard.iremonger@intel.com
>> Subject: Re: [PATCH v2 1/3] app/testpmd: add profiling flags set command
>>
>> On 3/19/2020 1:50 PM, Viacheslav Ovsiienko wrote:
>>> This commit is preparation step before adding the separated profiling
>>> of Rx and Tx burst routines. The new testpmd command is introduced:
>>>
>>>   set fwdprof (flags)
>>>
>>> This command controls which profiling statistics is being gathered in
>>> runtime:
>>>
>>> - bit 0 - enables profiling the entire forward routine, counts the ticks
>>>           spent in the forwarding routine, is set by default. Provides the
>>> 	  same data as previous implementation.
>>>
>>> - bit 1 - enables gathering the profiling data for the transmit datapath,
>>>           counts the ticks spent within rte_eth_tx_burst() routine,
>>>           is cleared by default, extends the existing statistics.
>>>
>>> - bit 2 - enables gathering the profiling data for the receive datapath,
>>>           counts the ticks spent within rte_eth_rx_burst() routine,
>>>           is cleared by default, extends the existing statistics.
>>>
>>> The new counters for the cycles spent into rx/tx burst routines are
>>> introduced. The feature is engaged only if
>>> CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES configured to 'Y'.
>>
>> Hi Slava,
>>
>> Thanks for improving the testpmd performance measuring, unfortunately
>> these features are not documented at all, unless I miss it, and now you are
>> improving it but still there is no documentation.
>>
>> Would you mind adding a section for performance measures, document how
>> to enable and use it, and how to read the output?
> 
> OK, I'll update the documentation either and explain the new extended stats.
> 
>>
>>>
>>> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
>>
>> <...>
>>
>>> +#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES void
>>> +set_fwdprof_flags(uint16_t pf_flags) {
>>> +	printf("Change forward profiling flags from %u to %u\n",
>>> +	       (unsigned int) fwdprof_flags, (unsigned int) pf_flags);
>>
>> To reduce the 'RTE_TEST_PMD_RECORD_CORE_CYCLES' define usage, what do
>> you think have it only in this function, if it is defined work as expected and if
>> not print an error and don't update anything?
> Agree, it would simplify, going to fix.
> 
>>
>> <...>
>>
>>> @@ -647,6 +647,28 @@ Regexes can also be used for type. To change log
>>> level of user1, user2 and user3
>>>
>>>  	testpmd> set log user[1-3] (level)
>>>
>>> +set fwdprof
>>> +~~~~~~~~~~~
>>> +
>>> +Set the flags controlling the datapath profiling statistics::
>>> +
>>> +   testpmd> set fwdprof (flags)
>>
>> What do you think a little longer command 'fwdprofile", which is more clear I
>> think.
> Yes, it is clearer, but longer to type 😊
> If you insist on - I will extend, please, let me know your final opinion.

In the scope of this patch it may look OK, but if you look all testpmd commands
without knowing the code, 'fwdprof' may not be clear what it is. I am for making
it more clear.

> 
>>
>> And what about having another command to show existing value, right now it
>> is 'set' only?
> Do you mean it would be good to add "show fwdpro/fwdprofile" ?

Yes.


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2020-04-14  9:07 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-26 12:48 [dpdk-dev] [PATCH] app/testpmd: add profiling for Rx/Tx burst routines Viacheslav Ovsiienko
2019-06-26 12:57 ` Bruce Richardson
2019-06-26 13:19   ` Slava Ovsiienko
2019-06-26 13:21     ` Bruce Richardson
2019-06-27  4:48       ` Slava Ovsiienko
2019-06-28 13:45         ` Iremonger, Bernard
2019-06-28 14:20           ` Bruce Richardson
2019-07-01  4:57             ` Slava Ovsiienko
2019-07-01  8:15               ` Bruce Richardson
2019-09-30 12:32                 ` Yigit, Ferruh
2020-03-19 13:50 ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data on burst size Viacheslav Ovsiienko
2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: add profiling flags set command Viacheslav Ovsiienko
2020-04-02 11:15     ` Thomas Monjalon
2020-04-09 11:56     ` Ferruh Yigit
2020-04-13  7:56       ` Slava Ovsiienko
2020-04-13 12:23         ` Thomas Monjalon
2020-04-14  9:07         ` Ferruh Yigit
2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx routines profiling Viacheslav Ovsiienko
2020-04-02 11:20     ` Thomas Monjalon
2020-04-02 11:23       ` Slava Ovsiienko
2020-03-19 13:50   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: qualify profiling statistics on burst size Viacheslav Ovsiienko
2020-03-20  6:13     ` Jerin Jacob
2020-04-09 11:46       ` Ferruh Yigit
2020-04-09 12:49         ` Jerin Jacob
2020-03-20 16:03     ` Andrzej Ostruszka
2020-04-02 11:21     ` Thomas Monjalon
2020-04-09 12:03     ` Ferruh Yigit
2020-04-09 12:09       ` Thomas Monjalon
2020-04-02 11:13   ` [dpdk-dev] [PATCH v2 0/3] app/testpmd: qualify Rx/Tx profiling data " Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).