DPDK patches and discussions
* [PATCH 1/3] event/cnxk: align TX queue buffer adjustment
@ 2023-05-16 14:37 pbhagavatula
  2023-05-16 14:37 ` [PATCH 2/3] event/cnxk: use local labels in asm intrinsic pbhagavatula
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: pbhagavatula @ 2023-05-16 14:37 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Remove recalculating SQB thresholds in Tx queue buffer adjustment.
The adjustment is already done during Tx queue setup.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
Depends-on: series-27660
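
The arithmetic behind the reworked depth helpers may be worth spelling
out: fc_mem holds the count of SQBs in use, each SQB holds
2^sqes_per_sqb_log2 SQEs, and "(avail << sqes_per_sqb_log2) - avail"
evaluates to avail * (sqes_per_sqb - 1) packets, which suggests one SQE
per SQB is held back (e.g. for chaining to the next SQB). A minimal
standalone sketch of that computation, with illustrative names, not
part of the patch:

#include <stdint.h>

/* Sketch: usable Tx queue depth in packets. nb_sqb_bufs_adj is the
 * adjusted SQB budget from queue setup; fc_mem_val is the in-use SQB
 * count read from txq->fc_mem.
 */
static inline int32_t
sq_depth_pkts(int32_t nb_sqb_bufs_adj, int32_t fc_mem_val,
              uint8_t sqes_per_sqb_log2)
{
        int32_t avail = nb_sqb_bufs_adj - fc_mem_val;

        /* avail * sqes_per_sqb - avail == avail * (sqes_per_sqb - 1) */
        return (avail << sqes_per_sqb_log2) - avail;
}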

 drivers/event/cnxk/cn10k_eventdev.c  |  9 +--------
 drivers/event/cnxk/cn10k_tx_worker.h |  6 +++---
 drivers/event/cnxk/cn9k_eventdev.c   |  9 +--------
 drivers/event/cnxk/cn9k_worker.h     | 12 +++++++++---
 drivers/net/cnxk/cn10k_tx.h          | 12 ++++++------
 drivers/net/cnxk/cn9k_tx.h           |  5 +++--
 6 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 89f32c4d1e..f7c6a83ff0 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -840,16 +840,9 @@ cn10k_sso_txq_fc_update(const struct rte_eth_dev *eth_dev, int32_t tx_queue_id)
 		sq = &cnxk_eth_dev->sqs[tx_queue_id];
 		txq = eth_dev->data->tx_queues[tx_queue_id];
 		sqes_per_sqb = 1U << txq->sqes_per_sqb_log2;
-		sq->nb_sqb_bufs_adj =
-			sq->nb_sqb_bufs -
-			RTE_ALIGN_MUL_CEIL(sq->nb_sqb_bufs, sqes_per_sqb) /
-				sqes_per_sqb;
 		if (cnxk_eth_dev->tx_offloads & RTE_ETH_TX_OFFLOAD_SECURITY)
-			sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc /
-						(sqes_per_sqb - 1));
+			sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc / sqes_per_sqb);
 		txq->nb_sqb_bufs_adj = sq->nb_sqb_bufs_adj;
-		txq->nb_sqb_bufs_adj =
-			((100 - ROC_NIX_SQB_THRESH) * txq->nb_sqb_bufs_adj) / 100;
 	}
 }

diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
index c18786a14c..7b2798ad2e 100644
--- a/drivers/event/cnxk/cn10k_tx_worker.h
+++ b/drivers/event/cnxk/cn10k_tx_worker.h
@@ -32,9 +32,9 @@ cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
 static __rte_always_inline int32_t
 cn10k_sso_sq_depth(const struct cn10k_eth_txq *txq)
 {
-	return (txq->nb_sqb_bufs_adj -
-		__atomic_load_n((int16_t *)txq->fc_mem, __ATOMIC_RELAXED))
-	       << txq->sqes_per_sqb_log2;
+	int32_t avail = (int32_t)txq->nb_sqb_bufs_adj -
+			(int32_t)__atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
+	return (avail << txq->sqes_per_sqb_log2) - avail;
 }

 static __rte_always_inline uint16_t
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index df23219f14..a9d603c22f 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -893,16 +893,9 @@ cn9k_sso_txq_fc_update(const struct rte_eth_dev *eth_dev, int32_t tx_queue_id)
 		sq = &cnxk_eth_dev->sqs[tx_queue_id];
 		txq = eth_dev->data->tx_queues[tx_queue_id];
 		sqes_per_sqb = 1U << txq->sqes_per_sqb_log2;
-		sq->nb_sqb_bufs_adj =
-			sq->nb_sqb_bufs -
-			RTE_ALIGN_MUL_CEIL(sq->nb_sqb_bufs, sqes_per_sqb) /
-				sqes_per_sqb;
 		if (cnxk_eth_dev->tx_offloads & RTE_ETH_TX_OFFLOAD_SECURITY)
-			sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc /
-						(sqes_per_sqb - 1));
+			sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc / sqes_per_sqb);
 		txq->nb_sqb_bufs_adj = sq->nb_sqb_bufs_adj;
-		txq->nb_sqb_bufs_adj =
-			((100 - ROC_NIX_SQB_THRESH) * txq->nb_sqb_bufs_adj) / 100;
 	}
 }

diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 988cb3acb6..d15dd309fe 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -711,6 +711,14 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
 }
 #endif

+static __rte_always_inline int32_t
+cn9k_sso_sq_depth(const struct cn9k_eth_txq *txq)
+{
+	int32_t avail = (int32_t)txq->nb_sqb_bufs_adj -
+			(int32_t)__atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
+	return (avail << txq->sqes_per_sqb_log2) - avail;
+}
+
 static __rte_always_inline uint16_t
 cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 		      uint64_t *txq_data, const uint32_t flags)
@@ -734,9 +742,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
 		handle_tx_completion_pkts(txq, 1, 1);

-	if (((txq->nb_sqb_bufs_adj -
-	      __atomic_load_n((int16_t *)txq->fc_mem, __ATOMIC_RELAXED))
-	     << txq->sqes_per_sqb_log2) <= 0)
+	if (cn9k_sso_sq_depth(txq) <= 0)
 		return 0;
 	cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
 	cn9k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index c9ec01cd9d..bab08a2d3b 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -35,12 +35,13 @@

 #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
 	do {                                                                   \
+		int64_t avail;                                                 \
 		/* Cached value is low, Update the fc_cache_pkts */            \
 		if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
+			avail = txq->nb_sqb_bufs_adj - *txq->fc_mem;           \
 			/* Multiply with sqe_per_sqb to express in pkts */     \
 			(txq)->fc_cache_pkts =                                 \
-				((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
-				<< (txq)->sqes_per_sqb_log2;                   \
+				(avail << (txq)->sqes_per_sqb_log2) - avail;   \
 			/* Check it again for the room */                      \
 			if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
 				return 0;                                      \
@@ -113,10 +114,9 @@ cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
 	if (cached < 0) {
 		/* Check if we have space else retry. */
 		do {
-			refill =
-				(txq->nb_sqb_bufs_adj -
-				 __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
-				<< txq->sqes_per_sqb_log2;
+			refill = txq->nb_sqb_bufs_adj -
+				 __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
+			refill = (refill << txq->sqes_per_sqb_log2) - refill;
 		} while (refill <= 0);
 		__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
 					  0, __ATOMIC_RELEASE,
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index e956c1ad2a..8efb75b505 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -32,12 +32,13 @@

 #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
 	do {                                                                   \
+		int64_t avail;                                                 \
 		/* Cached value is low, Update the fc_cache_pkts */            \
 		if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
+			avail = txq->nb_sqb_bufs_adj - *txq->fc_mem;           \
 			/* Multiply with sqe_per_sqb to express in pkts */     \
 			(txq)->fc_cache_pkts =                                 \
-				((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
-				<< (txq)->sqes_per_sqb_log2;                   \
+				(avail << (txq)->sqes_per_sqb_log2) - avail;   \
 			/* Check it again for the room */                      \
 			if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
 				return 0;                                      \
--
2.39.1

* [PATCH 2/3] event/cnxk: use local labels in asm intrinsic
  2023-05-16 14:37 [PATCH 1/3] event/cnxk: align TX queue buffer adjustment pbhagavatula
@ 2023-05-16 14:37 ` pbhagavatula
  2023-05-16 14:37 ` [PATCH 3/3] event/cnxk: use WFE in Tx fc wait pbhagavatula
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: pbhagavatula @ 2023-05-16 14:37 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Using plain labels in asm emits them as regular function symbols,
which shadows the call stack in tools like gdb and perf.
Use local labels instead for better visibility.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
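As background for the renames below: GNU assemblers treat symbols that
begin with ".L" as assembler-local, so they are not emitted into the
object file's symbol table and cannot be mistaken for function entry
points when gdb or perf symbolizes addresses. A minimal standalone
illustration of the convention (hypothetical helper, aarch64 with
GCC/Clang assumed, not part of the patch):

/* With a plain "retry:" label the assembler emits a symbol that can
 * shadow the enclosing function in backtraces; the ".L" form stays
 * invisible to symbolizers. "%=" makes the label unique per asm
 * instance, so the function can be inlined more than once.
 */
static inline unsigned long
wait_nonzero(volatile unsigned long *addr)
{
        unsigned long val;

        asm volatile(".Lretry%=: ldr %[val], [%[addr]]        \n"
                     "           cbz %[val], .Lretry%=        \n"
                     : [val] "=&r"(val)
                     : [addr] "r"(addr)
                     : "memory");
        return val;
}
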
 drivers/event/cnxk/cn10k_worker.h    |  8 ++---
 drivers/event/cnxk/cn9k_worker.h     | 25 ++++++++--------
 drivers/event/cnxk/cnxk_tim_worker.h | 44 ++++++++++++++--------------
 drivers/event/cnxk/cnxk_worker.h     |  8 ++---
 4 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 27d3a9bca2..6ec81a5ce5 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -268,12 +268,12 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev,
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
-		     "		tbz %[tag], 63, done%=			\n"
+		     "		tbz %[tag], 63, .Ldone%=		\n"
 		     "		sevl					\n"
-		     "rty%=:	wfe					\n"
+		     ".Lrty%=:	wfe					\n"
 		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
-		     "		tbnz %[tag], 63, rty%=			\n"
-		     "done%=:	dmb ld					\n"
+		     "		tbnz %[tag], 63, .Lrty%=		\n"
+		     ".Ldone%=:	dmb ld					\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
 		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0)
 		     : "memory");
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index d15dd309fe..51b4db419e 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -232,18 +232,19 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 		rte_prefetch_non_temporal(dws->lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
-		     "rty%=:					\n"
+		     ".Lrty%=:					\n"
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbnz %[tag], 63, rty%=		\n"
-		     "done%=:	str %[gw], [%[pong]]		\n"
+		     "		tbnz %[tag], 63, .Lrty%=	\n"
+		     ".Ldone%=:	str %[gw], [%[pong]]		\n"
 		     "		dmb ld				\n"
 		     "		sub %[mbuf], %[wqp], #0x80	\n"
 		     "		prfm pldl1keep, [%[mbuf]]	\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
 		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(base + SSOW_LF_GWS_TAG),
-		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP), [gw] "r"(dws->gw_wdata),
+		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP),
+		       [gw] "r"(dws->gw_wdata),
 		       [pong] "r"(pair_base + SSOW_LF_GWS_OP_GET_WORK0));
 #else
 	gw.u64[0] = plt_read64(base + SSOW_LF_GWS_TAG);
@@ -282,13 +283,13 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev,
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbz %[tag], 63, done%=		\n"
+		     "		tbz %[tag], 63, .Ldone%=	\n"
 		     "		sevl				\n"
-		     "rty%=:	wfe				\n"
+		     ".Lrty%=:	wfe				\n"
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbnz %[tag], 63, rty%=		\n"
-		     "done%=:	dmb ld				\n"
+		     "		tbnz %[tag], 63, .Lrty%=	\n"
+		     ".Ldone%=:	dmb ld				\n"
 		     "		sub %[mbuf], %[wqp], #0x80	\n"
 		     "		prfm pldl1keep, [%[mbuf]]	\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
@@ -330,13 +331,13 @@ cn9k_sso_hws_get_work_empty(uint64_t base, struct rte_event *ev,
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbz %[tag], 63, done%=		\n"
+		     "		tbz %[tag], 63, .Ldone%=	\n"
 		     "		sevl				\n"
-		     "rty%=:	wfe				\n"
+		     ".Lrty%=:	wfe				\n"
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbnz %[tag], 63, rty%=		\n"
-		     "done%=:	dmb ld				\n"
+		     "		tbnz %[tag], 63, .Lrty%=	\n"
+		     ".Ldone%=:	dmb ld				\n"
 		     "		sub %[mbuf], %[wqp], #0x80	\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
 		       [mbuf] "=&r"(mbuf)
diff --git a/drivers/event/cnxk/cnxk_tim_worker.h b/drivers/event/cnxk/cnxk_tim_worker.h
index 8fafb8f09c..f0857f26ba 100644
--- a/drivers/event/cnxk/cnxk_tim_worker.h
+++ b/drivers/event/cnxk/cnxk_tim_worker.h
@@ -262,12 +262,12 @@ cnxk_tim_add_entry_sp(struct cnxk_tim_ring *const tim_ring,
 #ifdef RTE_ARCH_ARM64
 			asm volatile(PLT_CPU_FEATURE_PREAMBLE
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbz %[hbt], 33, dne%=	\n"
+				     "		tbz %[hbt], 33, .Ldne%=	\n"
 				     "		sevl			\n"
-				     "rty%=:	wfe			\n"
+				     ".Lrty%=:	wfe			\n"
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbnz %[hbt], 33, rty%=	\n"
-				     "dne%=:				\n"
+				     "		tbnz %[hbt], 33, .Lrty%=\n"
+				     ".Ldne%=:				\n"
 				     : [hbt] "=&r"(hbt_state)
 				     : [w1] "r"((&bkt->w1))
 				     : "memory");
@@ -345,12 +345,12 @@ cnxk_tim_add_entry_mp(struct cnxk_tim_ring *const tim_ring,
 #ifdef RTE_ARCH_ARM64
 			asm volatile(PLT_CPU_FEATURE_PREAMBLE
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbz %[hbt], 33, dne%=	\n"
+				     "		tbz %[hbt], 33, .Ldne%=	\n"
 				     "		sevl			\n"
-				     "rty%=:	wfe			\n"
+				     ".Lrty%=:	wfe			\n"
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbnz %[hbt], 33, rty%=	\n"
-				     "dne%=:				\n"
+				     "		tbnz %[hbt], 33, .Lrty%=\n"
+				     ".Ldne%=:				\n"
 				     : [hbt] "=&r"(hbt_state)
 				     : [w1] "r"((&bkt->w1))
 				     : "memory");
@@ -374,13 +374,13 @@ cnxk_tim_add_entry_mp(struct cnxk_tim_ring *const tim_ring,
 		cnxk_tim_bkt_dec_lock(bkt);
 #ifdef RTE_ARCH_ARM64
 		asm volatile(PLT_CPU_FEATURE_PREAMBLE
-			     "		ldxr %[rem], [%[crem]]	\n"
-			     "		tbz %[rem], 63, dne%=		\n"
+			     "		ldxr %[rem], [%[crem]]		\n"
+			     "		tbz %[rem], 63, .Ldne%=		\n"
 			     "		sevl				\n"
-			     "rty%=:	wfe				\n"
-			     "		ldxr %[rem], [%[crem]]	\n"
-			     "		tbnz %[rem], 63, rty%=		\n"
-			     "dne%=:					\n"
+			     ".Lrty%=:	wfe				\n"
+			     "		ldxr %[rem], [%[crem]]		\n"
+			     "		tbnz %[rem], 63, .Lrty%=	\n"
+			     ".Ldne%=:					\n"
 			     : [rem] "=&r"(rem)
 			     : [crem] "r"(&bkt->w1)
 			     : "memory");
@@ -478,12 +478,12 @@ cnxk_tim_add_entry_brst(struct cnxk_tim_ring *const tim_ring,
 #ifdef RTE_ARCH_ARM64
 			asm volatile(PLT_CPU_FEATURE_PREAMBLE
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbz %[hbt], 33, dne%=	\n"
+				     "		tbz %[hbt], 33, .Ldne%=	\n"
 				     "		sevl			\n"
-				     "rty%=:	wfe			\n"
+				     ".Lrty%=:	wfe			\n"
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbnz %[hbt], 33, rty%=	\n"
-				     "dne%=:				\n"
+				     "		tbnz %[hbt], 33, .Lrty%=\n"
+				     ".Ldne%=:				\n"
 				     : [hbt] "=&r"(hbt_state)
 				     : [w1] "r"((&bkt->w1))
 				     : "memory");
@@ -510,13 +510,13 @@ cnxk_tim_add_entry_brst(struct cnxk_tim_ring *const tim_ring,
 		asm volatile(PLT_CPU_FEATURE_PREAMBLE
 			     "		ldxrb %w[lock_cnt], [%[lock]]	\n"
 			     "		tst %w[lock_cnt], 255		\n"
-			     "		beq dne%=			\n"
+			     "		beq .Ldne%=			\n"
 			     "		sevl				\n"
-			     "rty%=:	wfe				\n"
+			     ".Lrty%=:	wfe				\n"
 			     "		ldxrb %w[lock_cnt], [%[lock]]	\n"
 			     "		tst %w[lock_cnt], 255		\n"
-			     "		bne rty%=			\n"
-			     "dne%=:					\n"
+			     "		bne .Lrty%=			\n"
+			     ".Ldne%=:					\n"
 			     : [lock_cnt] "=&r"(lock_cnt)
 			     : [lock] "r"(&bkt->lock)
 			     : "memory");
diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
index 22d90afba2..2bd41f8a5e 100644
--- a/drivers/event/cnxk/cnxk_worker.h
+++ b/drivers/event/cnxk/cnxk_worker.h
@@ -71,12 +71,12 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[swtb], [%[swtp_loc]]	\n"
-		     "		tbz %[swtb], 62, done%=		\n"
+		     "		tbz %[swtb], 62, .Ldone%=	\n"
 		     "		sevl				\n"
-		     "rty%=:	wfe				\n"
+		     ".Lrty%=:	wfe				\n"
 		     "		ldr %[swtb], [%[swtp_loc]]	\n"
-		     "		tbnz %[swtb], 62, rty%=		\n"
-		     "done%=:					\n"
+		     "		tbnz %[swtb], 62, .Lrty%=	\n"
+		     ".Ldone%=:					\n"
 		     : [swtb] "=&r"(swtp)
 		     : [swtp_loc] "r"(tag_op));
 #else
-- 
2.39.1


* [PATCH 3/3] event/cnxk: use WFE in Tx fc wait
  2023-05-16 14:37 [PATCH 1/3] event/cnxk: align TX queue buffer adjustment pbhagavatula
  2023-05-16 14:37 ` [PATCH 2/3] event/cnxk: use local labels in asm intrinsic pbhagavatula
@ 2023-05-16 14:37 ` pbhagavatula
  2023-06-12 15:52 ` [PATCH 1/3] event/cnxk: align TX queue buffer adjustment Jerin Jacob
  2023-06-13  9:25 ` [PATCH v2 " pbhagavatula
  3 siblings, 0 replies; 13+ messages in thread
From: pbhagavatula @ 2023-05-16 14:37 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use WFE in the Tx path when waiting for space in the Tx queue.
Depending on the Tx queue contention and size, WFE will
reduce cache pressure and power consumption.
In multi-core scenarios, we have observed up to 8 W of power reduction.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
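The same waiting idiom recurs throughout this patch, so it is worth
describing once: LDXR loads the flow-control word and, as a side
effect, arms the core's exclusive monitor on that address; a store to
it by another core clears the monitor and generates an event that
wakes WFE, so the core sleeps instead of polling the cache line. SEVL
posts a local event so that the first WFE falls through and the
condition is re-checked before actually sleeping. A standalone sketch
of the loop shape (illustrative names, aarch64 assumed, not part of
the patch):

/* Sleep until *addr drops below limit. A remote store to *addr wakes
 * the WFE via the exclusive monitor armed by LDXR.
 */
static inline void
wait_below(const uint64_t *addr, uint64_t limit)
{
        uint64_t val;

        asm volatile("          ldxr %[val], [%[ptr]]   \n"
                     "          cmp %[lim], %[val]      \n"
                     "          b.hi .Ldne%=            \n" /* limit > *addr: done */
                     "          sevl                    \n"
                     ".Lrty%=:  wfe                     \n"
                     "          ldxr %[val], [%[ptr]]   \n"
                     "          cmp %[lim], %[val]      \n"
                     "          b.ls .Lrty%=            \n" /* still full: wait */
                     ".Ldne%=:                          \n"
                     : [val] "=&r"(val)
                     : [lim] "r"(limit), [ptr] "r"(addr)
                     : "memory");
}
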
 drivers/event/cnxk/cn10k_tx_worker.h |  18 ++++
 drivers/net/cnxk/cn10k_tx.h          | 152 +++++++++++++++++++++++----
 2 files changed, 147 insertions(+), 23 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
index 7b2798ad2e..df57a4b137 100644
--- a/drivers/event/cnxk/cn10k_tx_worker.h
+++ b/drivers/event/cnxk/cn10k_tx_worker.h
@@ -24,9 +24,27 @@ cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
 static __rte_always_inline void
 cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
 {
+#ifdef RTE_ARCH_ARM64
+	uint64_t space;
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[adj], %[space] 			\n"
+		     "		b.hi .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[adj], %[space]			\n"
+		     "		b.ls .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [space] "=&r"(space)
+		     : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
+		     : "memory");
+#else
 	while ((uint64_t)txq->nb_sqb_bufs_adj <=
 	       __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
 		;
+#endif
 }
 
 static __rte_always_inline int32_t
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index bab08a2d3b..9049ac6b1a 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -102,27 +102,72 @@ cn10k_nix_tx_mbuf_validate(struct rte_mbuf *m, const uint32_t flags)
 }
 
 static __plt_always_inline void
-cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
+cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, uint16_t req)
 {
 	int64_t cached, refill;
+	int64_t pkts;
 
 retry:
+#ifdef RTE_ARCH_ARM64
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[pkts], [%[addr]]			\n"
+		     "		tbz %[pkts], 63, .Ldne%=		\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[pkts], [%[addr]]			\n"
+		     "		tbnz %[pkts], 63, .Lrty%=		\n"
+		     ".Ldne%=:						\n"
+		     : [pkts] "=&r"(pkts)
+		     : [addr] "r"(&txq->fc_cache_pkts)
+		     : "memory");
+#else
+	RTE_SET_USED(pkts);
 	while (__atomic_load_n(&txq->fc_cache_pkts, __ATOMIC_RELAXED) < 0)
 		;
+#endif
 	cached = __atomic_fetch_sub(&txq->fc_cache_pkts, req, __ATOMIC_ACQUIRE) - req;
 	/* Check if there is enough space, else update and retry. */
-	if (cached < 0) {
-		/* Check if we have space else retry. */
-		do {
-			refill = txq->nb_sqb_bufs_adj -
-				 __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
-			refill = (refill << txq->sqes_per_sqb_log2) - refill;
-		} while (refill <= 0);
-		__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
-					  0, __ATOMIC_RELEASE,
-					  __ATOMIC_RELAXED);
+	if (cached >= 0)
+		return;
+
+	/* Check if we have space else retry. */
+#ifdef RTE_ARCH_ARM64
+	int64_t val;
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[val], [%[addr]]			\n"
+		     "		sub %[val], %[adj], %[val]		\n"
+		     "		lsl %[refill], %[val], %[shft]		\n"
+		     "		sub %[refill], %[refill], %[val]	\n"
+		     "		sub %[refill], %[refill], %[sub]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.ge .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[val], [%[addr]]			\n"
+		     "		sub %[val], %[adj], %[val]		\n"
+		     "		lsl %[refill], %[val], %[shft]		\n"
+		     "		sub %[refill], %[refill], %[val]	\n"
+		     "		sub %[refill], %[refill], %[sub]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.lt .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [refill] "=&r"(refill), [val] "=&r" (val)
+		     : [addr] "r"(txq->fc_mem), [adj] "r"(txq->nb_sqb_bufs_adj),
+		       [shft] "r"(txq->sqes_per_sqb_log2), [sub] "r"(req)
+		     : "memory");
+#else
+	do {
+		refill = (txq->nb_sqb_bufs_adj - __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED));
+		refill = (refill << txq->sqes_per_sqb_log2) - refill;
+		refill -= req;
+	} while (refill < 0);
+#endif
+	if (!__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
+				  0, __ATOMIC_RELEASE,
+				  __ATOMIC_RELAXED))
 		goto retry;
-	}
 }
 
 /* Function to determine no of tx subdesc required in case ext
@@ -283,10 +328,27 @@ static __rte_always_inline void
 cn10k_nix_sec_fc_wait_one(struct cn10k_eth_txq *txq)
 {
 	uint64_t nb_desc = txq->cpt_desc;
-	uint64_t *fc = txq->cpt_fc;
-
-	while (nb_desc <= __atomic_load_n(fc, __ATOMIC_RELAXED))
+	uint64_t fc;
+
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[nb_desc], %[space]		\n"
+		     "		b.hi .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[nb_desc], %[space]		\n"
+		     "		b.ls .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [space] "=&r"(fc)
+		     : [nb_desc] "r"(nb_desc), [addr] "r"(txq->cpt_fc)
+		     : "memory");
+#else
+	RTE_SET_USED(fc);
+	while (nb_desc <= __atomic_load_n(txq->cpt_fc, __ATOMIC_RELAXED))
 		;
+#endif
 }
 
 static __rte_always_inline void
@@ -294,7 +356,7 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
 {
 	int32_t nb_desc, val, newval;
 	int32_t *fc_sw;
-	volatile uint64_t *fc;
+	uint64_t *fc;
 
 	/* Check if there is any CPT instruction to submit */
 	if (!nb_pkts)
@@ -302,21 +364,59 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
 
 again:
 	fc_sw = txq->cpt_fc_sw;
-	val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_RELAXED) - nb_pkts;
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %w[pkts], [%[addr]]		\n"
+		     "		tbz %w[pkts], 31, .Ldne%=		\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %w[pkts], [%[addr]]		\n"
+		     "		tbnz %w[pkts], 31, .Lrty%=		\n"
+		     ".Ldne%=:						\n"
+		     : [pkts] "=&r"(val)
+		     : [addr] "r"(fc_sw)
+		     : "memory");
+#else
+	/* Wait for primary core to refill FC. */
+	while (__atomic_load_n(fc_sw, __ATOMIC_RELAXED) < 0)
+		;
+#endif
+
+	val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_ACQUIRE) - nb_pkts;
 	if (likely(val >= 0))
 		return;
 
 	nb_desc = txq->cpt_desc;
 	fc = txq->cpt_fc;
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[refill], [%[addr]]		\n"
+		     "		sub %[refill], %[desc], %[refill]	\n"
+		     "		sub %[refill], %[refill], %[pkts]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.ge .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[refill], [%[addr]]		\n"
+		     "		sub %[refill], %[desc], %[refill]	\n"
+		     "		sub %[refill], %[refill], %[pkts]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.lt .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [refill] "=&r"(newval)
+		     : [addr] "r"(fc), [desc] "r"(nb_desc), [pkts] "r"(nb_pkts)
+		     : "memory");
+#else
 	while (true) {
 		newval = nb_desc - __atomic_load_n(fc, __ATOMIC_RELAXED);
 		newval -= nb_pkts;
 		if (newval >= 0)
 			break;
 	}
+#endif
 
-	if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false,
-					 __ATOMIC_RELAXED, __ATOMIC_RELAXED))
+	if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false, __ATOMIC_RELEASE,
+					 __ATOMIC_RELAXED))
 		goto again;
 }
 
@@ -3110,10 +3210,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 		wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
 		wd.data[1] |= (uint64_t)(lmt_id + 16);
 
-		if (flags & NIX_TX_VWQE_F)
-			cn10k_nix_vwqe_wait_fc(txq,
-				burst - (cn10k_nix_pkts_per_vec_brst(flags) >>
-					 1));
+		if (flags & NIX_TX_VWQE_F) {
+			if (flags & NIX_TX_MULTI_SEG_F) {
+				if (burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1) > 0)
+					cn10k_nix_vwqe_wait_fc(txq,
+						burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
+			} else {
+				cn10k_nix_vwqe_wait_fc(txq,
+						burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
+			}
+		}
 		/* STEOR1 */
 		roc_lmt_submit_steorl(wd.data[1], pa);
 	} else if (lnum) {
-- 
2.39.1


* Re: [PATCH 1/3] event/cnxk: align TX queue buffer adjustment
  2023-05-16 14:37 [PATCH 1/3] event/cnxk: align TX queue buffer adjustment pbhagavatula
  2023-05-16 14:37 ` [PATCH 2/3] event/cnxk: use local labels in asm intrinsic pbhagavatula
  2023-05-16 14:37 ` [PATCH 3/3] event/cnxk: use WFE in Tx fc wait pbhagavatula
@ 2023-06-12 15:52 ` Jerin Jacob
  2023-06-13  9:25 ` [PATCH v2 " pbhagavatula
  3 siblings, 0 replies; 13+ messages in thread
From: Jerin Jacob @ 2023-06-12 15:52 UTC (permalink / raw)
  To: pbhagavatula
  Cc: jerinj, Shijith Thotton, Nithin Dabilpuram, Kiran Kumar K,
	Sunil Kumar Kori, Satha Rao, dev

On Tue, May 16, 2023 at 8:08 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Remove recalculating SQB thresholds in Tx queue buffer adjustment.
> The adjustment is already done during Tx queue setup.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
> Depends-on: series-27660

The dependent patches have been merged to the main tree. Please resend the patch so it runs through CI.


>
>  drivers/event/cnxk/cn10k_eventdev.c  |  9 +--------
>  drivers/event/cnxk/cn10k_tx_worker.h |  6 +++---
>  drivers/event/cnxk/cn9k_eventdev.c   |  9 +--------
>  drivers/event/cnxk/cn9k_worker.h     | 12 +++++++++---
>  drivers/net/cnxk/cn10k_tx.h          | 12 ++++++------
>  drivers/net/cnxk/cn9k_tx.h           |  5 +++--
>  6 files changed, 23 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
> index 89f32c4d1e..f7c6a83ff0 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.c
> +++ b/drivers/event/cnxk/cn10k_eventdev.c
> @@ -840,16 +840,9 @@ cn10k_sso_txq_fc_update(const struct rte_eth_dev *eth_dev, int32_t tx_queue_id)
>                 sq = &cnxk_eth_dev->sqs[tx_queue_id];
>                 txq = eth_dev->data->tx_queues[tx_queue_id];
>                 sqes_per_sqb = 1U << txq->sqes_per_sqb_log2;
> -               sq->nb_sqb_bufs_adj =
> -                       sq->nb_sqb_bufs -
> -                       RTE_ALIGN_MUL_CEIL(sq->nb_sqb_bufs, sqes_per_sqb) /
> -                               sqes_per_sqb;
>                 if (cnxk_eth_dev->tx_offloads & RTE_ETH_TX_OFFLOAD_SECURITY)
> -                       sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc /
> -                                               (sqes_per_sqb - 1));
> +                       sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc / sqes_per_sqb);
>                 txq->nb_sqb_bufs_adj = sq->nb_sqb_bufs_adj;
> -               txq->nb_sqb_bufs_adj =
> -                       ((100 - ROC_NIX_SQB_THRESH) * txq->nb_sqb_bufs_adj) / 100;
>         }
>  }
>
> diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
> index c18786a14c..7b2798ad2e 100644
> --- a/drivers/event/cnxk/cn10k_tx_worker.h
> +++ b/drivers/event/cnxk/cn10k_tx_worker.h
> @@ -32,9 +32,9 @@ cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
>  static __rte_always_inline int32_t
>  cn10k_sso_sq_depth(const struct cn10k_eth_txq *txq)
>  {
> -       return (txq->nb_sqb_bufs_adj -
> -               __atomic_load_n((int16_t *)txq->fc_mem, __ATOMIC_RELAXED))
> -              << txq->sqes_per_sqb_log2;
> +       int32_t avail = (int32_t)txq->nb_sqb_bufs_adj -
> +                       (int32_t)__atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
> +       return (avail << txq->sqes_per_sqb_log2) - avail;
>  }
>
>  static __rte_always_inline uint16_t
> diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
> index df23219f14..a9d603c22f 100644
> --- a/drivers/event/cnxk/cn9k_eventdev.c
> +++ b/drivers/event/cnxk/cn9k_eventdev.c
> @@ -893,16 +893,9 @@ cn9k_sso_txq_fc_update(const struct rte_eth_dev *eth_dev, int32_t tx_queue_id)
>                 sq = &cnxk_eth_dev->sqs[tx_queue_id];
>                 txq = eth_dev->data->tx_queues[tx_queue_id];
>                 sqes_per_sqb = 1U << txq->sqes_per_sqb_log2;
> -               sq->nb_sqb_bufs_adj =
> -                       sq->nb_sqb_bufs -
> -                       RTE_ALIGN_MUL_CEIL(sq->nb_sqb_bufs, sqes_per_sqb) /
> -                               sqes_per_sqb;
>                 if (cnxk_eth_dev->tx_offloads & RTE_ETH_TX_OFFLOAD_SECURITY)
> -                       sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc /
> -                                               (sqes_per_sqb - 1));
> +                       sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc / sqes_per_sqb);
>                 txq->nb_sqb_bufs_adj = sq->nb_sqb_bufs_adj;
> -               txq->nb_sqb_bufs_adj =
> -                       ((100 - ROC_NIX_SQB_THRESH) * txq->nb_sqb_bufs_adj) / 100;
>         }
>  }
>
> diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
> index 988cb3acb6..d15dd309fe 100644
> --- a/drivers/event/cnxk/cn9k_worker.h
> +++ b/drivers/event/cnxk/cn9k_worker.h
> @@ -711,6 +711,14 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
>  }
>  #endif
>
> +static __rte_always_inline int32_t
> +cn9k_sso_sq_depth(const struct cn9k_eth_txq *txq)
> +{
> +       int32_t avail = (int32_t)txq->nb_sqb_bufs_adj -
> +                       (int32_t)__atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
> +       return (avail << txq->sqes_per_sqb_log2) - avail;
> +}
> +
>  static __rte_always_inline uint16_t
>  cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>                       uint64_t *txq_data, const uint32_t flags)
> @@ -734,9 +742,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>         if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
>                 handle_tx_completion_pkts(txq, 1, 1);
>
> -       if (((txq->nb_sqb_bufs_adj -
> -             __atomic_load_n((int16_t *)txq->fc_mem, __ATOMIC_RELAXED))
> -            << txq->sqes_per_sqb_log2) <= 0)
> +       if (cn9k_sso_sq_depth(txq) <= 0)
>                 return 0;
>         cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
>         cn9k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index c9ec01cd9d..bab08a2d3b 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -35,12 +35,13 @@
>
>  #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
>         do {                                                                   \
> +               int64_t avail;                                                 \
>                 /* Cached value is low, Update the fc_cache_pkts */            \
>                 if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
> +                       avail = txq->nb_sqb_bufs_adj - *txq->fc_mem;           \
>                         /* Multiply with sqe_per_sqb to express in pkts */     \
>                         (txq)->fc_cache_pkts =                                 \
> -                               ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
> -                               << (txq)->sqes_per_sqb_log2;                   \
> +                               (avail << (txq)->sqes_per_sqb_log2) - avail;   \
>                         /* Check it again for the room */                      \
>                         if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
>                                 return 0;                                      \
> @@ -113,10 +114,9 @@ cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
>         if (cached < 0) {
>                 /* Check if we have space else retry. */
>                 do {
> -                       refill =
> -                               (txq->nb_sqb_bufs_adj -
> -                                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
> -                               << txq->sqes_per_sqb_log2;
> +                       refill = txq->nb_sqb_bufs_adj -
> +                                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
> +                       refill = (refill << txq->sqes_per_sqb_log2) - refill;
>                 } while (refill <= 0);
>                 __atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
>                                           0, __ATOMIC_RELEASE,
> diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
> index e956c1ad2a..8efb75b505 100644
> --- a/drivers/net/cnxk/cn9k_tx.h
> +++ b/drivers/net/cnxk/cn9k_tx.h
> @@ -32,12 +32,13 @@
>
>  #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
>         do {                                                                   \
> +               int64_t avail;                                                 \
>                 /* Cached value is low, Update the fc_cache_pkts */            \
>                 if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
> +                       avail = txq->nb_sqb_bufs_adj - *txq->fc_mem;           \
>                         /* Multiply with sqe_per_sqb to express in pkts */     \
>                         (txq)->fc_cache_pkts =                                 \
> -                               ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
> -                               << (txq)->sqes_per_sqb_log2;                   \
> +                               (avail << (txq)->sqes_per_sqb_log2) - avail;   \
>                         /* Check it again for the room */                      \
>                         if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
>                                 return 0;                                      \
> --
> 2.39.1

* [PATCH v2 1/3] event/cnxk: align TX queue buffer adjustment
  2023-05-16 14:37 [PATCH 1/3] event/cnxk: align TX queue buffer adjustment pbhagavatula
                   ` (2 preceding siblings ...)
  2023-06-12 15:52 ` [PATCH 1/3] event/cnxk: align TX queue buffer adjustment Jerin Jacob
@ 2023-06-13  9:25 ` pbhagavatula
  2023-06-13  9:25   ` [PATCH v2 2/3] event/cnxk: use local labels in asm intrinsic pbhagavatula
  2023-06-13  9:25   ` [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait pbhagavatula
  3 siblings, 2 replies; 13+ messages in thread
From: pbhagavatula @ 2023-06-13  9:25 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Remove recalculating SQB thresholds in Tx queue buffer adjustment.
The adjustment is already done during Tx queue setup.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Rebase on ToT.

 drivers/event/cnxk/cn10k_eventdev.c  |  9 +--------
 drivers/event/cnxk/cn10k_tx_worker.h |  6 +++---
 drivers/event/cnxk/cn9k_eventdev.c   |  9 +--------
 drivers/event/cnxk/cn9k_worker.h     | 12 +++++++++---
 drivers/net/cnxk/cn10k_tx.h          | 12 ++++++------
 drivers/net/cnxk/cn9k_tx.h           |  5 +++--
 6 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 670fc9e926..8ee9ab3c5c 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -843,16 +843,9 @@ cn10k_sso_txq_fc_update(const struct rte_eth_dev *eth_dev, int32_t tx_queue_id)
 		sq = &cnxk_eth_dev->sqs[tx_queue_id];
 		txq = eth_dev->data->tx_queues[tx_queue_id];
 		sqes_per_sqb = 1U << txq->sqes_per_sqb_log2;
-		sq->nb_sqb_bufs_adj =
-			sq->nb_sqb_bufs -
-			RTE_ALIGN_MUL_CEIL(sq->nb_sqb_bufs, sqes_per_sqb) /
-				sqes_per_sqb;
 		if (cnxk_eth_dev->tx_offloads & RTE_ETH_TX_OFFLOAD_SECURITY)
-			sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc /
-						(sqes_per_sqb - 1));
+			sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc / sqes_per_sqb);
 		txq->nb_sqb_bufs_adj = sq->nb_sqb_bufs_adj;
-		txq->nb_sqb_bufs_adj =
-			((100 - ROC_NIX_SQB_THRESH) * txq->nb_sqb_bufs_adj) / 100;
 	}
 }

diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
index 31cbccf7d6..b6c9bb1d26 100644
--- a/drivers/event/cnxk/cn10k_tx_worker.h
+++ b/drivers/event/cnxk/cn10k_tx_worker.h
@@ -32,9 +32,9 @@ cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
 static __rte_always_inline int32_t
 cn10k_sso_sq_depth(const struct cn10k_eth_txq *txq)
 {
-	return (txq->nb_sqb_bufs_adj -
-		__atomic_load_n((int16_t *)txq->fc_mem, __ATOMIC_RELAXED))
-	       << txq->sqes_per_sqb_log2;
+	int32_t avail = (int32_t)txq->nb_sqb_bufs_adj -
+			(int32_t)__atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
+	return (avail << txq->sqes_per_sqb_log2) - avail;
 }

 static __rte_always_inline uint16_t
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 7ed9aa1331..dde58b60e4 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -877,16 +877,9 @@ cn9k_sso_txq_fc_update(const struct rte_eth_dev *eth_dev, int32_t tx_queue_id)
 		sq = &cnxk_eth_dev->sqs[tx_queue_id];
 		txq = eth_dev->data->tx_queues[tx_queue_id];
 		sqes_per_sqb = 1U << txq->sqes_per_sqb_log2;
-		sq->nb_sqb_bufs_adj =
-			sq->nb_sqb_bufs -
-			RTE_ALIGN_MUL_CEIL(sq->nb_sqb_bufs, sqes_per_sqb) /
-				sqes_per_sqb;
 		if (cnxk_eth_dev->tx_offloads & RTE_ETH_TX_OFFLOAD_SECURITY)
-			sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc /
-						(sqes_per_sqb - 1));
+			sq->nb_sqb_bufs_adj -= (cnxk_eth_dev->outb.nb_desc / sqes_per_sqb);
 		txq->nb_sqb_bufs_adj = sq->nb_sqb_bufs_adj;
-		txq->nb_sqb_bufs_adj =
-			((100 - ROC_NIX_SQB_THRESH) * txq->nb_sqb_bufs_adj) / 100;
 	}
 }

diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index ec2c1c68dd..ed3b97d7e1 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -713,6 +713,14 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
 }
 #endif

+static __rte_always_inline int32_t
+cn9k_sso_sq_depth(const struct cn9k_eth_txq *txq)
+{
+	int32_t avail = (int32_t)txq->nb_sqb_bufs_adj -
+			(int32_t)__atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
+	return (avail << txq->sqes_per_sqb_log2) - avail;
+}
+
 static __rte_always_inline uint16_t
 cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 		      uint64_t *txq_data, const uint32_t flags)
@@ -736,9 +744,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
 		handle_tx_completion_pkts(txq, 1);

-	if (((txq->nb_sqb_bufs_adj -
-	      __atomic_load_n((int16_t *)txq->fc_mem, __ATOMIC_RELAXED))
-	     << txq->sqes_per_sqb_log2) <= 0)
+	if (cn9k_sso_sq_depth(txq) <= 0)
 		return 0;
 	cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
 	cn9k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 4f23a8dfc3..a365cbe0ee 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -35,12 +35,13 @@

 #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
 	do {                                                                   \
+		int64_t avail;                                                 \
 		/* Cached value is low, Update the fc_cache_pkts */            \
 		if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
+			avail = txq->nb_sqb_bufs_adj - *txq->fc_mem;           \
 			/* Multiply with sqe_per_sqb to express in pkts */     \
 			(txq)->fc_cache_pkts =                                 \
-				((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
-				<< (txq)->sqes_per_sqb_log2;                   \
+				(avail << (txq)->sqes_per_sqb_log2) - avail;   \
 			/* Check it again for the room */                      \
 			if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
 				return 0;                                      \
@@ -113,10 +114,9 @@ cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
 	if (cached < 0) {
 		/* Check if we have space else retry. */
 		do {
-			refill =
-				(txq->nb_sqb_bufs_adj -
-				 __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
-				<< txq->sqes_per_sqb_log2;
+			refill = txq->nb_sqb_bufs_adj -
+				 __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
+			refill = (refill << txq->sqes_per_sqb_log2) - refill;
 		} while (refill <= 0);
 		__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
 					  0, __ATOMIC_RELEASE,
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 8f1e05a461..fba4bb4215 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -32,12 +32,13 @@

 #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
 	do {                                                                   \
+		int64_t avail;                                                 \
 		/* Cached value is low, Update the fc_cache_pkts */            \
 		if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
+			avail = txq->nb_sqb_bufs_adj - *txq->fc_mem;           \
 			/* Multiply with sqe_per_sqb to express in pkts */     \
 			(txq)->fc_cache_pkts =                                 \
-				((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
-				<< (txq)->sqes_per_sqb_log2;                   \
+				(avail << (txq)->sqes_per_sqb_log2) - avail;   \
 			/* Check it again for the room */                      \
 			if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
 				return 0;                                      \
--
2.25.1


* [PATCH v2 2/3] event/cnxk: use local labels in asm intrinsic
  2023-06-13  9:25 ` [PATCH v2 " pbhagavatula
@ 2023-06-13  9:25   ` pbhagavatula
  2023-06-13  9:25   ` [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait pbhagavatula
  1 sibling, 0 replies; 13+ messages in thread
From: pbhagavatula @ 2023-06-13  9:25 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Using plain labels in asm emits them as regular function symbols,
which shadows the call stack in tools like gdb and perf.
Use local labels instead for better visibility.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h    |  8 ++---
 drivers/event/cnxk/cn9k_worker.h     | 25 ++++++++--------
 drivers/event/cnxk/cnxk_tim_worker.h | 44 ++++++++++++++--------------
 drivers/event/cnxk/cnxk_worker.h     |  8 ++---
 4 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index a01894ae10..2af0bb3f9f 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -269,12 +269,12 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev,
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
-		     "		tbz %[tag], 63, done%=			\n"
+		     "		tbz %[tag], 63, .Ldone%=		\n"
 		     "		sevl					\n"
-		     "rty%=:	wfe					\n"
+		     ".Lrty%=:	wfe					\n"
 		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
-		     "		tbnz %[tag], 63, rty%=			\n"
-		     "done%=:	dmb ld					\n"
+		     "		tbnz %[tag], 63, .Lrty%=		\n"
+		     ".Ldone%=:	dmb ld					\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
 		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0)
 		     : "memory");
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index ed3b97d7e1..9ddab095ac 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -232,18 +232,19 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 		rte_prefetch_non_temporal(dws->lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
-		     "rty%=:					\n"
+		     ".Lrty%=:					\n"
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbnz %[tag], 63, rty%=		\n"
-		     "done%=:	str %[gw], [%[pong]]		\n"
+		     "		tbnz %[tag], 63, .Lrty%=	\n"
+		     ".Ldone%=:	str %[gw], [%[pong]]		\n"
 		     "		dmb ld				\n"
 		     "		sub %[mbuf], %[wqp], #0x80	\n"
 		     "		prfm pldl1keep, [%[mbuf]]	\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
 		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(base + SSOW_LF_GWS_TAG),
-		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP), [gw] "r"(dws->gw_wdata),
+		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP),
+		       [gw] "r"(dws->gw_wdata),
 		       [pong] "r"(pair_base + SSOW_LF_GWS_OP_GET_WORK0));
 #else
 	gw.u64[0] = plt_read64(base + SSOW_LF_GWS_TAG);
@@ -282,13 +283,13 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev,
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbz %[tag], 63, done%=		\n"
+		     "		tbz %[tag], 63, .Ldone%=	\n"
 		     "		sevl				\n"
-		     "rty%=:	wfe				\n"
+		     ".Lrty%=:	wfe				\n"
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbnz %[tag], 63, rty%=		\n"
-		     "done%=:	dmb ld				\n"
+		     "		tbnz %[tag], 63, .Lrty%=	\n"
+		     ".Ldone%=:	dmb ld				\n"
 		     "		sub %[mbuf], %[wqp], #0x80	\n"
 		     "		prfm pldl1keep, [%[mbuf]]	\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
@@ -330,13 +331,13 @@ cn9k_sso_hws_get_work_empty(uint64_t base, struct rte_event *ev,
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbz %[tag], 63, done%=		\n"
+		     "		tbz %[tag], 63, .Ldone%=	\n"
 		     "		sevl				\n"
-		     "rty%=:	wfe				\n"
+		     ".Lrty%=:	wfe				\n"
 		     "		ldr %[tag], [%[tag_loc]]	\n"
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
-		     "		tbnz %[tag], 63, rty%=		\n"
-		     "done%=:	dmb ld				\n"
+		     "		tbnz %[tag], 63, .Lrty%=	\n"
+		     ".Ldone%=:	dmb ld				\n"
 		     "		sub %[mbuf], %[wqp], #0x80	\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
 		       [mbuf] "=&r"(mbuf)
diff --git a/drivers/event/cnxk/cnxk_tim_worker.h b/drivers/event/cnxk/cnxk_tim_worker.h
index 8fafb8f09c..f0857f26ba 100644
--- a/drivers/event/cnxk/cnxk_tim_worker.h
+++ b/drivers/event/cnxk/cnxk_tim_worker.h
@@ -262,12 +262,12 @@ cnxk_tim_add_entry_sp(struct cnxk_tim_ring *const tim_ring,
 #ifdef RTE_ARCH_ARM64
 			asm volatile(PLT_CPU_FEATURE_PREAMBLE
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbz %[hbt], 33, dne%=	\n"
+				     "		tbz %[hbt], 33, .Ldne%=	\n"
 				     "		sevl			\n"
-				     "rty%=:	wfe			\n"
+				     ".Lrty%=:	wfe			\n"
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbnz %[hbt], 33, rty%=	\n"
-				     "dne%=:				\n"
+				     "		tbnz %[hbt], 33, .Lrty%=\n"
+				     ".Ldne%=:				\n"
 				     : [hbt] "=&r"(hbt_state)
 				     : [w1] "r"((&bkt->w1))
 				     : "memory");
@@ -345,12 +345,12 @@ cnxk_tim_add_entry_mp(struct cnxk_tim_ring *const tim_ring,
 #ifdef RTE_ARCH_ARM64
 			asm volatile(PLT_CPU_FEATURE_PREAMBLE
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbz %[hbt], 33, dne%=	\n"
+				     "		tbz %[hbt], 33, .Ldne%=	\n"
 				     "		sevl			\n"
-				     "rty%=:	wfe			\n"
+				     ".Lrty%=:	wfe			\n"
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbnz %[hbt], 33, rty%=	\n"
-				     "dne%=:				\n"
+				     "		tbnz %[hbt], 33, .Lrty%=\n"
+				     ".Ldne%=:				\n"
 				     : [hbt] "=&r"(hbt_state)
 				     : [w1] "r"((&bkt->w1))
 				     : "memory");
@@ -374,13 +374,13 @@ cnxk_tim_add_entry_mp(struct cnxk_tim_ring *const tim_ring,
 		cnxk_tim_bkt_dec_lock(bkt);
 #ifdef RTE_ARCH_ARM64
 		asm volatile(PLT_CPU_FEATURE_PREAMBLE
-			     "		ldxr %[rem], [%[crem]]	\n"
-			     "		tbz %[rem], 63, dne%=		\n"
+			     "		ldxr %[rem], [%[crem]]		\n"
+			     "		tbz %[rem], 63, .Ldne%=		\n"
 			     "		sevl				\n"
-			     "rty%=:	wfe				\n"
-			     "		ldxr %[rem], [%[crem]]	\n"
-			     "		tbnz %[rem], 63, rty%=		\n"
-			     "dne%=:					\n"
+			     ".Lrty%=:	wfe				\n"
+			     "		ldxr %[rem], [%[crem]]		\n"
+			     "		tbnz %[rem], 63, .Lrty%=	\n"
+			     ".Ldne%=:					\n"
 			     : [rem] "=&r"(rem)
 			     : [crem] "r"(&bkt->w1)
 			     : "memory");
@@ -478,12 +478,12 @@ cnxk_tim_add_entry_brst(struct cnxk_tim_ring *const tim_ring,
 #ifdef RTE_ARCH_ARM64
 			asm volatile(PLT_CPU_FEATURE_PREAMBLE
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbz %[hbt], 33, dne%=	\n"
+				     "		tbz %[hbt], 33, .Ldne%=	\n"
 				     "		sevl			\n"
-				     "rty%=:	wfe			\n"
+				     ".Lrty%=:	wfe			\n"
 				     "		ldxr %[hbt], [%[w1]]	\n"
-				     "		tbnz %[hbt], 33, rty%=	\n"
-				     "dne%=:				\n"
+				     "		tbnz %[hbt], 33, .Lrty%=\n"
+				     ".Ldne%=:				\n"
 				     : [hbt] "=&r"(hbt_state)
 				     : [w1] "r"((&bkt->w1))
 				     : "memory");
@@ -510,13 +510,13 @@ cnxk_tim_add_entry_brst(struct cnxk_tim_ring *const tim_ring,
 		asm volatile(PLT_CPU_FEATURE_PREAMBLE
 			     "		ldxrb %w[lock_cnt], [%[lock]]	\n"
 			     "		tst %w[lock_cnt], 255		\n"
-			     "		beq dne%=			\n"
+			     "		beq .Ldne%=			\n"
 			     "		sevl				\n"
-			     "rty%=:	wfe				\n"
+			     ".Lrty%=:	wfe				\n"
 			     "		ldxrb %w[lock_cnt], [%[lock]]	\n"
 			     "		tst %w[lock_cnt], 255		\n"
-			     "		bne rty%=			\n"
-			     "dne%=:					\n"
+			     "		bne .Lrty%=			\n"
+			     ".Ldne%=:					\n"
 			     : [lock_cnt] "=&r"(lock_cnt)
 			     : [lock] "r"(&bkt->lock)
 			     : "memory");
diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
index 22d90afba2..2bd41f8a5e 100644
--- a/drivers/event/cnxk/cnxk_worker.h
+++ b/drivers/event/cnxk/cnxk_worker.h
@@ -71,12 +71,12 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[swtb], [%[swtp_loc]]	\n"
-		     "		tbz %[swtb], 62, done%=		\n"
+		     "		tbz %[swtb], 62, .Ldone%=	\n"
 		     "		sevl				\n"
-		     "rty%=:	wfe				\n"
+		     ".Lrty%=:	wfe				\n"
 		     "		ldr %[swtb], [%[swtp_loc]]	\n"
-		     "		tbnz %[swtb], 62, rty%=		\n"
-		     "done%=:					\n"
+		     "		tbnz %[swtb], 62, .Lrty%=	\n"
+		     ".Ldone%=:					\n"
 		     : [swtb] "=&r"(swtp)
 		     : [swtp_loc] "r"(tag_op));
 #else
-- 
2.25.1


* [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait
  2023-06-13  9:25 ` [PATCH v2 " pbhagavatula
  2023-06-13  9:25   ` [PATCH v2 2/3] event/cnxk: use local labels in asm intrinsic pbhagavatula
@ 2023-06-13  9:25   ` pbhagavatula
  2023-06-14 10:33     ` Jerin Jacob
  2023-06-15 15:28     ` Stephen Hemminger
  1 sibling, 2 replies; 13+ messages in thread
From: pbhagavatula @ 2023-06-13  9:25 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use WFE in the Tx path when waiting for space in the Tx queue.
Depending on the Tx queue contention and size, WFE will
reduce cache pressure and power consumption.
In multi-core scenarios, we have observed up to 8 W of power reduction.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_tx_worker.h |  18 ++++
 drivers/net/cnxk/cn10k_tx.h          | 152 +++++++++++++++++++++++----
 2 files changed, 147 insertions(+), 23 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
index b6c9bb1d26..dea6cdcde2 100644
--- a/drivers/event/cnxk/cn10k_tx_worker.h
+++ b/drivers/event/cnxk/cn10k_tx_worker.h
@@ -24,9 +24,27 @@ cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
 static __rte_always_inline void
 cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
 {
+#ifdef RTE_ARCH_ARM64
+	uint64_t space;
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[adj], %[space] 			\n"
+		     "		b.hi .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[adj], %[space]			\n"
+		     "		b.ls .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [space] "=&r"(space)
+		     : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
+		     : "memory");
+#else
 	while ((uint64_t)txq->nb_sqb_bufs_adj <=
 	       __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
 		;
+#endif
 }
 
 static __rte_always_inline int32_t
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index a365cbe0ee..d0e8350ce2 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -102,27 +102,72 @@ cn10k_nix_tx_mbuf_validate(struct rte_mbuf *m, const uint32_t flags)
 }
 
 static __plt_always_inline void
-cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
+cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, uint16_t req)
 {
 	int64_t cached, refill;
+	int64_t pkts;
 
 retry:
+#ifdef RTE_ARCH_ARM64
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[pkts], [%[addr]]			\n"
+		     "		tbz %[pkts], 63, .Ldne%=		\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[pkts], [%[addr]]			\n"
+		     "		tbnz %[pkts], 63, .Lrty%=		\n"
+		     ".Ldne%=:						\n"
+		     : [pkts] "=&r"(pkts)
+		     : [addr] "r"(&txq->fc_cache_pkts)
+		     : "memory");
+#else
+	RTE_SET_USED(pkts);
 	while (__atomic_load_n(&txq->fc_cache_pkts, __ATOMIC_RELAXED) < 0)
 		;
+#endif
 	cached = __atomic_fetch_sub(&txq->fc_cache_pkts, req, __ATOMIC_ACQUIRE) - req;
 	/* Check if there is enough space, else update and retry. */
-	if (cached < 0) {
-		/* Check if we have space else retry. */
-		do {
-			refill = txq->nb_sqb_bufs_adj -
-				 __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
-			refill = (refill << txq->sqes_per_sqb_log2) - refill;
-		} while (refill <= 0);
-		__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
-					  0, __ATOMIC_RELEASE,
-					  __ATOMIC_RELAXED);
+	if (cached >= 0)
+		return;
+
+	/* Check if we have space else retry. */
+#ifdef RTE_ARCH_ARM64
+	int64_t val;
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[val], [%[addr]]			\n"
+		     "		sub %[val], %[adj], %[val]		\n"
+		     "		lsl %[refill], %[val], %[shft]		\n"
+		     "		sub %[refill], %[refill], %[val]	\n"
+		     "		sub %[refill], %[refill], %[sub]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.ge .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[val], [%[addr]]			\n"
+		     "		sub %[val], %[adj], %[val]		\n"
+		     "		lsl %[refill], %[val], %[shft]		\n"
+		     "		sub %[refill], %[refill], %[val]	\n"
+		     "		sub %[refill], %[refill], %[sub]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.lt .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [refill] "=&r"(refill), [val] "=&r" (val)
+		     : [addr] "r"(txq->fc_mem), [adj] "r"(txq->nb_sqb_bufs_adj),
+		       [shft] "r"(txq->sqes_per_sqb_log2), [sub] "r"(req)
+		     : "memory");
+#else
+	do {
+		refill = (txq->nb_sqb_bufs_adj - __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED));
+		refill = (refill << txq->sqes_per_sqb_log2) - refill;
+		refill -= req;
+	} while (refill < 0);
+#endif
+	if (!__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
+				  0, __ATOMIC_RELEASE,
+				  __ATOMIC_RELAXED))
 		goto retry;
-	}
 }
 
 /* Function to determine no of tx subdesc required in case ext
@@ -283,10 +328,27 @@ static __rte_always_inline void
 cn10k_nix_sec_fc_wait_one(struct cn10k_eth_txq *txq)
 {
 	uint64_t nb_desc = txq->cpt_desc;
-	uint64_t *fc = txq->cpt_fc;
-
-	while (nb_desc <= __atomic_load_n(fc, __ATOMIC_RELAXED))
+	uint64_t fc;
+
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[nb_desc], %[space]		\n"
+		     "		b.hi .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[nb_desc], %[space]		\n"
+		     "		b.ls .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [space] "=&r"(fc)
+		     : [nb_desc] "r"(nb_desc), [addr] "r"(txq->cpt_fc)
+		     : "memory");
+#else
+	RTE_SET_USED(fc);
+	while (nb_desc <= __atomic_load_n(txq->cpt_fc, __ATOMIC_RELAXED))
 		;
+#endif
 }
 
 static __rte_always_inline void
@@ -294,7 +356,7 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
 {
 	int32_t nb_desc, val, newval;
 	int32_t *fc_sw;
-	volatile uint64_t *fc;
+	uint64_t *fc;
 
 	/* Check if there is any CPT instruction to submit */
 	if (!nb_pkts)
@@ -302,21 +364,59 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
 
 again:
 	fc_sw = txq->cpt_fc_sw;
-	val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_RELAXED) - nb_pkts;
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %w[pkts], [%[addr]]		\n"
+		     "		tbz %w[pkts], 31, .Ldne%=		\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %w[pkts], [%[addr]]		\n"
+		     "		tbnz %w[pkts], 31, .Lrty%=		\n"
+		     ".Ldne%=:						\n"
+		     : [pkts] "=&r"(val)
+		     : [addr] "r"(fc_sw)
+		     : "memory");
+#else
+	/* Wait for primary core to refill FC. */
+	while (__atomic_load_n(fc_sw, __ATOMIC_RELAXED) < 0)
+		;
+#endif
+
+	val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_ACQUIRE) - nb_pkts;
 	if (likely(val >= 0))
 		return;
 
 	nb_desc = txq->cpt_desc;
 	fc = txq->cpt_fc;
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[refill], [%[addr]]		\n"
+		     "		sub %[refill], %[desc], %[refill]	\n"
+		     "		sub %[refill], %[refill], %[pkts]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.ge .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[refill], [%[addr]]		\n"
+		     "		sub %[refill], %[desc], %[refill]	\n"
+		     "		sub %[refill], %[refill], %[pkts]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.lt .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [refill] "=&r"(newval)
+		     : [addr] "r"(fc), [desc] "r"(nb_desc), [pkts] "r"(nb_pkts)
+		     : "memory");
+#else
 	while (true) {
 		newval = nb_desc - __atomic_load_n(fc, __ATOMIC_RELAXED);
 		newval -= nb_pkts;
 		if (newval >= 0)
 			break;
 	}
+#endif
 
-	if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false,
-					 __ATOMIC_RELAXED, __ATOMIC_RELAXED))
+	if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false, __ATOMIC_RELEASE,
+					 __ATOMIC_RELAXED))
 		goto again;
 }
 
@@ -3033,10 +3133,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 		wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
 		wd.data[1] |= (uint64_t)(lmt_id + 16);
 
-		if (flags & NIX_TX_VWQE_F)
-			cn10k_nix_vwqe_wait_fc(txq,
-				burst - (cn10k_nix_pkts_per_vec_brst(flags) >>
-					 1));
+		if (flags & NIX_TX_VWQE_F) {
+			if (flags & NIX_TX_MULTI_SEG_F) {
+				if (burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1) > 0)
+					cn10k_nix_vwqe_wait_fc(txq,
+						burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
+			} else {
+				cn10k_nix_vwqe_wait_fc(txq,
+						burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
+			}
+		}
 		/* STEOR1 */
 		roc_lmt_submit_steorl(wd.data[1], pa);
 	} else if (lnum) {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread
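
For readers skimming the asm blocks above: the ldxr/sevl/wfe sequence is a
generic AArch64 idiom for parking a core until another agent stores to the
polled address, instead of burning cycles in a relaxed-load spin. A minimal
self-contained sketch of the same idiom follows; the function name and the
numeric labels are illustrative only (the driver versions additionally issue
PLT_CPU_FEATURE_PREAMBLE and use .Lrty%=/.Ldne%= local labels):

#include <stdint.h>

/* Wait until *addr drops below limit. LDXR arms the exclusive monitor,
 * so a remote store to *addr raises the event that wakes WFE; SEVL
 * pre-arms the first WFE so the initial iteration cannot sleep forever.
 */
static inline void
wait_below(const uint64_t *addr, uint64_t limit)
{
#ifdef __aarch64__
	uint64_t cur;

	asm volatile("	ldxr %[cur], [%[ad]]	\n"
		     "	cmp %[lim], %[cur]	\n"
		     "	b.hi 1f			\n"
		     "	sevl			\n"
		     "2:	wfe		\n"
		     "	ldxr %[cur], [%[ad]]	\n"
		     "	cmp %[lim], %[cur]	\n"
		     "	b.ls 2b			\n"
		     "1:			\n"
		     : [cur] "=&r"(cur)
		     : [lim] "r"(limit), [ad] "r"(addr)
		     : "memory");
#else
	while (limit <= __atomic_load_n(addr, __ATOMIC_RELAXED))
		;
#endif
}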

* Re: [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait
  2023-06-13  9:25   ` [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait pbhagavatula
@ 2023-06-14 10:33     ` Jerin Jacob
  2023-06-14 18:27       ` Patrick Robb
  2023-06-15 15:28     ` Stephen Hemminger
  1 sibling, 1 reply; 13+ messages in thread
From: Jerin Jacob @ 2023-06-14 10:33 UTC (permalink / raw)
  To: pbhagavatula
  Cc: jerinj, Shijith Thotton, Nithin Dabilpuram, Kiran Kumar K,
	Sunil Kumar Kori, Satha Rao, dev

On Tue, Jun 13, 2023 at 2:56 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use WFE in Tx path when waiting for space in the Tx queue.
> Depending upon the Tx queue contention and size, WFE will
> reduce the cache pressure and power consumption.
> In multi-core scenarios we have observed up to 8W power reduction.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Series Applied to dpdk-next-net-eventdev/for-main. Thanks

> ---
>  drivers/event/cnxk/cn10k_tx_worker.h |  18 ++++
>  drivers/net/cnxk/cn10k_tx.h          | 152 +++++++++++++++++++++++----
>  2 files changed, 147 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
> index b6c9bb1d26..dea6cdcde2 100644
> --- a/drivers/event/cnxk/cn10k_tx_worker.h
> +++ b/drivers/event/cnxk/cn10k_tx_worker.h
> @@ -24,9 +24,27 @@ cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
>  static __rte_always_inline void
>  cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
>  {
> +#ifdef RTE_ARCH_ARM64
> +       uint64_t space;
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[adj], %[space]                    \n"
> +                    "          b.hi .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[adj], %[space]                    \n"
> +                    "          b.ls .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [space] "=&r"(space)
> +                    : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
> +                    : "memory");
> +#else
>         while ((uint64_t)txq->nb_sqb_bufs_adj <=
>                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
>                 ;
> +#endif
>  }
>
>  static __rte_always_inline int32_t
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index a365cbe0ee..d0e8350ce2 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -102,27 +102,72 @@ cn10k_nix_tx_mbuf_validate(struct rte_mbuf *m, const uint32_t flags)
>  }
>
>  static __plt_always_inline void
> -cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
> +cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, uint16_t req)
>  {
>         int64_t cached, refill;
> +       int64_t pkts;
>
>  retry:
> +#ifdef RTE_ARCH_ARM64
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[pkts], [%[addr]]                 \n"
> +                    "          tbz %[pkts], 63, .Ldne%=                \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[pkts], [%[addr]]                 \n"
> +                    "          tbnz %[pkts], 63, .Lrty%=               \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [pkts] "=&r"(pkts)
> +                    : [addr] "r"(&txq->fc_cache_pkts)
> +                    : "memory");
> +#else
> +       RTE_SET_USED(pkts);
>         while (__atomic_load_n(&txq->fc_cache_pkts, __ATOMIC_RELAXED) < 0)
>                 ;
> +#endif
>         cached = __atomic_fetch_sub(&txq->fc_cache_pkts, req, __ATOMIC_ACQUIRE) - req;
>         /* Check if there is enough space, else update and retry. */
> -       if (cached < 0) {
> -               /* Check if we have space else retry. */
> -               do {
> -                       refill = txq->nb_sqb_bufs_adj -
> -                                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
> -                       refill = (refill << txq->sqes_per_sqb_log2) - refill;
> -               } while (refill <= 0);
> -               __atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
> -                                         0, __ATOMIC_RELEASE,
> -                                         __ATOMIC_RELAXED);
> +       if (cached >= 0)
> +               return;
> +
> +       /* Check if we have space else retry. */
> +#ifdef RTE_ARCH_ARM64
> +       int64_t val;
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[val], [%[addr]]                  \n"
> +                    "          sub %[val], %[adj], %[val]              \n"
> +                    "          lsl %[refill], %[val], %[shft]          \n"
> +                    "          sub %[refill], %[refill], %[val]        \n"
> +                    "          sub %[refill], %[refill], %[sub]        \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.ge .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[val], [%[addr]]                  \n"
> +                    "          sub %[val], %[adj], %[val]              \n"
> +                    "          lsl %[refill], %[val], %[shft]          \n"
> +                    "          sub %[refill], %[refill], %[val]        \n"
> +                    "          sub %[refill], %[refill], %[sub]        \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.lt .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [refill] "=&r"(refill), [val] "=&r" (val)
> +                    : [addr] "r"(txq->fc_mem), [adj] "r"(txq->nb_sqb_bufs_adj),
> +                      [shft] "r"(txq->sqes_per_sqb_log2), [sub] "r"(req)
> +                    : "memory");
> +#else
> +       do {
> +               refill = (txq->nb_sqb_bufs_adj - __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED));
> +               refill = (refill << txq->sqes_per_sqb_log2) - refill;
> +               refill -= req;
> +       } while (refill < 0);
> +#endif
> +       if (!__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
> +                                 0, __ATOMIC_RELEASE,
> +                                 __ATOMIC_RELAXED))
>                 goto retry;
> -       }
>  }
>
>  /* Function to determine no of tx subdesc required in case ext
> @@ -283,10 +328,27 @@ static __rte_always_inline void
>  cn10k_nix_sec_fc_wait_one(struct cn10k_eth_txq *txq)
>  {
>         uint64_t nb_desc = txq->cpt_desc;
> -       uint64_t *fc = txq->cpt_fc;
> -
> -       while (nb_desc <= __atomic_load_n(fc, __ATOMIC_RELAXED))
> +       uint64_t fc;
> +
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[nb_desc], %[space]                \n"
> +                    "          b.hi .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[nb_desc], %[space]                \n"
> +                    "          b.ls .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [space] "=&r"(fc)
> +                    : [nb_desc] "r"(nb_desc), [addr] "r"(txq->cpt_fc)
> +                    : "memory");
> +#else
> +       RTE_SET_USED(fc);
> +       while (nb_desc <= __atomic_load_n(txq->cpt_fc, __ATOMIC_RELAXED))
>                 ;
> +#endif
>  }
>
>  static __rte_always_inline void
> @@ -294,7 +356,7 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
>  {
>         int32_t nb_desc, val, newval;
>         int32_t *fc_sw;
> -       volatile uint64_t *fc;
> +       uint64_t *fc;
>
>         /* Check if there is any CPT instruction to submit */
>         if (!nb_pkts)
> @@ -302,21 +364,59 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
>
>  again:
>         fc_sw = txq->cpt_fc_sw;
> -       val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_RELAXED) - nb_pkts;
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %w[pkts], [%[addr]]                \n"
> +                    "          tbz %w[pkts], 31, .Ldne%=               \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %w[pkts], [%[addr]]                \n"
> +                    "          tbnz %w[pkts], 31, .Lrty%=              \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [pkts] "=&r"(val)
> +                    : [addr] "r"(fc_sw)
> +                    : "memory");
> +#else
> +       /* Wait for primary core to refill FC. */
> +       while (__atomic_load_n(fc_sw, __ATOMIC_RELAXED) < 0)
> +               ;
> +#endif
> +
> +       val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_ACQUIRE) - nb_pkts;
>         if (likely(val >= 0))
>                 return;
>
>         nb_desc = txq->cpt_desc;
>         fc = txq->cpt_fc;
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[refill], [%[addr]]               \n"
> +                    "          sub %[refill], %[desc], %[refill]       \n"
> +                    "          sub %[refill], %[refill], %[pkts]       \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.ge .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[refill], [%[addr]]               \n"
> +                    "          sub %[refill], %[desc], %[refill]       \n"
> +                    "          sub %[refill], %[refill], %[pkts]       \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.lt .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [refill] "=&r"(newval)
> +                    : [addr] "r"(fc), [desc] "r"(nb_desc), [pkts] "r"(nb_pkts)
> +                    : "memory");
> +#else
>         while (true) {
>                 newval = nb_desc - __atomic_load_n(fc, __ATOMIC_RELAXED);
>                 newval -= nb_pkts;
>                 if (newval >= 0)
>                         break;
>         }
> +#endif
>
> -       if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false,
> -                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED))
> +       if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false, __ATOMIC_RELEASE,
> +                                        __ATOMIC_RELAXED))
>                 goto again;
>  }
>
> @@ -3033,10 +3133,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
>                 wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
>                 wd.data[1] |= (uint64_t)(lmt_id + 16);
>
> -               if (flags & NIX_TX_VWQE_F)
> -                       cn10k_nix_vwqe_wait_fc(txq,
> -                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >>
> -                                        1));
> +               if (flags & NIX_TX_VWQE_F) {
> +                       if (flags & NIX_TX_MULTI_SEG_F) {
> +                               if (burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1) > 0)
> +                                       cn10k_nix_vwqe_wait_fc(txq,
> +                                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
> +                       } else {
> +                               cn10k_nix_vwqe_wait_fc(txq,
> +                                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
> +                       }
> +               }
>                 /* STEOR1 */
>                 roc_lmt_submit_steorl(wd.data[1], pa);
>         } else if (lnum) {
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 13+ messages in thread
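
A note on the arithmetic that recurs in this series: the expression
(avail << sqes_per_sqb_log2) - avail equals avail * (sqes_per_sqb - 1),
i.e. each free SQB is counted as one SQE short of full, which matches the
driver treating one entry per buffer as unusable for packet descriptors
(consistent with buffer chaining; that reading is an inference, not stated
in the patch). A small sketch with a worked example, using illustrative
names:

#include <stdint.h>

/* Convert free SQB buffers into usable SQEs, as cn10k_sso_sq_depth()
 * and the refill paths above do. fc_mem_val is a snapshot of *fc_mem.
 */
static inline int64_t
free_sqes(int64_t nb_sqb_bufs_adj, int64_t fc_mem_val, uint8_t log2)
{
	int64_t avail = nb_sqb_bufs_adj - fc_mem_val;

	return (avail << log2) - avail; /* avail * (2^log2 - 1) */
}

/* Example: 8 free SQBs, 32 SQEs per SQB (log2 = 5):
 * free_sqes(...) = (8 << 5) - 8 = 248 = 8 * 31 usable entries.
 */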

* Re: [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait
  2023-06-14 10:33     ` Jerin Jacob
@ 2023-06-14 18:27       ` Patrick Robb
  2023-06-14 20:24         ` [EXT] " Pavan Nikhilesh Bhagavatula
  2023-06-15  5:49         ` Jerin Jacob Kollanukkaran
  0 siblings, 2 replies; 13+ messages in thread
From: Patrick Robb @ 2023-06-14 18:27 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: pbhagavatula, jerinj, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao, dev

[-- Attachment #1: Type: text/plain, Size: 13190 bytes --]

Hello Pavan and Jerin,

The Community Lab's CI testing failed this patch series on clang compile on
ARM systems. That wasn't properly reported to Patchwork due to issues with
our reporting process, which we are currently resolving. This updated
report shows the failed compile. Apologies for the incomplete initial
report.

http://mails.dpdk.org/archives/test-report/2023-June/411303.html

On Wed, Jun 14, 2023 at 6:33 AM Jerin Jacob <jerinjacobk@gmail.com> wrote:

> On Tue, Jun 13, 2023 at 2:56 PM <pbhagavatula@marvell.com> wrote:
> >
> > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> > Use WFE in Tx path when waiting for space in the Tx queue.
> > Depending upon the Tx queue contention and size, WFE will
> > reduce the cache pressure and power consumption.
> > In multi-core scenarios we have observed up to 8W power reduction.
> >
> > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Series Applied to dpdk-next-net-eventdev/for-main. Thanks
>
> > ---
> >  drivers/event/cnxk/cn10k_tx_worker.h |  18 ++++
> >  drivers/net/cnxk/cn10k_tx.h          | 152 +++++++++++++++++++++++----
> >  2 files changed, 147 insertions(+), 23 deletions(-)
> >
> > diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
> > index b6c9bb1d26..dea6cdcde2 100644
> > --- a/drivers/event/cnxk/cn10k_tx_worker.h
> > +++ b/drivers/event/cnxk/cn10k_tx_worker.h
> > @@ -24,9 +24,27 @@ cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
> >  static __rte_always_inline void
> >  cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
> >  {
> > +#ifdef RTE_ARCH_ARM64
> > +       uint64_t space;
> > +
> > +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> > +                    "          ldxr %[space], [%[addr]]                \n"
> > +                    "          cmp %[adj], %[space]                    \n"
> > +                    "          b.hi .Ldne%=                            \n"
> > +                    "          sevl                                    \n"
> > +                    ".Lrty%=:  wfe                                     \n"
> > +                    "          ldxr %[space], [%[addr]]                \n"
> > +                    "          cmp %[adj], %[space]                    \n"
> > +                    "          b.ls .Lrty%=                            \n"
> > +                    ".Ldne%=:                                          \n"
> > +                    : [space] "=&r"(space)
> > +                    : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
> > +                    : "memory");
> > +#else
> >         while ((uint64_t)txq->nb_sqb_bufs_adj <=
> >                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
> >                 ;
> > +#endif
> >  }
> >
> >  static __rte_always_inline int32_t
> > diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> > index a365cbe0ee..d0e8350ce2 100644
> > --- a/drivers/net/cnxk/cn10k_tx.h
> > +++ b/drivers/net/cnxk/cn10k_tx.h
> > @@ -102,27 +102,72 @@ cn10k_nix_tx_mbuf_validate(struct rte_mbuf *m, const uint32_t flags)
> >  }
> >
> >  static __plt_always_inline void
> > -cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
> > +cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, uint16_t req)
> >  {
> >         int64_t cached, refill;
> > +       int64_t pkts;
> >
> >  retry:
> > +#ifdef RTE_ARCH_ARM64
> > +
> > +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> > +                    "          ldxr %[pkts], [%[addr]]                 \n"
> > +                    "          tbz %[pkts], 63, .Ldne%=                \n"
> > +                    "          sevl                                    \n"
> > +                    ".Lrty%=:  wfe                                     \n"
> > +                    "          ldxr %[pkts], [%[addr]]                 \n"
> > +                    "          tbnz %[pkts], 63, .Lrty%=               \n"
> > +                    ".Ldne%=:                                          \n"
> > +                    : [pkts] "=&r"(pkts)
> > +                    : [addr] "r"(&txq->fc_cache_pkts)
> > +                    : "memory");
> > +#else
> > +       RTE_SET_USED(pkts);
> >         while (__atomic_load_n(&txq->fc_cache_pkts, __ATOMIC_RELAXED) < 0)
> >                 ;
> > +#endif
> >         cached = __atomic_fetch_sub(&txq->fc_cache_pkts, req, __ATOMIC_ACQUIRE) - req;
> >         /* Check if there is enough space, else update and retry. */
> > -       if (cached < 0) {
> > -               /* Check if we have space else retry. */
> > -               do {
> > -                       refill = txq->nb_sqb_bufs_adj -
> > -                                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
> > -                       refill = (refill << txq->sqes_per_sqb_log2) - refill;
> > -               } while (refill <= 0);
> > -               __atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
> > -                                         0, __ATOMIC_RELEASE,
> > -                                         __ATOMIC_RELAXED);
> > +       if (cached >= 0)
> > +               return;
> > +
> > +       /* Check if we have space else retry. */
> > +#ifdef RTE_ARCH_ARM64
> > +       int64_t val;
> > +
> > +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> > +                    "          ldxr %[val], [%[addr]]                  \n"
> > +                    "          sub %[val], %[adj], %[val]              \n"
> > +                    "          lsl %[refill], %[val], %[shft]          \n"
> > +                    "          sub %[refill], %[refill], %[val]        \n"
> > +                    "          sub %[refill], %[refill], %[sub]        \n"
> > +                    "          cmp %[refill], #0x0                     \n"
> > +                    "          b.ge .Ldne%=                            \n"
> > +                    "          sevl                                    \n"
> > +                    ".Lrty%=:  wfe                                     \n"
> > +                    "          ldxr %[val], [%[addr]]                  \n"
> > +                    "          sub %[val], %[adj], %[val]              \n"
> > +                    "          lsl %[refill], %[val], %[shft]          \n"
> > +                    "          sub %[refill], %[refill], %[val]        \n"
> > +                    "          sub %[refill], %[refill], %[sub]        \n"
> > +                    "          cmp %[refill], #0x0                     \n"
> > +                    "          b.lt .Lrty%=                            \n"
> > +                    ".Ldne%=:                                          \n"
> > +                    : [refill] "=&r"(refill), [val] "=&r" (val)
> > +                    : [addr] "r"(txq->fc_mem), [adj] "r"(txq->nb_sqb_bufs_adj),
> > +                      [shft] "r"(txq->sqes_per_sqb_log2), [sub] "r"(req)
> > +                    : "memory");
> > +#else
> > +       do {
> > +               refill = (txq->nb_sqb_bufs_adj - __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED));
> > +               refill = (refill << txq->sqes_per_sqb_log2) - refill;
> > +               refill -= req;
> > +       } while (refill < 0);
> > +#endif
> > +       if (!__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
> > +                                 0, __ATOMIC_RELEASE,
> > +                                 __ATOMIC_RELAXED))
> >                 goto retry;
> > -       }
> >  }
> >
> >  /* Function to determine no of tx subdesc required in case ext
> > @@ -283,10 +328,27 @@ static __rte_always_inline void
> >  cn10k_nix_sec_fc_wait_one(struct cn10k_eth_txq *txq)
> >  {
> >         uint64_t nb_desc = txq->cpt_desc;
> > -       uint64_t *fc = txq->cpt_fc;
> > -
> > -       while (nb_desc <= __atomic_load_n(fc, __ATOMIC_RELAXED))
> > +       uint64_t fc;
> > +
> > +#ifdef RTE_ARCH_ARM64
> > +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> > +                    "          ldxr %[space], [%[addr]]                \n"
> > +                    "          cmp %[nb_desc], %[space]                \n"
> > +                    "          b.hi .Ldne%=                            \n"
> > +                    "          sevl                                    \n"
> > +                    ".Lrty%=:  wfe                                     \n"
> > +                    "          ldxr %[space], [%[addr]]                \n"
> > +                    "          cmp %[nb_desc], %[space]                \n"
> > +                    "          b.ls .Lrty%=                            \n"
> > +                    ".Ldne%=:                                          \n"
> > +                    : [space] "=&r"(fc)
> > +                    : [nb_desc] "r"(nb_desc), [addr] "r"(txq->cpt_fc)
> > +                    : "memory");
> > +#else
> > +       RTE_SET_USED(fc);
> > +       while (nb_desc <= __atomic_load_n(txq->cpt_fc, __ATOMIC_RELAXED))
> >                 ;
> > +#endif
> >  }
> >
> >  static __rte_always_inline void
> > @@ -294,7 +356,7 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
> >  {
> >         int32_t nb_desc, val, newval;
> >         int32_t *fc_sw;
> > -       volatile uint64_t *fc;
> > +       uint64_t *fc;
> >
> >         /* Check if there is any CPT instruction to submit */
> >         if (!nb_pkts)
> > @@ -302,21 +364,59 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
> >
> >  again:
> >         fc_sw = txq->cpt_fc_sw;
> > -       val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_RELAXED) - nb_pkts;
> > +#ifdef RTE_ARCH_ARM64
> > +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> > +                    "          ldxr %w[pkts], [%[addr]]                \n"
> > +                    "          tbz %w[pkts], 31, .Ldne%=               \n"
> > +                    "          sevl                                    \n"
> > +                    ".Lrty%=:  wfe                                     \n"
> > +                    "          ldxr %w[pkts], [%[addr]]                \n"
> > +                    "          tbnz %w[pkts], 31, .Lrty%=              \n"
> > +                    ".Ldne%=:                                          \n"
> > +                    : [pkts] "=&r"(val)
> > +                    : [addr] "r"(fc_sw)
> > +                    : "memory");
> > +#else
> > +       /* Wait for primary core to refill FC. */
> > +       while (__atomic_load_n(fc_sw, __ATOMIC_RELAXED) < 0)
> > +               ;
> > +#endif
> > +
> > +       val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_ACQUIRE) - nb_pkts;
> >         if (likely(val >= 0))
> >                 return;
> >
> >         nb_desc = txq->cpt_desc;
> >         fc = txq->cpt_fc;
> > +#ifdef RTE_ARCH_ARM64
> > +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> > +                    "          ldxr %[refill], [%[addr]]               \n"
> > +                    "          sub %[refill], %[desc], %[refill]       \n"
> > +                    "          sub %[refill], %[refill], %[pkts]       \n"
> > +                    "          cmp %[refill], #0x0                     \n"
> > +                    "          b.ge .Ldne%=                            \n"
> > +                    "          sevl                                    \n"
> > +                    ".Lrty%=:  wfe                                     \n"
> > +                    "          ldxr %[refill], [%[addr]]               \n"
> > +                    "          sub %[refill], %[desc], %[refill]       \n"
> > +                    "          sub %[refill], %[refill], %[pkts]       \n"
> > +                    "          cmp %[refill], #0x0                     \n"
> > +                    "          b.lt .Lrty%=                            \n"
> > +                    ".Ldne%=:                                          \n"
> > +                    : [refill] "=&r"(newval)
> > +                    : [addr] "r"(fc), [desc] "r"(nb_desc), [pkts] "r"(nb_pkts)
> > +                    : "memory");
> > +#else
> >         while (true) {
> >                 newval = nb_desc - __atomic_load_n(fc, __ATOMIC_RELAXED);
> >                 newval -= nb_pkts;
> >                 if (newval >= 0)
> >                         break;
> >         }
> > +#endif
> >
> > -       if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false,
> > -                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED))
> > +       if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false, __ATOMIC_RELEASE,
> > +                                        __ATOMIC_RELAXED))
> >                 goto again;
> >  }
> >
> > @@ -3033,10 +3133,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
> >                 wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
> >                 wd.data[1] |= (uint64_t)(lmt_id + 16);
> >
> > -               if (flags & NIX_TX_VWQE_F)
> > -                       cn10k_nix_vwqe_wait_fc(txq,
> > -                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >>
> > -                                        1));
> > +               if (flags & NIX_TX_VWQE_F) {
> > +                       if (flags & NIX_TX_MULTI_SEG_F) {
> > +                               if (burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1) > 0)
> > +                                       cn10k_nix_vwqe_wait_fc(txq,
> > +                                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
> > +                       } else {
> > +                               cn10k_nix_vwqe_wait_fc(txq,
> > +                                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
> > +                       }
> > +               }
> >                 /* STEOR1 */
> >                 roc_lmt_submit_steorl(wd.data[1], pa);
> >         } else if (lnum) {
> > --
> > 2.25.1
> >
>


-- 

Patrick Robb

Technical Service Manager

UNH InterOperability Laboratory

21 Madbury Rd, Suite 100, Durham, NH 03824

www.iol.unh.edu

[-- Attachment #2: Type: text/html, Size: 22185 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread
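
The control flow of cn10k_nix_vwqe_wait_fc() that the series rewrites is a
reserve-then-refill scheme: every caller reserves req entries by subtracting
from a shared cached budget, and whichever caller drives the budget negative
recomputes the real space from fc_mem and publishes it with a release CAS
while the others wait. A simplified C11-atomics sketch of that scheme
(illustrative names; query_space() stands in for the fc_mem arithmetic shown
in the patch):

#include <stdatomic.h>
#include <stdint.h>

static void
reserve_entries(_Atomic int64_t *cache, int64_t req,
		int64_t (*query_space)(void))
{
	int64_t cached, refill;

retry:
	/* A negative budget means a refill is in flight; wait it out. */
	while (atomic_load_explicit(cache, memory_order_relaxed) < 0)
		;
	cached = atomic_fetch_sub_explicit(cache, req,
					   memory_order_acquire) - req;
	if (cached >= 0)
		return; /* reservation fit in the cached budget */
	/* We drove it negative: recompute real space until it covers us. */
	do
		refill = query_space() - req;
	while (refill < 0);
	/* Publish the refreshed budget; if we lost the race, start over. */
	if (!atomic_compare_exchange_strong_explicit(cache, &cached, refill,
						     memory_order_release,
						     memory_order_relaxed))
		goto retry;
}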

* RE: [EXT] Re: [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait
  2023-06-14 18:27       ` Patrick Robb
@ 2023-06-14 20:24         ` Pavan Nikhilesh Bhagavatula
  2023-06-15  5:49         ` Jerin Jacob Kollanukkaran
  1 sibling, 0 replies; 13+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2023-06-14 20:24 UTC (permalink / raw)
  To: Patrick Robb, Jerin Jacob
  Cc: Jerin Jacob Kollanukkaran, Shijith Thotton,
	Nithin Kumar Dabilpuram, Kiran Kumar Kokkilagadda,
	Sunil Kumar Kori, Satha Koteswara Rao Kottidi, dev

Thanks Patrick, 

I have sent out a fix.
https://mails.dpdk.org/archives/dev/2023-June/271209.html

Pavan. 

From: Patrick Robb <probb@iol.unh.edu> 
Sent: Wednesday, June 14, 2023 11:57 PM
To: Jerin Jacob <jerinjacobk@gmail.com>
Cc: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Shijith Thotton <sthotton@marvell.com>; Nithin Kumar Dabilpuram <ndabilpuram@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>; Sunil Kumar Kori <skori@marvell.com>; Satha Koteswara Rao Kottidi <skoteshwar@marvell.com>; dev@dpdk.org
Subject: [EXT] Re: [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait

External Email 
________________________________________
Hello Pavan and Jerin,

The Community Lab's CI testing failed this patch series on clang compile on ARM systems. That wasn't properly reported to Patchwork due to issues with our reporting process, which we are currently resolving. This updated report shows the failed compile. Apologies for the incomplete initial report.

http://mails.dpdk.org/archives/test-report/2023-June/411303.html

On Wed, Jun 14, 2023 at 6:33 AM Jerin Jacob <mailto:jerinjacobk@gmail.com> wrote:
On Tue, Jun 13, 2023 at 2:56 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use WFE in Tx path when waiting for space in the Tx queue.
> Depending upon the Tx queue contention and size, WFE will
> reduce the cache pressure and power consumption.
> In multi-core scenarios we have observed up to 8W power reduction.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Series Applied to dpdk-next-net-eventdev/for-main. Thanks

> ---
>  drivers/event/cnxk/cn10k_tx_worker.h |  18 ++++
>  drivers/net/cnxk/cn10k_tx.h          | 152 +++++++++++++++++++++++----
>  2 files changed, 147 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
> index b6c9bb1d26..dea6cdcde2 100644
> --- a/drivers/event/cnxk/cn10k_tx_worker.h
> +++ b/drivers/event/cnxk/cn10k_tx_worker.h
> @@ -24,9 +24,27 @@ cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
>  static __rte_always_inline void
>  cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
>  {
> +#ifdef RTE_ARCH_ARM64
> +       uint64_t space;
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[adj], %[space]                    \n"
> +                    "          b.hi .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[adj], %[space]                    \n"
> +                    "          b.ls .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [space] "=&r"(space)
> +                    : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
> +                    : "memory");
> +#else
>         while ((uint64_t)txq->nb_sqb_bufs_adj <=
>                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
>                 ;
> +#endif
>  }
>
>  static __rte_always_inline int32_t
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index a365cbe0ee..d0e8350ce2 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -102,27 +102,72 @@ cn10k_nix_tx_mbuf_validate(struct rte_mbuf *m, const uint32_t flags)
>  }
>
>  static __plt_always_inline void
> -cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
> +cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, uint16_t req)
>  {
>         int64_t cached, refill;
> +       int64_t pkts;
>
>  retry:
> +#ifdef RTE_ARCH_ARM64
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[pkts], [%[addr]]                 \n"
> +                    "          tbz %[pkts], 63, .Ldne%=                \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[pkts], [%[addr]]                 \n"
> +                    "          tbnz %[pkts], 63, .Lrty%=               \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [pkts] "=&r"(pkts)
> +                    : [addr] "r"(&txq->fc_cache_pkts)
> +                    : "memory");
> +#else
> +       RTE_SET_USED(pkts);
>         while (__atomic_load_n(&txq->fc_cache_pkts, __ATOMIC_RELAXED) < 0)
>                 ;
> +#endif
>         cached = __atomic_fetch_sub(&txq->fc_cache_pkts, req, __ATOMIC_ACQUIRE) - req;
>         /* Check if there is enough space, else update and retry. */
> -       if (cached < 0) {
> -               /* Check if we have space else retry. */
> -               do {
> -                       refill = txq->nb_sqb_bufs_adj -
> -                                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
> -                       refill = (refill << txq->sqes_per_sqb_log2) - refill;
> -               } while (refill <= 0);
> -               __atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
> -                                         0, __ATOMIC_RELEASE,
> -                                         __ATOMIC_RELAXED);
> +       if (cached >= 0)
> +               return;
> +
> +       /* Check if we have space else retry. */
> +#ifdef RTE_ARCH_ARM64
> +       int64_t val;
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[val], [%[addr]]                  \n"
> +                    "          sub %[val], %[adj], %[val]              \n"
> +                    "          lsl %[refill], %[val], %[shft]          \n"
> +                    "          sub %[refill], %[refill], %[val]        \n"
> +                    "          sub %[refill], %[refill], %[sub]        \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.ge .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[val], [%[addr]]                  \n"
> +                    "          sub %[val], %[adj], %[val]              \n"
> +                    "          lsl %[refill], %[val], %[shft]          \n"
> +                    "          sub %[refill], %[refill], %[val]        \n"
> +                    "          sub %[refill], %[refill], %[sub]        \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.lt .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [refill] "=&r"(refill), [val] "=&r" (val)
> +                    : [addr] "r"(txq->fc_mem), [adj] "r"(txq->nb_sqb_bufs_adj),
> +                      [shft] "r"(txq->sqes_per_sqb_log2), [sub] "r"(req)
> +                    : "memory");
> +#else
> +       do {
> +               refill = (txq->nb_sqb_bufs_adj - __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED));
> +               refill = (refill << txq->sqes_per_sqb_log2) - refill;
> +               refill -= req;
> +       } while (refill < 0);
> +#endif
> +       if (!__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
> +                                 0, __ATOMIC_RELEASE,
> +                                 __ATOMIC_RELAXED))
>                 goto retry;
> -       }
>  }
>
>  /* Function to determine no of tx subdesc required in case ext
> @@ -283,10 +328,27 @@ static __rte_always_inline void
>  cn10k_nix_sec_fc_wait_one(struct cn10k_eth_txq *txq)
>  {
>         uint64_t nb_desc = txq->cpt_desc;
> -       uint64_t *fc = txq->cpt_fc;
> -
> -       while (nb_desc <= __atomic_load_n(fc, __ATOMIC_RELAXED))
> +       uint64_t fc;
> +
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[nb_desc], %[space]                \n"
> +                    "          b.hi .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[nb_desc], %[space]                \n"
> +                    "          b.ls .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [space] "=&r"(fc)
> +                    : [nb_desc] "r"(nb_desc), [addr] "r"(txq->cpt_fc)
> +                    : "memory");
> +#else
> +       RTE_SET_USED(fc);
> +       while (nb_desc <= __atomic_load_n(txq->cpt_fc, __ATOMIC_RELAXED))
>                 ;
> +#endif
>  }
>
>  static __rte_always_inline void
> @@ -294,7 +356,7 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
>  {
>         int32_t nb_desc, val, newval;
>         int32_t *fc_sw;
> -       volatile uint64_t *fc;
> +       uint64_t *fc;
>
>         /* Check if there is any CPT instruction to submit */
>         if (!nb_pkts)
> @@ -302,21 +364,59 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
>
>  again:
>         fc_sw = txq->cpt_fc_sw;
> -       val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_RELAXED) - nb_pkts;
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %w[pkts], [%[addr]]                \n"
> +                    "          tbz %w[pkts], 31, .Ldne%=               \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %w[pkts], [%[addr]]                \n"
> +                    "          tbnz %w[pkts], 31, .Lrty%=              \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [pkts] "=&r"(val)
> +                    : [addr] "r"(fc_sw)
> +                    : "memory");
> +#else
> +       /* Wait for primary core to refill FC. */
> +       while (__atomic_load_n(fc_sw, __ATOMIC_RELAXED) < 0)
> +               ;
> +#endif
> +
> +       val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_ACQUIRE) - nb_pkts;
>         if (likely(val >= 0))
>                 return;
>
>         nb_desc = txq->cpt_desc;
>         fc = txq->cpt_fc;
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[refill], [%[addr]]               \n"
> +                    "          sub %[refill], %[desc], %[refill]       \n"
> +                    "          sub %[refill], %[refill], %[pkts]       \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.ge .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[refill], [%[addr]]               \n"
> +                    "          sub %[refill], %[desc], %[refill]       \n"
> +                    "          sub %[refill], %[refill], %[pkts]       \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.lt .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [refill] "=&r"(newval)
> +                    : [addr] "r"(fc), [desc] "r"(nb_desc), [pkts] "r"(nb_pkts)
> +                    : "memory");
> +#else
>         while (true) {
>                 newval = nb_desc - __atomic_load_n(fc, __ATOMIC_RELAXED);
>                 newval -= nb_pkts;
>                 if (newval >= 0)
>                         break;
>         }
> +#endif
>
> -       if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false,
> -                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED))
> +       if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false, __ATOMIC_RELEASE,
> +                                        __ATOMIC_RELAXED))
>                 goto again;
>  }
>
> @@ -3033,10 +3133,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
>                 wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
>                 wd.data[1] |= (uint64_t)(lmt_id + 16);
>
> -               if (flags & NIX_TX_VWQE_F)
> -                       cn10k_nix_vwqe_wait_fc(txq,
> -                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >>
> -                                        1));
> +               if (flags & NIX_TX_VWQE_F) {
> +                       if (flags & NIX_TX_MULTI_SEG_F) {
> +                               if (burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1) > 0)
> +                                       cn10k_nix_vwqe_wait_fc(txq,
> +                                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
> +                       } else {
> +                               cn10k_nix_vwqe_wait_fc(txq,
> +                                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
> +                       }
> +               }
>                 /* STEOR1 */
>                 roc_lmt_submit_steorl(wd.data[1], pa);
>         } else if (lnum) {
> --
> 2.25.1
>



-- 
Patrick Robb
Technical Service Manager
UNH InterOperability Laboratory
21 Madbury Rd, Suite 100, Durham, NH 03824
www.iol.unh.edu



^ permalink raw reply	[flat|nested] 13+ messages in thread
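
One more detail worth calling out from cn10k_nix_sec_fc_wait(): the tbz/tbnz
forms test bit 31, i.e. the sign bit of the int32_t counter, so the WFE loop
is simply "sleep until the value goes non-negative". A stand-alone sketch of
that wait (illustrative name and numeric labels; the driver version carries
PLT_CPU_FEATURE_PREAMBLE and %= local labels):

#include <stdint.h>

/* Park until *addr >= 0, returning the observed value. TBZ exits early
 * when bit 31 (the sign bit) is already clear; TBNZ loops in WFE while
 * it is still set.
 */
static inline int32_t
wait_non_negative(const int32_t *addr)
{
	int32_t v;

#ifdef __aarch64__
	asm volatile("	ldxr %w[v], [%[ad]]	\n"
		     "	tbz %w[v], 31, 1f	\n"
		     "	sevl			\n"
		     "2:	wfe		\n"
		     "	ldxr %w[v], [%[ad]]	\n"
		     "	tbnz %w[v], 31, 2b	\n"
		     "1:			\n"
		     : [v] "=&r"(v)
		     : [ad] "r"(addr)
		     : "memory");
#else
	while ((v = __atomic_load_n(addr, __ATOMIC_RELAXED)) < 0)
		;
#endif
	return v;
}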

* Re: [EXT] Re: [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait
  2023-06-14 18:27       ` Patrick Robb
  2023-06-14 20:24         ` [EXT] " Pavan Nikhilesh Bhagavatula
@ 2023-06-15  5:49         ` Jerin Jacob Kollanukkaran
  1 sibling, 0 replies; 13+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2023-06-15  5:49 UTC (permalink / raw)
  To: Patrick Robb, Jerin Jacob
  Cc: Pavan Nikhilesh Bhagavatula, Shijith Thotton,
	Nithin Kumar Dabilpuram, Kiran Kumar Kokkilagadda,
	Sunil Kumar Kori, Satha Koteswara Rao Kottidi, dev

[-- Attachment #1: Type: text/plain, Size: 16814 bytes --]

Fix merged.
________________________________
From: Patrick Robb <probb@iol.unh.edu>
Sent: Wednesday, June 14, 2023 11:57 PM
To: Jerin Jacob <jerinjacobk@gmail.com>
Cc: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Shijith Thotton <sthotton@marvell.com>; Nithin Kumar Dabilpuram <ndabilpuram@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>; Sunil Kumar Kori <skori@marvell.com>; Satha Koteswara Rao Kottidi <skoteshwar@marvell.com>; dev@dpdk.org <dev@dpdk.org>
Subject: [EXT] Re: [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait

External Email
________________________________
Hello Pavan and Jerin,

The Community Lab's CI testing failed this patch series on clang compile on ARM systems. That wasn't properly reported to Patchwork due to issues with our reporting process, which we are currently resolving. This updated report shows the failed compile. Apologies for the incomplete initial report.

http://mails.dpdk.org/archives/test-report/2023-June/411303.html

On Wed, Jun 14, 2023 at 6:33 AM Jerin Jacob <jerinjacobk@gmail.com> wrote:
On Tue, Jun 13, 2023 at 2:56 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use WFE in Tx path when waiting for space in the Tx queue.
> Depending upon the Tx queue contention and size, WFE will
> reduce the cache pressure and power consumption.
> In multi-core scenarios we have observed up to 8W power reduction.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Series Applied to dpdk-next-net-eventdev/for-main. Thanks

> ---
>  drivers/event/cnxk/cn10k_tx_worker.h |  18 ++++
>  drivers/net/cnxk/cn10k_tx.h          | 152 +++++++++++++++++++++++----
>  2 files changed, 147 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
> index b6c9bb1d26..dea6cdcde2 100644
> --- a/drivers/event/cnxk/cn10k_tx_worker.h
> +++ b/drivers/event/cnxk/cn10k_tx_worker.h
> @@ -24,9 +24,27 @@ cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
>  static __rte_always_inline void
>  cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
>  {
> +#ifdef RTE_ARCH_ARM64
> +       uint64_t space;
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[adj], %[space]                    \n"
> +                    "          b.hi .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[adj], %[space]                    \n"
> +                    "          b.ls .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [space] "=&r"(space)
> +                    : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
> +                    : "memory");
> +#else
>         while ((uint64_t)txq->nb_sqb_bufs_adj <=
>                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
>                 ;
> +#endif
>  }
>
>  static __rte_always_inline int32_t
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index a365cbe0ee..d0e8350ce2 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -102,27 +102,72 @@ cn10k_nix_tx_mbuf_validate(struct rte_mbuf *m, const uint32_t flags)
>  }
>
>  static __plt_always_inline void
> -cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
> +cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, uint16_t req)
>  {
>         int64_t cached, refill;
> +       int64_t pkts;
>
>  retry:
> +#ifdef RTE_ARCH_ARM64
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[pkts], [%[addr]]                 \n"
> +                    "          tbz %[pkts], 63, .Ldne%=                \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[pkts], [%[addr]]                 \n"
> +                    "          tbnz %[pkts], 63, .Lrty%=               \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [pkts] "=&r"(pkts)
> +                    : [addr] "r"(&txq->fc_cache_pkts)
> +                    : "memory");
> +#else
> +       RTE_SET_USED(pkts);
>         while (__atomic_load_n(&txq->fc_cache_pkts, __ATOMIC_RELAXED) < 0)
>                 ;
> +#endif
>         cached = __atomic_fetch_sub(&txq->fc_cache_pkts, req, __ATOMIC_ACQUIRE) - req;
>         /* Check if there is enough space, else update and retry. */
> -       if (cached < 0) {
> -               /* Check if we have space else retry. */
> -               do {
> -                       refill = txq->nb_sqb_bufs_adj -
> -                                __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
> -                       refill = (refill << txq->sqes_per_sqb_log2) - refill;
> -               } while (refill <= 0);
> -               __atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
> -                                         0, __ATOMIC_RELEASE,
> -                                         __ATOMIC_RELAXED);
> +       if (cached >= 0)
> +               return;
> +
> +       /* Check if we have space else retry. */
> +#ifdef RTE_ARCH_ARM64
> +       int64_t val;
> +
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[val], [%[addr]]                  \n"
> +                    "          sub %[val], %[adj], %[val]              \n"
> +                    "          lsl %[refill], %[val], %[shft]          \n"
> +                    "          sub %[refill], %[refill], %[val]        \n"
> +                    "          sub %[refill], %[refill], %[sub]        \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.ge .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[val], [%[addr]]                  \n"
> +                    "          sub %[val], %[adj], %[val]              \n"
> +                    "          lsl %[refill], %[val], %[shft]          \n"
> +                    "          sub %[refill], %[refill], %[val]        \n"
> +                    "          sub %[refill], %[refill], %[sub]        \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.lt .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [refill] "=&r"(refill), [val] "=&r" (val)
> +                    : [addr] "r"(txq->fc_mem), [adj] "r"(txq->nb_sqb_bufs_adj),
> +                      [shft] "r"(txq->sqes_per_sqb_log2), [sub] "r"(req)
> +                    : "memory");
> +#else
> +       do {
> +               refill = (txq->nb_sqb_bufs_adj - __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED));
> +               refill = (refill << txq->sqes_per_sqb_log2) - refill;
> +               refill -= req;
> +       } while (refill < 0);
> +#endif
> +       if (!__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
> +                                 0, __ATOMIC_RELEASE,
> +                                 __ATOMIC_RELAXED))
>                 goto retry;
> -       }
>  }
>
>  /* Function to determine no of tx subdesc required in case ext
> @@ -283,10 +328,27 @@ static __rte_always_inline void
>  cn10k_nix_sec_fc_wait_one(struct cn10k_eth_txq *txq)
>  {
>         uint64_t nb_desc = txq->cpt_desc;
> -       uint64_t *fc = txq->cpt_fc;
> -
> -       while (nb_desc <= __atomic_load_n(fc, __ATOMIC_RELAXED))
> +       uint64_t fc;
> +
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[nb_desc], %[space]                \n"
> +                    "          b.hi .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[space], [%[addr]]                \n"
> +                    "          cmp %[nb_desc], %[space]                \n"
> +                    "          b.ls .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [space] "=&r"(fc)
> +                    : [nb_desc] "r"(nb_desc), [addr] "r"(txq->cpt_fc)
> +                    : "memory");
> +#else
> +       RTE_SET_USED(fc);
> +       while (nb_desc <= __atomic_load_n(txq->cpt_fc, __ATOMIC_RELAXED))
>                 ;
> +#endif
>  }
>
>  static __rte_always_inline void
> @@ -294,7 +356,7 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
>  {
>         int32_t nb_desc, val, newval;
>         int32_t *fc_sw;
> -       volatile uint64_t *fc;
> +       uint64_t *fc;
>
>         /* Check if there is any CPT instruction to submit */
>         if (!nb_pkts)
> @@ -302,21 +364,59 @@ cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
>
>  again:
>         fc_sw = txq->cpt_fc_sw;
> -       val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_RELAXED) - nb_pkts;
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %w[pkts], [%[addr]]                \n"
> +                    "          tbz %w[pkts], 31, .Ldne%=               \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %w[pkts], [%[addr]]                \n"
> +                    "          tbnz %w[pkts], 31, .Lrty%=              \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [pkts] "=&r"(val)
> +                    : [addr] "r"(fc_sw)
> +                    : "memory");
> +#else
> +       /* Wait for primary core to refill FC. */
> +       while (__atomic_load_n(fc_sw, __ATOMIC_RELAXED) < 0)
> +               ;
> +#endif
> +
> +       val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_ACQUIRE) - nb_pkts;
>         if (likely(val >= 0))
>                 return;
>
>         nb_desc = txq->cpt_desc;
>         fc = txq->cpt_fc;
> +#ifdef RTE_ARCH_ARM64
> +       asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +                    "          ldxr %[refill], [%[addr]]               \n"
> +                    "          sub %[refill], %[desc], %[refill]       \n"
> +                    "          sub %[refill], %[refill], %[pkts]       \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.ge .Ldne%=                            \n"
> +                    "          sevl                                    \n"
> +                    ".Lrty%=:  wfe                                     \n"
> +                    "          ldxr %[refill], [%[addr]]               \n"
> +                    "          sub %[refill], %[desc], %[refill]       \n"
> +                    "          sub %[refill], %[refill], %[pkts]       \n"
> +                    "          cmp %[refill], #0x0                     \n"
> +                    "          b.lt .Lrty%=                            \n"
> +                    ".Ldne%=:                                          \n"
> +                    : [refill] "=&r"(newval)
> +                    : [addr] "r"(fc), [desc] "r"(nb_desc), [pkts] "r"(nb_pkts)
> +                    : "memory");
> +#else
>         while (true) {
>                 newval = nb_desc - __atomic_load_n(fc, __ATOMIC_RELAXED);
>                 newval -= nb_pkts;
>                 if (newval >= 0)
>                         break;
>         }
> +#endif
>
> -       if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false,
> -                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED))
> +       if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false, __ATOMIC_RELEASE,
> +                                        __ATOMIC_RELAXED))
>                 goto again;
>  }
>
> @@ -3033,10 +3133,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
>                 wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
>                 wd.data[1] |= (uint64_t)(lmt_id + 16);
>
> -               if (flags & NIX_TX_VWQE_F)
> -                       cn10k_nix_vwqe_wait_fc(txq,
> -                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >>
> -                                        1));
> +               if (flags & NIX_TX_VWQE_F) {
> +                       if (flags & NIX_TX_MULTI_SEG_F) {
> +                               if (burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1) > 0)
> +                                       cn10k_nix_vwqe_wait_fc(txq,
> +                                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
> +                       } else {
> +                               cn10k_nix_vwqe_wait_fc(txq,
> +                                               burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
> +                       }
> +               }
>                 /* STEOR1 */
>                 roc_lmt_submit_steorl(wd.data[1], pa);
>         } else if (lnum) {
> --
> 2.25.1
>


--

Patrick Robb

Technical Service Manager

UNH InterOperability Laboratory

21 Madbury Rd, Suite 100, Durham, NH 03824

www.iol.unh.edu



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait
  2023-06-13  9:25   ` [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait pbhagavatula
  2023-06-14 10:33     ` Jerin Jacob
@ 2023-06-15 15:28     ` Stephen Hemminger
  2023-06-16  6:46       ` [EXT] " Pavan Nikhilesh Bhagavatula
  1 sibling, 1 reply; 13+ messages in thread
From: Stephen Hemminger @ 2023-06-15 15:28 UTC (permalink / raw)
  To: pbhagavatula
  Cc: jerinj, Shijith Thotton, Nithin Dabilpuram, Kiran Kumar K,
	Sunil Kumar Kori, Satha Rao, dev

On Tue, 13 Jun 2023 14:55:48 +0530
<pbhagavatula@marvell.com> wrote:

>  static __rte_always_inline void
>  cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
>  {
> +#ifdef RTE_ARCH_ARM64
> +	uint64_t space;
> +
> +	asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +		     "		ldxr %[space], [%[addr]]		\n"
> +		     "		cmp %[adj], %[space] 			\n"
> +		     "		b.hi .Ldne%=				\n"
> +		     "		sevl					\n"
> +		     ".Lrty%=:	wfe					\n"
> +		     "		ldxr %[space], [%[addr]]		\n"
> +		     "		cmp %[adj], %[space]			\n"
> +		     "		b.ls .Lrty%=				\n"
> +		     ".Ldne%=:						\n"
> +		     : [space] "=&r"(space)
> +		     : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
> +		     : "memory");
> +#else
>  	while ((uint64_t)txq->nb_sqb_bufs_adj <=
>  	       __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
>  		;
> +#endif
>  }

Rather than introducing assembly here, please extend the existing rte_pause
functions; other drivers could then benefit, and it would fit the
existing WFE usage.

Something like:

static __rte_always_inline void
rte_wait_until_le_32(volatile uint32_t *addr, uint32_t expected,
		int memorder);

Direct assembly in drivers is hard to maintain and should be forbidden.
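
For illustration, a minimal generic sketch of such a helper, modeled on
the existing rte_wait_until_equal_*() functions in rte_pause.h
(rte_wait_until_le_32() is only the name proposed above, not a current
DPDK API, and this is a sketch rather than the eventual implementation):

#include <assert.h>
#include <stdint.h>
#include <rte_common.h>
#include <rte_pause.h>

/*
 * Proposed helper (not in rte_pause.h today): wait until the value at
 * addr is less than or equal to expected. The generic fallback spins
 * with rte_pause(); an ARM64 override could carry the LDXR/SEVL/WFE
 * sequence from the patch in one maintained place.
 */
static __rte_always_inline void
rte_wait_until_le_32(volatile uint32_t *addr, uint32_t expected,
		int memorder)
{
	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);

	while (__atomic_load_n(addr, memorder) > expected)
		rte_pause();
}

A 64-bit variant along the same lines would cover the fc_mem waits in
this patch, e.g. waiting until *fc_mem is at most nb_sqb_bufs_adj - 1.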

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [EXT] Re: [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait
  2023-06-15 15:28     ` Stephen Hemminger
@ 2023-06-16  6:46       ` Pavan Nikhilesh Bhagavatula
  0 siblings, 0 replies; 13+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2023-06-16  6:46 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jerin Jacob Kollanukkaran, Shijith Thotton,
	Nithin Kumar Dabilpuram, Kiran Kumar Kokkilagadda,
	Sunil Kumar Kori, Satha Koteswara Rao Kottidi, dev

> On Tue, 13 Jun 2023 14:55:48 +0530
> <pbhagavatula@marvell.com> wrote:
> 
> >  static __rte_always_inline void
> >  cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
> >  {
> > +#ifdef RTE_ARCH_ARM64
> > +	uint64_t space;
> > +
> > +	asm volatile(PLT_CPU_FEATURE_PREAMBLE
> > +		     "		ldxr %[space], [%[addr]]		\n"
> > +		     "		cmp %[adj], %[space] 			\n"
> > +		     "		b.hi .Ldne%=				\n"
> > +		     "		sevl					\n"
> > +		     ".Lrty%=:	wfe					\n"
> > +		     "		ldxr %[space], [%[addr]]		\n"
> > +		     "		cmp %[adj], %[space]			\n"
> > +		     "		b.ls .Lrty%=				\n"
> > +		     ".Ldne%=:						\n"
> > +		     : [space] "=&r"(space)
> > +		     : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
> > +		     : "memory");
> > +#else
> >  	while ((uint64_t)txq->nb_sqb_bufs_adj <=
> >  	       __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
> >  		;
> > +#endif
> >  }
> 
> Rather than introduce assembly here, please extend existing rte_pause
> functions and then other drivers could benefit and it would fit
> existing WFE usage.
> 

Yes, a future patch is planned to replace this and other occurrences with
RTE_WAIT_UNTIL_MASKED.
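
For illustration, a rough sketch of what that replacement could look
like with the existing RTE_WAIT_UNTIL_MASKED() macro from rte_pause.h
(the wrapper name and raw-pointer signature are assumptions for the
example, not the planned patch; on ARM64 builds with RTE_ARM_USE_WFE
enabled the macro already expands to a WFE-based wait):

#include <stdint.h>
#include <rte_pause.h>

/*
 * Example only: wait until the hardware FC count at fc_mem drops
 * below the adjusted SQB threshold, mirroring the condition in
 * cn10k_sso_txq_fc_wait(). The all-ones mask makes this a plain
 * "wait until *fc_mem < thresh" over the full 64-bit value.
 */
static inline void
sqb_fc_wait(uint64_t *fc_mem, uint64_t thresh)
{
	RTE_WAIT_UNTIL_MASKED(fc_mem, UINT64_MAX, <, thresh,
			      __ATOMIC_RELAXED);
}

This keeps the WFE-versus-rte_pause() choice inside rte_pause.h, which
is the maintainability point raised above.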

> Something like:
> 
> static __rte_always_inline void
> rte_wait_until_le_32(volatile uint32_t *addr, uint32_t expected,
> 		int memorder);
> 
> Direct assembly in drivers is hard to maintain and should be forbidden.


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2023-06-16  6:46 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-16 14:37 [PATCH 1/3] event/cnxk: align TX queue buffer adjustment pbhagavatula
2023-05-16 14:37 ` [PATCH 2/3] event/cnxk: use local labels in asm intrinsic pbhagavatula
2023-05-16 14:37 ` [PATCH 3/3] event/cnxk: use WFE in Tx fc wait pbhagavatula
2023-06-12 15:52 ` [PATCH 1/3] event/cnxk: align TX queue buffer adjustment Jerin Jacob
2023-06-13  9:25 ` [PATCH v2 " pbhagavatula
2023-06-13  9:25   ` [PATCH v2 2/3] event/cnxk: use local labels in asm intrinsic pbhagavatula
2023-06-13  9:25   ` [PATCH v2 3/3] event/cnxk: use WFE in Tx fc wait pbhagavatula
2023-06-14 10:33     ` Jerin Jacob
2023-06-14 18:27       ` Patrick Robb
2023-06-14 20:24         ` [EXT] " Pavan Nikhilesh Bhagavatula
2023-06-15  5:49         ` Jerin Jacob Kollanukkaran
2023-06-15 15:28     ` Stephen Hemminger
2023-06-16  6:46       ` [EXT] " Pavan Nikhilesh Bhagavatula

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).