DPDK patches and discussions
* [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue
@ 2022-01-19  7:13 pbhagavatula
  2022-01-19  7:13 ` [PATCH v2 2/4] event/cnxk: store and reuse workslot status pbhagavatula
                   ` (5 more replies)
  0 siblings, 6 replies; 16+ messages in thread
From: pbhagavatula @ 2022-01-19  7:13 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Ankur Dwivedi, Anoob Joseph, Tejasree Kondoj,
	Pavan Nikhilesh, Shijith Thotton
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

The Tx command is prepared based on the enabled offloads and stored in
the Tx queue structure during the tx_queue_setup phase.
In the fast path the command is then copied from the Tx queue to the
LMT line for every packet.
Since the command contents are mostly constants, move the command
preparation to the fast path and avoid accessing Tx queue memory.
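
For illustration, a minimal sketch of the idea (not part of this patch,
and assuming the cnxk driver macros NIX_SUBDC_*, BIT_ULL() and the
NIX_TX_* offload flags): the constant sub-descriptors are rebuilt in the
fast path from the compile-time offload flags, so only send_hdr_w0 is
read from the Tx queue. The encodings mirror the cn9k/cn10k skeleton
helpers added below.

static inline void
sketch_nix_tx_skeleton(uint64_t *cmd, const uint64_t send_hdr_w0,
		       const uint16_t flags)
{
	cmd[0] = send_hdr_w0; /* SEND_HDR w0 prepared at queue setup */
	cmd[1] = 0;

	if (flags & NIX_TX_NEED_EXT_HDR) {
		/* EXT sub-descriptor; tstamp bit only when requested */
		cmd[2] = (NIX_SUBDC_EXT << 60) |
			 ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? BIT_ULL(15) : 0);
		cmd[3] = 0;
		cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48); /* SG, one seg */
	} else {
		cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
	}
}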

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 Depends-on: Series-20922
 Depends-on: Series-20928

 v2 Changes:
 - Rebase.
 - Fix incorrect use of RoC API

 drivers/common/cnxk/roc_io.h             |  33 ++++-
 drivers/common/cnxk/roc_io_generic.h     |  15 ++
 drivers/crypto/cnxk/cn9k_cryptodev_ops.c |   2 +-
 drivers/crypto/cnxk/cn9k_ipsec.c         |   2 +-
 drivers/event/cnxk/cn10k_eventdev.c      |  26 +++-
 drivers/event/cnxk/cn10k_worker.h        |  89 ++++++------
 drivers/event/cnxk/cn9k_eventdev.c       |  33 +++--
 drivers/event/cnxk/cn9k_worker.h         |  64 ++++-----
 drivers/event/cnxk/cnxk_eventdev.h       |  13 +-
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 141 ++++++++++++++++---
 drivers/net/cnxk/cn10k_ethdev.c          |  24 +---
 drivers/net/cnxk/cn10k_ethdev.h          |   3 +-
 drivers/net/cnxk/cn10k_tx.h              | 167 ++++++++++++-----------
 drivers/net/cnxk/cn9k_ethdev.c           |  36 ++---
 drivers/net/cnxk/cn9k_ethdev.h           |   3 +-
 drivers/net/cnxk/cn9k_tx.h               | 135 +++++++++++-------
 16 files changed, 479 insertions(+), 307 deletions(-)

diff --git a/drivers/common/cnxk/roc_io.h b/drivers/common/cnxk/roc_io.h
index fe5f7f46d0..ea7fcd4e9a 100644
--- a/drivers/common/cnxk/roc_io.h
+++ b/drivers/common/cnxk/roc_io.h
@@ -152,13 +152,36 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
 	dst128[1] = src128[1];
 	/* lmtext receives following value:
 	 * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
-	 * 2: NIX_SUBDC_EXT + NIX_SUBDC_MEM i.e. tstamp case
 	 */
-	if (lmtext) {
+	if (lmtext)
+		dst128[2] = src128[2];
+}
+
+static __plt_always_inline void
+roc_lmt_mov64(void *out, const void *in)
+{
+	volatile const __uint128_t *src128 = (const __uint128_t *)in;
+	volatile __uint128_t *dst128 = (__uint128_t *)out;
+
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+	dst128[3] = src128[3];
+}
+
+static __plt_always_inline void
+roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
+{
+	const __uint128_t *src128 = (const __uint128_t *)in;
+	__uint128_t *dst128 = (__uint128_t *)out;
+
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	/* lmtext receives following value:
+	 * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
+	 */
+	if (lmtext)
 		dst128[2] = src128[2];
-		if (lmtext > 1)
-			dst128[3] = src128[3];
-	}
 }

 static __plt_always_inline void
diff --git a/drivers/common/cnxk/roc_io_generic.h b/drivers/common/cnxk/roc_io_generic.h
index ceaa3a38d8..af42e66345 100644
--- a/drivers/common/cnxk/roc_io_generic.h
+++ b/drivers/common/cnxk/roc_io_generic.h
@@ -97,6 +97,21 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
 	memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
 }

+static __plt_always_inline void
+roc_lmt_mov64(void *out, const void *in)
+{
+	PLT_SET_USED(out);
+	PLT_SET_USED(in);
+}
+
+static __plt_always_inline void
+roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
+{
+	PLT_SET_USED(in);
+	PLT_SET_USED(lmtext);
+	memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
+}
+
 static __plt_always_inline void
 roc_lmt_mov_seg(void *out, const void *in, const uint16_t segdw)
 {
diff --git a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
index 449208da8f..53e427a3c1 100644
--- a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
+++ b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
@@ -161,7 +161,7 @@ cn9k_cpt_inst_submit(struct cpt_inst_s *inst, uint64_t lmtline,

 	do {
 		/* Copy CPT command to LMTLINE */
-		roc_lmt_mov((void *)lmtline, inst, 2);
+		roc_lmt_mov64((void *)lmtline, inst);

 		/*
 		 * Make sure compiler does not reorder memcpy and ldeor.
diff --git a/drivers/crypto/cnxk/cn9k_ipsec.c b/drivers/crypto/cnxk/cn9k_ipsec.c
index a81130b244..117e54cae7 100644
--- a/drivers/crypto/cnxk/cn9k_ipsec.c
+++ b/drivers/crypto/cnxk/cn9k_ipsec.c
@@ -53,7 +53,7 @@ cn9k_cpt_enq_sa_write(struct cn9k_ipsec_sa *sa, struct cnxk_cpt_qp *qp,

 	do {
 		/* Copy CPT command to LMTLINE */
-		roc_lmt_mov((void *)lmtline, &inst, 2);
+		roc_lmt_mov64((void *)lmtline, &inst);
 		lmt_status = roc_lmt_submit_ldeor(io_addr);
 	} while (lmt_status == 0);

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 70e2aa5555..c57e45a118 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -50,7 +50,6 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
 	/* First cache line is reserved for cookie */
 	ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
 	ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
-	ws->tx_base = ws->base;
 	ws->hws_id = port_id;
 	ws->swtag_req = 0;
 	ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
@@ -259,15 +258,13 @@ cn10k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 			ws_cookie,
 			sizeof(struct cnxk_sso_hws_cookie) +
 				sizeof(struct cn10k_sso_hws) +
-				(sizeof(uint64_t) * (dev->max_port_id + 1) *
-				 RTE_MAX_QUEUES_PER_PORT),
+				dev->tx_adptr_data_sz,
 			RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 		if (ws_cookie == NULL)
 			return -ENOMEM;
 		ws = RTE_PTR_ADD(ws_cookie, sizeof(struct cnxk_sso_hws_cookie));
 		memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
-		       sizeof(uint64_t) * (dev->max_port_id + 1) *
-			       RTE_MAX_QUEUES_PER_PORT);
+		       dev->tx_adptr_data_sz);
 		event_dev->data->ports[i] = ws;
 	}

@@ -727,16 +724,35 @@ cn10k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
 			       const struct rte_eth_dev *eth_dev,
 			       int32_t tx_queue_id)
 {
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint64_t tx_offloads;
 	int rc;

 	RTE_SET_USED(id);
 	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
 	if (rc < 0)
 		return rc;
+
+	/* Can't enable tstamp if all the ports don't have it enabled. */
+	tx_offloads = cnxk_eth_dev->tx_offload_flags;
+	if (dev->tx_adptr_configured) {
+		uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+		uint8_t tstmp_ena =
+			!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+
+		if (tstmp_ena && !tstmp_req)
+			dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+		else if (!tstmp_ena && tstmp_req)
+			tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+	}
+
+	dev->tx_offloads |= tx_offloads;
 	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
 	if (rc < 0)
 		return rc;
 	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+	dev->tx_adptr_configured = 1;

 	return 0;
 }
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 78d029baaa..e80e4fb895 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -457,18 +457,18 @@ NIX_RX_FASTPATH_MODES
 	}

 static __rte_always_inline struct cn10k_eth_txq *
-cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
-			  const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
 {
-	return (struct cn10k_eth_txq *)
-		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+	return (struct cn10k_eth_txq
+			*)(txq_data[(txq_data[m->port] >> 48) +
+				    rte_event_eth_tx_adapter_txq_get(m)] &
+			   (BIT_ULL(48) - 1));
 }

 static __rte_always_inline void
-cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
-		 uintptr_t lmt_addr, uint8_t sched_type, uintptr_t base,
-		 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-		 const uint32_t flags)
+cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
+		 uint16_t lmt_id, uintptr_t lmt_addr, uint8_t sched_type,
+		 const uint64_t *txq_data, const uint32_t flags)
 {
 	uint8_t lnum = 0, loff = 0, shft = 0;
 	struct cn10k_eth_txq *txq;
@@ -478,7 +478,7 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
 	bool sec;

 	txq = cn10k_sso_hws_xtract_meta(m, txq_data);
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	cn10k_nix_tx_skeleton(txq, cmd, flags, 0);
 	/* Perform header writes before barrier
 	 * for TSO
 	 */
@@ -503,23 +503,23 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
 	else
 		segdw = cn10k_nix_tx_ext_subs(flags) + 2;

+	cn10k_nix_xmit_prepare_tstamp(txq, laddr, m->ol_flags, segdw, flags);
 	if (flags & NIX_TX_OFFLOAD_SECURITY_F && sec)
 		pa = txq->cpt_io_addr | 3 << 4;
 	else
 		pa = txq->io_addr | ((segdw - 1) << 4);

 	if (!sched_type)
-		roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+		roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);

 	roc_lmt_submit_steorl(lmt_id, pa);
 }

 static __rte_always_inline void
-cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
-			uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
-			uint8_t sched_type, uintptr_t base,
-			const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-			const uint32_t flags)
+cn10k_sso_vwqe_split_tx(struct cn10k_sso_hws *ws, struct rte_mbuf **mbufs,
+			uint16_t nb_mbufs, uint64_t *cmd, uint16_t lmt_id,
+			uintptr_t lmt_addr, uint8_t sched_type,
+			const uint64_t *txq_data, const uint32_t flags)
 {
 	uint16_t port[4], queue[4];
 	uint16_t i, j, pkts, scalar;
@@ -542,14 +542,16 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
 		if (((port[0] ^ port[1]) & (port[2] ^ port[3])) ||
 		    ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) {
 			for (j = 0; j < 4; j++)
-				cn10k_sso_tx_one(mbufs[i + j], cmd, lmt_id,
-						 lmt_addr, sched_type, base,
-						 txq_data, flags);
+				cn10k_sso_tx_one(ws, mbufs[i + j], cmd, lmt_id,
+						 lmt_addr, sched_type, txq_data,
+						 flags);
 		} else {
-			txq = (struct cn10k_eth_txq *)
-				txq_data[port[0]][queue[0]];
-			cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd,
-						   base + SSOW_LF_GWS_TAG,
+			txq = (struct cn10k_eth_txq
+				       *)(txq_data[(txq_data[port[0]] >> 48) +
+						   queue[0]] &
+					  (BIT_ULL(48) - 1));
+			cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws,
+						   &mbufs[i], 4, cmd,
 						   flags | NIX_TX_VWQE_F);
 		}
 	}
@@ -557,15 +559,14 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
 	mbufs += i;

 	for (i = 0; i < scalar; i++) {
-		cn10k_sso_tx_one(mbufs[i], cmd, lmt_id, lmt_addr, sched_type,
-				 base, txq_data, flags);
+		cn10k_sso_tx_one(ws, mbufs[i], cmd, lmt_id, lmt_addr,
+				 sched_type, txq_data, flags);
 	}
 }

 static __rte_always_inline uint16_t
 cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
-		       uint64_t *cmd,
-		       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		       uint64_t *cmd, const uint64_t *txq_data,
 		       const uint32_t flags)
 {
 	struct cn10k_eth_txq *txq;
@@ -582,17 +583,19 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		uint64_t meta = *(uint64_t *)ev->vec;

 		if (meta & BIT(31)) {
-			txq = (struct cn10k_eth_txq *)
-				txq_data[meta >> 32][meta >> 48];
-
-			cn10k_nix_xmit_pkts_vector(
-				txq, mbufs, meta & 0xFFFF, cmd,
-				ws->tx_base + SSOW_LF_GWS_TAG,
-				flags | NIX_TX_VWQE_F);
+			txq = (struct cn10k_eth_txq
+				       *)(txq_data[(txq_data[meta >> 32] >>
+						    48) +
+						   (meta >> 48)] &
+					  (BIT_ULL(48) - 1));
+
+			cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws, mbufs,
+						   meta & 0xFFFF, cmd,
+						   flags | NIX_TX_VWQE_F);
 		} else {
 			cn10k_sso_vwqe_split_tx(
-				mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
-				ev->sched_type, ws->tx_base, txq_data, flags);
+				ws, mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
+				ev->sched_type, txq_data, flags);
 		}
 		rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
 		return (meta & 0xFFFF);
@@ -600,16 +603,16 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,

 	m = ev->mbuf;
 	ref_cnt = m->refcnt;
-	cn10k_sso_tx_one(m, cmd, lmt_id, lmt_addr, ev->sched_type, ws->tx_base,
-			 txq_data, flags);
+	cn10k_sso_tx_one(ws, m, cmd, lmt_id, lmt_addr, ev->sched_type, txq_data,
+			 flags);

 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 		if (ref_cnt > 1)
 			return 1;
 	}

-	cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
-				 ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+	cnxk_sso_hws_swtag_flush(ws->base + SSOW_LF_GWS_TAG,
+				 ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
 	return 1;
 }

@@ -631,9 +634,7 @@ NIX_TX_FASTPATH_MODES
                                                                                \
 		RTE_SET_USED(nb_events);                                       \
 		return cn10k_sso_hws_event_tx(                                 \
-			ws, &ev[0], cmd,                                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
+			ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
 			flags);                                                \
 	}

@@ -646,9 +647,7 @@ NIX_TX_FASTPATH_MODES
                                                                                \
 		RTE_SET_USED(nb_events);                                       \
 		return cn10k_sso_hws_event_tx(                                 \
-			ws, &ev[0], cmd,                                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
+			ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
 			(flags) | NIX_TX_MULTI_SEG_F);                         \
 	}

diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index a5f0cb12db..3bff327477 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -259,17 +259,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 				ws_cookie,
 				sizeof(struct cnxk_sso_hws_cookie) +
 					sizeof(struct cn9k_sso_hws_dual) +
-					(sizeof(uint64_t) *
-					 (dev->max_port_id + 1) *
-					 RTE_MAX_QUEUES_PER_PORT),
+					dev->tx_adptr_data_sz,
 				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 			if (ws_cookie == NULL)
 				return -ENOMEM;
 			dws = RTE_PTR_ADD(ws_cookie,
 					  sizeof(struct cnxk_sso_hws_cookie));
 			memcpy(&dws->tx_adptr_data, dev->tx_adptr_data,
-			       sizeof(uint64_t) * (dev->max_port_id + 1) *
-				       RTE_MAX_QUEUES_PER_PORT);
+			       dev->tx_adptr_data_sz);
 			event_dev->data->ports[i] = dws;
 		} else {
 			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
@@ -280,17 +277,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 				ws_cookie,
 				sizeof(struct cnxk_sso_hws_cookie) +
 					sizeof(struct cn9k_sso_hws_dual) +
-					(sizeof(uint64_t) *
-					 (dev->max_port_id + 1) *
-					 RTE_MAX_QUEUES_PER_PORT),
+					dev->tx_adptr_data_sz,
 				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 			if (ws_cookie == NULL)
 				return -ENOMEM;
 			ws = RTE_PTR_ADD(ws_cookie,
 					 sizeof(struct cnxk_sso_hws_cookie));
 			memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
-			       sizeof(uint64_t) * (dev->max_port_id + 1) *
-				       RTE_MAX_QUEUES_PER_PORT);
+			       dev->tx_adptr_data_sz);
 			event_dev->data->ports[i] = ws;
 		}
 	}
@@ -1006,17 +1000,36 @@ cn9k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
 			      const struct rte_eth_dev *eth_dev,
 			      int32_t tx_queue_id)
 {
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint64_t tx_offloads;
 	int rc;

 	RTE_SET_USED(id);
 	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
 	if (rc < 0)
 		return rc;
+
+	/* Can't enable tstamp if all the ports don't have it enabled. */
+	tx_offloads = cnxk_eth_dev->tx_offload_flags;
+	if (dev->tx_adptr_configured) {
+		uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+		uint8_t tstmp_ena =
+			!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+
+		if (tstmp_ena && !tstmp_req)
+			dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+		else if (!tstmp_ena && tstmp_req)
+			tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+	}
+
+	dev->tx_offloads |= tx_offloads;
 	cn9k_sso_txq_fc_update(eth_dev, tx_queue_id, true);
 	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
 	if (rc < 0)
 		return rc;
 	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+	dev->tx_adptr_configured = 1;

 	return 0;
 }
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index a46d4e786a..0f58e00e7f 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -603,20 +603,13 @@ cn9k_sso_txq_fc_wait(const struct cn9k_eth_txq *txq)
 		;
 }

-static __rte_always_inline const struct cn9k_eth_txq *
-cn9k_sso_hws_xtract_meta(struct rte_mbuf *m,
-			 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+static __rte_always_inline struct cn9k_eth_txq *
+cn9k_sso_hws_xtract_meta(struct rte_mbuf *m, uint64_t *txq_data)
 {
-	return (const struct cn9k_eth_txq *)
-		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
-}
-
-static __rte_always_inline void
-cn9k_sso_hws_prepare_pkt(const struct cn9k_eth_txq *txq, struct rte_mbuf *m,
-			 uint64_t *cmd, const uint32_t flags)
-{
-	roc_lmt_mov(cmd, txq->cmd, cn9k_nix_tx_ext_subs(flags));
-	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
+	return (struct cn9k_eth_txq
+			*)(txq_data[(txq_data[m->port] >> 48) +
+				    rte_event_eth_tx_adapter_txq_get(m)] &
+			   (BIT_ULL(48) - 1));
 }

 #if defined(RTE_ARCH_ARM64)
@@ -673,7 +666,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
 	nixtx += BIT_ULL(7);
 	nixtx = (nixtx - 1) & ~(BIT_ULL(7) - 1);

-	roc_lmt_mov((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));
+	roc_lmt_mov_nv((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));

 	/* Load opcode and cptr already prepared at pkt metadata set */
 	pkt_len -= l2_len;
@@ -760,12 +753,11 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,

 static __rte_always_inline uint16_t
 cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
-		      const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-		      const uint32_t flags)
+		      uint64_t *txq_data, const uint32_t flags)
 {
 	struct rte_mbuf *m = ev->mbuf;
-	const struct cn9k_eth_txq *txq;
 	uint16_t ref_cnt = m->refcnt;
+	struct cn9k_eth_txq *txq;

 	/* Perform header writes before barrier for TSO */
 	cn9k_nix_xmit_prepare_tso(m, flags);
@@ -778,7 +770,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	    !(flags & NIX_TX_OFFLOAD_SECURITY_F))
 		rte_io_wmb();
 	txq = cn9k_sso_hws_xtract_meta(m, txq_data);
-	cn9k_sso_hws_prepare_pkt(txq, m, cmd, flags);
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
+	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);

 	if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
 		uint64_t ol_flags = m->ol_flags;
@@ -800,6 +793,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,

 	if (flags & NIX_TX_MULTI_SEG_F) {
 		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, segdw,
+					     flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
 			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
@@ -812,6 +807,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 					       segdw);
 		}
 	} else {
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
 			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
@@ -858,11 +854,9 @@ NIX_TX_FASTPATH_MODES
 		uint64_t cmd[sz];                                              \
                                                                                \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base, &ev[0], cmd,                                 \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			flags);                                                \
+		return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     flags);                           \
 	}

 #define SSO_TX_SEG(fn, sz, flags)                                              \
@@ -873,11 +867,9 @@ NIX_TX_FASTPATH_MODES
 		struct cn9k_sso_hws *ws = port;                                \
                                                                                \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base, &ev[0], cmd,                                 \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			(flags) | NIX_TX_MULTI_SEG_F);                         \
+		return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     (flags) | NIX_TX_MULTI_SEG_F);    \
 	}

 #define SSO_DUAL_TX(fn, sz, flags)                                             \
@@ -888,11 +880,9 @@ NIX_TX_FASTPATH_MODES
 		uint64_t cmd[sz];                                              \
                                                                                \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base[!ws->vws], &ev[0], cmd,                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			flags);                                                \
+		return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     flags);                           \
 	}

 #define SSO_DUAL_TX_SEG(fn, sz, flags)                                         \
@@ -903,11 +893,9 @@ NIX_TX_FASTPATH_MODES
 		struct cn9k_sso_hws_dual *ws = port;                           \
                                                                                \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base[!ws->vws], &ev[0], cmd,                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			(flags) | NIX_TX_MULTI_SEG_F);                         \
+		return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     (flags) | NIX_TX_MULTI_SEG_F);    \
 	}

 #endif
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 305c6a3b9e..eea1597a05 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -99,7 +99,10 @@ struct cnxk_sso_evdev {
 	uint16_t rx_adptr_pool_cnt;
 	uint64_t *rx_adptr_pools;
 	uint64_t *tx_adptr_data;
+	size_t tx_adptr_data_sz;
 	uint16_t max_port_id;
+	uint16_t max_queue_id[RTE_MAX_ETHPORTS];
+	uint8_t tx_adptr_configured;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -131,8 +134,8 @@ struct cn10k_sso_hws {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint64_t tx_base __rte_cache_aligned;
-	uintptr_t lmt_base;
+	uintptr_t lmt_base __rte_cache_aligned;
+	uint64_t lso_tun_fmt;
 	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

@@ -149,7 +152,8 @@ struct cn9k_sso_hws {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint8_t tx_adptr_data[] __rte_cache_aligned;
+	uint64_t lso_tun_fmt __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

 struct cn9k_sso_hws_dual {
@@ -165,7 +169,8 @@ struct cn9k_sso_hws_dual {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint8_t tx_adptr_data[] __rte_cache_aligned;
+	uint64_t lso_tun_fmt __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

 struct cnxk_sso_hws_cookie {
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index fdcd68ca63..29dce44d39 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -345,24 +345,136 @@ cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
 			    void *txq)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t max_queue_id = dev->max_queue_id[eth_port_id];
 	uint16_t max_port_id = dev->max_port_id;
-	uint64_t *txq_data = dev->tx_adptr_data;
-
-	if (txq_data == NULL || eth_port_id > max_port_id) {
-		max_port_id = RTE_MAX(max_port_id, eth_port_id);
-		txq_data = rte_realloc_socket(
-			txq_data,
-			(sizeof(uint64_t) * (max_port_id + 1) *
-			 RTE_MAX_QUEUES_PER_PORT),
-			RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
+	uint64_t offset = 0, row = 0;
+	uint64_t *txq_data = NULL;
+	size_t size = 0;
+	int i, j;
+
+	if (((uint64_t)txq) & 0xFFFF000000000000)
+		return -EINVAL;
+
+	if (dev->tx_adptr_data == NULL) {
+		size = (eth_port_id + 1);
+		size += (eth_port_id + tx_queue_id);
+		row = 2 * eth_port_id;
+	} else {
+		if (eth_port_id > max_port_id) {
+			size = (RTE_MAX(eth_port_id, dev->max_queue_id[0]) + 1);
+			for (i = 1; i < eth_port_id; i++)
+				size += (dev->max_queue_id[i] + 1);
+			row = size;
+			size += (tx_queue_id + 1);
+		} else if (tx_queue_id > max_queue_id) {
+			size = !eth_port_id ? tx_queue_id + 1 :
+						    RTE_MAX(max_port_id,
+						      dev->max_queue_id[0]) +
+						      1;
+			for (i = 1; i < max_port_id + 1; i++) {
+				if (i == eth_port_id) {
+					row = size;
+					size += tx_queue_id + 1;
+				} else {
+					size += dev->max_queue_id[i] + 1;
+				}
+			}
+		}
+	}
+
+	size *= sizeof(uint64_t);
+
+	if (size) {
+		uint64_t *otxq_data = dev->tx_adptr_data;
+
+		txq_data = malloc(size);
 		if (txq_data == NULL)
 			return -ENOMEM;
+		memset(txq_data, 0, size);
+		txq_data[eth_port_id] = ((uint64_t)row) << 48;
+		txq_data[row + tx_queue_id] = (uint64_t)txq;
+
+		if (otxq_data != NULL) {
+			for (i = 0; i < dev->max_queue_id[0] + 1; i++) {
+				txq_data[i] |= (otxq_data[i] &
+						~((BIT_ULL(16) - 1) << 48));
+			}
+
+			if (eth_port_id > max_port_id) {
+				dev->max_queue_id[0] = RTE_MAX(
+					dev->max_queue_id[0], eth_port_id);
+				dev->max_port_id =
+					RTE_MAX(dev->max_port_id, eth_port_id);
+
+				for (i = 1; i < eth_port_id; i++) {
+					offset +=
+						(dev->max_queue_id[i - 1] + 1);
+					txq_data[i] |= offset << 48;
+					for (j = 0;
+					     (i < dev->max_port_id) &&
+					     (j < dev->max_queue_id[i] + 1);
+					     j++) {
+
+						txq_data[offset + j] = otxq_data
+							[(otxq_data[i] >> 48) +
+							 j];
+					}
+				}
+				dev->max_queue_id[eth_port_id] =
+					RTE_MAX(dev->max_queue_id[eth_port_id],
+						tx_queue_id);
+			} else if (tx_queue_id > max_queue_id) {
+				dev->max_queue_id[eth_port_id] =
+					RTE_MAX(dev->max_queue_id[eth_port_id],
+						tx_queue_id);
+				dev->max_port_id =
+					RTE_MAX(max_port_id, eth_port_id);
+				for (i = 1; i < max_port_id + 1; i++) {
+					offset +=
+						(dev->max_queue_id[i - 1] + 1);
+					txq_data[i] |= offset << 48;
+					for (j = 0;
+					     j < dev->max_queue_id[i] + 1;
+					     j++) {
+						if (i == eth_port_id &&
+						    j > max_queue_id)
+							continue;
+						txq_data[offset + j] = otxq_data
+							[(otxq_data[i] >> 48) +
+							 j];
+					}
+				}
+			}
+		} else {
+			dev->max_queue_id[0] =
+				RTE_MAX(dev->max_queue_id[0], eth_port_id);
+			for (i = 1; i < eth_port_id; i++) {
+				offset += (dev->max_queue_id[i - 1] + 1);
+				txq_data[i] |= offset << 48;
+
+				for (j = 0; (i < max_port_id) &&
+					    (j < dev->max_queue_id[i] + 1);
+				     j++) {
+
+					txq_data[offset + j] =
+						otxq_data[(otxq_data[i] >> 48) +
+							  j];
+				}
+			}
+			dev->max_port_id =
+				RTE_MAX(dev->max_port_id, eth_port_id);
+			dev->max_queue_id[eth_port_id] = RTE_MAX(
+				dev->max_queue_id[eth_port_id], tx_queue_id);
+		}
+		dev->tx_adptr_data_sz = size;
+		free(otxq_data);
+		dev->tx_adptr_data = txq_data;
+	} else {
+		txq_data = dev->tx_adptr_data;
+		row = txq_data[eth_port_id] >> 48;
+		txq_data[row + tx_queue_id] |= (uint64_t)txq;
 	}

-	((uint64_t(*)[RTE_MAX_QUEUES_PER_PORT])
-		 txq_data)[eth_port_id][tx_queue_id] = (uint64_t)txq;
-	dev->max_port_id = max_port_id;
-	dev->tx_adptr_data = txq_data;
 	return 0;
 }

@@ -372,7 +484,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
 			      int32_t tx_queue_id)
 {
 	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
-	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
 	struct roc_nix_sq *sq;
 	int i, ret;
 	void *txq;
@@ -388,8 +499,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
 			event_dev, eth_dev->data->port_id, tx_queue_id, txq);
 		if (ret < 0)
 			return ret;
-
-		dev->tx_offloads |= cnxk_eth_dev->tx_offload_flags;
 	}

 	return 0;
diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
index 8378cbffc2..9bb08e1824 100644
--- a/drivers/net/cnxk/cn10k_ethdev.c
+++ b/drivers/net/cnxk/cn10k_ethdev.c
@@ -131,53 +131,31 @@ static void
 nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn10k_eth_txq *txq,
 		      uint16_t qid)
 {
-	struct nix_send_ext_s *send_hdr_ext;
 	union nix_send_hdr_w0_u send_hdr_w0;
-	struct nix_send_mem_s *send_mem;
-	union nix_send_sg_s sg_w0;
-
-	RTE_SET_USED(dev);

 	/* Initialize the fields based on basic single segment packet */
-	memset(&txq->cmd, 0, sizeof(txq->cmd));
 	send_hdr_w0.u = 0;
-	sg_w0.u = 0;
-
 	if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
 		/* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
 		send_hdr_w0.sizem1 = 2;
-
-		send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[0];
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
 		if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 			/* Default: one seg packet would have:
 			 * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
 			 * => 8/2 - 1 = 3
 			 */
 			send_hdr_w0.sizem1 = 3;
-			send_hdr_ext->w0.tstmp = 1;

 			/* To calculate the offset for send_mem,
 			 * send_hdr->w0.sizem1 * 2
 			 */
-			send_mem = (struct nix_send_mem_s *)(txq->cmd + 2);
-			send_mem->w0.subdc = NIX_SUBDC_MEM;
-			send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP;
-			send_mem->addr = dev->tstamp.tx_tstamp_iova;
+			txq->ts_mem = dev->tstamp.tx_tstamp_iova;
 		}
 	} else {
 		/* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
 		send_hdr_w0.sizem1 = 1;
 	}
-
 	send_hdr_w0.sq = qid;
-	sg_w0.subdc = NIX_SUBDC_SG;
-	sg_w0.segs = 1;
-	sg_w0.ld_type = NIX_SENDLDTYPE_LDD;
-
 	txq->send_hdr_w0 = send_hdr_w0.u;
-	txq->sg_w0 = sg_w0.u;
-
 	rte_wmb();
 }

diff --git a/drivers/net/cnxk/cn10k_ethdev.h b/drivers/net/cnxk/cn10k_ethdev.h
index 0982158c62..ec40e53152 100644
--- a/drivers/net/cnxk/cn10k_ethdev.h
+++ b/drivers/net/cnxk/cn10k_ethdev.h
@@ -9,7 +9,6 @@

 struct cn10k_eth_txq {
 	uint64_t send_hdr_w0;
-	uint64_t sg_w0;
 	int64_t fc_cache_pkts;
 	uint64_t *fc_mem;
 	uintptr_t lmt_base;
@@ -20,8 +19,8 @@ struct cn10k_eth_txq {
 	uint64_t sa_base;
 	uint64_t *cpt_fc;
 	uint16_t cpt_desc;
-	uint64_t cmd[4];
 	uint64_t lso_tun_fmt;
+	uint64_t ts_mem;
 } __plt_cache_aligned;

 struct cn10k_eth_rxq {
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 2c9411f42c..1eff0e568c 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -186,23 +186,26 @@ cn10k_cpt_tx_steor_data(void)
 }

 static __rte_always_inline void
-cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
-		      const uint16_t flags)
+cn10k_nix_tx_skeleton(struct cn10k_eth_txq *txq, uint64_t *cmd,
+		      const uint16_t flags, const uint16_t static_sz)
 {
-	/* Send hdr */
-	cmd[0] = txq->send_hdr_w0;
+	if (static_sz)
+		cmd[0] = txq->send_hdr_w0;
+	else
+		cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			 ((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
 	cmd[1] = 0;
-	cmd += 2;

-	/* Send ext if present */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		*(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
-		cmd += 2;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
+			cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
+		else
+			cmd[2] = NIX_SUBDC_EXT << 60;
+		cmd[3] = 0;
+		cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	} else {
+		cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
 	}
-
-	/* Send sg */
-	cmd[0] = txq->sg_w0;
-	cmd[1] = 0;
 }

 static __rte_always_inline void
@@ -718,41 +721,29 @@ cn10k_nix_xmit_mv_lmt_base(uintptr_t lmt_addr, uint64_t *cmd,
 }

 static __rte_always_inline void
-cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
+cn10k_nix_xmit_prepare_tstamp(struct cn10k_eth_txq *txq, uintptr_t lmt_addr,
 			      const uint64_t ol_flags, const uint16_t no_segdw,
 			      const uint16_t flags)
 {
 	if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-		const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
-		struct nix_send_ext_s *send_hdr_ext =
-			(struct nix_send_ext_s *)lmt_addr + 16;
+		const uint8_t is_ol_tstamp =
+			!(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
 		uint64_t *lmt = (uint64_t *)lmt_addr;
 		uint16_t off = (no_segdw - 1) << 1;
 		struct nix_send_mem_s *send_mem;

 		send_mem = (struct nix_send_mem_s *)(lmt + off);
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
-		send_hdr_ext->w0.tstmp = 1;
-		if (flags & NIX_TX_MULTI_SEG_F) {
-			/* Retrieving the default desc values */
-			lmt[off] = cmd[2];
-
-			/* Using compiler barrier to avoid violation of C
-			 * aliasing rules.
-			 */
-			rte_compiler_barrier();
-		}
-
-		/* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
+		/* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
 		 * should not be recorded, hence changing the alg type to
-		 * NIX_SENDMEMALG_SET and also changing send mem addr field to
+		 * NIX_SENDMEMALG_SUB and also changing send mem addr field to
 		 * next 8 bytes as it corrupts the actual Tx tstamp registered
 		 * address.
 		 */
 		send_mem->w0.subdc = NIX_SUBDC_MEM;
-		send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
+		send_mem->w0.alg =
+			NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);
 		send_mem->addr =
-			(rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
+			(rte_iova_t)(((uint64_t *)txq->ts_mem) + is_ol_tstamp);
 	}
 }

@@ -841,8 +832,8 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
-		    uint64_t *cmd, uintptr_t base, const uint16_t flags)
+cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
+		    uint16_t pkts, uint64_t *cmd, const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	const rte_iova_t io_addr = txq->io_addr;
@@ -863,9 +854,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 		/* Reduce the cached count */
 		txq->fc_cache_pkts -= pkts;
 	}
-
 	/* Get cmd skeleton */
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));

 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		lso_tun_fmt = txq->lso_tun_fmt;
@@ -909,14 +899,14 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 		/* Move NIX desc to LMT/NIXTX area */
 		cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
-		cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
-					      tx_pkts[i]->ol_flags, 4, flags);
+		cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
+					      4, flags);
 		if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec)
 			lnum++;
 	}

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -967,9 +957,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
-			 uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			 const uint16_t flags)
+cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
+			 struct rte_mbuf **tx_pkts, uint16_t pkts,
+			 uint64_t *cmd, const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	uintptr_t pa0, pa1, lbase = txq->lmt_base;
@@ -987,12 +977,13 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uintptr_t laddr;
 	bool sec;

-	NIX_XMIT_FC_OR_RETURN(txq, pkts);
-
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
-
-	/* Reduce the cached count */
-	txq->fc_cache_pkts -= pkts;
+	if (!(flags & NIX_TX_VWQE_F)) {
+		NIX_XMIT_FC_OR_RETURN(txq, pkts);
+		/* Reduce the cached count */
+		txq->fc_cache_pkts -= pkts;
+	}
+	/* Get cmd skeleton */
+	cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));

 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		lso_tun_fmt = txq->lso_tun_fmt;
@@ -1038,13 +1029,11 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,

 		/* Move NIX desc to LMT/NIXTX area */
 		cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
-
 		/* Store sg list directly on lmt line */
 		segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)laddr,
 					       flags);
-		cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
-					      tx_pkts[i]->ol_flags, segdw,
-					      flags);
+		cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
+					      segdw, flags);
 		if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec) {
 			lnum++;
 			data128 |= (((__uint128_t)(segdw - 1)) << shft);
@@ -1053,7 +1042,7 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	}

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -1474,9 +1463,9 @@ cn10k_nix_xmit_store(struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			   const uint16_t flags)
+cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
+			   struct rte_mbuf **tx_pkts, uint16_t pkts,
+			   uint64_t *cmd, const uint16_t flags)
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
@@ -1526,25 +1515,42 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
 	}

-	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
+	if (!(flags & NIX_TX_VWQE_F)) {
+		senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
+	} else {
+		uint64_t w0 =
+			(txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
+
+		senddesc01_w0 = vdupq_n_u64(w0);
+	}
 	senddesc23_w0 = senddesc01_w0;
+
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
-	sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
+	sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
 	sgdesc23_w0 = sgdesc01_w0;

-	/* Load command defaults into vector variables. */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
-		sendext23_w0 = sendext01_w0;
-		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
-		sendext23_w1 = sendext01_w1;
 		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
+						   BIT_ULL(15));
+			sendmem01_w0 =
+				vdupq_n_u64((NIX_SUBDC_MEM << 60) |
+					    (NIX_SENDMEMALG_SETTSTMP << 56));
 			sendmem23_w0 = sendmem01_w0;
-			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
+			sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
 			sendmem23_w1 = sendmem01_w1;
+		} else {
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
 		}
+		sendext23_w0 = sendext01_w0;
+
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
+			sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		else
+			sendext01_w1 = vdupq_n_u64(0);
+		sendext23_w1 = sendext01_w1;
 	}

 	/* Get LMT base address and LMT ID as lcore id */
@@ -2577,7 +2583,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		wd.data[0] >>= 16;

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;

@@ -2640,12 +2646,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,

 	if (unlikely(scalar)) {
 		if (flags & NIX_TX_MULTI_SEG_F)
-			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
-							 scalar, cmd, base,
-							 flags);
+			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, ws, tx_pkts,
+							 scalar, cmd, flags);
 		else
-			pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
-						    cmd, base, flags);
+			pkts += cn10k_nix_xmit_pkts(tx_queue, ws, tx_pkts,
+						    scalar, cmd, flags);
 	}

 	return pkts;
@@ -2653,16 +2658,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,

 #else
 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			   const uint16_t flags)
+cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
+			   struct rte_mbuf **tx_pkts, uint16_t pkts,
+			   uint64_t *cmd, const uint16_t flags)
 {
+	RTE_SET_USED(ws);
 	RTE_SET_USED(tx_queue);
 	RTE_SET_USED(tx_pkts);
 	RTE_SET_USED(pkts);
 	RTE_SET_USED(cmd);
 	RTE_SET_USED(flags);
-	RTE_SET_USED(base);
 	return 0;
 }
 #endif
@@ -2896,7 +2901,7 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, 0,    \
+		return cn10k_nix_xmit_pkts(tx_queue, NULL, tx_pkts, pkts, cmd, \
 					   flags);                             \
 	}

@@ -2910,8 +2915,8 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,  \
-						0,                             \
+		return cn10k_nix_xmit_pkts_mseg(tx_queue, NULL, tx_pkts, pkts, \
+						cmd,                           \
 						flags | NIX_TX_MULTI_SEG_F);   \
 	}

@@ -2925,8 +2930,8 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts,     \
-						  cmd, 0, (flags));            \
+		return cn10k_nix_xmit_pkts_vector(tx_queue, NULL, tx_pkts,     \
+						  pkts, cmd, (flags));         \
 	}

 #define NIX_TX_XMIT_VEC_MSEG(fn, sz, flags)                                    \
@@ -2940,7 +2945,7 @@ NIX_TX_FASTPATH_MODES
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(                             \
-			tx_queue, tx_pkts, pkts, cmd, 0,                       \
+			tx_queue, NULL, tx_pkts, pkts, cmd,                    \
 			(flags) | NIX_TX_MULTI_SEG_F);                         \
 	}

diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
index d34bc6898f..01e3850561 100644
--- a/drivers/net/cnxk/cn9k_ethdev.c
+++ b/drivers/net/cnxk/cn9k_ethdev.c
@@ -131,51 +131,31 @@ static void
 nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn9k_eth_txq *txq,
 		      uint16_t qid)
 {
-	struct nix_send_ext_s *send_hdr_ext;
-	struct nix_send_hdr_s *send_hdr;
-	struct nix_send_mem_s *send_mem;
-	union nix_send_sg_s *sg;
+	union nix_send_hdr_w0_u send_hdr_w0;

 	/* Initialize the fields based on basic single segment packet */
-	memset(&txq->cmd, 0, sizeof(txq->cmd));
-
+	send_hdr_w0.u = 0;
 	if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
-		send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
 		/* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
-		send_hdr->w0.sizem1 = 2;
-
-		send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[2];
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
+		send_hdr_w0.sizem1 = 2;
 		if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 			/* Default: one seg packet would have:
 			 * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
 			 * => 8/2 - 1 = 3
 			 */
-			send_hdr->w0.sizem1 = 3;
-			send_hdr_ext->w0.tstmp = 1;
+			send_hdr_w0.sizem1 = 3;

 			/* To calculate the offset for send_mem,
 			 * send_hdr->w0.sizem1 * 2
 			 */
-			send_mem = (struct nix_send_mem_s *)
-				(txq->cmd + (send_hdr->w0.sizem1 << 1));
-			send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
-			send_mem->w0.cn9k.alg = NIX_SENDMEMALG_SETTSTMP;
-			send_mem->addr = dev->tstamp.tx_tstamp_iova;
+			txq->ts_mem = dev->tstamp.tx_tstamp_iova;
 		}
-		sg = (union nix_send_sg_s *)&txq->cmd[4];
 	} else {
-		send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
 		/* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
-		send_hdr->w0.sizem1 = 1;
-		sg = (union nix_send_sg_s *)&txq->cmd[2];
+		send_hdr_w0.sizem1 = 1;
 	}
-
-	send_hdr->w0.sq = qid;
-	sg->subdc = NIX_SUBDC_SG;
-	sg->segs = 1;
-	sg->ld_type = NIX_SENDLDTYPE_LDD;
-
+	send_hdr_w0.sq = qid;
+	txq->send_hdr_w0 = send_hdr_w0.u;
 	rte_wmb();
 }

diff --git a/drivers/net/cnxk/cn9k_ethdev.h b/drivers/net/cnxk/cn9k_ethdev.h
index 2b452fe009..8ab924944c 100644
--- a/drivers/net/cnxk/cn9k_ethdev.h
+++ b/drivers/net/cnxk/cn9k_ethdev.h
@@ -9,12 +9,13 @@
 #include <cnxk_security_ar.h>

 struct cn9k_eth_txq {
-	uint64_t cmd[8];
+	uint64_t send_hdr_w0;
 	int64_t fc_cache_pkts;
 	uint64_t *fc_mem;
 	void *lmt_addr;
 	rte_iova_t io_addr;
 	uint64_t lso_tun_fmt;
+	uint64_t ts_mem;
 	uint16_t sqes_per_sqb_log2;
 	int16_t nb_sqb_bufs_adj;
 	rte_iova_t cpt_io_addr;
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 0c68b241cf..8eb07c4327 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -58,6 +58,29 @@ cn9k_nix_tx_ext_subs(const uint16_t flags)
 				  : 0);
 }

+static __rte_always_inline void
+cn9k_nix_tx_skeleton(struct cn9k_eth_txq *txq, uint64_t *cmd,
+		     const uint16_t flags, const uint16_t static_sz)
+{
+	if (static_sz)
+		cmd[0] = txq->send_hdr_w0;
+	else
+		cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			 ((uint64_t)(cn9k_nix_tx_ext_subs(flags) + 1) << 40);
+	cmd[1] = 0;
+
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
+			cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
+		else
+			cmd[2] = NIX_SUBDC_EXT << 60;
+		cmd[3] = 0;
+		cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	} else {
+		cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	}
+}
+
 static __rte_always_inline void
 cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 {
@@ -136,11 +159,11 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 		w1.u = 0;
 	}

-	if (!(flags & NIX_TX_MULTI_SEG_F)) {
+	if (!(flags & NIX_TX_MULTI_SEG_F))
 		send_hdr->w0.total = m->data_len;
-		send_hdr->w0.aura =
-			roc_npa_aura_handle_to_aura(m->pool->pool_id);
-	}
+	else
+		send_hdr->w0.total = m->pkt_len;
+	send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);

 	/*
 	 * L3type:  2 => IPV4
@@ -287,41 +310,39 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 		/* Mark mempool object as "put" since it is freed by NIX */
 		if (!send_hdr->w0.df)
 			RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
+	} else {
+		sg->seg1_size = m->data_len;
+		*(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
+
+		/* NOFF is handled later for multi-seg */
 	}
 }

 static __rte_always_inline void
-cn9k_nix_xmit_prepare_tstamp(uint64_t *cmd, const uint64_t *send_mem_desc,
+cn9k_nix_xmit_prepare_tstamp(struct cn9k_eth_txq *txq, uint64_t *cmd,
 			     const uint64_t ol_flags, const uint16_t no_segdw,
 			     const uint16_t flags)
 {
 	if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 		struct nix_send_mem_s *send_mem;
 		uint16_t off = (no_segdw - 1) << 1;
-		const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
+		const uint8_t is_ol_tstamp =
+			!(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);

 		send_mem = (struct nix_send_mem_s *)(cmd + off);
-		if (flags & NIX_TX_MULTI_SEG_F) {
-			/* Retrieving the default desc values */
-			cmd[off] = send_mem_desc[6];

-			/* Using compiler barrier to avoid violation of C
-			 * aliasing rules.
-			 */
-			rte_compiler_barrier();
-		}
-
-		/* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
+		/* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
 		 * should not be recorded, hence changing the alg type to
-		 * NIX_SENDMEMALG_SET and also changing send mem addr field to
+		 * NIX_SENDMEMALG_SUB and also changing send mem addr field to
 		 * next 8 bytes as it corrupts the actual Tx tstamp registered
 		 * address.
 		 */
+		send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
 		send_mem->w0.cn9k.alg =
-			NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
+			NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);

-		send_mem->addr = (rte_iova_t)((uint64_t *)send_mem_desc[7] +
-					      (is_ol_tstamp));
+		send_mem->addr = (rte_iova_t)(((uint64_t *)txq->ts_mem) +
+				(is_ol_tstamp));
 	}
 }

@@ -367,8 +388,6 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 	uint8_t off, i;

 	send_hdr = (struct nix_send_hdr_s *)cmd;
-	send_hdr->w0.total = m->pkt_len;
-	send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);

 	if (flags & NIX_TX_NEED_EXT_HDR)
 		off = 2;
@@ -376,13 +395,29 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 		off = 0;

 	sg = (union nix_send_sg_s *)&cmd[2 + off];
-	/* Clear sg->u header before use */
-	sg->u &= 0xFC00000000000000;
+
+	/* Start from second segment, first segment is already there */
+	i = 1;
 	sg_u = sg->u;
-	slist = &cmd[3 + off];
+	nb_segs = m->nb_segs - 1;
+	m_next = m->next;
+	slist = &cmd[3 + off + 1];

-	i = 0;
-	nb_segs = m->nb_segs;
+	/* Set invert df if buffer is not to be freed by H/W */
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		rte_io_wmb();
+	}
+
+	/* Mark mempool object as "put" since it is freed by NIX */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	if (!(sg_u & (1ULL << 55)))
+		RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
+	rte_io_wmb();
+#endif
+	m = m_next;
+	if (!m)
+		goto done;

 	/* Fill mbuf segments */
 	do {
@@ -417,6 +452,7 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 		m = m_next;
 	} while (nb_segs);

+done:
 	sg->u = sg_u;
 	sg->segs = i;
 	segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
@@ -472,7 +508,7 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 	NIX_XMIT_FC_OR_RETURN(txq, pkts);

-	roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 1);

 	/* Perform header writes before barrier for TSO */
 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
@@ -490,8 +526,8 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 	for (i = 0; i < pkts; i++) {
 		cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
-		cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
-					     tx_pkts[i]->ol_flags, 4, flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags, 4,
+					     flags);
 		cn9k_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
 	}

@@ -514,7 +550,7 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,

 	NIX_XMIT_FC_OR_RETURN(txq, pkts);

-	roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 1);

 	/* Perform header writes before barrier for TSO */
 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
@@ -533,9 +569,8 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	for (i = 0; i < pkts; i++) {
 		cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
 		segdw = cn9k_nix_prepare_mseg(tx_pkts[i], cmd, flags);
-		cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
-					     tx_pkts[i]->ol_flags, segdw,
-					     flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags,
+					     segdw, flags);
 		cn9k_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
 	}

@@ -862,28 +897,34 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
 		rte_io_wmb();

-	senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
+	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
 	senddesc23_w0 = senddesc01_w0;
+
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
+	sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
+	sgdesc23_w0 = sgdesc01_w0;

-	/* Load command defaults into vector variables. */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		sendext01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-		sendext23_w0 = sendext01_w0;
-		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
-		sendext23_w1 = sendext01_w1;
-		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
-		sgdesc23_w0 = sgdesc01_w0;
 		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[6]);
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
+						   BIT_ULL(15));
+			sendmem01_w0 =
+				vdupq_n_u64((NIX_SUBDC_MEM << 60) |
+					    (NIX_SENDMEMALG_SETTSTMP << 56));
 			sendmem23_w0 = sendmem01_w0;
-			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[7]);
+			sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
 			sendmem23_w1 = sendmem01_w1;
+		} else {
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
 		}
-	} else {
-		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-		sgdesc23_w0 = sgdesc01_w0;
+		sendext23_w0 = sendext01_w0;
+
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
+			sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		else
+			sendext01_w1 = vdupq_n_u64(0);
+		sendext23_w1 = sendext01_w1;
 	}

 	for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
--
2.17.1



* [PATCH v2 2/4] event/cnxk: store and reuse workslot status
  2022-01-19  7:13 [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
@ 2022-01-19  7:13 ` pbhagavatula
  2022-01-19  7:13 ` [PATCH v2 3/4] event/cnxk: disable default wait time for dequeue pbhagavatula
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2022-01-19  7:13 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Pavan Nikhilesh, Shijith Thotton
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Store the workslot status and reuse it for the TT, GRP and HEAD checks
instead of reading it from the GWC each time, as every GWC read imposes
additional latency.
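
For illustration, a minimal sketch of the idea (not part of this patch,
assuming the cn10k workslot structure and the CNXK_*_FROM_TAG /
CNXK_TAG_IS_HEAD helpers used below): the status word returned by
GETWORK is cached in the workslot, and later TT/GRP/HEAD checks decode
the cached copy instead of issuing another read to the GWC.

	/* Dequeue path: cache the GWS status word once per GETWORK */
	ws->gw_rdata = gw.u64[0];

	/* Forward/Tx paths: decode the cached word, no extra GWC read;
	 * these values feed the SWTAG/forward decision.
	 */
	const uint8_t cur_tt = CNXK_TT_FROM_TAG(ws->gw_rdata);
	const uint8_t cur_grp = CNXK_GRP_FROM_TAG(ws->gw_rdata);

	/* Only poll the GWC when the cached word says we are not HEAD */
	if (!CNXK_TAG_IS_HEAD(ws->gw_rdata) && !sched_type)
		ws->gw_rdata = roc_sso_hws_head_wait(ws->base);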

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/common/cnxk/roc_sso.h      | 14 ++++++++------
 drivers/event/cnxk/cn10k_worker.h  | 16 +++++++++-------
 drivers/event/cnxk/cn9k_worker.h   |  6 +++---
 drivers/event/cnxk/cnxk_eventdev.h |  2 ++
 drivers/event/cnxk/cnxk_worker.h   | 11 +++++++----
 drivers/net/cnxk/cn10k_tx.h        | 12 ++++++------
 6 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index 27d49c6c68..ab7cee1c60 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -54,12 +54,13 @@ struct roc_sso {
 	uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned;
 } __plt_cache_aligned;
 
-static __plt_always_inline void
-roc_sso_hws_head_wait(uintptr_t tag_op)
+static __plt_always_inline uint64_t
+roc_sso_hws_head_wait(uintptr_t base)
 {
-#ifdef RTE_ARCH_ARM64
+	uintptr_t tag_op = base + SSOW_LF_GWS_TAG;
 	uint64_t tag;
 
+#if defined(__aarch64__)
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_op]]	\n"
 		     "		tbnz %[tag], 35, done%=		\n"
@@ -71,10 +72,11 @@ roc_sso_hws_head_wait(uintptr_t tag_op)
 		     : [tag] "=&r"(tag)
 		     : [tag_op] "r"(tag_op));
 #else
-	/* Wait for the SWTAG/SWTAG_FULL operation */
-	while (!(plt_read64(tag_op) & BIT_ULL(35)))
-		;
+	do {
+		tag = plt_read64(tag_op);
+	} while (!(tag & BIT_ULL(35)));
 #endif
+	return tag;
 }
 
 /* SSO device initialization */
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index e80e4fb895..ba82aa58dd 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -40,8 +40,7 @@ cn10k_sso_hws_fwd_swtag(struct cn10k_sso_hws *ws, const struct rte_event *ev)
 {
 	const uint32_t tag = (uint32_t)ev->event;
 	const uint8_t new_tt = ev->sched_type;
-	const uint8_t cur_tt =
-		CNXK_TT_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0));
+	const uint8_t cur_tt = CNXK_TT_FROM_TAG(ws->gw_rdata);
 
 	/* CNXK model
 	 * cur_tt/new_tt     SSO_TT_ORDERED SSO_TT_ATOMIC SSO_TT_UNTAGGED
@@ -81,7 +80,7 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
 	const uint8_t grp = ev->queue_id;
 
 	/* Group hasn't changed, Use SWTAG to forward the event */
-	if (CNXK_GRP_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0)) == grp)
+	if (CNXK_GRP_FROM_TAG(ws->gw_rdata) == grp)
 		cn10k_sso_hws_fwd_swtag(ws, ev);
 	else
 		/*
@@ -211,6 +210,7 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 	} while (gw.u64[0] & BIT_ULL(63));
 	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
+	ws->gw_rdata = gw.u64[0];
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
@@ -406,7 +406,8 @@ NIX_RX_FASTPATH_MODES
 		RTE_SET_USED(timeout_ticks);                                   \
 		if (ws->swtag_req) {                                           \
 			ws->swtag_req = 0;                                     \
-			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			ws->gw_rdata = cnxk_sso_hws_swtag_wait(                \
+				ws->base + SSOW_LF_GWS_WQE0);                  \
 			return 1;                                              \
 		}                                                              \
 		return cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);  \
@@ -426,7 +427,8 @@ NIX_RX_FASTPATH_MODES
                                                                                \
 		if (ws->swtag_req) {                                           \
 			ws->swtag_req = 0;                                     \
-			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			ws->gw_rdata = cnxk_sso_hws_swtag_wait(                \
+				ws->base + SSOW_LF_GWS_WQE0);                  \
 			return ret;                                            \
 		}                                                              \
 		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
@@ -509,8 +511,8 @@ cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	else
 		pa = txq->io_addr | ((segdw - 1) << 4);
 
-	if (!sched_type)
-		roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);
+	if (!CNXK_TAG_IS_HEAD(ws->gw_rdata) && !sched_type)
+		ws->gw_rdata = roc_sso_hws_head_wait(ws->base);
 
 	roc_lmt_submit_steorl(lmt_id, pa);
 }
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 0f58e00e7f..b85c647ec8 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -704,7 +704,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
 
 	/* Head wait if needed */
 	if (base)
-		roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+		roc_sso_hws_head_wait(base);
 
 	/* ESN */
 	outb_priv = roc_nix_inl_onf_ipsec_outb_sa_sw_rsvd((void *)sa);
@@ -797,7 +797,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 					     flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
-			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base);
 			cn9k_sso_txq_fc_wait(txq);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
@@ -810,7 +810,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
-			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base);
 			cn9k_sso_txq_fc_wait(txq);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_one(cmd, txq->lmt_addr,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index eea1597a05..2bc044a6df 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -47,6 +47,7 @@
 #define CNXK_CLR_SUB_EVENT(x)	    (~(0xffu << 20) & x)
 #define CNXK_GRP_FROM_TAG(x)	    (((x) >> 36) & 0x3ff)
 #define CNXK_SWTAG_PEND(x)	    (BIT_ULL(62) & x)
+#define CNXK_TAG_IS_HEAD(x)	    (BIT_ULL(35) & x)
 
 #define CN9K_SSOW_GET_BASE_ADDR(_GW) ((_GW)-SSOW_LF_GWS_OP_GET_WORK0)
 
@@ -123,6 +124,7 @@ struct cnxk_sso_evdev {
 
 struct cn10k_sso_hws {
 	uint64_t base;
+	uint64_t gw_rdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
index 9f9ceab8a1..7de03f3fbb 100644
--- a/drivers/event/cnxk/cnxk_worker.h
+++ b/drivers/event/cnxk/cnxk_worker.h
@@ -52,11 +52,11 @@ cnxk_sso_hws_swtag_flush(uint64_t tag_op, uint64_t flush_op)
 	plt_write64(0, flush_op);
 }
 
-static __rte_always_inline void
+static __rte_always_inline uint64_t
 cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 {
-#ifdef RTE_ARCH_ARM64
 	uint64_t swtp;
+#ifdef RTE_ARCH_ARM64
 
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[swtb], [%[swtp_loc]]	\n"
@@ -70,9 +70,12 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 		     : [swtp_loc] "r"(tag_op));
 #else
 	/* Wait for the SWTAG/SWTAG_FULL operation */
-	while (plt_read64(tag_op) & BIT_ULL(62))
-		;
+	do {
+		swtp = plt_read64(tag_op);
+	} while (swtp & BIT_ULL(62));
 #endif
+
+	return swtp;
 }
 
 #endif
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 1eff0e568c..47214a0fc0 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -905,8 +905,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 			lnum++;
 	}
 
-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);
 
 	left -= burst;
 	tx_pkts += burst;
@@ -1041,8 +1041,8 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 		}
 	}
 
-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);
 
 	left -= burst;
 	tx_pkts += burst;
@@ -2582,8 +2582,8 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 	if (flags & (NIX_TX_MULTI_SEG_F | NIX_TX_OFFLOAD_SECURITY_F))
 		wd.data[0] >>= 16;
 
-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);
 
 	left -= burst;
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 3/4] event/cnxk: disable default wait time for dequeue
  2022-01-19  7:13 [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
  2022-01-19  7:13 ` [PATCH v2 2/4] event/cnxk: store and reuse workslot status pbhagavatula
@ 2022-01-19  7:13 ` pbhagavatula
  2022-01-19  7:13 ` [PATCH v2 4/4] net/cnxk: improve Rx performance pbhagavatula
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2022-01-19  7:13 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Setting the WAITW bit enables a default minimum dequeue timeout of
1 us. Avoid this minimum timeout by setting WAITW only when
dequeue_timeout is configured.
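
A condensed sketch of the change (assuming the WAITW bit position
BIT(16) used in this patch; the helper name is illustrative):

/* Build the GET_WORK request word: mask set 0 is always selected,
 * WAITW (wait for work) is requested only when the application has
 * configured a dequeue timeout, so the 1 us hardware minimum is not
 * imposed by default.
 */
static inline uint64_t
sso_gw_wdata_sketch(uint64_t deq_tmo_ns)
{
	uint64_t wdata = 1; /* use mask set 0 */

	if (deq_tmo_ns)
		wdata |= 1ULL << 16; /* WAITW */
	return wdata;
}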

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_eventdev.c |  8 +++++--
 drivers/event/cnxk/cn9k_eventdev.c  |  9 ++++++-
 drivers/event/cnxk/cn9k_worker.h    | 37 +++++++++++++----------------
 drivers/event/cnxk/cnxk_eventdev.c  |  2 +-
 drivers/event/cnxk/cnxk_eventdev.h  |  2 ++
 5 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index c57e45a118..380d1ede69 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -15,7 +15,10 @@
 static uint32_t
 cn10k_sso_gw_mode_wdata(struct cnxk_sso_evdev *dev)
 {
-	uint32_t wdata = BIT(16) | 1;
+	uint32_t wdata = 1;
+
+	if (dev->deq_tmo_ns)
+		wdata |= BIT(16);
 
 	switch (dev->gw_mode) {
 	case CN10K_GW_MODE_NONE:
@@ -88,7 +91,8 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
 	ws->xaq_lmt = dev->xaq_lmt;
 
 	/* Set get_work timeout for HWS */
-	val = NSEC2USEC(dev->deq_tmo_ns) - 1;
+	val = NSEC2USEC(dev->deq_tmo_ns);
+	val = val ? val - 1 : 0;
 	plt_write64(val, ws->base + SSOW_LF_GWS_NW_TIM);
 }
 
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 3bff327477..6ebd5b435b 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -72,7 +72,8 @@ cn9k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
 	uint64_t val;
 
 	/* Set get_work tmo for HWS */
-	val = dev->deq_tmo_ns ? NSEC2USEC(dev->deq_tmo_ns) - 1 : 0;
+	val = NSEC2USEC(dev->deq_tmo_ns);
+	val = val ? val - 1 : 0;
 	if (dev->dual_ws) {
 		dws = hws;
 		dws->grp_base = grp_base;
@@ -696,6 +697,9 @@ cn9k_sso_init_hws_mem(void *arg, uint8_t port_id)
 		dws->hws_id = port_id;
 		dws->swtag_req = 0;
 		dws->vws = 0;
+		if (dev->deq_tmo_ns)
+			dws->gw_wdata = BIT_ULL(16);
+		dws->gw_wdata |= 1;
 
 		data = dws;
 	} else {
@@ -714,6 +718,9 @@ cn9k_sso_init_hws_mem(void *arg, uint8_t port_id)
 		ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
 		ws->hws_id = port_id;
 		ws->swtag_req = 0;
+		if (dev->deq_tmo_ns)
+			ws->gw_wdata = BIT_ULL(16);
+		ws->gw_wdata |= 1;
 
 		data = ws;
 	}
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index b85c647ec8..e44422ec25 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -149,10 +149,8 @@ cn9k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
 static __rte_always_inline uint16_t
 cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 			   struct rte_event *ev, const uint32_t flags,
-			   const void *const lookup_mem,
-			   struct cnxk_timesync_info *const tstamp)
+			   struct cn9k_sso_hws_dual *dws)
 {
-	const uint64_t set_gw = BIT_ULL(16) | 1;
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
@@ -161,7 +159,7 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 	uint64_t mbuf;
 
 	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
-		rte_prefetch_non_temporal(lookup_mem);
+		rte_prefetch_non_temporal(dws->lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "rty%=:					\n"
@@ -175,14 +173,14 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
 		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(base + SSOW_LF_GWS_TAG),
-		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP), [gw] "r"(set_gw),
+		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP), [gw] "r"(dws->gw_wdata),
 		       [pong] "r"(pair_base + SSOW_LF_GWS_OP_GET_WORK0));
 #else
 	gw.u64[0] = plt_read64(base + SSOW_LF_GWS_TAG);
 	while ((BIT_ULL(63)) & gw.u64[0])
 		gw.u64[0] = plt_read64(base + SSOW_LF_GWS_TAG);
 	gw.u64[1] = plt_read64(base + SSOW_LF_GWS_WQP);
-	plt_write64(set_gw, pair_base + SSOW_LF_GWS_OP_GET_WORK0);
+	plt_write64(dws->gw_wdata, pair_base + SSOW_LF_GWS_OP_GET_WORK0);
 	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
@@ -202,12 +200,13 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
 			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
 					 gw.u64[0] & 0xFFFFF, flags,
-					 lookup_mem);
+					 dws->lookup_mem);
 			/* Extracting tstamp, if PTP enabled*/
 			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
 							    gw.u64[1]) +
 						   CNXK_SSO_WQE_SG_PTR);
-			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf,
+						dws->tstamp,
 						flags & NIX_RX_OFFLOAD_TSTAMP_F,
 						flags & NIX_RX_MULTI_SEG_F,
 						(uint64_t *)tstamp_ptr);
@@ -232,9 +231,7 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev,
 	uint64_t tstamp_ptr;
 	uint64_t mbuf;
 
-	plt_write64(BIT_ULL(16) | /* wait for work. */
-			    1,	  /* Use Mask set 0. */
-		    ws->base + SSOW_LF_GWS_OP_GET_WORK0);
+	plt_write64(ws->gw_wdata, ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 
 	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
 		rte_prefetch_non_temporal(lookup_mem);
@@ -532,9 +529,9 @@ NIX_RX_FASTPATH_MODES
 						SSOW_LF_GWS_TAG);              \
 			return 1;                                              \
 		}                                                              \
-		gw = cn9k_sso_hws_dual_get_work(                               \
-			dws->base[dws->vws], dws->base[!dws->vws], ev, flags,  \
-			dws->lookup_mem, dws->tstamp);                         \
+		gw = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],           \
+						dws->base[!dws->vws], ev,      \
+						flags, dws);                   \
 		dws->vws = !dws->vws;                                          \
 		return gw;                                                     \
 	}
@@ -558,14 +555,14 @@ NIX_RX_FASTPATH_MODES
 						SSOW_LF_GWS_TAG);              \
 			return ret;                                            \
 		}                                                              \
-		ret = cn9k_sso_hws_dual_get_work(                              \
-			dws->base[dws->vws], dws->base[!dws->vws], ev, flags,  \
-			dws->lookup_mem, dws->tstamp);                         \
+		ret = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],          \
+						 dws->base[!dws->vws], ev,     \
+						 flags, dws);                  \
 		dws->vws = !dws->vws;                                          \
 		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
-			ret = cn9k_sso_hws_dual_get_work(                      \
-				dws->base[dws->vws], dws->base[!dws->vws], ev, \
-				flags, dws->lookup_mem, dws->tstamp);          \
+			ret = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],  \
+							 dws->base[!dws->vws], \
+							 ev, flags, dws);      \
 			dws->vws = !dws->vws;                                  \
 		}                                                              \
 		return ret;                                                    \
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index f7a5026250..997ea87f3b 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -610,7 +610,7 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
 	}
 
 	dev->is_timeout_deq = 0;
-	dev->min_dequeue_timeout_ns = USEC2NSEC(1);
+	dev->min_dequeue_timeout_ns = 0;
 	dev->max_dequeue_timeout_ns = USEC2NSEC(0x3FF);
 	dev->max_num_events = -1;
 	dev->nb_event_queues = 0;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 2bc044a6df..a54dcb4a79 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -144,6 +144,7 @@ struct cn10k_sso_hws {
 /* Event port a.k.a GWS */
 struct cn9k_sso_hws {
 	uint64_t base;
+	uint64_t gw_wdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
@@ -160,6 +161,7 @@ struct cn9k_sso_hws {
 
 struct cn9k_sso_hws_dual {
 	uint64_t base[2]; /* Ping and Pong */
+	uint64_t gw_wdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 4/4] net/cnxk: improve Rx performance
  2022-01-19  7:13 [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
  2022-01-19  7:13 ` [PATCH v2 2/4] event/cnxk: store and reuse workslot status pbhagavatula
  2022-01-19  7:13 ` [PATCH v2 3/4] event/cnxk: disable default wait time for dequeue pbhagavatula
@ 2022-01-19  7:13 ` pbhagavatula
  2022-02-07 14:03 ` [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue Jerin Jacob
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2022-01-19  7:13 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve vWQE and CQ Rx performance by tuning prefetches to the 64B
cacheline size.
Also, prefetch the vWQE pointer array at cacheline-aligned offsets.
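
The cacheline-stride prefetch pattern as a standalone sketch (array
and function names are illustrative):

#include <rte_common.h>
#include <rte_prefetch.h>

#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))

/* Touch each 64B cacheline of a pointer array once instead of
 * prefetching every element.
 */
static inline void
prefetch_ptr_array(void **ptrs, uint16_t nb_elem)
{
	uint16_t i;

	rte_prefetch_non_temporal(&ptrs[0]);
	for (i = OBJS_PER_CLINE; i < nb_elem; i += OBJS_PER_CLINE)
		rte_prefetch_non_temporal(&ptrs[i]);
}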

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h | 25 +++++++++++++++----------
 drivers/net/cnxk/cn10k_rx.h       |  8 ++++----
 drivers/net/cnxk/cn9k_rx.h        | 20 ++++++++++----------
 3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index ba82aa58dd..1e61a6ddf0 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,11 +118,17 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 	uint8_t loff = 0;
 	uint64_t sa_base;
 	uint64_t **wqe;
+	int i;
 
 	mbuf_init |= ((uint64_t)port_id) << 48;
 	vec = (struct rte_event_vector *)vwqe;
 	wqe = vec->u64s;
 
+	rte_prefetch_non_temporal(&vec->ptrs[0]);
+#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
+	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
+		rte_prefetch_non_temporal(&vec->ptrs[i]);
+
 	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
 	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
 					      flags | NIX_RX_VWQE_F, lookup_mem,
@@ -191,15 +197,13 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		uint64_t u64[2];
 	} gw;
 	uint64_t tstamp_ptr;
-	uint64_t mbuf;
 
 	gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
 	asm volatile(
 		PLT_CPU_FEATURE_PREAMBLE
-		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-		"sub %[mbuf], %H[wdata], #0x80				\n"
-		: [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
+		"caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
+		: [wdata] "+r"(gw.get_work)
 		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
@@ -208,14 +212,12 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
-	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 	ws->gw_rdata = gw.u64[0];
-	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
-		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
-		    (gw.u64[0] & 0xffffffff);
-
-	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+	if (gw.u64[1]) {
+		gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
+			    (gw.u64[0] & (0x3FFull << 36)) << 4 |
+			    (gw.u64[0] & 0xffffffff);
 		if ((flags & CPT_RX_WQE_F) &&
 		    (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 		     RTE_EVENT_TYPE_CRYPTODEV)) {
@@ -223,7 +225,10 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		} else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 			   RTE_EVENT_TYPE_ETHDEV) {
 			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+			uint64_t mbuf;
 
+			mbuf = gw.u64[1] - sizeof(struct rte_mbuf);
+			rte_prefetch0((void *)mbuf);
 			if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
 				struct rte_mbuf *m;
 				uintptr_t sa_base;
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index a2442d3726..9694a3080f 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -610,10 +610,10 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		}
 
 		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 8, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 9, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 10, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 11, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
 
 		/* Get NIX_RX_SG_S for size and buffer pointer */
 		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index b038b1a6ef..fa4efbf80a 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -342,16 +342,16 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		ol_flags =
 			nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);
 
-	mbuf->pkt_len = len;
-	mbuf->data_len = len;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
-
 	mbuf->ol_flags = ol_flags;
+	*(uint64_t *)(&mbuf->rearm_data) = val;
+	mbuf->pkt_len = len;
 
-	if (flag & NIX_RX_MULTI_SEG_F)
+	if (flag & NIX_RX_MULTI_SEG_F) {
 		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
-	else
+	} else {
+		mbuf->data_len = len;
 		mbuf->next = NULL;
+	}
 }
 
 static inline uint16_t
@@ -723,10 +723,6 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
 
-		/* Store the mbufs to rx_pkts */
-		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
-		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
-
 		if (flags & NIX_RX_MULTI_SEG_F) {
 			/* Multi segment is enable build mseg list for
 			 * individual mbufs in scalar mode.
@@ -751,6 +747,10 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			mbuf3->next = NULL;
 		}
 
+		/* Store the mbufs to rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
+		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue
  2022-01-19  7:13 [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
                   ` (2 preceding siblings ...)
  2022-01-19  7:13 ` [PATCH v2 4/4] net/cnxk: improve Rx performance pbhagavatula
@ 2022-02-07 14:03 ` Jerin Jacob
  2022-02-10 10:13 ` [PATCH v3] " pbhagavatula
  2022-02-10 10:19 ` [PATCH v3 1/3] event/cnxk: store and reuse workslot status pbhagavatula
  5 siblings, 0 replies; 16+ messages in thread
From: Jerin Jacob @ 2022-02-07 14:03 UTC (permalink / raw)
  To: Pavan Nikhilesh
  Cc: Jerin Jacob, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Ankur Dwivedi, Anoob Joseph, Tejasree Kondoj,
	Shijith Thotton, dpdk-dev

On Wed, Jan 19, 2022 at 12:43 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Tx command is prepared based on offloads enabled and stored in
> Tx queue structure at tx_queue_setup phase.
> In fastpath the command is copied from Tx queue to LMT line for
> all the packets.
> Since, the command contents are mostly constants we can move the
> command preparation to fastpath and avoid accessing Tx queue
> memory.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  Depends-on: Series-20922
>  Depends-on: Series-20928
>
>  v2 Changes:
>  - Rebase.
>  - Fix incorrect use of RoC API

# Please rebase to next-net-mrvl
# Please split this series into two: 1/4 can go through next-net-mrvl
and 2..4/4 can go through next-event
#

> +                       }
> +               }
> +       }
> +
> +       size *= sizeof(uint64_t);
> +
> +       if (size) {
> +               uint64_t *otxq_data = dev->tx_adptr_data;
> +
> +               txq_data = malloc(size);
>                 if (txq_data == NULL)
>                         return -ENOMEM;
> +               memset(txq_data, 0, size);
> +               txq_data[eth_port_id] = ((uint64_t)row) << 48;
> +               txq_data[row + tx_queue_id] = (uint64_t)txq;
> +
> +               if (otxq_data != NULL) {
> +                       for (i = 0; i < dev->max_queue_id[0] + 1; i++) {
> +                               txq_data[i] |= (otxq_data[i] &
> +                                               ~((BIT_ULL(16) - 1) << 48));
> +                       }
> +
> +                       if (eth_port_id > max_port_id) {
> +                               dev->max_queue_id[0] = RTE_MAX(
> +                                       dev->max_queue_id[0], eth_port_id);
> +                               dev->max_port_id =
> +                                       RTE_MAX(dev->max_port_id, eth_port_id);
> +
> +                               for (i = 1; i < eth_port_id; i++) {
> +                                       offset +=
> +                                               (dev->max_queue_id[i - 1] + 1);
> +                                       txq_data[i] |= offset << 48;
> +                                       for (j = 0;
> +                                            (i < dev->max_port_id) &&
> +                                            (j < dev->max_queue_id[i] + 1);
> +                                            j++) {
> +
> +                                               txq_data[offset + j] = otxq_data
> +                                                       [(otxq_data[i] >> 48) +
> +                                                        j];
> +                                       }
> +                               }
> +                               dev->max_queue_id[eth_port_id] =
> +                                       RTE_MAX(dev->max_queue_id[eth_port_id],
> +                                               tx_queue_id);
> +                       } else if (tx_queue_id > max_queue_id) {
> +                               dev->max_queue_id[eth_port_id] =
> +                                       RTE_MAX(dev->max_queue_id[eth_port_id],
> +                                               tx_queue_id);
> +                               dev->max_port_id =
> +                                       RTE_MAX(max_port_id, eth_port_id);
> +                               for (i = 1; i < max_port_id + 1; i++) {
> +                                       offset +=
> +                                               (dev->max_queue_id[i - 1] + 1);
> +                                       txq_data[i] |= offset << 48;
> +                                       for (j = 0;
> +                                            j < dev->max_queue_id[i] + 1;
> +                                            j++) {
> +                                               if (i == eth_port_id &&
> +                                                   j > max_queue_id)
> +                                                       continue;
> +                                               txq_data[offset + j] = otxq_data
> +                                                       [(otxq_data[i] >> 48) +
> +                                                        j];
> +                                       }

Please move this into a separate function across the patch to reduce the clutter.

This will fix the following issue too
[for-next-net]dell[dpdk-next-net-mrvl] $ ./devtools/checkpatches.sh -n 1

### net/cnxk: avoid command copy from Tx queue

WARNING:DEEP_INDENTATION: Too many leading tabs - consider code refactoring
#742: FILE: drivers/event/cnxk/cnxk_eventdev_adptr.c:439:
+                                               if (i == eth_port_id &&

total: 0 errors, 1 warnings, 0 checks, 1412 lines checked

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v3] net/cnxk: avoid command copy from Tx queue
  2022-01-19  7:13 [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
                   ` (3 preceding siblings ...)
  2022-02-07 14:03 ` [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue Jerin Jacob
@ 2022-02-10 10:13 ` pbhagavatula
  2022-02-10 10:19   ` Jerin Jacob
  2022-02-10 13:15   ` [PATCH v4] " pbhagavatula
  2022-02-10 10:19 ` [PATCH v3 1/3] event/cnxk: store and reuse workslot status pbhagavatula
  5 siblings, 2 replies; 16+ messages in thread
From: pbhagavatula @ 2022-02-10 10:13 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Ankur Dwivedi, Anoob Joseph, Tejasree Kondoj,
	Pavan Nikhilesh, Shijith Thotton
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

The Tx command is prepared based on the enabled offloads and stored in
the Tx queue structure during the tx_queue_setup phase.
In the fastpath the command is copied from the Tx queue to the LMT line
for every packet.
Since the command contents are mostly constants, we can move the
command preparation to the fastpath and avoid accessing Tx queue
memory.
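
A simplified sketch of the approach (assuming the cnxk driver headers
for NIX_SUBDC_*, BIT_ULL and the NIX_TX_* flags; the function name and
argument layout are illustrative, not the driver's actual skeleton
helper): the subdescriptor words that are constant for a given offload
flag set are generated in the fastpath, and only the per-queue send
header word is read from the Tx queue.

/* Build the constant part of the Tx command in the fastpath. */
static inline void
nix_tx_skeleton_sketch(uint64_t send_hdr_w0, uint64_t *cmd,
		       const uint16_t flags)
{
	cmd[0] = send_hdr_w0; /* per-queue NIX_SUBDC_HDR word 0 */
	cmd[1] = 0;           /* filled per packet */

	if (flags & NIX_TX_NEED_EXT_HDR) {
		cmd[2] = NIX_SUBDC_EXT << 60; /* EXT word 1 set per packet */
		cmd[3] = 0;
		cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
	} else {
		cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
	}
}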

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v3 Changes:
 - Rebase.
 - Split patches.
 - Refactoring large function.

 v2 Changes:
 - Rebase.
 - Fix incorrect use of RoC API

 drivers/common/cnxk/roc_io.h             |  33 ++++-
 drivers/common/cnxk/roc_io_generic.h     |  15 ++
 drivers/crypto/cnxk/cn9k_cryptodev_ops.c |   2 +-
 drivers/crypto/cnxk/cn9k_ipsec.c         |   2 +-
 drivers/event/cnxk/cn10k_eventdev.c      |  26 +++-
 drivers/event/cnxk/cn10k_worker.h        |  89 ++++++------
 drivers/event/cnxk/cn9k_eventdev.c       |  33 +++--
 drivers/event/cnxk/cn9k_worker.h         |  64 ++++-----
 drivers/event/cnxk/cnxk_eventdev.h       |  13 +-
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 139 ++++++++++++++++---
 drivers/net/cnxk/cn10k_ethdev.c          |  24 +---
 drivers/net/cnxk/cn10k_ethdev.h          |   3 +-
 drivers/net/cnxk/cn10k_tx.h              | 167 ++++++++++++-----------
 drivers/net/cnxk/cn9k_ethdev.c           |  36 ++---
 drivers/net/cnxk/cn9k_ethdev.h           |   3 +-
 drivers/net/cnxk/cn9k_tx.h               | 135 +++++++++++-------
 16 files changed, 477 insertions(+), 307 deletions(-)

diff --git a/drivers/common/cnxk/roc_io.h b/drivers/common/cnxk/roc_io.h
index 4f15503c29..62e98d9d00 100644
--- a/drivers/common/cnxk/roc_io.h
+++ b/drivers/common/cnxk/roc_io.h
@@ -164,13 +164,36 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
 	dst128[1] = src128[1];
 	/* lmtext receives following value:
 	 * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
-	 * 2: NIX_SUBDC_EXT + NIX_SUBDC_MEM i.e. tstamp case
 	 */
-	if (lmtext) {
+	if (lmtext)
+		dst128[2] = src128[2];
+}
+
+static __plt_always_inline void
+roc_lmt_mov64(void *out, const void *in)
+{
+	volatile const __uint128_t *src128 = (const __uint128_t *)in;
+	volatile __uint128_t *dst128 = (__uint128_t *)out;
+
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+	dst128[3] = src128[3];
+}
+
+static __plt_always_inline void
+roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
+{
+	const __uint128_t *src128 = (const __uint128_t *)in;
+	__uint128_t *dst128 = (__uint128_t *)out;
+
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	/* lmtext receives following value:
+	 * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
+	 */
+	if (lmtext)
 		dst128[2] = src128[2];
-		if (lmtext > 1)
-			dst128[3] = src128[3];
-	}
 }

 static __plt_always_inline void
diff --git a/drivers/common/cnxk/roc_io_generic.h b/drivers/common/cnxk/roc_io_generic.h
index 5f90835c09..42764455cc 100644
--- a/drivers/common/cnxk/roc_io_generic.h
+++ b/drivers/common/cnxk/roc_io_generic.h
@@ -106,6 +106,21 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
 	memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
 }

+static __plt_always_inline void
+roc_lmt_mov64(void *out, const void *in)
+{
+	PLT_SET_USED(out);
+	PLT_SET_USED(in);
+}
+
+static __plt_always_inline void
+roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
+{
+	PLT_SET_USED(in);
+	PLT_SET_USED(lmtext);
+	memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
+}
+
 static __plt_always_inline void
 roc_lmt_mov_seg(void *out, const void *in, const uint16_t segdw)
 {
diff --git a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
index ac1953b66d..ddba9d5dd0 100644
--- a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
+++ b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
@@ -161,7 +161,7 @@ cn9k_cpt_inst_submit(struct cpt_inst_s *inst, uint64_t lmtline,

 	do {
 		/* Copy CPT command to LMTLINE */
-		roc_lmt_mov((void *)lmtline, inst, 2);
+		roc_lmt_mov64((void *)lmtline, inst);

 		/*
 		 * Make sure compiler does not reorder memcpy and ldeor.
diff --git a/drivers/crypto/cnxk/cn9k_ipsec.c b/drivers/crypto/cnxk/cn9k_ipsec.c
index 9f876f75f2..672b65a5d2 100644
--- a/drivers/crypto/cnxk/cn9k_ipsec.c
+++ b/drivers/crypto/cnxk/cn9k_ipsec.c
@@ -53,7 +53,7 @@ cn9k_cpt_enq_sa_write(struct cn9k_ipsec_sa *sa, struct cnxk_cpt_qp *qp,

 	do {
 		/* Copy CPT command to LMTLINE */
-		roc_lmt_mov((void *)lmtline, &inst, 2);
+		roc_lmt_mov64((void *)lmtline, &inst);
 		lmt_status = roc_lmt_submit_ldeor(io_addr);
 	} while (lmt_status == 0);

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 7b7ce44c74..97a88feb13 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -50,7 +50,6 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
 	/* First cache line is reserved for cookie */
 	ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
 	ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
-	ws->tx_base = ws->base;
 	ws->hws_id = port_id;
 	ws->swtag_req = 0;
 	ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
@@ -259,15 +258,13 @@ cn10k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 			ws_cookie,
 			sizeof(struct cnxk_sso_hws_cookie) +
 				sizeof(struct cn10k_sso_hws) +
-				(sizeof(uint64_t) * (dev->max_port_id + 1) *
-				 RTE_MAX_QUEUES_PER_PORT),
+				dev->tx_adptr_data_sz,
 			RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 		if (ws_cookie == NULL)
 			return -ENOMEM;
 		ws = RTE_PTR_ADD(ws_cookie, sizeof(struct cnxk_sso_hws_cookie));
 		memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
-		       sizeof(uint64_t) * (dev->max_port_id + 1) *
-			       RTE_MAX_QUEUES_PER_PORT);
+		       dev->tx_adptr_data_sz);
 		event_dev->data->ports[i] = ws;
 	}

@@ -721,16 +718,35 @@ cn10k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
 			       const struct rte_eth_dev *eth_dev,
 			       int32_t tx_queue_id)
 {
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint64_t tx_offloads;
 	int rc;

 	RTE_SET_USED(id);
 	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
 	if (rc < 0)
 		return rc;
+
+	/* Can't enable tstamp if all the ports don't have it enabled. */
+	tx_offloads = cnxk_eth_dev->tx_offload_flags;
+	if (dev->tx_adptr_configured) {
+		uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+		uint8_t tstmp_ena =
+			!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+
+		if (tstmp_ena && !tstmp_req)
+			dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+		else if (!tstmp_ena && tstmp_req)
+			tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+	}
+
+	dev->tx_offloads |= tx_offloads;
 	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
 	if (rc < 0)
 		return rc;
 	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+	dev->tx_adptr_configured = 1;

 	return 0;
 }
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 4019c13bd2..ff08b2d974 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -455,18 +455,18 @@ NIX_RX_FASTPATH_MODES
 	}

 static __rte_always_inline struct cn10k_eth_txq *
-cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
-			  const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
 {
-	return (struct cn10k_eth_txq *)
-		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+	return (struct cn10k_eth_txq
+			*)(txq_data[(txq_data[m->port] >> 48) +
+				    rte_event_eth_tx_adapter_txq_get(m)] &
+			   (BIT_ULL(48) - 1));
 }

 static __rte_always_inline void
-cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
-		 uintptr_t lmt_addr, uint8_t sched_type, uintptr_t base,
-		 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-		 const uint32_t flags)
+cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
+		 uint16_t lmt_id, uintptr_t lmt_addr, uint8_t sched_type,
+		 const uint64_t *txq_data, const uint32_t flags)
 {
 	uint8_t lnum = 0, loff = 0, shft = 0;
 	struct cn10k_eth_txq *txq;
@@ -476,7 +476,7 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
 	bool sec;

 	txq = cn10k_sso_hws_xtract_meta(m, txq_data);
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	cn10k_nix_tx_skeleton(txq, cmd, flags, 0);
 	/* Perform header writes before barrier
 	 * for TSO
 	 */
@@ -501,23 +501,23 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
 	else
 		segdw = cn10k_nix_tx_ext_subs(flags) + 2;

+	cn10k_nix_xmit_prepare_tstamp(txq, laddr, m->ol_flags, segdw, flags);
 	if (flags & NIX_TX_OFFLOAD_SECURITY_F && sec)
 		pa = txq->cpt_io_addr | 3 << 4;
 	else
 		pa = txq->io_addr | ((segdw - 1) << 4);

 	if (!sched_type)
-		roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+		roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);

 	roc_lmt_submit_steorl(lmt_id, pa);
 }

 static __rte_always_inline void
-cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
-			uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
-			uint8_t sched_type, uintptr_t base,
-			const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-			const uint32_t flags)
+cn10k_sso_vwqe_split_tx(struct cn10k_sso_hws *ws, struct rte_mbuf **mbufs,
+			uint16_t nb_mbufs, uint64_t *cmd, uint16_t lmt_id,
+			uintptr_t lmt_addr, uint8_t sched_type,
+			const uint64_t *txq_data, const uint32_t flags)
 {
 	uint16_t port[4], queue[4];
 	uint16_t i, j, pkts, scalar;
@@ -540,14 +540,16 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
 		if (((port[0] ^ port[1]) & (port[2] ^ port[3])) ||
 		    ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) {
 			for (j = 0; j < 4; j++)
-				cn10k_sso_tx_one(mbufs[i + j], cmd, lmt_id,
-						 lmt_addr, sched_type, base,
-						 txq_data, flags);
+				cn10k_sso_tx_one(ws, mbufs[i + j], cmd, lmt_id,
+						 lmt_addr, sched_type, txq_data,
+						 flags);
 		} else {
-			txq = (struct cn10k_eth_txq *)
-				txq_data[port[0]][queue[0]];
-			cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd,
-						   base + SSOW_LF_GWS_TAG,
+			txq = (struct cn10k_eth_txq
+				       *)(txq_data[(txq_data[port[0]] >> 48) +
+						   queue[0]] &
+					  (BIT_ULL(48) - 1));
+			cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws,
+						   &mbufs[i], 4, cmd,
 						   flags | NIX_TX_VWQE_F);
 		}
 	}
@@ -555,15 +557,14 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
 	mbufs += i;

 	for (i = 0; i < scalar; i++) {
-		cn10k_sso_tx_one(mbufs[i], cmd, lmt_id, lmt_addr, sched_type,
-				 base, txq_data, flags);
+		cn10k_sso_tx_one(ws, mbufs[i], cmd, lmt_id, lmt_addr,
+				 sched_type, txq_data, flags);
 	}
 }

 static __rte_always_inline uint16_t
 cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
-		       uint64_t *cmd,
-		       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		       uint64_t *cmd, const uint64_t *txq_data,
 		       const uint32_t flags)
 {
 	struct cn10k_eth_txq *txq;
@@ -580,17 +581,19 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		uint64_t meta = *(uint64_t *)ev->vec;

 		if (meta & BIT(31)) {
-			txq = (struct cn10k_eth_txq *)
-				txq_data[meta >> 32][meta >> 48];
-
-			cn10k_nix_xmit_pkts_vector(
-				txq, mbufs, meta & 0xFFFF, cmd,
-				ws->tx_base + SSOW_LF_GWS_TAG,
-				flags | NIX_TX_VWQE_F);
+			txq = (struct cn10k_eth_txq
+				       *)(txq_data[(txq_data[meta >> 32] >>
+						    48) +
+						   (meta >> 48)] &
+					  (BIT_ULL(48) - 1));
+
+			cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws, mbufs,
+						   meta & 0xFFFF, cmd,
+						   flags | NIX_TX_VWQE_F);
 		} else {
 			cn10k_sso_vwqe_split_tx(
-				mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
-				ev->sched_type, ws->tx_base, txq_data, flags);
+				ws, mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
+				ev->sched_type, txq_data, flags);
 		}
 		rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
 		return (meta & 0xFFFF);
@@ -598,16 +601,16 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,

 	m = ev->mbuf;
 	ref_cnt = m->refcnt;
-	cn10k_sso_tx_one(m, cmd, lmt_id, lmt_addr, ev->sched_type, ws->tx_base,
-			 txq_data, flags);
+	cn10k_sso_tx_one(ws, m, cmd, lmt_id, lmt_addr, ev->sched_type, txq_data,
+			 flags);

 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 		if (ref_cnt > 1)
 			return 1;
 	}

-	cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
-				 ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+	cnxk_sso_hws_swtag_flush(ws->base + SSOW_LF_GWS_TAG,
+				 ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
 	return 1;
 }

@@ -628,9 +631,7 @@ NIX_TX_FASTPATH_MODES
 		uint64_t cmd[sz];                                              \
 		RTE_SET_USED(nb_events);                                       \
 		return cn10k_sso_hws_event_tx(                                 \
-			ws, &ev[0], cmd,                                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
+			ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
 			flags);                                                \
 	}

@@ -642,9 +643,7 @@ NIX_TX_FASTPATH_MODES
 		struct cn10k_sso_hws *ws = port;                               \
 		RTE_SET_USED(nb_events);                                       \
 		return cn10k_sso_hws_event_tx(                                 \
-			ws, &ev[0], cmd,                                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
+			ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
 			(flags) | NIX_TX_MULTI_SEG_F);                         \
 	}

diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 4611936b7f..f8652d4fbc 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -259,17 +259,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 				ws_cookie,
 				sizeof(struct cnxk_sso_hws_cookie) +
 					sizeof(struct cn9k_sso_hws_dual) +
-					(sizeof(uint64_t) *
-					 (dev->max_port_id + 1) *
-					 RTE_MAX_QUEUES_PER_PORT),
+					dev->tx_adptr_data_sz,
 				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 			if (ws_cookie == NULL)
 				return -ENOMEM;
 			dws = RTE_PTR_ADD(ws_cookie,
 					  sizeof(struct cnxk_sso_hws_cookie));
 			memcpy(&dws->tx_adptr_data, dev->tx_adptr_data,
-			       sizeof(uint64_t) * (dev->max_port_id + 1) *
-				       RTE_MAX_QUEUES_PER_PORT);
+			       dev->tx_adptr_data_sz);
 			event_dev->data->ports[i] = dws;
 		} else {
 			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
@@ -280,17 +277,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 				ws_cookie,
 				sizeof(struct cnxk_sso_hws_cookie) +
 					sizeof(struct cn9k_sso_hws_dual) +
-					(sizeof(uint64_t) *
-					 (dev->max_port_id + 1) *
-					 RTE_MAX_QUEUES_PER_PORT),
+					dev->tx_adptr_data_sz,
 				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 			if (ws_cookie == NULL)
 				return -ENOMEM;
 			ws = RTE_PTR_ADD(ws_cookie,
 					 sizeof(struct cnxk_sso_hws_cookie));
 			memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
-			       sizeof(uint64_t) * (dev->max_port_id + 1) *
-				       RTE_MAX_QUEUES_PER_PORT);
+			       dev->tx_adptr_data_sz);
 			event_dev->data->ports[i] = ws;
 		}
 	}
@@ -987,17 +981,36 @@ cn9k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
 			      const struct rte_eth_dev *eth_dev,
 			      int32_t tx_queue_id)
 {
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint64_t tx_offloads;
 	int rc;

 	RTE_SET_USED(id);
 	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
 	if (rc < 0)
 		return rc;
+
+	/* Can't enable tstamp if all the ports don't have it enabled. */
+	tx_offloads = cnxk_eth_dev->tx_offload_flags;
+	if (dev->tx_adptr_configured) {
+		uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+		uint8_t tstmp_ena =
+			!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+
+		if (tstmp_ena && !tstmp_req)
+			dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+		else if (!tstmp_ena && tstmp_req)
+			tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+	}
+
+	dev->tx_offloads |= tx_offloads;
 	cn9k_sso_txq_fc_update(eth_dev, tx_queue_id, true);
 	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
 	if (rc < 0)
 		return rc;
 	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+	dev->tx_adptr_configured = 1;

 	return 0;
 }
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index c99e459c1b..303b04c215 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -599,20 +599,13 @@ cn9k_sso_txq_fc_wait(const struct cn9k_eth_txq *txq)
 		;
 }

-static __rte_always_inline const struct cn9k_eth_txq *
-cn9k_sso_hws_xtract_meta(struct rte_mbuf *m,
-			 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+static __rte_always_inline struct cn9k_eth_txq *
+cn9k_sso_hws_xtract_meta(struct rte_mbuf *m, uint64_t *txq_data)
 {
-	return (const struct cn9k_eth_txq *)
-		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
-}
-
-static __rte_always_inline void
-cn9k_sso_hws_prepare_pkt(const struct cn9k_eth_txq *txq, struct rte_mbuf *m,
-			 uint64_t *cmd, const uint32_t flags)
-{
-	roc_lmt_mov(cmd, txq->cmd, cn9k_nix_tx_ext_subs(flags));
-	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
+	return (struct cn9k_eth_txq
+			*)(txq_data[(txq_data[m->port] >> 48) +
+				    rte_event_eth_tx_adapter_txq_get(m)] &
+			   (BIT_ULL(48) - 1));
 }

 #if defined(RTE_ARCH_ARM64)
@@ -669,7 +662,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
 	nixtx += BIT_ULL(7);
 	nixtx = (nixtx - 1) & ~(BIT_ULL(7) - 1);

-	roc_lmt_mov((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));
+	roc_lmt_mov_nv((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));

 	/* Load opcode and cptr already prepared at pkt metadata set */
 	pkt_len -= l2_len;
@@ -756,12 +749,11 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,

 static __rte_always_inline uint16_t
 cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
-		      const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-		      const uint32_t flags)
+		      uint64_t *txq_data, const uint32_t flags)
 {
 	struct rte_mbuf *m = ev->mbuf;
-	const struct cn9k_eth_txq *txq;
 	uint16_t ref_cnt = m->refcnt;
+	struct cn9k_eth_txq *txq;

 	/* Perform header writes before barrier for TSO */
 	cn9k_nix_xmit_prepare_tso(m, flags);
@@ -774,7 +766,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	    !(flags & NIX_TX_OFFLOAD_SECURITY_F))
 		rte_io_wmb();
 	txq = cn9k_sso_hws_xtract_meta(m, txq_data);
-	cn9k_sso_hws_prepare_pkt(txq, m, cmd, flags);
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
+	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);

 	if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
 		uint64_t ol_flags = m->ol_flags;
@@ -796,6 +789,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,

 	if (flags & NIX_TX_MULTI_SEG_F) {
 		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, segdw,
+					     flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
 			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
@@ -808,6 +803,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 					       segdw);
 		}
 	} else {
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
 			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
@@ -853,11 +849,9 @@ NIX_TX_FASTPATH_MODES
 		struct cn9k_sso_hws *ws = port;                                \
 		uint64_t cmd[sz];                                              \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base, &ev[0], cmd,                                 \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			flags);                                                \
+		return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     flags);                           \
 	}

 #define SSO_TX_SEG(fn, sz, flags)                                              \
@@ -867,11 +861,9 @@ NIX_TX_FASTPATH_MODES
 		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
 		struct cn9k_sso_hws *ws = port;                                \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base, &ev[0], cmd,                                 \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			(flags) | NIX_TX_MULTI_SEG_F);                         \
+		return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     (flags) | NIX_TX_MULTI_SEG_F);    \
 	}

 #define SSO_DUAL_TX(fn, sz, flags)                                             \
@@ -881,11 +873,9 @@ NIX_TX_FASTPATH_MODES
 		struct cn9k_sso_hws_dual *ws = port;                           \
 		uint64_t cmd[sz];                                              \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base[!ws->vws], &ev[0], cmd,                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			flags);                                                \
+		return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     flags);                           \
 	}

 #define SSO_DUAL_TX_SEG(fn, sz, flags)                                         \
@@ -895,11 +885,9 @@ NIX_TX_FASTPATH_MODES
 		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
 		struct cn9k_sso_hws_dual *ws = port;                           \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base[!ws->vws], &ev[0], cmd,                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			(flags) | NIX_TX_MULTI_SEG_F);                         \
+		return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     (flags) | NIX_TX_MULTI_SEG_F);    \
 	}

 #endif
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 4652b58a84..b26df58588 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -99,7 +99,10 @@ struct cnxk_sso_evdev {
 	uint16_t rx_adptr_pool_cnt;
 	uint64_t *rx_adptr_pools;
 	uint64_t *tx_adptr_data;
+	size_t tx_adptr_data_sz;
 	uint16_t max_port_id;
+	uint16_t max_queue_id[RTE_MAX_ETHPORTS];
+	uint8_t tx_adptr_configured;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -131,8 +134,8 @@ struct cn10k_sso_hws {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint64_t tx_base __rte_cache_aligned;
-	uintptr_t lmt_base;
+	uintptr_t lmt_base __rte_cache_aligned;
+	uint64_t lso_tun_fmt;
 	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

@@ -149,7 +152,8 @@ struct cn9k_sso_hws {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint8_t tx_adptr_data[] __rte_cache_aligned;
+	uint64_t lso_tun_fmt __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

 struct cn9k_sso_hws_dual {
@@ -165,7 +169,8 @@ struct cn9k_sso_hws_dual {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint8_t tx_adptr_data[] __rte_cache_aligned;
+	uint64_t lso_tun_fmt __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

 struct cnxk_sso_hws_cookie {
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index fdcd68ca63..82ef315807 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -339,30 +339,140 @@ cnxk_sso_sqb_aura_limit_edit(struct roc_nix_sq *sq, uint16_t nb_sqb_bufs)
 		sq->aura_handle, RTE_MIN(nb_sqb_bufs, sq->aura_sqb_bufs));
 }

+static void
+cnxk_sso_tx_queue_data_init(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
+			    uint16_t eth_port_id, uint16_t tx_queue_id)
+{
+	uint64_t offset = 0;
+	int i, j;
+
+	dev->max_queue_id[0] = RTE_MAX(dev->max_queue_id[0], eth_port_id);
+	for (i = 1; i < eth_port_id; i++) {
+		offset += (dev->max_queue_id[i - 1] + 1);
+		txq_data[i] |= offset << 48;
+	}
+	dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
+	dev->max_queue_id[eth_port_id] =
+		RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
+}
+
+static void
+cnxk_sso_tx_queue_data_rewrite(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
+			       uint16_t eth_port_id, uint16_t tx_queue_id,
+			       uint64_t *otxq_data, uint16_t max_port_id,
+			       uint16_t max_queue_id)
+{
+	uint64_t offset = 0;
+	int i, j;
+
+	for (i = 0; i < dev->max_queue_id[0] + 1; i++)
+		txq_data[i] |= (otxq_data[i] & ~((BIT_ULL(16) - 1) << 48));
+
+	if (eth_port_id > max_port_id) {
+		dev->max_queue_id[0] =
+			RTE_MAX(dev->max_queue_id[0], eth_port_id);
+		dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
+
+		for (i = 1; i < eth_port_id; i++) {
+			offset += (dev->max_queue_id[i - 1] + 1);
+			txq_data[i] |= offset << 48;
+			for (j = 0; (i < dev->max_port_id) &&
+				    (j < dev->max_queue_id[i] + 1);
+			     j++)
+				txq_data[offset + j] =
+					otxq_data[(otxq_data[i] >> 48) + j];
+		}
+		dev->max_queue_id[eth_port_id] =
+			RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
+	} else if (tx_queue_id > max_queue_id) {
+		dev->max_queue_id[eth_port_id] =
+			RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
+		dev->max_port_id = RTE_MAX(max_port_id, eth_port_id);
+		for (i = 1; i < max_port_id + 1; i++) {
+			offset += (dev->max_queue_id[i - 1] + 1);
+			txq_data[i] |= offset << 48;
+			for (j = 0; j < dev->max_queue_id[i] + 1; j++) {
+				if (i == eth_port_id && j > max_queue_id)
+					continue;
+				txq_data[offset + j] =
+					otxq_data[(otxq_data[i] >> 48) + j];
+			}
+		}
+	}
+}
+
 static int
 cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
 			    uint16_t eth_port_id, uint16_t tx_queue_id,
 			    void *txq)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t max_queue_id = dev->max_queue_id[eth_port_id];
 	uint16_t max_port_id = dev->max_port_id;
-	uint64_t *txq_data = dev->tx_adptr_data;
-
-	if (txq_data == NULL || eth_port_id > max_port_id) {
-		max_port_id = RTE_MAX(max_port_id, eth_port_id);
-		txq_data = rte_realloc_socket(
-			txq_data,
-			(sizeof(uint64_t) * (max_port_id + 1) *
-			 RTE_MAX_QUEUES_PER_PORT),
-			RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
+	uint64_t offset = 0, row = 0;
+	uint64_t *txq_data = NULL;
+	size_t size = 0;
+	int i, j;
+
+	if (((uint64_t)txq) & 0xFFFF000000000000)
+		return -EINVAL;
+
+	if (dev->tx_adptr_data == NULL) {
+		size = (eth_port_id + 1);
+		size += (eth_port_id + tx_queue_id);
+		row = 2 * eth_port_id;
+	} else {
+		if (eth_port_id > max_port_id) {
+			size = (RTE_MAX(eth_port_id, dev->max_queue_id[0]) + 1);
+			for (i = 1; i < eth_port_id; i++)
+				size += (dev->max_queue_id[i] + 1);
+			row = size;
+			size += (tx_queue_id + 1);
+		} else if (tx_queue_id > max_queue_id) {
+			size = !eth_port_id ? tx_queue_id + 1 :
+						    RTE_MAX(max_port_id,
+						      dev->max_queue_id[0]) +
+						      1;
+			for (i = 1; i < max_port_id + 1; i++) {
+				if (i == eth_port_id) {
+					row = size;
+					size += tx_queue_id + 1;
+				} else {
+					size += dev->max_queue_id[i] + 1;
+				}
+			}
+		}
+	}
+
+	size *= sizeof(uint64_t);
+
+	if (size) {
+		uint64_t *otxq_data = dev->tx_adptr_data;
+
+		txq_data = malloc(size);
 		if (txq_data == NULL)
 			return -ENOMEM;
+		memset(txq_data, 0, size);
+		txq_data[eth_port_id] = ((uint64_t)row) << 48;
+		txq_data[row + tx_queue_id] = (uint64_t)txq;
+
+		if (otxq_data != NULL)
+			cnxk_sso_tx_queue_data_rewrite(
+				dev, txq_data, eth_port_id, tx_queue_id,
+				otxq_data, max_port_id, max_queue_id);
+		else
+			cnxk_sso_tx_queue_data_init(dev, txq_data, eth_port_id,
+						    tx_queue_id);
+		dev->tx_adptr_data_sz = size;
+		free(otxq_data);
+		dev->tx_adptr_data = txq_data;
+	} else {
+		txq_data = dev->tx_adptr_data;
+		row = txq_data[eth_port_id] >> 48;
+		txq_data[row + tx_queue_id] &= ~(BIT_ULL(48) - 1);
+		txq_data[row + tx_queue_id] |= (uint64_t)txq;
 	}

-	((uint64_t(*)[RTE_MAX_QUEUES_PER_PORT])
-		 txq_data)[eth_port_id][tx_queue_id] = (uint64_t)txq;
-	dev->max_port_id = max_port_id;
-	dev->tx_adptr_data = txq_data;
 	return 0;
 }

@@ -372,7 +482,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
 			      int32_t tx_queue_id)
 {
 	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
-	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
 	struct roc_nix_sq *sq;
 	int i, ret;
 	void *txq;
@@ -388,8 +497,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
 			event_dev, eth_dev->data->port_id, tx_queue_id, txq);
 		if (ret < 0)
 			return ret;
-
-		dev->tx_offloads |= cnxk_eth_dev->tx_offload_flags;
 	}

 	return 0;
diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
index 8378cbffc2..9bb08e1824 100644
--- a/drivers/net/cnxk/cn10k_ethdev.c
+++ b/drivers/net/cnxk/cn10k_ethdev.c
@@ -131,53 +131,31 @@ static void
 nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn10k_eth_txq *txq,
 		      uint16_t qid)
 {
-	struct nix_send_ext_s *send_hdr_ext;
 	union nix_send_hdr_w0_u send_hdr_w0;
-	struct nix_send_mem_s *send_mem;
-	union nix_send_sg_s sg_w0;
-
-	RTE_SET_USED(dev);

 	/* Initialize the fields based on basic single segment packet */
-	memset(&txq->cmd, 0, sizeof(txq->cmd));
 	send_hdr_w0.u = 0;
-	sg_w0.u = 0;
-
 	if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
 		/* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
 		send_hdr_w0.sizem1 = 2;
-
-		send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[0];
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
 		if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 			/* Default: one seg packet would have:
 			 * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
 			 * => 8/2 - 1 = 3
 			 */
 			send_hdr_w0.sizem1 = 3;
-			send_hdr_ext->w0.tstmp = 1;

 			/* To calculate the offset for send_mem,
 			 * send_hdr->w0.sizem1 * 2
 			 */
-			send_mem = (struct nix_send_mem_s *)(txq->cmd + 2);
-			send_mem->w0.subdc = NIX_SUBDC_MEM;
-			send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP;
-			send_mem->addr = dev->tstamp.tx_tstamp_iova;
+			txq->ts_mem = dev->tstamp.tx_tstamp_iova;
 		}
 	} else {
 		/* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
 		send_hdr_w0.sizem1 = 1;
 	}
-
 	send_hdr_w0.sq = qid;
-	sg_w0.subdc = NIX_SUBDC_SG;
-	sg_w0.segs = 1;
-	sg_w0.ld_type = NIX_SENDLDTYPE_LDD;
-
 	txq->send_hdr_w0 = send_hdr_w0.u;
-	txq->sg_w0 = sg_w0.u;
-
 	rte_wmb();
 }

diff --git a/drivers/net/cnxk/cn10k_ethdev.h b/drivers/net/cnxk/cn10k_ethdev.h
index 0982158c62..ec40e53152 100644
--- a/drivers/net/cnxk/cn10k_ethdev.h
+++ b/drivers/net/cnxk/cn10k_ethdev.h
@@ -9,7 +9,6 @@

 struct cn10k_eth_txq {
 	uint64_t send_hdr_w0;
-	uint64_t sg_w0;
 	int64_t fc_cache_pkts;
 	uint64_t *fc_mem;
 	uintptr_t lmt_base;
@@ -20,8 +19,8 @@ struct cn10k_eth_txq {
 	uint64_t sa_base;
 	uint64_t *cpt_fc;
 	uint16_t cpt_desc;
-	uint64_t cmd[4];
 	uint64_t lso_tun_fmt;
+	uint64_t ts_mem;
 } __plt_cache_aligned;

 struct cn10k_eth_rxq {
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index fc1f6ceb8c..4ae6bbf517 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -186,23 +186,26 @@ cn10k_cpt_tx_steor_data(void)
 }

 static __rte_always_inline void
-cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
-		      const uint16_t flags)
+cn10k_nix_tx_skeleton(struct cn10k_eth_txq *txq, uint64_t *cmd,
+		      const uint16_t flags, const uint16_t static_sz)
 {
-	/* Send hdr */
-	cmd[0] = txq->send_hdr_w0;
+	if (static_sz)
+		cmd[0] = txq->send_hdr_w0;
+	else
+		cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			 ((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
 	cmd[1] = 0;
-	cmd += 2;

-	/* Send ext if present */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		*(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
-		cmd += 2;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
+			cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
+		else
+			cmd[2] = NIX_SUBDC_EXT << 60;
+		cmd[3] = 0;
+		cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	} else {
+		cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
 	}
-
-	/* Send sg */
-	cmd[0] = txq->sg_w0;
-	cmd[1] = 0;
 }

 static __rte_always_inline void
@@ -718,41 +721,29 @@ cn10k_nix_xmit_mv_lmt_base(uintptr_t lmt_addr, uint64_t *cmd,
 }

 static __rte_always_inline void
-cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
+cn10k_nix_xmit_prepare_tstamp(struct cn10k_eth_txq *txq, uintptr_t lmt_addr,
 			      const uint64_t ol_flags, const uint16_t no_segdw,
 			      const uint16_t flags)
 {
 	if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-		const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
-		struct nix_send_ext_s *send_hdr_ext =
-			(struct nix_send_ext_s *)lmt_addr + 16;
+		const uint8_t is_ol_tstamp =
+			!(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
 		uint64_t *lmt = (uint64_t *)lmt_addr;
 		uint16_t off = (no_segdw - 1) << 1;
 		struct nix_send_mem_s *send_mem;

 		send_mem = (struct nix_send_mem_s *)(lmt + off);
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
-		send_hdr_ext->w0.tstmp = 1;
-		if (flags & NIX_TX_MULTI_SEG_F) {
-			/* Retrieving the default desc values */
-			lmt[off] = cmd[2];
-
-			/* Using compiler barrier to avoid violation of C
-			 * aliasing rules.
-			 */
-			rte_compiler_barrier();
-		}
-
-		/* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
+		/* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
 		 * should not be recorded, hence changing the alg type to
-		 * NIX_SENDMEMALG_SET and also changing send mem addr field to
+		 * NIX_SENDMEMALG_SUB and also changing send mem addr field to
 		 * next 8 bytes as it corrupts the actual Tx tstamp registered
 		 * address.
 		 */
 		send_mem->w0.subdc = NIX_SUBDC_MEM;
-		send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
+		send_mem->w0.alg =
+			NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);
 		send_mem->addr =
-			(rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
+			(rte_iova_t)(((uint64_t *)txq->ts_mem) + is_ol_tstamp);
 	}
 }

@@ -841,8 +832,8 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
-		    uint64_t *cmd, uintptr_t base, const uint16_t flags)
+cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
+		    uint16_t pkts, uint64_t *cmd, const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	const rte_iova_t io_addr = txq->io_addr;
@@ -863,9 +854,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 		/* Reduce the cached count */
 		txq->fc_cache_pkts -= pkts;
 	}
-
 	/* Get cmd skeleton */
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));

 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		lso_tun_fmt = txq->lso_tun_fmt;
@@ -909,14 +899,14 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 		/* Move NIX desc to LMT/NIXTX area */
 		cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
-		cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
-					      tx_pkts[i]->ol_flags, 4, flags);
+		cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
+					      4, flags);
 		if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec)
 			lnum++;
 	}

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -967,9 +957,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
-			 uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			 const uint16_t flags)
+cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
+			 struct rte_mbuf **tx_pkts, uint16_t pkts,
+			 uint64_t *cmd, const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	uintptr_t pa0, pa1, lbase = txq->lmt_base;
@@ -987,12 +977,13 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uintptr_t laddr;
 	bool sec;

-	NIX_XMIT_FC_OR_RETURN(txq, pkts);
-
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
-
-	/* Reduce the cached count */
-	txq->fc_cache_pkts -= pkts;
+	if (!(flags & NIX_TX_VWQE_F)) {
+		NIX_XMIT_FC_OR_RETURN(txq, pkts);
+		/* Reduce the cached count */
+		txq->fc_cache_pkts -= pkts;
+	}
+	/* Get cmd skeleton */
+	cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));

 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		lso_tun_fmt = txq->lso_tun_fmt;
@@ -1038,13 +1029,11 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,

 		/* Move NIX desc to LMT/NIXTX area */
 		cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
-
 		/* Store sg list directly on lmt line */
 		segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)laddr,
 					       flags);
-		cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
-					      tx_pkts[i]->ol_flags, segdw,
-					      flags);
+		cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
+					      segdw, flags);
 		if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec) {
 			lnum++;
 			data128 |= (((__uint128_t)(segdw - 1)) << shft);
@@ -1053,7 +1042,7 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	}

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -1474,9 +1463,9 @@ cn10k_nix_xmit_store(struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			   const uint16_t flags)
+cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
+			   struct rte_mbuf **tx_pkts, uint16_t pkts,
+			   uint64_t *cmd, const uint16_t flags)
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
@@ -1526,25 +1515,42 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
 	}

-	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
+	if (!(flags & NIX_TX_VWQE_F)) {
+		senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
+	} else {
+		uint64_t w0 =
+			(txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
+
+		senddesc01_w0 = vdupq_n_u64(w0);
+	}
 	senddesc23_w0 = senddesc01_w0;
+
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
-	sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
+	sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
 	sgdesc23_w0 = sgdesc01_w0;

-	/* Load command defaults into vector variables. */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
-		sendext23_w0 = sendext01_w0;
-		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
-		sendext23_w1 = sendext01_w1;
 		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
+						   BIT_ULL(15));
+			sendmem01_w0 =
+				vdupq_n_u64((NIX_SUBDC_MEM << 60) |
+					    (NIX_SENDMEMALG_SETTSTMP << 56));
 			sendmem23_w0 = sendmem01_w0;
-			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
+			sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
 			sendmem23_w1 = sendmem01_w1;
+		} else {
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
 		}
+		sendext23_w0 = sendext01_w0;
+
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
+			sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		else
+			sendext01_w1 = vdupq_n_u64(0);
+		sendext23_w1 = sendext01_w1;
 	}

 	/* Get LMT base address and LMT ID as lcore id */
@@ -2577,7 +2583,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		wd.data[0] >>= 16;

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;

@@ -2640,12 +2646,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,

 	if (unlikely(scalar)) {
 		if (flags & NIX_TX_MULTI_SEG_F)
-			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
-							 scalar, cmd, base,
-							 flags);
+			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, ws, tx_pkts,
+							 scalar, cmd, flags);
 		else
-			pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
-						    cmd, base, flags);
+			pkts += cn10k_nix_xmit_pkts(tx_queue, ws, tx_pkts,
+						    scalar, cmd, flags);
 	}

 	return pkts;
@@ -2653,16 +2658,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,

 #else
 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			   const uint16_t flags)
+cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
+			   struct rte_mbuf **tx_pkts, uint16_t pkts,
+			   uint64_t *cmd, const uint16_t flags)
 {
+	RTE_SET_USED(ws);
 	RTE_SET_USED(tx_queue);
 	RTE_SET_USED(tx_pkts);
 	RTE_SET_USED(pkts);
 	RTE_SET_USED(cmd);
 	RTE_SET_USED(flags);
-	RTE_SET_USED(base);
 	return 0;
 }
 #endif
@@ -2892,7 +2897,7 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, 0,    \
+		return cn10k_nix_xmit_pkts(tx_queue, NULL, tx_pkts, pkts, cmd, \
 					   flags);                             \
 	}

@@ -2905,8 +2910,8 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,  \
-						0,                             \
+		return cn10k_nix_xmit_pkts_mseg(tx_queue, NULL, tx_pkts, pkts, \
+						cmd,                           \
 						flags | NIX_TX_MULTI_SEG_F);   \
 	}

@@ -2919,8 +2924,8 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts,     \
-						  cmd, 0, (flags));            \
+		return cn10k_nix_xmit_pkts_vector(tx_queue, NULL, tx_pkts,     \
+						  pkts, cmd, (flags));         \
 	}

 #define NIX_TX_XMIT_VEC_MSEG(fn, sz, flags)                                    \
@@ -2933,7 +2938,7 @@ NIX_TX_FASTPATH_MODES
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(                             \
-			tx_queue, tx_pkts, pkts, cmd, 0,                       \
+			tx_queue, NULL, tx_pkts, pkts, cmd,                    \
 			(flags) | NIX_TX_MULTI_SEG_F);                         \
 	}

diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
index d34bc6898f..01e3850561 100644
--- a/drivers/net/cnxk/cn9k_ethdev.c
+++ b/drivers/net/cnxk/cn9k_ethdev.c
@@ -131,51 +131,31 @@ static void
 nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn9k_eth_txq *txq,
 		      uint16_t qid)
 {
-	struct nix_send_ext_s *send_hdr_ext;
-	struct nix_send_hdr_s *send_hdr;
-	struct nix_send_mem_s *send_mem;
-	union nix_send_sg_s *sg;
+	union nix_send_hdr_w0_u send_hdr_w0;

 	/* Initialize the fields based on basic single segment packet */
-	memset(&txq->cmd, 0, sizeof(txq->cmd));
-
+	send_hdr_w0.u = 0;
 	if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
-		send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
 		/* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
-		send_hdr->w0.sizem1 = 2;
-
-		send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[2];
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
+		send_hdr_w0.sizem1 = 2;
 		if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 			/* Default: one seg packet would have:
 			 * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
 			 * => 8/2 - 1 = 3
 			 */
-			send_hdr->w0.sizem1 = 3;
-			send_hdr_ext->w0.tstmp = 1;
+			send_hdr_w0.sizem1 = 3;

 			/* To calculate the offset for send_mem,
 			 * send_hdr->w0.sizem1 * 2
 			 */
-			send_mem = (struct nix_send_mem_s *)
-				(txq->cmd + (send_hdr->w0.sizem1 << 1));
-			send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
-			send_mem->w0.cn9k.alg = NIX_SENDMEMALG_SETTSTMP;
-			send_mem->addr = dev->tstamp.tx_tstamp_iova;
+			txq->ts_mem = dev->tstamp.tx_tstamp_iova;
 		}
-		sg = (union nix_send_sg_s *)&txq->cmd[4];
 	} else {
-		send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
 		/* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
-		send_hdr->w0.sizem1 = 1;
-		sg = (union nix_send_sg_s *)&txq->cmd[2];
+		send_hdr_w0.sizem1 = 1;
 	}
-
-	send_hdr->w0.sq = qid;
-	sg->subdc = NIX_SUBDC_SG;
-	sg->segs = 1;
-	sg->ld_type = NIX_SENDLDTYPE_LDD;
-
+	send_hdr_w0.sq = qid;
+	txq->send_hdr_w0 = send_hdr_w0.u;
 	rte_wmb();
 }

diff --git a/drivers/net/cnxk/cn9k_ethdev.h b/drivers/net/cnxk/cn9k_ethdev.h
index 2b452fe009..8ab924944c 100644
--- a/drivers/net/cnxk/cn9k_ethdev.h
+++ b/drivers/net/cnxk/cn9k_ethdev.h
@@ -9,12 +9,13 @@
 #include <cnxk_security_ar.h>

 struct cn9k_eth_txq {
-	uint64_t cmd[8];
+	uint64_t send_hdr_w0;
 	int64_t fc_cache_pkts;
 	uint64_t *fc_mem;
 	void *lmt_addr;
 	rte_iova_t io_addr;
 	uint64_t lso_tun_fmt;
+	uint64_t ts_mem;
 	uint16_t sqes_per_sqb_log2;
 	int16_t nb_sqb_bufs_adj;
 	rte_iova_t cpt_io_addr;
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 8564dd85ee..d23e4b61b4 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -58,6 +58,29 @@ cn9k_nix_tx_ext_subs(const uint16_t flags)
 				  : 0);
 }

+static __rte_always_inline void
+cn9k_nix_tx_skeleton(struct cn9k_eth_txq *txq, uint64_t *cmd,
+		     const uint16_t flags, const uint16_t static_sz)
+{
+	if (static_sz)
+		cmd[0] = txq->send_hdr_w0;
+	else
+		cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			 ((uint64_t)(cn9k_nix_tx_ext_subs(flags) + 1) << 40);
+	cmd[1] = 0;
+
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
+			cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
+		else
+			cmd[2] = NIX_SUBDC_EXT << 60;
+		cmd[3] = 0;
+		cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	} else {
+		cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	}
+}
+
 static __rte_always_inline void
 cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 {
@@ -136,11 +159,11 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 		w1.u = 0;
 	}

-	if (!(flags & NIX_TX_MULTI_SEG_F)) {
+	if (!(flags & NIX_TX_MULTI_SEG_F))
 		send_hdr->w0.total = m->data_len;
-		send_hdr->w0.aura =
-			roc_npa_aura_handle_to_aura(m->pool->pool_id);
-	}
+	else
+		send_hdr->w0.total = m->pkt_len;
+	send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);

 	/*
 	 * L3type:  2 => IPV4
@@ -287,41 +310,39 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 		/* Mark mempool object as "put" since it is freed by NIX */
 		if (!send_hdr->w0.df)
 			RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
+	} else {
+		sg->seg1_size = m->data_len;
+		*(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
+
+		/* NOFF is handled later for multi-seg */
 	}
 }

 static __rte_always_inline void
-cn9k_nix_xmit_prepare_tstamp(uint64_t *cmd, const uint64_t *send_mem_desc,
+cn9k_nix_xmit_prepare_tstamp(struct cn9k_eth_txq *txq, uint64_t *cmd,
 			     const uint64_t ol_flags, const uint16_t no_segdw,
 			     const uint16_t flags)
 {
 	if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 		struct nix_send_mem_s *send_mem;
 		uint16_t off = (no_segdw - 1) << 1;
-		const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
+		const uint8_t is_ol_tstamp =
+			!(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);

 		send_mem = (struct nix_send_mem_s *)(cmd + off);
-		if (flags & NIX_TX_MULTI_SEG_F) {
-			/* Retrieving the default desc values */
-			cmd[off] = send_mem_desc[6];

-			/* Using compiler barrier to avoid violation of C
-			 * aliasing rules.
-			 */
-			rte_compiler_barrier();
-		}
-
-		/* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
+		/* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
 		 * should not be recorded, hence changing the alg type to
-		 * NIX_SENDMEMALG_SET and also changing send mem addr field to
+		 * NIX_SENDMEMALG_SUB and also changing send mem addr field to
 		 * next 8 bytes as it corrupts the actual Tx tstamp registered
 		 * address.
 		 */
+		send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
 		send_mem->w0.cn9k.alg =
-			NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
+			NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);

-		send_mem->addr = (rte_iova_t)((uint64_t *)send_mem_desc[7] +
-					      (is_ol_tstamp));
+		send_mem->addr = (rte_iova_t)(((uint64_t *)txq->ts_mem) +
+				(is_ol_tstamp));
 	}
 }

@@ -367,8 +388,6 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 	uint8_t off, i;

 	send_hdr = (struct nix_send_hdr_s *)cmd;
-	send_hdr->w0.total = m->pkt_len;
-	send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);

 	if (flags & NIX_TX_NEED_EXT_HDR)
 		off = 2;
@@ -376,13 +395,29 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 		off = 0;

 	sg = (union nix_send_sg_s *)&cmd[2 + off];
-	/* Clear sg->u header before use */
-	sg->u &= 0xFC00000000000000;
+
+	/* Start from second segment, first segment is already there */
+	i = 1;
 	sg_u = sg->u;
-	slist = &cmd[3 + off];
+	nb_segs = m->nb_segs - 1;
+	m_next = m->next;
+	slist = &cmd[3 + off + 1];

-	i = 0;
-	nb_segs = m->nb_segs;
+	/* Set invert df if buffer is not to be freed by H/W */
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		rte_io_wmb();
+	}
+
+	/* Mark mempool object as "put" since it is freed by NIX */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	if (!(sg_u & (1ULL << 55)))
+		RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
+	rte_io_wmb();
+#endif
+	m = m_next;
+	if (!m)
+		goto done;

 	/* Fill mbuf segments */
 	do {
@@ -417,6 +452,7 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 		m = m_next;
 	} while (nb_segs);

+done:
 	sg->u = sg_u;
 	sg->segs = i;
 	segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
@@ -472,7 +508,7 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 	NIX_XMIT_FC_OR_RETURN(txq, pkts);

-	roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 1);

 	/* Perform header writes before barrier for TSO */
 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
@@ -490,8 +526,8 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 	for (i = 0; i < pkts; i++) {
 		cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
-		cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
-					     tx_pkts[i]->ol_flags, 4, flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags, 4,
+					     flags);
 		cn9k_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
 	}

@@ -514,7 +550,7 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,

 	NIX_XMIT_FC_OR_RETURN(txq, pkts);

-	roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 1);

 	/* Perform header writes before barrier for TSO */
 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
@@ -533,9 +569,8 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	for (i = 0; i < pkts; i++) {
 		cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
 		segdw = cn9k_nix_prepare_mseg(tx_pkts[i], cmd, flags);
-		cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
-					     tx_pkts[i]->ol_flags, segdw,
-					     flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags,
+					     segdw, flags);
 		cn9k_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
 	}

@@ -862,28 +897,34 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
 		rte_io_wmb();

-	senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
+	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
 	senddesc23_w0 = senddesc01_w0;
+
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
+	sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
+	sgdesc23_w0 = sgdesc01_w0;

-	/* Load command defaults into vector variables. */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		sendext01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-		sendext23_w0 = sendext01_w0;
-		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
-		sendext23_w1 = sendext01_w1;
-		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
-		sgdesc23_w0 = sgdesc01_w0;
 		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[6]);
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
+						   BIT_ULL(15));
+			sendmem01_w0 =
+				vdupq_n_u64((NIX_SUBDC_MEM << 60) |
+					    (NIX_SENDMEMALG_SETTSTMP << 56));
 			sendmem23_w0 = sendmem01_w0;
-			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[7]);
+			sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
 			sendmem23_w1 = sendmem01_w1;
+		} else {
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
 		}
-	} else {
-		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-		sgdesc23_w0 = sgdesc01_w0;
+		sendext23_w0 = sendext01_w0;
+
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
+			sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		else
+			sendext01_w1 = vdupq_n_u64(0);
+		sendext23_w1 = sendext01_w1;
 	}

 	for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
--
2.17.1


* Re: [PATCH v3] net/cnxk: avoid command copy from Tx queue
  2022-02-10 10:13 ` [PATCH v3] " pbhagavatula
@ 2022-02-10 10:19   ` Jerin Jacob
  2022-02-10 13:15   ` [PATCH v4] " pbhagavatula
  1 sibling, 0 replies; 16+ messages in thread
From: Jerin Jacob @ 2022-02-10 10:19 UTC (permalink / raw)
  To: Pavan Nikhilesh
  Cc: Jerin Jacob, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Ankur Dwivedi, Anoob Joseph, Tejasree Kondoj,
	Shijith Thotton, dpdk-dev

On Thu, Feb 10, 2022 at 3:43 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Tx command is prepared based on offloads enabled and stored in
> Tx queue structure at tx_queue_setup phase.
> In fastpath the command is copied from Tx queue to LMT line for
> all the packets.
> Since, the command contents are mostly constants we can move the
> command preparation to fastpath and avoid accessing Tx queue
> memory.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  v3 Changes:
>  - Rebase.
>  - Split patches.
>  - Refactoring large function.
>
>  v2 Changes:
>  - Rebase.
>  - Fix incorrect use of RoC API
>
>  drivers/common/cnxk/roc_io.h             |  33 ++++-
> +static void
> +cnxk_sso_tx_queue_data_init(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                           uint16_t eth_port_id, uint16_t tx_queue_id)
> +{
> +       uint64_t offset = 0;
> +       int i, j;
> +
> +       dev->max_queue_id[0] = RTE_MAX(dev->max_queue_id[0], eth_port_id);
> +       for (i = 1; i < eth_port_id; i++) {
> +               offset += (dev->max_queue_id[i - 1] + 1);
> +               txq_data[i] |= offset << 48;
> +       }
> +       dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
> +       dev->max_queue_id[eth_port_id] =
> +               RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_rewrite(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                              uint16_t eth_port_id, uint16_t tx_queue_id,
> +                              uint64_t *otxq_data, uint16_t max_port_id,
> +                              uint16_t max_queue_id)
> +{
> +       uint64_t offset = 0;
> +       int i, j;
> +
> +       for (i = 0; i < dev->max_queue_id[0] + 1; i++)
> +               txq_data[i] |= (otxq_data[i] & ~((BIT_ULL(16) - 1) << 48));
> +
> +       if (eth_port_id > max_port_id) {
> +               dev->max_queue_id[0] =
> +                       RTE_MAX(dev->max_queue_id[0], eth_port_id);
> +               dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
> +
> +               for (i = 1; i < eth_port_id; i++) {
> +                       offset += (dev->max_queue_id[i - 1] + 1);
> +                       txq_data[i] |= offset << 48;
> +                       for (j = 0; (i < dev->max_port_id) &&
> +                                   (j < dev->max_queue_id[i] + 1);
> +                            j++)
> +                               txq_data[offset + j] =
> +                                       otxq_data[(otxq_data[i] >> 48) + j];
> +               }
> +               dev->max_queue_id[eth_port_id] =
> +                       RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);

Could you move this to a separate static function? Too much nesting depth


> +       } else if (tx_queue_id > max_queue_id) {
> +               dev->max_queue_id[eth_port_id] =
> +                       RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
> +               dev->max_port_id = RTE_MAX(max_port_id, eth_port_id);
> +               for (i = 1; i < max_port_id + 1; i++) {
> +                       offset += (dev->max_queue_id[i - 1] + 1);
> +                       txq_data[i] |= offset << 48;
> +                       for (j = 0; j < dev->max_queue_id[i] + 1; j++) {
> +                               if (i == eth_port_id && j > max_queue_id)
> +                                       continue;
> +                               txq_data[offset + j] =
> +                                       otxq_data[(otxq_data[i] >> 48) + j];
> +                       }
> +               }
> +       }
> +}

Could you move this to a separate static function? Too much nesting depth


> +
>  static int
>  cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
>                             uint16_t eth_port_id, uint16_t tx_queue_id,
>                             void *txq)
>  {
>         struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> +       uint16_t max_queue_id = dev->max_queue_id[eth_port_id];
>         uint16_t max_port_id = dev->max_port_id;
> -       uint64_t *txq_data = dev->tx_adptr_data;
> -
> -       if (txq_data == NULL || eth_port_id > max_port_id) {
> -               max_port_id = RTE_MAX(max_port_id, eth_port_id);
> -               txq_data = rte_realloc_socket(
> -                       txq_data,
> -                       (sizeof(uint64_t) * (max_port_id + 1) *
> -                        RTE_MAX_QUEUES_PER_PORT),
> -                       RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
> +       uint64_t offset = 0, row = 0;
> +       uint64_t *txq_data = NULL;
> +       size_t size = 0;
> +       int i, j;
> +
> +       if (((uint64_t)txq) & 0xFFFF000000000000)
> +               return -EINVAL;
> +
> +       if (dev->tx_adptr_data == NULL) {
> +               size = (eth_port_id + 1);
> +               size += (eth_port_id + tx_queue_id);
> +               row = 2 * eth_port_id;
> +       } else {
> +               if (eth_port_id > max_port_id) {
> +                       size = (RTE_MAX(eth_port_id, dev->max_queue_id[0]) + 1);
> +                       for (i = 1; i < eth_port_id; i++)
> +                               size += (dev->max_queue_id[i] + 1);
> +                       row = size;
> +                       size += (tx_queue_id + 1);
> +               } else if (tx_queue_id > max_queue_id) {
> +                       size = !eth_port_id ? tx_queue_id + 1 :
> +                                                   RTE_MAX(max_port_id,
> +                                                     dev->max_queue_id[0]) +
> +                                                     1;

See below
> +                       for (i = 1; i < max_port_id + 1; i++) {
> +                               if (i == eth_port_id) {
> +                                       row = size;
> +                                       size += tx_queue_id + 1;
> +                               } else {
> +                                       size += dev->max_queue_id[i] + 1;
> +                               }
> +                       }
> +               }
> +       }
Could you move this to a separate static function? Too much nesting depth

The rest looks good.
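
For illustration, one possible shape of the requested helper (a hypothetical
sketch, not the refactor that actually landed in v4; the state struct and
names are simplified stand-ins, and the port-0 corner case handled by the
original code is skipped):

#include <stdint.h>
#include <stddef.h>

struct adptr_state {
	uint16_t max_port_id;
	uint16_t max_queue_id[64]; /* illustrative bound */
};

/* Return the number of uint64_t slots needed for the flattened
 * port/queue table after adding (port, queue); *row receives the index
 * where the port's queue entries start. Returns 0 when the existing
 * table is already large enough.
 */
static size_t
adptr_tbl_resize(const struct adptr_state *s, uint16_t port,
		 uint16_t queue, uint64_t *row)
{
	size_t size = 0;
	int i;

	if (port > s->max_port_id) {
		size = (port > s->max_queue_id[0] ?
			port : s->max_queue_id[0]) + 1;
		for (i = 1; i < port; i++)
			size += s->max_queue_id[i] + 1;
		*row = size;
		size += queue + 1;
	} else if (queue > s->max_queue_id[port]) {
		size = s->max_queue_id[0] + 1;
		for (i = 1; i <= s->max_port_id; i++) {
			if (i == port) {
				*row = size;
				size += queue + 1;
			} else {
				size += s->max_queue_id[i] + 1;
			}
		}
	}
	return size;
}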

* [PATCH v3 1/3] event/cnxk: store and reuse workslot status
  2022-01-19  7:13 [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
                   ` (4 preceding siblings ...)
  2022-02-10 10:13 ` [PATCH v3] " pbhagavatula
@ 2022-02-10 10:19 ` pbhagavatula
  2022-02-10 10:19   ` [PATCH v3 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
                     ` (2 more replies)
  5 siblings, 3 replies; 16+ messages in thread
From: pbhagavatula @ 2022-02-10 10:19 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Pavan Nikhilesh, Shijith Thotton
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Store and reuse the workslot status for TT, GRP and HEAD
instead of reading it from the GWC, since every GWC read imposes
additional latency.
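
For illustration, a minimal standalone sketch of the caching idea, assuming
a memory-mapped 64-bit tag/status register. read_gws_reg(), struct hws and
the field names are hypothetical stand-ins for the driver internals; only
the HEAD bit position (bit 35) is taken from this patch:

#include <stdint.h>

#define TAG_HEAD_BIT 35 /* HEAD flag position, as used in this patch */

struct hws {
	volatile uint64_t *tag_reg; /* GWC tag/status register */
	uint64_t gw_rdata;          /* last status word, cached */
};

static inline uint64_t
read_gws_reg(struct hws *ws)
{
	return *ws->tag_reg; /* costly device read */
}

/* Reuse the cached status word; only touch the device when the cached
 * copy does not already show HEAD, then refresh the cache.
 */
static inline void
head_wait(struct hws *ws)
{
	uint64_t tag = ws->gw_rdata;

	while (!(tag & (1ULL << TAG_HEAD_BIT)))
		tag = read_gws_reg(ws);
	ws->gw_rdata = tag;
}

The same cached word can then serve the TT and GRP lookups in the forward
path without any further GWC access.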

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 Depends-on: 21590

 v3 Changes:
 - Split and rebase patches.

 v2 Changes:
 - Rebase.
 - Fix incorrect use of RoC API

 drivers/common/cnxk/roc_sso.h      | 14 ++++++++------
 drivers/event/cnxk/cn10k_worker.h  | 16 +++++++++-------
 drivers/event/cnxk/cn9k_worker.h   |  6 +++---
 drivers/event/cnxk/cnxk_eventdev.h |  2 ++
 drivers/event/cnxk/cnxk_worker.h   | 11 +++++++----
 drivers/net/cnxk/cn10k_tx.h        | 12 ++++++------
 6 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index 27d49c6c68..ab7cee1c60 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -54,12 +54,13 @@ struct roc_sso {
 	uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned;
 } __plt_cache_aligned;

-static __plt_always_inline void
-roc_sso_hws_head_wait(uintptr_t tag_op)
+static __plt_always_inline uint64_t
+roc_sso_hws_head_wait(uintptr_t base)
 {
-#ifdef RTE_ARCH_ARM64
+	uintptr_t tag_op = base + SSOW_LF_GWS_TAG;
 	uint64_t tag;

+#if defined(__aarch64__)
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_op]]	\n"
 		     "		tbnz %[tag], 35, done%=		\n"
@@ -71,10 +72,11 @@ roc_sso_hws_head_wait(uintptr_t tag_op)
 		     : [tag] "=&r"(tag)
 		     : [tag_op] "r"(tag_op));
 #else
-	/* Wait for the SWTAG/SWTAG_FULL operation */
-	while (!(plt_read64(tag_op) & BIT_ULL(35)))
-		;
+	do {
+		tag = plt_read64(tag_op);
+	} while (!(tag & BIT_ULL(35)));
 #endif
+	return tag;
 }

 /* SSO device initialization */
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index ff08b2d974..ada230ea1d 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -40,8 +40,7 @@ cn10k_sso_hws_fwd_swtag(struct cn10k_sso_hws *ws, const struct rte_event *ev)
 {
 	const uint32_t tag = (uint32_t)ev->event;
 	const uint8_t new_tt = ev->sched_type;
-	const uint8_t cur_tt =
-		CNXK_TT_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0));
+	const uint8_t cur_tt = CNXK_TT_FROM_TAG(ws->gw_rdata);

 	/* CNXK model
 	 * cur_tt/new_tt     SSO_TT_ORDERED SSO_TT_ATOMIC SSO_TT_UNTAGGED
@@ -81,7 +80,7 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
 	const uint8_t grp = ev->queue_id;

 	/* Group hasn't changed, Use SWTAG to forward the event */
-	if (CNXK_GRP_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0)) == grp)
+	if (CNXK_GRP_FROM_TAG(ws->gw_rdata) == grp)
 		cn10k_sso_hws_fwd_swtag(ws, ev);
 	else
 		/*
@@ -211,6 +210,7 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 	} while (gw.u64[0] & BIT_ULL(63));
 	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
+	ws->gw_rdata = gw.u64[0];
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
@@ -405,7 +405,8 @@ NIX_RX_FASTPATH_MODES
 		RTE_SET_USED(timeout_ticks);                                   \
 		if (ws->swtag_req) {                                           \
 			ws->swtag_req = 0;                                     \
-			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			ws->gw_rdata = cnxk_sso_hws_swtag_wait(                \
+				ws->base + SSOW_LF_GWS_WQE0);                  \
 			return 1;                                              \
 		}                                                              \
 		return cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);  \
@@ -424,7 +425,8 @@ NIX_RX_FASTPATH_MODES
 		uint64_t iter;                                                 \
 		if (ws->swtag_req) {                                           \
 			ws->swtag_req = 0;                                     \
-			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			ws->gw_rdata = cnxk_sso_hws_swtag_wait(                \
+				ws->base + SSOW_LF_GWS_WQE0);                  \
 			return ret;                                            \
 		}                                                              \
 		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
@@ -507,8 +509,8 @@ cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	else
 		pa = txq->io_addr | ((segdw - 1) << 4);

-	if (!sched_type)
-		roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);
+	if (!CNXK_TAG_IS_HEAD(ws->gw_rdata) && !sched_type)
+		ws->gw_rdata = roc_sso_hws_head_wait(ws->base);

 	roc_lmt_submit_steorl(lmt_id, pa);
 }
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 303b04c215..8455272005 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -700,7 +700,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,

 	/* Head wait if needed */
 	if (base)
-		roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+		roc_sso_hws_head_wait(base);

 	/* ESN */
 	outb_priv = roc_nix_inl_onf_ipsec_outb_sa_sw_rsvd((void *)sa);
@@ -793,7 +793,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 					     flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
-			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base);
 			cn9k_sso_txq_fc_wait(txq);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
@@ -806,7 +806,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
-			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base);
 			cn9k_sso_txq_fc_wait(txq);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_one(cmd, txq->lmt_addr,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index b26df58588..ab58508590 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -47,6 +47,7 @@
 #define CNXK_CLR_SUB_EVENT(x)	    (~(0xffu << 20) & x)
 #define CNXK_GRP_FROM_TAG(x)	    (((x) >> 36) & 0x3ff)
 #define CNXK_SWTAG_PEND(x)	    (BIT_ULL(62) & x)
+#define CNXK_TAG_IS_HEAD(x)	    (BIT_ULL(35) & x)

 #define CN9K_SSOW_GET_BASE_ADDR(_GW) ((_GW)-SSOW_LF_GWS_OP_GET_WORK0)

@@ -123,6 +124,7 @@ struct cnxk_sso_evdev {

 struct cn10k_sso_hws {
 	uint64_t base;
+	uint64_t gw_rdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
index 9f9ceab8a1..7de03f3fbb 100644
--- a/drivers/event/cnxk/cnxk_worker.h
+++ b/drivers/event/cnxk/cnxk_worker.h
@@ -52,11 +52,11 @@ cnxk_sso_hws_swtag_flush(uint64_t tag_op, uint64_t flush_op)
 	plt_write64(0, flush_op);
 }

-static __rte_always_inline void
+static __rte_always_inline uint64_t
 cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 {
-#ifdef RTE_ARCH_ARM64
 	uint64_t swtp;
+#ifdef RTE_ARCH_ARM64

 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[swtb], [%[swtp_loc]]	\n"
@@ -70,9 +70,12 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 		     : [swtp_loc] "r"(tag_op));
 #else
 	/* Wait for the SWTAG/SWTAG_FULL operation */
-	while (plt_read64(tag_op) & BIT_ULL(62))
-		;
+	do {
+		swtp = plt_read64(tag_op);
+	} while (swtp & BIT_ULL(62));
 #endif
+
+	return swtp;
 }

 #endif
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 4ae6bbf517..ec6366168c 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -905,8 +905,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 			lnum++;
 	}

-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -1041,8 +1041,8 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 		}
 	}

-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -2582,8 +2582,8 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 	if (flags & (NIX_TX_MULTI_SEG_F | NIX_TX_OFFLOAD_SECURITY_F))
 		wd.data[0] >>= 16;

-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);

 	left -= burst;

--
2.17.1


* [PATCH v3 2/3] event/cnxk: disable default wait time for dequeue
  2022-02-10 10:19 ` [PATCH v3 1/3] event/cnxk: store and reuse workslot status pbhagavatula
@ 2022-02-10 10:19   ` pbhagavatula
  2022-02-10 10:19   ` [PATCH v3 3/3] net/cnxk: improve Rx performance pbhagavatula
  2022-02-10 13:20   ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status pbhagavatula
  2 siblings, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2022-02-10 10:19 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Setting the WAITW bit enables a default minimum dequeue timeout of 1 us.
Avoid this implicit timeout by setting WAITW only when a dequeue_timeout
is configured.
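
A rough standalone sketch of the programming described above; the constants
mirror this patch (BIT(16) is the wait-for-work bit in the get-work word,
bit 0 selects mask set 0, and the NW_TIM value written is the timeout in
microseconds minus one), while the function names are illustrative only:

#include <stdint.h>

#define GW_WAITW     (1u << 16) /* wait for work */
#define GW_MASK_SET0 1u         /* use mask set 0 */

static uint32_t
gw_wdata(uint64_t deq_tmo_ns)
{
	uint32_t wdata = GW_MASK_SET0;

	/* Ask the hardware to wait only when a timeout was configured,
	 * otherwise the implicit 1 us minimum would apply.
	 */
	if (deq_tmo_ns)
		wdata |= GW_WAITW;
	return wdata;
}

static uint64_t
gw_nw_tim_val(uint64_t deq_tmo_ns)
{
	uint64_t us = deq_tmo_ns / 1000; /* ns to us */

	return us ? us - 1 : 0;
}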

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_eventdev.c |  8 +++++--
 drivers/event/cnxk/cn9k_eventdev.c  |  9 ++++++-
 drivers/event/cnxk/cn9k_worker.h    | 37 +++++++++++++----------------
 drivers/event/cnxk/cnxk_eventdev.c  |  2 +-
 drivers/event/cnxk/cnxk_eventdev.h  |  2 ++
 5 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 97a88feb13..26d65e3568 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -15,7 +15,10 @@
 static uint32_t
 cn10k_sso_gw_mode_wdata(struct cnxk_sso_evdev *dev)
 {
-	uint32_t wdata = BIT(16) | 1;
+	uint32_t wdata = 1;
+
+	if (dev->deq_tmo_ns)
+		wdata |= BIT(16);
 
 	switch (dev->gw_mode) {
 	case CN10K_GW_MODE_NONE:
@@ -88,7 +91,8 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
 	ws->xaq_lmt = dev->xaq_lmt;
 
 	/* Set get_work timeout for HWS */
-	val = NSEC2USEC(dev->deq_tmo_ns) - 1;
+	val = NSEC2USEC(dev->deq_tmo_ns);
+	val = val ? val - 1 : 0;
 	plt_write64(val, ws->base + SSOW_LF_GWS_NW_TIM);
 }
 
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index f8652d4fbc..6d3d03c97c 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -72,7 +72,8 @@ cn9k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
 	uint64_t val;
 
 	/* Set get_work tmo for HWS */
-	val = dev->deq_tmo_ns ? NSEC2USEC(dev->deq_tmo_ns) - 1 : 0;
+	val = NSEC2USEC(dev->deq_tmo_ns);
+	val = val ? val - 1 : 0;
 	if (dev->dual_ws) {
 		dws = hws;
 		dws->grp_base = grp_base;
@@ -677,6 +678,9 @@ cn9k_sso_init_hws_mem(void *arg, uint8_t port_id)
 		dws->hws_id = port_id;
 		dws->swtag_req = 0;
 		dws->vws = 0;
+		if (dev->deq_tmo_ns)
+			dws->gw_wdata = BIT_ULL(16);
+		dws->gw_wdata |= 1;
 
 		data = dws;
 	} else {
@@ -695,6 +699,9 @@ cn9k_sso_init_hws_mem(void *arg, uint8_t port_id)
 		ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
 		ws->hws_id = port_id;
 		ws->swtag_req = 0;
+		if (dev->deq_tmo_ns)
+			ws->gw_wdata = BIT_ULL(16);
+		ws->gw_wdata |= 1;
 
 		data = ws;
 	}
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 8455272005..79374b8d95 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -149,10 +149,8 @@ cn9k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
 static __rte_always_inline uint16_t
 cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 			   struct rte_event *ev, const uint32_t flags,
-			   const void *const lookup_mem,
-			   struct cnxk_timesync_info *const tstamp)
+			   struct cn9k_sso_hws_dual *dws)
 {
-	const uint64_t set_gw = BIT_ULL(16) | 1;
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
@@ -161,7 +159,7 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 	uint64_t mbuf;
 
 	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
-		rte_prefetch_non_temporal(lookup_mem);
+		rte_prefetch_non_temporal(dws->lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "rty%=:					\n"
@@ -175,14 +173,14 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
 		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(base + SSOW_LF_GWS_TAG),
-		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP), [gw] "r"(set_gw),
+		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP), [gw] "r"(dws->gw_wdata),
 		       [pong] "r"(pair_base + SSOW_LF_GWS_OP_GET_WORK0));
 #else
 	gw.u64[0] = plt_read64(base + SSOW_LF_GWS_TAG);
 	while ((BIT_ULL(63)) & gw.u64[0])
 		gw.u64[0] = plt_read64(base + SSOW_LF_GWS_TAG);
 	gw.u64[1] = plt_read64(base + SSOW_LF_GWS_WQP);
-	plt_write64(set_gw, pair_base + SSOW_LF_GWS_OP_GET_WORK0);
+	plt_write64(dws->gw_wdata, pair_base + SSOW_LF_GWS_OP_GET_WORK0);
 	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
@@ -202,12 +200,13 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
 			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
 					 gw.u64[0] & 0xFFFFF, flags,
-					 lookup_mem);
+					 dws->lookup_mem);
 			/* Extracting tstamp, if PTP enabled*/
 			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
 							    gw.u64[1]) +
 						   CNXK_SSO_WQE_SG_PTR);
-			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf,
+						dws->tstamp,
 						flags & NIX_RX_OFFLOAD_TSTAMP_F,
 						flags & NIX_RX_MULTI_SEG_F,
 						(uint64_t *)tstamp_ptr);
@@ -232,9 +231,7 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev,
 	uint64_t tstamp_ptr;
 	uint64_t mbuf;
 
-	plt_write64(BIT_ULL(16) | /* wait for work. */
-			    1,	  /* Use Mask set 0. */
-		    ws->base + SSOW_LF_GWS_OP_GET_WORK0);
+	plt_write64(ws->gw_wdata, ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 
 	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
 		rte_prefetch_non_temporal(lookup_mem);
@@ -529,9 +526,9 @@ NIX_RX_FASTPATH_MODES
 						SSOW_LF_GWS_TAG);              \
 			return 1;                                              \
 		}                                                              \
-		gw = cn9k_sso_hws_dual_get_work(                               \
-			dws->base[dws->vws], dws->base[!dws->vws], ev, flags,  \
-			dws->lookup_mem, dws->tstamp);                         \
+		gw = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],           \
+						dws->base[!dws->vws], ev,      \
+						flags, dws);                   \
 		dws->vws = !dws->vws;                                          \
 		return gw;                                                     \
 	}
@@ -554,14 +551,14 @@ NIX_RX_FASTPATH_MODES
 						SSOW_LF_GWS_TAG);              \
 			return ret;                                            \
 		}                                                              \
-		ret = cn9k_sso_hws_dual_get_work(                              \
-			dws->base[dws->vws], dws->base[!dws->vws], ev, flags,  \
-			dws->lookup_mem, dws->tstamp);                         \
+		ret = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],          \
+						 dws->base[!dws->vws], ev,     \
+						 flags, dws);                  \
 		dws->vws = !dws->vws;                                          \
 		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
-			ret = cn9k_sso_hws_dual_get_work(                      \
-				dws->base[dws->vws], dws->base[!dws->vws], ev, \
-				flags, dws->lookup_mem, dws->tstamp);          \
+			ret = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],  \
+							 dws->base[!dws->vws], \
+							 ev, flags, dws);      \
 			dws->vws = !dws->vws;                                  \
 		}                                                              \
 		return ret;                                                    \
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index 6ad4e23e2b..be021d86c9 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -610,7 +610,7 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
 	}
 
 	dev->is_timeout_deq = 0;
-	dev->min_dequeue_timeout_ns = USEC2NSEC(1);
+	dev->min_dequeue_timeout_ns = 0;
 	dev->max_dequeue_timeout_ns = USEC2NSEC(0x3FF);
 	dev->max_num_events = -1;
 	dev->nb_event_queues = 0;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index ab58508590..e3b5ffa7eb 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -144,6 +144,7 @@ struct cn10k_sso_hws {
 /* Event port a.k.a GWS */
 struct cn9k_sso_hws {
 	uint64_t base;
+	uint64_t gw_wdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
@@ -160,6 +161,7 @@ struct cn9k_sso_hws {
 
 struct cn9k_sso_hws_dual {
 	uint64_t base[2]; /* Ping and Pong */
+	uint64_t gw_wdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
-- 
2.17.1


* [PATCH v3 3/3] net/cnxk: improve Rx performance
  2022-02-10 10:19 ` [PATCH v3 1/3] event/cnxk: store and reuse workslot status pbhagavatula
  2022-02-10 10:19   ` [PATCH v3 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
@ 2022-02-10 10:19   ` pbhagavatula
  2022-02-10 13:20   ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status pbhagavatula
  2 siblings, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2022-02-10 10:19 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve vWQE and CQ Rx performance by tuning prefetches to the 64B
cacheline size.
Also, prefetch the vWQE array offsets at cacheline boundaries.
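
A simplified sketch of the vWQE pointer-array prefetch, assuming a 64B
cacheline and using __builtin_prefetch() as a stand-in for the
rte_prefetch_non_temporal() calls in the patch:

#include <stdint.h>

#define CLINE 64
#define OBJS_PER_CLINE (CLINE / sizeof(void *)) /* 8 pointers per line */

static void
prefetch_vec_ptrs(void *const *ptrs, uint16_t nb_elem)
{
	uint16_t i;

	/* Touch the first line, then issue one prefetch per further
	 * cacheline instead of one per element.
	 */
	__builtin_prefetch(&ptrs[0], 0, 0);
	for (i = OBJS_PER_CLINE; i < nb_elem; i += OBJS_PER_CLINE)
		__builtin_prefetch(&ptrs[i], 0, 0);
}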

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h | 25 +++++++++++++++----------
 drivers/net/cnxk/cn10k_rx.h       |  8 ++++----
 drivers/net/cnxk/cn9k_rx.h        | 20 ++++++++++----------
 3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index ada230ea1d..cfe729cef9 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,11 +118,17 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 	uint8_t loff = 0;
 	uint64_t sa_base;
 	uint64_t **wqe;
+	int i;
 
 	mbuf_init |= ((uint64_t)port_id) << 48;
 	vec = (struct rte_event_vector *)vwqe;
 	wqe = vec->u64s;
 
+	rte_prefetch_non_temporal(&vec->ptrs[0]);
+#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
+	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
+		rte_prefetch_non_temporal(&vec->ptrs[i]);
+
 	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
 	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
 					      flags | NIX_RX_VWQE_F, lookup_mem,
@@ -191,15 +197,13 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		uint64_t u64[2];
 	} gw;
 	uint64_t tstamp_ptr;
-	uint64_t mbuf;
 
 	gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
 	asm volatile(
 		PLT_CPU_FEATURE_PREAMBLE
-		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-		"sub %[mbuf], %H[wdata], #0x80				\n"
-		: [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
+		"caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
+		: [wdata] "+r"(gw.get_work)
 		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
@@ -208,14 +212,12 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
-	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 	ws->gw_rdata = gw.u64[0];
-	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
-		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
-		    (gw.u64[0] & 0xffffffff);
-
-	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+	if (gw.u64[1]) {
+		gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
+			    (gw.u64[0] & (0x3FFull << 36)) << 4 |
+			    (gw.u64[0] & 0xffffffff);
 		if ((flags & CPT_RX_WQE_F) &&
 		    (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 		     RTE_EVENT_TYPE_CRYPTODEV)) {
@@ -223,7 +225,10 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		} else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 			   RTE_EVENT_TYPE_ETHDEV) {
 			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+			uint64_t mbuf;
 
+			mbuf = gw.u64[1] - sizeof(struct rte_mbuf);
+			rte_prefetch0((void *)mbuf);
 			if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
 				struct rte_mbuf *m;
 				uintptr_t sa_base;
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 8b00fcc660..564e50f0af 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -610,10 +610,10 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		}
 
 		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 8, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 9, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 10, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 11, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
 
 		/* Get NIX_RX_SG_S for size and buffer pointer */
 		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 1178f95317..d36f292c95 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -388,16 +388,16 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		ol_flags =
 			nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);
 
-	mbuf->pkt_len = len;
-	mbuf->data_len = len;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
-
 	mbuf->ol_flags = ol_flags;
+	*(uint64_t *)(&mbuf->rearm_data) = val;
+	mbuf->pkt_len = len;
 
-	if (flag & NIX_RX_MULTI_SEG_F)
+	if (flag & NIX_RX_MULTI_SEG_F) {
 		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
-	else
+	} else {
+		mbuf->data_len = len;
 		mbuf->next = NULL;
+	}
 }
 
 static inline uint16_t
@@ -769,10 +769,6 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
 
-		/* Store the mbufs to rx_pkts */
-		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
-		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
-
 		if (flags & NIX_RX_MULTI_SEG_F) {
 			/* Multi segment is enable build mseg list for
 			 * individual mbufs in scalar mode.
@@ -797,6 +793,10 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			mbuf3->next = NULL;
 		}
 
+		/* Store the mbufs to rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
+		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v4] net/cnxk: avoid command copy from Tx queue
  2022-02-10 10:13 ` [PATCH v3] " pbhagavatula
  2022-02-10 10:19   ` Jerin Jacob
@ 2022-02-10 13:15   ` pbhagavatula
  2022-02-11 10:27     ` Jerin Jacob
  1 sibling, 1 reply; 16+ messages in thread
From: pbhagavatula @ 2022-02-10 13:15 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Ankur Dwivedi, Anoob Joseph, Tejasree Kondoj,
	Pavan Nikhilesh, Shijith Thotton
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

The Tx command is prepared based on the offloads enabled and stored in
the Tx queue structure at the tx_queue_setup phase.
In the fast path the command is copied from the Tx queue to the LMT
line for every packet.
Since the command contents are mostly constants, we can move the
command preparation to the fast path and avoid accessing Tx queue
memory.
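
In outline, the fast-path skeleton build replaces the template copy
with constant stores, as in the hedged sketch below (a simplified
standalone form of cn10k_nix_tx_skeleton(); NIX_SUBDC_* and BIT_ULL
are assumed to come from the driver's roc headers):

    #include <stdint.h>
    #include <stdbool.h>
    #include "roc_api.h" /* assumed: NIX_SUBDC_EXT, NIX_SUBDC_SG, BIT_ULL */

    /* Rebuild the constant descriptor words per burst; only send_hdr_w0
     * is still read from the Tx queue structure.
     */
    static inline void
    build_tx_skeleton(uint64_t send_hdr_w0, uint64_t *cmd, bool has_ext,
                      bool has_tstamp)
    {
            cmd[0] = send_hdr_w0; /* SQ, sizem1, ... */
            cmd[1] = 0;
            if (has_ext) {
                    cmd[2] = ((uint64_t)NIX_SUBDC_EXT << 60) |
                             (has_tstamp ? BIT_ULL(15) : 0);
                    cmd[3] = 0;
                    cmd[4] = ((uint64_t)NIX_SUBDC_SG << 60) | BIT_ULL(48);
            } else {
                    cmd[2] = ((uint64_t)NIX_SUBDC_SG << 60) | BIT_ULL(48);
            }
    }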

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v4 Changes:
 - Further refactor large functions.
 v3 Changes:
 - Rebase.
 - Split patches.
 - Refactoring large function.
 v2 Changes:
 - Rebase.
 - Fix incorrect use of RoC API

 drivers/common/cnxk/roc_io.h             |  33 ++++-
 drivers/common/cnxk/roc_io_generic.h     |  15 ++
 drivers/crypto/cnxk/cn9k_cryptodev_ops.c |   2 +-
 drivers/crypto/cnxk/cn9k_ipsec.c         |   2 +-
 drivers/event/cnxk/cn10k_eventdev.c      |  26 +++-
 drivers/event/cnxk/cn10k_worker.h        |  89 ++++++------
 drivers/event/cnxk/cn9k_eventdev.c       |  33 +++--
 drivers/event/cnxk/cn9k_worker.h         |  64 ++++----
 drivers/event/cnxk/cnxk_eventdev.h       |  13 +-
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 178 +++++++++++++++++++++--
 drivers/net/cnxk/cn10k_ethdev.c          |  24 +--
 drivers/net/cnxk/cn10k_ethdev.h          |   3 +-
 drivers/net/cnxk/cn10k_tx.h              | 167 ++++++++++-----------
 drivers/net/cnxk/cn9k_ethdev.c           |  36 +----
 drivers/net/cnxk/cn9k_ethdev.h           |   3 +-
 drivers/net/cnxk/cn9k_tx.h               | 135 +++++++++++------
 16 files changed, 516 insertions(+), 307 deletions(-)

diff --git a/drivers/common/cnxk/roc_io.h b/drivers/common/cnxk/roc_io.h
index 4f15503c29..62e98d9d00 100644
--- a/drivers/common/cnxk/roc_io.h
+++ b/drivers/common/cnxk/roc_io.h
@@ -164,13 +164,36 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
 	dst128[1] = src128[1];
 	/* lmtext receives following value:
 	 * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
-	 * 2: NIX_SUBDC_EXT + NIX_SUBDC_MEM i.e. tstamp case
 	 */
-	if (lmtext) {
+	if (lmtext)
+		dst128[2] = src128[2];
+}
+
+static __plt_always_inline void
+roc_lmt_mov64(void *out, const void *in)
+{
+	volatile const __uint128_t *src128 = (const __uint128_t *)in;
+	volatile __uint128_t *dst128 = (__uint128_t *)out;
+
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+	dst128[3] = src128[3];
+}
+
+static __plt_always_inline void
+roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
+{
+	const __uint128_t *src128 = (const __uint128_t *)in;
+	__uint128_t *dst128 = (__uint128_t *)out;
+
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	/* lmtext receives following value:
+	 * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
+	 */
+	if (lmtext)
 		dst128[2] = src128[2];
-		if (lmtext > 1)
-			dst128[3] = src128[3];
-	}
 }

 static __plt_always_inline void
diff --git a/drivers/common/cnxk/roc_io_generic.h b/drivers/common/cnxk/roc_io_generic.h
index 5f90835c09..42764455cc 100644
--- a/drivers/common/cnxk/roc_io_generic.h
+++ b/drivers/common/cnxk/roc_io_generic.h
@@ -106,6 +106,21 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
 	memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
 }

+static __plt_always_inline void
+roc_lmt_mov64(void *out, const void *in)
+{
+	PLT_SET_USED(out);
+	PLT_SET_USED(in);
+}
+
+static __plt_always_inline void
+roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
+{
+	PLT_SET_USED(in);
+	PLT_SET_USED(lmtext);
+	memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
+}
+
 static __plt_always_inline void
 roc_lmt_mov_seg(void *out, const void *in, const uint16_t segdw)
 {
diff --git a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
index ac1953b66d..ddba9d5dd0 100644
--- a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
+++ b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
@@ -161,7 +161,7 @@ cn9k_cpt_inst_submit(struct cpt_inst_s *inst, uint64_t lmtline,

 	do {
 		/* Copy CPT command to LMTLINE */
-		roc_lmt_mov((void *)lmtline, inst, 2);
+		roc_lmt_mov64((void *)lmtline, inst);

 		/*
 		 * Make sure compiler does not reorder memcpy and ldeor.
diff --git a/drivers/crypto/cnxk/cn9k_ipsec.c b/drivers/crypto/cnxk/cn9k_ipsec.c
index 9f876f75f2..672b65a5d2 100644
--- a/drivers/crypto/cnxk/cn9k_ipsec.c
+++ b/drivers/crypto/cnxk/cn9k_ipsec.c
@@ -53,7 +53,7 @@ cn9k_cpt_enq_sa_write(struct cn9k_ipsec_sa *sa, struct cnxk_cpt_qp *qp,

 	do {
 		/* Copy CPT command to LMTLINE */
-		roc_lmt_mov((void *)lmtline, &inst, 2);
+		roc_lmt_mov64((void *)lmtline, &inst);
 		lmt_status = roc_lmt_submit_ldeor(io_addr);
 	} while (lmt_status == 0);

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 7b7ce44c74..97a88feb13 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -50,7 +50,6 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
 	/* First cache line is reserved for cookie */
 	ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
 	ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
-	ws->tx_base = ws->base;
 	ws->hws_id = port_id;
 	ws->swtag_req = 0;
 	ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
@@ -259,15 +258,13 @@ cn10k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 			ws_cookie,
 			sizeof(struct cnxk_sso_hws_cookie) +
 				sizeof(struct cn10k_sso_hws) +
-				(sizeof(uint64_t) * (dev->max_port_id + 1) *
-				 RTE_MAX_QUEUES_PER_PORT),
+				dev->tx_adptr_data_sz,
 			RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 		if (ws_cookie == NULL)
 			return -ENOMEM;
 		ws = RTE_PTR_ADD(ws_cookie, sizeof(struct cnxk_sso_hws_cookie));
 		memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
-		       sizeof(uint64_t) * (dev->max_port_id + 1) *
-			       RTE_MAX_QUEUES_PER_PORT);
+		       dev->tx_adptr_data_sz);
 		event_dev->data->ports[i] = ws;
 	}

@@ -721,16 +718,35 @@ cn10k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
 			       const struct rte_eth_dev *eth_dev,
 			       int32_t tx_queue_id)
 {
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint64_t tx_offloads;
 	int rc;

 	RTE_SET_USED(id);
 	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
 	if (rc < 0)
 		return rc;
+
+	/* Can't enable tstamp if all the ports don't have it enabled. */
+	tx_offloads = cnxk_eth_dev->tx_offload_flags;
+	if (dev->tx_adptr_configured) {
+		uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+		uint8_t tstmp_ena =
+			!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+
+		if (tstmp_ena && !tstmp_req)
+			dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+		else if (!tstmp_ena && tstmp_req)
+			tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+	}
+
+	dev->tx_offloads |= tx_offloads;
 	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
 	if (rc < 0)
 		return rc;
 	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+	dev->tx_adptr_configured = 1;

 	return 0;
 }
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 4019c13bd2..ff08b2d974 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -455,18 +455,18 @@ NIX_RX_FASTPATH_MODES
 	}

 static __rte_always_inline struct cn10k_eth_txq *
-cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
-			  const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
 {
-	return (struct cn10k_eth_txq *)
-		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+	return (struct cn10k_eth_txq
+			*)(txq_data[(txq_data[m->port] >> 48) +
+				    rte_event_eth_tx_adapter_txq_get(m)] &
+			   (BIT_ULL(48) - 1));
 }

 static __rte_always_inline void
-cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
-		 uintptr_t lmt_addr, uint8_t sched_type, uintptr_t base,
-		 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-		 const uint32_t flags)
+cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
+		 uint16_t lmt_id, uintptr_t lmt_addr, uint8_t sched_type,
+		 const uint64_t *txq_data, const uint32_t flags)
 {
 	uint8_t lnum = 0, loff = 0, shft = 0;
 	struct cn10k_eth_txq *txq;
@@ -476,7 +476,7 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
 	bool sec;

 	txq = cn10k_sso_hws_xtract_meta(m, txq_data);
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	cn10k_nix_tx_skeleton(txq, cmd, flags, 0);
 	/* Perform header writes before barrier
 	 * for TSO
 	 */
@@ -501,23 +501,23 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
 	else
 		segdw = cn10k_nix_tx_ext_subs(flags) + 2;

+	cn10k_nix_xmit_prepare_tstamp(txq, laddr, m->ol_flags, segdw, flags);
 	if (flags & NIX_TX_OFFLOAD_SECURITY_F && sec)
 		pa = txq->cpt_io_addr | 3 << 4;
 	else
 		pa = txq->io_addr | ((segdw - 1) << 4);

 	if (!sched_type)
-		roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+		roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);

 	roc_lmt_submit_steorl(lmt_id, pa);
 }

 static __rte_always_inline void
-cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
-			uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
-			uint8_t sched_type, uintptr_t base,
-			const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-			const uint32_t flags)
+cn10k_sso_vwqe_split_tx(struct cn10k_sso_hws *ws, struct rte_mbuf **mbufs,
+			uint16_t nb_mbufs, uint64_t *cmd, uint16_t lmt_id,
+			uintptr_t lmt_addr, uint8_t sched_type,
+			const uint64_t *txq_data, const uint32_t flags)
 {
 	uint16_t port[4], queue[4];
 	uint16_t i, j, pkts, scalar;
@@ -540,14 +540,16 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
 		if (((port[0] ^ port[1]) & (port[2] ^ port[3])) ||
 		    ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) {
 			for (j = 0; j < 4; j++)
-				cn10k_sso_tx_one(mbufs[i + j], cmd, lmt_id,
-						 lmt_addr, sched_type, base,
-						 txq_data, flags);
+				cn10k_sso_tx_one(ws, mbufs[i + j], cmd, lmt_id,
+						 lmt_addr, sched_type, txq_data,
+						 flags);
 		} else {
-			txq = (struct cn10k_eth_txq *)
-				txq_data[port[0]][queue[0]];
-			cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd,
-						   base + SSOW_LF_GWS_TAG,
+			txq = (struct cn10k_eth_txq
+				       *)(txq_data[(txq_data[port[0]] >> 48) +
+						   queue[0]] &
+					  (BIT_ULL(48) - 1));
+			cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws,
+						   &mbufs[i], 4, cmd,
 						   flags | NIX_TX_VWQE_F);
 		}
 	}
@@ -555,15 +557,14 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
 	mbufs += i;

 	for (i = 0; i < scalar; i++) {
-		cn10k_sso_tx_one(mbufs[i], cmd, lmt_id, lmt_addr, sched_type,
-				 base, txq_data, flags);
+		cn10k_sso_tx_one(ws, mbufs[i], cmd, lmt_id, lmt_addr,
+				 sched_type, txq_data, flags);
 	}
 }

 static __rte_always_inline uint16_t
 cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
-		       uint64_t *cmd,
-		       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		       uint64_t *cmd, const uint64_t *txq_data,
 		       const uint32_t flags)
 {
 	struct cn10k_eth_txq *txq;
@@ -580,17 +581,19 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		uint64_t meta = *(uint64_t *)ev->vec;

 		if (meta & BIT(31)) {
-			txq = (struct cn10k_eth_txq *)
-				txq_data[meta >> 32][meta >> 48];
-
-			cn10k_nix_xmit_pkts_vector(
-				txq, mbufs, meta & 0xFFFF, cmd,
-				ws->tx_base + SSOW_LF_GWS_TAG,
-				flags | NIX_TX_VWQE_F);
+			txq = (struct cn10k_eth_txq
+				       *)(txq_data[(txq_data[meta >> 32] >>
+						    48) +
+						   (meta >> 48)] &
+					  (BIT_ULL(48) - 1));
+
+			cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws, mbufs,
+						   meta & 0xFFFF, cmd,
+						   flags | NIX_TX_VWQE_F);
 		} else {
 			cn10k_sso_vwqe_split_tx(
-				mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
-				ev->sched_type, ws->tx_base, txq_data, flags);
+				ws, mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
+				ev->sched_type, txq_data, flags);
 		}
 		rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
 		return (meta & 0xFFFF);
@@ -598,16 +601,16 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,

 	m = ev->mbuf;
 	ref_cnt = m->refcnt;
-	cn10k_sso_tx_one(m, cmd, lmt_id, lmt_addr, ev->sched_type, ws->tx_base,
-			 txq_data, flags);
+	cn10k_sso_tx_one(ws, m, cmd, lmt_id, lmt_addr, ev->sched_type, txq_data,
+			 flags);

 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 		if (ref_cnt > 1)
 			return 1;
 	}

-	cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
-				 ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+	cnxk_sso_hws_swtag_flush(ws->base + SSOW_LF_GWS_TAG,
+				 ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
 	return 1;
 }

@@ -628,9 +631,7 @@ NIX_TX_FASTPATH_MODES
 		uint64_t cmd[sz];                                              \
 		RTE_SET_USED(nb_events);                                       \
 		return cn10k_sso_hws_event_tx(                                 \
-			ws, &ev[0], cmd,                                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
+			ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
 			flags);                                                \
 	}

@@ -642,9 +643,7 @@ NIX_TX_FASTPATH_MODES
 		struct cn10k_sso_hws *ws = port;                               \
 		RTE_SET_USED(nb_events);                                       \
 		return cn10k_sso_hws_event_tx(                                 \
-			ws, &ev[0], cmd,                                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
+			ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
 			(flags) | NIX_TX_MULTI_SEG_F);                         \
 	}

diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 4611936b7f..f8652d4fbc 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -259,17 +259,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 				ws_cookie,
 				sizeof(struct cnxk_sso_hws_cookie) +
 					sizeof(struct cn9k_sso_hws_dual) +
-					(sizeof(uint64_t) *
-					 (dev->max_port_id + 1) *
-					 RTE_MAX_QUEUES_PER_PORT),
+					dev->tx_adptr_data_sz,
 				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 			if (ws_cookie == NULL)
 				return -ENOMEM;
 			dws = RTE_PTR_ADD(ws_cookie,
 					  sizeof(struct cnxk_sso_hws_cookie));
 			memcpy(&dws->tx_adptr_data, dev->tx_adptr_data,
-			       sizeof(uint64_t) * (dev->max_port_id + 1) *
-				       RTE_MAX_QUEUES_PER_PORT);
+			       dev->tx_adptr_data_sz);
 			event_dev->data->ports[i] = dws;
 		} else {
 			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
@@ -280,17 +277,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
 				ws_cookie,
 				sizeof(struct cnxk_sso_hws_cookie) +
 					sizeof(struct cn9k_sso_hws_dual) +
-					(sizeof(uint64_t) *
-					 (dev->max_port_id + 1) *
-					 RTE_MAX_QUEUES_PER_PORT),
+					dev->tx_adptr_data_sz,
 				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
 			if (ws_cookie == NULL)
 				return -ENOMEM;
 			ws = RTE_PTR_ADD(ws_cookie,
 					 sizeof(struct cnxk_sso_hws_cookie));
 			memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
-			       sizeof(uint64_t) * (dev->max_port_id + 1) *
-				       RTE_MAX_QUEUES_PER_PORT);
+			       dev->tx_adptr_data_sz);
 			event_dev->data->ports[i] = ws;
 		}
 	}
@@ -987,17 +981,36 @@ cn9k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
 			      const struct rte_eth_dev *eth_dev,
 			      int32_t tx_queue_id)
 {
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint64_t tx_offloads;
 	int rc;

 	RTE_SET_USED(id);
 	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
 	if (rc < 0)
 		return rc;
+
+	/* Can't enable tstamp if all the ports don't have it enabled. */
+	tx_offloads = cnxk_eth_dev->tx_offload_flags;
+	if (dev->tx_adptr_configured) {
+		uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+		uint8_t tstmp_ena =
+			!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
+
+		if (tstmp_ena && !tstmp_req)
+			dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+		else if (!tstmp_ena && tstmp_req)
+			tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
+	}
+
+	dev->tx_offloads |= tx_offloads;
 	cn9k_sso_txq_fc_update(eth_dev, tx_queue_id, true);
 	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
 	if (rc < 0)
 		return rc;
 	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+	dev->tx_adptr_configured = 1;

 	return 0;
 }
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index c99e459c1b..303b04c215 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -599,20 +599,13 @@ cn9k_sso_txq_fc_wait(const struct cn9k_eth_txq *txq)
 		;
 }

-static __rte_always_inline const struct cn9k_eth_txq *
-cn9k_sso_hws_xtract_meta(struct rte_mbuf *m,
-			 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+static __rte_always_inline struct cn9k_eth_txq *
+cn9k_sso_hws_xtract_meta(struct rte_mbuf *m, uint64_t *txq_data)
 {
-	return (const struct cn9k_eth_txq *)
-		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
-}
-
-static __rte_always_inline void
-cn9k_sso_hws_prepare_pkt(const struct cn9k_eth_txq *txq, struct rte_mbuf *m,
-			 uint64_t *cmd, const uint32_t flags)
-{
-	roc_lmt_mov(cmd, txq->cmd, cn9k_nix_tx_ext_subs(flags));
-	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
+	return (struct cn9k_eth_txq
+			*)(txq_data[(txq_data[m->port] >> 48) +
+				    rte_event_eth_tx_adapter_txq_get(m)] &
+			   (BIT_ULL(48) - 1));
 }

 #if defined(RTE_ARCH_ARM64)
@@ -669,7 +662,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
 	nixtx += BIT_ULL(7);
 	nixtx = (nixtx - 1) & ~(BIT_ULL(7) - 1);

-	roc_lmt_mov((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));
+	roc_lmt_mov_nv((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));

 	/* Load opcode and cptr already prepared at pkt metadata set */
 	pkt_len -= l2_len;
@@ -756,12 +749,11 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,

 static __rte_always_inline uint16_t
 cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
-		      const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
-		      const uint32_t flags)
+		      uint64_t *txq_data, const uint32_t flags)
 {
 	struct rte_mbuf *m = ev->mbuf;
-	const struct cn9k_eth_txq *txq;
 	uint16_t ref_cnt = m->refcnt;
+	struct cn9k_eth_txq *txq;

 	/* Perform header writes before barrier for TSO */
 	cn9k_nix_xmit_prepare_tso(m, flags);
@@ -774,7 +766,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	    !(flags & NIX_TX_OFFLOAD_SECURITY_F))
 		rte_io_wmb();
 	txq = cn9k_sso_hws_xtract_meta(m, txq_data);
-	cn9k_sso_hws_prepare_pkt(txq, m, cmd, flags);
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
+	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);

 	if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
 		uint64_t ol_flags = m->ol_flags;
@@ -796,6 +789,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,

 	if (flags & NIX_TX_MULTI_SEG_F) {
 		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, segdw,
+					     flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
 			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
@@ -808,6 +803,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 					       segdw);
 		}
 	} else {
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
 			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
@@ -853,11 +849,9 @@ NIX_TX_FASTPATH_MODES
 		struct cn9k_sso_hws *ws = port;                                \
 		uint64_t cmd[sz];                                              \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base, &ev[0], cmd,                                 \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			flags);                                                \
+		return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     flags);                           \
 	}

 #define SSO_TX_SEG(fn, sz, flags)                                              \
@@ -867,11 +861,9 @@ NIX_TX_FASTPATH_MODES
 		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
 		struct cn9k_sso_hws *ws = port;                                \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base, &ev[0], cmd,                                 \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			(flags) | NIX_TX_MULTI_SEG_F);                         \
+		return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     (flags) | NIX_TX_MULTI_SEG_F);    \
 	}

 #define SSO_DUAL_TX(fn, sz, flags)                                             \
@@ -881,11 +873,9 @@ NIX_TX_FASTPATH_MODES
 		struct cn9k_sso_hws_dual *ws = port;                           \
 		uint64_t cmd[sz];                                              \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base[!ws->vws], &ev[0], cmd,                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			flags);                                                \
+		return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     flags);                           \
 	}

 #define SSO_DUAL_TX_SEG(fn, sz, flags)                                         \
@@ -895,11 +885,9 @@ NIX_TX_FASTPATH_MODES
 		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
 		struct cn9k_sso_hws_dual *ws = port;                           \
 		RTE_SET_USED(nb_events);                                       \
-		return cn9k_sso_hws_event_tx(                                  \
-			ws->base[!ws->vws], &ev[0], cmd,                       \
-			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
-				ws->tx_adptr_data,                             \
-			(flags) | NIX_TX_MULTI_SEG_F);                         \
+		return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
+					     (uint64_t *)ws->tx_adptr_data,    \
+					     (flags) | NIX_TX_MULTI_SEG_F);    \
 	}

 #endif
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 4652b58a84..b26df58588 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -99,7 +99,10 @@ struct cnxk_sso_evdev {
 	uint16_t rx_adptr_pool_cnt;
 	uint64_t *rx_adptr_pools;
 	uint64_t *tx_adptr_data;
+	size_t tx_adptr_data_sz;
 	uint16_t max_port_id;
+	uint16_t max_queue_id[RTE_MAX_ETHPORTS];
+	uint8_t tx_adptr_configured;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -131,8 +134,8 @@ struct cn10k_sso_hws {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint64_t tx_base __rte_cache_aligned;
-	uintptr_t lmt_base;
+	uintptr_t lmt_base __rte_cache_aligned;
+	uint64_t lso_tun_fmt;
 	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

@@ -149,7 +152,8 @@ struct cn9k_sso_hws {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint8_t tx_adptr_data[] __rte_cache_aligned;
+	uint64_t lso_tun_fmt __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

 struct cn9k_sso_hws_dual {
@@ -165,7 +169,8 @@ struct cn9k_sso_hws_dual {
 	uint64_t *fc_mem;
 	uintptr_t grp_base;
 	/* Tx Fastpath data */
-	uint8_t tx_adptr_data[] __rte_cache_aligned;
+	uint64_t lso_tun_fmt __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;

 struct cnxk_sso_hws_cookie {
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index fdcd68ca63..5ebd3340e7 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -339,30 +339,179 @@ cnxk_sso_sqb_aura_limit_edit(struct roc_nix_sq *sq, uint16_t nb_sqb_bufs)
 		sq->aura_handle, RTE_MIN(nb_sqb_bufs, sq->aura_sqb_bufs));
 }

+static void
+cnxk_sso_tx_queue_data_init(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
+			    uint16_t eth_port_id, uint16_t tx_queue_id)
+{
+	uint64_t offset = 0;
+	int i;
+
+	dev->max_queue_id[0] = RTE_MAX(dev->max_queue_id[0], eth_port_id);
+	for (i = 1; i < eth_port_id; i++) {
+		offset += (dev->max_queue_id[i - 1] + 1);
+		txq_data[i] |= offset << 48;
+	}
+	dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
+	dev->max_queue_id[eth_port_id] =
+		RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
+}
+
+static void
+cnxk_sso_tx_queue_data_cpy(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
+			   uint64_t *otxq_data, uint16_t eth_port_id)
+{
+	uint64_t offset = 0;
+	int i, j;
+
+	for (i = 1; i < eth_port_id; i++) {
+		offset += (dev->max_queue_id[i - 1] + 1);
+		txq_data[i] |= offset << 48;
+		for (j = 0;
+		     (i < dev->max_port_id) && (j < dev->max_queue_id[i] + 1);
+		     j++)
+			txq_data[offset + j] =
+				otxq_data[(otxq_data[i] >> 48) + j];
+	}
+}
+
+static void
+cnxk_sso_tx_queue_data_cpy_max(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
+			       uint64_t *otxq_data, uint16_t eth_port_id,
+			       uint16_t max_port_id, uint16_t max_queue_id)
+{
+	uint64_t offset = 0;
+	int i, j;
+
+	for (i = 1; i < max_port_id + 1; i++) {
+		offset += (dev->max_queue_id[i - 1] + 1);
+		txq_data[i] |= offset << 48;
+		for (j = 0; j < dev->max_queue_id[i] + 1; j++) {
+			if (i == eth_port_id && j > max_queue_id)
+				continue;
+			txq_data[offset + j] =
+				otxq_data[(otxq_data[i] >> 48) + j];
+		}
+	}
+}
+
+static void
+cnxk_sso_tx_queue_data_rewrite(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
+			       uint16_t eth_port_id, uint16_t tx_queue_id,
+			       uint64_t *otxq_data, uint16_t max_port_id,
+			       uint16_t max_queue_id)
+{
+	int i;
+
+	for (i = 0; i < dev->max_queue_id[0] + 1; i++)
+		txq_data[i] |= (otxq_data[i] & ~((BIT_ULL(16) - 1) << 48));
+
+	if (eth_port_id > max_port_id) {
+		dev->max_queue_id[0] =
+			RTE_MAX(dev->max_queue_id[0], eth_port_id);
+		dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
+
+		cnxk_sso_tx_queue_data_cpy(dev, txq_data, otxq_data,
+					   eth_port_id);
+		dev->max_queue_id[eth_port_id] =
+			RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
+	} else if (tx_queue_id > max_queue_id) {
+		dev->max_queue_id[eth_port_id] =
+			RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
+		dev->max_port_id = RTE_MAX(max_port_id, eth_port_id);
+		cnxk_sso_tx_queue_data_cpy_max(dev, txq_data, otxq_data,
+					       eth_port_id, max_port_id,
+					       max_queue_id);
+	}
+}
+
+static void
+cnxk_sso_tx_queue_data_sz(struct cnxk_sso_evdev *dev, uint16_t eth_port_id,
+			  uint16_t tx_queue_id, uint16_t max_port_id,
+			  uint16_t max_queue_id, uint64_t *r, size_t *sz)
+{
+	uint64_t row = 0;
+	size_t size = 0;
+	int i;
+
+	if (dev->tx_adptr_data == NULL) {
+		size = (eth_port_id + 1);
+		size += (eth_port_id + tx_queue_id);
+		row = 2 * eth_port_id;
+		*r = row;
+		*sz = size;
+		return;
+	}
+
+	if (eth_port_id > max_port_id) {
+		size = (RTE_MAX(eth_port_id, dev->max_queue_id[0]) + 1);
+		for (i = 1; i < eth_port_id; i++)
+			size += (dev->max_queue_id[i] + 1);
+		row = size;
+		size += (tx_queue_id + 1);
+	} else if (tx_queue_id > max_queue_id) {
+		size = !eth_port_id ?
+			       tx_queue_id + 1 :
+				     RTE_MAX(max_port_id, dev->max_queue_id[0]) + 1;
+		for (i = 1; i < max_port_id + 1; i++) {
+			if (i == eth_port_id) {
+				row = size;
+				size += tx_queue_id + 1;
+			} else {
+				size += dev->max_queue_id[i] + 1;
+			}
+		}
+	}
+	*r = row;
+	*sz = size;
+}
+
 static int
 cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
 			    uint16_t eth_port_id, uint16_t tx_queue_id,
 			    void *txq)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t max_queue_id = dev->max_queue_id[eth_port_id];
 	uint16_t max_port_id = dev->max_port_id;
-	uint64_t *txq_data = dev->tx_adptr_data;
-
-	if (txq_data == NULL || eth_port_id > max_port_id) {
-		max_port_id = RTE_MAX(max_port_id, eth_port_id);
-		txq_data = rte_realloc_socket(
-			txq_data,
-			(sizeof(uint64_t) * (max_port_id + 1) *
-			 RTE_MAX_QUEUES_PER_PORT),
-			RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
+	uint64_t *txq_data = NULL;
+	uint64_t row = 0;
+	size_t size = 0;
+
+	if (((uint64_t)txq) & 0xFFFF000000000000)
+		return -EINVAL;
+
+	cnxk_sso_tx_queue_data_sz(dev, eth_port_id, tx_queue_id, max_port_id,
+				  max_queue_id, &row, &size);
+
+	size *= sizeof(uint64_t);
+
+	if (size) {
+		uint64_t *otxq_data = dev->tx_adptr_data;
+
+		txq_data = malloc(size);
 		if (txq_data == NULL)
 			return -ENOMEM;
+		memset(txq_data, 0, size);
+		txq_data[eth_port_id] = ((uint64_t)row) << 48;
+		txq_data[row + tx_queue_id] = (uint64_t)txq;
+
+		if (otxq_data != NULL)
+			cnxk_sso_tx_queue_data_rewrite(
+				dev, txq_data, eth_port_id, tx_queue_id,
+				otxq_data, max_port_id, max_queue_id);
+		else
+			cnxk_sso_tx_queue_data_init(dev, txq_data, eth_port_id,
+						    tx_queue_id);
+		dev->tx_adptr_data_sz = size;
+		free(otxq_data);
+		dev->tx_adptr_data = txq_data;
+	} else {
+		txq_data = dev->tx_adptr_data;
+		row = txq_data[eth_port_id] >> 48;
+		txq_data[row + tx_queue_id] &= ~(BIT_ULL(48) - 1);
+		txq_data[row + tx_queue_id] |= (uint64_t)txq;
 	}

-	((uint64_t(*)[RTE_MAX_QUEUES_PER_PORT])
-		 txq_data)[eth_port_id][tx_queue_id] = (uint64_t)txq;
-	dev->max_port_id = max_port_id;
-	dev->tx_adptr_data = txq_data;
 	return 0;
 }

@@ -372,7 +521,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
 			      int32_t tx_queue_id)
 {
 	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
-	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
 	struct roc_nix_sq *sq;
 	int i, ret;
 	void *txq;
@@ -388,8 +536,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
 			event_dev, eth_dev->data->port_id, tx_queue_id, txq);
 		if (ret < 0)
 			return ret;
-
-		dev->tx_offloads |= cnxk_eth_dev->tx_offload_flags;
 	}

 	return 0;
diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
index 8378cbffc2..9bb08e1824 100644
--- a/drivers/net/cnxk/cn10k_ethdev.c
+++ b/drivers/net/cnxk/cn10k_ethdev.c
@@ -131,53 +131,31 @@ static void
 nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn10k_eth_txq *txq,
 		      uint16_t qid)
 {
-	struct nix_send_ext_s *send_hdr_ext;
 	union nix_send_hdr_w0_u send_hdr_w0;
-	struct nix_send_mem_s *send_mem;
-	union nix_send_sg_s sg_w0;
-
-	RTE_SET_USED(dev);

 	/* Initialize the fields based on basic single segment packet */
-	memset(&txq->cmd, 0, sizeof(txq->cmd));
 	send_hdr_w0.u = 0;
-	sg_w0.u = 0;
-
 	if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
 		/* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
 		send_hdr_w0.sizem1 = 2;
-
-		send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[0];
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
 		if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 			/* Default: one seg packet would have:
 			 * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
 			 * => 8/2 - 1 = 3
 			 */
 			send_hdr_w0.sizem1 = 3;
-			send_hdr_ext->w0.tstmp = 1;

 			/* To calculate the offset for send_mem,
 			 * send_hdr->w0.sizem1 * 2
 			 */
-			send_mem = (struct nix_send_mem_s *)(txq->cmd + 2);
-			send_mem->w0.subdc = NIX_SUBDC_MEM;
-			send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP;
-			send_mem->addr = dev->tstamp.tx_tstamp_iova;
+			txq->ts_mem = dev->tstamp.tx_tstamp_iova;
 		}
 	} else {
 		/* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
 		send_hdr_w0.sizem1 = 1;
 	}
-
 	send_hdr_w0.sq = qid;
-	sg_w0.subdc = NIX_SUBDC_SG;
-	sg_w0.segs = 1;
-	sg_w0.ld_type = NIX_SENDLDTYPE_LDD;
-
 	txq->send_hdr_w0 = send_hdr_w0.u;
-	txq->sg_w0 = sg_w0.u;
-
 	rte_wmb();
 }

diff --git a/drivers/net/cnxk/cn10k_ethdev.h b/drivers/net/cnxk/cn10k_ethdev.h
index 0982158c62..ec40e53152 100644
--- a/drivers/net/cnxk/cn10k_ethdev.h
+++ b/drivers/net/cnxk/cn10k_ethdev.h
@@ -9,7 +9,6 @@

 struct cn10k_eth_txq {
 	uint64_t send_hdr_w0;
-	uint64_t sg_w0;
 	int64_t fc_cache_pkts;
 	uint64_t *fc_mem;
 	uintptr_t lmt_base;
@@ -20,8 +19,8 @@ struct cn10k_eth_txq {
 	uint64_t sa_base;
 	uint64_t *cpt_fc;
 	uint16_t cpt_desc;
-	uint64_t cmd[4];
 	uint64_t lso_tun_fmt;
+	uint64_t ts_mem;
 } __plt_cache_aligned;

 struct cn10k_eth_rxq {
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index fc1f6ceb8c..4ae6bbf517 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -186,23 +186,26 @@ cn10k_cpt_tx_steor_data(void)
 }

 static __rte_always_inline void
-cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
-		      const uint16_t flags)
+cn10k_nix_tx_skeleton(struct cn10k_eth_txq *txq, uint64_t *cmd,
+		      const uint16_t flags, const uint16_t static_sz)
 {
-	/* Send hdr */
-	cmd[0] = txq->send_hdr_w0;
+	if (static_sz)
+		cmd[0] = txq->send_hdr_w0;
+	else
+		cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			 ((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
 	cmd[1] = 0;
-	cmd += 2;

-	/* Send ext if present */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		*(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
-		cmd += 2;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
+			cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
+		else
+			cmd[2] = NIX_SUBDC_EXT << 60;
+		cmd[3] = 0;
+		cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	} else {
+		cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
 	}
-
-	/* Send sg */
-	cmd[0] = txq->sg_w0;
-	cmd[1] = 0;
 }

 static __rte_always_inline void
@@ -718,41 +721,29 @@ cn10k_nix_xmit_mv_lmt_base(uintptr_t lmt_addr, uint64_t *cmd,
 }

 static __rte_always_inline void
-cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
+cn10k_nix_xmit_prepare_tstamp(struct cn10k_eth_txq *txq, uintptr_t lmt_addr,
 			      const uint64_t ol_flags, const uint16_t no_segdw,
 			      const uint16_t flags)
 {
 	if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-		const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
-		struct nix_send_ext_s *send_hdr_ext =
-			(struct nix_send_ext_s *)lmt_addr + 16;
+		const uint8_t is_ol_tstamp =
+			!(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
 		uint64_t *lmt = (uint64_t *)lmt_addr;
 		uint16_t off = (no_segdw - 1) << 1;
 		struct nix_send_mem_s *send_mem;

 		send_mem = (struct nix_send_mem_s *)(lmt + off);
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
-		send_hdr_ext->w0.tstmp = 1;
-		if (flags & NIX_TX_MULTI_SEG_F) {
-			/* Retrieving the default desc values */
-			lmt[off] = cmd[2];
-
-			/* Using compiler barrier to avoid violation of C
-			 * aliasing rules.
-			 */
-			rte_compiler_barrier();
-		}
-
-		/* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
+		/* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
 		 * should not be recorded, hence changing the alg type to
-		 * NIX_SENDMEMALG_SET and also changing send mem addr field to
+		 * NIX_SENDMEMALG_SUB and also changing send mem addr field to
 		 * next 8 bytes as it corrupts the actual Tx tstamp registered
 		 * address.
 		 */
 		send_mem->w0.subdc = NIX_SUBDC_MEM;
-		send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
+		send_mem->w0.alg =
+			NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);
 		send_mem->addr =
-			(rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
+			(rte_iova_t)(((uint64_t *)txq->ts_mem) + is_ol_tstamp);
 	}
 }

@@ -841,8 +832,8 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
-		    uint64_t *cmd, uintptr_t base, const uint16_t flags)
+cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
+		    uint16_t pkts, uint64_t *cmd, const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	const rte_iova_t io_addr = txq->io_addr;
@@ -863,9 +854,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 		/* Reduce the cached count */
 		txq->fc_cache_pkts -= pkts;
 	}
-
 	/* Get cmd skeleton */
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));

 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		lso_tun_fmt = txq->lso_tun_fmt;
@@ -909,14 +899,14 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 		/* Move NIX desc to LMT/NIXTX area */
 		cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
-		cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
-					      tx_pkts[i]->ol_flags, 4, flags);
+		cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
+					      4, flags);
 		if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec)
 			lnum++;
 	}

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -967,9 +957,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
-			 uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			 const uint16_t flags)
+cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
+			 struct rte_mbuf **tx_pkts, uint16_t pkts,
+			 uint64_t *cmd, const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	uintptr_t pa0, pa1, lbase = txq->lmt_base;
@@ -987,12 +977,13 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uintptr_t laddr;
 	bool sec;

-	NIX_XMIT_FC_OR_RETURN(txq, pkts);
-
-	cn10k_nix_tx_skeleton(txq, cmd, flags);
-
-	/* Reduce the cached count */
-	txq->fc_cache_pkts -= pkts;
+	if (!(flags & NIX_TX_VWQE_F)) {
+		NIX_XMIT_FC_OR_RETURN(txq, pkts);
+		/* Reduce the cached count */
+		txq->fc_cache_pkts -= pkts;
+	}
+	/* Get cmd skeleton */
+	cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));

 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		lso_tun_fmt = txq->lso_tun_fmt;
@@ -1038,13 +1029,11 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,

 		/* Move NIX desc to LMT/NIXTX area */
 		cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
-
 		/* Store sg list directly on lmt line */
 		segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)laddr,
 					       flags);
-		cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
-					      tx_pkts[i]->ol_flags, segdw,
-					      flags);
+		cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
+					      segdw, flags);
 		if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec) {
 			lnum++;
 			data128 |= (((__uint128_t)(segdw - 1)) << shft);
@@ -1053,7 +1042,7 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	}

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -1474,9 +1463,9 @@ cn10k_nix_xmit_store(struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
 }

 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			   const uint16_t flags)
+cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
+			   struct rte_mbuf **tx_pkts, uint16_t pkts,
+			   uint64_t *cmd, const uint16_t flags)
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
@@ -1526,25 +1515,42 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
 	}

-	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
+	if (!(flags & NIX_TX_VWQE_F)) {
+		senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
+	} else {
+		uint64_t w0 =
+			(txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
+
+		senddesc01_w0 = vdupq_n_u64(w0);
+	}
 	senddesc23_w0 = senddesc01_w0;
+
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
-	sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
+	sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
 	sgdesc23_w0 = sgdesc01_w0;

-	/* Load command defaults into vector variables. */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
-		sendext23_w0 = sendext01_w0;
-		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
-		sendext23_w1 = sendext01_w1;
 		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
+						   BIT_ULL(15));
+			sendmem01_w0 =
+				vdupq_n_u64((NIX_SUBDC_MEM << 60) |
+					    (NIX_SENDMEMALG_SETTSTMP << 56));
 			sendmem23_w0 = sendmem01_w0;
-			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
+			sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
 			sendmem23_w1 = sendmem01_w1;
+		} else {
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
 		}
+		sendext23_w0 = sendext01_w0;
+
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
+			sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		else
+			sendext01_w1 = vdupq_n_u64(0);
+		sendext23_w1 = sendext01_w1;
 	}

 	/* Get LMT base address and LMT ID as lcore id */
@@ -2577,7 +2583,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		wd.data[0] >>= 16;

 	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(base);
+		roc_sso_hws_head_wait(ws[0]);

 	left -= burst;

@@ -2640,12 +2646,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,

 	if (unlikely(scalar)) {
 		if (flags & NIX_TX_MULTI_SEG_F)
-			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
-							 scalar, cmd, base,
-							 flags);
+			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, ws, tx_pkts,
+							 scalar, cmd, flags);
 		else
-			pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
-						    cmd, base, flags);
+			pkts += cn10k_nix_xmit_pkts(tx_queue, ws, tx_pkts,
+						    scalar, cmd, flags);
 	}

 	return pkts;
@@ -2653,16 +2658,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,

 #else
 static __rte_always_inline uint16_t
-cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
-			   const uint16_t flags)
+cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
+			   struct rte_mbuf **tx_pkts, uint16_t pkts,
+			   uint64_t *cmd, const uint16_t flags)
 {
+	RTE_SET_USED(ws);
 	RTE_SET_USED(tx_queue);
 	RTE_SET_USED(tx_pkts);
 	RTE_SET_USED(pkts);
 	RTE_SET_USED(cmd);
 	RTE_SET_USED(flags);
-	RTE_SET_USED(base);
 	return 0;
 }
 #endif
@@ -2892,7 +2897,7 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, 0,    \
+		return cn10k_nix_xmit_pkts(tx_queue, NULL, tx_pkts, pkts, cmd, \
 					   flags);                             \
 	}

@@ -2905,8 +2910,8 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,  \
-						0,                             \
+		return cn10k_nix_xmit_pkts_mseg(tx_queue, NULL, tx_pkts, pkts, \
+						cmd,                           \
 						flags | NIX_TX_MULTI_SEG_F);   \
 	}

@@ -2919,8 +2924,8 @@ NIX_TX_FASTPATH_MODES
 		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
-		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts,     \
-						  cmd, 0, (flags));            \
+		return cn10k_nix_xmit_pkts_vector(tx_queue, NULL, tx_pkts,     \
+						  pkts, cmd, (flags));         \
 	}

 #define NIX_TX_XMIT_VEC_MSEG(fn, sz, flags)                                    \
@@ -2933,7 +2938,7 @@ NIX_TX_FASTPATH_MODES
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(                             \
-			tx_queue, tx_pkts, pkts, cmd, 0,                       \
+			tx_queue, NULL, tx_pkts, pkts, cmd,                    \
 			(flags) | NIX_TX_MULTI_SEG_F);                         \
 	}

diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
index d34bc6898f..01e3850561 100644
--- a/drivers/net/cnxk/cn9k_ethdev.c
+++ b/drivers/net/cnxk/cn9k_ethdev.c
@@ -131,51 +131,31 @@ static void
 nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn9k_eth_txq *txq,
 		      uint16_t qid)
 {
-	struct nix_send_ext_s *send_hdr_ext;
-	struct nix_send_hdr_s *send_hdr;
-	struct nix_send_mem_s *send_mem;
-	union nix_send_sg_s *sg;
+	union nix_send_hdr_w0_u send_hdr_w0;

 	/* Initialize the fields based on basic single segment packet */
-	memset(&txq->cmd, 0, sizeof(txq->cmd));
-
+	send_hdr_w0.u = 0;
 	if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
-		send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
 		/* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
-		send_hdr->w0.sizem1 = 2;
-
-		send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[2];
-		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
+		send_hdr_w0.sizem1 = 2;
 		if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 			/* Default: one seg packet would have:
 			 * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
 			 * => 8/2 - 1 = 3
 			 */
-			send_hdr->w0.sizem1 = 3;
-			send_hdr_ext->w0.tstmp = 1;
+			send_hdr_w0.sizem1 = 3;

 			/* To calculate the offset for send_mem,
 			 * send_hdr->w0.sizem1 * 2
 			 */
-			send_mem = (struct nix_send_mem_s *)
-				(txq->cmd + (send_hdr->w0.sizem1 << 1));
-			send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
-			send_mem->w0.cn9k.alg = NIX_SENDMEMALG_SETTSTMP;
-			send_mem->addr = dev->tstamp.tx_tstamp_iova;
+			txq->ts_mem = dev->tstamp.tx_tstamp_iova;
 		}
-		sg = (union nix_send_sg_s *)&txq->cmd[4];
 	} else {
-		send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
 		/* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
-		send_hdr->w0.sizem1 = 1;
-		sg = (union nix_send_sg_s *)&txq->cmd[2];
+		send_hdr_w0.sizem1 = 1;
 	}
-
-	send_hdr->w0.sq = qid;
-	sg->subdc = NIX_SUBDC_SG;
-	sg->segs = 1;
-	sg->ld_type = NIX_SENDLDTYPE_LDD;
-
+	send_hdr_w0.sq = qid;
+	txq->send_hdr_w0 = send_hdr_w0.u;
 	rte_wmb();
 }

diff --git a/drivers/net/cnxk/cn9k_ethdev.h b/drivers/net/cnxk/cn9k_ethdev.h
index 2b452fe009..8ab924944c 100644
--- a/drivers/net/cnxk/cn9k_ethdev.h
+++ b/drivers/net/cnxk/cn9k_ethdev.h
@@ -9,12 +9,13 @@
 #include <cnxk_security_ar.h>

 struct cn9k_eth_txq {
-	uint64_t cmd[8];
+	uint64_t send_hdr_w0;
 	int64_t fc_cache_pkts;
 	uint64_t *fc_mem;
 	void *lmt_addr;
 	rte_iova_t io_addr;
 	uint64_t lso_tun_fmt;
+	uint64_t ts_mem;
 	uint16_t sqes_per_sqb_log2;
 	int16_t nb_sqb_bufs_adj;
 	rte_iova_t cpt_io_addr;
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 8564dd85ee..d23e4b61b4 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -58,6 +58,29 @@ cn9k_nix_tx_ext_subs(const uint16_t flags)
 				  : 0);
 }

+static __rte_always_inline void
+cn9k_nix_tx_skeleton(struct cn9k_eth_txq *txq, uint64_t *cmd,
+		     const uint16_t flags, const uint16_t static_sz)
+{
+	if (static_sz)
+		cmd[0] = txq->send_hdr_w0;
+	else
+		cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
+			 ((uint64_t)(cn9k_nix_tx_ext_subs(flags) + 1) << 40);
+	cmd[1] = 0;
+
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
+			cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
+		else
+			cmd[2] = NIX_SUBDC_EXT << 60;
+		cmd[3] = 0;
+		cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	} else {
+		cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
+	}
+}
+
 static __rte_always_inline void
 cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 {
@@ -136,11 +159,11 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 		w1.u = 0;
 	}

-	if (!(flags & NIX_TX_MULTI_SEG_F)) {
+	if (!(flags & NIX_TX_MULTI_SEG_F))
 		send_hdr->w0.total = m->data_len;
-		send_hdr->w0.aura =
-			roc_npa_aura_handle_to_aura(m->pool->pool_id);
-	}
+	else
+		send_hdr->w0.total = m->pkt_len;
+	send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);

 	/*
 	 * L3type:  2 => IPV4
@@ -287,41 +310,39 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 		/* Mark mempool object as "put" since it is freed by NIX */
 		if (!send_hdr->w0.df)
 			RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
+	} else {
+		sg->seg1_size = m->data_len;
+		*(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
+
+		/* NOFF is handled later for multi-seg */
 	}
 }

 static __rte_always_inline void
-cn9k_nix_xmit_prepare_tstamp(uint64_t *cmd, const uint64_t *send_mem_desc,
+cn9k_nix_xmit_prepare_tstamp(struct cn9k_eth_txq *txq, uint64_t *cmd,
 			     const uint64_t ol_flags, const uint16_t no_segdw,
 			     const uint16_t flags)
 {
 	if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 		struct nix_send_mem_s *send_mem;
 		uint16_t off = (no_segdw - 1) << 1;
-		const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
+		const uint8_t is_ol_tstamp =
+			!(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);

 		send_mem = (struct nix_send_mem_s *)(cmd + off);
-		if (flags & NIX_TX_MULTI_SEG_F) {
-			/* Retrieving the default desc values */
-			cmd[off] = send_mem_desc[6];

-			/* Using compiler barrier to avoid violation of C
-			 * aliasing rules.
-			 */
-			rte_compiler_barrier();
-		}
-
-		/* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
+		/* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
 		 * should not be recorded, hence changing the alg type to
-		 * NIX_SENDMEMALG_SET and also changing send mem addr field to
+		 * NIX_SENDMEMALG_SUB and also changing send mem addr field to
 		 * next 8 bytes as it corrupts the actual Tx tstamp registered
 		 * address.
 		 */
+		send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
 		send_mem->w0.cn9k.alg =
-			NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
+			NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);

-		send_mem->addr = (rte_iova_t)((uint64_t *)send_mem_desc[7] +
-					      (is_ol_tstamp));
+		send_mem->addr = (rte_iova_t)(((uint64_t *)txq->ts_mem) +
+				(is_ol_tstamp));
 	}
 }

@@ -367,8 +388,6 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 	uint8_t off, i;

 	send_hdr = (struct nix_send_hdr_s *)cmd;
-	send_hdr->w0.total = m->pkt_len;
-	send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);

 	if (flags & NIX_TX_NEED_EXT_HDR)
 		off = 2;
@@ -376,13 +395,29 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 		off = 0;

 	sg = (union nix_send_sg_s *)&cmd[2 + off];
-	/* Clear sg->u header before use */
-	sg->u &= 0xFC00000000000000;
+
+	/* Start from second segment, first segment is already there */
+	i = 1;
 	sg_u = sg->u;
-	slist = &cmd[3 + off];
+	nb_segs = m->nb_segs - 1;
+	m_next = m->next;
+	slist = &cmd[3 + off + 1];

-	i = 0;
-	nb_segs = m->nb_segs;
+	/* Set invert df if buffer is not to be freed by H/W */
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		rte_io_wmb();
+	}
+
+	/* Mark mempool object as "put" since it is freed by NIX */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	if (!(sg_u & (1ULL << 55)))
+		RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
+	rte_io_wmb();
+#endif
+	m = m_next;
+	if (!m)
+		goto done;

 	/* Fill mbuf segments */
 	do {
@@ -417,6 +452,7 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 		m = m_next;
 	} while (nb_segs);

+done:
 	sg->u = sg_u;
 	sg->segs = i;
 	segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
@@ -472,7 +508,7 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 	NIX_XMIT_FC_OR_RETURN(txq, pkts);

-	roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 1);

 	/* Perform header writes before barrier for TSO */
 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
@@ -490,8 +526,8 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,

 	for (i = 0; i < pkts; i++) {
 		cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
-		cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
-					     tx_pkts[i]->ol_flags, 4, flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags, 4,
+					     flags);
 		cn9k_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
 	}

@@ -514,7 +550,7 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,

 	NIX_XMIT_FC_OR_RETURN(txq, pkts);

-	roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_tx_skeleton(txq, cmd, flags, 1);

 	/* Perform header writes before barrier for TSO */
 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
@@ -533,9 +569,8 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	for (i = 0; i < pkts; i++) {
 		cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
 		segdw = cn9k_nix_prepare_mseg(tx_pkts[i], cmd, flags);
-		cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
-					     tx_pkts[i]->ol_flags, segdw,
-					     flags);
+		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags,
+					     segdw, flags);
 		cn9k_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
 	}

@@ -862,28 +897,34 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
 		rte_io_wmb();

-	senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
+	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
 	senddesc23_w0 = senddesc01_w0;
+
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
+	sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
+	sgdesc23_w0 = sgdesc01_w0;

-	/* Load command defaults into vector variables. */
 	if (flags & NIX_TX_NEED_EXT_HDR) {
-		sendext01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-		sendext23_w0 = sendext01_w0;
-		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
-		sendext23_w1 = sendext01_w1;
-		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
-		sgdesc23_w0 = sgdesc01_w0;
 		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[6]);
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
+						   BIT_ULL(15));
+			sendmem01_w0 =
+				vdupq_n_u64((NIX_SUBDC_MEM << 60) |
+					    (NIX_SENDMEMALG_SETTSTMP << 56));
 			sendmem23_w0 = sendmem01_w0;
-			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[7]);
+			sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
 			sendmem23_w1 = sendmem01_w1;
+		} else {
+			sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
 		}
-	} else {
-		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-		sgdesc23_w0 = sgdesc01_w0;
+		sendext23_w0 = sendext01_w0;
+
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
+			sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		else
+			sendext01_w1 = vdupq_n_u64(0);
+		sendext23_w1 = sendext01_w1;
 	}

 	for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
--
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v4 1/3] event/cnxk: store and reuse workslot status
  2022-02-10 10:19 ` [PATCH v3 1/3] event/cnxk: store and reuse workslot status pbhagavatula
  2022-02-10 10:19   ` [PATCH v3 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
  2022-02-10 10:19   ` [PATCH v3 3/3] net/cnxk: improve Rx performance pbhagavatula
@ 2022-02-10 13:20   ` pbhagavatula
  2022-02-10 13:20     ` [PATCH v4 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
                       ` (2 more replies)
  2 siblings, 3 replies; 16+ messages in thread
From: pbhagavatula @ 2022-02-10 13:20 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Pavan Nikhilesh, Shijith Thotton
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Store and reuse the workslot status for the TT, GRP and HEAD checks
instead of reading it from the GWC every time, since each GWC read
imposes additional latency.
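
As an illustrative sketch of the idea (not the driver code itself; the
struct and helper names below are assumptions for illustration), the last
status word read from the GWC is cached in the workslot structure and the
TT/GRP/HEAD checks reuse it instead of issuing another MMIO load:

	#include <stdint.h>

	/* Hypothetical workslot cache: gw_rdata holds the status word
	 * from the last GWC read and is refreshed only after operations
	 * that can change it (GET_WORK, SWTAG wait, head wait).
	 */
	struct hws_cache {
		uintptr_t base;    /* workslot MMIO base (assumed) */
		uint64_t gw_rdata; /* last TAG/status word read    */
	};

	static inline uint8_t
	hws_cached_tt(const struct hws_cache *ws)
	{
		/* Tag type lives in bits [33:32] of the status word */
		return (ws->gw_rdata >> 32) & 0x3;
	}

	static inline int
	hws_cached_is_head(const struct hws_cache *ws)
	{
		/* HEAD bit 35, as used by CNXK_TAG_IS_HEAD in the patch */
		return !!(ws->gw_rdata & (1ULL << 35));
	}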

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 Depends-on: 21590

 v4 Changes:
 - Update commit title for 3/3

 v3 Changes:
 - Split and rebase patches.

 v2 Changes:
 - Rebase.
 - Fix incorrect use of RoC API

 drivers/common/cnxk/roc_sso.h      | 14 ++++++++------
 drivers/event/cnxk/cn10k_worker.h  | 16 +++++++++-------
 drivers/event/cnxk/cn9k_worker.h   |  6 +++---
 drivers/event/cnxk/cnxk_eventdev.h |  2 ++
 drivers/event/cnxk/cnxk_worker.h   | 11 +++++++----
 drivers/net/cnxk/cn10k_tx.h        | 12 ++++++------
 6 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index 27d49c6c68..ab7cee1c60 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -54,12 +54,13 @@ struct roc_sso {
 	uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned;
 } __plt_cache_aligned;

-static __plt_always_inline void
-roc_sso_hws_head_wait(uintptr_t tag_op)
+static __plt_always_inline uint64_t
+roc_sso_hws_head_wait(uintptr_t base)
 {
-#ifdef RTE_ARCH_ARM64
+	uintptr_t tag_op = base + SSOW_LF_GWS_TAG;
 	uint64_t tag;

+#if defined(__aarch64__)
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_op]]	\n"
 		     "		tbnz %[tag], 35, done%=		\n"
@@ -71,10 +72,11 @@ roc_sso_hws_head_wait(uintptr_t tag_op)
 		     : [tag] "=&r"(tag)
 		     : [tag_op] "r"(tag_op));
 #else
-	/* Wait for the SWTAG/SWTAG_FULL operation */
-	while (!(plt_read64(tag_op) & BIT_ULL(35)))
-		;
+	do {
+		tag = plt_read64(tag_op);
+	} while (!(tag & BIT_ULL(35)));
 #endif
+	return tag;
 }

 /* SSO device initialization */
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index ff08b2d974..ada230ea1d 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -40,8 +40,7 @@ cn10k_sso_hws_fwd_swtag(struct cn10k_sso_hws *ws, const struct rte_event *ev)
 {
 	const uint32_t tag = (uint32_t)ev->event;
 	const uint8_t new_tt = ev->sched_type;
-	const uint8_t cur_tt =
-		CNXK_TT_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0));
+	const uint8_t cur_tt = CNXK_TT_FROM_TAG(ws->gw_rdata);

 	/* CNXK model
 	 * cur_tt/new_tt     SSO_TT_ORDERED SSO_TT_ATOMIC SSO_TT_UNTAGGED
@@ -81,7 +80,7 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
 	const uint8_t grp = ev->queue_id;

 	/* Group hasn't changed, Use SWTAG to forward the event */
-	if (CNXK_GRP_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0)) == grp)
+	if (CNXK_GRP_FROM_TAG(ws->gw_rdata) == grp)
 		cn10k_sso_hws_fwd_swtag(ws, ev);
 	else
 		/*
@@ -211,6 +210,7 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 	} while (gw.u64[0] & BIT_ULL(63));
 	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
+	ws->gw_rdata = gw.u64[0];
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
@@ -405,7 +405,8 @@ NIX_RX_FASTPATH_MODES
 		RTE_SET_USED(timeout_ticks);                                   \
 		if (ws->swtag_req) {                                           \
 			ws->swtag_req = 0;                                     \
-			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			ws->gw_rdata = cnxk_sso_hws_swtag_wait(                \
+				ws->base + SSOW_LF_GWS_WQE0);                  \
 			return 1;                                              \
 		}                                                              \
 		return cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);  \
@@ -424,7 +425,8 @@ NIX_RX_FASTPATH_MODES
 		uint64_t iter;                                                 \
 		if (ws->swtag_req) {                                           \
 			ws->swtag_req = 0;                                     \
-			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			ws->gw_rdata = cnxk_sso_hws_swtag_wait(                \
+				ws->base + SSOW_LF_GWS_WQE0);                  \
 			return ret;                                            \
 		}                                                              \
 		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
@@ -507,8 +509,8 @@ cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	else
 		pa = txq->io_addr | ((segdw - 1) << 4);

-	if (!sched_type)
-		roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);
+	if (!CNXK_TAG_IS_HEAD(ws->gw_rdata) && !sched_type)
+		ws->gw_rdata = roc_sso_hws_head_wait(ws->base);

 	roc_lmt_submit_steorl(lmt_id, pa);
 }
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 303b04c215..8455272005 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -700,7 +700,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,

 	/* Head wait if needed */
 	if (base)
-		roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+		roc_sso_hws_head_wait(base);

 	/* ESN */
 	outb_priv = roc_nix_inl_onf_ipsec_outb_sa_sw_rsvd((void *)sa);
@@ -793,7 +793,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 					     flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
-			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base);
 			cn9k_sso_txq_fc_wait(txq);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
@@ -806,7 +806,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
-			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base);
 			cn9k_sso_txq_fc_wait(txq);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_one(cmd, txq->lmt_addr,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index b26df58588..ab58508590 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -47,6 +47,7 @@
 #define CNXK_CLR_SUB_EVENT(x)	    (~(0xffu << 20) & x)
 #define CNXK_GRP_FROM_TAG(x)	    (((x) >> 36) & 0x3ff)
 #define CNXK_SWTAG_PEND(x)	    (BIT_ULL(62) & x)
+#define CNXK_TAG_IS_HEAD(x)	    (BIT_ULL(35) & x)

 #define CN9K_SSOW_GET_BASE_ADDR(_GW) ((_GW)-SSOW_LF_GWS_OP_GET_WORK0)

@@ -123,6 +124,7 @@ struct cnxk_sso_evdev {

 struct cn10k_sso_hws {
 	uint64_t base;
+	uint64_t gw_rdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
index 9f9ceab8a1..7de03f3fbb 100644
--- a/drivers/event/cnxk/cnxk_worker.h
+++ b/drivers/event/cnxk/cnxk_worker.h
@@ -52,11 +52,11 @@ cnxk_sso_hws_swtag_flush(uint64_t tag_op, uint64_t flush_op)
 	plt_write64(0, flush_op);
 }

-static __rte_always_inline void
+static __rte_always_inline uint64_t
 cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 {
-#ifdef RTE_ARCH_ARM64
 	uint64_t swtp;
+#ifdef RTE_ARCH_ARM64

 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[swtb], [%[swtp_loc]]	\n"
@@ -70,9 +70,12 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 		     : [swtp_loc] "r"(tag_op));
 #else
 	/* Wait for the SWTAG/SWTAG_FULL operation */
-	while (plt_read64(tag_op) & BIT_ULL(62))
-		;
+	do {
+		swtp = plt_read64(tag_op);
+	} while (swtp & BIT_ULL(62));
 #endif
+
+	return swtp;
 }

 #endif
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 4ae6bbf517..ec6366168c 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -905,8 +905,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 			lnum++;
 	}

-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -1041,8 +1041,8 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 		}
 	}

-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);

 	left -= burst;
 	tx_pkts += burst;
@@ -2582,8 +2582,8 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 	if (flags & (NIX_TX_MULTI_SEG_F | NIX_TX_OFFLOAD_SECURITY_F))
 		wd.data[0] >>= 16;

-	if (flags & NIX_TX_VWQE_F)
-		roc_sso_hws_head_wait(ws[0]);
+	if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
+		ws[1] = roc_sso_hws_head_wait(ws[0]);

 	left -= burst;

--
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v4 2/3] event/cnxk: disable default wait time for dequeue
  2022-02-10 13:20   ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status pbhagavatula
@ 2022-02-10 13:20     ` pbhagavatula
  2022-02-10 13:20     ` [PATCH v4 3/3] event/cnxk: improve Rx performance pbhagavatula
  2022-02-14  9:29     ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status Jerin Jacob
  2 siblings, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2022-02-10 13:20 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Setting the WAITW bit enables a default minimum dequeue timeout of 1us.
Avoid this minimum timeout by setting WAITW only when dequeue_timeout
is configured.
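
A minimal sketch of the approach, assuming illustrative names (the real
driver stores the value in gw_wdata at port setup): build the GET_WORK
write data once and set the wait bit only if a dequeue timeout was
requested, so an unconfigured port does not inherit the implicit 1us wait:

	#include <stdint.h>

	/* Hypothetical setup-time helper: request WAITW (bit 16) only
	 * when the application configured a dequeue timeout.
	 */
	static inline uint64_t
	make_gw_wdata(uint64_t deq_tmo_ns)
	{
		uint64_t wdata = 1;          /* use mask set 0 */

		if (deq_tmo_ns)
			wdata |= 1ULL << 16; /* WAITW: wait for work */
		return wdata;
	}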

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_eventdev.c |  8 +++++--
 drivers/event/cnxk/cn9k_eventdev.c  |  9 ++++++-
 drivers/event/cnxk/cn9k_worker.h    | 37 +++++++++++++----------------
 drivers/event/cnxk/cnxk_eventdev.c  |  2 +-
 drivers/event/cnxk/cnxk_eventdev.h  |  2 ++
 5 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 97a88feb13..26d65e3568 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -15,7 +15,10 @@
 static uint32_t
 cn10k_sso_gw_mode_wdata(struct cnxk_sso_evdev *dev)
 {
-	uint32_t wdata = BIT(16) | 1;
+	uint32_t wdata = 1;
+
+	if (dev->deq_tmo_ns)
+		wdata |= BIT(16);
 
 	switch (dev->gw_mode) {
 	case CN10K_GW_MODE_NONE:
@@ -88,7 +91,8 @@ cn10k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
 	ws->xaq_lmt = dev->xaq_lmt;
 
 	/* Set get_work timeout for HWS */
-	val = NSEC2USEC(dev->deq_tmo_ns) - 1;
+	val = NSEC2USEC(dev->deq_tmo_ns);
+	val = val ? val - 1 : 0;
 	plt_write64(val, ws->base + SSOW_LF_GWS_NW_TIM);
 }
 
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index f8652d4fbc..6d3d03c97c 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -72,7 +72,8 @@ cn9k_sso_hws_setup(void *arg, void *hws, uintptr_t grp_base)
 	uint64_t val;
 
 	/* Set get_work tmo for HWS */
-	val = dev->deq_tmo_ns ? NSEC2USEC(dev->deq_tmo_ns) - 1 : 0;
+	val = NSEC2USEC(dev->deq_tmo_ns);
+	val = val ? val - 1 : 0;
 	if (dev->dual_ws) {
 		dws = hws;
 		dws->grp_base = grp_base;
@@ -677,6 +678,9 @@ cn9k_sso_init_hws_mem(void *arg, uint8_t port_id)
 		dws->hws_id = port_id;
 		dws->swtag_req = 0;
 		dws->vws = 0;
+		if (dev->deq_tmo_ns)
+			dws->gw_wdata = BIT_ULL(16);
+		dws->gw_wdata |= 1;
 
 		data = dws;
 	} else {
@@ -695,6 +699,9 @@ cn9k_sso_init_hws_mem(void *arg, uint8_t port_id)
 		ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
 		ws->hws_id = port_id;
 		ws->swtag_req = 0;
+		if (dev->deq_tmo_ns)
+			ws->gw_wdata = BIT_ULL(16);
+		ws->gw_wdata |= 1;
 
 		data = ws;
 	}
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 8455272005..79374b8d95 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -149,10 +149,8 @@ cn9k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
 static __rte_always_inline uint16_t
 cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 			   struct rte_event *ev, const uint32_t flags,
-			   const void *const lookup_mem,
-			   struct cnxk_timesync_info *const tstamp)
+			   struct cn9k_sso_hws_dual *dws)
 {
-	const uint64_t set_gw = BIT_ULL(16) | 1;
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
@@ -161,7 +159,7 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 	uint64_t mbuf;
 
 	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
-		rte_prefetch_non_temporal(lookup_mem);
+		rte_prefetch_non_temporal(dws->lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "rty%=:					\n"
@@ -175,14 +173,14 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
 		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(base + SSOW_LF_GWS_TAG),
-		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP), [gw] "r"(set_gw),
+		       [wqp_loc] "r"(base + SSOW_LF_GWS_WQP), [gw] "r"(dws->gw_wdata),
 		       [pong] "r"(pair_base + SSOW_LF_GWS_OP_GET_WORK0));
 #else
 	gw.u64[0] = plt_read64(base + SSOW_LF_GWS_TAG);
 	while ((BIT_ULL(63)) & gw.u64[0])
 		gw.u64[0] = plt_read64(base + SSOW_LF_GWS_TAG);
 	gw.u64[1] = plt_read64(base + SSOW_LF_GWS_WQP);
-	plt_write64(set_gw, pair_base + SSOW_LF_GWS_OP_GET_WORK0);
+	plt_write64(dws->gw_wdata, pair_base + SSOW_LF_GWS_OP_GET_WORK0);
 	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
@@ -202,12 +200,13 @@ cn9k_sso_hws_dual_get_work(uint64_t base, uint64_t pair_base,
 			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
 			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
 					 gw.u64[0] & 0xFFFFF, flags,
-					 lookup_mem);
+					 dws->lookup_mem);
 			/* Extracting tstamp, if PTP enabled*/
 			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
 							    gw.u64[1]) +
 						   CNXK_SSO_WQE_SG_PTR);
-			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf,
+						dws->tstamp,
 						flags & NIX_RX_OFFLOAD_TSTAMP_F,
 						flags & NIX_RX_MULTI_SEG_F,
 						(uint64_t *)tstamp_ptr);
@@ -232,9 +231,7 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev,
 	uint64_t tstamp_ptr;
 	uint64_t mbuf;
 
-	plt_write64(BIT_ULL(16) | /* wait for work. */
-			    1,	  /* Use Mask set 0. */
-		    ws->base + SSOW_LF_GWS_OP_GET_WORK0);
+	plt_write64(ws->gw_wdata, ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 
 	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
 		rte_prefetch_non_temporal(lookup_mem);
@@ -529,9 +526,9 @@ NIX_RX_FASTPATH_MODES
 						SSOW_LF_GWS_TAG);              \
 			return 1;                                              \
 		}                                                              \
-		gw = cn9k_sso_hws_dual_get_work(                               \
-			dws->base[dws->vws], dws->base[!dws->vws], ev, flags,  \
-			dws->lookup_mem, dws->tstamp);                         \
+		gw = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],           \
+						dws->base[!dws->vws], ev,      \
+						flags, dws);                   \
 		dws->vws = !dws->vws;                                          \
 		return gw;                                                     \
 	}
@@ -554,14 +551,14 @@ NIX_RX_FASTPATH_MODES
 						SSOW_LF_GWS_TAG);              \
 			return ret;                                            \
 		}                                                              \
-		ret = cn9k_sso_hws_dual_get_work(                              \
-			dws->base[dws->vws], dws->base[!dws->vws], ev, flags,  \
-			dws->lookup_mem, dws->tstamp);                         \
+		ret = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],          \
+						 dws->base[!dws->vws], ev,     \
+						 flags, dws);                  \
 		dws->vws = !dws->vws;                                          \
 		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
-			ret = cn9k_sso_hws_dual_get_work(                      \
-				dws->base[dws->vws], dws->base[!dws->vws], ev, \
-				flags, dws->lookup_mem, dws->tstamp);          \
+			ret = cn9k_sso_hws_dual_get_work(dws->base[dws->vws],  \
+							 dws->base[!dws->vws], \
+							 ev, flags, dws);      \
 			dws->vws = !dws->vws;                                  \
 		}                                                              \
 		return ret;                                                    \
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index 6ad4e23e2b..be021d86c9 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -610,7 +610,7 @@ cnxk_sso_init(struct rte_eventdev *event_dev)
 	}
 
 	dev->is_timeout_deq = 0;
-	dev->min_dequeue_timeout_ns = USEC2NSEC(1);
+	dev->min_dequeue_timeout_ns = 0;
 	dev->max_dequeue_timeout_ns = USEC2NSEC(0x3FF);
 	dev->max_num_events = -1;
 	dev->nb_event_queues = 0;
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index ab58508590..e3b5ffa7eb 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -144,6 +144,7 @@ struct cn10k_sso_hws {
 /* Event port a.k.a GWS */
 struct cn9k_sso_hws {
 	uint64_t base;
+	uint64_t gw_wdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
@@ -160,6 +161,7 @@ struct cn9k_sso_hws {
 
 struct cn9k_sso_hws_dual {
 	uint64_t base[2]; /* Ping and Pong */
+	uint64_t gw_wdata;
 	/* PTP timestamp */
 	struct cnxk_timesync_info *tstamp;
 	void *lookup_mem;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v4 3/3] event/cnxk: improve Rx performance
  2022-02-10 13:20   ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status pbhagavatula
  2022-02-10 13:20     ` [PATCH v4 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
@ 2022-02-10 13:20     ` pbhagavatula
  2022-02-14  9:29     ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status Jerin Jacob
  2 siblings, 0 replies; 16+ messages in thread
From: pbhagavatula @ 2022-02-10 13:20 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve vWQE and CQ Rx performance by tuning prefetches to the 64B
cacheline size.
Also, prefetch the vWQE array offsets at cacheline boundaries.
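
As a rough sketch of the prefetch pattern used here (the helper name is
illustrative): with 8-byte pointers and a 64B cacheline there are eight
pointers per line, so stepping eight entries at a time issues exactly one
non-temporal prefetch per cacheline of the vector's pointer array:

	#include <rte_common.h>
	#include <rte_prefetch.h>

	#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))

	/* Prefetch each 64B cacheline of the pointer array exactly once */
	static inline void
	prefetch_vec_ptrs(void **ptrs, uint16_t nb_elem)
	{
		uint16_t i;

		rte_prefetch_non_temporal(&ptrs[0]);
		for (i = OBJS_PER_CLINE; i < nb_elem; i += OBJS_PER_CLINE)
			rte_prefetch_non_temporal(&ptrs[i]);
	}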

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h | 25 +++++++++++++++----------
 drivers/net/cnxk/cn10k_rx.h       |  8 ++++----
 drivers/net/cnxk/cn9k_rx.h        | 20 ++++++++++----------
 3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index ada230ea1d..cfe729cef9 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,11 +118,17 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 	uint8_t loff = 0;
 	uint64_t sa_base;
 	uint64_t **wqe;
+	int i;
 
 	mbuf_init |= ((uint64_t)port_id) << 48;
 	vec = (struct rte_event_vector *)vwqe;
 	wqe = vec->u64s;
 
+	rte_prefetch_non_temporal(&vec->ptrs[0]);
+#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
+	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
+		rte_prefetch_non_temporal(&vec->ptrs[i]);
+
 	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
 	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
 					      flags | NIX_RX_VWQE_F, lookup_mem,
@@ -191,15 +197,13 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		uint64_t u64[2];
 	} gw;
 	uint64_t tstamp_ptr;
-	uint64_t mbuf;
 
 	gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
 	asm volatile(
 		PLT_CPU_FEATURE_PREAMBLE
-		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-		"sub %[mbuf], %H[wdata], #0x80				\n"
-		: [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
+		"caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
+		: [wdata] "+r"(gw.get_work)
 		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
@@ -208,14 +212,12 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
-	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 	ws->gw_rdata = gw.u64[0];
-	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
-		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
-		    (gw.u64[0] & 0xffffffff);
-
-	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+	if (gw.u64[1]) {
+		gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
+			    (gw.u64[0] & (0x3FFull << 36)) << 4 |
+			    (gw.u64[0] & 0xffffffff);
 		if ((flags & CPT_RX_WQE_F) &&
 		    (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 		     RTE_EVENT_TYPE_CRYPTODEV)) {
@@ -223,7 +225,10 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		} else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 			   RTE_EVENT_TYPE_ETHDEV) {
 			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+			uint64_t mbuf;
 
+			mbuf = gw.u64[1] - sizeof(struct rte_mbuf);
+			rte_prefetch0((void *)mbuf);
 			if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
 				struct rte_mbuf *m;
 				uintptr_t sa_base;
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 8b00fcc660..564e50f0af 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -610,10 +610,10 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		}
 
 		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 8, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 9, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 10, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 11, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
 
 		/* Get NIX_RX_SG_S for size and buffer pointer */
 		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 1178f95317..d36f292c95 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -388,16 +388,16 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		ol_flags =
 			nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);
 
-	mbuf->pkt_len = len;
-	mbuf->data_len = len;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
-
 	mbuf->ol_flags = ol_flags;
+	*(uint64_t *)(&mbuf->rearm_data) = val;
+	mbuf->pkt_len = len;
 
-	if (flag & NIX_RX_MULTI_SEG_F)
+	if (flag & NIX_RX_MULTI_SEG_F) {
 		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
-	else
+	} else {
+		mbuf->data_len = len;
 		mbuf->next = NULL;
+	}
 }
 
 static inline uint16_t
@@ -769,10 +769,6 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
 
-		/* Store the mbufs to rx_pkts */
-		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
-		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
-
 		if (flags & NIX_RX_MULTI_SEG_F) {
 			/* Multi segment is enable build mseg list for
 			 * individual mbufs in scalar mode.
@@ -797,6 +793,10 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			mbuf3->next = NULL;
 		}
 
+		/* Store the mbufs to rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
+		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v4] net/cnxk: avoid command copy from Tx queue
  2022-02-10 13:15   ` [PATCH v4] " pbhagavatula
@ 2022-02-11 10:27     ` Jerin Jacob
  0 siblings, 0 replies; 16+ messages in thread
From: Jerin Jacob @ 2022-02-11 10:27 UTC (permalink / raw)
  To: Pavan Nikhilesh, Ferruh Yigit
  Cc: Jerin Jacob, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Ankur Dwivedi, Anoob Joseph, Tejasree Kondoj,
	Shijith Thotton, dpdk-dev

On Thu, Feb 10, 2022 at 6:46 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Tx command is prepared based on offloads enabled and stored in
> Tx queue structure at tx_queue_setup phase.
> In fastpath the command is copied from Tx queue to LMT line for
> all the packets.
> Since, the command contents are mostly constants we can move the
> command preparation to fastpath and avoid accessing Tx queue
> memory.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Acked-by: Jerin Jacob <jerinj@marvell.com>
Applied to dpdk-next-net-mrvl/for-next-net. Thanks


> ---
>  v4 Changes:
>  - Further refactor large functions.
>  v3 Changes:
>  - Rebase.
>  - Split patches.
>  - Refactoring large function.
>  v2 Changes:
>  - Rebase.
>  - Fix incorrect use of RoC API
>
>  drivers/common/cnxk/roc_io.h             |  33 ++++-
>  drivers/common/cnxk/roc_io_generic.h     |  15 ++
>  drivers/crypto/cnxk/cn9k_cryptodev_ops.c |   2 +-
>  drivers/crypto/cnxk/cn9k_ipsec.c         |   2 +-
>  drivers/event/cnxk/cn10k_eventdev.c      |  26 +++-
>  drivers/event/cnxk/cn10k_worker.h        |  89 ++++++------
>  drivers/event/cnxk/cn9k_eventdev.c       |  33 +++--
>  drivers/event/cnxk/cn9k_worker.h         |  64 ++++----
>  drivers/event/cnxk/cnxk_eventdev.h       |  13 +-
>  drivers/event/cnxk/cnxk_eventdev_adptr.c | 178 +++++++++++++++++++++--
>  drivers/net/cnxk/cn10k_ethdev.c          |  24 +--
>  drivers/net/cnxk/cn10k_ethdev.h          |   3 +-
>  drivers/net/cnxk/cn10k_tx.h              | 167 ++++++++++-----------
>  drivers/net/cnxk/cn9k_ethdev.c           |  36 +----
>  drivers/net/cnxk/cn9k_ethdev.h           |   3 +-
>  drivers/net/cnxk/cn9k_tx.h               | 135 +++++++++++------
>  16 files changed, 516 insertions(+), 307 deletions(-)
>
> diff --git a/drivers/common/cnxk/roc_io.h b/drivers/common/cnxk/roc_io.h
> index 4f15503c29..62e98d9d00 100644
> --- a/drivers/common/cnxk/roc_io.h
> +++ b/drivers/common/cnxk/roc_io.h
> @@ -164,13 +164,36 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
>         dst128[1] = src128[1];
>         /* lmtext receives following value:
>          * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
> -        * 2: NIX_SUBDC_EXT + NIX_SUBDC_MEM i.e. tstamp case
>          */
> -       if (lmtext) {
> +       if (lmtext)
> +               dst128[2] = src128[2];
> +}
> +
> +static __plt_always_inline void
> +roc_lmt_mov64(void *out, const void *in)
> +{
> +       volatile const __uint128_t *src128 = (const __uint128_t *)in;
> +       volatile __uint128_t *dst128 = (__uint128_t *)out;
> +
> +       dst128[0] = src128[0];
> +       dst128[1] = src128[1];
> +       dst128[2] = src128[2];
> +       dst128[3] = src128[3];
> +}
> +
> +static __plt_always_inline void
> +roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
> +{
> +       const __uint128_t *src128 = (const __uint128_t *)in;
> +       __uint128_t *dst128 = (__uint128_t *)out;
> +
> +       dst128[0] = src128[0];
> +       dst128[1] = src128[1];
> +       /* lmtext receives following value:
> +        * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
> +        */
> +       if (lmtext)
>                 dst128[2] = src128[2];
> -               if (lmtext > 1)
> -                       dst128[3] = src128[3];
> -       }
>  }
>
>  static __plt_always_inline void
> diff --git a/drivers/common/cnxk/roc_io_generic.h b/drivers/common/cnxk/roc_io_generic.h
> index 5f90835c09..42764455cc 100644
> --- a/drivers/common/cnxk/roc_io_generic.h
> +++ b/drivers/common/cnxk/roc_io_generic.h
> @@ -106,6 +106,21 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
>         memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
>  }
>
> +static __plt_always_inline void
> +roc_lmt_mov64(void *out, const void *in)
> +{
> +       PLT_SET_USED(out);
> +       PLT_SET_USED(in);
> +}
> +
> +static __plt_always_inline void
> +roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
> +{
> +       PLT_SET_USED(in);
> +       PLT_SET_USED(lmtext);
> +       memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
> +}
> +
>  static __plt_always_inline void
>  roc_lmt_mov_seg(void *out, const void *in, const uint16_t segdw)
>  {
> diff --git a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
> index ac1953b66d..ddba9d5dd0 100644
> --- a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
> +++ b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
> @@ -161,7 +161,7 @@ cn9k_cpt_inst_submit(struct cpt_inst_s *inst, uint64_t lmtline,
>
>         do {
>                 /* Copy CPT command to LMTLINE */
> -               roc_lmt_mov((void *)lmtline, inst, 2);
> +               roc_lmt_mov64((void *)lmtline, inst);
>
>                 /*
>                  * Make sure compiler does not reorder memcpy and ldeor.
> diff --git a/drivers/crypto/cnxk/cn9k_ipsec.c b/drivers/crypto/cnxk/cn9k_ipsec.c
> index 9f876f75f2..672b65a5d2 100644
> --- a/drivers/crypto/cnxk/cn9k_ipsec.c
> +++ b/drivers/crypto/cnxk/cn9k_ipsec.c
> @@ -53,7 +53,7 @@ cn9k_cpt_enq_sa_write(struct cn9k_ipsec_sa *sa, struct cnxk_cpt_qp *qp,
>
>         do {
>                 /* Copy CPT command to LMTLINE */
> -               roc_lmt_mov((void *)lmtline, &inst, 2);
> +               roc_lmt_mov64((void *)lmtline, &inst);
>                 lmt_status = roc_lmt_submit_ldeor(io_addr);
>         } while (lmt_status == 0);
>
> diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
> index 7b7ce44c74..97a88feb13 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.c
> +++ b/drivers/event/cnxk/cn10k_eventdev.c
> @@ -50,7 +50,6 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
>         /* First cache line is reserved for cookie */
>         ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
>         ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
> -       ws->tx_base = ws->base;
>         ws->hws_id = port_id;
>         ws->swtag_req = 0;
>         ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
> @@ -259,15 +258,13 @@ cn10k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
>                         ws_cookie,
>                         sizeof(struct cnxk_sso_hws_cookie) +
>                                 sizeof(struct cn10k_sso_hws) +
> -                               (sizeof(uint64_t) * (dev->max_port_id + 1) *
> -                                RTE_MAX_QUEUES_PER_PORT),
> +                               dev->tx_adptr_data_sz,
>                         RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
>                 if (ws_cookie == NULL)
>                         return -ENOMEM;
>                 ws = RTE_PTR_ADD(ws_cookie, sizeof(struct cnxk_sso_hws_cookie));
>                 memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
> -                      sizeof(uint64_t) * (dev->max_port_id + 1) *
> -                              RTE_MAX_QUEUES_PER_PORT);
> +                      dev->tx_adptr_data_sz);
>                 event_dev->data->ports[i] = ws;
>         }
>
> @@ -721,16 +718,35 @@ cn10k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
>                                const struct rte_eth_dev *eth_dev,
>                                int32_t tx_queue_id)
>  {
> +       struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
> +       struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> +       uint64_t tx_offloads;
>         int rc;
>
>         RTE_SET_USED(id);
>         rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
>         if (rc < 0)
>                 return rc;
> +
> +       /* Can't enable tstamp if all the ports don't have it enabled. */
> +       tx_offloads = cnxk_eth_dev->tx_offload_flags;
> +       if (dev->tx_adptr_configured) {
> +               uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
> +               uint8_t tstmp_ena =
> +                       !!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
> +
> +               if (tstmp_ena && !tstmp_req)
> +                       dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
> +               else if (!tstmp_ena && tstmp_req)
> +                       tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
> +       }
> +
> +       dev->tx_offloads |= tx_offloads;
>         rc = cn10k_sso_updt_tx_adptr_data(event_dev);
>         if (rc < 0)
>                 return rc;
>         cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
> +       dev->tx_adptr_configured = 1;
>
>         return 0;
>  }
> diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
> index 4019c13bd2..ff08b2d974 100644
> --- a/drivers/event/cnxk/cn10k_worker.h
> +++ b/drivers/event/cnxk/cn10k_worker.h
> @@ -455,18 +455,18 @@ NIX_RX_FASTPATH_MODES
>         }
>
>  static __rte_always_inline struct cn10k_eth_txq *
> -cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
> -                         const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
> +cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
>  {
> -       return (struct cn10k_eth_txq *)
> -               txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
> +       return (struct cn10k_eth_txq
> +                       *)(txq_data[(txq_data[m->port] >> 48) +
> +                                   rte_event_eth_tx_adapter_txq_get(m)] &
> +                          (BIT_ULL(48) - 1));
>  }
>
>  static __rte_always_inline void
> -cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
> -                uintptr_t lmt_addr, uint8_t sched_type, uintptr_t base,
> -                const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> -                const uint32_t flags)
> +cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
> +                uint16_t lmt_id, uintptr_t lmt_addr, uint8_t sched_type,
> +                const uint64_t *txq_data, const uint32_t flags)
>  {
>         uint8_t lnum = 0, loff = 0, shft = 0;
>         struct cn10k_eth_txq *txq;
> @@ -476,7 +476,7 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
>         bool sec;
>
>         txq = cn10k_sso_hws_xtract_meta(m, txq_data);
> -       cn10k_nix_tx_skeleton(txq, cmd, flags);
> +       cn10k_nix_tx_skeleton(txq, cmd, flags, 0);
>         /* Perform header writes before barrier
>          * for TSO
>          */
> @@ -501,23 +501,23 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
>         else
>                 segdw = cn10k_nix_tx_ext_subs(flags) + 2;
>
> +       cn10k_nix_xmit_prepare_tstamp(txq, laddr, m->ol_flags, segdw, flags);
>         if (flags & NIX_TX_OFFLOAD_SECURITY_F && sec)
>                 pa = txq->cpt_io_addr | 3 << 4;
>         else
>                 pa = txq->io_addr | ((segdw - 1) << 4);
>
>         if (!sched_type)
> -               roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> +               roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);
>
>         roc_lmt_submit_steorl(lmt_id, pa);
>  }
>
>  static __rte_always_inline void
> -cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
> -                       uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
> -                       uint8_t sched_type, uintptr_t base,
> -                       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> -                       const uint32_t flags)
> +cn10k_sso_vwqe_split_tx(struct cn10k_sso_hws *ws, struct rte_mbuf **mbufs,
> +                       uint16_t nb_mbufs, uint64_t *cmd, uint16_t lmt_id,
> +                       uintptr_t lmt_addr, uint8_t sched_type,
> +                       const uint64_t *txq_data, const uint32_t flags)
>  {
>         uint16_t port[4], queue[4];
>         uint16_t i, j, pkts, scalar;
> @@ -540,14 +540,16 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
>                 if (((port[0] ^ port[1]) & (port[2] ^ port[3])) ||
>                     ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) {
>                         for (j = 0; j < 4; j++)
> -                               cn10k_sso_tx_one(mbufs[i + j], cmd, lmt_id,
> -                                                lmt_addr, sched_type, base,
> -                                                txq_data, flags);
> +                               cn10k_sso_tx_one(ws, mbufs[i + j], cmd, lmt_id,
> +                                                lmt_addr, sched_type, txq_data,
> +                                                flags);
>                 } else {
> -                       txq = (struct cn10k_eth_txq *)
> -                               txq_data[port[0]][queue[0]];
> -                       cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd,
> -                                                  base + SSOW_LF_GWS_TAG,
> +                       txq = (struct cn10k_eth_txq
> +                                      *)(txq_data[(txq_data[port[0]] >> 48) +
> +                                                  queue[0]] &
> +                                         (BIT_ULL(48) - 1));
> +                       cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws,
> +                                                  &mbufs[i], 4, cmd,
>                                                    flags | NIX_TX_VWQE_F);
>                 }
>         }
> @@ -555,15 +557,14 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
>         mbufs += i;
>
>         for (i = 0; i < scalar; i++) {
> -               cn10k_sso_tx_one(mbufs[i], cmd, lmt_id, lmt_addr, sched_type,
> -                                base, txq_data, flags);
> +               cn10k_sso_tx_one(ws, mbufs[i], cmd, lmt_id, lmt_addr,
> +                                sched_type, txq_data, flags);
>         }
>  }
>
>  static __rte_always_inline uint16_t
>  cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
> -                      uint64_t *cmd,
> -                      const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> +                      uint64_t *cmd, const uint64_t *txq_data,
>                        const uint32_t flags)
>  {
>         struct cn10k_eth_txq *txq;
> @@ -580,17 +581,19 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
>                 uint64_t meta = *(uint64_t *)ev->vec;
>
>                 if (meta & BIT(31)) {
> -                       txq = (struct cn10k_eth_txq *)
> -                               txq_data[meta >> 32][meta >> 48];
> -
> -                       cn10k_nix_xmit_pkts_vector(
> -                               txq, mbufs, meta & 0xFFFF, cmd,
> -                               ws->tx_base + SSOW_LF_GWS_TAG,
> -                               flags | NIX_TX_VWQE_F);
> +                       txq = (struct cn10k_eth_txq
> +                                      *)(txq_data[(txq_data[meta >> 32] >>
> +                                                   48) +
> +                                                  (meta >> 48)] &
> +                                         (BIT_ULL(48) - 1));
> +
> +                       cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws, mbufs,
> +                                                  meta & 0xFFFF, cmd,
> +                                                  flags | NIX_TX_VWQE_F);
>                 } else {
>                         cn10k_sso_vwqe_split_tx(
> -                               mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
> -                               ev->sched_type, ws->tx_base, txq_data, flags);
> +                               ws, mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
> +                               ev->sched_type, txq_data, flags);
>                 }
>                 rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
>                 return (meta & 0xFFFF);
> @@ -598,16 +601,16 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
>
>         m = ev->mbuf;
>         ref_cnt = m->refcnt;
> -       cn10k_sso_tx_one(m, cmd, lmt_id, lmt_addr, ev->sched_type, ws->tx_base,
> -                        txq_data, flags);
> +       cn10k_sso_tx_one(ws, m, cmd, lmt_id, lmt_addr, ev->sched_type, txq_data,
> +                        flags);
>
>         if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
>                 if (ref_cnt > 1)
>                         return 1;
>         }
>
> -       cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
> -                                ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
> +       cnxk_sso_hws_swtag_flush(ws->base + SSOW_LF_GWS_TAG,
> +                                ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
>         return 1;
>  }
>
> @@ -628,9 +631,7 @@ NIX_TX_FASTPATH_MODES
>                 uint64_t cmd[sz];                                              \
>                 RTE_SET_USED(nb_events);                                       \
>                 return cn10k_sso_hws_event_tx(                                 \
> -                       ws, &ev[0], cmd,                                       \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> +                       ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
>                         flags);                                                \
>         }
>
> @@ -642,9 +643,7 @@ NIX_TX_FASTPATH_MODES
>                 struct cn10k_sso_hws *ws = port;                               \
>                 RTE_SET_USED(nb_events);                                       \
>                 return cn10k_sso_hws_event_tx(                                 \
> -                       ws, &ev[0], cmd,                                       \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> +                       ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
>                         (flags) | NIX_TX_MULTI_SEG_F);                         \
>         }
>
> diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
> index 4611936b7f..f8652d4fbc 100644
> --- a/drivers/event/cnxk/cn9k_eventdev.c
> +++ b/drivers/event/cnxk/cn9k_eventdev.c
> @@ -259,17 +259,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
>                                 ws_cookie,
>                                 sizeof(struct cnxk_sso_hws_cookie) +
>                                         sizeof(struct cn9k_sso_hws_dual) +
> -                                       (sizeof(uint64_t) *
> -                                        (dev->max_port_id + 1) *
> -                                        RTE_MAX_QUEUES_PER_PORT),
> +                                       dev->tx_adptr_data_sz,
>                                 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
>                         if (ws_cookie == NULL)
>                                 return -ENOMEM;
>                         dws = RTE_PTR_ADD(ws_cookie,
>                                           sizeof(struct cnxk_sso_hws_cookie));
>                         memcpy(&dws->tx_adptr_data, dev->tx_adptr_data,
> -                              sizeof(uint64_t) * (dev->max_port_id + 1) *
> -                                      RTE_MAX_QUEUES_PER_PORT);
> +                              dev->tx_adptr_data_sz);
>                         event_dev->data->ports[i] = dws;
>                 } else {
>                         struct cn9k_sso_hws *ws = event_dev->data->ports[i];
> @@ -280,17 +277,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
>                                 ws_cookie,
>                                 sizeof(struct cnxk_sso_hws_cookie) +
>                                         sizeof(struct cn9k_sso_hws_dual) +
> -                                       (sizeof(uint64_t) *
> -                                        (dev->max_port_id + 1) *
> -                                        RTE_MAX_QUEUES_PER_PORT),
> +                                       dev->tx_adptr_data_sz,
>                                 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
>                         if (ws_cookie == NULL)
>                                 return -ENOMEM;
>                         ws = RTE_PTR_ADD(ws_cookie,
>                                          sizeof(struct cnxk_sso_hws_cookie));
>                         memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
> -                              sizeof(uint64_t) * (dev->max_port_id + 1) *
> -                                      RTE_MAX_QUEUES_PER_PORT);
> +                              dev->tx_adptr_data_sz);
>                         event_dev->data->ports[i] = ws;
>                 }
>         }
> @@ -987,17 +981,36 @@ cn9k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
>                               const struct rte_eth_dev *eth_dev,
>                               int32_t tx_queue_id)
>  {
> +       struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
> +       struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> +       uint64_t tx_offloads;
>         int rc;
>
>         RTE_SET_USED(id);
>         rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
>         if (rc < 0)
>                 return rc;
> +
> +       /* Can't enable tstamp if all the ports don't have it enabled. */
> +       tx_offloads = cnxk_eth_dev->tx_offload_flags;
> +       if (dev->tx_adptr_configured) {
> +               uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
> +               uint8_t tstmp_ena =
> +                       !!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
> +
> +               if (tstmp_ena && !tstmp_req)
> +                       dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
> +               else if (!tstmp_ena && tstmp_req)
> +                       tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
> +       }
> +
> +       dev->tx_offloads |= tx_offloads;
>         cn9k_sso_txq_fc_update(eth_dev, tx_queue_id, true);
>         rc = cn9k_sso_updt_tx_adptr_data(event_dev);
>         if (rc < 0)
>                 return rc;
>         cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
> +       dev->tx_adptr_configured = 1;
>
>         return 0;
>  }
> diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
> index c99e459c1b..303b04c215 100644
> --- a/drivers/event/cnxk/cn9k_worker.h
> +++ b/drivers/event/cnxk/cn9k_worker.h
> @@ -599,20 +599,13 @@ cn9k_sso_txq_fc_wait(const struct cn9k_eth_txq *txq)
>                 ;
>  }
>
> -static __rte_always_inline const struct cn9k_eth_txq *
> -cn9k_sso_hws_xtract_meta(struct rte_mbuf *m,
> -                        const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
> +static __rte_always_inline struct cn9k_eth_txq *
> +cn9k_sso_hws_xtract_meta(struct rte_mbuf *m, uint64_t *txq_data)
>  {
> -       return (const struct cn9k_eth_txq *)
> -               txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
> -}
> -
> -static __rte_always_inline void
> -cn9k_sso_hws_prepare_pkt(const struct cn9k_eth_txq *txq, struct rte_mbuf *m,
> -                        uint64_t *cmd, const uint32_t flags)
> -{
> -       roc_lmt_mov(cmd, txq->cmd, cn9k_nix_tx_ext_subs(flags));
> -       cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
> +       return (struct cn9k_eth_txq
> +                       *)(txq_data[(txq_data[m->port] >> 48) +
> +                                   rte_event_eth_tx_adapter_txq_get(m)] &
> +                          (BIT_ULL(48) - 1));
>  }
>
>  #if defined(RTE_ARCH_ARM64)
> @@ -669,7 +662,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
>         nixtx += BIT_ULL(7);
>         nixtx = (nixtx - 1) & ~(BIT_ULL(7) - 1);
>
> -       roc_lmt_mov((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));
> +       roc_lmt_mov_nv((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));
>
>         /* Load opcode and cptr already prepared at pkt metadata set */
>         pkt_len -= l2_len;
> @@ -756,12 +749,11 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
>
>  static __rte_always_inline uint16_t
>  cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
> -                     const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> -                     const uint32_t flags)
> +                     uint64_t *txq_data, const uint32_t flags)
>  {
>         struct rte_mbuf *m = ev->mbuf;
> -       const struct cn9k_eth_txq *txq;
>         uint16_t ref_cnt = m->refcnt;
> +       struct cn9k_eth_txq *txq;
>
>         /* Perform header writes before barrier for TSO */
>         cn9k_nix_xmit_prepare_tso(m, flags);
> @@ -774,7 +766,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>             !(flags & NIX_TX_OFFLOAD_SECURITY_F))
>                 rte_io_wmb();
>         txq = cn9k_sso_hws_xtract_meta(m, txq_data);
> -       cn9k_sso_hws_prepare_pkt(txq, m, cmd, flags);
> +       cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
> +       cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
>
>         if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
>                 uint64_t ol_flags = m->ol_flags;
> @@ -796,6 +789,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>
>         if (flags & NIX_TX_MULTI_SEG_F) {
>                 const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
> +               cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, segdw,
> +                                            flags);
>                 if (!CNXK_TT_FROM_EVENT(ev->event)) {
>                         cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
>                         roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> @@ -808,6 +803,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>                                                segdw);
>                 }
>         } else {
> +               cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
>                 if (!CNXK_TT_FROM_EVENT(ev->event)) {
>                         cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
>                         roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> @@ -853,11 +849,9 @@ NIX_TX_FASTPATH_MODES
>                 struct cn9k_sso_hws *ws = port;                                \
>                 uint64_t cmd[sz];                                              \
>                 RTE_SET_USED(nb_events);                                       \
> -               return cn9k_sso_hws_event_tx(                                  \
> -                       ws->base, &ev[0], cmd,                                 \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> -                       flags);                                                \
> +               return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
> +                                            (uint64_t *)ws->tx_adptr_data,    \
> +                                            flags);                           \
>         }
>
>  #define SSO_TX_SEG(fn, sz, flags)                                              \
> @@ -867,11 +861,9 @@ NIX_TX_FASTPATH_MODES
>                 uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
>                 struct cn9k_sso_hws *ws = port;                                \
>                 RTE_SET_USED(nb_events);                                       \
> -               return cn9k_sso_hws_event_tx(                                  \
> -                       ws->base, &ev[0], cmd,                                 \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> -                       (flags) | NIX_TX_MULTI_SEG_F);                         \
> +               return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
> +                                            (uint64_t *)ws->tx_adptr_data,    \
> +                                            (flags) | NIX_TX_MULTI_SEG_F);    \
>         }
>
>  #define SSO_DUAL_TX(fn, sz, flags)                                             \
> @@ -881,11 +873,9 @@ NIX_TX_FASTPATH_MODES
>                 struct cn9k_sso_hws_dual *ws = port;                           \
>                 uint64_t cmd[sz];                                              \
>                 RTE_SET_USED(nb_events);                                       \
> -               return cn9k_sso_hws_event_tx(                                  \
> -                       ws->base[!ws->vws], &ev[0], cmd,                       \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> -                       flags);                                                \
> +               return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
> +                                            (uint64_t *)ws->tx_adptr_data,    \
> +                                            flags);                           \
>         }
>
>  #define SSO_DUAL_TX_SEG(fn, sz, flags)                                         \
> @@ -895,11 +885,9 @@ NIX_TX_FASTPATH_MODES
>                 uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
>                 struct cn9k_sso_hws_dual *ws = port;                           \
>                 RTE_SET_USED(nb_events);                                       \
> -               return cn9k_sso_hws_event_tx(                                  \
> -                       ws->base[!ws->vws], &ev[0], cmd,                       \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> -                       (flags) | NIX_TX_MULTI_SEG_F);                         \
> +               return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
> +                                            (uint64_t *)ws->tx_adptr_data,    \
> +                                            (flags) | NIX_TX_MULTI_SEG_F);    \
>         }
>
>  #endif
> diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
> index 4652b58a84..b26df58588 100644
> --- a/drivers/event/cnxk/cnxk_eventdev.h
> +++ b/drivers/event/cnxk/cnxk_eventdev.h
> @@ -99,7 +99,10 @@ struct cnxk_sso_evdev {
>         uint16_t rx_adptr_pool_cnt;
>         uint64_t *rx_adptr_pools;
>         uint64_t *tx_adptr_data;
> +       size_t tx_adptr_data_sz;
>         uint16_t max_port_id;
> +       uint16_t max_queue_id[RTE_MAX_ETHPORTS];
> +       uint8_t tx_adptr_configured;
>         uint16_t tim_adptr_ring_cnt;
>         uint16_t *timer_adptr_rings;
>         uint64_t *timer_adptr_sz;
> @@ -131,8 +134,8 @@ struct cn10k_sso_hws {
>         uint64_t *fc_mem;
>         uintptr_t grp_base;
>         /* Tx Fastpath data */
> -       uint64_t tx_base __rte_cache_aligned;
> -       uintptr_t lmt_base;
> +       uintptr_t lmt_base __rte_cache_aligned;
> +       uint64_t lso_tun_fmt;
>         uint8_t tx_adptr_data[];
>  } __rte_cache_aligned;
>
> @@ -149,7 +152,8 @@ struct cn9k_sso_hws {
>         uint64_t *fc_mem;
>         uintptr_t grp_base;
>         /* Tx Fastpath data */
> -       uint8_t tx_adptr_data[] __rte_cache_aligned;
> +       uint64_t lso_tun_fmt __rte_cache_aligned;
> +       uint8_t tx_adptr_data[];
>  } __rte_cache_aligned;
>
>  struct cn9k_sso_hws_dual {
> @@ -165,7 +169,8 @@ struct cn9k_sso_hws_dual {
>         uint64_t *fc_mem;
>         uintptr_t grp_base;
>         /* Tx Fastpath data */
> -       uint8_t tx_adptr_data[] __rte_cache_aligned;
> +       uint64_t lso_tun_fmt __rte_cache_aligned;
> +       uint8_t tx_adptr_data[];
>  } __rte_cache_aligned;
>
>  struct cnxk_sso_hws_cookie {
> diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
> index fdcd68ca63..5ebd3340e7 100644
> --- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
> +++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
> @@ -339,30 +339,179 @@ cnxk_sso_sqb_aura_limit_edit(struct roc_nix_sq *sq, uint16_t nb_sqb_bufs)
>                 sq->aura_handle, RTE_MIN(nb_sqb_bufs, sq->aura_sqb_bufs));
>  }
>
> +static void
> +cnxk_sso_tx_queue_data_init(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                           uint16_t eth_port_id, uint16_t tx_queue_id)
> +{
> +       uint64_t offset = 0;
> +       int i;
> +
> +       dev->max_queue_id[0] = RTE_MAX(dev->max_queue_id[0], eth_port_id);
> +       for (i = 1; i < eth_port_id; i++) {
> +               offset += (dev->max_queue_id[i - 1] + 1);
> +               txq_data[i] |= offset << 48;
> +       }
> +       dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
> +       dev->max_queue_id[eth_port_id] =
> +               RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_cpy(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                          uint64_t *otxq_data, uint16_t eth_port_id)
> +{
> +       uint64_t offset = 0;
> +       int i, j;
> +
> +       for (i = 1; i < eth_port_id; i++) {
> +               offset += (dev->max_queue_id[i - 1] + 1);
> +               txq_data[i] |= offset << 48;
> +               for (j = 0;
> +                    (i < dev->max_port_id) && (j < dev->max_queue_id[i] + 1);
> +                    j++)
> +                       txq_data[offset + j] =
> +                               otxq_data[(otxq_data[i] >> 48) + j];
> +       }
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_cpy_max(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                              uint64_t *otxq_data, uint16_t eth_port_id,
> +                              uint16_t max_port_id, uint16_t max_queue_id)
> +{
> +       uint64_t offset = 0;
> +       int i, j;
> +
> +       for (i = 1; i < max_port_id + 1; i++) {
> +               offset += (dev->max_queue_id[i - 1] + 1);
> +               txq_data[i] |= offset << 48;
> +               for (j = 0; j < dev->max_queue_id[i] + 1; j++) {
> +                       if (i == eth_port_id && j > max_queue_id)
> +                               continue;
> +                       txq_data[offset + j] =
> +                               otxq_data[(otxq_data[i] >> 48) + j];
> +               }
> +       }
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_rewrite(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                              uint16_t eth_port_id, uint16_t tx_queue_id,
> +                              uint64_t *otxq_data, uint16_t max_port_id,
> +                              uint16_t max_queue_id)
> +{
> +       int i;
> +
> +       for (i = 0; i < dev->max_queue_id[0] + 1; i++)
> +               txq_data[i] |= (otxq_data[i] & ~((BIT_ULL(16) - 1) << 48));
> +
> +       if (eth_port_id > max_port_id) {
> +               dev->max_queue_id[0] =
> +                       RTE_MAX(dev->max_queue_id[0], eth_port_id);
> +               dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
> +
> +               cnxk_sso_tx_queue_data_cpy(dev, txq_data, otxq_data,
> +                                          eth_port_id);
> +               dev->max_queue_id[eth_port_id] =
> +                       RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
> +       } else if (tx_queue_id > max_queue_id) {
> +               dev->max_queue_id[eth_port_id] =
> +                       RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
> +               dev->max_port_id = RTE_MAX(max_port_id, eth_port_id);
> +               cnxk_sso_tx_queue_data_cpy_max(dev, txq_data, otxq_data,
> +                                              eth_port_id, max_port_id,
> +                                              max_queue_id);
> +       }
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_sz(struct cnxk_sso_evdev *dev, uint16_t eth_port_id,
> +                         uint16_t tx_queue_id, uint16_t max_port_id,
> +                         uint16_t max_queue_id, uint64_t *r, size_t *sz)
> +{
> +       uint64_t row = 0;
> +       size_t size = 0;
> +       int i;
> +
> +       if (dev->tx_adptr_data == NULL) {
> +               size = (eth_port_id + 1);
> +               size += (eth_port_id + tx_queue_id);
> +               row = 2 * eth_port_id;
> +               *r = row;
> +               *sz = size;
> +               return;
> +       }
> +
> +       if (eth_port_id > max_port_id) {
> +               size = (RTE_MAX(eth_port_id, dev->max_queue_id[0]) + 1);
> +               for (i = 1; i < eth_port_id; i++)
> +                       size += (dev->max_queue_id[i] + 1);
> +               row = size;
> +               size += (tx_queue_id + 1);
> +       } else if (tx_queue_id > max_queue_id) {
> +               size = !eth_port_id ?
> +                              tx_queue_id + 1 :
> +                                    RTE_MAX(max_port_id, dev->max_queue_id[0]) + 1;
> +               for (i = 1; i < max_port_id + 1; i++) {
> +                       if (i == eth_port_id) {
> +                               row = size;
> +                               size += tx_queue_id + 1;
> +                       } else {
> +                               size += dev->max_queue_id[i] + 1;
> +                       }
> +               }
> +       }
> +       *r = row;
> +       *sz = size;
> +}
> +
>  static int
>  cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
>                             uint16_t eth_port_id, uint16_t tx_queue_id,
>                             void *txq)
>  {
>         struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> +       uint16_t max_queue_id = dev->max_queue_id[eth_port_id];
>         uint16_t max_port_id = dev->max_port_id;
> -       uint64_t *txq_data = dev->tx_adptr_data;
> -
> -       if (txq_data == NULL || eth_port_id > max_port_id) {
> -               max_port_id = RTE_MAX(max_port_id, eth_port_id);
> -               txq_data = rte_realloc_socket(
> -                       txq_data,
> -                       (sizeof(uint64_t) * (max_port_id + 1) *
> -                        RTE_MAX_QUEUES_PER_PORT),
> -                       RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
> +       uint64_t *txq_data = NULL;
> +       uint64_t row = 0;
> +       size_t size = 0;
> +
> +       if (((uint64_t)txq) & 0xFFFF000000000000)
> +               return -EINVAL;
> +
> +       cnxk_sso_tx_queue_data_sz(dev, eth_port_id, tx_queue_id, max_port_id,
> +                                 max_queue_id, &row, &size);
> +
> +       size *= sizeof(uint64_t);
> +
> +       if (size) {
> +               uint64_t *otxq_data = dev->tx_adptr_data;
> +
> +               txq_data = malloc(size);
>                 if (txq_data == NULL)
>                         return -ENOMEM;
> +               memset(txq_data, 0, size);
> +               txq_data[eth_port_id] = ((uint64_t)row) << 48;
> +               txq_data[row + tx_queue_id] = (uint64_t)txq;
> +
> +               if (otxq_data != NULL)
> +                       cnxk_sso_tx_queue_data_rewrite(
> +                               dev, txq_data, eth_port_id, tx_queue_id,
> +                               otxq_data, max_port_id, max_queue_id);
> +               else
> +                       cnxk_sso_tx_queue_data_init(dev, txq_data, eth_port_id,
> +                                                   tx_queue_id);
> +               dev->tx_adptr_data_sz = size;
> +               free(otxq_data);
> +               dev->tx_adptr_data = txq_data;
> +       } else {
> +               txq_data = dev->tx_adptr_data;
> +               row = txq_data[eth_port_id] >> 48;
> +               txq_data[row + tx_queue_id] &= ~(BIT_ULL(48) - 1);
> +               txq_data[row + tx_queue_id] |= (uint64_t)txq;
>         }
>
> -       ((uint64_t(*)[RTE_MAX_QUEUES_PER_PORT])
> -                txq_data)[eth_port_id][tx_queue_id] = (uint64_t)txq;
> -       dev->max_port_id = max_port_id;
> -       dev->tx_adptr_data = txq_data;
>         return 0;
>  }
>
> @@ -372,7 +521,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
>                               int32_t tx_queue_id)
>  {
>         struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
> -       struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
>         struct roc_nix_sq *sq;
>         int i, ret;
>         void *txq;
> @@ -388,8 +536,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
>                         event_dev, eth_dev->data->port_id, tx_queue_id, txq);
>                 if (ret < 0)
>                         return ret;
> -
> -               dev->tx_offloads |= cnxk_eth_dev->tx_offload_flags;
>         }
>
>         return 0;
> diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
> index 8378cbffc2..9bb08e1824 100644
> --- a/drivers/net/cnxk/cn10k_ethdev.c
> +++ b/drivers/net/cnxk/cn10k_ethdev.c
> @@ -131,53 +131,31 @@ static void
>  nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn10k_eth_txq *txq,
>                       uint16_t qid)
>  {
> -       struct nix_send_ext_s *send_hdr_ext;
>         union nix_send_hdr_w0_u send_hdr_w0;
> -       struct nix_send_mem_s *send_mem;
> -       union nix_send_sg_s sg_w0;
> -
> -       RTE_SET_USED(dev);
>
>         /* Initialize the fields based on basic single segment packet */
> -       memset(&txq->cmd, 0, sizeof(txq->cmd));
>         send_hdr_w0.u = 0;
> -       sg_w0.u = 0;
> -
>         if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
>                 /* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
>                 send_hdr_w0.sizem1 = 2;
> -
> -               send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[0];
> -               send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
>                 if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
>                         /* Default: one seg packet would have:
>                          * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
>                          * => 8/2 - 1 = 3
>                          */
>                         send_hdr_w0.sizem1 = 3;
> -                       send_hdr_ext->w0.tstmp = 1;
>
>                         /* To calculate the offset for send_mem,
>                          * send_hdr->w0.sizem1 * 2
>                          */
> -                       send_mem = (struct nix_send_mem_s *)(txq->cmd + 2);
> -                       send_mem->w0.subdc = NIX_SUBDC_MEM;
> -                       send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP;
> -                       send_mem->addr = dev->tstamp.tx_tstamp_iova;
> +                       txq->ts_mem = dev->tstamp.tx_tstamp_iova;
>                 }
>         } else {
>                 /* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
>                 send_hdr_w0.sizem1 = 1;
>         }
> -
>         send_hdr_w0.sq = qid;
> -       sg_w0.subdc = NIX_SUBDC_SG;
> -       sg_w0.segs = 1;
> -       sg_w0.ld_type = NIX_SENDLDTYPE_LDD;
> -
>         txq->send_hdr_w0 = send_hdr_w0.u;
> -       txq->sg_w0 = sg_w0.u;
> -
>         rte_wmb();
>  }
>
> diff --git a/drivers/net/cnxk/cn10k_ethdev.h b/drivers/net/cnxk/cn10k_ethdev.h
> index 0982158c62..ec40e53152 100644
> --- a/drivers/net/cnxk/cn10k_ethdev.h
> +++ b/drivers/net/cnxk/cn10k_ethdev.h
> @@ -9,7 +9,6 @@
>
>  struct cn10k_eth_txq {
>         uint64_t send_hdr_w0;
> -       uint64_t sg_w0;
>         int64_t fc_cache_pkts;
>         uint64_t *fc_mem;
>         uintptr_t lmt_base;
> @@ -20,8 +19,8 @@ struct cn10k_eth_txq {
>         uint64_t sa_base;
>         uint64_t *cpt_fc;
>         uint16_t cpt_desc;
> -       uint64_t cmd[4];
>         uint64_t lso_tun_fmt;
> +       uint64_t ts_mem;
>  } __plt_cache_aligned;
>
>  struct cn10k_eth_rxq {
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index fc1f6ceb8c..4ae6bbf517 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -186,23 +186,26 @@ cn10k_cpt_tx_steor_data(void)
>  }
>
>  static __rte_always_inline void
> -cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
> -                     const uint16_t flags)
> +cn10k_nix_tx_skeleton(struct cn10k_eth_txq *txq, uint64_t *cmd,
> +                     const uint16_t flags, const uint16_t static_sz)
>  {
> -       /* Send hdr */
> -       cmd[0] = txq->send_hdr_w0;
> +       if (static_sz)
> +               cmd[0] = txq->send_hdr_w0;
> +       else
> +               cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
> +                        ((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
>         cmd[1] = 0;
> -       cmd += 2;
>
> -       /* Send ext if present */
>         if (flags & NIX_TX_NEED_EXT_HDR) {
> -               *(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
> -               cmd += 2;
> +               if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
> +                       cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
> +               else
> +                       cmd[2] = NIX_SUBDC_EXT << 60;
> +               cmd[3] = 0;
> +               cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
> +       } else {
> +               cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
>         }
> -
> -       /* Send sg */
> -       cmd[0] = txq->sg_w0;
> -       cmd[1] = 0;
>  }
>
>  static __rte_always_inline void
> @@ -718,41 +721,29 @@ cn10k_nix_xmit_mv_lmt_base(uintptr_t lmt_addr, uint64_t *cmd,
>  }
>
>  static __rte_always_inline void
> -cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
> +cn10k_nix_xmit_prepare_tstamp(struct cn10k_eth_txq *txq, uintptr_t lmt_addr,
>                               const uint64_t ol_flags, const uint16_t no_segdw,
>                               const uint16_t flags)
>  {
>         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
> -               const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
> -               struct nix_send_ext_s *send_hdr_ext =
> -                       (struct nix_send_ext_s *)lmt_addr + 16;
> +               const uint8_t is_ol_tstamp =
> +                       !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
>                 uint64_t *lmt = (uint64_t *)lmt_addr;
>                 uint16_t off = (no_segdw - 1) << 1;
>                 struct nix_send_mem_s *send_mem;
>
>                 send_mem = (struct nix_send_mem_s *)(lmt + off);
> -               send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
> -               send_hdr_ext->w0.tstmp = 1;
> -               if (flags & NIX_TX_MULTI_SEG_F) {
> -                       /* Retrieving the default desc values */
> -                       lmt[off] = cmd[2];
> -
> -                       /* Using compiler barrier to avoid violation of C
> -                        * aliasing rules.
> -                        */
> -                       rte_compiler_barrier();
> -               }
> -
> -               /* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
> +               /* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
>                  * should not be recorded, hence changing the alg type to
> -                * NIX_SENDMEMALG_SET and also changing send mem addr field to
> +                * NIX_SENDMEMALG_SUB and also changing send mem addr field to
>                  * next 8 bytes as it corrupts the actual Tx tstamp registered
>                  * address.
>                  */
>                 send_mem->w0.subdc = NIX_SUBDC_MEM;
> -               send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
> +               send_mem->w0.alg =
> +                       NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);
>                 send_mem->addr =
> -                       (rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
> +                       (rte_iova_t)(((uint64_t *)txq->ts_mem) + is_ol_tstamp);
>         }
>  }
>
> @@ -841,8 +832,8 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>  }
>
>  static __rte_always_inline uint16_t
> -cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
> -                   uint64_t *cmd, uintptr_t base, const uint16_t flags)
> +cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
> +                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
>  {
>         struct cn10k_eth_txq *txq = tx_queue;
>         const rte_iova_t io_addr = txq->io_addr;
> @@ -863,9 +854,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>                 /* Reduce the cached count */
>                 txq->fc_cache_pkts -= pkts;
>         }
> -
>         /* Get cmd skeleton */
> -       cn10k_nix_tx_skeleton(txq, cmd, flags);
> +       cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));
>
>         if (flags & NIX_TX_OFFLOAD_TSO_F)
>                 lso_tun_fmt = txq->lso_tun_fmt;
> @@ -909,14 +899,14 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>
>                 /* Move NIX desc to LMT/NIXTX area */
>                 cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
> -               cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
> -                                             tx_pkts[i]->ol_flags, 4, flags);
> +               cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
> +                                             4, flags);
>                 if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec)
>                         lnum++;
>         }
>
>         if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(base);
> +               roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>         tx_pkts += burst;
> @@ -967,9 +957,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>  }
>
>  static __rte_always_inline uint16_t
> -cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
> -                        uint16_t pkts, uint64_t *cmd, uintptr_t base,
> -                        const uint16_t flags)
> +cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
> +                        struct rte_mbuf **tx_pkts, uint16_t pkts,
> +                        uint64_t *cmd, const uint16_t flags)
>  {
>         struct cn10k_eth_txq *txq = tx_queue;
>         uintptr_t pa0, pa1, lbase = txq->lmt_base;
> @@ -987,12 +977,13 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>         uintptr_t laddr;
>         bool sec;
>
> -       NIX_XMIT_FC_OR_RETURN(txq, pkts);
> -
> -       cn10k_nix_tx_skeleton(txq, cmd, flags);
> -
> -       /* Reduce the cached count */
> -       txq->fc_cache_pkts -= pkts;
> +       if (!(flags & NIX_TX_VWQE_F)) {
> +               NIX_XMIT_FC_OR_RETURN(txq, pkts);
> +               /* Reduce the cached count */
> +               txq->fc_cache_pkts -= pkts;
> +       }
> +       /* Get cmd skeleton */
> +       cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));
>
>         if (flags & NIX_TX_OFFLOAD_TSO_F)
>                 lso_tun_fmt = txq->lso_tun_fmt;
> @@ -1038,13 +1029,11 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>
>                 /* Move NIX desc to LMT/NIXTX area */
>                 cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
> -
>                 /* Store sg list directly on lmt line */
>                 segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)laddr,
>                                                flags);
> -               cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
> -                                             tx_pkts[i]->ol_flags, segdw,
> -                                             flags);
> +               cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
> +                                             segdw, flags);
>                 if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec) {
>                         lnum++;
>                         data128 |= (((__uint128_t)(segdw - 1)) << shft);
> @@ -1053,7 +1042,7 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>         }
>
>         if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(base);
> +               roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>         tx_pkts += burst;
> @@ -1474,9 +1463,9 @@ cn10k_nix_xmit_store(struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
>  }
>
>  static __rte_always_inline uint16_t
> -cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> -                          uint16_t pkts, uint64_t *cmd, uintptr_t base,
> -                          const uint16_t flags)
> +cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
> +                          struct rte_mbuf **tx_pkts, uint16_t pkts,
> +                          uint64_t *cmd, const uint16_t flags)
>  {
>         uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
>         uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
> @@ -1526,25 +1515,42 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
>         }
>
> -       senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
> +       if (!(flags & NIX_TX_VWQE_F)) {
> +               senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
> +       } else {
> +               uint64_t w0 =
> +                       (txq->send_hdr_w0 & 0xFFFFF00000000000) |
> +                       ((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
> +
> +               senddesc01_w0 = vdupq_n_u64(w0);
> +       }
>         senddesc23_w0 = senddesc01_w0;
> +
>         senddesc01_w1 = vdupq_n_u64(0);
>         senddesc23_w1 = senddesc01_w1;
> -       sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
> +       sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
>         sgdesc23_w0 = sgdesc01_w0;
>
> -       /* Load command defaults into vector variables. */
>         if (flags & NIX_TX_NEED_EXT_HDR) {
> -               sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
> -               sendext23_w0 = sendext01_w0;
> -               sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
> -               sendext23_w1 = sendext01_w1;
>                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
> -                       sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
> +                       sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
> +                                                  BIT_ULL(15));
> +                       sendmem01_w0 =
> +                               vdupq_n_u64((NIX_SUBDC_MEM << 60) |
> +                                           (NIX_SENDMEMALG_SETTSTMP << 56));
>                         sendmem23_w0 = sendmem01_w0;
> -                       sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
> +                       sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
>                         sendmem23_w1 = sendmem01_w1;
> +               } else {
> +                       sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
>                 }
> +               sendext23_w0 = sendext01_w0;
> +
> +               if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
> +                       sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
> +               else
> +                       sendext01_w1 = vdupq_n_u64(0);
> +               sendext23_w1 = sendext01_w1;
>         }
>
>         /* Get LMT base address and LMT ID as lcore id */
> @@ -2577,7 +2583,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>                 wd.data[0] >>= 16;
>
>         if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(base);
> +               roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>
> @@ -2640,12 +2646,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>
>         if (unlikely(scalar)) {
>                 if (flags & NIX_TX_MULTI_SEG_F)
> -                       pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
> -                                                        scalar, cmd, base,
> -                                                        flags);
> +                       pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, ws, tx_pkts,
> +                                                        scalar, cmd, flags);
>                 else
> -                       pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
> -                                                   cmd, base, flags);
> +                       pkts += cn10k_nix_xmit_pkts(tx_queue, ws, tx_pkts,
> +                                                   scalar, cmd, flags);
>         }
>
>         return pkts;
> @@ -2653,16 +2658,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>
>  #else
>  static __rte_always_inline uint16_t
> -cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> -                          uint16_t pkts, uint64_t *cmd, uintptr_t base,
> -                          const uint16_t flags)
> +cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
> +                          struct rte_mbuf **tx_pkts, uint16_t pkts,
> +                          uint64_t *cmd, const uint16_t flags)
>  {
> +       RTE_SET_USED(ws);
>         RTE_SET_USED(tx_queue);
>         RTE_SET_USED(tx_pkts);
>         RTE_SET_USED(pkts);
>         RTE_SET_USED(cmd);
>         RTE_SET_USED(flags);
> -       RTE_SET_USED(base);
>         return 0;
>  }
>  #endif
> @@ -2892,7 +2897,7 @@ NIX_TX_FASTPATH_MODES
>                 if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
>                     !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
>                         return 0;                                              \
> -               return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, 0,    \
> +               return cn10k_nix_xmit_pkts(tx_queue, NULL, tx_pkts, pkts, cmd, \
>                                            flags);                             \
>         }
>
> @@ -2905,8 +2910,8 @@ NIX_TX_FASTPATH_MODES
>                 if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
>                     !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
>                         return 0;                                              \
> -               return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,  \
> -                                               0,                             \
> +               return cn10k_nix_xmit_pkts_mseg(tx_queue, NULL, tx_pkts, pkts, \
> +                                               cmd,                           \
>                                                 flags | NIX_TX_MULTI_SEG_F);   \
>         }
>
> @@ -2919,8 +2924,8 @@ NIX_TX_FASTPATH_MODES
>                 if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
>                     !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
>                         return 0;                                              \
> -               return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts,     \
> -                                                 cmd, 0, (flags));            \
> +               return cn10k_nix_xmit_pkts_vector(tx_queue, NULL, tx_pkts,     \
> +                                                 pkts, cmd, (flags));         \
>         }
>
>  #define NIX_TX_XMIT_VEC_MSEG(fn, sz, flags)                                    \
> @@ -2933,7 +2938,7 @@ NIX_TX_FASTPATH_MODES
>                     !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
>                         return 0;                                              \
>                 return cn10k_nix_xmit_pkts_vector(                             \
> -                       tx_queue, tx_pkts, pkts, cmd, 0,                       \
> +                       tx_queue, NULL, tx_pkts, pkts, cmd,                    \
>                         (flags) | NIX_TX_MULTI_SEG_F);                         \
>         }
>
> diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
> index d34bc6898f..01e3850561 100644
> --- a/drivers/net/cnxk/cn9k_ethdev.c
> +++ b/drivers/net/cnxk/cn9k_ethdev.c
> @@ -131,51 +131,31 @@ static void
>  nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn9k_eth_txq *txq,
>                       uint16_t qid)
>  {
> -       struct nix_send_ext_s *send_hdr_ext;
> -       struct nix_send_hdr_s *send_hdr;
> -       struct nix_send_mem_s *send_mem;
> -       union nix_send_sg_s *sg;
> +       union nix_send_hdr_w0_u send_hdr_w0;
>
>         /* Initialize the fields based on basic single segment packet */
> -       memset(&txq->cmd, 0, sizeof(txq->cmd));
> -
> +       send_hdr_w0.u = 0;
>         if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
> -               send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
>                 /* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
> -               send_hdr->w0.sizem1 = 2;
> -
> -               send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[2];
> -               send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
> +               send_hdr_w0.sizem1 = 2;
>                 if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
>                         /* Default: one seg packet would have:
>                          * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
>                          * => 8/2 - 1 = 3
>                          */
> -                       send_hdr->w0.sizem1 = 3;
> -                       send_hdr_ext->w0.tstmp = 1;
> +                       send_hdr_w0.sizem1 = 3;
>
>                         /* To calculate the offset for send_mem,
>                          * send_hdr->w0.sizem1 * 2
>                          */
> -                       send_mem = (struct nix_send_mem_s *)
> -                               (txq->cmd + (send_hdr->w0.sizem1 << 1));
> -                       send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
> -                       send_mem->w0.cn9k.alg = NIX_SENDMEMALG_SETTSTMP;
> -                       send_mem->addr = dev->tstamp.tx_tstamp_iova;
> +                       txq->ts_mem = dev->tstamp.tx_tstamp_iova;
>                 }
> -               sg = (union nix_send_sg_s *)&txq->cmd[4];
>         } else {
> -               send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
>                 /* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
> -               send_hdr->w0.sizem1 = 1;
> -               sg = (union nix_send_sg_s *)&txq->cmd[2];
> +               send_hdr_w0.sizem1 = 1;
>         }
> -
> -       send_hdr->w0.sq = qid;
> -       sg->subdc = NIX_SUBDC_SG;
> -       sg->segs = 1;
> -       sg->ld_type = NIX_SENDLDTYPE_LDD;
> -
> +       send_hdr_w0.sq = qid;
> +       txq->send_hdr_w0 = send_hdr_w0.u;
>         rte_wmb();
>  }
>
> diff --git a/drivers/net/cnxk/cn9k_ethdev.h b/drivers/net/cnxk/cn9k_ethdev.h
> index 2b452fe009..8ab924944c 100644
> --- a/drivers/net/cnxk/cn9k_ethdev.h
> +++ b/drivers/net/cnxk/cn9k_ethdev.h
> @@ -9,12 +9,13 @@
>  #include <cnxk_security_ar.h>
>
>  struct cn9k_eth_txq {
> -       uint64_t cmd[8];
> +       uint64_t send_hdr_w0;
>         int64_t fc_cache_pkts;
>         uint64_t *fc_mem;
>         void *lmt_addr;
>         rte_iova_t io_addr;
>         uint64_t lso_tun_fmt;
> +       uint64_t ts_mem;
>         uint16_t sqes_per_sqb_log2;
>         int16_t nb_sqb_bufs_adj;
>         rte_iova_t cpt_io_addr;
> diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
> index 8564dd85ee..d23e4b61b4 100644
> --- a/drivers/net/cnxk/cn9k_tx.h
> +++ b/drivers/net/cnxk/cn9k_tx.h
> @@ -58,6 +58,29 @@ cn9k_nix_tx_ext_subs(const uint16_t flags)
>                                   : 0);
>  }
>
> +static __rte_always_inline void
> +cn9k_nix_tx_skeleton(struct cn9k_eth_txq *txq, uint64_t *cmd,
> +                    const uint16_t flags, const uint16_t static_sz)
> +{
> +       if (static_sz)
> +               cmd[0] = txq->send_hdr_w0;
> +       else
> +               cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
> +                        ((uint64_t)(cn9k_nix_tx_ext_subs(flags) + 1) << 40);
> +       cmd[1] = 0;
> +
> +       if (flags & NIX_TX_NEED_EXT_HDR) {
> +               if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
> +                       cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
> +               else
> +                       cmd[2] = NIX_SUBDC_EXT << 60;
> +               cmd[3] = 0;
> +               cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
> +       } else {
> +               cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
> +       }
> +}
> +
>  static __rte_always_inline void
>  cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
>  {
> @@ -136,11 +159,11 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
>                 w1.u = 0;
>         }
>
> -       if (!(flags & NIX_TX_MULTI_SEG_F)) {
> +       if (!(flags & NIX_TX_MULTI_SEG_F))
>                 send_hdr->w0.total = m->data_len;
> -               send_hdr->w0.aura =
> -                       roc_npa_aura_handle_to_aura(m->pool->pool_id);
> -       }
> +       else
> +               send_hdr->w0.total = m->pkt_len;
> +       send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
>
>         /*
>          * L3type:  2 => IPV4
> @@ -287,41 +310,39 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
>                 /* Mark mempool object as "put" since it is freed by NIX */
>                 if (!send_hdr->w0.df)
>                         RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
> +       } else {
> +               sg->seg1_size = m->data_len;
> +               *(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
> +
> +               /* NOFF is handled later for multi-seg */
>         }
>  }
>
>  static __rte_always_inline void
> -cn9k_nix_xmit_prepare_tstamp(uint64_t *cmd, const uint64_t *send_mem_desc,
> +cn9k_nix_xmit_prepare_tstamp(struct cn9k_eth_txq *txq, uint64_t *cmd,
>                              const uint64_t ol_flags, const uint16_t no_segdw,
>                              const uint16_t flags)
>  {
>         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
>                 struct nix_send_mem_s *send_mem;
>                 uint16_t off = (no_segdw - 1) << 1;
> -               const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
> +               const uint8_t is_ol_tstamp =
> +                       !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
>
>                 send_mem = (struct nix_send_mem_s *)(cmd + off);
> -               if (flags & NIX_TX_MULTI_SEG_F) {
> -                       /* Retrieving the default desc values */
> -                       cmd[off] = send_mem_desc[6];
>
> -                       /* Using compiler barrier to avoid violation of C
> -                        * aliasing rules.
> -                        */
> -                       rte_compiler_barrier();
> -               }
> -
> -               /* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
> +               /* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
>                  * should not be recorded, hence changing the alg type to
> -                * NIX_SENDMEMALG_SET and also changing send mem addr field to
> +                * NIX_SENDMEMALG_SUB and also changing send mem addr field to
>                  * next 8 bytes as it corrupts the actual Tx tstamp registered
>                  * address.
>                  */
> +               send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
>                 send_mem->w0.cn9k.alg =
> -                       NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
> +                       NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);
>
> -               send_mem->addr = (rte_iova_t)((uint64_t *)send_mem_desc[7] +
> -                                             (is_ol_tstamp));
> +               send_mem->addr = (rte_iova_t)(((uint64_t *)txq->ts_mem) +
> +                               (is_ol_tstamp));
>         }
>  }
>
> @@ -367,8 +388,6 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>         uint8_t off, i;
>
>         send_hdr = (struct nix_send_hdr_s *)cmd;
> -       send_hdr->w0.total = m->pkt_len;
> -       send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
>
>         if (flags & NIX_TX_NEED_EXT_HDR)
>                 off = 2;
> @@ -376,13 +395,29 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>                 off = 0;
>
>         sg = (union nix_send_sg_s *)&cmd[2 + off];
> -       /* Clear sg->u header before use */
> -       sg->u &= 0xFC00000000000000;
> +
> +       /* Start from second segment, first segment is already there */
> +       i = 1;
>         sg_u = sg->u;
> -       slist = &cmd[3 + off];
> +       nb_segs = m->nb_segs - 1;
> +       m_next = m->next;
> +       slist = &cmd[3 + off + 1];
>
> -       i = 0;
> -       nb_segs = m->nb_segs;
> +       /* Set invert df if buffer is not to be freed by H/W */
> +       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
> +               sg_u |= (cnxk_nix_prefree_seg(m) << 55);
> +               rte_io_wmb();
> +       }
> +
> +       /* Mark mempool object as "put" since it is freed by NIX */
> +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +       if (!(sg_u & (1ULL << 55)))
> +               RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
> +       rte_io_wmb();
> +#endif
> +       m = m_next;
> +       if (!m)
> +               goto done;
>
>         /* Fill mbuf segments */
>         do {
> @@ -417,6 +452,7 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>                 m = m_next;
>         } while (nb_segs);
>
> +done:
>         sg->u = sg_u;
>         sg->segs = i;
>         segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
> @@ -472,7 +508,7 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>
>         NIX_XMIT_FC_OR_RETURN(txq, pkts);
>
> -       roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
> +       cn9k_nix_tx_skeleton(txq, cmd, flags, 1);
>
>         /* Perform header writes before barrier for TSO */
>         if (flags & NIX_TX_OFFLOAD_TSO_F) {
> @@ -490,8 +526,8 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>
>         for (i = 0; i < pkts; i++) {
>                 cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
> -               cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
> -                                            tx_pkts[i]->ol_flags, 4, flags);
> +               cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags, 4,
> +                                            flags);
>                 cn9k_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
>         }
>
> @@ -514,7 +550,7 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>
>         NIX_XMIT_FC_OR_RETURN(txq, pkts);
>
> -       roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
> +       cn9k_nix_tx_skeleton(txq, cmd, flags, 1);
>
>         /* Perform header writes before barrier for TSO */
>         if (flags & NIX_TX_OFFLOAD_TSO_F) {
> @@ -533,9 +569,8 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>         for (i = 0; i < pkts; i++) {
>                 cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
>                 segdw = cn9k_nix_prepare_mseg(tx_pkts[i], cmd, flags);
> -               cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
> -                                            tx_pkts[i]->ol_flags, segdw,
> -                                            flags);
> +               cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags,
> +                                            segdw, flags);
>                 cn9k_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
>         }
>
> @@ -862,28 +897,34 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>         if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
>                 rte_io_wmb();
>
> -       senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
> +       senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
>         senddesc23_w0 = senddesc01_w0;
> +
>         senddesc01_w1 = vdupq_n_u64(0);
>         senddesc23_w1 = senddesc01_w1;
> +       sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
> +       sgdesc23_w0 = sgdesc01_w0;
>
> -       /* Load command defaults into vector variables. */
>         if (flags & NIX_TX_NEED_EXT_HDR) {
> -               sendext01_w0 = vld1q_dup_u64(&txq->cmd[2]);
> -               sendext23_w0 = sendext01_w0;
> -               sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
> -               sendext23_w1 = sendext01_w1;
> -               sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
> -               sgdesc23_w0 = sgdesc01_w0;
>                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
> -                       sendmem01_w0 = vld1q_dup_u64(&txq->cmd[6]);
> +                       sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
> +                                                  BIT_ULL(15));
> +                       sendmem01_w0 =
> +                               vdupq_n_u64((NIX_SUBDC_MEM << 60) |
> +                                           (NIX_SENDMEMALG_SETTSTMP << 56));
>                         sendmem23_w0 = sendmem01_w0;
> -                       sendmem01_w1 = vld1q_dup_u64(&txq->cmd[7]);
> +                       sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
>                         sendmem23_w1 = sendmem01_w1;
> +               } else {
> +                       sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
>                 }
> -       } else {
> -               sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
> -               sgdesc23_w0 = sgdesc01_w0;
> +               sendext23_w0 = sendext01_w0;
> +
> +               if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
> +                       sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
> +               else
> +                       sendext01_w1 = vdupq_n_u64(0);
> +               sendext23_w1 = sendext01_w1;
>         }
>
>         for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
> --
> 2.17.1
>
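The reworked cnxk_sso_updt_tx_queue_data() above packs the per-port Tx queue
pointers into one variable-length table instead of a fixed
port-by-RTE_MAX_QUEUES_PER_PORT grid: entry [port] carries that port's row
offset in its upper 16 bits, and the queue pointers themselves live in the low
48 bits of the row entries (which is why the function rejects a txq pointer
that uses the top 16 bits). A minimal sketch of the lookup such a layout
implies is below; the helper name and signature are illustrative assumptions,
not the driver's real interface.

#include <stdint.h>

/* Hypothetical fastpath lookup over the compact adapter table built by
 * cnxk_sso_updt_tx_queue_data(): entry [port] keeps that port's row offset
 * in bits 48..63, each row entry keeps the Tx queue pointer in its low
 * 48 bits.
 */
static inline void *
sso_txq_lookup(const uint64_t *txq_data, uint16_t port, uint16_t queue)
{
	uint64_t row = txq_data[port] >> 48;    /* per-port row offset   */
	uint64_t entry = txq_data[row + queue]; /* slot for (port,queue) */

	/* Pointers are required to fit in 48 bits (see the -EINVAL check
	 * above), so the upper 16 bits can be masked off safely.
	 */
	return (void *)(uintptr_t)(entry & ((1ULL << 48) - 1));
}

Packing the offset into otherwise-unused pointer bits keeps the hot lookup to
two dependent loads and avoids sizing the table for RTE_MAX_QUEUES_PER_PORT
on every port.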


* Re: [PATCH v4 1/3] event/cnxk: store and reuse workslot status
  2022-02-10 13:20   ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status pbhagavatula
  2022-02-10 13:20     ` [PATCH v4 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
  2022-02-10 13:20     ` [PATCH v4 3/3] event/cnxk: improve Rx performance pbhagavatula
@ 2022-02-14  9:29     ` Jerin Jacob
  2 siblings, 0 replies; 16+ messages in thread
From: Jerin Jacob @ 2022-02-14  9:29 UTC (permalink / raw)
  To: Pavan Nikhilesh
  Cc: Jerin Jacob, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Shijith Thotton, dpdk-dev

On Thu, Feb 10, 2022 at 6:51 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Store and reuse the workslot status word for TT, GRP and HEAD
> checks instead of reading it from the GWC each time, as reading
> from the GWC imposes additional latency.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Series Acked-by: Jerin Jacob <jerinj@marvell.com>
Series Applied to dpdk-next-net-eventdev/for-main. Thanks


> ---
>  Depends-on: 21590
>
>  v4 Changes:
>  - Update commit title for 3/3
>
>  v3 Changes:
>  - Split and rebase patches.
>
>  v2 Changes:
>  - Rebase.
>  - Fix incorrect use of RoC API
>
>  drivers/common/cnxk/roc_sso.h      | 14 ++++++++------
>  drivers/event/cnxk/cn10k_worker.h  | 16 +++++++++-------
>  drivers/event/cnxk/cn9k_worker.h   |  6 +++---
>  drivers/event/cnxk/cnxk_eventdev.h |  2 ++
>  drivers/event/cnxk/cnxk_worker.h   | 11 +++++++----
>  drivers/net/cnxk/cn10k_tx.h        | 12 ++++++------
>  6 files changed, 35 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
> index 27d49c6c68..ab7cee1c60 100644
> --- a/drivers/common/cnxk/roc_sso.h
> +++ b/drivers/common/cnxk/roc_sso.h
> @@ -54,12 +54,13 @@ struct roc_sso {
>         uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned;
>  } __plt_cache_aligned;
>
> -static __plt_always_inline void
> -roc_sso_hws_head_wait(uintptr_t tag_op)
> +static __plt_always_inline uint64_t
> +roc_sso_hws_head_wait(uintptr_t base)
>  {
> -#ifdef RTE_ARCH_ARM64
> +       uintptr_t tag_op = base + SSOW_LF_GWS_TAG;
>         uint64_t tag;
>
> +#if defined(__aarch64__)
>         asm volatile(PLT_CPU_FEATURE_PREAMBLE
>                      "          ldr %[tag], [%[tag_op]] \n"
>                      "          tbnz %[tag], 35, done%=         \n"
> @@ -71,10 +72,11 @@ roc_sso_hws_head_wait(uintptr_t tag_op)
>                      : [tag] "=&r"(tag)
>                      : [tag_op] "r"(tag_op));
>  #else
> -       /* Wait for the SWTAG/SWTAG_FULL operation */
> -       while (!(plt_read64(tag_op) & BIT_ULL(35)))
> -               ;
> +       do {
> +               tag = plt_read64(tag_op);
> +       } while (!(tag & BIT_ULL(35)));
>  #endif
> +       return tag;
>  }
>
>  /* SSO device initialization */
> diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
> index ff08b2d974..ada230ea1d 100644
> --- a/drivers/event/cnxk/cn10k_worker.h
> +++ b/drivers/event/cnxk/cn10k_worker.h
> @@ -40,8 +40,7 @@ cn10k_sso_hws_fwd_swtag(struct cn10k_sso_hws *ws, const struct rte_event *ev)
>  {
>         const uint32_t tag = (uint32_t)ev->event;
>         const uint8_t new_tt = ev->sched_type;
> -       const uint8_t cur_tt =
> -               CNXK_TT_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0));
> +       const uint8_t cur_tt = CNXK_TT_FROM_TAG(ws->gw_rdata);
>
>         /* CNXK model
>          * cur_tt/new_tt     SSO_TT_ORDERED SSO_TT_ATOMIC SSO_TT_UNTAGGED
> @@ -81,7 +80,7 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
>         const uint8_t grp = ev->queue_id;
>
>         /* Group hasn't changed, Use SWTAG to forward the event */
> -       if (CNXK_GRP_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0)) == grp)
> +       if (CNXK_GRP_FROM_TAG(ws->gw_rdata) == grp)
>                 cn10k_sso_hws_fwd_swtag(ws, ev);
>         else
>                 /*
> @@ -211,6 +210,7 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
>         } while (gw.u64[0] & BIT_ULL(63));
>         mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
>  #endif
> +       ws->gw_rdata = gw.u64[0];
>         gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
>                     (gw.u64[0] & (0x3FFull << 36)) << 4 |
>                     (gw.u64[0] & 0xffffffff);
> @@ -405,7 +405,8 @@ NIX_RX_FASTPATH_MODES
>                 RTE_SET_USED(timeout_ticks);                                   \
>                 if (ws->swtag_req) {                                           \
>                         ws->swtag_req = 0;                                     \
> -                       cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
> +                       ws->gw_rdata = cnxk_sso_hws_swtag_wait(                \
> +                               ws->base + SSOW_LF_GWS_WQE0);                  \
>                         return 1;                                              \
>                 }                                                              \
>                 return cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);  \
> @@ -424,7 +425,8 @@ NIX_RX_FASTPATH_MODES
>                 uint64_t iter;                                                 \
>                 if (ws->swtag_req) {                                           \
>                         ws->swtag_req = 0;                                     \
> -                       cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
> +                       ws->gw_rdata = cnxk_sso_hws_swtag_wait(                \
> +                               ws->base + SSOW_LF_GWS_WQE0);                  \
>                         return ret;                                            \
>                 }                                                              \
>                 ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
> @@ -507,8 +509,8 @@ cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
>         else
>                 pa = txq->io_addr | ((segdw - 1) << 4);
>
> -       if (!sched_type)
> -               roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);
> +       if (!CNXK_TAG_IS_HEAD(ws->gw_rdata) && !sched_type)
> +               ws->gw_rdata = roc_sso_hws_head_wait(ws->base);
>
>         roc_lmt_submit_steorl(lmt_id, pa);
>  }
> diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
> index 303b04c215..8455272005 100644
> --- a/drivers/event/cnxk/cn9k_worker.h
> +++ b/drivers/event/cnxk/cn9k_worker.h
> @@ -700,7 +700,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
>
>         /* Head wait if needed */
>         if (base)
> -               roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> +               roc_sso_hws_head_wait(base);
>
>         /* ESN */
>         outb_priv = roc_nix_inl_onf_ipsec_outb_sa_sw_rsvd((void *)sa);
> @@ -793,7 +793,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>                                              flags);
>                 if (!CNXK_TT_FROM_EVENT(ev->event)) {
>                         cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
> -                       roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> +                       roc_sso_hws_head_wait(base);
>                         cn9k_sso_txq_fc_wait(txq);
>                         if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
>                                 cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
> @@ -806,7 +806,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>                 cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
>                 if (!CNXK_TT_FROM_EVENT(ev->event)) {
>                         cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
> -                       roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> +                       roc_sso_hws_head_wait(base);
>                         cn9k_sso_txq_fc_wait(txq);
>                         if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
>                                 cn9k_nix_xmit_one(cmd, txq->lmt_addr,
> diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
> index b26df58588..ab58508590 100644
> --- a/drivers/event/cnxk/cnxk_eventdev.h
> +++ b/drivers/event/cnxk/cnxk_eventdev.h
> @@ -47,6 +47,7 @@
>  #define CNXK_CLR_SUB_EVENT(x)      (~(0xffu << 20) & x)
>  #define CNXK_GRP_FROM_TAG(x)       (((x) >> 36) & 0x3ff)
>  #define CNXK_SWTAG_PEND(x)         (BIT_ULL(62) & x)
> +#define CNXK_TAG_IS_HEAD(x)        (BIT_ULL(35) & x)
>
>  #define CN9K_SSOW_GET_BASE_ADDR(_GW) ((_GW)-SSOW_LF_GWS_OP_GET_WORK0)
>
> @@ -123,6 +124,7 @@ struct cnxk_sso_evdev {
>
>  struct cn10k_sso_hws {
>         uint64_t base;
> +       uint64_t gw_rdata;
>         /* PTP timestamp */
>         struct cnxk_timesync_info *tstamp;
>         void *lookup_mem;
> diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
> index 9f9ceab8a1..7de03f3fbb 100644
> --- a/drivers/event/cnxk/cnxk_worker.h
> +++ b/drivers/event/cnxk/cnxk_worker.h
> @@ -52,11 +52,11 @@ cnxk_sso_hws_swtag_flush(uint64_t tag_op, uint64_t flush_op)
>         plt_write64(0, flush_op);
>  }
>
> -static __rte_always_inline void
> +static __rte_always_inline uint64_t
>  cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
>  {
> -#ifdef RTE_ARCH_ARM64
>         uint64_t swtp;
> +#ifdef RTE_ARCH_ARM64
>
>         asm volatile(PLT_CPU_FEATURE_PREAMBLE
>                      "          ldr %[swtb], [%[swtp_loc]]      \n"
> @@ -70,9 +70,12 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
>                      : [swtp_loc] "r"(tag_op));
>  #else
>         /* Wait for the SWTAG/SWTAG_FULL operation */
> -       while (plt_read64(tag_op) & BIT_ULL(62))
> -               ;
> +       do {
> +               swtp = plt_read64(tag_op);
> +       } while (swtp & BIT_ULL(62));
>  #endif
> +
> +       return swtp;
>  }
>
>  #endif
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index 4ae6bbf517..ec6366168c 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -905,8 +905,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
>                         lnum++;
>         }
>
> -       if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(ws[0]);
> +       if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
> +               ws[1] = roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>         tx_pkts += burst;
> @@ -1041,8 +1041,8 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
>                 }
>         }
>
> -       if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(ws[0]);
> +       if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
> +               ws[1] = roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>         tx_pkts += burst;
> @@ -2582,8 +2582,8 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
>         if (flags & (NIX_TX_MULTI_SEG_F | NIX_TX_OFFLOAD_SECURITY_F))
>                 wd.data[0] >>= 16;
>
> -       if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(ws[0]);
> +       if ((flags & NIX_TX_VWQE_F) && !(ws[1] & BIT_ULL(35)))
> +               ws[1] = roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>
> --
> 2.17.1
>
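The change acked above boils down to keeping the last GWS status word around
(gw_rdata on the workslot, ws[1] in the Tx path) and only issuing the
head-wait poll when the cached word does not already have bit 35 (HEAD) set.
A rough, self-contained sketch of that caching pattern is below; the struct,
the register pointer and read_gws_status() are assumptions made for
illustration, not the driver's real types.

#include <stdint.h>

#define STATUS_HEAD_BIT (1ULL << 35)	/* HEAD flag lives in bit 35 */

struct worker {
	volatile uint64_t *gws_status;	/* assumed memory-mapped status register */
	uint64_t cached_status;		/* last value read from that register */
};

static inline uint64_t
read_gws_status(struct worker *w)
{
	/* Stand-in for the plt_read64()/LDR on the workslot TAG address. */
	return *w->gws_status;
}

static inline void
head_wait_cached(struct worker *w)
{
	uint64_t st = w->cached_status;

	/* Cached copy already shows HEAD: skip the hardware access entirely. */
	if (st & STATUS_HEAD_BIT)
		return;

	/* Otherwise poll until HEAD is set and refresh the cache so the
	 * next burst can reuse it.
	 */
	do {
		st = read_gws_status(w);
	} while (!(st & STATUS_HEAD_BIT));

	w->cached_status = st;
}

The pay-off is that back-to-back transmits from the same workslot usually hit
the cached copy and never touch the GWS register.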


end of thread, other threads:[~2022-02-14  9:29 UTC | newest]

Thread overview: 16+ messages
2022-01-19  7:13 [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
2022-01-19  7:13 ` [PATCH v2 2/4] event/cnxk: store and reuse workslot status pbhagavatula
2022-01-19  7:13 ` [PATCH v2 3/4] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-01-19  7:13 ` [PATCH v2 4/4] net/cnxk: improve Rx performance pbhagavatula
2022-02-07 14:03 ` [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue Jerin Jacob
2022-02-10 10:13 ` [PATCH v3] " pbhagavatula
2022-02-10 10:19   ` Jerin Jacob
2022-02-10 13:15   ` [PATCH v4] " pbhagavatula
2022-02-11 10:27     ` Jerin Jacob
2022-02-10 10:19 ` [PATCH v3 1/3] event/cnxk: store and reuse workslot status pbhagavatula
2022-02-10 10:19   ` [PATCH v3 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-02-10 10:19   ` [PATCH v3 3/3] net/cnxk: improve Rx performance pbhagavatula
2022-02-10 13:20   ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status pbhagavatula
2022-02-10 13:20     ` [PATCH v4 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-02-10 13:20     ` [PATCH v4 3/3] event/cnxk: improve Rx performance pbhagavatula
2022-02-14  9:29     ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status Jerin Jacob
