DPDK patches and discussions
From: Jerin Jacob <jerinjacobk@gmail.com>
To: Pavan Nikhilesh <pbhagavatula@marvell.com>,
	Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Jerin Jacob <jerinj@marvell.com>,
	Nithin Dabilpuram <ndabilpuram@marvell.com>,
	 Kiran Kumar K <kirankumark@marvell.com>,
	Sunil Kumar Kori <skori@marvell.com>,
	 Satha Rao <skoteshwar@marvell.com>,
	Ankur Dwivedi <adwivedi@marvell.com>,
	 Anoob Joseph <anoobj@marvell.com>,
	Tejasree Kondoj <ktejasree@marvell.com>,
	 Shijith Thotton <sthotton@marvell.com>, dpdk-dev <dev@dpdk.org>
Subject: Re: [PATCH v4] net/cnxk: avoid command copy from Tx queue
Date: Fri, 11 Feb 2022 15:57:11 +0530	[thread overview]
Message-ID: <CALBAE1OQd2L4WpOZXCCnbPtX_i8apcTwEfLH=BatnYJtGvsNVw@mail.gmail.com> (raw)
In-Reply-To: <20220210131526.1878-1-pbhagavatula@marvell.com>

On Thu, Feb 10, 2022 at 6:46 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> The Tx command is prepared based on the enabled offloads and stored
> in the Tx queue structure at tx_queue_setup time.
> In the fastpath, the command is copied from the Tx queue to the LMT
> line for every packet.
> Since the command contents are mostly constant, we can move the
> command preparation to the fastpath and avoid accessing Tx queue
> memory.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Acked-by: Jerin Jacob <jerinj@marvell.com>
Applied to dpdk-next-net-mrvl/for-next-net. Thanks
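
For readers skimming the archive: the gist of the change is that the
mostly-constant NIX send descriptor words are now rebuilt inline from
the compile-time offload flags instead of being loaded from the
txq->cmd template. A minimal standalone sketch of that idea (the
opcodes, flag bits and field positions below are illustrative
stand-ins, not the driver's definitions):

#include <stdint.h>

#define SUBDC_EXT 0x6ULL                /* assumed opcode values */
#define SUBDC_SG  0x4ULL
#define FLAG_EXT_HDR (1u << 0)          /* assumed offload flags */
#define FLAG_TSTAMP  (1u << 1)

static inline void
tx_skeleton_sketch(uint64_t send_hdr_w0, uint64_t *cmd, const uint16_t flags)
{
        cmd[0] = send_hdr_w0;   /* the only word still read from the txq */
        cmd[1] = 0;

        if (flags & FLAG_EXT_HDR) {
                /* EXT sub-descriptor rebuilt from constants per burst */
                cmd[2] = (SUBDC_EXT << 60) |
                         ((flags & FLAG_TSTAMP) ? (1ULL << 15) : 0);
                cmd[3] = 0;
                cmd[4] = (SUBDC_SG << 60) | (1ULL << 48); /* one segment */
        } else {
                cmd[2] = (SUBDC_SG << 60) | (1ULL << 48);
        }
}

Because flags is a compile-time constant in each generated fastpath
function, the branches fold away and the stores become immediates, so
the per-burst copy from Tx queue memory disappears.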


> ---
>  v4 Changes:
>  - Further refactor large functions.
>  v3 Changes:
>  - Rebase.
>  - Split patches.
>  - Refactor large functions.
>  v2 Changes:
>  - Rebase.
>  - Fix incorrect use of RoC API.
>
>  drivers/common/cnxk/roc_io.h             |  33 ++++-
>  drivers/common/cnxk/roc_io_generic.h     |  15 ++
>  drivers/crypto/cnxk/cn9k_cryptodev_ops.c |   2 +-
>  drivers/crypto/cnxk/cn9k_ipsec.c         |   2 +-
>  drivers/event/cnxk/cn10k_eventdev.c      |  26 +++-
>  drivers/event/cnxk/cn10k_worker.h        |  89 ++++++------
>  drivers/event/cnxk/cn9k_eventdev.c       |  33 +++--
>  drivers/event/cnxk/cn9k_worker.h         |  64 ++++----
>  drivers/event/cnxk/cnxk_eventdev.h       |  13 +-
>  drivers/event/cnxk/cnxk_eventdev_adptr.c | 178 +++++++++++++++++++++--
>  drivers/net/cnxk/cn10k_ethdev.c          |  24 +--
>  drivers/net/cnxk/cn10k_ethdev.h          |   3 +-
>  drivers/net/cnxk/cn10k_tx.h              | 167 ++++++++++-----------
>  drivers/net/cnxk/cn9k_ethdev.c           |  36 +----
>  drivers/net/cnxk/cn9k_ethdev.h           |   3 +-
>  drivers/net/cnxk/cn9k_tx.h               | 135 +++++++++++------
>  16 files changed, 516 insertions(+), 307 deletions(-)
>
> diff --git a/drivers/common/cnxk/roc_io.h b/drivers/common/cnxk/roc_io.h
> index 4f15503c29..62e98d9d00 100644
> --- a/drivers/common/cnxk/roc_io.h
> +++ b/drivers/common/cnxk/roc_io.h
> @@ -164,13 +164,36 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
>         dst128[1] = src128[1];
>         /* lmtext receives following value:
>          * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
> -        * 2: NIX_SUBDC_EXT + NIX_SUBDC_MEM i.e. tstamp case
>          */
> -       if (lmtext) {
> +       if (lmtext)
> +               dst128[2] = src128[2];
> +}
> +
> +static __plt_always_inline void
> +roc_lmt_mov64(void *out, const void *in)
> +{
> +       volatile const __uint128_t *src128 = (const __uint128_t *)in;
> +       volatile __uint128_t *dst128 = (__uint128_t *)out;
> +
> +       dst128[0] = src128[0];
> +       dst128[1] = src128[1];
> +       dst128[2] = src128[2];
> +       dst128[3] = src128[3];
> +}
> +
> +static __plt_always_inline void
> +roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
> +{
> +       const __uint128_t *src128 = (const __uint128_t *)in;
> +       __uint128_t *dst128 = (__uint128_t *)out;
> +
> +       dst128[0] = src128[0];
> +       dst128[1] = src128[1];
> +       /* lmtext receives following value:
> +        * 1: NIX_SUBDC_EXT needed i.e. tx vlan case
> +        */
> +       if (lmtext)
>                 dst128[2] = src128[2];
> -               if (lmtext > 1)
> -                       dst128[3] = src128[3];
> -       }
>  }
>
>  static __plt_always_inline void
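
A note on roc_lmt_mov64(): it pins the lmtext == 2 case of the old
roc_lmt_mov() (four volatile 128-bit stores, i.e. one full 64-byte CPT
instruction) so the crypto submit path carries no data-dependent
branches. A self-contained sketch, with plain memory standing in for
the device LMT line:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static inline void
lmt_mov64_sketch(void *out, const void *in)
{
        /* volatile keeps the compiler from merging or eliding the
         * stores, which matters for a real device-mapped LMT line */
        volatile const __uint128_t *src = (const __uint128_t *)in;
        volatile __uint128_t *dst = (__uint128_t *)out;

        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
}

int main(void)
{
        _Alignas(16) uint8_t inst[64], line[64] = { 0 };
        int i;

        for (i = 0; i < 64; i++)
                inst[i] = (uint8_t)i;
        lmt_mov64_sketch(line, inst);   /* whole instruction in one go */
        printf("match=%d\n", memcmp(line, inst, 64) == 0); /* match=1 */
        return 0;
}
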
> diff --git a/drivers/common/cnxk/roc_io_generic.h b/drivers/common/cnxk/roc_io_generic.h
> index 5f90835c09..42764455cc 100644
> --- a/drivers/common/cnxk/roc_io_generic.h
> +++ b/drivers/common/cnxk/roc_io_generic.h
> @@ -106,6 +106,21 @@ roc_lmt_mov(void *out, const void *in, const uint32_t lmtext)
>         memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
>  }
>
> +static __plt_always_inline void
> +roc_lmt_mov64(void *out, const void *in)
> +{
> +       PLT_SET_USED(out);
> +       PLT_SET_USED(in);
> +}
> +
> +static __plt_always_inline void
> +roc_lmt_mov_nv(void *out, const void *in, const uint32_t lmtext)
> +{
> +       PLT_SET_USED(in);
> +       PLT_SET_USED(lmtext);
> +       memset(out, 0, sizeof(__uint128_t) * (lmtext ? lmtext > 1 ? 4 : 3 : 2));
> +}
> +
>  static __plt_always_inline void
>  roc_lmt_mov_seg(void *out, const void *in, const uint16_t segdw)
>  {
> diff --git a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
> index ac1953b66d..ddba9d5dd0 100644
> --- a/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
> +++ b/drivers/crypto/cnxk/cn9k_cryptodev_ops.c
> @@ -161,7 +161,7 @@ cn9k_cpt_inst_submit(struct cpt_inst_s *inst, uint64_t lmtline,
>
>         do {
>                 /* Copy CPT command to LMTLINE */
> -               roc_lmt_mov((void *)lmtline, inst, 2);
> +               roc_lmt_mov64((void *)lmtline, inst);
>
>                 /*
>                  * Make sure compiler does not reorder memcpy and ldeor.
> diff --git a/drivers/crypto/cnxk/cn9k_ipsec.c b/drivers/crypto/cnxk/cn9k_ipsec.c
> index 9f876f75f2..672b65a5d2 100644
> --- a/drivers/crypto/cnxk/cn9k_ipsec.c
> +++ b/drivers/crypto/cnxk/cn9k_ipsec.c
> @@ -53,7 +53,7 @@ cn9k_cpt_enq_sa_write(struct cn9k_ipsec_sa *sa, struct cnxk_cpt_qp *qp,
>
>         do {
>                 /* Copy CPT command to LMTLINE */
> -               roc_lmt_mov((void *)lmtline, &inst, 2);
> +               roc_lmt_mov64((void *)lmtline, &inst);
>                 lmt_status = roc_lmt_submit_ldeor(io_addr);
>         } while (lmt_status == 0);
>
> diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
> index 7b7ce44c74..97a88feb13 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.c
> +++ b/drivers/event/cnxk/cn10k_eventdev.c
> @@ -50,7 +50,6 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
>         /* First cache line is reserved for cookie */
>         ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
>         ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
> -       ws->tx_base = ws->base;
>         ws->hws_id = port_id;
>         ws->swtag_req = 0;
>         ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
> @@ -259,15 +258,13 @@ cn10k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
>                         ws_cookie,
>                         sizeof(struct cnxk_sso_hws_cookie) +
>                                 sizeof(struct cn10k_sso_hws) +
> -                               (sizeof(uint64_t) * (dev->max_port_id + 1) *
> -                                RTE_MAX_QUEUES_PER_PORT),
> +                               dev->tx_adptr_data_sz,
>                         RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
>                 if (ws_cookie == NULL)
>                         return -ENOMEM;
>                 ws = RTE_PTR_ADD(ws_cookie, sizeof(struct cnxk_sso_hws_cookie));
>                 memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
> -                      sizeof(uint64_t) * (dev->max_port_id + 1) *
> -                              RTE_MAX_QUEUES_PER_PORT);
> +                      dev->tx_adptr_data_sz);
>                 event_dev->data->ports[i] = ws;
>         }
>
> @@ -721,16 +718,35 @@ cn10k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
>                                const struct rte_eth_dev *eth_dev,
>                                int32_t tx_queue_id)
>  {
> +       struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
> +       struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> +       uint64_t tx_offloads;
>         int rc;
>
>         RTE_SET_USED(id);
>         rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
>         if (rc < 0)
>                 return rc;
> +
> +       /* Can't enable tstamp unless all the ports have it enabled. */
> +       tx_offloads = cnxk_eth_dev->tx_offload_flags;
> +       if (dev->tx_adptr_configured) {
> +               uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
> +               uint8_t tstmp_ena =
> +                       !!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
> +
> +               if (tstmp_ena && !tstmp_req)
> +                       dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
> +               else if (!tstmp_ena && tstmp_req)
> +                       tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
> +       }
> +
> +       dev->tx_offloads |= tx_offloads;
>         rc = cn10k_sso_updt_tx_adptr_data(event_dev);
>         if (rc < 0)
>                 return rc;
>         cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
> +       dev->tx_adptr_configured = 1;
>
>         return 0;
>  }
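
The reconciliation rule added here is worth spelling out: the
adapter-wide TSTAMP flag stays set only while every queue added so far
requested it, and a late joiner cannot switch it on for everyone else.
A sketch of that merge (hypothetical helper, not the driver function;
TSTAMP_F is a stand-in bit):

#include <stdbool.h>
#include <stdint.h>

#define TSTAMP_F (1ULL << 5)    /* stand-in for NIX_TX_OFFLOAD_TSTAMP_F */

static uint64_t
merge_tx_offloads(uint64_t dev_offloads, uint64_t port_offloads,
                  bool adptr_configured)
{
        if (adptr_configured) {
                bool req = port_offloads & TSTAMP_F; /* new queue wants it */
                bool ena = dev_offloads & TSTAMP_F;  /* currently enabled  */

                if (ena && !req)        /* someone opted out: drop it      */
                        dev_offloads &= ~TSTAMP_F;
                else if (!ena && req)   /* late joiner can't enable it     */
                        port_offloads &= ~TSTAMP_F;
        }
        return dev_offloads | port_offloads;
}
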
> diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
> index 4019c13bd2..ff08b2d974 100644
> --- a/drivers/event/cnxk/cn10k_worker.h
> +++ b/drivers/event/cnxk/cn10k_worker.h
> @@ -455,18 +455,18 @@ NIX_RX_FASTPATH_MODES
>         }
>
>  static __rte_always_inline struct cn10k_eth_txq *
> -cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
> -                         const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
> +cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
>  {
> -       return (struct cn10k_eth_txq *)
> -               txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
> +       return (struct cn10k_eth_txq
> +                       *)(txq_data[(txq_data[m->port] >> 48) +
> +                                   rte_event_eth_tx_adapter_txq_get(m)] &
> +                          (BIT_ULL(48) - 1));
>  }
>
>  static __rte_always_inline void
> -cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
> -                uintptr_t lmt_addr, uint8_t sched_type, uintptr_t base,
> -                const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> -                const uint32_t flags)
> +cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
> +                uint16_t lmt_id, uintptr_t lmt_addr, uint8_t sched_type,
> +                const uint64_t *txq_data, const uint32_t flags)
>  {
>         uint8_t lnum = 0, loff = 0, shft = 0;
>         struct cn10k_eth_txq *txq;
> @@ -476,7 +476,7 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
>         bool sec;
>
>         txq = cn10k_sso_hws_xtract_meta(m, txq_data);
> -       cn10k_nix_tx_skeleton(txq, cmd, flags);
> +       cn10k_nix_tx_skeleton(txq, cmd, flags, 0);
>         /* Perform header writes before barrier
>          * for TSO
>          */
> @@ -501,23 +501,23 @@ cn10k_sso_tx_one(struct rte_mbuf *m, uint64_t *cmd, uint16_t lmt_id,
>         else
>                 segdw = cn10k_nix_tx_ext_subs(flags) + 2;
>
> +       cn10k_nix_xmit_prepare_tstamp(txq, laddr, m->ol_flags, segdw, flags);
>         if (flags & NIX_TX_OFFLOAD_SECURITY_F && sec)
>                 pa = txq->cpt_io_addr | 3 << 4;
>         else
>                 pa = txq->io_addr | ((segdw - 1) << 4);
>
>         if (!sched_type)
> -               roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> +               roc_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);
>
>         roc_lmt_submit_steorl(lmt_id, pa);
>  }
>
>  static __rte_always_inline void
> -cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
> -                       uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
> -                       uint8_t sched_type, uintptr_t base,
> -                       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> -                       const uint32_t flags)
> +cn10k_sso_vwqe_split_tx(struct cn10k_sso_hws *ws, struct rte_mbuf **mbufs,
> +                       uint16_t nb_mbufs, uint64_t *cmd, uint16_t lmt_id,
> +                       uintptr_t lmt_addr, uint8_t sched_type,
> +                       const uint64_t *txq_data, const uint32_t flags)
>  {
>         uint16_t port[4], queue[4];
>         uint16_t i, j, pkts, scalar;
> @@ -540,14 +540,16 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
>                 if (((port[0] ^ port[1]) & (port[2] ^ port[3])) ||
>                     ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) {
>                         for (j = 0; j < 4; j++)
> -                               cn10k_sso_tx_one(mbufs[i + j], cmd, lmt_id,
> -                                                lmt_addr, sched_type, base,
> -                                                txq_data, flags);
> +                               cn10k_sso_tx_one(ws, mbufs[i + j], cmd, lmt_id,
> +                                                lmt_addr, sched_type, txq_data,
> +                                                flags);
>                 } else {
> -                       txq = (struct cn10k_eth_txq *)
> -                               txq_data[port[0]][queue[0]];
> -                       cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd,
> -                                                  base + SSOW_LF_GWS_TAG,
> +                       txq = (struct cn10k_eth_txq
> +                                      *)(txq_data[(txq_data[port[0]] >> 48) +
> +                                                  queue[0]] &
> +                                         (BIT_ULL(48) - 1));
> +                       cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws,
> +                                                  &mbufs[i], 4, cmd,
>                                                    flags | NIX_TX_VWQE_F);
>                 }
>         }
> @@ -555,15 +557,14 @@ cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
>         mbufs += i;
>
>         for (i = 0; i < scalar; i++) {
> -               cn10k_sso_tx_one(mbufs[i], cmd, lmt_id, lmt_addr, sched_type,
> -                                base, txq_data, flags);
> +               cn10k_sso_tx_one(ws, mbufs[i], cmd, lmt_id, lmt_addr,
> +                                sched_type, txq_data, flags);
>         }
>  }
>
>  static __rte_always_inline uint16_t
>  cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
> -                      uint64_t *cmd,
> -                      const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> +                      uint64_t *cmd, const uint64_t *txq_data,
>                        const uint32_t flags)
>  {
>         struct cn10k_eth_txq *txq;
> @@ -580,17 +581,19 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
>                 uint64_t meta = *(uint64_t *)ev->vec;
>
>                 if (meta & BIT(31)) {
> -                       txq = (struct cn10k_eth_txq *)
> -                               txq_data[meta >> 32][meta >> 48];
> -
> -                       cn10k_nix_xmit_pkts_vector(
> -                               txq, mbufs, meta & 0xFFFF, cmd,
> -                               ws->tx_base + SSOW_LF_GWS_TAG,
> -                               flags | NIX_TX_VWQE_F);
> +                       txq = (struct cn10k_eth_txq
> +                                      *)(txq_data[(txq_data[meta >> 32] >>
> +                                                   48) +
> +                                                  (meta >> 48)] &
> +                                         (BIT_ULL(48) - 1));
> +
> +                       cn10k_nix_xmit_pkts_vector(txq, (uint64_t *)ws, mbufs,
> +                                                  meta & 0xFFFF, cmd,
> +                                                  flags | NIX_TX_VWQE_F);
>                 } else {
>                         cn10k_sso_vwqe_split_tx(
> -                               mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
> -                               ev->sched_type, ws->tx_base, txq_data, flags);
> +                               ws, mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
> +                               ev->sched_type, txq_data, flags);
>                 }
>                 rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
>                 return (meta & 0xFFFF);
> @@ -598,16 +601,16 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
>
>         m = ev->mbuf;
>         ref_cnt = m->refcnt;
> -       cn10k_sso_tx_one(m, cmd, lmt_id, lmt_addr, ev->sched_type, ws->tx_base,
> -                        txq_data, flags);
> +       cn10k_sso_tx_one(ws, m, cmd, lmt_id, lmt_addr, ev->sched_type, txq_data,
> +                        flags);
>
>         if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
>                 if (ref_cnt > 1)
>                         return 1;
>         }
>
> -       cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
> -                                ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
> +       cnxk_sso_hws_swtag_flush(ws->base + SSOW_LF_GWS_TAG,
> +                                ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
>         return 1;
>  }
>
> @@ -628,9 +631,7 @@ NIX_TX_FASTPATH_MODES
>                 uint64_t cmd[sz];                                              \
>                 RTE_SET_USED(nb_events);                                       \
>                 return cn10k_sso_hws_event_tx(                                 \
> -                       ws, &ev[0], cmd,                                       \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> +                       ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
>                         flags);                                                \
>         }
>
> @@ -642,9 +643,7 @@ NIX_TX_FASTPATH_MODES
>                 struct cn10k_sso_hws *ws = port;                               \
>                 RTE_SET_USED(nb_events);                                       \
>                 return cn10k_sso_hws_event_tx(                                 \
> -                       ws, &ev[0], cmd,                                       \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> +                       ws, &ev[0], cmd, (const uint64_t *)ws->tx_adptr_data,  \
>                         (flags) | NIX_TX_MULTI_SEG_F);                         \
>         }
>
> diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
> index 4611936b7f..f8652d4fbc 100644
> --- a/drivers/event/cnxk/cn9k_eventdev.c
> +++ b/drivers/event/cnxk/cn9k_eventdev.c
> @@ -259,17 +259,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
>                                 ws_cookie,
>                                 sizeof(struct cnxk_sso_hws_cookie) +
>                                         sizeof(struct cn9k_sso_hws_dual) +
> -                                       (sizeof(uint64_t) *
> -                                        (dev->max_port_id + 1) *
> -                                        RTE_MAX_QUEUES_PER_PORT),
> +                                       dev->tx_adptr_data_sz,
>                                 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
>                         if (ws_cookie == NULL)
>                                 return -ENOMEM;
>                         dws = RTE_PTR_ADD(ws_cookie,
>                                           sizeof(struct cnxk_sso_hws_cookie));
>                         memcpy(&dws->tx_adptr_data, dev->tx_adptr_data,
> -                              sizeof(uint64_t) * (dev->max_port_id + 1) *
> -                                      RTE_MAX_QUEUES_PER_PORT);
> +                              dev->tx_adptr_data_sz);
>                         event_dev->data->ports[i] = dws;
>                 } else {
>                         struct cn9k_sso_hws *ws = event_dev->data->ports[i];
> @@ -280,17 +277,14 @@ cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
>                                 ws_cookie,
>                                 sizeof(struct cnxk_sso_hws_cookie) +
>                                         sizeof(struct cn9k_sso_hws_dual) +
> -                                       (sizeof(uint64_t) *
> -                                        (dev->max_port_id + 1) *
> -                                        RTE_MAX_QUEUES_PER_PORT),
> +                                       dev->tx_adptr_data_sz,
>                                 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
>                         if (ws_cookie == NULL)
>                                 return -ENOMEM;
>                         ws = RTE_PTR_ADD(ws_cookie,
>                                          sizeof(struct cnxk_sso_hws_cookie));
>                         memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
> -                              sizeof(uint64_t) * (dev->max_port_id + 1) *
> -                                      RTE_MAX_QUEUES_PER_PORT);
> +                              dev->tx_adptr_data_sz);
>                         event_dev->data->ports[i] = ws;
>                 }
>         }
> @@ -987,17 +981,36 @@ cn9k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
>                               const struct rte_eth_dev *eth_dev,
>                               int32_t tx_queue_id)
>  {
> +       struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
> +       struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> +       uint64_t tx_offloads;
>         int rc;
>
>         RTE_SET_USED(id);
>         rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
>         if (rc < 0)
>                 return rc;
> +
> +       /* Can't enable tstamp unless all the ports have it enabled. */
> +       tx_offloads = cnxk_eth_dev->tx_offload_flags;
> +       if (dev->tx_adptr_configured) {
> +               uint8_t tstmp_req = !!(tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
> +               uint8_t tstmp_ena =
> +                       !!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F);
> +
> +               if (tstmp_ena && !tstmp_req)
> +                       dev->tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
> +               else if (!tstmp_ena && tstmp_req)
> +                       tx_offloads &= ~(NIX_TX_OFFLOAD_TSTAMP_F);
> +       }
> +
> +       dev->tx_offloads |= tx_offloads;
>         cn9k_sso_txq_fc_update(eth_dev, tx_queue_id, true);
>         rc = cn9k_sso_updt_tx_adptr_data(event_dev);
>         if (rc < 0)
>                 return rc;
>         cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
> +       dev->tx_adptr_configured = 1;
>
>         return 0;
>  }
> diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
> index c99e459c1b..303b04c215 100644
> --- a/drivers/event/cnxk/cn9k_worker.h
> +++ b/drivers/event/cnxk/cn9k_worker.h
> @@ -599,20 +599,13 @@ cn9k_sso_txq_fc_wait(const struct cn9k_eth_txq *txq)
>                 ;
>  }
>
> -static __rte_always_inline const struct cn9k_eth_txq *
> -cn9k_sso_hws_xtract_meta(struct rte_mbuf *m,
> -                        const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
> +static __rte_always_inline struct cn9k_eth_txq *
> +cn9k_sso_hws_xtract_meta(struct rte_mbuf *m, uint64_t *txq_data)
>  {
> -       return (const struct cn9k_eth_txq *)
> -               txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
> -}
> -
> -static __rte_always_inline void
> -cn9k_sso_hws_prepare_pkt(const struct cn9k_eth_txq *txq, struct rte_mbuf *m,
> -                        uint64_t *cmd, const uint32_t flags)
> -{
> -       roc_lmt_mov(cmd, txq->cmd, cn9k_nix_tx_ext_subs(flags));
> -       cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
> +       return (struct cn9k_eth_txq
> +                       *)(txq_data[(txq_data[m->port] >> 48) +
> +                                   rte_event_eth_tx_adapter_txq_get(m)] &
> +                          (BIT_ULL(48) - 1));
>  }
>
>  #if defined(RTE_ARCH_ARM64)
> @@ -669,7 +662,7 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
>         nixtx += BIT_ULL(7);
>         nixtx = (nixtx - 1) & ~(BIT_ULL(7) - 1);
>
> -       roc_lmt_mov((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));
> +       roc_lmt_mov_nv((void *)(nixtx + 16), cmd, cn9k_nix_tx_ext_subs(flags));
>
>         /* Load opcode and cptr already prepared at pkt metadata set */
>         pkt_len -= l2_len;
> @@ -756,12 +749,11 @@ cn9k_sso_hws_xmit_sec_one(const struct cn9k_eth_txq *txq, uint64_t base,
>
>  static __rte_always_inline uint16_t
>  cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
> -                     const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> -                     const uint32_t flags)
> +                     uint64_t *txq_data, const uint32_t flags)
>  {
>         struct rte_mbuf *m = ev->mbuf;
> -       const struct cn9k_eth_txq *txq;
>         uint16_t ref_cnt = m->refcnt;
> +       struct cn9k_eth_txq *txq;
>
>         /* Perform header writes before barrier for TSO */
>         cn9k_nix_xmit_prepare_tso(m, flags);
> @@ -774,7 +766,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>             !(flags & NIX_TX_OFFLOAD_SECURITY_F))
>                 rte_io_wmb();
>         txq = cn9k_sso_hws_xtract_meta(m, txq_data);
> -       cn9k_sso_hws_prepare_pkt(txq, m, cmd, flags);
> +       cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
> +       cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
>
>         if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
>                 uint64_t ol_flags = m->ol_flags;
> @@ -796,6 +789,8 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>
>         if (flags & NIX_TX_MULTI_SEG_F) {
>                 const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
> +               cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, segdw,
> +                                            flags);
>                 if (!CNXK_TT_FROM_EVENT(ev->event)) {
>                         cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
>                         roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> @@ -808,6 +803,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
>                                                segdw);
>                 }
>         } else {
> +               cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, 4, flags);
>                 if (!CNXK_TT_FROM_EVENT(ev->event)) {
>                         cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
>                         roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> @@ -853,11 +849,9 @@ NIX_TX_FASTPATH_MODES
>                 struct cn9k_sso_hws *ws = port;                                \
>                 uint64_t cmd[sz];                                              \
>                 RTE_SET_USED(nb_events);                                       \
> -               return cn9k_sso_hws_event_tx(                                  \
> -                       ws->base, &ev[0], cmd,                                 \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> -                       flags);                                                \
> +               return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
> +                                            (uint64_t *)ws->tx_adptr_data,    \
> +                                            flags);                           \
>         }
>
>  #define SSO_TX_SEG(fn, sz, flags)                                              \
> @@ -867,11 +861,9 @@ NIX_TX_FASTPATH_MODES
>                 uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
>                 struct cn9k_sso_hws *ws = port;                                \
>                 RTE_SET_USED(nb_events);                                       \
> -               return cn9k_sso_hws_event_tx(                                  \
> -                       ws->base, &ev[0], cmd,                                 \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> -                       (flags) | NIX_TX_MULTI_SEG_F);                         \
> +               return cn9k_sso_hws_event_tx(ws->base, &ev[0], cmd,            \
> +                                            (uint64_t *)ws->tx_adptr_data,    \
> +                                            (flags) | NIX_TX_MULTI_SEG_F);    \
>         }
>
>  #define SSO_DUAL_TX(fn, sz, flags)                                             \
> @@ -881,11 +873,9 @@ NIX_TX_FASTPATH_MODES
>                 struct cn9k_sso_hws_dual *ws = port;                           \
>                 uint64_t cmd[sz];                                              \
>                 RTE_SET_USED(nb_events);                                       \
> -               return cn9k_sso_hws_event_tx(                                  \
> -                       ws->base[!ws->vws], &ev[0], cmd,                       \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> -                       flags);                                                \
> +               return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
> +                                            (uint64_t *)ws->tx_adptr_data,    \
> +                                            flags);                           \
>         }
>
>  #define SSO_DUAL_TX_SEG(fn, sz, flags)                                         \
> @@ -895,11 +885,9 @@ NIX_TX_FASTPATH_MODES
>                 uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
>                 struct cn9k_sso_hws_dual *ws = port;                           \
>                 RTE_SET_USED(nb_events);                                       \
> -               return cn9k_sso_hws_event_tx(                                  \
> -                       ws->base[!ws->vws], &ev[0], cmd,                       \
> -                       (const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
> -                               ws->tx_adptr_data,                             \
> -                       (flags) | NIX_TX_MULTI_SEG_F);                         \
> +               return cn9k_sso_hws_event_tx(ws->base[!ws->vws], &ev[0], cmd,  \
> +                                            (uint64_t *)ws->tx_adptr_data,    \
> +                                            (flags) | NIX_TX_MULTI_SEG_F);    \
>         }
>
>  #endif
> diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
> index 4652b58a84..b26df58588 100644
> --- a/drivers/event/cnxk/cnxk_eventdev.h
> +++ b/drivers/event/cnxk/cnxk_eventdev.h
> @@ -99,7 +99,10 @@ struct cnxk_sso_evdev {
>         uint16_t rx_adptr_pool_cnt;
>         uint64_t *rx_adptr_pools;
>         uint64_t *tx_adptr_data;
> +       size_t tx_adptr_data_sz;
>         uint16_t max_port_id;
> +       uint16_t max_queue_id[RTE_MAX_ETHPORTS];
> +       uint8_t tx_adptr_configured;
>         uint16_t tim_adptr_ring_cnt;
>         uint16_t *timer_adptr_rings;
>         uint64_t *timer_adptr_sz;
> @@ -131,8 +134,8 @@ struct cn10k_sso_hws {
>         uint64_t *fc_mem;
>         uintptr_t grp_base;
>         /* Tx Fastpath data */
> -       uint64_t tx_base __rte_cache_aligned;
> -       uintptr_t lmt_base;
> +       uintptr_t lmt_base __rte_cache_aligned;
> +       uint64_t lso_tun_fmt;
>         uint8_t tx_adptr_data[];
>  } __rte_cache_aligned;
>
> @@ -149,7 +152,8 @@ struct cn9k_sso_hws {
>         uint64_t *fc_mem;
>         uintptr_t grp_base;
>         /* Tx Fastpath data */
> -       uint8_t tx_adptr_data[] __rte_cache_aligned;
> +       uint64_t lso_tun_fmt __rte_cache_aligned;
> +       uint8_t tx_adptr_data[];
>  } __rte_cache_aligned;
>
>  struct cn9k_sso_hws_dual {
> @@ -165,7 +169,8 @@ struct cn9k_sso_hws_dual {
>         uint64_t *fc_mem;
>         uintptr_t grp_base;
>         /* Tx Fastpath data */
> -       uint8_t tx_adptr_data[] __rte_cache_aligned;
> +       uint64_t lso_tun_fmt __rte_cache_aligned;
> +       uint8_t tx_adptr_data[];
>  } __rte_cache_aligned;
>
>  struct cnxk_sso_hws_cookie {
> diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
> index fdcd68ca63..5ebd3340e7 100644
> --- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
> +++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
> @@ -339,30 +339,179 @@ cnxk_sso_sqb_aura_limit_edit(struct roc_nix_sq *sq, uint16_t nb_sqb_bufs)
>                 sq->aura_handle, RTE_MIN(nb_sqb_bufs, sq->aura_sqb_bufs));
>  }
>
> +static void
> +cnxk_sso_tx_queue_data_init(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                           uint16_t eth_port_id, uint16_t tx_queue_id)
> +{
> +       uint64_t offset = 0;
> +       int i;
> +
> +       dev->max_queue_id[0] = RTE_MAX(dev->max_queue_id[0], eth_port_id);
> +       for (i = 1; i < eth_port_id; i++) {
> +               offset += (dev->max_queue_id[i - 1] + 1);
> +               txq_data[i] |= offset << 48;
> +       }
> +       dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
> +       dev->max_queue_id[eth_port_id] =
> +               RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_cpy(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                          uint64_t *otxq_data, uint16_t eth_port_id)
> +{
> +       uint64_t offset = 0;
> +       int i, j;
> +
> +       for (i = 1; i < eth_port_id; i++) {
> +               offset += (dev->max_queue_id[i - 1] + 1);
> +               txq_data[i] |= offset << 48;
> +               for (j = 0;
> +                    (i < dev->max_port_id) && (j < dev->max_queue_id[i] + 1);
> +                    j++)
> +                       txq_data[offset + j] =
> +                               otxq_data[(otxq_data[i] >> 48) + j];
> +       }
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_cpy_max(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                              uint64_t *otxq_data, uint16_t eth_port_id,
> +                              uint16_t max_port_id, uint16_t max_queue_id)
> +{
> +       uint64_t offset = 0;
> +       int i, j;
> +
> +       for (i = 1; i < max_port_id + 1; i++) {
> +               offset += (dev->max_queue_id[i - 1] + 1);
> +               txq_data[i] |= offset << 48;
> +               for (j = 0; j < dev->max_queue_id[i] + 1; j++) {
> +                       if (i == eth_port_id && j > max_queue_id)
> +                               continue;
> +                       txq_data[offset + j] =
> +                               otxq_data[(otxq_data[i] >> 48) + j];
> +               }
> +       }
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_rewrite(struct cnxk_sso_evdev *dev, uint64_t *txq_data,
> +                              uint16_t eth_port_id, uint16_t tx_queue_id,
> +                              uint64_t *otxq_data, uint16_t max_port_id,
> +                              uint16_t max_queue_id)
> +{
> +       int i;
> +
> +       for (i = 0; i < dev->max_queue_id[0] + 1; i++)
> +               txq_data[i] |= (otxq_data[i] & ~((BIT_ULL(16) - 1) << 48));
> +
> +       if (eth_port_id > max_port_id) {
> +               dev->max_queue_id[0] =
> +                       RTE_MAX(dev->max_queue_id[0], eth_port_id);
> +               dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
> +
> +               cnxk_sso_tx_queue_data_cpy(dev, txq_data, otxq_data,
> +                                          eth_port_id);
> +               dev->max_queue_id[eth_port_id] =
> +                       RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
> +       } else if (tx_queue_id > max_queue_id) {
> +               dev->max_queue_id[eth_port_id] =
> +                       RTE_MAX(dev->max_queue_id[eth_port_id], tx_queue_id);
> +               dev->max_port_id = RTE_MAX(max_port_id, eth_port_id);
> +               cnxk_sso_tx_queue_data_cpy_max(dev, txq_data, otxq_data,
> +                                              eth_port_id, max_port_id,
> +                                              max_queue_id);
> +       }
> +}
> +
> +static void
> +cnxk_sso_tx_queue_data_sz(struct cnxk_sso_evdev *dev, uint16_t eth_port_id,
> +                         uint16_t tx_queue_id, uint16_t max_port_id,
> +                         uint16_t max_queue_id, uint64_t *r, size_t *sz)
> +{
> +       uint64_t row = 0;
> +       size_t size = 0;
> +       int i;
> +
> +       if (dev->tx_adptr_data == NULL) {
> +               size = (eth_port_id + 1);
> +               size += (eth_port_id + tx_queue_id);
> +               row = 2 * eth_port_id;
> +               *r = row;
> +               *sz = size;
> +               return;
> +       }
> +
> +       if (eth_port_id > max_port_id) {
> +               size = (RTE_MAX(eth_port_id, dev->max_queue_id[0]) + 1);
> +               for (i = 1; i < eth_port_id; i++)
> +                       size += (dev->max_queue_id[i] + 1);
> +               row = size;
> +               size += (tx_queue_id + 1);
> +       } else if (tx_queue_id > max_queue_id) {
> +               size = !eth_port_id ?
> +                              tx_queue_id + 1 :
> +                                    RTE_MAX(max_port_id, dev->max_queue_id[0]) + 1;
> +               for (i = 1; i < max_port_id + 1; i++) {
> +                       if (i == eth_port_id) {
> +                               row = size;
> +                               size += tx_queue_id + 1;
> +                       } else {
> +                               size += dev->max_queue_id[i] + 1;
> +                       }
> +               }
> +       }
> +       *r = row;
> +       *sz = size;
> +}
> +
>  static int
>  cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
>                             uint16_t eth_port_id, uint16_t tx_queue_id,
>                             void *txq)
>  {
>         struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
> +       uint16_t max_queue_id = dev->max_queue_id[eth_port_id];
>         uint16_t max_port_id = dev->max_port_id;
> -       uint64_t *txq_data = dev->tx_adptr_data;
> -
> -       if (txq_data == NULL || eth_port_id > max_port_id) {
> -               max_port_id = RTE_MAX(max_port_id, eth_port_id);
> -               txq_data = rte_realloc_socket(
> -                       txq_data,
> -                       (sizeof(uint64_t) * (max_port_id + 1) *
> -                        RTE_MAX_QUEUES_PER_PORT),
> -                       RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
> +       uint64_t *txq_data = NULL;
> +       uint64_t row = 0;
> +       size_t size = 0;
> +
> +       if (((uint64_t)txq) & 0xFFFF000000000000)
> +               return -EINVAL;
> +
> +       cnxk_sso_tx_queue_data_sz(dev, eth_port_id, tx_queue_id, max_port_id,
> +                                 max_queue_id, &row, &size);
> +
> +       size *= sizeof(uint64_t);
> +
> +       if (size) {
> +               uint64_t *otxq_data = dev->tx_adptr_data;
> +
> +               txq_data = malloc(size);
>                 if (txq_data == NULL)
>                         return -ENOMEM;
> +               memset(txq_data, 0, size);
> +               txq_data[eth_port_id] = ((uint64_t)row) << 48;
> +               txq_data[row + tx_queue_id] = (uint64_t)txq;
> +
> +               if (otxq_data != NULL)
> +                       cnxk_sso_tx_queue_data_rewrite(
> +                               dev, txq_data, eth_port_id, tx_queue_id,
> +                               otxq_data, max_port_id, max_queue_id);
> +               else
> +                       cnxk_sso_tx_queue_data_init(dev, txq_data, eth_port_id,
> +                                                   tx_queue_id);
> +               dev->tx_adptr_data_sz = size;
> +               free(otxq_data);
> +               dev->tx_adptr_data = txq_data;
> +       } else {
> +               txq_data = dev->tx_adptr_data;
> +               row = txq_data[eth_port_id] >> 48;
> +               txq_data[row + tx_queue_id] &= ~(BIT_ULL(48) - 1);
> +               txq_data[row + tx_queue_id] |= (uint64_t)txq;
>         }
>
> -       ((uint64_t(*)[RTE_MAX_QUEUES_PER_PORT])
> -                txq_data)[eth_port_id][tx_queue_id] = (uint64_t)txq;
> -       dev->max_port_id = max_port_id;
> -       dev->tx_adptr_data = txq_data;
>         return 0;
>  }
>
> @@ -372,7 +521,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
>                               int32_t tx_queue_id)
>  {
>         struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
> -       struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
>         struct roc_nix_sq *sq;
>         int i, ret;
>         void *txq;
> @@ -388,8 +536,6 @@ cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
>                         event_dev, eth_dev->data->port_id, tx_queue_id, txq);
>                 if (ret < 0)
>                         return ret;
> -
> -               dev->tx_offloads |= cnxk_eth_dev->tx_offload_flags;
>         }
>
>         return 0;
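
The flattened lookup table introduced above replaces the dense
[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT] matrix: ports and queues
share one uint64_t array, where bits 63..48 of txq_data[port] hold the
row at which that port's queue slots start, and bits 47..0 of
txq_data[row + queue] hold the txq pointer (the pointers fit in 48
bits, which is why the update path above rejects a txq with any of the
top 16 bits set). A sketch of the two accessors implied by that layout
(names illustrative, not the driver's):

#include <stdint.h>

#define PTR_MASK ((1ULL << 48) - 1)

static inline void *
txq_lookup(const uint64_t *txq_data, uint16_t port, uint16_t queue)
{
        uint64_t row = txq_data[port] >> 48;

        return (void *)(uintptr_t)(txq_data[row + queue] & PTR_MASK);
}

static inline void
txq_store(uint64_t *txq_data, uint16_t port, uint16_t queue, void *txq)
{
        uint64_t row = txq_data[port] >> 48;

        txq_data[row + queue] &= ~PTR_MASK;     /* keep the row tag, if any */
        txq_data[row + queue] |= (uint64_t)(uintptr_t)txq;
}
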
> diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
> index 8378cbffc2..9bb08e1824 100644
> --- a/drivers/net/cnxk/cn10k_ethdev.c
> +++ b/drivers/net/cnxk/cn10k_ethdev.c
> @@ -131,53 +131,31 @@ static void
>  nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn10k_eth_txq *txq,
>                       uint16_t qid)
>  {
> -       struct nix_send_ext_s *send_hdr_ext;
>         union nix_send_hdr_w0_u send_hdr_w0;
> -       struct nix_send_mem_s *send_mem;
> -       union nix_send_sg_s sg_w0;
> -
> -       RTE_SET_USED(dev);
>
>         /* Initialize the fields based on basic single segment packet */
> -       memset(&txq->cmd, 0, sizeof(txq->cmd));
>         send_hdr_w0.u = 0;
> -       sg_w0.u = 0;
> -
>         if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
>                 /* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
>                 send_hdr_w0.sizem1 = 2;
> -
> -               send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[0];
> -               send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
>                 if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
>                         /* Default: one seg packet would have:
>                          * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
>                          * => 8/2 - 1 = 3
>                          */
>                         send_hdr_w0.sizem1 = 3;
> -                       send_hdr_ext->w0.tstmp = 1;
>
>                         /* To calculate the offset for send_mem,
>                          * send_hdr->w0.sizem1 * 2
>                          */
> -                       send_mem = (struct nix_send_mem_s *)(txq->cmd + 2);
> -                       send_mem->w0.subdc = NIX_SUBDC_MEM;
> -                       send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP;
> -                       send_mem->addr = dev->tstamp.tx_tstamp_iova;
> +                       txq->ts_mem = dev->tstamp.tx_tstamp_iova;
>                 }
>         } else {
>                 /* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
>                 send_hdr_w0.sizem1 = 1;
>         }
> -
>         send_hdr_w0.sq = qid;
> -       sg_w0.subdc = NIX_SUBDC_SG;
> -       sg_w0.segs = 1;
> -       sg_w0.ld_type = NIX_SENDLDTYPE_LDD;
> -
>         txq->send_hdr_w0 = send_hdr_w0.u;
> -       txq->sg_w0 = sg_w0.u;
> -
>         rte_wmb();
>  }
>
> diff --git a/drivers/net/cnxk/cn10k_ethdev.h b/drivers/net/cnxk/cn10k_ethdev.h
> index 0982158c62..ec40e53152 100644
> --- a/drivers/net/cnxk/cn10k_ethdev.h
> +++ b/drivers/net/cnxk/cn10k_ethdev.h
> @@ -9,7 +9,6 @@
>
>  struct cn10k_eth_txq {
>         uint64_t send_hdr_w0;
> -       uint64_t sg_w0;
>         int64_t fc_cache_pkts;
>         uint64_t *fc_mem;
>         uintptr_t lmt_base;
> @@ -20,8 +19,8 @@ struct cn10k_eth_txq {
>         uint64_t sa_base;
>         uint64_t *cpt_fc;
>         uint16_t cpt_desc;
> -       uint64_t cmd[4];
>         uint64_t lso_tun_fmt;
> +       uint64_t ts_mem;
>  } __plt_cache_aligned;
>
>  struct cn10k_eth_rxq {
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index fc1f6ceb8c..4ae6bbf517 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -186,23 +186,26 @@ cn10k_cpt_tx_steor_data(void)
>  }
>
>  static __rte_always_inline void
> -cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
> -                     const uint16_t flags)
> +cn10k_nix_tx_skeleton(struct cn10k_eth_txq *txq, uint64_t *cmd,
> +                     const uint16_t flags, const uint16_t static_sz)
>  {
> -       /* Send hdr */
> -       cmd[0] = txq->send_hdr_w0;
> +       if (static_sz)
> +               cmd[0] = txq->send_hdr_w0;
> +       else
> +               cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
> +                        ((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
>         cmd[1] = 0;
> -       cmd += 2;
>
> -       /* Send ext if present */
>         if (flags & NIX_TX_NEED_EXT_HDR) {
> -               *(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
> -               cmd += 2;
> +               if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
> +                       cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
> +               else
> +                       cmd[2] = NIX_SUBDC_EXT << 60;
> +               cmd[3] = 0;
> +               cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
> +       } else {
> +               cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
>         }
> -
> -       /* Send sg */
> -       cmd[0] = txq->sg_w0;
> -       cmd[1] = 0;
>  }
>
>  static __rte_always_inline void
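
For the static_sz == 0 (event/VWQE) path above, the header word is
rebuilt rather than loaded: only the SQ number in the top bits of the
cached send_hdr_w0 survives the 0xFFFFF00000000000 mask, and sizem1 is
recomputed from the offload flags. A small arithmetic sketch, assuming
the NIX_SEND_HDR_S field positions implied by the mask (sq in bits
63..44, sizem1 starting at bit 40):

#include <stdint.h>

static inline uint64_t
send_hdr_w0_rebuild(uint64_t cached_w0, uint64_t ext_subs)
{
        /* 2(HDR) + 1(SG) + 1(IOVA) = 4 dwords -> sizem1 = 1 with no EXT
         * sub-descriptors, +1 per extra pair: sizem1 = ext_subs + 1 */
        return (cached_w0 & 0xFFFFF00000000000ULL) | ((ext_subs + 1) << 40);
}
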
> @@ -718,41 +721,29 @@ cn10k_nix_xmit_mv_lmt_base(uintptr_t lmt_addr, uint64_t *cmd,
>  }
>
>  static __rte_always_inline void
> -cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
> +cn10k_nix_xmit_prepare_tstamp(struct cn10k_eth_txq *txq, uintptr_t lmt_addr,
>                               const uint64_t ol_flags, const uint16_t no_segdw,
>                               const uint16_t flags)
>  {
>         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
> -               const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
> -               struct nix_send_ext_s *send_hdr_ext =
> -                       (struct nix_send_ext_s *)lmt_addr + 16;
> +               const uint8_t is_ol_tstamp =
> +                       !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
>                 uint64_t *lmt = (uint64_t *)lmt_addr;
>                 uint16_t off = (no_segdw - 1) << 1;
>                 struct nix_send_mem_s *send_mem;
>
>                 send_mem = (struct nix_send_mem_s *)(lmt + off);
> -               send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
> -               send_hdr_ext->w0.tstmp = 1;
> -               if (flags & NIX_TX_MULTI_SEG_F) {
> -                       /* Retrieving the default desc values */
> -                       lmt[off] = cmd[2];
> -
> -                       /* Using compiler barrier to avoid violation of C
> -                        * aliasing rules.
> -                        */
> -                       rte_compiler_barrier();
> -               }
> -
> -               /* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
> +               /* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
>                  * should not be recorded, hence changing the alg type to
> -                * NIX_SENDMEMALG_SET and also changing send mem addr field to
> +                * NIX_SENDMEMALG_SUB and also changing send mem addr field to
>                  * next 8 bytes as it corrupts the actual Tx tstamp registered
>                  * address.
>                  */
>                 send_mem->w0.subdc = NIX_SUBDC_MEM;
> -               send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
> +               send_mem->w0.alg =
> +                       NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);
>                 send_mem->addr =
> -                       (rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
> +                       (rte_iova_t)(((uint64_t *)txq->ts_mem) + is_ol_tstamp);
>         }
>  }
>
> @@ -841,8 +832,8 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>  }
>
>  static __rte_always_inline uint16_t
> -cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
> -                   uint64_t *cmd, uintptr_t base, const uint16_t flags)
> +cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
> +                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
>  {
>         struct cn10k_eth_txq *txq = tx_queue;
>         const rte_iova_t io_addr = txq->io_addr;
> @@ -863,9 +854,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>                 /* Reduce the cached count */
>                 txq->fc_cache_pkts -= pkts;
>         }
> -
>         /* Get cmd skeleton */
> -       cn10k_nix_tx_skeleton(txq, cmd, flags);
> +       cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));
>
>         if (flags & NIX_TX_OFFLOAD_TSO_F)
>                 lso_tun_fmt = txq->lso_tun_fmt;
> @@ -909,14 +899,14 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>
>                 /* Move NIX desc to LMT/NIXTX area */
>                 cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
> -               cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
> -                                             tx_pkts[i]->ol_flags, 4, flags);
> +               cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
> +                                             4, flags);
>                 if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec)
>                         lnum++;
>         }
>
>         if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(base);
> +               roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>         tx_pkts += burst;
> @@ -967,9 +957,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>  }
>
>  static __rte_always_inline uint16_t
> -cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
> -                        uint16_t pkts, uint64_t *cmd, uintptr_t base,
> -                        const uint16_t flags)
> +cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
> +                        struct rte_mbuf **tx_pkts, uint16_t pkts,
> +                        uint64_t *cmd, const uint16_t flags)
>  {
>         struct cn10k_eth_txq *txq = tx_queue;
>         uintptr_t pa0, pa1, lbase = txq->lmt_base;
> @@ -987,12 +977,13 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>         uintptr_t laddr;
>         bool sec;
>
> -       NIX_XMIT_FC_OR_RETURN(txq, pkts);
> -
> -       cn10k_nix_tx_skeleton(txq, cmd, flags);
> -
> -       /* Reduce the cached count */
> -       txq->fc_cache_pkts -= pkts;
> +       if (!(flags & NIX_TX_VWQE_F)) {
> +               NIX_XMIT_FC_OR_RETURN(txq, pkts);
> +               /* Reduce the cached count */
> +               txq->fc_cache_pkts -= pkts;
> +       }
> +       /* Get cmd skeleton */
> +       cn10k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));
>
>         if (flags & NIX_TX_OFFLOAD_TSO_F)
>                 lso_tun_fmt = txq->lso_tun_fmt;
> @@ -1038,13 +1029,11 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>
>                 /* Move NIX desc to LMT/NIXTX area */
>                 cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
> -
>                 /* Store sg list directly on lmt line */
>                 segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)laddr,
>                                                flags);
> -               cn10k_nix_xmit_prepare_tstamp(laddr, &txq->cmd[0],
> -                                             tx_pkts[i]->ol_flags, segdw,
> -                                             flags);
> +               cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
> +                                             segdw, flags);
>                 if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec) {
>                         lnum++;
>                         data128 |= (((__uint128_t)(segdw - 1)) << shft);
> @@ -1053,7 +1042,7 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>         }
>
>         if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(base);
> +               roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>         tx_pkts += burst;
> @@ -1474,9 +1463,9 @@ cn10k_nix_xmit_store(struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
>  }
>
>  static __rte_always_inline uint16_t
> -cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> -                          uint16_t pkts, uint64_t *cmd, uintptr_t base,
> -                          const uint16_t flags)
> +cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
> +                          struct rte_mbuf **tx_pkts, uint16_t pkts,
> +                          uint64_t *cmd, const uint16_t flags)
>  {
>         uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
>         uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
> @@ -1526,25 +1515,42 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
>         }
>
> -       senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
> +       if (!(flags & NIX_TX_VWQE_F)) {
> +               senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
> +       } else {
> +               uint64_t w0 =
> +                       (txq->send_hdr_w0 & 0xFFFFF00000000000) |
> +                       ((uint64_t)(cn10k_nix_tx_ext_subs(flags) + 1) << 40);
> +
> +               senddesc01_w0 = vdupq_n_u64(w0);
> +       }
>         senddesc23_w0 = senddesc01_w0;
> +
>         senddesc01_w1 = vdupq_n_u64(0);
>         senddesc23_w1 = senddesc01_w1;
> -       sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
> +       sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
>         sgdesc23_w0 = sgdesc01_w0;
>
> -       /* Load command defaults into vector variables. */
>         if (flags & NIX_TX_NEED_EXT_HDR) {
> -               sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
> -               sendext23_w0 = sendext01_w0;
> -               sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
> -               sendext23_w1 = sendext01_w1;
>                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
> -                       sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
> +                       sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
> +                                                  BIT_ULL(15));
> +                       sendmem01_w0 =
> +                               vdupq_n_u64((NIX_SUBDC_MEM << 60) |
> +                                           (NIX_SENDMEMALG_SETTSTMP << 56));
>                         sendmem23_w0 = sendmem01_w0;
> -                       sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
> +                       sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
>                         sendmem23_w1 = sendmem01_w1;
> +               } else {
> +                       sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
>                 }
> +               sendext23_w0 = sendext01_w0;
> +
> +               if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
> +                       sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
> +               else
> +                       sendext01_w1 = vdupq_n_u64(0);
> +               sendext23_w1 = sendext01_w1;
>         }
>
>         /* Get LMT base address and LMT ID as lcore id */
> @@ -2577,7 +2583,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>                 wd.data[0] >>= 16;
>
>         if (flags & NIX_TX_VWQE_F)
> -               roc_sso_hws_head_wait(base);
> +               roc_sso_hws_head_wait(ws[0]);
>
>         left -= burst;
>
> @@ -2640,12 +2646,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>
>         if (unlikely(scalar)) {
>                 if (flags & NIX_TX_MULTI_SEG_F)
> -                       pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
> -                                                        scalar, cmd, base,
> -                                                        flags);
> +                       pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, ws, tx_pkts,
> +                                                        scalar, cmd, flags);
>                 else
> -                       pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
> -                                                   cmd, base, flags);
> +                       pkts += cn10k_nix_xmit_pkts(tx_queue, ws, tx_pkts,
> +                                                   scalar, cmd, flags);
>         }
>
>         return pkts;
> @@ -2653,16 +2658,16 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>
>  #else
>  static __rte_always_inline uint16_t
> -cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> -                          uint16_t pkts, uint64_t *cmd, uintptr_t base,
> -                          const uint16_t flags)
> +cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
> +                          struct rte_mbuf **tx_pkts, uint16_t pkts,
> +                          uint64_t *cmd, const uint16_t flags)
>  {
> +       RTE_SET_USED(ws);
>         RTE_SET_USED(tx_queue);
>         RTE_SET_USED(tx_pkts);
>         RTE_SET_USED(pkts);
>         RTE_SET_USED(cmd);
>         RTE_SET_USED(flags);
> -       RTE_SET_USED(base);
>         return 0;
>  }
>  #endif
> @@ -2892,7 +2897,7 @@ NIX_TX_FASTPATH_MODES
>                 if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
>                     !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
>                         return 0;                                              \
> -               return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, 0,    \
> +               return cn10k_nix_xmit_pkts(tx_queue, NULL, tx_pkts, pkts, cmd, \
>                                            flags);                             \
>         }
>
> @@ -2905,8 +2910,8 @@ NIX_TX_FASTPATH_MODES
>                 if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
>                     !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
>                         return 0;                                              \
> -               return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,  \
> -                                               0,                             \
> +               return cn10k_nix_xmit_pkts_mseg(tx_queue, NULL, tx_pkts, pkts, \
> +                                               cmd,                           \
>                                                 flags | NIX_TX_MULTI_SEG_F);   \
>         }
>
> @@ -2919,8 +2924,8 @@ NIX_TX_FASTPATH_MODES
>                 if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
>                     !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
>                         return 0;                                              \
> -               return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts,     \
> -                                                 cmd, 0, (flags));            \
> +               return cn10k_nix_xmit_pkts_vector(tx_queue, NULL, tx_pkts,     \
> +                                                 pkts, cmd, (flags));         \
>         }
>
>  #define NIX_TX_XMIT_VEC_MSEG(fn, sz, flags)                                    \
> @@ -2933,7 +2938,7 @@ NIX_TX_FASTPATH_MODES
>                     !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
>                         return 0;                                              \
>                 return cn10k_nix_xmit_pkts_vector(                             \
> -                       tx_queue, tx_pkts, pkts, cmd, 0,                       \
> +                       tx_queue, NULL, tx_pkts, pkts, cmd,                    \
>                         (flags) | NIX_TX_MULTI_SEG_F);                         \
>         }
>
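
On the NULL passed by the fastpath macros above: 'ws' replaces the old
'base' argument and, as the hunks show, is only dereferenced when
NIX_TX_VWQE_F is set, which the plain ethdev Tx entry points never set.
A sketch of the convention (inferred from the call sites, not a
documented contract):

    /* ws may be NULL unless the burst comes from an event vector
     * (VWQE); only then does the path wait on the SSO workslot head
     * before issuing the LMT stores.
     */
    if (flags & NIX_TX_VWQE_F)
            roc_sso_hws_head_wait(ws[0]);
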
> diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
> index d34bc6898f..01e3850561 100644
> --- a/drivers/net/cnxk/cn9k_ethdev.c
> +++ b/drivers/net/cnxk/cn9k_ethdev.c
> @@ -131,51 +131,31 @@ static void
>  nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn9k_eth_txq *txq,
>                       uint16_t qid)
>  {
> -       struct nix_send_ext_s *send_hdr_ext;
> -       struct nix_send_hdr_s *send_hdr;
> -       struct nix_send_mem_s *send_mem;
> -       union nix_send_sg_s *sg;
> +       union nix_send_hdr_w0_u send_hdr_w0;
>
>         /* Initialize the fields based on basic single segment packet */
> -       memset(&txq->cmd, 0, sizeof(txq->cmd));
> -
> +       send_hdr_w0.u = 0;
>         if (dev->tx_offload_flags & NIX_TX_NEED_EXT_HDR) {
> -               send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
>                 /* 2(HDR) + 2(EXT_HDR) + 1(SG) + 1(IOVA) = 6/2 - 1 = 2 */
> -               send_hdr->w0.sizem1 = 2;
> -
> -               send_hdr_ext = (struct nix_send_ext_s *)&txq->cmd[2];
> -               send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
> +               send_hdr_w0.sizem1 = 2;
>                 if (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F) {
>                         /* Default: one seg packet would have:
>                          * 2(HDR) + 2(EXT) + 1(SG) + 1(IOVA) + 2(MEM)
>                          * => 8/2 - 1 = 3
>                          */
> -                       send_hdr->w0.sizem1 = 3;
> -                       send_hdr_ext->w0.tstmp = 1;
> +                       send_hdr_w0.sizem1 = 3;
>
>                         /* To calculate the offset for send_mem,
>                          * send_hdr->w0.sizem1 * 2
>                          */
> -                       send_mem = (struct nix_send_mem_s *)
> -                               (txq->cmd + (send_hdr->w0.sizem1 << 1));
> -                       send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
> -                       send_mem->w0.cn9k.alg = NIX_SENDMEMALG_SETTSTMP;
> -                       send_mem->addr = dev->tstamp.tx_tstamp_iova;
> +                       txq->ts_mem = dev->tstamp.tx_tstamp_iova;
>                 }
> -               sg = (union nix_send_sg_s *)&txq->cmd[4];
>         } else {
> -               send_hdr = (struct nix_send_hdr_s *)&txq->cmd[0];
>                 /* 2(HDR) + 1(SG) + 1(IOVA) = 4/2 - 1 = 1 */
> -               send_hdr->w0.sizem1 = 1;
> -               sg = (union nix_send_sg_s *)&txq->cmd[2];
> +               send_hdr_w0.sizem1 = 1;
>         }
> -
> -       send_hdr->w0.sq = qid;
> -       sg->subdc = NIX_SUBDC_SG;
> -       sg->segs = 1;
> -       sg->ld_type = NIX_SENDLDTYPE_LDD;
> -
> +       send_hdr_w0.sq = qid;
> +       txq->send_hdr_w0 = send_hdr_w0.u;
>         rte_wmb();
>  }
>
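
The sizem1 values in the rewritten nix_form_default_desc() follow one
rule, worth spelling out since the magic numbers recur: NIX counts
descriptor size in 128-bit units, so sizem1 = (64-bit words / 2) - 1.
A tiny illustrative helper (not part of the patch):

    /* HDR(2) + SG(1) + IOVA(1)                   -> 4/2 - 1 = 1
     * HDR(2) + EXT(2) + SG(1) + IOVA(1)          -> 6/2 - 1 = 2
     * HDR(2) + EXT(2) + SG(1) + IOVA(1) + MEM(2) -> 8/2 - 1 = 3
     */
    static inline uint8_t
    nix_sizem1(uint8_t ndwords)   /* hypothetical helper */
    {
            return (ndwords >> 1) - 1;
    }
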
> diff --git a/drivers/net/cnxk/cn9k_ethdev.h b/drivers/net/cnxk/cn9k_ethdev.h
> index 2b452fe009..8ab924944c 100644
> --- a/drivers/net/cnxk/cn9k_ethdev.h
> +++ b/drivers/net/cnxk/cn9k_ethdev.h
> @@ -9,12 +9,13 @@
>  #include <cnxk_security_ar.h>
>
>  struct cn9k_eth_txq {
> -       uint64_t cmd[8];
> +       uint64_t send_hdr_w0;
>         int64_t fc_cache_pkts;
>         uint64_t *fc_mem;
>         void *lmt_addr;
>         rte_iova_t io_addr;
>         uint64_t lso_tun_fmt;
> +       uint64_t ts_mem;
>         uint16_t sqes_per_sqb_log2;
>         int16_t nb_sqb_bufs_adj;
>         rte_iova_t cpt_io_addr;
> diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
> index 8564dd85ee..d23e4b61b4 100644
> --- a/drivers/net/cnxk/cn9k_tx.h
> +++ b/drivers/net/cnxk/cn9k_tx.h
> @@ -58,6 +58,29 @@ cn9k_nix_tx_ext_subs(const uint16_t flags)
>                                   : 0);
>  }
>
> +static __rte_always_inline void
> +cn9k_nix_tx_skeleton(struct cn9k_eth_txq *txq, uint64_t *cmd,
> +                    const uint16_t flags, const uint16_t static_sz)
> +{
> +       if (static_sz)
> +               cmd[0] = txq->send_hdr_w0;
> +       else
> +               cmd[0] = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
> +                        ((uint64_t)(cn9k_nix_tx_ext_subs(flags) + 1) << 40);
> +       cmd[1] = 0;
> +
> +       if (flags & NIX_TX_NEED_EXT_HDR) {
> +               if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
> +                       cmd[2] = (NIX_SUBDC_EXT << 60) | BIT_ULL(15);
> +               else
> +                       cmd[2] = NIX_SUBDC_EXT << 60;
> +               cmd[3] = 0;
> +               cmd[4] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
> +       } else {
> +               cmd[2] = (NIX_SUBDC_SG << 60) | BIT_ULL(48);
> +       }
> +}
> +
>  static __rte_always_inline void
>  cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
>  {
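
About cn9k_nix_tx_skeleton() above: with static_sz == 0 the prebuilt
txq->send_hdr_w0 cannot be used verbatim, because sizem1 depends on
which fastpath variant is running; the mask keeps only the SQ field and
sizem1 is re-derived from the flags. A sketch of that word surgery --
sizem1 at bits [42:40] and SQ at bits [63:44] are inferred from the
shift and mask in the hunk, so treat the layout as an assumption:

    /* Rebuild NIX_SEND_HDR_S word0 for a given offload flag set:
     * keep bits [63:44] (queue id) from the per-queue template and
     * write sizem1 = ext_subs + 1 into bits [42:40].
     */
    static inline uint64_t
    hdr_w0_for_flags(uint64_t tmpl_w0, uint64_t ext_subs) /* illustrative */
    {
            return (tmpl_w0 & 0xFFFFF00000000000ULL) |
                   ((ext_subs + 1) << 40);
    }
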
> @@ -136,11 +159,11 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
>                 w1.u = 0;
>         }
>
> -       if (!(flags & NIX_TX_MULTI_SEG_F)) {
> +       if (!(flags & NIX_TX_MULTI_SEG_F))
>                 send_hdr->w0.total = m->data_len;
> -               send_hdr->w0.aura =
> -                       roc_npa_aura_handle_to_aura(m->pool->pool_id);
> -       }
> +       else
> +               send_hdr->w0.total = m->pkt_len;
> +       send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
>
>         /*
>          * L3type:  2 => IPV4
> @@ -287,41 +310,39 @@ cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
>                 /* Mark mempool object as "put" since it is freed by NIX */
>                 if (!send_hdr->w0.df)
>                         RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
> +       } else {
> +               sg->seg1_size = m->data_len;
> +               *(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
> +
> +               /* NOFF is handled later for multi-seg */
>         }
>  }
>
>  static __rte_always_inline void
> -cn9k_nix_xmit_prepare_tstamp(uint64_t *cmd, const uint64_t *send_mem_desc,
> +cn9k_nix_xmit_prepare_tstamp(struct cn9k_eth_txq *txq, uint64_t *cmd,
>                              const uint64_t ol_flags, const uint16_t no_segdw,
>                              const uint16_t flags)
>  {
>         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
>                 struct nix_send_mem_s *send_mem;
>                 uint16_t off = (no_segdw - 1) << 1;
> -               const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
> +               const uint8_t is_ol_tstamp =
> +                       !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
>
>                 send_mem = (struct nix_send_mem_s *)(cmd + off);
> -               if (flags & NIX_TX_MULTI_SEG_F) {
> -                       /* Retrieving the default desc values */
> -                       cmd[off] = send_mem_desc[6];
>
> -                       /* Using compiler barrier to avoid violation of C
> -                        * aliasing rules.
> -                        */
> -                       rte_compiler_barrier();
> -               }
> -
> -               /* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
> +               /* Packets for which RTE_MBUF_F_TX_IEEE1588_TMST is not set, tx tstamp
>                  * should not be recorded, hence changing the alg type to
> -                * NIX_SENDMEMALG_SET and also changing send mem addr field to
> +                * NIX_SENDMEMALG_SUB and also changing send mem addr field to
>                  * next 8 bytes as it corrupts the actual Tx tstamp registered
>                  * address.
>                  */
> +               send_mem->w0.cn9k.subdc = NIX_SUBDC_MEM;
>                 send_mem->w0.cn9k.alg =
> -                       NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
> +                       NIX_SENDMEMALG_SETTSTMP + (is_ol_tstamp << 3);
>
> -               send_mem->addr = (rte_iova_t)((uint64_t *)send_mem_desc[7] +
> -                                             (is_ol_tstamp));
> +               send_mem->addr = (rte_iova_t)(((uint64_t *)txq->ts_mem) +
> +                               (is_ol_tstamp));
>         }
>  }
>
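
The alg arithmetic above deserves a second look, since it replaces the
old "SETTSTMP - is_ol_tstamp" with "SETTSTMP + (is_ol_tstamp << 3)".
A worked sketch, assuming NIX_SENDMEMALG_SETTSTMP = 0x1 and
NIX_SENDMEMALG_SUB = 0x9 (enum values inferred from the '<< 3' trick):

    #include <stdint.h>

    /* txq_ts_mem stands in for txq->ts_mem (illustrative names). */
    static inline void
    tstamp_alg_demo(uint64_t txq_ts_mem, uint8_t is_ol_tstamp,
                    uint8_t *alg, uint64_t *addr)
    {
            /* is_ol_tstamp == 1 when RTE_MBUF_F_TX_IEEE1588_TMST is
             * NOT set: 0x1 + 8 = 0x9 turns the set-timestamp op into
             * a subtract aimed one 64-bit word past the timestamp
             * slot, so the registered address is never clobbered.
             */
            *alg  = 0x1 + (is_ol_tstamp << 3);
            *addr = txq_ts_mem + ((uint64_t)is_ol_tstamp << 3);
    }
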
> @@ -367,8 +388,6 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>         uint8_t off, i;
>
>         send_hdr = (struct nix_send_hdr_s *)cmd;
> -       send_hdr->w0.total = m->pkt_len;
> -       send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
>
>         if (flags & NIX_TX_NEED_EXT_HDR)
>                 off = 2;
> @@ -376,13 +395,29 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>                 off = 0;
>
>         sg = (union nix_send_sg_s *)&cmd[2 + off];
> -       /* Clear sg->u header before use */
> -       sg->u &= 0xFC00000000000000;
> +
> +       /* Start from second segment, first segment is already there */
> +       i = 1;
>         sg_u = sg->u;
> -       slist = &cmd[3 + off];
> +       nb_segs = m->nb_segs - 1;
> +       m_next = m->next;
> +       slist = &cmd[3 + off + 1];
>
> -       i = 0;
> -       nb_segs = m->nb_segs;
> +       /* Set invert df if buffer is not to be freed by H/W */
> +       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
> +               sg_u |= (cnxk_nix_prefree_seg(m) << 55);
> +               rte_io_wmb();
> +       }
> +
> +       /* Mark mempool object as "put" since it is freed by NIX */
> +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +       if (!(sg_u & (1ULL << 55)))
> +               RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
> +       rte_io_wmb();
> +#endif
> +       m = m_next;
> +       if (!m)
> +               goto done;
>
>         /* Fill mbuf segments */
>         do {
> @@ -417,6 +452,7 @@ cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>                 m = m_next;
>         } while (nb_segs);
>
> +done:
>         sg->u = sg_u;
>         sg->segs = i;
>         segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
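
One structural note on the mseg hunk above: cn9k_nix_xmit_prepare() now
writes the first segment (seg1_size plus its IOVA), so
cn9k_nix_prepare_mseg() starts at i = 1 and walks m->next only. The
per-segment "invert df" handling generalizes the bit-55 write seen
above; a sketch, assuming bits 55..57 of NIX_SEND_SG_S are the
per-segment invert bits (as the first-segment '<< 55' suggests):

    #include <stdint.h>

    /* 'prefree' stands in for cnxk_nix_prefree_seg(), which returns 1
     * when software, not the NIX, must free the segment (illustrative).
     */
    static inline uint64_t
    sg_mark_prefree(uint64_t sg_u, int seg_idx, int prefree)
    {
            /* One invert-df bit per segment of the 3-segment SG. */
            return sg_u | ((uint64_t)prefree << (55 + seg_idx));
    }
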
> @@ -472,7 +508,7 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>
>         NIX_XMIT_FC_OR_RETURN(txq, pkts);
>
> -       roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
> +       cn9k_nix_tx_skeleton(txq, cmd, flags, 1);
>
>         /* Perform header writes before barrier for TSO */
>         if (flags & NIX_TX_OFFLOAD_TSO_F) {
> @@ -490,8 +526,8 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>
>         for (i = 0; i < pkts; i++) {
>                 cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
> -               cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
> -                                            tx_pkts[i]->ol_flags, 4, flags);
> +               cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags, 4,
> +                                            flags);
>                 cn9k_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
>         }
>
> @@ -514,7 +550,7 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>
>         NIX_XMIT_FC_OR_RETURN(txq, pkts);
>
> -       roc_lmt_mov(cmd, &txq->cmd[0], cn9k_nix_tx_ext_subs(flags));
> +       cn9k_nix_tx_skeleton(txq, cmd, flags, 1);
>
>         /* Perform header writes before barrier for TSO */
>         if (flags & NIX_TX_OFFLOAD_TSO_F) {
> @@ -533,9 +569,8 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>         for (i = 0; i < pkts; i++) {
>                 cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
>                 segdw = cn9k_nix_prepare_mseg(tx_pkts[i], cmd, flags);
> -               cn9k_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
> -                                            tx_pkts[i]->ol_flags, segdw,
> -                                            flags);
> +               cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags,
> +                                            segdw, flags);
>                 cn9k_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
>         }
>
> @@ -862,28 +897,34 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>         if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
>                 rte_io_wmb();
>
> -       senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
> +       senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
>         senddesc23_w0 = senddesc01_w0;
> +
>         senddesc01_w1 = vdupq_n_u64(0);
>         senddesc23_w1 = senddesc01_w1;
> +       sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
> +       sgdesc23_w0 = sgdesc01_w0;
>
> -       /* Load command defaults into vector variables. */
>         if (flags & NIX_TX_NEED_EXT_HDR) {
> -               sendext01_w0 = vld1q_dup_u64(&txq->cmd[2]);
> -               sendext23_w0 = sendext01_w0;
> -               sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
> -               sendext23_w1 = sendext01_w1;
> -               sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
> -               sgdesc23_w0 = sgdesc01_w0;
>                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
> -                       sendmem01_w0 = vld1q_dup_u64(&txq->cmd[6]);
> +                       sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) |
> +                                                  BIT_ULL(15));
> +                       sendmem01_w0 =
> +                               vdupq_n_u64((NIX_SUBDC_MEM << 60) |
> +                                           (NIX_SENDMEMALG_SETTSTMP << 56));
>                         sendmem23_w0 = sendmem01_w0;
> -                       sendmem01_w1 = vld1q_dup_u64(&txq->cmd[7]);
> +                       sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
>                         sendmem23_w1 = sendmem01_w1;
> +               } else {
> +                       sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
>                 }
> -       } else {
> -               sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
> -               sgdesc23_w0 = sgdesc01_w0;
> +               sendext23_w0 = sendext01_w0;
> +
> +               if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
> +                       sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
> +               else
> +                       sendext01_w1 = vdupq_n_u64(0);
> +               sendext23_w1 = sendext01_w1;
>         }
>
>         for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
> --
> 2.17.1
>

Thread overview: 16+ messages
2022-01-19  7:13 [PATCH v2 1/4] " pbhagavatula
2022-01-19  7:13 ` [PATCH v2 2/4] event/cnxk: store and reuse workslot status pbhagavatula
2022-01-19  7:13 ` [PATCH v2 3/4] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-01-19  7:13 ` [PATCH v2 4/4] net/cnxk: improve Rx performance pbhagavatula
2022-02-07 14:03 ` [PATCH v2 1/4] net/cnxk: avoid command copy from Tx queue Jerin Jacob
2022-02-10 10:13 ` [PATCH v3] " pbhagavatula
2022-02-10 10:19   ` Jerin Jacob
2022-02-10 13:15   ` [PATCH v4] " pbhagavatula
2022-02-11 10:27     ` Jerin Jacob [this message]
2022-02-10 10:19 ` [PATCH v3 1/3] event/cnxk: store and reuse workslot status pbhagavatula
2022-02-10 10:19   ` [PATCH v3 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-02-10 10:19   ` [PATCH v3 3/3] net/cnxk: improve Rx performance pbhagavatula
2022-02-10 13:20   ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status pbhagavatula
2022-02-10 13:20     ` [PATCH v4 2/3] event/cnxk: disable default wait time for dequeue pbhagavatula
2022-02-10 13:20     ` [PATCH v4 3/3] event/cnxk: improve Rx performance pbhagavatula
2022-02-14  9:29     ` [PATCH v4 1/3] event/cnxk: store and reuse workslot status Jerin Jacob
