From: Nithin Dabilpuram <ndabilpuram@marvell.com>
To: <jerinj@marvell.com>, Nithin Dabilpuram <ndabilpuram@marvell.com>,
"Kiran Kumar K" <kirankumark@marvell.com>,
Sunil Kumar Kori <skori@marvell.com>,
Satha Rao <skoteshwar@marvell.com>,
Harman Kalra <hkalra@marvell.com>
Cc: <dev@dpdk.org>, Rahul Bhansali <rbhansali@marvell.com>,
Pavan Nikhilesh <pbhagavatula@marvell.com>
Subject: [PATCH v2 17/18] net/cnxk: support Tx burst vector for cn20k
Date: Thu, 26 Sep 2024 21:31:57 +0530 [thread overview]
Message-ID: <20240926160158.3206321-18-ndabilpuram@marvell.com> (raw)
In-Reply-To: <20240926160158.3206321-1-ndabilpuram@marvell.com>
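Add vectorized (NEON) Tx burst support for cn20k SoC, including handling of
the inline security offload path where CPT instructions are prepared
alongside the NIX send descriptors.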
Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
Signed-off-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/net/cnxk/cn20k_tx.h | 1445 ++++++++++++++++++++++++++++++++++-
1 file changed, 1441 insertions(+), 4 deletions(-)
diff --git a/drivers/net/cnxk/cn20k_tx.h b/drivers/net/cnxk/cn20k_tx.h
index 3f163285f0..05c8b80fcb 100644
--- a/drivers/net/cnxk/cn20k_tx.h
+++ b/drivers/net/cnxk/cn20k_tx.h
@@ -219,6 +219,28 @@ cn20k_nix_tx_ext_subs(const uint16_t flags)
((flags & (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F)) ? 1 : 0);
}
+static __rte_always_inline uint8_t
+cn20k_nix_tx_dwords(const uint16_t flags, const uint8_t segdw)
+{
+ if (!(flags & NIX_TX_MULTI_SEG_F))
+ return cn20k_nix_tx_ext_subs(flags) + 2;
+
+ /* Everything is already accounted for in segdw */
+ return segdw;
+}
+
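+/* Number of packets handled per vector burst: two per LMT line when an
+ * extended header is needed, else four, scaled by the number of LMT lines
+ * available per core.
+ */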
+static __rte_always_inline uint8_t
+cn20k_nix_pkts_per_vec_brst(const uint16_t flags)
+{
+ return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4) << ROC_LMT_LINES_PER_CORE_LOG2;
+}
+
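+/* Per-line descriptor count used to build STEOR data for the vector path:
+ * 6 when only the EXT header is present (two 3-descriptor packets per line),
+ * 8 otherwise (two 4-descriptor packets with timestamp, or four 2-descriptor
+ * packets without the EXT header).
+ */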
+static __rte_always_inline uint8_t
+cn20k_nix_tx_dwords_per_line(const uint16_t flags)
+{
+ return (flags & NIX_TX_NEED_EXT_HDR) ? ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) : 8;
+}
+
static __rte_always_inline uint64_t
cn20k_nix_tx_steor_data(const uint16_t flags)
{
@@ -247,6 +269,40 @@ cn20k_nix_tx_steor_data(const uint16_t flags)
return data;
}
+static __rte_always_inline uint8_t
+cn20k_nix_tx_dwords_per_line_seg(const uint16_t flags)
+{
+ return ((flags & NIX_TX_NEED_EXT_HDR) ? (flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6 : 4);
+}
+
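+/* Build LMTST STEOR data for the vector path: replicate the per-line
+ * (dwords - 1) size into the fifteen 3-bit fields starting at bit 19; the
+ * copy kept in the low bits is later folded into the I/O address.
+ */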
+static __rte_always_inline uint64_t
+cn20k_nix_tx_steor_vec_data(const uint16_t flags)
+{
+ const uint64_t dw_m1 = cn20k_nix_tx_dwords_per_line(flags) - 1;
+ uint64_t data;
+
+ /* This will be moved to addr area */
+ data = dw_m1;
+ /* 15 vector sizes for single seg */
+ data |= dw_m1 << 19;
+ data |= dw_m1 << 22;
+ data |= dw_m1 << 25;
+ data |= dw_m1 << 28;
+ data |= dw_m1 << 31;
+ data |= dw_m1 << 34;
+ data |= dw_m1 << 37;
+ data |= dw_m1 << 40;
+ data |= dw_m1 << 43;
+ data |= dw_m1 << 46;
+ data |= dw_m1 << 49;
+ data |= dw_m1 << 52;
+ data |= dw_m1 << 55;
+ data |= dw_m1 << 58;
+ data |= dw_m1 << 61;
+
+ return data;
+}
+
static __rte_always_inline void
cn20k_nix_tx_skeleton(struct cn20k_eth_txq *txq, uint64_t *cmd, const uint16_t flags,
const uint16_t static_sz)
@@ -276,6 +332,33 @@ cn20k_nix_tx_skeleton(struct cn20k_eth_txq *txq, uint64_t *cmd, const uint16_t f
}
}
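+/* Wait until at least one CPT descriptor is free before issuing another
+ * security instruction: poll the CPT flow-control count until it drops
+ * below the queue depth (WFE-based wait on arm64, plain spin otherwise).
+ */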
+static __rte_always_inline void
+cn20k_nix_sec_fc_wait_one(struct cn20k_eth_txq *txq)
+{
+ uint64_t nb_desc = txq->cpt_desc;
+ uint64_t fc;
+
+#ifdef RTE_ARCH_ARM64
+ asm volatile(PLT_CPU_FEATURE_PREAMBLE
+ " ldxr %[space], [%[addr]] \n"
+ " cmp %[nb_desc], %[space] \n"
+ " b.hi .Ldne%= \n"
+ " sevl \n"
+ ".Lrty%=: wfe \n"
+ " ldxr %[space], [%[addr]] \n"
+ " cmp %[nb_desc], %[space] \n"
+ " b.ls .Lrty%= \n"
+ ".Ldne%=: \n"
+ : [space] "=&r"(fc)
+ : [nb_desc] "r"(nb_desc), [addr] "r"(txq->cpt_fc)
+ : "memory");
+#else
+ RTE_SET_USED(fc);
+ while (nb_desc <= __atomic_load_n(txq->cpt_fc, __ATOMIC_RELAXED))
+ ;
+#endif
+}
+
static __rte_always_inline void
cn20k_nix_sec_fc_wait(struct cn20k_eth_txq *txq, uint16_t nb_pkts)
{
@@ -346,6 +429,137 @@ cn20k_nix_sec_fc_wait(struct cn20k_eth_txq *txq, uint16_t nb_pkts)
}
#if defined(RTE_ARCH_ARM64)
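+/* Prepare an outbound inline IPsec CPT instruction for one packet from the
+ * vector-built send descriptors: derive L2/L3 lengths and checksum enables
+ * from SEND_HDR_W1 (or the mbuf), round the payload up per the SA, reserve
+ * a NIXTX area past the packet data for the NIX descriptor, and write the
+ * 64-byte CPT command to the current CPT LMT slot (two instructions per
+ * line).
+ */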
+static __rte_always_inline void
+cn20k_nix_prep_sec_vec(struct rte_mbuf *m, uint64x2_t *cmd0, uint64x2_t *cmd1,
+ uintptr_t *nixtx_addr, uintptr_t lbase, uint8_t *lnum, uint8_t *loff,
+ uint8_t *shft, uint64_t sa_base, const uint16_t flags)
+{
+ struct cn20k_sec_sess_priv sess_priv;
+ uint32_t pkt_len, dlen_adj, rlen;
+ uint8_t l3l4type, chksum;
+ uint64x2_t cmd01, cmd23;
+ uint8_t l2_len, l3_len;
+ uintptr_t dptr, nixtx;
+ uint64_t ucode_cmd[4];
+ uint64_t *laddr, w0;
+ uint16_t tag;
+ uint64_t sa;
+
+ sess_priv.u64 = *rte_security_dynfield(m);
+
+ if (flags & NIX_TX_NEED_SEND_HDR_W1) {
+ /* Extract l3l4type either from il3il4type or ol3ol4type */
+ if (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F && flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) {
+ l2_len = vgetq_lane_u8(*cmd0, 10);
+ /* L4 ptr from send hdr includes l2 and l3 len */
+ l3_len = vgetq_lane_u8(*cmd0, 11) - l2_len;
+ l3l4type = vgetq_lane_u8(*cmd0, 13);
+ } else {
+ l2_len = vgetq_lane_u8(*cmd0, 8);
+ /* L4 ptr from send hdr includes l2 and l3 len */
+ l3_len = vgetq_lane_u8(*cmd0, 9) - l2_len;
+ l3l4type = vgetq_lane_u8(*cmd0, 12);
+ }
+
+ chksum = (l3l4type & 0x1) << 1 | !!(l3l4type & 0x30);
+ chksum = ~chksum;
+ sess_priv.chksum = sess_priv.chksum & chksum;
+ /* Clear SEND header flags */
+ *cmd0 = vsetq_lane_u16(0, *cmd0, 6);
+ } else {
+ l2_len = m->l2_len;
+ l3_len = m->l3_len;
+ }
+
+ /* Retrieve DPTR */
+ dptr = vgetq_lane_u64(*cmd1, 1);
+ pkt_len = vgetq_lane_u16(*cmd0, 0);
+
+ /* Calculate dlen adj */
+ dlen_adj = pkt_len - l2_len;
+ /* Exclude l3 len from roundup for transport mode */
+ dlen_adj -= sess_priv.mode ? 0 : l3_len;
+ rlen = (dlen_adj + sess_priv.roundup_len) + (sess_priv.roundup_byte - 1);
+ rlen &= ~(uint64_t)(sess_priv.roundup_byte - 1);
+ rlen += sess_priv.partial_len;
+ dlen_adj = rlen - dlen_adj;
+
+ /* Update send descriptors. Security is single segment only */
+ *cmd0 = vsetq_lane_u16(pkt_len + dlen_adj, *cmd0, 0);
+
+ /* CPT word 5 and word 6 */
+ w0 = 0;
+ ucode_cmd[2] = 0;
+ if (flags & NIX_TX_MULTI_SEG_F && m->nb_segs > 1) {
+ struct rte_mbuf *last = rte_pktmbuf_lastseg(m);
+
+ /* Get area where NIX descriptor needs to be stored */
+ nixtx = rte_pktmbuf_mtod_offset(last, uintptr_t, last->data_len + dlen_adj);
+ nixtx += BIT_ULL(7);
+ nixtx = (nixtx - 1) & ~(BIT_ULL(7) - 1);
+ nixtx += 16;
+
+ dptr = nixtx + ((flags & NIX_TX_NEED_EXT_HDR) ? 32 : 16);
+
+ /* Set l2 length as data offset */
+ w0 = (uint64_t)l2_len << 16;
+ w0 |= cn20k_nix_tx_ext_subs(flags) + NIX_NB_SEGS_TO_SEGDW(m->nb_segs);
+ ucode_cmd[1] = dptr | ((uint64_t)m->nb_segs << 60);
+ } else {
+ /* Get area where NIX descriptor needs to be stored */
+ nixtx = dptr + pkt_len + dlen_adj;
+ nixtx += BIT_ULL(7);
+ nixtx = (nixtx - 1) & ~(BIT_ULL(7) - 1);
+ nixtx += 16;
+
+ w0 |= cn20k_nix_tx_ext_subs(flags) + 1ULL;
+ dptr += l2_len;
+ ucode_cmd[1] = dptr;
+ *cmd1 = vsetq_lane_u16(pkt_len + dlen_adj, *cmd1, 0);
+ /* DLEN passed excludes the L2 header */
+ pkt_len -= l2_len;
+ }
+ w0 |= ((((int64_t)nixtx - (int64_t)dptr) & 0xFFFFF) << 32);
+ /* CPT word 0 and 1 */
+ cmd01 = vdupq_n_u64(0);
+ cmd01 = vsetq_lane_u64(w0, cmd01, 0);
+ /* CPT_RES_S is 16B above NIXTX */
+ cmd01 = vsetq_lane_u64(nixtx - 16, cmd01, 1);
+
+ /* Return nixtx addr */
+ *nixtx_addr = nixtx;
+
+ /* CPT Word 4 and Word 7 */
+ tag = sa_base & 0xFFFFUL;
+ sa_base &= ~0xFFFFUL;
+ sa = (uintptr_t)roc_nix_inl_ot_ipsec_outb_sa(sa_base, sess_priv.sa_idx);
+ ucode_cmd[3] = (ROC_CPT_DFLT_ENG_GRP_SE_IE << 61 | 1UL << 60 | sa);
+ ucode_cmd[0] = (ROC_IE_OT_MAJOR_OP_PROCESS_OUTBOUND_IPSEC << 48 | 1UL << 54 |
+ ((uint64_t)sess_priv.chksum) << 32 | ((uint64_t)sess_priv.dec_ttl) << 34 |
+ pkt_len);
+
+ /* CPT word 2 and 3 */
+ cmd23 = vdupq_n_u64(0);
+ cmd23 = vsetq_lane_u64(
+ (((uint64_t)RTE_EVENT_TYPE_CPU << 28) | tag | CNXK_ETHDEV_SEC_OUTB_EV_SUB << 20),
+ cmd23, 0);
+ cmd23 = vsetq_lane_u64((uintptr_t)m | 1, cmd23, 1);
+
+ /* Move to our line */
+ laddr = LMT_OFF(lbase, *lnum, *loff ? 64 : 0);
+
+ /* Write CPT instruction to lmt line */
+ vst1q_u64(laddr, cmd01);
+ vst1q_u64((laddr + 2), cmd23);
+
+ *(__uint128_t *)(laddr + 4) = *(__uint128_t *)ucode_cmd;
+ *(__uint128_t *)(laddr + 6) = *(__uint128_t *)(ucode_cmd + 2);
+
+ /* Move to next line for every other CPT inst */
+ *loff = !(*loff);
+ *lnum = *lnum + (*loff ? 0 : 1);
+ *shft = *shft + (*loff ? 0 : 3);
+}
static __rte_always_inline void
cn20k_nix_prep_sec(struct rte_mbuf *m, uint64_t *cmd, uintptr_t *nixtx_addr, uintptr_t lbase,
@@ -546,6 +760,156 @@ cn20k_nix_prefree_seg(struct rte_mbuf *m, struct rte_mbuf **extm, struct cn20k_e
}
}
+#if defined(RTE_ARCH_ARM64)
+/* Only called for first segments of single segmented mbufs */
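+/* For each of the four mbufs: external buffers are either queued for Tx
+ * completion (PNC bit 43 plus SQE id in W1) or chained onto the extm list
+ * for software free; other mbufs go through cnxk_nix_prefree_seg(), which
+ * decides the don't-free bit (19) and patches the aura field in W0.
+ */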
+static __rte_always_inline void
+cn20k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct rte_mbuf **extm,
+ struct cn20k_eth_txq *txq, uint64x2_t *senddesc01_w0,
+ uint64x2_t *senddesc23_w0, uint64x2_t *senddesc01_w1,
+ uint64x2_t *senddesc23_w1)
+{
+ struct rte_mbuf **tx_compl_ptr = txq->tx_compl.ptr;
+ uint32_t nb_desc_mask = txq->tx_compl.nb_desc_mask;
+ bool tx_compl_ena = txq->tx_compl.ena;
+ struct rte_mbuf *m0, *m1, *m2, *m3;
+ struct rte_mbuf *cookie;
+ uint64_t w0, w1, aura;
+ uint64_t sqe_id;
+
+ m0 = mbufs[0];
+ m1 = mbufs[1];
+ m2 = mbufs[2];
+ m3 = mbufs[3];
+
+ /* mbuf 0 */
+ w0 = vgetq_lane_u64(*senddesc01_w0, 0);
+ if (RTE_MBUF_HAS_EXTBUF(m0)) {
+ w0 |= BIT_ULL(19);
+ w1 = vgetq_lane_u64(*senddesc01_w1, 0);
+ w1 &= ~0xFFFF000000000000UL;
+ if (unlikely(!tx_compl_ena)) {
+ m0->next = *extm;
+ *extm = m0;
+ } else {
+ sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
+ rte_memory_order_relaxed);
+ sqe_id = sqe_id & nb_desc_mask;
+ /* Set PNC */
+ w0 |= BIT_ULL(43);
+ w1 |= sqe_id << 48;
+ tx_compl_ptr[sqe_id] = m0;
+ *senddesc01_w1 = vsetq_lane_u64(w1, *senddesc01_w1, 0);
+ }
+ } else {
+ cookie = RTE_MBUF_DIRECT(m0) ? m0 : rte_mbuf_from_indirect(m0);
+ aura = (w0 >> 20) & 0xFFFFF;
+ w0 &= ~0xFFFFF00000UL;
+ w0 |= cnxk_nix_prefree_seg(m0, &aura) << 19;
+ w0 |= aura << 20;
+
+ if ((w0 & BIT_ULL(19)) == 0)
+ RTE_MEMPOOL_CHECK_COOKIES(cookie->pool, (void **)&cookie, 1, 0);
+ }
+ *senddesc01_w0 = vsetq_lane_u64(w0, *senddesc01_w0, 0);
+
+ /* mbuf1 */
+ w0 = vgetq_lane_u64(*senddesc01_w0, 1);
+ if (RTE_MBUF_HAS_EXTBUF(m1)) {
+ w0 |= BIT_ULL(19);
+ w1 = vgetq_lane_u64(*senddesc01_w1, 1);
+ w1 &= ~0xFFFF000000000000UL;
+ if (unlikely(!tx_compl_ena)) {
+ m1->next = *extm;
+ *extm = m1;
+ } else {
+ sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
+ rte_memory_order_relaxed);
+ sqe_id = sqe_id & nb_desc_mask;
+ /* Set PNC */
+ w0 |= BIT_ULL(43);
+ w1 |= sqe_id << 48;
+ tx_compl_ptr[sqe_id] = m1;
+ *senddesc01_w1 = vsetq_lane_u64(w1, *senddesc01_w1, 1);
+ }
+ } else {
+ cookie = RTE_MBUF_DIRECT(m1) ? m1 : rte_mbuf_from_indirect(m1);
+ aura = (w0 >> 20) & 0xFFFFF;
+ w0 &= ~0xFFFFF00000UL;
+ w0 |= cnxk_nix_prefree_seg(m1, &aura) << 19;
+ w0 |= aura << 20;
+
+ if ((w0 & BIT_ULL(19)) == 0)
+ RTE_MEMPOOL_CHECK_COOKIES(cookie->pool, (void **)&cookie, 1, 0);
+ }
+ *senddesc01_w0 = vsetq_lane_u64(w0, *senddesc01_w0, 1);
+
+ /* mbuf 2 */
+ w0 = vgetq_lane_u64(*senddesc23_w0, 0);
+ if (RTE_MBUF_HAS_EXTBUF(m2)) {
+ w0 |= BIT_ULL(19);
+ w1 = vgetq_lane_u64(*senddesc23_w1, 0);
+ w1 &= ~0xFFFF000000000000UL;
+ if (unlikely(!tx_compl_ena)) {
+ m2->next = *extm;
+ *extm = m2;
+ } else {
+ sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
+ rte_memory_order_relaxed);
+ sqe_id = sqe_id & nb_desc_mask;
+ /* Set PNC */
+ w0 |= BIT_ULL(43);
+ w1 |= sqe_id << 48;
+ tx_compl_ptr[sqe_id] = m2;
+ *senddesc23_w1 = vsetq_lane_u64(w1, *senddesc23_w1, 0);
+ }
+ } else {
+ cookie = RTE_MBUF_DIRECT(m2) ? m2 : rte_mbuf_from_indirect(m2);
+ aura = (w0 >> 20) & 0xFFFFF;
+ w0 &= ~0xFFFFF00000UL;
+ w0 |= cnxk_nix_prefree_seg(m2, &aura) << 19;
+ w0 |= aura << 20;
+
+ if ((w0 & BIT_ULL(19)) == 0)
+ RTE_MEMPOOL_CHECK_COOKIES(cookie->pool, (void **)&cookie, 1, 0);
+ }
+ *senddesc23_w0 = vsetq_lane_u64(w0, *senddesc23_w0, 0);
+
+ /* mbuf3 */
+ w0 = vgetq_lane_u64(*senddesc23_w0, 1);
+ if (RTE_MBUF_HAS_EXTBUF(m3)) {
+ w0 |= BIT_ULL(19);
+ w1 = vgetq_lane_u64(*senddesc23_w1, 1);
+ w1 &= ~0xFFFF000000000000UL;
+ if (unlikely(!tx_compl_ena)) {
+ m3->next = *extm;
+ *extm = m3;
+ } else {
+ sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
+ rte_memory_order_relaxed);
+ sqe_id = sqe_id & nb_desc_mask;
+ /* Set PNC */
+ w0 |= BIT_ULL(43);
+ w1 |= sqe_id << 48;
+ tx_compl_ptr[sqe_id] = m3;
+ *senddesc23_w1 = vsetq_lane_u64(w1, *senddesc23_w1, 1);
+ }
+ } else {
+ cookie = RTE_MBUF_DIRECT(m3) ? m3 : rte_mbuf_from_indirect(m3);
+ aura = (w0 >> 20) & 0xFFFFF;
+ w0 &= ~0xFFFFF00000UL;
+ w0 |= cnxk_nix_prefree_seg(m3, &aura) << 19;
+ w0 |= aura << 20;
+
+ if ((w0 & BIT_ULL(19)) == 0)
+ RTE_MEMPOOL_CHECK_COOKIES(cookie->pool, (void **)&cookie, 1, 0);
+ }
+ *senddesc23_w0 = vsetq_lane_u64(w0, *senddesc23_w0, 1);
+#ifndef RTE_LIBRTE_MEMPOOL_DEBUG
+ RTE_SET_USED(cookie);
+#endif
+}
+#endif
+
static __rte_always_inline void
cn20k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
{
@@ -1351,6 +1715,1078 @@ cn20k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts
return pkts;
}
+#if defined(RTE_ARCH_ARM64)
+
+#define NIX_DESCS_PER_LOOP 4
+
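+/* Pick the LMT slot for the next packet: if the current line cannot hold
+ * 'dw' more 16-byte units, record the finished line's size into data128 at
+ * the running shift and start a fresh line; the write address is returned
+ * in *next.
+ */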
+static __rte_always_inline void
+cn20k_nix_lmt_next(uint8_t dw, uintptr_t laddr, uint8_t *lnum, uint8_t *loff, uint8_t *shift,
+ __uint128_t *data128, uintptr_t *next)
+{
+ /* Go to next line if we are out of space */
+ if ((*loff + (dw << 4)) > 128) {
+ *data128 = *data128 | (((__uint128_t)((*loff >> 4) - 1)) << *shift);
+ *shift = *shift + 3;
+ *loff = 0;
+ *lnum = *lnum + 1;
+ }
+
+ *next = (uintptr_t)LMT_OFF(laddr, *lnum, *loff);
+ *loff = *loff + (dw << 4);
+}
+
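+/* Copy one packet's prepared descriptors to its LMT/NIXTX slot in the order
+ * expected by hardware (hdr, ext, sg[, mem]) and mark the mempool cookie as
+ * "put" since the buffer is freed by NIX.
+ */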
+static __rte_always_inline void
+cn20k_nix_xmit_store(struct cn20k_eth_txq *txq, struct rte_mbuf *mbuf, struct rte_mbuf **extm,
+ uint8_t segdw, uintptr_t laddr, uint64x2_t cmd0, uint64x2_t cmd1,
+ uint64x2_t cmd2, uint64x2_t cmd3, const uint16_t flags)
+{
+ RTE_SET_USED(txq);
+ RTE_SET_USED(mbuf);
+ RTE_SET_USED(extm);
+ RTE_SET_USED(segdw);
+
+ if (flags & NIX_TX_NEED_EXT_HDR) {
+ /* Store the prepared send desc to LMT lines */
+ if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+ vst1q_u64(LMT_OFF(laddr, 0, 0), cmd0);
+ vst1q_u64(LMT_OFF(laddr, 0, 16), cmd2);
+ vst1q_u64(LMT_OFF(laddr, 0, 32), cmd1);
+ vst1q_u64(LMT_OFF(laddr, 0, 48), cmd3);
+ } else {
+ vst1q_u64(LMT_OFF(laddr, 0, 0), cmd0);
+ vst1q_u64(LMT_OFF(laddr, 0, 16), cmd2);
+ vst1q_u64(LMT_OFF(laddr, 0, 32), cmd1);
+ }
+ RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf, 1, 0);
+ } else {
+ /* Store the prepared send desc to LMT lines */
+ vst1q_u64(LMT_OFF(laddr, 0, 0), cmd0);
+ vst1q_u64(LMT_OFF(laddr, 0, 16), cmd1);
+ RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf, 1, 0);
+ }
+}
+
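+/* Vectorized Tx burst: build send descriptors for four packets per loop with
+ * NEON, write them to LMT lines and submit with STEORL. Security-offload
+ * packets are diverted to CPT LMT lines and submitted separately; any
+ * leftover (< 4) packets fall back to the scalar cn20k_nix_xmit_pkts().
+ */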
+static __rte_always_inline uint16_t
+cn20k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts, uint16_t pkts,
+ uint64_t *cmd, const uint16_t flags)
+{
+ uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
+ uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
+ uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP], cmd2[NIX_DESCS_PER_LOOP],
+ cmd3[NIX_DESCS_PER_LOOP];
+ uint16_t left, scalar, burst, i, lmt_id, c_lmt_id;
+ uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, pa;
+ uint64x2_t senddesc01_w0, senddesc23_w0;
+ uint64x2_t senddesc01_w1, senddesc23_w1;
+ uint64x2_t sendext01_w0, sendext23_w0;
+ uint64x2_t sendext01_w1, sendext23_w1;
+ uint64x2_t sendmem01_w0, sendmem23_w0;
+ uint64x2_t sendmem01_w1, sendmem23_w1;
+ uint8_t segdw[NIX_DESCS_PER_LOOP + 1];
+ uint64x2_t sgdesc01_w0, sgdesc23_w0;
+ uint64x2_t sgdesc01_w1, sgdesc23_w1;
+ struct cn20k_eth_txq *txq = tx_queue;
+ rte_iova_t io_addr = txq->io_addr;
+ uint8_t lnum, shift = 0, loff = 0;
+ uintptr_t laddr = txq->lmt_base;
+ uint8_t c_lnum, c_shft, c_loff;
+ uint64x2_t ltypes01, ltypes23;
+ uint64x2_t xtmp128, ytmp128;
+ uint64x2_t xmask01, xmask23;
+ uintptr_t c_laddr = laddr;
+ rte_iova_t c_io_addr;
+ uint64_t sa_base;
+ union wdata {
+ __uint128_t data128;
+ uint64_t data[2];
+ } wd;
+ struct rte_mbuf *extm = NULL;
+
+ if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+ handle_tx_completion_pkts(txq, flags & NIX_TX_VWQE_F);
+
+ if (!(flags & NIX_TX_VWQE_F)) {
+ scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
+ pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+ NIX_XMIT_FC_CHECK_RETURN(txq, pkts);
+ } else {
+ scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
+ pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+ }
+
+ if (!(flags & NIX_TX_VWQE_F)) {
+ senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
+ } else {
+ uint64_t w0 = (txq->send_hdr_w0 & 0xFFFFF00000000000) |
+ ((uint64_t)(cn20k_nix_tx_ext_subs(flags) + 1) << 40);
+
+ senddesc01_w0 = vdupq_n_u64(w0);
+ }
+ senddesc23_w0 = senddesc01_w0;
+
+ senddesc01_w1 = vdupq_n_u64(0);
+ senddesc23_w1 = senddesc01_w1;
+ if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
+ sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | (NIX_SENDLDTYPE_LDWB << 58) |
+ BIT_ULL(48));
+ else
+ sgdesc01_w0 = vdupq_n_u64((NIX_SUBDC_SG << 60) | BIT_ULL(48));
+ sgdesc23_w0 = sgdesc01_w0;
+
+ if (flags & NIX_TX_NEED_EXT_HDR) {
+ if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+ sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60) | BIT_ULL(15));
+ sendmem01_w0 = vdupq_n_u64((NIX_SUBDC_MEM << 60) |
+ (NIX_SENDMEMALG_SETTSTMP << 56));
+ sendmem23_w0 = sendmem01_w0;
+ sendmem01_w1 = vdupq_n_u64(txq->ts_mem);
+ sendmem23_w1 = sendmem01_w1;
+ } else {
+ sendext01_w0 = vdupq_n_u64((NIX_SUBDC_EXT << 60));
+ }
+ sendext23_w0 = sendext01_w0;
+
+ if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)
+ sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+ else
+ sendext01_w1 = vdupq_n_u64(0);
+ sendext23_w1 = sendext01_w1;
+ }
+
+ /* Get LMT base address and LMT ID as lcore id */
+ ROC_LMT_BASE_ID_GET(laddr, lmt_id);
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
+ ROC_LMT_CPT_BASE_ID_GET(c_laddr, c_lmt_id);
+ c_io_addr = txq->cpt_io_addr;
+ sa_base = txq->sa_base;
+ }
+
+ left = pkts;
+again:
+ /* Number of packets to prepare depends on offloads enabled. */
+ burst = left > cn20k_nix_pkts_per_vec_brst(flags) ? cn20k_nix_pkts_per_vec_brst(flags) :
+ left;
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
+ wd.data128 = 0;
+ shift = 16;
+ }
+ lnum = 0;
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
+ loff = 0;
+ c_loff = 0;
+ c_lnum = 0;
+ c_shft = 16;
+ }
+
+ for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
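+ /* Security path: stop early if fewer than four CPT LMT slots remain
+ * (two CPT instructions per line across the 16 lines).
+ */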
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F &&
+ (((int)((16 - c_lnum) << 1) - c_loff) < 4)) {
+ burst = i;
+ break;
+ }
+
+ /* Clear lower 32 bits of SEND_HDR_W0 and SEND_SG_W0 */
+ senddesc01_w0 = vbicq_u64(senddesc01_w0, vdupq_n_u64(0x800FFFFFFFF));
+ sgdesc01_w0 = vbicq_u64(sgdesc01_w0, vdupq_n_u64(0xFFFFFFFF));
+
+ senddesc23_w0 = senddesc01_w0;
+ sgdesc23_w0 = sgdesc01_w0;
+
+ /* Clear vlan enables. */
+ if (flags & NIX_TX_NEED_EXT_HDR) {
+ sendext01_w1 = vbicq_u64(sendext01_w1, vdupq_n_u64(0x3FFFF00FFFF00));
+ sendext23_w1 = sendext01_w1;
+ }
+
+ if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+ /* Reset send mem alg to SETTSTMP from SUB */
+ sendmem01_w0 = vbicq_u64(sendmem01_w0, vdupq_n_u64(BIT_ULL(59)));
+ /* Reset send mem address to default. */
+ sendmem01_w1 = vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
+ sendmem23_w0 = sendmem01_w0;
+ sendmem23_w1 = sendmem01_w1;
+ }
+
+ /* Move mbufs to iova */
+ mbuf0 = (uint64_t *)tx_pkts[0];
+ mbuf1 = (uint64_t *)tx_pkts[1];
+ mbuf2 = (uint64_t *)tx_pkts[2];
+ mbuf3 = (uint64_t *)tx_pkts[3];
+
+ /*
+ * Get mbufs, ol_flags, iova, pktlen, dataoff
+ * dataoff_iovaX.D[0] = iova,
+ * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
+ * len_olflagsX.D[0] = ol_flags,
+ * len_olflagsX.D[1](63:32) = mbuf->pkt_len
+ */
+ dataoff_iova0 =
+ vsetq_lane_u64(((struct rte_mbuf *)mbuf0)->data_off, vld1q_u64(mbuf0), 1);
+ len_olflags0 = vld1q_u64(mbuf0 + 3);
+ dataoff_iova1 =
+ vsetq_lane_u64(((struct rte_mbuf *)mbuf1)->data_off, vld1q_u64(mbuf1), 1);
+ len_olflags1 = vld1q_u64(mbuf1 + 3);
+ dataoff_iova2 =
+ vsetq_lane_u64(((struct rte_mbuf *)mbuf2)->data_off, vld1q_u64(mbuf2), 1);
+ len_olflags2 = vld1q_u64(mbuf2 + 3);
+ dataoff_iova3 =
+ vsetq_lane_u64(((struct rte_mbuf *)mbuf3)->data_off, vld1q_u64(mbuf3), 1);
+ len_olflags3 = vld1q_u64(mbuf3 + 3);
+
+ /* Move mbuf pointers to the pool field */
+ mbuf0 = (uint64_t *)((uintptr_t)mbuf0 + offsetof(struct rte_mbuf, pool));
+ mbuf1 = (uint64_t *)((uintptr_t)mbuf1 + offsetof(struct rte_mbuf, pool));
+ mbuf2 = (uint64_t *)((uintptr_t)mbuf2 + offsetof(struct rte_mbuf, pool));
+ mbuf3 = (uint64_t *)((uintptr_t)mbuf3 + offsetof(struct rte_mbuf, pool));
+
+ if (flags & (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F | NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
+ /* Get tx_offload for ol2, ol3, l2, l3 lengths */
+ /*
+ * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
+ * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
+ */
+
+ asm volatile("LD1 {%[a].D}[0],[%[in]]\n\t"
+ : [a] "+w"(senddesc01_w1)
+ : [in] "r"(mbuf0 + 2)
+ : "memory");
+
+ asm volatile("LD1 {%[a].D}[1],[%[in]]\n\t"
+ : [a] "+w"(senddesc01_w1)
+ : [in] "r"(mbuf1 + 2)
+ : "memory");
+
+ asm volatile("LD1 {%[b].D}[0],[%[in]]\n\t"
+ : [b] "+w"(senddesc23_w1)
+ : [in] "r"(mbuf2 + 2)
+ : "memory");
+
+ asm volatile("LD1 {%[b].D}[1],[%[in]]\n\t"
+ : [b] "+w"(senddesc23_w1)
+ : [in] "r"(mbuf3 + 2)
+ : "memory");
+
+ /* Get pool pointer alone */
+ mbuf0 = (uint64_t *)*mbuf0;
+ mbuf1 = (uint64_t *)*mbuf1;
+ mbuf2 = (uint64_t *)*mbuf2;
+ mbuf3 = (uint64_t *)*mbuf3;
+ } else {
+ /* Get pool pointer alone */
+ mbuf0 = (uint64_t *)*mbuf0;
+ mbuf1 = (uint64_t *)*mbuf1;
+ mbuf2 = (uint64_t *)*mbuf2;
+ mbuf3 = (uint64_t *)*mbuf3;
+ }
+
+ const uint8x16_t shuf_mask2 = {
+ 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ };
+ xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
+ ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
+
+ /*
+ * Pick only 16 bits of pktlen present at bits 63:32
+ * and place them at bits 15:0.
+ */
+ xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
+ ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
+
+ /* Add pairwise to get dataoff + iova in sgdesc_w1 */
+ sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
+ sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
+
+ /* Orr both sgdesc_w0 and senddesc_w0 with 16 bits of
+ * pktlen at 15:0 position.
+ */
+ sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
+ sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
+ senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
+ senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
+
+ /* Move mbuf to point to pool_id. */
+ mbuf0 = (uint64_t *)((uintptr_t)mbuf0 + offsetof(struct rte_mempool, pool_id));
+ mbuf1 = (uint64_t *)((uintptr_t)mbuf1 + offsetof(struct rte_mempool, pool_id));
+ mbuf2 = (uint64_t *)((uintptr_t)mbuf2 + offsetof(struct rte_mempool, pool_id));
+ mbuf3 = (uint64_t *)((uintptr_t)mbuf3 + offsetof(struct rte_mempool, pool_id));
+
+ if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
+ !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
+ /*
+ * Lookup table to translate ol_flags to
+ * il3/il4 types. But we still use ol3/ol4 types in
+ * senddesc_w1 as only one header processing is enabled.
+ */
+ const uint8x16_t tbl = {
+ /* [0-15] = il4type:il3type */
+ 0x04, /* none (IPv6 assumed) */
+ 0x14, /* RTE_MBUF_F_TX_TCP_CKSUM (IPv6 assumed) */
+ 0x24, /* RTE_MBUF_F_TX_SCTP_CKSUM (IPv6 assumed) */
+ 0x34, /* RTE_MBUF_F_TX_UDP_CKSUM (IPv6 assumed) */
+ 0x03, /* RTE_MBUF_F_TX_IP_CKSUM */
+ 0x13, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_TCP_CKSUM */
+ 0x23, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_SCTP_CKSUM */
+ 0x33, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_UDP_CKSUM */
+ 0x02, /* RTE_MBUF_F_TX_IPV4 */
+ 0x12, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_TCP_CKSUM */
+ 0x22, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_SCTP_CKSUM */
+ 0x32, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_UDP_CKSUM */
+ 0x03, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM */
+ 0x13, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_TCP_CKSUM
+ */
+ 0x23, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_SCTP_CKSUM
+ */
+ 0x33, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_UDP_CKSUM
+ */
+ };
+
+ /* Extract olflags to translate to iltypes */
+ xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+ ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+ /*
+ * E(47):L3_LEN(9):L2_LEN(7+z)
+ * E(47):L3_LEN(9):L2_LEN(7+z)
+ */
+ senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
+ senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
+
+ /* Move OLFLAGS bits 55:52 to 51:48
+ * with zeros prepended on the byte and rest
+ * don't care
+ */
+ xtmp128 = vshrq_n_u8(xtmp128, 4);
+ ytmp128 = vshrq_n_u8(ytmp128, 4);
+ /*
+ * E(48):L3_LEN(8):L2_LEN(z+7)
+ * E(48):L3_LEN(8):L2_LEN(z+7)
+ */
+ const int8x16_t tshft3 = {
+ -1, 0, 8, 8, 8, 8, 8, 8,
+ -1, 0, 8, 8, 8, 8, 8, 8,
+ };
+
+ senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
+ senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
+
+ /* Do the lookup */
+ ltypes01 = vqtbl1q_u8(tbl, xtmp128);
+ ltypes23 = vqtbl1q_u8(tbl, ytmp128);
+
+ /* Pick only relevant fields, i.e., bits 48:55 of iltype,
+ * and place it in ol3/ol4type of senddesc_w1
+ */
+ const uint8x16_t shuf_mask0 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
+ };
+
+ ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
+ ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
+
+ /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
+ * a [E(32):E(16):OL3(8):OL2(8)]
+ * a = a + (a << 8)
+ * a [E(32):E(16):(OL3+OL2):OL2]
+ * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
+ */
+ senddesc01_w1 = vaddq_u8(senddesc01_w1, vshlq_n_u16(senddesc01_w1, 8));
+ senddesc23_w1 = vaddq_u8(senddesc23_w1, vshlq_n_u16(senddesc23_w1, 8));
+
+ /* Move ltypes to senddesc*_w1 */
+ senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
+ senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
+ } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
+ (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
+ /*
+ * Lookup table to translate ol_flags to
+ * ol3/ol4 types.
+ */
+
+ const uint8x16_t tbl = {
+ /* [0-15] = ol4type:ol3type */
+ 0x00, /* none */
+ 0x03, /* OUTER_IP_CKSUM */
+ 0x02, /* OUTER_IPV4 */
+ 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
+ 0x04, /* OUTER_IPV6 */
+ 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
+ 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
+ 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
+ * OUTER_IP_CKSUM
+ */
+ 0x00, /* OUTER_UDP_CKSUM */
+ 0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
+ 0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
+ 0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
+ * OUTER_IP_CKSUM
+ */
+ 0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
+ 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
+ * OUTER_IP_CKSUM
+ */
+ 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
+ * OUTER_IPV4
+ */
+ 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
+ * OUTER_IPV4 | OUTER_IP_CKSUM
+ */
+ };
+
+ /* Extract olflags to translate to iltypes */
+ xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+ ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+ /*
+ * E(47):OL3_LEN(9):OL2_LEN(7+z)
+ * E(47):OL3_LEN(9):OL2_LEN(7+z)
+ */
+ const uint8x16_t shuf_mask5 = {
+ 0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ };
+ senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
+ senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
+
+ /* Extract outer ol flags only */
+ const uint64x2_t o_cksum_mask = {
+ 0x1C00020000000000,
+ 0x1C00020000000000,
+ };
+
+ xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
+ ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
+
+ /* Extract OUTER_UDP_CKSUM bit 41 and
+ * move it to bit 61
+ */
+
+ xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
+ ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
+
+ /* Shift oltype by 2 to start nibble from BIT(56)
+ * instead of BIT(58)
+ */
+ xtmp128 = vshrq_n_u8(xtmp128, 2);
+ ytmp128 = vshrq_n_u8(ytmp128, 2);
+ /*
+ * E(48):L3_LEN(8):L2_LEN(z+7)
+ * E(48):L3_LEN(8):L2_LEN(z+7)
+ */
+ const int8x16_t tshft3 = {
+ -1, 0, 8, 8, 8, 8, 8, 8,
+ -1, 0, 8, 8, 8, 8, 8, 8,
+ };
+
+ senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
+ senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
+
+ /* Do the lookup */
+ ltypes01 = vqtbl1q_u8(tbl, xtmp128);
+ ltypes23 = vqtbl1q_u8(tbl, ytmp128);
+
+ /* Pick only relevant fields, i.e., bits 56:63 of oltype,
+ * and place it in ol3/ol4type of senddesc_w1
+ */
+ const uint8x16_t shuf_mask0 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
+ };
+
+ ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
+ ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
+
+ /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
+ * a [E(32):E(16):OL3(8):OL2(8)]
+ * a = a + (a << 8)
+ * a [E(32):E(16):(OL3+OL2):OL2]
+ * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
+ */
+ senddesc01_w1 = vaddq_u8(senddesc01_w1, vshlq_n_u16(senddesc01_w1, 8));
+ senddesc23_w1 = vaddq_u8(senddesc23_w1, vshlq_n_u16(senddesc23_w1, 8));
+
+ /* Move ltypes to senddesc*_w1 */
+ senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
+ senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
+ } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
+ (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
+ /* Lookup table to translate ol_flags to
+ * ol4type, ol3type, il4type, il3type of senddesc_w1
+ */
+ const uint8x16x2_t tbl = {{
+ {
+ /* [0-15] = il4type:il3type */
+ 0x04, /* none (IPv6) */
+ 0x14, /* RTE_MBUF_F_TX_TCP_CKSUM (IPv6) */
+ 0x24, /* RTE_MBUF_F_TX_SCTP_CKSUM (IPv6) */
+ 0x34, /* RTE_MBUF_F_TX_UDP_CKSUM (IPv6) */
+ 0x03, /* RTE_MBUF_F_TX_IP_CKSUM */
+ 0x13, /* RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_TCP_CKSUM
+ */
+ 0x23, /* RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_SCTP_CKSUM
+ */
+ 0x33, /* RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_UDP_CKSUM
+ */
+ 0x02, /* RTE_MBUF_F_TX_IPV4 */
+ 0x12, /* RTE_MBUF_F_TX_IPV4 |
+ * RTE_MBUF_F_TX_TCP_CKSUM
+ */
+ 0x22, /* RTE_MBUF_F_TX_IPV4 |
+ * RTE_MBUF_F_TX_SCTP_CKSUM
+ */
+ 0x32, /* RTE_MBUF_F_TX_IPV4 |
+ * RTE_MBUF_F_TX_UDP_CKSUM
+ */
+ 0x03, /* RTE_MBUF_F_TX_IPV4 |
+ * RTE_MBUF_F_TX_IP_CKSUM
+ */
+ 0x13, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_TCP_CKSUM
+ */
+ 0x23, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_SCTP_CKSUM
+ */
+ 0x33, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
+ * RTE_MBUF_F_TX_UDP_CKSUM
+ */
+ },
+
+ {
+ /* [16-31] = ol4type:ol3type */
+ 0x00, /* none */
+ 0x03, /* OUTER_IP_CKSUM */
+ 0x02, /* OUTER_IPV4 */
+ 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
+ 0x04, /* OUTER_IPV6 */
+ 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
+ 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
+ 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
+ * OUTER_IP_CKSUM
+ */
+ 0x00, /* OUTER_UDP_CKSUM */
+ 0x33, /* OUTER_UDP_CKSUM |
+ * OUTER_IP_CKSUM
+ */
+ 0x32, /* OUTER_UDP_CKSUM |
+ * OUTER_IPV4
+ */
+ 0x33, /* OUTER_UDP_CKSUM |
+ * OUTER_IPV4 | OUTER_IP_CKSUM
+ */
+ 0x34, /* OUTER_UDP_CKSUM |
+ * OUTER_IPV6
+ */
+ 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
+ * OUTER_IP_CKSUM
+ */
+ 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
+ * OUTER_IPV4
+ */
+ 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
+ * OUTER_IPV4 | OUTER_IP_CKSUM
+ */
+ },
+ }};
+
+ /* Extract olflags to translate to oltype & iltype */
+ xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+ ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+ /*
+ * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
+ * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
+ */
+ const uint32x4_t tshft_4 = {
+ 1,
+ 0,
+ 1,
+ 0,
+ };
+ senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
+ senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
+
+ /*
+ * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
+ * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
+ */
+ const uint8x16_t shuf_mask5 = {
+ 0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
+ };
+ senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
+ senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
+
+ /* Extract outer and inner header ol_flags */
+ const uint64x2_t oi_cksum_mask = {
+ 0x1CF0020000000000,
+ 0x1CF0020000000000,
+ };
+
+ xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
+ ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
+
+ /* Extract OUTER_UDP_CKSUM bit 41 and
+ * move it to bit 61
+ */
+
+ xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
+ ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
+
+ /* Shift right oltype by 2 and iltype by 4
+ * to start oltype nibble from BIT(58)
+ * instead of BIT(56) and iltype nibble from BIT(48)
+ * instead of BIT(52).
+ */
+ const int8x16_t tshft5 = {
+ 8, 8, 8, 8, 8, 8, -4, -2,
+ 8, 8, 8, 8, 8, 8, -4, -2,
+ };
+
+ xtmp128 = vshlq_u8(xtmp128, tshft5);
+ ytmp128 = vshlq_u8(ytmp128, tshft5);
+ /*
+ * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
+ * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
+ */
+ const int8x16_t tshft3 = {
+ -1, 0, -1, 0, 0, 0, 0, 0,
+ -1, 0, -1, 0, 0, 0, 0, 0,
+ };
+
+ senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
+ senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
+
+ /* Mark Bit(4) of oltype */
+ const uint64x2_t oi_cksum_mask2 = {
+ 0x1000000000000000,
+ 0x1000000000000000,
+ };
+
+ xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
+ ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
+
+ /* Do the lookup */
+ ltypes01 = vqtbl2q_u8(tbl, xtmp128);
+ ltypes23 = vqtbl2q_u8(tbl, ytmp128);
+
+ /* Pick only relevant fields, i.e., bits 48:55 of iltype and
+ * bits 56:63 of oltype, and place them in the corresponding
+ * positions in senddesc_w1.
+ */
+ const uint8x16_t shuf_mask0 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
+ };
+
+ ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
+ ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
+
+ /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
+ * l3len, l2len, ol3len, ol2len.
+ * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
+ * a = a + (a << 8)
+ * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
+ * a = a + (a << 16)
+ * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
+ * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
+ */
+ senddesc01_w1 = vaddq_u8(senddesc01_w1, vshlq_n_u32(senddesc01_w1, 8));
+ senddesc23_w1 = vaddq_u8(senddesc23_w1, vshlq_n_u32(senddesc23_w1, 8));
+
+ /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
+ senddesc01_w1 = vaddq_u8(senddesc01_w1, vshlq_n_u32(senddesc01_w1, 16));
+ senddesc23_w1 = vaddq_u8(senddesc23_w1, vshlq_n_u32(senddesc23_w1, 16));
+
+ /* Move ltypes to senddesc*_w1 */
+ senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
+ senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
+ }
+
+ xmask01 = vdupq_n_u64(0);
+ xmask23 = xmask01;
+ asm volatile("LD1 {%[a].H}[0],[%[in]]\n\t"
+ : [a] "+w"(xmask01)
+ : [in] "r"(mbuf0)
+ : "memory");
+
+ asm volatile("LD1 {%[a].H}[4],[%[in]]\n\t"
+ : [a] "+w"(xmask01)
+ : [in] "r"(mbuf1)
+ : "memory");
+
+ asm volatile("LD1 {%[b].H}[0],[%[in]]\n\t"
+ : [b] "+w"(xmask23)
+ : [in] "r"(mbuf2)
+ : "memory");
+
+ asm volatile("LD1 {%[b].H}[4],[%[in]]\n\t"
+ : [b] "+w"(xmask23)
+ : [in] "r"(mbuf3)
+ : "memory");
+ xmask01 = vshlq_n_u64(xmask01, 20);
+ xmask23 = vshlq_n_u64(xmask23, 20);
+
+ senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
+ senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
+
+ if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
+ /* Tx ol_flag for vlan. */
+ const uint64x2_t olv = {RTE_MBUF_F_TX_VLAN, RTE_MBUF_F_TX_VLAN};
+ /* Bit enable for VLAN1 */
+ const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
+ /* Tx ol_flag for QnQ. */
+ const uint64x2_t olq = {RTE_MBUF_F_TX_QINQ, RTE_MBUF_F_TX_QINQ};
+ /* Bit enable for VLAN0 */
+ const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
+ /* Load VLAN values from packet. Outer is VLAN 0 */
+ uint64x2_t ext01 = {
+ ((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
+ ((uint64_t)tx_pkts[0]->vlan_tci) << 32,
+ ((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
+ ((uint64_t)tx_pkts[1]->vlan_tci) << 32,
+ };
+ uint64x2_t ext23 = {
+ ((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
+ ((uint64_t)tx_pkts[2]->vlan_tci) << 32,
+ ((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
+ ((uint64_t)tx_pkts[3]->vlan_tci) << 32,
+ };
+
+ /* Get ol_flags of the packets. */
+ xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+ ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+ /* ORR vlan outer/inner values into cmd. */
+ sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
+ sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
+
+ /* Test for offload enable bits and generate masks. */
+ xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv), mlv),
+ vandq_u64(vtstq_u64(xtmp128, olq), mlq));
+ ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv), mlv),
+ vandq_u64(vtstq_u64(ytmp128, olq), mlq));
+
+ /* Set vlan enable bits into cmd based on mask. */
+ sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
+ sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
+ }
+
+ if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+ /* Tx ol_flag for timestamp. */
+ const uint64x2_t olf = {RTE_MBUF_F_TX_IEEE1588_TMST,
+ RTE_MBUF_F_TX_IEEE1588_TMST};
+ /* Set send mem alg to SUB. */
+ const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
+ /* Increment send mem address by 8. */
+ const uint64x2_t addr = {0x8, 0x8};
+
+ xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+ ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+ /* Check if timestamp is requested and generate inverted
+ * mask as we need not make any changes to default cmd
+ * value.
+ */
+ xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
+ ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
+
+ /* Change send mem address to an 8 byte offset when
+ * TSTMP is disabled.
+ */
+ sendmem01_w1 = vaddq_u64(sendmem01_w1, vandq_u64(xtmp128, addr));
+ sendmem23_w1 = vaddq_u64(sendmem23_w1, vandq_u64(ytmp128, addr));
+ /* Change send mem alg to SUB when TSTMP is disabled. */
+ sendmem01_w0 = vorrq_u64(sendmem01_w0, vandq_u64(xtmp128, alg));
+ sendmem23_w0 = vorrq_u64(sendmem23_w0, vandq_u64(ytmp128, alg));
+
+ cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
+ cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
+ cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
+ cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
+ }
+
+ if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) && !(flags & NIX_TX_OFFLOAD_SECURITY_F)) {
+ /* Set don't free bit if reference count > 1 */
+ cn20k_nix_prefree_seg_vec(tx_pkts, &extm, txq, &senddesc01_w0,
+ &senddesc23_w0, &senddesc01_w1, &senddesc23_w1);
+ } else if (!(flags & NIX_TX_MULTI_SEG_F) && !(flags & NIX_TX_OFFLOAD_SECURITY_F)) {
+ /* Move mbufs to iova */
+ mbuf0 = (uint64_t *)tx_pkts[0];
+ mbuf1 = (uint64_t *)tx_pkts[1];
+ mbuf2 = (uint64_t *)tx_pkts[2];
+ mbuf3 = (uint64_t *)tx_pkts[3];
+
+ /* Mark mempool object as "put" since
+ * it is freed by NIX
+ */
+ RTE_MEMPOOL_CHECK_COOKIES(((struct rte_mbuf *)mbuf0)->pool, (void **)&mbuf0,
+ 1, 0);
+
+ RTE_MEMPOOL_CHECK_COOKIES(((struct rte_mbuf *)mbuf1)->pool, (void **)&mbuf1,
+ 1, 0);
+
+ RTE_MEMPOOL_CHECK_COOKIES(((struct rte_mbuf *)mbuf2)->pool, (void **)&mbuf2,
+ 1, 0);
+
+ RTE_MEMPOOL_CHECK_COOKIES(((struct rte_mbuf *)mbuf3)->pool, (void **)&mbuf3,
+ 1, 0);
+ }
+
+ /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
+ cmd0[0] = vzip1q_u64(senddesc01_w0, senddesc01_w1);
+ cmd0[1] = vzip2q_u64(senddesc01_w0, senddesc01_w1);
+ cmd0[2] = vzip1q_u64(senddesc23_w0, senddesc23_w1);
+ cmd0[3] = vzip2q_u64(senddesc23_w0, senddesc23_w1);
+
+ cmd1[0] = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
+ cmd1[1] = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
+ cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
+ cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
+
+ if (flags & NIX_TX_NEED_EXT_HDR) {
+ cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
+ cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
+ cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
+ cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
+ }
+
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
+ const uint64x2_t olf = {RTE_MBUF_F_TX_SEC_OFFLOAD,
+ RTE_MBUF_F_TX_SEC_OFFLOAD};
+ uintptr_t next;
+ uint8_t dw;
+
+ /* Extract ol_flags. */
+ xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+ ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+ xtmp128 = vtstq_u64(olf, xtmp128);
+ ytmp128 = vtstq_u64(olf, ytmp128);
+
+ /* Process mbuf0 */
+ dw = cn20k_nix_tx_dwords(flags, segdw[0]);
+ if (vgetq_lane_u64(xtmp128, 0))
+ cn20k_nix_prep_sec_vec(tx_pkts[0], &cmd0[0], &cmd1[0], &next,
+ c_laddr, &c_lnum, &c_loff, &c_shft, sa_base,
+ flags);
+ else
+ cn20k_nix_lmt_next(dw, laddr, &lnum, &loff, &shift, &wd.data128,
+ &next);
+
+ /* Store mbuf0 to LMTLINE/CPT NIXTX area */
+ cn20k_nix_xmit_store(txq, tx_pkts[0], &extm, segdw[0], next, cmd0[0],
+ cmd1[0], cmd2[0], cmd3[0], flags);
+
+ /* Process mbuf1 */
+ dw = cn20k_nix_tx_dwords(flags, segdw[1]);
+ if (vgetq_lane_u64(xtmp128, 1))
+ cn20k_nix_prep_sec_vec(tx_pkts[1], &cmd0[1], &cmd1[1], &next,
+ c_laddr, &c_lnum, &c_loff, &c_shft, sa_base,
+ flags);
+ else
+ cn20k_nix_lmt_next(dw, laddr, &lnum, &loff, &shift, &wd.data128,
+ &next);
+
+ /* Store mbuf1 to LMTLINE/CPT NIXTX area */
+ cn20k_nix_xmit_store(txq, tx_pkts[1], &extm, segdw[1], next, cmd0[1],
+ cmd1[1], cmd2[1], cmd3[1], flags);
+
+ /* Process mbuf2 */
+ dw = cn20k_nix_tx_dwords(flags, segdw[2]);
+ if (vgetq_lane_u64(ytmp128, 0))
+ cn20k_nix_prep_sec_vec(tx_pkts[2], &cmd0[2], &cmd1[2], &next,
+ c_laddr, &c_lnum, &c_loff, &c_shft, sa_base,
+ flags);
+ else
+ cn20k_nix_lmt_next(dw, laddr, &lnum, &loff, &shift, &wd.data128,
+ &next);
+
+ /* Store mbuf2 to LMTLINE/CPT NIXTX area */
+ cn20k_nix_xmit_store(txq, tx_pkts[2], &extm, segdw[2], next, cmd0[2],
+ cmd1[2], cmd2[2], cmd3[2], flags);
+
+ /* Process mbuf3 */
+ dw = cn20k_nix_tx_dwords(flags, segdw[3]);
+ if (vgetq_lane_u64(ytmp128, 1))
+ cn20k_nix_prep_sec_vec(tx_pkts[3], &cmd0[3], &cmd1[3], &next,
+ c_laddr, &c_lnum, &c_loff, &c_shft, sa_base,
+ flags);
+ else
+ cn20k_nix_lmt_next(dw, laddr, &lnum, &loff, &shift, &wd.data128,
+ &next);
+
+ /* Store mbuf3 to LMTLINE/CPT NIXTX area */
+ cn20k_nix_xmit_store(txq, tx_pkts[3], &extm, segdw[3], next, cmd0[3],
+ cmd1[3], cmd2[3], cmd3[3], flags);
+
+ } else if (flags & NIX_TX_NEED_EXT_HDR) {
+ /* Store the prepared send desc to LMT lines */
+ if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+ vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[1]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[1]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[1]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[1]);
+ lnum += 1;
+ vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[3]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[3]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[3]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[3]);
+ } else {
+ vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
+ lnum += 1;
+ vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+ }
+ lnum += 1;
+ } else {
+ /* Store the prepared send desc to LMT lines */
+ vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
+ vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
+ lnum += 1;
+ }
+
+ tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
+ }
+
+ /* Round up lnum to the last line if it is partial */
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
+ lnum = lnum + !!loff;
+ wd.data128 = wd.data128 | (((__uint128_t)(((loff >> 4) - 1) & 0x7) << shift));
+ }
+
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F)
+ wd.data[0] >>= 16;
+
+ if ((flags & NIX_TX_VWQE_F) && !(ws[3] & BIT_ULL(35)))
+ ws[3] = roc_sso_hws_head_wait(ws[0]);
+
+ left -= burst;
+
+ /* Submit CPT instructions if any */
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
+ uint16_t sec_pkts = (c_lnum << 1) + c_loff;
+
+ if (flags & NIX_TX_VWQE_F)
+ cn20k_nix_vwqe_wait_fc(txq, sec_pkts);
+ cn20k_nix_sec_fc_wait(txq, sec_pkts);
+ cn20k_nix_sec_steorl(c_io_addr, c_lmt_id, c_lnum, c_loff, c_shft);
+ }
+
+ /* Trigger LMTST */
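+ /* More than 16 LMT lines used: issue two STEORLs, the first for lines
+ * 0-15 and the second for the remainder starting at lmt_id + 16.
+ */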
+ if (lnum > 16) {
+ if (!(flags & NIX_TX_OFFLOAD_SECURITY_F))
+ wd.data[0] = cn20k_nix_tx_steor_vec_data(flags);
+
+ pa = io_addr | (wd.data[0] & 0x7) << 4;
+ wd.data[0] &= ~0x7ULL;
+
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F)
+ wd.data[0] <<= 16;
+
+ wd.data[0] |= (15ULL << 12);
+ wd.data[0] |= (uint64_t)lmt_id;
+
+ if (flags & NIX_TX_VWQE_F)
+ cn20k_nix_vwqe_wait_fc(txq, cn20k_nix_pkts_per_vec_brst(flags) >> 1);
+ /* STEOR0 */
+ roc_lmt_submit_steorl(wd.data[0], pa);
+
+ if (!(flags & NIX_TX_OFFLOAD_SECURITY_F))
+ wd.data[1] = cn20k_nix_tx_steor_vec_data(flags);
+
+ pa = io_addr | (wd.data[1] & 0x7) << 4;
+ wd.data[1] &= ~0x7ULL;
+
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F)
+ wd.data[1] <<= 16;
+
+ wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
+ wd.data[1] |= (uint64_t)(lmt_id + 16);
+
+ if (flags & NIX_TX_VWQE_F) {
+ cn20k_nix_vwqe_wait_fc(txq,
+ burst - (cn20k_nix_pkts_per_vec_brst(flags) >> 1));
+ }
+ /* STEOR1 */
+ roc_lmt_submit_steorl(wd.data[1], pa);
+ } else if (lnum) {
+ if (!(flags & NIX_TX_OFFLOAD_SECURITY_F))
+ wd.data[0] = cn20k_nix_tx_steor_vec_data(flags);
+
+ pa = io_addr | (wd.data[0] & 0x7) << 4;
+ wd.data[0] &= ~0x7ULL;
+
+ if (flags & NIX_TX_OFFLOAD_SECURITY_F)
+ wd.data[0] <<= 16;
+
+ wd.data[0] |= ((uint64_t)(lnum - 1)) << 12;
+ wd.data[0] |= (uint64_t)lmt_id;
+
+ if (flags & NIX_TX_VWQE_F)
+ cn20k_nix_vwqe_wait_fc(txq, burst);
+ /* STEOR0 */
+ roc_lmt_submit_steorl(wd.data[0], pa);
+ }
+
+ rte_io_wmb();
+ if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena) {
+ cn20k_nix_free_extmbuf(extm);
+ extm = NULL;
+ }
+
+ if (left)
+ goto again;
+
+ if (unlikely(scalar))
+ pkts += cn20k_nix_xmit_pkts(tx_queue, ws, tx_pkts, scalar, cmd, flags);
+ return pkts;
+}
+
+#else
+static __rte_always_inline uint16_t
+cn20k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts, uint16_t pkts,
+ uint64_t *cmd, const uint16_t flags)
+{
+ RTE_SET_USED(ws);
+ RTE_SET_USED(tx_queue);
+ RTE_SET_USED(tx_pkts);
+ RTE_SET_USED(pkts);
+ RTE_SET_USED(cmd);
+ RTE_SET_USED(flags);
+ return 0;
+}
+#endif
+
#define L3L4CSUM_F NIX_TX_OFFLOAD_L3_L4_CSUM_F
#define OL3OL4CSUM_F NIX_TX_OFFLOAD_OL3_OL4_CSUM_F
#define VLAN_F NIX_TX_OFFLOAD_VLAN_QINQ_F
@@ -1567,10 +3003,11 @@ NIX_TX_FASTPATH_MODES
uint16_t __rte_noinline __rte_hot fn(void *tx_queue, struct rte_mbuf **tx_pkts, \
uint16_t pkts) \
{ \
- RTE_SET_USED(tx_queue); \
- RTE_SET_USED(tx_pkts); \
- RTE_SET_USED(pkts); \
- return 0; \
+ uint64_t cmd[sz]; \
+ /* For TSO, inner checksum is a must */ \
+ if (((flags) & NIX_TX_OFFLOAD_TSO_F) && !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
+ return 0; \
+ return cn20k_nix_xmit_pkts_vector(tx_queue, NULL, tx_pkts, pkts, cmd, (flags)); \
}
#define NIX_TX_XMIT_VEC_MSEG(fn, sz, flags) \
--
2.34.1