DPDK patches and discussions
* [PATCH 0/3] rewrite fastpath routines
@ 2023-10-11  1:50 Vamsi Attunuru
  2023-10-11  1:50 ` [PATCH 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11  1:50 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

This series adds new fastpath routines for cn10k & cn9k endpoint
devices and supports the 32B Tx descriptor format, which improves
performance.
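
For context, a condensed sketch (not verbatim from the diffs below) of
what the 32B format buys on the Tx hot path: a command is just a
precomputed instruction header plus the packet's IOVA, so a descriptor
can be filled with two stores.

    /* Condensed sketch of the 32B Tx command fill (see patch 3 for the
     * real code); iq->partial_ih holds header bits precomputed at queue
     * setup time, e.g. the pkind.
     */
    struct cnxk_ep_instr_32B *iqcmd;

    iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * iq->desc_size));
    iqcmd->ih.u64 = iq->partial_ih | rte_pktmbuf_data_len(m); /* header | length */
    iqcmd->dptr   = rte_mbuf_data_iova(m);                    /* packet IOVA */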

Shijith Thotton (1):
  net/octeon_ep: support 32B IQ descriptor size

Vamsi Attunuru (2):
  net/octeon_ep: clean up receive routine
  net/octeon_ep: add new fastpath routines

 drivers/net/octeon_ep/cnxk_ep_rx.c    | 309 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 209 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |  12 +-
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |  11 +-
 drivers/net/octeon_ep/otx_ep_common.h | 127 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   | 263 +++++++---------------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 drivers/net/octeon_ep/otx_ep_vf.c     |   8 +
 11 files changed, 804 insertions(+), 257 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx.c
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_tx.c

-- 
2.25.1



* [PATCH 1/3] net/octeon_ep: support 32B IQ descriptor size
  2023-10-11  1:50 [PATCH 0/3] rewrite fastpath routines Vamsi Attunuru
@ 2023-10-11  1:50 ` Vamsi Attunuru
  2023-10-11  1:50 ` [PATCH 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11  1:50 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton

From: Shijith Thotton <sthotton@marvell.com>

Update input queue setup to take the descriptor size from the driver
configuration. The default instruction size for otx2 and cnxk devices
has been updated to 32 bytes.
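
In outline (the real hunks follow): the IS_64B bit of the ring's
IN_CONTROL register is cleared when the configured instruction type is
32 bytes, the chosen size is cached in the queue, and the command copy
then uses that cached size instead of a hard-coded 64-byte stride.

    /* Sketch only, using the names from the hunks below (cnxk shown). */
    if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
        reg_val &= ~CNXK_EP_R_IN_CTL_IS_64B;    /* 32B instructions */
    else
        reg_val |= CNXK_EP_R_IN_CTL_IS_64B;     /* keep 64B */
    oct_ep_write64(reg_val, otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(iq_no));
    iq->desc_size = otx_ep->conf->iq.instr_type;

    /* ... later, when posting a command (otx_ep_rxtx.c hunk): */
    iqptr = iq->base_addr + (iq->host_write_index * iq->desc_size);
    rte_memcpy(iqptr, iqcmd, iq->desc_size);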

Signed-off-by: Shijith Thotton <sthotton@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx2_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx_ep_common.h |  4 ++++
 drivers/net/octeon_ep/otx_ep_rxtx.c   |  8 +++-----
 drivers/net/octeon_ep/otx_ep_vf.c     |  8 ++++++++
 5 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 92c2d2ca5c..7b3669fe0c 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -106,6 +106,14 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(CNXK_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= CNXK_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_RSIZE(iq_no));
@@ -354,7 +362,7 @@ static const struct otx_ep_config default_cnxk_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index ced3a415a5..f72b8d25d7 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -256,6 +256,14 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(SDP_VF_R_IN_CTL_IS_64B);
+	else
+		reg_val |= SDP_VF_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + SDP_VF_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_RSIZE(iq_no));
@@ -500,7 +508,7 @@ static const struct otx_ep_config default_otx2_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index c150cbe619..90e059cad0 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -11,6 +11,7 @@
 
 #define OTX_EP_MAX_RINGS_PER_VF        (8)
 #define OTX_EP_CFG_IO_QUEUES        OTX_EP_MAX_RINGS_PER_VF
+#define OTX_EP_32BYTE_INSTR         (32)
 #define OTX_EP_64BYTE_INSTR         (64)
 /*
  * Backpressure for SDP is configured on Octeon, and the minimum queue sizes
@@ -215,6 +216,9 @@ struct otx_ep_instr_queue {
 	/* Number of  descriptors in this ring. */
 	uint32_t nb_desc;
 
+	/* Size of the descriptor. */
+	uint8_t desc_size;
+
 	/* Input ring index, where the driver should write the next packet */
 	uint32_t host_write_index;
 
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index b37fc8109f..5b759d759b 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -484,7 +484,7 @@ otx_ep_ring_doorbell(struct otx_ep_device *otx_ep __rte_unused,
 static inline int
 post_iqcmd(struct otx_ep_instr_queue *iq, uint8_t *iqcmd)
 {
-	uint8_t *iqptr, cmdsize;
+	uint8_t *iqptr;
 
 	/* This ensures that the read index does not wrap around to
 	 * the same position if queue gets full before OCTEON 9 could
@@ -494,10 +494,8 @@ post_iqcmd(struct otx_ep_instr_queue *iq, uint8_t *iqcmd)
 		return OTX_EP_IQ_SEND_FAILED;
 
 	/* Copy cmd into iq */
-	cmdsize = 64;
-	iqptr   = iq->base_addr + (iq->host_write_index << 6);
-
-	rte_memcpy(iqptr, iqcmd, cmdsize);
+	iqptr = iq->base_addr + (iq->host_write_index * iq->desc_size);
+	rte_memcpy(iqptr, iqcmd, iq->desc_size);
 
 	/* Increment the host write index */
 	iq->host_write_index =
diff --git a/drivers/net/octeon_ep/otx_ep_vf.c b/drivers/net/octeon_ep/otx_ep_vf.c
index 4f3538146b..236b7a874c 100644
--- a/drivers/net/octeon_ep/otx_ep_vf.c
+++ b/drivers/net/octeon_ep/otx_ep_vf.c
@@ -120,6 +120,14 @@ otx_ep_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 			return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (iq->desc_size == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(OTX_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= OTX_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + OTX_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	otx_ep_write64(iq->base_addr_dma, otx_ep->hw_addr,
 		       OTX_EP_R_IN_INSTR_BADDR(iq_no));
-- 
2.25.1



* [PATCH 2/3] net/octeon_ep: clean up receive routine
  2023-10-11  1:50 [PATCH 0/3] rewrite fastpath routines Vamsi Attunuru
  2023-10-11  1:50 ` [PATCH 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
@ 2023-10-11  1:50 ` Vamsi Attunuru
  2023-10-11  1:50 ` [PATCH 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
  2023-10-11  8:36 ` [PATCH v2 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11  1:50 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Patch improves the Rx routine and the packet count update routines;
the packet count update routines need to drain in-flight ISM memory
updates while decrementing the packet count register.
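
In outline (the full hunks are below), the drain writes the consumed
count back to the hardware register and then keeps re-requesting an ISM
update until the host-memory copy falls below the value just
acknowledged, so a stale in-flight write cannot be misread as new
packets.

    /* Drain outline, shared by the IQ and DROQ counter paths.
     * val holds the count just read from the ISM location.
     */
    rte_write32(val, droq->pkts_sent_reg);      /* subtract consumed count */
    rte_mb();

    rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
    while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
        /* re-request until the pending ISM write has landed */
        rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
        rte_mb();
    }
    droq->pkts_sent_ism_prev = 0;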

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/otx_ep_rxtx.c | 162 ++++++++++++----------------
 1 file changed, 68 insertions(+), 94 deletions(-)

diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index 5b759d759b..ea7c9a5d62 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -442,7 +442,14 @@ otx_vf_update_read_index(struct otx_ep_instr_queue *iq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, iq->inst_cnt_reg);
-		*iq->inst_cnt_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (__atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
 		iq->inst_cnt_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
@@ -565,9 +572,7 @@ prepare_xmit_gather_list(struct otx_ep_instr_queue *iq, struct rte_mbuf *m, uint
 
 	finfo = &iq->req_list[iq->host_write_index].finfo;
 	*dptr = rte_mem_virt2iova(finfo->g.sg);
-	ih->s.tlen = pkt_len + ih->s.fsz;
-	ih->s.gsz = frags;
-	ih->s.gather = 1;
+	ih->u64 |= ((1ULL << 62) | ((uint64_t)frags << 48) | (pkt_len + ih->s.fsz));
 
 	while (frags--) {
 		finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
@@ -750,36 +755,26 @@ otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
-	struct otx_ep_droq_desc *desc_ring;
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
 	struct otx_ep_droq_info *info;
 	struct rte_mbuf *buf = NULL;
 	uint32_t desc_refilled = 0;
 
-	desc_ring = droq->desc_ring;
-
 	while (droq->refill_count && (desc_refilled < droq->nb_desc)) {
-		/* If a valid buffer exists (happens if there is no dispatch),
-		 * reuse the buffer, else allocate.
-		 */
-		if (droq->recv_buf_list[droq->refill_idx] != NULL)
-			break;
-
 		buf = rte_pktmbuf_alloc(droq->mpool);
 		/* If a buffer could not be allocated, no point in
 		 * continuing
 		 */
-		if (buf == NULL) {
+		if (unlikely(!buf)) {
 			droq->stats.rx_alloc_failure++;
 			break;
 		}
 		info = rte_pktmbuf_mtod(buf, struct otx_ep_droq_info *);
-		memset(info, 0, sizeof(*info));
+		info->length = 0;
 
 		droq->recv_buf_list[droq->refill_idx] = buf;
 		desc_ring[droq->refill_idx].buffer_ptr =
 					rte_mbuf_data_iova_default(buf);
-
-
 		droq->refill_idx = otx_ep_incr_index(droq->refill_idx, 1,
 				droq->nb_desc);
 
@@ -791,21 +786,18 @@ otx_ep_droq_refill(struct otx_ep_droq *droq)
 }
 
 static struct rte_mbuf *
-otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
-			struct otx_ep_droq *droq, int next_fetch)
+otx_ep_droq_read_packet(struct otx_ep_device *otx_ep, struct otx_ep_droq *droq, int next_fetch)
 {
 	volatile struct otx_ep_droq_info *info;
-	struct rte_mbuf *droq_pkt2 = NULL;
-	struct rte_mbuf *droq_pkt = NULL;
-	struct rte_net_hdr_lens hdr_lens;
-	struct otx_ep_droq_info *info2;
+	struct rte_mbuf *mbuf_next = NULL;
+	struct rte_mbuf *mbuf = NULL;
 	uint64_t total_pkt_len;
 	uint32_t pkt_len = 0;
 	int next_idx;
 
-	droq_pkt  = droq->recv_buf_list[droq->read_idx];
-	droq_pkt2  = droq->recv_buf_list[droq->read_idx];
-	info = rte_pktmbuf_mtod(droq_pkt, struct otx_ep_droq_info *);
+	mbuf = droq->recv_buf_list[droq->read_idx];
+	info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
 	/* make sure info is available */
 	rte_rmb();
 	if (unlikely(!info->length)) {
@@ -826,32 +818,25 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 			assert(0);
 		}
 	}
+
 	if (next_fetch) {
 		next_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
-		droq_pkt2  = droq->recv_buf_list[next_idx];
-		info2 = rte_pktmbuf_mtod(droq_pkt2, struct otx_ep_droq_info *);
-		rte_prefetch_non_temporal((const void *)info2);
+		mbuf_next = droq->recv_buf_list[next_idx];
+		rte_prefetch0(rte_pktmbuf_mtod(mbuf_next, void *));
 	}
 
-	info->length = rte_bswap64(info->length);
+	info->length = rte_bswap16(info->length >> 48);
 	/* Deduce the actual data size */
 	total_pkt_len = info->length + OTX_EP_INFO_SIZE;
 	if (total_pkt_len <= droq->buffer_size) {
-		droq_pkt  = droq->recv_buf_list[droq->read_idx];
-		if (likely(droq_pkt != NULL)) {
-			droq_pkt->data_off += OTX_EP_INFO_SIZE;
-			/* otx_ep_dbg("OQ: pkt_len[%ld], buffer_size %d\n",
-			 * (long)info->length, droq->buffer_size);
-			 */
-			pkt_len = (uint32_t)info->length;
-			droq_pkt->pkt_len  = pkt_len;
-			droq_pkt->data_len  = pkt_len;
-			droq_pkt->port = otx_ep->port_id;
-			droq->recv_buf_list[droq->read_idx] = NULL;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
-			droq->refill_count++;
-		}
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		pkt_len = (uint32_t)info->length;
+		mbuf->pkt_len  = pkt_len;
+		mbuf->data_len  = pkt_len;
+		mbuf->port = otx_ep->port_id;
+		droq->recv_buf_list[droq->read_idx] = NULL;
+		droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
+		droq->refill_count++;
 	} else {
 		struct rte_mbuf *first_buf = NULL;
 		struct rte_mbuf *last_buf = NULL;
@@ -863,61 +848,50 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 		while (pkt_len < total_pkt_len) {
 			int cpy_len = 0;
 
-			cpy_len = ((pkt_len + droq->buffer_size) >
-					total_pkt_len)
-					? ((uint32_t)total_pkt_len -
-						pkt_len)
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len)
 					: droq->buffer_size;
 
-			droq_pkt = droq->recv_buf_list[droq->read_idx];
+			mbuf = droq->recv_buf_list[droq->read_idx];
 			droq->recv_buf_list[droq->read_idx] = NULL;
 
-			if (likely(droq_pkt != NULL)) {
+			if (likely(mbuf)) {
 				/* Note the first seg */
 				if (!pkt_len)
-					first_buf = droq_pkt;
+					first_buf = mbuf;
 
-				droq_pkt->port = otx_ep->port_id;
+				mbuf->port = otx_ep->port_id;
 				if (!pkt_len) {
-					droq_pkt->data_off +=
-						OTX_EP_INFO_SIZE;
-					droq_pkt->pkt_len =
-						cpy_len - OTX_EP_INFO_SIZE;
-					droq_pkt->data_len =
-						cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_off += OTX_EP_INFO_SIZE;
+					mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
 				} else {
-					droq_pkt->pkt_len = cpy_len;
-					droq_pkt->data_len = cpy_len;
+					mbuf->pkt_len = cpy_len;
+					mbuf->data_len = cpy_len;
 				}
 
 				if (pkt_len) {
 					first_buf->nb_segs++;
-					first_buf->pkt_len += droq_pkt->pkt_len;
+					first_buf->pkt_len += mbuf->pkt_len;
 				}
 
 				if (last_buf)
-					last_buf->next = droq_pkt;
+					last_buf->next = mbuf;
 
-				last_buf = droq_pkt;
+				last_buf = mbuf;
 			} else {
 				otx_ep_err("no buf\n");
 				assert(0);
 			}
 
 			pkt_len += cpy_len;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
 			droq->refill_count++;
 		}
-		droq_pkt = first_buf;
+		mbuf = first_buf;
 	}
-	droq_pkt->packet_type = rte_net_get_ptype(droq_pkt, &hdr_lens,
-					RTE_PTYPE_ALL_MASK);
-	droq_pkt->l2_len = hdr_lens.l2_len;
-	droq_pkt->l3_len = hdr_lens.l3_len;
-	droq_pkt->l4_len = hdr_lens.l4_len;
 
-	return droq_pkt;
+	return mbuf;
 }
 
 static inline uint32_t
@@ -941,7 +915,14 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, droq->pkts_sent_reg);
-		*droq->pkts_sent_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
 		droq->pkts_sent_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
@@ -950,36 +931,30 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 	return new_pkts;
 }
 
+static inline int32_t __rte_hot
+otx_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (unlikely(droq->pkts_pending < nb_pkts))
+		otx_ep_check_droq_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
 /* Check for response arrival from OCTEON 9
  * returns number of requests completed
  */
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget)
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	struct otx_ep_droq *droq = rx_queue;
 	struct otx_ep_device *otx_ep;
 	struct rte_mbuf *oq_pkt;
-
-	uint32_t pkts = 0;
+	uint16_t pkts, new_pkts;
 	uint32_t valid_pkts = 0;
-	uint32_t new_pkts = 0;
 	int next_fetch;
 
 	otx_ep = droq->otx_ep_dev;
-
-	if (droq->pkts_pending > budget) {
-		new_pkts = budget;
-	} else {
-		new_pkts = droq->pkts_pending;
-		new_pkts += otx_ep_check_droq_pkts(droq);
-		if (new_pkts > budget)
-			new_pkts = budget;
-	}
-
-	if (!new_pkts)
-		goto update_credit; /* No pkts at this moment */
+	new_pkts = otx_ep_rx_pkts_to_process(droq, nb_pkts);
 
 	for (pkts = 0; pkts < new_pkts; pkts++) {
 		/* Push the received pkt to application */
@@ -1004,7 +979,6 @@ otx_ep_recv_pkts(void *rx_queue,
 	droq->pkts_pending -= pkts;
 
 	/* Refill DROQ buffers */
-update_credit:
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		int desc_refilled = otx_ep_droq_refill(droq);
 
@@ -1012,7 +986,7 @@ otx_ep_recv_pkts(void *rx_queue,
 		 * that when we update the credits the data in memory is
 		 * accurate.
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		rte_write32(desc_refilled, droq->pkts_credit_reg);
 	} else {
 		/*
-- 
2.25.1



* [PATCH 3/3] net/octeon_ep: add new fastpath routines
  2023-10-11  1:50 [PATCH 0/3] rewrite fastpath routines Vamsi Attunuru
  2023-10-11  1:50 ` [PATCH 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
  2023-10-11  1:50 ` [PATCH 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
@ 2023-10-11  1:50 ` Vamsi Attunuru
  2023-10-11  8:36 ` [PATCH v2 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11  1:50 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Adds new fastpath routines for cn10k & cn9k endpoint
devices and assigns them based on the offload flags.

Patch also adds miscellaneous changes to improve
performance and code readability.
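
For reference, the Rx selection (condensed from otx_ep_set_rx_func() in
the ethdev diff below) keys off the chip generation and the SCATTER
offload; Tx selection mirrors it with the MULTI_SEGS offload.

    /* Condensed sketch; cn9k and legacy otx follow the same pattern
     * with their own burst functions.
     */
    if (otx_epvf->chip_gen == OTX_EP_CN10XX) {
        eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
        if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
            eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
    }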

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx.c    | 309 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 209 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |   2 +
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |   1 +
 drivers/net/octeon_ep/otx_ep_common.h | 125 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   |  93 +-------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 10 files changed, 704 insertions(+), 157 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
new file mode 100644
index 0000000000..74f0011283
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "otx_ep_common.h"
+#include "otx2_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static inline int
+cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
+{
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t refill_idx = droq->refill_idx;
+	struct rte_mbuf *buf;
+	uint32_t i;
+	int rc;
+
+	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	if (unlikely(rc)) {
+		droq->stats.rx_alloc_failure++;
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		buf = recv_buf_list[refill_idx];
+		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
+		refill_idx++;
+	}
+
+	droq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc);
+	droq->refill_count -= count;
+
+	return 0;
+}
+
+static inline void
+cnxk_ep_rx_refill(struct otx_ep_droq *droq)
+{
+	uint32_t desc_refilled = 0, count;
+	uint32_t nb_desc = droq->nb_desc;
+	uint32_t refill_idx = droq->refill_idx;
+	int rc;
+
+	if (unlikely(droq->read_idx == refill_idx))
+		return;
+
+	if (refill_idx < droq->read_idx) {
+		count = droq->read_idx - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled = count;
+	} else {
+		count = nb_desc - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+
+		desc_refilled = count;
+		count = droq->read_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled += count;
+	}
+
+	/* Flush the droq descriptor data to memory to be sure
+	 * that when we update the credits the data in memory is
+	 * accurate.
+	 */
+	rte_io_wmb();
+	rte_write32(desc_refilled, droq->pkts_credit_reg);
+}
+
+static inline uint32_t
+cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq)
+{
+	uint32_t new_pkts;
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = __atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED);
+	new_pkts = val - droq->pkts_sent_ism_prev;
+	droq->pkts_sent_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, droq->pkts_sent_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
+		droq->pkts_sent_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+	droq->pkts_pending += new_pkts;
+
+	return new_pkts;
+}
+
+static inline int16_t __rte_hot
+cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (droq->pkts_pending < nb_pkts)
+		cnxk_ep_check_rx_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *mbuf;
+		uint16_t pkt_len;
+
+		mbuf = recv_buf_list[read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
+		pkt_len = rte_bswap16(info->length >> 48);
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		mbuf->pkt_len = pkt_len;
+		mbuf->data_len = pkt_len;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+	droq->read_idx = read_idx;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar_mseg(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+				 uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t total_pkt_len, bytes_rsvd = 0;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *first_buf = NULL;
+		struct rte_mbuf *last_buf = NULL;
+		struct rte_mbuf *mbuf;
+		uint32_t pkt_len = 0;
+
+		mbuf = recv_buf_list[droq->read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
+		total_pkt_len = rte_bswap16(info->length >> 48) + OTX_EP_INFO_SIZE;
+
+		while (pkt_len < total_pkt_len) {
+			int cpy_len;
+
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len) : droq->buffer_size;
+
+			mbuf = droq->recv_buf_list[droq->read_idx];
+
+			if (!pkt_len) {
+				/* Note the first seg */
+				first_buf = mbuf;
+				mbuf->data_off += OTX_EP_INFO_SIZE;
+				mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+				mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
+			} else {
+				mbuf->pkt_len = cpy_len;
+				mbuf->data_len = cpy_len;
+				first_buf->nb_segs++;
+				first_buf->pkt_len += mbuf->pkt_len;
+			}
+
+			if (last_buf)
+				last_buf->next = mbuf;
+
+			last_buf = mbuf;
+
+			pkt_len += cpy_len;
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, nb_desc);
+			droq->refill_count++;
+		}
+		mbuf = first_buf;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= pkts;
+	/* Stats */
+	droq->stats.pkts_received += pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_tx.c b/drivers/net/octeon_ep/cnxk_ep_tx.c
new file mode 100644
index 0000000000..9f11a2f317
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_tx.c
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static uint32_t
+cnxk_vf_update_read_index(struct otx_ep_instr_queue *iq)
+{
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = __atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED);
+	iq->inst_cnt += val - iq->inst_cnt_ism_prev;
+	iq->inst_cnt_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, iq->inst_cnt_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (__atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
+		iq->inst_cnt_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+
+	/* Modulo of the new index with the IQ size will give us
+	 * the new index.
+	 */
+	return iq->inst_cnt & (iq->nb_desc - 1);
+}
+
+static inline void
+cnxk_ep_flush_iq(struct otx_ep_instr_queue *iq)
+{
+	uint32_t instr_processed = 0;
+	uint32_t cnt = 0;
+
+	iq->otx_read_index = cnxk_vf_update_read_index(iq);
+
+	if (unlikely(iq->flush_index == iq->otx_read_index))
+		return;
+
+	if (iq->flush_index < iq->otx_read_index) {
+		instr_processed = iq->otx_read_index - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+	} else {
+		cnt = iq->nb_desc - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], cnt);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, cnt, iq->nb_desc);
+
+		instr_processed = iq->otx_read_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+
+		instr_processed += cnt;
+	}
+
+	iq->stats.instr_processed = instr_processed;
+	iq->instr_pending -= instr_processed;
+}
+
+static inline void
+set_sg_size(struct otx_ep_sg_entry *sg_entry, uint16_t size, uint32_t pos)
+{
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+	sg_entry->u.size[pos] = size;
+#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	sg_entry->u.size[(OTX_EP_NUM_SG_PTRS - 1) - pos] = size;
+#endif
+}
+
+static __rte_always_inline void
+cnxk_ep_xmit_pkts_scalar(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq, uint16_t nb_pkts)
+{
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len;
+	uint32_t tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		m = tx_pkts[pkts];
+		iq->mbuf_list[write_idx] = m;
+		pkt_len = rte_pktmbuf_data_len(m);
+
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->ih.u64 = iq->partial_ih | pkt_len;
+		iqcmd->dptr = rte_mbuf_data_iova(m); /*dptr*/
+		tx_bytes += pkt_len;
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+	}
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+}
+
+static __rte_always_inline uint16_t
+cnxk_ep_xmit_pkts_scalar_mseg(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq,
+			      uint16_t nb_pkts)
+{
+	uint16_t frags, num_sg, mask = OTX_EP_NUM_SG_PTRS - 1;
+	struct otx_ep_buf_free_info *finfo;
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len, tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		uint16_t j = 0;
+
+		m = tx_pkts[pkts];
+		frags = m->nb_segs;
+
+		pkt_len = rte_pktmbuf_pkt_len(m);
+		num_sg = (frags + mask) / OTX_EP_NUM_SG_PTRS;
+
+		if (unlikely(pkt_len > OTX_EP_MAX_PKT_SZ && num_sg > OTX_EP_MAX_SG_LISTS)) {
+			otx_ep_err("Failed to xmit the pkt, pkt_len is higher or pkt has more segments\n");
+			goto exit;
+		}
+
+		finfo = &iq->req_list[write_idx].finfo;
+
+		iq->mbuf_list[write_idx] = m;
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->dptr = rte_mem_virt2iova(finfo->g.sg);
+		iqcmd->ih.u64 = iq->partial_ih | (1ULL << 62) | ((uint64_t)frags << 48) | pkt_len;
+
+		while (frags--) {
+			finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
+			set_sg_size(&finfo->g.sg[(j >> 2)], m->data_len, (j & mask));
+			j++;
+			m = m->next;
+		}
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+		tx_bytes += pkt_len;
+	}
+exit:
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	cnxk_ep_xmit_pkts_scalar(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	pkts = cnxk_ep_xmit_pkts_scalar_mseg(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 7b3669fe0c..ef275703c3 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -156,6 +156,8 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (void *)iq->inst_cnt_ism, ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
+
 	return 0;
 }
 
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.h b/drivers/net/octeon_ep/cnxk_ep_vf.h
index 86277449ea..41d8fbbb3a 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.h
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.h
@@ -6,6 +6,8 @@
 
 #include <rte_io.h>
 
+#include "otx_ep_common.h"
+
 #define CNXK_CONFIG_XPANSION_BAR             0x38
 #define CNXK_CONFIG_PCIE_CAP                 0x70
 #define CNXK_CONFIG_PCIE_DEVCAP              0x74
@@ -178,6 +180,17 @@ struct cnxk_ep_instr_64B {
 	uint64_t exhdr[4];
 };
 
+struct cnxk_ep_instr_32B {
+	/* Pointer where the input data is available. */
+	uint64_t dptr;
+
+	/* OTX_EP Instruction Header. */
+	union otx_ep_instr_ih ih;
+
+	/* Misc data bytes that can be passed as front data */
+	uint64_t rsvd[2];
+};
+
 #define CNXK_EP_IQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue) + 4)
 #define CNXK_EP_OQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue))
 #define CNXK_EP_ISM_EN                  (0x1)
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e698bf9792..749776d70c 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -9,4 +9,6 @@ sources = files(
         'otx2_ep_vf.c',
         'cnxk_ep_vf.c',
         'otx_ep_mbox.c',
+        'cnxk_ep_rx.c',
+        'cnxk_ep_tx.c',
 )
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index f72b8d25d7..7f4edf8dcf 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -307,6 +307,7 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (unsigned int)ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
 
 	return 0;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index 90e059cad0..82e57520d3 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -4,7 +4,20 @@
 #ifndef _OTX_EP_COMMON_H_
 #define _OTX_EP_COMMON_H_
 
+#include <rte_bitops.h>
 #include <rte_spinlock.h>
+#include <unistd.h>
+#include <assert.h>
+#include <rte_eal.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_io.h>
+#include <rte_net.h>
+#include <ethdev_pci.h>
+
+#define OTX_EP_CN8XX  RTE_BIT32(0)
+#define OTX_EP_CN9XX  RTE_BIT32(1)
+#define OTX_EP_CN10XX RTE_BIT32(2)
 
 #define OTX_EP_NW_PKT_OP               0x1220
 #define OTX_EP_NW_CMD_OP               0x1221
@@ -38,7 +51,7 @@
 #define OTX_EP_NORESP_OHSM_SEND     (4)
 #define OTX_EP_NORESP_LAST          (4)
 #define OTX_EP_PCI_RING_ALIGN   65536
-#define OTX_EP_MAX_SG_LISTS 4
+#define OTX_EP_MAX_SG_LISTS 6
 #define OTX_EP_NUM_SG_PTRS 4
 #define SDP_PKIND 40
 #define SDP_OTX2_PKIND 57
@@ -203,6 +216,38 @@ struct otx_ep_iq_config {
  *  such structure to represent it.
  */
 struct otx_ep_instr_queue {
+	/* Location in memory updated by SDP ISM */
+	uint32_t *inst_cnt_ism;
+	struct rte_mbuf **mbuf_list;
+	/* Pointer to the Virtual Base addr of the input ring. */
+	uint8_t *base_addr;
+
+	/* track inst count locally to consolidate HW counter updates */
+	uint32_t inst_cnt_ism_prev;
+
+	/* Input ring index, where the driver should write the next packet */
+	uint32_t host_write_index;
+
+	/* Input ring index, where the OCTEON 9 should read the next packet */
+	uint32_t otx_read_index;
+	/** This index aids in finding the window in the queue where OCTEON 9
+	 *  has read the commands.
+	 */
+	uint32_t flush_index;
+	/* This keeps track of the instructions pending in this queue. */
+	uint64_t instr_pending;
+
+	/* Memory zone */
+	const struct rte_memzone *iq_mz;
+	/* OTX_EP doorbell register for the ring. */
+	void *doorbell_reg;
+
+	/* OTX_EP instruction count register for this ring. */
+	void *inst_cnt_reg;
+
+	/* Number of instructions pending to be posted to OCTEON 9. */
+	uint32_t fill_cnt;
+
 	struct otx_ep_device *otx_ep_dev;
 
 	uint32_t q_no;
@@ -219,54 +264,21 @@ struct otx_ep_instr_queue {
 	/* Size of the descriptor. */
 	uint8_t desc_size;
 
-	/* Input ring index, where the driver should write the next packet */
-	uint32_t host_write_index;
-
-	/* Input ring index, where the OCTEON 9 should read the next packet */
-	uint32_t otx_read_index;
-
 	uint32_t reset_instr_cnt;
 
-	/** This index aids in finding the window in the queue where OCTEON 9
-	 *  has read the commands.
-	 */
-	uint32_t flush_index;
-
 	/* Free-running/wrapping instruction counter for IQ. */
 	uint32_t inst_cnt;
 
-	/* This keeps track of the instructions pending in this queue. */
-	uint64_t instr_pending;
-
-	/* Pointer to the Virtual Base addr of the input ring. */
-	uint8_t *base_addr;
+	uint64_t partial_ih;
 
 	/* This IQ request list */
 	struct otx_ep_instr_list *req_list;
 
-	/* OTX_EP doorbell register for the ring. */
-	void *doorbell_reg;
-
-	/* OTX_EP instruction count register for this ring. */
-	void *inst_cnt_reg;
-
-	/* Number of instructions pending to be posted to OCTEON 9. */
-	uint32_t fill_cnt;
-
 	/* Statistics for this input queue. */
 	struct otx_ep_iq_stats stats;
 
 	/* DMA mapped base address of the input descriptor ring. */
 	uint64_t base_addr_dma;
-
-	/* Memory zone */
-	const struct rte_memzone *iq_mz;
-
-	/* Location in memory updated by SDP ISM */
-	uint32_t *inst_cnt_ism;
-
-	/* track inst count locally to consolidate HW counter updates */
-	uint32_t inst_cnt_ism_prev;
 };
 
 /** Descriptor format.
@@ -344,14 +356,17 @@ struct otx_ep_oq_config {
 
 /* The Descriptor Ring Output Queue(DROQ) structure. */
 struct otx_ep_droq {
-	struct otx_ep_device *otx_ep_dev;
 	/* The 8B aligned descriptor ring starts at this address. */
 	struct otx_ep_droq_desc *desc_ring;
 
-	uint32_t q_no;
-	uint64_t last_pkt_count;
+	/* The 8B aligned info ptrs begin from this address. */
+	struct otx_ep_droq_info *info_list;
 
-	struct rte_mempool *mpool;
+	/* receive buffer list contains mbuf ptr list */
+	struct rte_mbuf **recv_buf_list;
+
+	/* Packets pending to be processed */
+	uint64_t pkts_pending;
 
 	/* Driver should read the next packet at this index */
 	uint32_t read_idx;
@@ -362,22 +377,17 @@ struct otx_ep_droq {
 	/* At this index, the driver will refill the descriptor's buffer */
 	uint32_t refill_idx;
 
-	/* Packets pending to be processed */
-	uint64_t pkts_pending;
+	/* The number of descriptors pending to refill. */
+	uint32_t refill_count;
 
 	/* Number of descriptors in this ring. */
 	uint32_t nb_desc;
 
-	/* The number of descriptors pending to refill. */
-	uint32_t refill_count;
-
 	uint32_t refill_threshold;
 
-	/* The 8B aligned info ptrs begin from this address. */
-	struct otx_ep_droq_info *info_list;
+	uint64_t last_pkt_count;
 
-	/* receive buffer list contains mbuf ptr list */
-	struct rte_mbuf **recv_buf_list;
+	struct rte_mempool *mpool;
 
 	/* The size of each buffer pointed by the buffer pointer. */
 	uint32_t buffer_size;
@@ -392,6 +402,13 @@ struct otx_ep_droq {
 	 */
 	void *pkts_sent_reg;
 
+	/* Pointer to host memory copy of output packet count, set by ISM */
+	uint32_t *pkts_sent_ism;
+	uint32_t pkts_sent_ism_prev;
+
+	/* Statistics for this DROQ. */
+	struct otx_ep_droq_stats stats;
+
 	/** Handle DMA incompletion during pkt reads.
 	 * This variable is used to initiate a sent_reg_read
 	 * that completes pending dma
@@ -400,8 +417,9 @@ struct otx_ep_droq {
 	 */
 	uint32_t sent_reg_val;
 
-	/* Statistics for this DROQ. */
-	struct otx_ep_droq_stats stats;
+	uint32_t q_no;
+
+	struct otx_ep_device *otx_ep_dev;
 
 	/* DMA mapped address of the DROQ descriptor ring. */
 	size_t desc_ring_dma;
@@ -419,10 +437,6 @@ struct otx_ep_droq {
 	const struct rte_memzone *desc_ring_mz;
 
 	const struct rte_memzone *info_mz;
-
-	/* Pointer to host memory copy of output packet count, set by ISM */
-	uint32_t *pkts_sent_ism;
-	uint32_t pkts_sent_ism_prev;
 };
 #define OTX_EP_DROQ_SIZE		(sizeof(struct otx_ep_droq))
 
@@ -545,6 +559,9 @@ struct otx_ep_device {
 
 	/* Negotiated Mbox version */
 	uint32_t mbox_neg_ver;
+
+	/* Generation */
+	uint32_t chip_gen;
 };
 
 int otx_ep_setup_iqs(struct otx_ep_device *otx_ep, uint32_t iq_no,
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 57b965ad06..e965cbaa16 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -27,6 +27,46 @@ static const struct rte_eth_desc_lim otx_ep_tx_desc_lim = {
 	.nb_align	= OTX_EP_TXD_ALIGN,
 };
 
+static void
+otx_ep_set_tx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX || otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
+		if (otx_epvf->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS)
+			eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts_mseg;
+	} else {
+		eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].tx_pkt_burst =
+			eth_dev->tx_pkt_burst;
+}
+
+static void
+otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX) {
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
+	} else if (otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
+	} else {
+		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].rx_pkt_burst =
+			eth_dev->rx_pkt_burst;
+}
+
 static int
 otx_ep_dev_info_get(struct rte_eth_dev *eth_dev,
 		    struct rte_eth_dev_info *devinfo)
@@ -154,6 +194,10 @@ otx_ep_dev_start(struct rte_eth_dev *eth_dev)
 	}
 
 	otx_ep_dev_link_update(eth_dev, 0);
+
+	otx_ep_set_tx_func(eth_dev);
+	otx_ep_set_rx_func(eth_dev);
+
 	otx_ep_info("dev started\n");
 
 	return 0;
@@ -255,18 +299,23 @@ otx_epdev_init(struct otx_ep_device *otx_epvf)
 
 	otx_epvf->fn_list.setup_device_regs(otx_epvf);
 
+	otx_epvf->eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
 	otx_epvf->eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF)
+	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF) {
 		otx_epvf->eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
-	else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
+		otx_epvf->chip_gen = OTX_EP_CN8XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CN98XX_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CNF95N_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
-		otx_epvf->eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN9XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN10XX;
 	} else {
 		otx_ep_err("Invalid chip_id\n");
 		ret = -EINVAL;
@@ -656,8 +705,8 @@ otx_ep_eth_dev_init(struct rte_eth_dev *eth_dev)
 	/* Single process support */
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
 		eth_dev->dev_ops = &otx_ep_eth_dev_ops;
-		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-		eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		otx_ep_set_tx_func(eth_dev);
+		otx_ep_set_rx_func(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index ea7c9a5d62..e7556c5fd2 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -13,15 +13,8 @@
 
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
-#include "otx2_ep_vf.h"
 #include "otx_ep_rxtx.h"
 
-/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
-#define OTX_EP_INFO_SIZE 8
-#define OTX_EP_FSZ_FS0 0
-#define DROQ_REFILL_THRESHOLD 16
-#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
-
 static void
 otx_ep_dmazone_free(const struct rte_memzone *mz)
 {
@@ -144,6 +137,13 @@ otx_ep_init_instr_queue(struct otx_ep_device *otx_ep, int iq_no, int num_descs,
 		     iq_no, iq->base_addr, (unsigned long)iq->base_addr_dma,
 		     iq->nb_desc);
 
+	iq->mbuf_list = rte_zmalloc_socket("mbuf_list",	(iq->nb_desc * sizeof(struct rte_mbuf *)),
+					   RTE_CACHE_LINE_SIZE, rte_socket_id());
+	if (!iq->mbuf_list) {
+		otx_ep_err("IQ[%d] mbuf_list alloc failed\n", iq_no);
+		goto iq_init_fail;
+	}
+
 	iq->otx_ep_dev = otx_ep;
 	iq->q_no = iq_no;
 	iq->fill_cnt = 0;
@@ -673,85 +673,6 @@ otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 	return count;
 }
 
-/* Enqueue requests/packets to OTX_EP IQ queue.
- * returns number of requests enqueued successfully
- */
-uint16_t
-otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
-{
-	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
-	struct otx_ep_device *otx_ep = iq->otx_ep_dev;
-	struct otx2_ep_instr_64B iqcmd2;
-	uint32_t iqreq_type;
-	struct rte_mbuf *m;
-	uint32_t pkt_len;
-	int count = 0;
-	uint16_t i;
-	int dbell;
-	int index;
-
-	iqcmd2.ih.u64 = 0;
-	iqcmd2.irh.u64 = 0;
-
-	/* ih invars */
-	iqcmd2.ih.s.fsz = OTX_EP_FSZ_FS0;
-	iqcmd2.ih.s.pkind = otx_ep->pkind; /* The SDK decided PKIND value */
-	/* irh invars */
-	iqcmd2.irh.s.opcode = OTX_EP_NW_PKT_OP;
-
-	for (i = 0; i < nb_pkts; i++) {
-		m = pkts[i];
-		if (m->nb_segs == 1) {
-			pkt_len = rte_pktmbuf_data_len(m);
-			iqcmd2.ih.s.tlen = pkt_len + iqcmd2.ih.s.fsz;
-			iqcmd2.dptr = rte_mbuf_data_iova(m); /*dptr*/
-			iqcmd2.ih.s.gather = 0;
-			iqcmd2.ih.s.gsz = 0;
-			iqreq_type = OTX_EP_REQTYPE_NORESP_NET;
-		} else {
-			if (!(otx_ep->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS))
-				goto xmit_fail;
-
-			if (unlikely(prepare_xmit_gather_list(iq, m, &iqcmd2.dptr, &iqcmd2.ih) < 0))
-				goto xmit_fail;
-
-			pkt_len = rte_pktmbuf_pkt_len(m);
-			iqreq_type = OTX_EP_REQTYPE_NORESP_GATHER;
-		}
-
-		iqcmd2.irh.u64 = rte_bswap64(iqcmd2.irh.u64);
-
-#ifdef OTX_EP_IO_DEBUG
-		otx_ep_dbg("After swapping\n");
-		otx_ep_dbg("Word0 [dptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.dptr);
-		otx_ep_dbg("Word1 [ihtx]: 0x%016lx\n", (unsigned long)iqcmd.ih);
-		otx_ep_dbg("Word2 [pki_ih3]: 0x%016lx\n",
-			   (unsigned long)iqcmd.pki_ih3);
-		otx_ep_dbg("Word3 [rptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.rptr);
-		otx_ep_dbg("Word4 [irh]: 0x%016lx\n", (unsigned long)iqcmd.irh);
-		otx_ep_dbg("Word5 [exhdr[0]]: 0x%016lx\n",
-			   (unsigned long)iqcmd.exhdr[0]);
-#endif
-		index = iq->host_write_index;
-		dbell = (i == (unsigned int)(nb_pkts - 1)) ? 1 : 0;
-		if (otx_ep_send_data(otx_ep, iq, &iqcmd2, dbell))
-			goto xmit_fail;
-		otx_ep_iqreq_add(iq, m, iqreq_type, index);
-		iq->stats.tx_pkts++;
-		iq->stats.tx_bytes += pkt_len;
-		count++;
-	}
-
-xmit_fail:
-	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
-		otx_ep_flush_iq(iq);
-
-	/* Return no# of instructions posted successfully. */
-	return count;
-}
-
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 3f12527004..cb68ef3b41 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -7,29 +7,53 @@
 
 #include <rte_byteorder.h>
 
-#define OTX_EP_RXD_ALIGN 2
-#define OTX_EP_TXD_ALIGN 2
+#define OTX_EP_RXD_ALIGN 8
+#define OTX_EP_TXD_ALIGN 8
 
 #define OTX_EP_IQ_SEND_FAILED      (-1)
 #define OTX_EP_IQ_SEND_SUCCESS     (0)
 
-#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10000
+#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10
 
 #define OTX_EP_FSZ 28
 #define OTX2_EP_FSZ 24
-#define OTX_EP_MAX_INSTR 16
+#define OTX_EP_MAX_INSTR 128
+
+/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
+#define OTX_EP_INFO_SIZE 8
+#define DROQ_REFILL_THRESHOLD 16
+#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
 
 static inline uint32_t
 otx_ep_incr_index(uint32_t index, uint32_t count, uint32_t max)
 {
 	return ((index + count) & (max - 1));
 }
+
 uint16_t
 otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
 uint16_t
 otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget);
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1



* [PATCH v2 0/3] rewrite fastpath routines
  2023-10-11  1:50 [PATCH 0/3] rewrite fastpath routines Vamsi Attunuru
                   ` (2 preceding siblings ...)
  2023-10-11  1:50 ` [PATCH 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
@ 2023-10-11  8:36 ` Vamsi Attunuru
  2023-10-11  8:36   ` [PATCH v2 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
                     ` (3 more replies)
  3 siblings, 4 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11  8:36 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

This series adds new fastpath routines for cn10k & cn9k endpoint
devices and supports the 32B Tx descriptor format, which improves
performance.

v2 change:
- Fixed CI

Shijith Thotton (1):
  net/octeon_ep: support 32B IQ descriptor size

Vamsi Attunuru (2):
  net/octeon_ep: clean up receive routine
  net/octeon_ep: add new fastpath routines

 drivers/net/octeon_ep/cnxk_ep_rx.c    | 309 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 209 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |  12 +-
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |  12 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |  11 +-
 drivers/net/octeon_ep/otx_ep_common.h | 127 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   | 263 +++++++---------------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 drivers/net/octeon_ep/otx_ep_vf.c     |   8 +
 11 files changed, 814 insertions(+), 257 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx.c
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_tx.c

-- 
2.25.1



* [PATCH v2 1/3] net/octeon_ep: support 32B IQ descriptor size
  2023-10-11  8:36 ` [PATCH v2 0/3] rewrite " Vamsi Attunuru
@ 2023-10-11  8:36   ` Vamsi Attunuru
  2023-10-11  8:36   ` [PATCH v2 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11  8:36 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton

From: Shijith Thotton <sthotton@marvell.com>

Update input queue setup to take the descriptor size from the driver
configuration. The default instruction size for otx2 and cnxk devices
has been updated to 32 bytes.

Signed-off-by: Shijith Thotton <sthotton@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/meson.build     | 10 ++++++++++
 drivers/net/octeon_ep/otx2_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx_ep_common.h |  4 ++++
 drivers/net/octeon_ep/otx_ep_rxtx.c   |  8 +++-----
 drivers/net/octeon_ep/otx_ep_vf.c     |  8 ++++++++
 6 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 92c2d2ca5c..7b3669fe0c 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -106,6 +106,14 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(CNXK_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= CNXK_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_RSIZE(iq_no));
@@ -354,7 +362,7 @@ static const struct otx_ep_config default_cnxk_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e698bf9792..4538c0396e 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -10,3 +10,13 @@ sources = files(
         'cnxk_ep_vf.c',
         'otx_ep_mbox.c',
 )
+
+if (toolchain == 'gcc' and cc.version().version_compare('>=11.0.0'))
+    error_cflags += ['-Wno-array-bounds']
+endif
+
+foreach flag: error_cflags
+    if cc.has_argument(flag)
+        c_args += flag
+    endif
+endforeach
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index ced3a415a5..f72b8d25d7 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -256,6 +256,14 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(SDP_VF_R_IN_CTL_IS_64B);
+	else
+		reg_val |= SDP_VF_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + SDP_VF_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_RSIZE(iq_no));
@@ -500,7 +508,7 @@ static const struct otx_ep_config default_otx2_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index c150cbe619..90e059cad0 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -11,6 +11,7 @@
 
 #define OTX_EP_MAX_RINGS_PER_VF        (8)
 #define OTX_EP_CFG_IO_QUEUES        OTX_EP_MAX_RINGS_PER_VF
+#define OTX_EP_32BYTE_INSTR         (32)
 #define OTX_EP_64BYTE_INSTR         (64)
 /*
  * Backpressure for SDP is configured on Octeon, and the minimum queue sizes
@@ -215,6 +216,9 @@ struct otx_ep_instr_queue {
 	/* Number of  descriptors in this ring. */
 	uint32_t nb_desc;
 
+	/* Size of the descriptor. */
+	uint8_t desc_size;
+
 	/* Input ring index, where the driver should write the next packet */
 	uint32_t host_write_index;
 
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index b37fc8109f..5b759d759b 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -484,7 +484,7 @@ otx_ep_ring_doorbell(struct otx_ep_device *otx_ep __rte_unused,
 static inline int
 post_iqcmd(struct otx_ep_instr_queue *iq, uint8_t *iqcmd)
 {
-	uint8_t *iqptr, cmdsize;
+	uint8_t *iqptr;
 
 	/* This ensures that the read index does not wrap around to
 	 * the same position if queue gets full before OCTEON 9 could
@@ -494,10 +494,8 @@ post_iqcmd(struct otx_ep_instr_queue *iq, uint8_t *iqcmd)
 		return OTX_EP_IQ_SEND_FAILED;
 
 	/* Copy cmd into iq */
-	cmdsize = 64;
-	iqptr   = iq->base_addr + (iq->host_write_index << 6);
-
-	rte_memcpy(iqptr, iqcmd, cmdsize);
+	iqptr = iq->base_addr + (iq->host_write_index * iq->desc_size);
+	rte_memcpy(iqptr, iqcmd, iq->desc_size);
 
 	/* Increment the host write index */
 	iq->host_write_index =
diff --git a/drivers/net/octeon_ep/otx_ep_vf.c b/drivers/net/octeon_ep/otx_ep_vf.c
index 4f3538146b..236b7a874c 100644
--- a/drivers/net/octeon_ep/otx_ep_vf.c
+++ b/drivers/net/octeon_ep/otx_ep_vf.c
@@ -120,6 +120,14 @@ otx_ep_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 			return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (iq->desc_size == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(OTX_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= OTX_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + OTX_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	otx_ep_write64(iq->base_addr_dma, otx_ep->hw_addr,
 		       OTX_EP_R_IN_INSTR_BADDR(iq_no));
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 2/3] net/octeon_ep: clean up receive routine
  2023-10-11  8:36 ` [PATCH v2 0/3] rewrite " Vamsi Attunuru
  2023-10-11  8:36   ` [PATCH v2 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
@ 2023-10-11  8:36   ` Vamsi Attunuru
  2023-10-11  8:36   ` [PATCH v2 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
  2023-10-11 12:53   ` [PATCH v3 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11  8:36 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Patch improves the Rx routine and the packet count update
routines; the packet count update routines need to drain
in-flight ISM memory updates while decrementing the packet
count register.
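
The drain itself (applied to both the IQ instruction counter and the DROQ
packet counter in the hunks below) keeps re-requesting an ISM snapshot until
the host-memory mirror reflects the subtraction just written to the register.
A minimal sketch of the pattern, assuming the driver's counter register and
ISM pointer are passed in; the helper name is made up for illustration:

#include <rte_atomic.h>
#include <rte_io.h>

#define OTX2_SDP_REQUEST_ISM (0x1ULL << 63)  /* same value the driver defines */

/* Illustrative only: after subtracting 'val' from a HW counter, keep
 * requesting ISM snapshots until the host-memory mirror drops below
 * 'val', so a stale in-flight update cannot be misread later.
 */
static inline void
drain_ism_after_decrement(void *cnt_reg, uint32_t *cnt_ism, uint32_t val)
{
	rte_write32(val, cnt_reg);      /* subtract the consumed count in HW */
	rte_mb();

	rte_write64(OTX2_SDP_REQUEST_ISM, cnt_reg);  /* request a fresh snapshot */
	while (__atomic_load_n(cnt_ism, __ATOMIC_RELAXED) >= val) {
		rte_write64(OTX2_SDP_REQUEST_ISM, cnt_reg);
		rte_mb();
	}
}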

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/otx_ep_rxtx.c | 162 ++++++++++++----------------
 1 file changed, 68 insertions(+), 94 deletions(-)

diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index 5b759d759b..ea7c9a5d62 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -442,7 +442,14 @@ otx_vf_update_read_index(struct otx_ep_instr_queue *iq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, iq->inst_cnt_reg);
-		*iq->inst_cnt_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (__atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
 		iq->inst_cnt_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
@@ -565,9 +572,7 @@ prepare_xmit_gather_list(struct otx_ep_instr_queue *iq, struct rte_mbuf *m, uint
 
 	finfo = &iq->req_list[iq->host_write_index].finfo;
 	*dptr = rte_mem_virt2iova(finfo->g.sg);
-	ih->s.tlen = pkt_len + ih->s.fsz;
-	ih->s.gsz = frags;
-	ih->s.gather = 1;
+	ih->u64 |= ((1ULL << 62) | ((uint64_t)frags << 48) | (pkt_len + ih->s.fsz));
 
 	while (frags--) {
 		finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
@@ -750,36 +755,26 @@ otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
-	struct otx_ep_droq_desc *desc_ring;
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
 	struct otx_ep_droq_info *info;
 	struct rte_mbuf *buf = NULL;
 	uint32_t desc_refilled = 0;
 
-	desc_ring = droq->desc_ring;
-
 	while (droq->refill_count && (desc_refilled < droq->nb_desc)) {
-		/* If a valid buffer exists (happens if there is no dispatch),
-		 * reuse the buffer, else allocate.
-		 */
-		if (droq->recv_buf_list[droq->refill_idx] != NULL)
-			break;
-
 		buf = rte_pktmbuf_alloc(droq->mpool);
 		/* If a buffer could not be allocated, no point in
 		 * continuing
 		 */
-		if (buf == NULL) {
+		if (unlikely(!buf)) {
 			droq->stats.rx_alloc_failure++;
 			break;
 		}
 		info = rte_pktmbuf_mtod(buf, struct otx_ep_droq_info *);
-		memset(info, 0, sizeof(*info));
+		info->length = 0;
 
 		droq->recv_buf_list[droq->refill_idx] = buf;
 		desc_ring[droq->refill_idx].buffer_ptr =
 					rte_mbuf_data_iova_default(buf);
-
-
 		droq->refill_idx = otx_ep_incr_index(droq->refill_idx, 1,
 				droq->nb_desc);
 
@@ -791,21 +786,18 @@ otx_ep_droq_refill(struct otx_ep_droq *droq)
 }
 
 static struct rte_mbuf *
-otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
-			struct otx_ep_droq *droq, int next_fetch)
+otx_ep_droq_read_packet(struct otx_ep_device *otx_ep, struct otx_ep_droq *droq, int next_fetch)
 {
 	volatile struct otx_ep_droq_info *info;
-	struct rte_mbuf *droq_pkt2 = NULL;
-	struct rte_mbuf *droq_pkt = NULL;
-	struct rte_net_hdr_lens hdr_lens;
-	struct otx_ep_droq_info *info2;
+	struct rte_mbuf *mbuf_next = NULL;
+	struct rte_mbuf *mbuf = NULL;
 	uint64_t total_pkt_len;
 	uint32_t pkt_len = 0;
 	int next_idx;
 
-	droq_pkt  = droq->recv_buf_list[droq->read_idx];
-	droq_pkt2  = droq->recv_buf_list[droq->read_idx];
-	info = rte_pktmbuf_mtod(droq_pkt, struct otx_ep_droq_info *);
+	mbuf = droq->recv_buf_list[droq->read_idx];
+	info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
 	/* make sure info is available */
 	rte_rmb();
 	if (unlikely(!info->length)) {
@@ -826,32 +818,25 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 			assert(0);
 		}
 	}
+
 	if (next_fetch) {
 		next_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
-		droq_pkt2  = droq->recv_buf_list[next_idx];
-		info2 = rte_pktmbuf_mtod(droq_pkt2, struct otx_ep_droq_info *);
-		rte_prefetch_non_temporal((const void *)info2);
+		mbuf_next = droq->recv_buf_list[next_idx];
+		rte_prefetch0(rte_pktmbuf_mtod(mbuf_next, void *));
 	}
 
-	info->length = rte_bswap64(info->length);
+	info->length = rte_bswap16(info->length >> 48);
 	/* Deduce the actual data size */
 	total_pkt_len = info->length + OTX_EP_INFO_SIZE;
 	if (total_pkt_len <= droq->buffer_size) {
-		droq_pkt  = droq->recv_buf_list[droq->read_idx];
-		if (likely(droq_pkt != NULL)) {
-			droq_pkt->data_off += OTX_EP_INFO_SIZE;
-			/* otx_ep_dbg("OQ: pkt_len[%ld], buffer_size %d\n",
-			 * (long)info->length, droq->buffer_size);
-			 */
-			pkt_len = (uint32_t)info->length;
-			droq_pkt->pkt_len  = pkt_len;
-			droq_pkt->data_len  = pkt_len;
-			droq_pkt->port = otx_ep->port_id;
-			droq->recv_buf_list[droq->read_idx] = NULL;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
-			droq->refill_count++;
-		}
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		pkt_len = (uint32_t)info->length;
+		mbuf->pkt_len  = pkt_len;
+		mbuf->data_len  = pkt_len;
+		mbuf->port = otx_ep->port_id;
+		droq->recv_buf_list[droq->read_idx] = NULL;
+		droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
+		droq->refill_count++;
 	} else {
 		struct rte_mbuf *first_buf = NULL;
 		struct rte_mbuf *last_buf = NULL;
@@ -863,61 +848,50 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 		while (pkt_len < total_pkt_len) {
 			int cpy_len = 0;
 
-			cpy_len = ((pkt_len + droq->buffer_size) >
-					total_pkt_len)
-					? ((uint32_t)total_pkt_len -
-						pkt_len)
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len)
 					: droq->buffer_size;
 
-			droq_pkt = droq->recv_buf_list[droq->read_idx];
+			mbuf = droq->recv_buf_list[droq->read_idx];
 			droq->recv_buf_list[droq->read_idx] = NULL;
 
-			if (likely(droq_pkt != NULL)) {
+			if (likely(mbuf)) {
 				/* Note the first seg */
 				if (!pkt_len)
-					first_buf = droq_pkt;
+					first_buf = mbuf;
 
-				droq_pkt->port = otx_ep->port_id;
+				mbuf->port = otx_ep->port_id;
 				if (!pkt_len) {
-					droq_pkt->data_off +=
-						OTX_EP_INFO_SIZE;
-					droq_pkt->pkt_len =
-						cpy_len - OTX_EP_INFO_SIZE;
-					droq_pkt->data_len =
-						cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_off += OTX_EP_INFO_SIZE;
+					mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
 				} else {
-					droq_pkt->pkt_len = cpy_len;
-					droq_pkt->data_len = cpy_len;
+					mbuf->pkt_len = cpy_len;
+					mbuf->data_len = cpy_len;
 				}
 
 				if (pkt_len) {
 					first_buf->nb_segs++;
-					first_buf->pkt_len += droq_pkt->pkt_len;
+					first_buf->pkt_len += mbuf->pkt_len;
 				}
 
 				if (last_buf)
-					last_buf->next = droq_pkt;
+					last_buf->next = mbuf;
 
-				last_buf = droq_pkt;
+				last_buf = mbuf;
 			} else {
 				otx_ep_err("no buf\n");
 				assert(0);
 			}
 
 			pkt_len += cpy_len;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
 			droq->refill_count++;
 		}
-		droq_pkt = first_buf;
+		mbuf = first_buf;
 	}
-	droq_pkt->packet_type = rte_net_get_ptype(droq_pkt, &hdr_lens,
-					RTE_PTYPE_ALL_MASK);
-	droq_pkt->l2_len = hdr_lens.l2_len;
-	droq_pkt->l3_len = hdr_lens.l3_len;
-	droq_pkt->l4_len = hdr_lens.l4_len;
 
-	return droq_pkt;
+	return mbuf;
 }
 
 static inline uint32_t
@@ -941,7 +915,14 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, droq->pkts_sent_reg);
-		*droq->pkts_sent_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
 		droq->pkts_sent_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
@@ -950,36 +931,30 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 	return new_pkts;
 }
 
+static inline int32_t __rte_hot
+otx_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (unlikely(droq->pkts_pending < nb_pkts))
+		otx_ep_check_droq_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
 /* Check for response arrival from OCTEON 9
  * returns number of requests completed
  */
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget)
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	struct otx_ep_droq *droq = rx_queue;
 	struct otx_ep_device *otx_ep;
 	struct rte_mbuf *oq_pkt;
-
-	uint32_t pkts = 0;
+	uint16_t pkts, new_pkts;
 	uint32_t valid_pkts = 0;
-	uint32_t new_pkts = 0;
 	int next_fetch;
 
 	otx_ep = droq->otx_ep_dev;
-
-	if (droq->pkts_pending > budget) {
-		new_pkts = budget;
-	} else {
-		new_pkts = droq->pkts_pending;
-		new_pkts += otx_ep_check_droq_pkts(droq);
-		if (new_pkts > budget)
-			new_pkts = budget;
-	}
-
-	if (!new_pkts)
-		goto update_credit; /* No pkts at this moment */
+	new_pkts = otx_ep_rx_pkts_to_process(droq, nb_pkts);
 
 	for (pkts = 0; pkts < new_pkts; pkts++) {
 		/* Push the received pkt to application */
@@ -1004,7 +979,6 @@ otx_ep_recv_pkts(void *rx_queue,
 	droq->pkts_pending -= pkts;
 
 	/* Refill DROQ buffers */
-update_credit:
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		int desc_refilled = otx_ep_droq_refill(droq);
 
@@ -1012,7 +986,7 @@ otx_ep_recv_pkts(void *rx_queue,
 		 * that when we update the credits the data in memory is
 		 * accurate.
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		rte_write32(desc_refilled, droq->pkts_credit_reg);
 	} else {
 		/*
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 3/3] net/octeon_ep: add new fastpath routines
  2023-10-11  8:36 ` [PATCH v2 0/3] rewrite " Vamsi Attunuru
  2023-10-11  8:36   ` [PATCH v2 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
  2023-10-11  8:36   ` [PATCH v2 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
@ 2023-10-11  8:36   ` Vamsi Attunuru
  2023-10-11 12:53   ` [PATCH v3 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11  8:36 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Adds new fastpath routines for cn10k & cn9k endpoint
devices and assigns them based on the offload flags.

Patch also adds miscellaneous changes to improve
performance and code readability.
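
For single-segment packets the new Tx path reduces to two stores per
command: a header precomputed at queue setup (pkind shifted into place)
OR-ed with the packet length, plus the packet IOVA, written into a 32 B
slot. A minimal per-packet sketch, assuming the structures added below;
it skips ring-space checks, stats and the doorbell, so it is illustrative
rather than a drop-in routine:

#include "cnxk_ep_vf.h"
#include "otx_ep_rxtx.h"

/* Illustrative only: enqueue one single-segment mbuf into a 32 B IQ slot. */
static inline void
xmit_one_sketch(struct otx_ep_instr_queue *iq, struct rte_mbuf *m)
{
	struct cnxk_ep_instr_32B *cmd;
	uint32_t idx = iq->host_write_index;

	iq->mbuf_list[idx] = m;          /* remembered so the flush can free it */
	cmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (idx * iq->desc_size));
	cmd->ih.u64 = iq->partial_ih | rte_pktmbuf_data_len(m); /* invariant bits | len */
	cmd->dptr = rte_mbuf_data_iova(m);

	iq->host_write_index = otx_ep_incr_index(idx, 1, iq->nb_desc);
}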

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx.c    | 309 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 209 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |   2 +
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |   1 +
 drivers/net/octeon_ep/otx_ep_common.h | 125 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   |  93 +-------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 10 files changed, 704 insertions(+), 157 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
new file mode 100644
index 0000000000..74f0011283
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "otx_ep_common.h"
+#include "otx2_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static inline int
+cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
+{
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t refill_idx = droq->refill_idx;
+	struct rte_mbuf *buf;
+	uint32_t i;
+	int rc;
+
+	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	if (unlikely(rc)) {
+		droq->stats.rx_alloc_failure++;
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		buf = recv_buf_list[refill_idx];
+		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
+		refill_idx++;
+	}
+
+	droq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc);
+	droq->refill_count -= count;
+
+	return 0;
+}
+
+static inline void
+cnxk_ep_rx_refill(struct otx_ep_droq *droq)
+{
+	uint32_t desc_refilled = 0, count;
+	uint32_t nb_desc = droq->nb_desc;
+	uint32_t refill_idx = droq->refill_idx;
+	int rc;
+
+	if (unlikely(droq->read_idx == refill_idx))
+		return;
+
+	if (refill_idx < droq->read_idx) {
+		count = droq->read_idx - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled = count;
+	} else {
+		count = nb_desc - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+
+		desc_refilled = count;
+		count = droq->read_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled += count;
+	}
+
+	/* Flush the droq descriptor data to memory to be sure
+	 * that when we update the credits the data in memory is
+	 * accurate.
+	 */
+	rte_io_wmb();
+	rte_write32(desc_refilled, droq->pkts_credit_reg);
+}
+
+static inline uint32_t
+cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq)
+{
+	uint32_t new_pkts;
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = __atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED);
+	new_pkts = val - droq->pkts_sent_ism_prev;
+	droq->pkts_sent_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, droq->pkts_sent_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
+		droq->pkts_sent_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+	droq->pkts_pending += new_pkts;
+
+	return new_pkts;
+}
+
+static inline int16_t __rte_hot
+cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (droq->pkts_pending < nb_pkts)
+		cnxk_ep_check_rx_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *mbuf;
+		uint16_t pkt_len;
+
+		mbuf = recv_buf_list[read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
+		pkt_len = rte_bswap16(info->length >> 48);
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		mbuf->pkt_len = pkt_len;
+		mbuf->data_len = pkt_len;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+	droq->read_idx = read_idx;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar_mseg(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+				 uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t total_pkt_len, bytes_rsvd = 0;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *first_buf = NULL;
+		struct rte_mbuf *last_buf = NULL;
+		struct rte_mbuf *mbuf;
+		uint32_t pkt_len = 0;
+
+		mbuf = recv_buf_list[droq->read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
+		total_pkt_len = rte_bswap16(info->length >> 48) + OTX_EP_INFO_SIZE;
+
+		while (pkt_len < total_pkt_len) {
+			int cpy_len;
+
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len) : droq->buffer_size;
+
+			mbuf = droq->recv_buf_list[droq->read_idx];
+
+			if (!pkt_len) {
+				/* Note the first seg */
+				first_buf = mbuf;
+				mbuf->data_off += OTX_EP_INFO_SIZE;
+				mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+				mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
+			} else {
+				mbuf->pkt_len = cpy_len;
+				mbuf->data_len = cpy_len;
+				first_buf->nb_segs++;
+				first_buf->pkt_len += mbuf->pkt_len;
+			}
+
+			if (last_buf)
+				last_buf->next = mbuf;
+
+			last_buf = mbuf;
+
+			pkt_len += cpy_len;
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, nb_desc);
+			droq->refill_count++;
+		}
+		mbuf = first_buf;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= pkts;
+	/* Stats */
+	droq->stats.pkts_received += pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_tx.c b/drivers/net/octeon_ep/cnxk_ep_tx.c
new file mode 100644
index 0000000000..9f11a2f317
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_tx.c
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static uint32_t
+cnxk_vf_update_read_index(struct otx_ep_instr_queue *iq)
+{
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = __atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED);
+	iq->inst_cnt += val - iq->inst_cnt_ism_prev;
+	iq->inst_cnt_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, iq->inst_cnt_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (__atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
+		iq->inst_cnt_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+
+	/* Modulo of the new index with the IQ size will give us
+	 * the new index.
+	 */
+	return iq->inst_cnt & (iq->nb_desc - 1);
+}
+
+static inline void
+cnxk_ep_flush_iq(struct otx_ep_instr_queue *iq)
+{
+	uint32_t instr_processed = 0;
+	uint32_t cnt = 0;
+
+	iq->otx_read_index = cnxk_vf_update_read_index(iq);
+
+	if (unlikely(iq->flush_index == iq->otx_read_index))
+		return;
+
+	if (iq->flush_index < iq->otx_read_index) {
+		instr_processed = iq->otx_read_index - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+	} else {
+		cnt = iq->nb_desc - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], cnt);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, cnt, iq->nb_desc);
+
+		instr_processed = iq->otx_read_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+
+		instr_processed += cnt;
+	}
+
+	iq->stats.instr_processed = instr_processed;
+	iq->instr_pending -= instr_processed;
+}
+
+static inline void
+set_sg_size(struct otx_ep_sg_entry *sg_entry, uint16_t size, uint32_t pos)
+{
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+	sg_entry->u.size[pos] = size;
+#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	sg_entry->u.size[(OTX_EP_NUM_SG_PTRS - 1) - pos] = size;
+#endif
+}
+
+static __rte_always_inline void
+cnxk_ep_xmit_pkts_scalar(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq, uint16_t nb_pkts)
+{
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len;
+	uint32_t tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		m = tx_pkts[pkts];
+		iq->mbuf_list[write_idx] = m;
+		pkt_len = rte_pktmbuf_data_len(m);
+
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->ih.u64 = iq->partial_ih | pkt_len;
+		iqcmd->dptr = rte_mbuf_data_iova(m); /*dptr*/
+		tx_bytes += pkt_len;
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+	}
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+}
+
+static __rte_always_inline uint16_t
+cnxk_ep_xmit_pkts_scalar_mseg(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq,
+			      uint16_t nb_pkts)
+{
+	uint16_t frags, num_sg, mask = OTX_EP_NUM_SG_PTRS - 1;
+	struct otx_ep_buf_free_info *finfo;
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len, tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		uint16_t j = 0;
+
+		m = tx_pkts[pkts];
+		frags = m->nb_segs;
+
+		pkt_len = rte_pktmbuf_pkt_len(m);
+		num_sg = (frags + mask) / OTX_EP_NUM_SG_PTRS;
+
+		if (unlikely(pkt_len > OTX_EP_MAX_PKT_SZ && num_sg > OTX_EP_MAX_SG_LISTS)) {
+			otx_ep_err("Failed to xmit the pkt, pkt_len is higher or pkt has more segments\n");
+			goto exit;
+		}
+
+		finfo = &iq->req_list[write_idx].finfo;
+
+		iq->mbuf_list[write_idx] = m;
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->dptr = rte_mem_virt2iova(finfo->g.sg);
+		iqcmd->ih.u64 = iq->partial_ih | (1ULL << 62) | ((uint64_t)frags << 48) | pkt_len;
+
+		while (frags--) {
+			finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
+			set_sg_size(&finfo->g.sg[(j >> 2)], m->data_len, (j & mask));
+			j++;
+			m = m->next;
+		}
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+		tx_bytes += pkt_len;
+	}
+exit:
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	cnxk_ep_xmit_pkts_scalar(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	pkts = cnxk_ep_xmit_pkts_scalar_mseg(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 7b3669fe0c..ef275703c3 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -156,6 +156,8 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (void *)iq->inst_cnt_ism, ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
+
 	return 0;
 }
 
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.h b/drivers/net/octeon_ep/cnxk_ep_vf.h
index 86277449ea..41d8fbbb3a 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.h
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.h
@@ -6,6 +6,8 @@
 
 #include <rte_io.h>
 
+#include "otx_ep_common.h"
+
 #define CNXK_CONFIG_XPANSION_BAR             0x38
 #define CNXK_CONFIG_PCIE_CAP                 0x70
 #define CNXK_CONFIG_PCIE_DEVCAP              0x74
@@ -178,6 +180,17 @@ struct cnxk_ep_instr_64B {
 	uint64_t exhdr[4];
 };
 
+struct cnxk_ep_instr_32B {
+	/* Pointer where the input data is available. */
+	uint64_t dptr;
+
+	/* OTX_EP Instruction Header. */
+	union otx_ep_instr_ih ih;
+
+	/* Misc data bytes that can be passed as front data */
+	uint64_t rsvd[2];
+};
+
 #define CNXK_EP_IQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue) + 4)
 #define CNXK_EP_OQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue))
 #define CNXK_EP_ISM_EN                  (0x1)
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index 4538c0396e..ef5eed6a34 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -9,6 +9,8 @@ sources = files(
         'otx2_ep_vf.c',
         'cnxk_ep_vf.c',
         'otx_ep_mbox.c',
+        'cnxk_ep_rx.c',
+        'cnxk_ep_tx.c',
 )
 
 if (toolchain == 'gcc' and cc.version().version_compare('>=11.0.0'))
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index f72b8d25d7..7f4edf8dcf 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -307,6 +307,7 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (unsigned int)ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
 
 	return 0;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index 90e059cad0..82e57520d3 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -4,7 +4,20 @@
 #ifndef _OTX_EP_COMMON_H_
 #define _OTX_EP_COMMON_H_
 
+#include <rte_bitops.h>
 #include <rte_spinlock.h>
+#include <unistd.h>
+#include <assert.h>
+#include <rte_eal.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_io.h>
+#include <rte_net.h>
+#include <ethdev_pci.h>
+
+#define OTX_EP_CN8XX  RTE_BIT32(0)
+#define OTX_EP_CN9XX  RTE_BIT32(1)
+#define OTX_EP_CN10XX RTE_BIT32(2)
 
 #define OTX_EP_NW_PKT_OP               0x1220
 #define OTX_EP_NW_CMD_OP               0x1221
@@ -38,7 +51,7 @@
 #define OTX_EP_NORESP_OHSM_SEND     (4)
 #define OTX_EP_NORESP_LAST          (4)
 #define OTX_EP_PCI_RING_ALIGN   65536
-#define OTX_EP_MAX_SG_LISTS 4
+#define OTX_EP_MAX_SG_LISTS 6
 #define OTX_EP_NUM_SG_PTRS 4
 #define SDP_PKIND 40
 #define SDP_OTX2_PKIND 57
@@ -203,6 +216,38 @@ struct otx_ep_iq_config {
  *  such structure to represent it.
  */
 struct otx_ep_instr_queue {
+	/* Location in memory updated by SDP ISM */
+	uint32_t *inst_cnt_ism;
+	struct rte_mbuf **mbuf_list;
+	/* Pointer to the Virtual Base addr of the input ring. */
+	uint8_t *base_addr;
+
+	/* track inst count locally to consolidate HW counter updates */
+	uint32_t inst_cnt_ism_prev;
+
+	/* Input ring index, where the driver should write the next packet */
+	uint32_t host_write_index;
+
+	/* Input ring index, where the OCTEON 9 should read the next packet */
+	uint32_t otx_read_index;
+	/** This index aids in finding the window in the queue where OCTEON 9
+	 *  has read the commands.
+	 */
+	uint32_t flush_index;
+	/* This keeps track of the instructions pending in this queue. */
+	uint64_t instr_pending;
+
+	/* Memory zone */
+	const struct rte_memzone *iq_mz;
+	/* OTX_EP doorbell register for the ring. */
+	void *doorbell_reg;
+
+	/* OTX_EP instruction count register for this ring. */
+	void *inst_cnt_reg;
+
+	/* Number of instructions pending to be posted to OCTEON 9. */
+	uint32_t fill_cnt;
+
 	struct otx_ep_device *otx_ep_dev;
 
 	uint32_t q_no;
@@ -219,54 +264,21 @@ struct otx_ep_instr_queue {
 	/* Size of the descriptor. */
 	uint8_t desc_size;
 
-	/* Input ring index, where the driver should write the next packet */
-	uint32_t host_write_index;
-
-	/* Input ring index, where the OCTEON 9 should read the next packet */
-	uint32_t otx_read_index;
-
 	uint32_t reset_instr_cnt;
 
-	/** This index aids in finding the window in the queue where OCTEON 9
-	 *  has read the commands.
-	 */
-	uint32_t flush_index;
-
 	/* Free-running/wrapping instruction counter for IQ. */
 	uint32_t inst_cnt;
 
-	/* This keeps track of the instructions pending in this queue. */
-	uint64_t instr_pending;
-
-	/* Pointer to the Virtual Base addr of the input ring. */
-	uint8_t *base_addr;
+	uint64_t partial_ih;
 
 	/* This IQ request list */
 	struct otx_ep_instr_list *req_list;
 
-	/* OTX_EP doorbell register for the ring. */
-	void *doorbell_reg;
-
-	/* OTX_EP instruction count register for this ring. */
-	void *inst_cnt_reg;
-
-	/* Number of instructions pending to be posted to OCTEON 9. */
-	uint32_t fill_cnt;
-
 	/* Statistics for this input queue. */
 	struct otx_ep_iq_stats stats;
 
 	/* DMA mapped base address of the input descriptor ring. */
 	uint64_t base_addr_dma;
-
-	/* Memory zone */
-	const struct rte_memzone *iq_mz;
-
-	/* Location in memory updated by SDP ISM */
-	uint32_t *inst_cnt_ism;
-
-	/* track inst count locally to consolidate HW counter updates */
-	uint32_t inst_cnt_ism_prev;
 };
 
 /** Descriptor format.
@@ -344,14 +356,17 @@ struct otx_ep_oq_config {
 
 /* The Descriptor Ring Output Queue(DROQ) structure. */
 struct otx_ep_droq {
-	struct otx_ep_device *otx_ep_dev;
 	/* The 8B aligned descriptor ring starts at this address. */
 	struct otx_ep_droq_desc *desc_ring;
 
-	uint32_t q_no;
-	uint64_t last_pkt_count;
+	/* The 8B aligned info ptrs begin from this address. */
+	struct otx_ep_droq_info *info_list;
 
-	struct rte_mempool *mpool;
+	/* receive buffer list contains mbuf ptr list */
+	struct rte_mbuf **recv_buf_list;
+
+	/* Packets pending to be processed */
+	uint64_t pkts_pending;
 
 	/* Driver should read the next packet at this index */
 	uint32_t read_idx;
@@ -362,22 +377,17 @@ struct otx_ep_droq {
 	/* At this index, the driver will refill the descriptor's buffer */
 	uint32_t refill_idx;
 
-	/* Packets pending to be processed */
-	uint64_t pkts_pending;
+	/* The number of descriptors pending to refill. */
+	uint32_t refill_count;
 
 	/* Number of descriptors in this ring. */
 	uint32_t nb_desc;
 
-	/* The number of descriptors pending to refill. */
-	uint32_t refill_count;
-
 	uint32_t refill_threshold;
 
-	/* The 8B aligned info ptrs begin from this address. */
-	struct otx_ep_droq_info *info_list;
+	uint64_t last_pkt_count;
 
-	/* receive buffer list contains mbuf ptr list */
-	struct rte_mbuf **recv_buf_list;
+	struct rte_mempool *mpool;
 
 	/* The size of each buffer pointed by the buffer pointer. */
 	uint32_t buffer_size;
@@ -392,6 +402,13 @@ struct otx_ep_droq {
 	 */
 	void *pkts_sent_reg;
 
+	/* Pointer to host memory copy of output packet count, set by ISM */
+	uint32_t *pkts_sent_ism;
+	uint32_t pkts_sent_ism_prev;
+
+	/* Statistics for this DROQ. */
+	struct otx_ep_droq_stats stats;
+
 	/** Handle DMA incompletion during pkt reads.
 	 * This variable is used to initiate a sent_reg_read
 	 * that completes pending dma
@@ -400,8 +417,9 @@ struct otx_ep_droq {
 	 */
 	uint32_t sent_reg_val;
 
-	/* Statistics for this DROQ. */
-	struct otx_ep_droq_stats stats;
+	uint32_t q_no;
+
+	struct otx_ep_device *otx_ep_dev;
 
 	/* DMA mapped address of the DROQ descriptor ring. */
 	size_t desc_ring_dma;
@@ -419,10 +437,6 @@ struct otx_ep_droq {
 	const struct rte_memzone *desc_ring_mz;
 
 	const struct rte_memzone *info_mz;
-
-	/* Pointer to host memory copy of output packet count, set by ISM */
-	uint32_t *pkts_sent_ism;
-	uint32_t pkts_sent_ism_prev;
 };
 #define OTX_EP_DROQ_SIZE		(sizeof(struct otx_ep_droq))
 
@@ -545,6 +559,9 @@ struct otx_ep_device {
 
 	/* Negotiated Mbox version */
 	uint32_t mbox_neg_ver;
+
+	/* Generation */
+	uint32_t chip_gen;
 };
 
 int otx_ep_setup_iqs(struct otx_ep_device *otx_ep, uint32_t iq_no,
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 57b965ad06..e965cbaa16 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -27,6 +27,46 @@ static const struct rte_eth_desc_lim otx_ep_tx_desc_lim = {
 	.nb_align	= OTX_EP_TXD_ALIGN,
 };
 
+static void
+otx_ep_set_tx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX || otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
+		if (otx_epvf->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS)
+			eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts_mseg;
+	} else {
+		eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].tx_pkt_burst =
+			eth_dev->tx_pkt_burst;
+}
+
+static void
+otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX) {
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
+	} else if (otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
+	} else {
+		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].rx_pkt_burst =
+			eth_dev->rx_pkt_burst;
+}
+
 static int
 otx_ep_dev_info_get(struct rte_eth_dev *eth_dev,
 		    struct rte_eth_dev_info *devinfo)
@@ -154,6 +194,10 @@ otx_ep_dev_start(struct rte_eth_dev *eth_dev)
 	}
 
 	otx_ep_dev_link_update(eth_dev, 0);
+
+	otx_ep_set_tx_func(eth_dev);
+	otx_ep_set_rx_func(eth_dev);
+
 	otx_ep_info("dev started\n");
 
 	return 0;
@@ -255,18 +299,23 @@ otx_epdev_init(struct otx_ep_device *otx_epvf)
 
 	otx_epvf->fn_list.setup_device_regs(otx_epvf);
 
+	otx_epvf->eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
 	otx_epvf->eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF)
+	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF) {
 		otx_epvf->eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
-	else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
+		otx_epvf->chip_gen = OTX_EP_CN8XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CN98XX_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CNF95N_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
-		otx_epvf->eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN9XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN10XX;
 	} else {
 		otx_ep_err("Invalid chip_id\n");
 		ret = -EINVAL;
@@ -656,8 +705,8 @@ otx_ep_eth_dev_init(struct rte_eth_dev *eth_dev)
 	/* Single process support */
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
 		eth_dev->dev_ops = &otx_ep_eth_dev_ops;
-		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-		eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		otx_ep_set_tx_func(eth_dev);
+		otx_ep_set_rx_func(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index ea7c9a5d62..e7556c5fd2 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -13,15 +13,8 @@
 
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
-#include "otx2_ep_vf.h"
 #include "otx_ep_rxtx.h"
 
-/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
-#define OTX_EP_INFO_SIZE 8
-#define OTX_EP_FSZ_FS0 0
-#define DROQ_REFILL_THRESHOLD 16
-#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
-
 static void
 otx_ep_dmazone_free(const struct rte_memzone *mz)
 {
@@ -144,6 +137,13 @@ otx_ep_init_instr_queue(struct otx_ep_device *otx_ep, int iq_no, int num_descs,
 		     iq_no, iq->base_addr, (unsigned long)iq->base_addr_dma,
 		     iq->nb_desc);
 
+	iq->mbuf_list = rte_zmalloc_socket("mbuf_list",	(iq->nb_desc * sizeof(struct rte_mbuf *)),
+					   RTE_CACHE_LINE_SIZE, rte_socket_id());
+	if (!iq->mbuf_list) {
+		otx_ep_err("IQ[%d] mbuf_list alloc failed\n", iq_no);
+		goto iq_init_fail;
+	}
+
 	iq->otx_ep_dev = otx_ep;
 	iq->q_no = iq_no;
 	iq->fill_cnt = 0;
@@ -673,85 +673,6 @@ otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 	return count;
 }
 
-/* Enqueue requests/packets to OTX_EP IQ queue.
- * returns number of requests enqueued successfully
- */
-uint16_t
-otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
-{
-	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
-	struct otx_ep_device *otx_ep = iq->otx_ep_dev;
-	struct otx2_ep_instr_64B iqcmd2;
-	uint32_t iqreq_type;
-	struct rte_mbuf *m;
-	uint32_t pkt_len;
-	int count = 0;
-	uint16_t i;
-	int dbell;
-	int index;
-
-	iqcmd2.ih.u64 = 0;
-	iqcmd2.irh.u64 = 0;
-
-	/* ih invars */
-	iqcmd2.ih.s.fsz = OTX_EP_FSZ_FS0;
-	iqcmd2.ih.s.pkind = otx_ep->pkind; /* The SDK decided PKIND value */
-	/* irh invars */
-	iqcmd2.irh.s.opcode = OTX_EP_NW_PKT_OP;
-
-	for (i = 0; i < nb_pkts; i++) {
-		m = pkts[i];
-		if (m->nb_segs == 1) {
-			pkt_len = rte_pktmbuf_data_len(m);
-			iqcmd2.ih.s.tlen = pkt_len + iqcmd2.ih.s.fsz;
-			iqcmd2.dptr = rte_mbuf_data_iova(m); /*dptr*/
-			iqcmd2.ih.s.gather = 0;
-			iqcmd2.ih.s.gsz = 0;
-			iqreq_type = OTX_EP_REQTYPE_NORESP_NET;
-		} else {
-			if (!(otx_ep->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS))
-				goto xmit_fail;
-
-			if (unlikely(prepare_xmit_gather_list(iq, m, &iqcmd2.dptr, &iqcmd2.ih) < 0))
-				goto xmit_fail;
-
-			pkt_len = rte_pktmbuf_pkt_len(m);
-			iqreq_type = OTX_EP_REQTYPE_NORESP_GATHER;
-		}
-
-		iqcmd2.irh.u64 = rte_bswap64(iqcmd2.irh.u64);
-
-#ifdef OTX_EP_IO_DEBUG
-		otx_ep_dbg("After swapping\n");
-		otx_ep_dbg("Word0 [dptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.dptr);
-		otx_ep_dbg("Word1 [ihtx]: 0x%016lx\n", (unsigned long)iqcmd.ih);
-		otx_ep_dbg("Word2 [pki_ih3]: 0x%016lx\n",
-			   (unsigned long)iqcmd.pki_ih3);
-		otx_ep_dbg("Word3 [rptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.rptr);
-		otx_ep_dbg("Word4 [irh]: 0x%016lx\n", (unsigned long)iqcmd.irh);
-		otx_ep_dbg("Word5 [exhdr[0]]: 0x%016lx\n",
-			   (unsigned long)iqcmd.exhdr[0]);
-#endif
-		index = iq->host_write_index;
-		dbell = (i == (unsigned int)(nb_pkts - 1)) ? 1 : 0;
-		if (otx_ep_send_data(otx_ep, iq, &iqcmd2, dbell))
-			goto xmit_fail;
-		otx_ep_iqreq_add(iq, m, iqreq_type, index);
-		iq->stats.tx_pkts++;
-		iq->stats.tx_bytes += pkt_len;
-		count++;
-	}
-
-xmit_fail:
-	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
-		otx_ep_flush_iq(iq);
-
-	/* Return no# of instructions posted successfully. */
-	return count;
-}
-
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 3f12527004..cb68ef3b41 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -7,29 +7,53 @@
 
 #include <rte_byteorder.h>
 
-#define OTX_EP_RXD_ALIGN 2
-#define OTX_EP_TXD_ALIGN 2
+#define OTX_EP_RXD_ALIGN 8
+#define OTX_EP_TXD_ALIGN 8
 
 #define OTX_EP_IQ_SEND_FAILED      (-1)
 #define OTX_EP_IQ_SEND_SUCCESS     (0)
 
-#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10000
+#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10
 
 #define OTX_EP_FSZ 28
 #define OTX2_EP_FSZ 24
-#define OTX_EP_MAX_INSTR 16
+#define OTX_EP_MAX_INSTR 128
+
+/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
+#define OTX_EP_INFO_SIZE 8
+#define DROQ_REFILL_THRESHOLD 16
+#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
 
 static inline uint32_t
 otx_ep_incr_index(uint32_t index, uint32_t count, uint32_t max)
 {
 	return ((index + count) & (max - 1));
 }
+
 uint16_t
 otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
 uint16_t
 otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget);
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 0/3] rewrite fastpath routines
  2023-10-11  8:36 ` [PATCH v2 0/3] rewrite " Vamsi Attunuru
                     ` (2 preceding siblings ...)
  2023-10-11  8:36   ` [PATCH v2 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
@ 2023-10-11 12:53   ` Vamsi Attunuru
  2023-10-11 12:53     ` [PATCH v3 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
                       ` (3 more replies)
  3 siblings, 4 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11 12:53 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

This series adds new fastpath routines for cn10k & cn9k endpoint
devices and supports 32B Tx descriptor format, which improves
performance.

v2 & v3 changes:
- Fixed CI

Shijith Thotton (1):
  net/octeon_ep: support 32B IQ descriptor size

Vamsi Attunuru (2):
  net/octeon_ep: clean up receive routine
  net/octeon_ep: add new fastpath routines

 drivers/net/octeon_ep/cnxk_ep_rx.c    | 309 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 209 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |  12 +-
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |  11 +-
 drivers/net/octeon_ep/otx_ep_common.h | 127 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   | 255 +++++++--------------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 drivers/net/octeon_ep/otx_ep_vf.c     |   8 +
 11 files changed, 801 insertions(+), 252 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx.c
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_tx.c

-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 1/3] net/octeon_ep: support 32B IQ descriptor size
  2023-10-11 12:53   ` [PATCH v3 0/3] rewrite " Vamsi Attunuru
@ 2023-10-11 12:53     ` Vamsi Attunuru
  2023-10-11 12:53     ` [PATCH v3 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11 12:53 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton

From: Shijith Thotton <sthotton@marvell.com>

Update input queue setup to consider descriptor size in driver conf.
The default instruction size for otx2 and cnxk devices has been updated
to 32 bytes.

Signed-off-by: Shijith Thotton <sthotton@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx2_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx_ep_common.h |  4 ++++
 drivers/net/octeon_ep/otx_ep_vf.c     |  8 ++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 92c2d2ca5c..7b3669fe0c 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -106,6 +106,14 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(CNXK_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= CNXK_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_RSIZE(iq_no));
@@ -354,7 +362,7 @@ static const struct otx_ep_config default_cnxk_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index ced3a415a5..f72b8d25d7 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -256,6 +256,14 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(SDP_VF_R_IN_CTL_IS_64B);
+	else
+		reg_val |= SDP_VF_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + SDP_VF_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_RSIZE(iq_no));
@@ -500,7 +508,7 @@ static const struct otx_ep_config default_otx2_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index c150cbe619..90e059cad0 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -11,6 +11,7 @@
 
 #define OTX_EP_MAX_RINGS_PER_VF        (8)
 #define OTX_EP_CFG_IO_QUEUES        OTX_EP_MAX_RINGS_PER_VF
+#define OTX_EP_32BYTE_INSTR         (32)
 #define OTX_EP_64BYTE_INSTR         (64)
 /*
  * Backpressure for SDP is configured on Octeon, and the minimum queue sizes
@@ -215,6 +216,9 @@ struct otx_ep_instr_queue {
 	/* Number of  descriptors in this ring. */
 	uint32_t nb_desc;
 
+	/* Size of the descriptor. */
+	uint8_t desc_size;
+
 	/* Input ring index, where the driver should write the next packet */
 	uint32_t host_write_index;
 
diff --git a/drivers/net/octeon_ep/otx_ep_vf.c b/drivers/net/octeon_ep/otx_ep_vf.c
index 4f3538146b..236b7a874c 100644
--- a/drivers/net/octeon_ep/otx_ep_vf.c
+++ b/drivers/net/octeon_ep/otx_ep_vf.c
@@ -120,6 +120,14 @@ otx_ep_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 			return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (iq->desc_size == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(OTX_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= OTX_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + OTX_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	otx_ep_write64(iq->base_addr_dma, otx_ep->hw_addr,
 		       OTX_EP_R_IN_INSTR_BADDR(iq_no));
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 2/3] net/octeon_ep: clean up receive routine
  2023-10-11 12:53   ` [PATCH v3 0/3] rewrite " Vamsi Attunuru
  2023-10-11 12:53     ` [PATCH v3 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
@ 2023-10-11 12:53     ` Vamsi Attunuru
  2023-10-11 12:53     ` [PATCH v3 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
  2023-10-12  6:23     ` [PATCH v4 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11 12:53 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Patch improves the Rx routine and the packet count update routines;
the packet count update routines need to drain in-flight ISM memory
updates while decrementing the packet count register.
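
A minimal sketch of the drain pattern the hunks below implement
(illustrative only; drain_ism_count, cnt_reg, cnt_ism and ISM_REQUEST
are placeholder names, with ISM_REQUEST standing in for
OTX2_SDP_REQUEST_ISM; needs rte_io.h and rte_atomic.h):

    static inline void
    drain_ism_count(void *cnt_reg, uint32_t *cnt_ism, uint32_t val)
    {
            /* Subtract the consumed count from the HW counter. */
            rte_write32(val, cnt_reg);
            rte_mb();

            /* Re-request ISM snapshots until the in-memory copy reflects
             * the subtraction, so a stale in-flight ISM update cannot
             * overwrite the decremented count.
             */
            rte_write64(ISM_REQUEST, cnt_reg);
            while (__atomic_load_n(cnt_ism, __ATOMIC_RELAXED) >= val) {
                    rte_write64(ISM_REQUEST, cnt_reg);
                    rte_mb();
            }
    }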

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/otx_ep_rxtx.c | 162 ++++++++++++----------------
 1 file changed, 68 insertions(+), 94 deletions(-)

diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index b37fc8109f..4c509a419f 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -442,7 +442,14 @@ otx_vf_update_read_index(struct otx_ep_instr_queue *iq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, iq->inst_cnt_reg);
-		*iq->inst_cnt_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (__atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
 		iq->inst_cnt_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
@@ -567,9 +574,7 @@ prepare_xmit_gather_list(struct otx_ep_instr_queue *iq, struct rte_mbuf *m, uint
 
 	finfo = &iq->req_list[iq->host_write_index].finfo;
 	*dptr = rte_mem_virt2iova(finfo->g.sg);
-	ih->s.tlen = pkt_len + ih->s.fsz;
-	ih->s.gsz = frags;
-	ih->s.gather = 1;
+	ih->u64 |= ((1ULL << 62) | ((uint64_t)frags << 48) | (pkt_len + ih->s.fsz));
 
 	while (frags--) {
 		finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
@@ -752,36 +757,26 @@ otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
-	struct otx_ep_droq_desc *desc_ring;
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
 	struct otx_ep_droq_info *info;
 	struct rte_mbuf *buf = NULL;
 	uint32_t desc_refilled = 0;
 
-	desc_ring = droq->desc_ring;
-
 	while (droq->refill_count && (desc_refilled < droq->nb_desc)) {
-		/* If a valid buffer exists (happens if there is no dispatch),
-		 * reuse the buffer, else allocate.
-		 */
-		if (droq->recv_buf_list[droq->refill_idx] != NULL)
-			break;
-
 		buf = rte_pktmbuf_alloc(droq->mpool);
 		/* If a buffer could not be allocated, no point in
 		 * continuing
 		 */
-		if (buf == NULL) {
+		if (unlikely(!buf)) {
 			droq->stats.rx_alloc_failure++;
 			break;
 		}
 		info = rte_pktmbuf_mtod(buf, struct otx_ep_droq_info *);
-		memset(info, 0, sizeof(*info));
+		info->length = 0;
 
 		droq->recv_buf_list[droq->refill_idx] = buf;
 		desc_ring[droq->refill_idx].buffer_ptr =
 					rte_mbuf_data_iova_default(buf);
-
-
 		droq->refill_idx = otx_ep_incr_index(droq->refill_idx, 1,
 				droq->nb_desc);
 
@@ -793,21 +788,18 @@ otx_ep_droq_refill(struct otx_ep_droq *droq)
 }
 
 static struct rte_mbuf *
-otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
-			struct otx_ep_droq *droq, int next_fetch)
+otx_ep_droq_read_packet(struct otx_ep_device *otx_ep, struct otx_ep_droq *droq, int next_fetch)
 {
 	volatile struct otx_ep_droq_info *info;
-	struct rte_mbuf *droq_pkt2 = NULL;
-	struct rte_mbuf *droq_pkt = NULL;
-	struct rte_net_hdr_lens hdr_lens;
-	struct otx_ep_droq_info *info2;
+	struct rte_mbuf *mbuf_next = NULL;
+	struct rte_mbuf *mbuf = NULL;
 	uint64_t total_pkt_len;
 	uint32_t pkt_len = 0;
 	int next_idx;
 
-	droq_pkt  = droq->recv_buf_list[droq->read_idx];
-	droq_pkt2  = droq->recv_buf_list[droq->read_idx];
-	info = rte_pktmbuf_mtod(droq_pkt, struct otx_ep_droq_info *);
+	mbuf = droq->recv_buf_list[droq->read_idx];
+	info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
 	/* make sure info is available */
 	rte_rmb();
 	if (unlikely(!info->length)) {
@@ -828,32 +820,25 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 			assert(0);
 		}
 	}
+
 	if (next_fetch) {
 		next_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
-		droq_pkt2  = droq->recv_buf_list[next_idx];
-		info2 = rte_pktmbuf_mtod(droq_pkt2, struct otx_ep_droq_info *);
-		rte_prefetch_non_temporal((const void *)info2);
+		mbuf_next = droq->recv_buf_list[next_idx];
+		rte_prefetch0(rte_pktmbuf_mtod(mbuf_next, void *));
 	}
 
-	info->length = rte_bswap64(info->length);
+	info->length = rte_bswap16(info->length >> 48);
 	/* Deduce the actual data size */
 	total_pkt_len = info->length + OTX_EP_INFO_SIZE;
 	if (total_pkt_len <= droq->buffer_size) {
-		droq_pkt  = droq->recv_buf_list[droq->read_idx];
-		if (likely(droq_pkt != NULL)) {
-			droq_pkt->data_off += OTX_EP_INFO_SIZE;
-			/* otx_ep_dbg("OQ: pkt_len[%ld], buffer_size %d\n",
-			 * (long)info->length, droq->buffer_size);
-			 */
-			pkt_len = (uint32_t)info->length;
-			droq_pkt->pkt_len  = pkt_len;
-			droq_pkt->data_len  = pkt_len;
-			droq_pkt->port = otx_ep->port_id;
-			droq->recv_buf_list[droq->read_idx] = NULL;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
-			droq->refill_count++;
-		}
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		pkt_len = (uint32_t)info->length;
+		mbuf->pkt_len  = pkt_len;
+		mbuf->data_len  = pkt_len;
+		mbuf->port = otx_ep->port_id;
+		droq->recv_buf_list[droq->read_idx] = NULL;
+		droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
+		droq->refill_count++;
 	} else {
 		struct rte_mbuf *first_buf = NULL;
 		struct rte_mbuf *last_buf = NULL;
@@ -865,61 +850,50 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 		while (pkt_len < total_pkt_len) {
 			int cpy_len = 0;
 
-			cpy_len = ((pkt_len + droq->buffer_size) >
-					total_pkt_len)
-					? ((uint32_t)total_pkt_len -
-						pkt_len)
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len)
 					: droq->buffer_size;
 
-			droq_pkt = droq->recv_buf_list[droq->read_idx];
+			mbuf = droq->recv_buf_list[droq->read_idx];
 			droq->recv_buf_list[droq->read_idx] = NULL;
 
-			if (likely(droq_pkt != NULL)) {
+			if (likely(mbuf)) {
 				/* Note the first seg */
 				if (!pkt_len)
-					first_buf = droq_pkt;
+					first_buf = mbuf;
 
-				droq_pkt->port = otx_ep->port_id;
+				mbuf->port = otx_ep->port_id;
 				if (!pkt_len) {
-					droq_pkt->data_off +=
-						OTX_EP_INFO_SIZE;
-					droq_pkt->pkt_len =
-						cpy_len - OTX_EP_INFO_SIZE;
-					droq_pkt->data_len =
-						cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_off += OTX_EP_INFO_SIZE;
+					mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
 				} else {
-					droq_pkt->pkt_len = cpy_len;
-					droq_pkt->data_len = cpy_len;
+					mbuf->pkt_len = cpy_len;
+					mbuf->data_len = cpy_len;
 				}
 
 				if (pkt_len) {
 					first_buf->nb_segs++;
-					first_buf->pkt_len += droq_pkt->pkt_len;
+					first_buf->pkt_len += mbuf->pkt_len;
 				}
 
 				if (last_buf)
-					last_buf->next = droq_pkt;
+					last_buf->next = mbuf;
 
-				last_buf = droq_pkt;
+				last_buf = mbuf;
 			} else {
 				otx_ep_err("no buf\n");
 				assert(0);
 			}
 
 			pkt_len += cpy_len;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
 			droq->refill_count++;
 		}
-		droq_pkt = first_buf;
+		mbuf = first_buf;
 	}
-	droq_pkt->packet_type = rte_net_get_ptype(droq_pkt, &hdr_lens,
-					RTE_PTYPE_ALL_MASK);
-	droq_pkt->l2_len = hdr_lens.l2_len;
-	droq_pkt->l3_len = hdr_lens.l3_len;
-	droq_pkt->l4_len = hdr_lens.l4_len;
 
-	return droq_pkt;
+	return mbuf;
 }
 
 static inline uint32_t
@@ -943,7 +917,14 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, droq->pkts_sent_reg);
-		*droq->pkts_sent_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
 		droq->pkts_sent_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
@@ -952,36 +933,30 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 	return new_pkts;
 }
 
+static inline int32_t __rte_hot
+otx_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (unlikely(droq->pkts_pending < nb_pkts))
+		otx_ep_check_droq_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
 /* Check for response arrival from OCTEON 9
  * returns number of requests completed
  */
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget)
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	struct otx_ep_droq *droq = rx_queue;
 	struct otx_ep_device *otx_ep;
 	struct rte_mbuf *oq_pkt;
-
-	uint32_t pkts = 0;
+	uint16_t pkts, new_pkts;
 	uint32_t valid_pkts = 0;
-	uint32_t new_pkts = 0;
 	int next_fetch;
 
 	otx_ep = droq->otx_ep_dev;
-
-	if (droq->pkts_pending > budget) {
-		new_pkts = budget;
-	} else {
-		new_pkts = droq->pkts_pending;
-		new_pkts += otx_ep_check_droq_pkts(droq);
-		if (new_pkts > budget)
-			new_pkts = budget;
-	}
-
-	if (!new_pkts)
-		goto update_credit; /* No pkts at this moment */
+	new_pkts = otx_ep_rx_pkts_to_process(droq, nb_pkts);
 
 	for (pkts = 0; pkts < new_pkts; pkts++) {
 		/* Push the received pkt to application */
@@ -1006,7 +981,6 @@ otx_ep_recv_pkts(void *rx_queue,
 	droq->pkts_pending -= pkts;
 
 	/* Refill DROQ buffers */
-update_credit:
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		int desc_refilled = otx_ep_droq_refill(droq);
 
@@ -1014,7 +988,7 @@ otx_ep_recv_pkts(void *rx_queue,
 		 * that when we update the credits the data in memory is
 		 * accurate.
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		rte_write32(desc_refilled, droq->pkts_credit_reg);
 	} else {
 		/*
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 3/3] net/octeon_ep: add new fastpath routines
  2023-10-11 12:53   ` [PATCH v3 0/3] rewrite " Vamsi Attunuru
  2023-10-11 12:53     ` [PATCH v3 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
  2023-10-11 12:53     ` [PATCH v3 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
@ 2023-10-11 12:53     ` Vamsi Attunuru
  2023-10-12  6:23     ` [PATCH v4 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-11 12:53 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Adds new fastpath routines for cn10k & cn9k endpoint
devices and assigns them based on the offload flags.

Patch also adds misc changes to improve performance
and code readability.
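
As a rough illustration of how the burst routines get assigned (sketch
only; the helper name select_rx_burst is made up here, the driver itself
uses otx_ep_set_rx_func() as in the ethdev hunk below, and the symbols
assume otx_ep_common.h / otx_ep_rxtx.h):

    static eth_rx_burst_t
    select_rx_burst(struct otx_ep_device *otx_epvf)
    {
            /* CN10K: common cnxk routine, mseg variant with Rx scatter. */
            if (otx_epvf->chip_gen == OTX_EP_CN10XX)
                    return (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER) ?
                            cnxk_ep_recv_pkts_mseg : cnxk_ep_recv_pkts;

            /* CN9K: same flow plus the SDP doorbell drop-state workaround. */
            if (otx_epvf->chip_gen == OTX_EP_CN9XX)
                    return (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER) ?
                            cn9k_ep_recv_pkts_mseg : cn9k_ep_recv_pkts;

            /* Older endpoint devices keep the generic routine. */
            return otx_ep_recv_pkts;
    }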

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx.c    | 309 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 209 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |   2 +
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |   1 +
 drivers/net/octeon_ep/otx_ep_common.h | 125 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   |  93 +-------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 10 files changed, 704 insertions(+), 157 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
new file mode 100644
index 0000000000..74f0011283
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "otx_ep_common.h"
+#include "otx2_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static inline int
+cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
+{
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t refill_idx = droq->refill_idx;
+	struct rte_mbuf *buf;
+	uint32_t i;
+	int rc;
+
+	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	if (unlikely(rc)) {
+		droq->stats.rx_alloc_failure++;
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		buf = recv_buf_list[refill_idx];
+		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
+		refill_idx++;
+	}
+
+	droq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc);
+	droq->refill_count -= count;
+
+	return 0;
+}
+
+static inline void
+cnxk_ep_rx_refill(struct otx_ep_droq *droq)
+{
+	uint32_t desc_refilled = 0, count;
+	uint32_t nb_desc = droq->nb_desc;
+	uint32_t refill_idx = droq->refill_idx;
+	int rc;
+
+	if (unlikely(droq->read_idx == refill_idx))
+		return;
+
+	if (refill_idx < droq->read_idx) {
+		count = droq->read_idx - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled = count;
+	} else {
+		count = nb_desc - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+
+		desc_refilled = count;
+		count = droq->read_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled += count;
+	}
+
+	/* Flush the droq descriptor data to memory to be sure
+	 * that when we update the credits the data in memory is
+	 * accurate.
+	 */
+	rte_io_wmb();
+	rte_write32(desc_refilled, droq->pkts_credit_reg);
+}
+
+static inline uint32_t
+cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq)
+{
+	uint32_t new_pkts;
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = __atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED);
+	new_pkts = val - droq->pkts_sent_ism_prev;
+	droq->pkts_sent_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, droq->pkts_sent_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
+		droq->pkts_sent_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+	droq->pkts_pending += new_pkts;
+
+	return new_pkts;
+}
+
+static inline int16_t __rte_hot
+cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (droq->pkts_pending < nb_pkts)
+		cnxk_ep_check_rx_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *mbuf;
+		uint16_t pkt_len;
+
+		mbuf = recv_buf_list[read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
+		pkt_len = rte_bswap16(info->length >> 48);
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		mbuf->pkt_len = pkt_len;
+		mbuf->data_len = pkt_len;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+	droq->read_idx = read_idx;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar_mseg(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+				 uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t total_pkt_len, bytes_rsvd = 0;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *first_buf = NULL;
+		struct rte_mbuf *last_buf = NULL;
+		struct rte_mbuf *mbuf;
+		uint32_t pkt_len = 0;
+
+		mbuf = recv_buf_list[droq->read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
+		total_pkt_len = rte_bswap16(info->length >> 48) + OTX_EP_INFO_SIZE;
+
+		while (pkt_len < total_pkt_len) {
+			int cpy_len;
+
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len) : droq->buffer_size;
+
+			mbuf = droq->recv_buf_list[droq->read_idx];
+
+			if (!pkt_len) {
+				/* Note the first seg */
+				first_buf = mbuf;
+				mbuf->data_off += OTX_EP_INFO_SIZE;
+				mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+				mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
+			} else {
+				mbuf->pkt_len = cpy_len;
+				mbuf->data_len = cpy_len;
+				first_buf->nb_segs++;
+				first_buf->pkt_len += mbuf->pkt_len;
+			}
+
+			if (last_buf)
+				last_buf->next = mbuf;
+
+			last_buf = mbuf;
+
+			pkt_len += cpy_len;
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, nb_desc);
+			droq->refill_count++;
+		}
+		mbuf = first_buf;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= pkts;
+	/* Stats */
+	droq->stats.pkts_received += pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_tx.c b/drivers/net/octeon_ep/cnxk_ep_tx.c
new file mode 100644
index 0000000000..9f11a2f317
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_tx.c
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static uint32_t
+cnxk_vf_update_read_index(struct otx_ep_instr_queue *iq)
+{
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = __atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED);
+	iq->inst_cnt += val - iq->inst_cnt_ism_prev;
+	iq->inst_cnt_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, iq->inst_cnt_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (__atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
+		iq->inst_cnt_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+
+	/* Modulo of the new index with the IQ size will give us
+	 * the new index.
+	 */
+	return iq->inst_cnt & (iq->nb_desc - 1);
+}
+
+static inline void
+cnxk_ep_flush_iq(struct otx_ep_instr_queue *iq)
+{
+	uint32_t instr_processed = 0;
+	uint32_t cnt = 0;
+
+	iq->otx_read_index = cnxk_vf_update_read_index(iq);
+
+	if (unlikely(iq->flush_index == iq->otx_read_index))
+		return;
+
+	if (iq->flush_index < iq->otx_read_index) {
+		instr_processed = iq->otx_read_index - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+	} else {
+		cnt = iq->nb_desc - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], cnt);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, cnt, iq->nb_desc);
+
+		instr_processed = iq->otx_read_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+
+		instr_processed += cnt;
+	}
+
+	iq->stats.instr_processed = instr_processed;
+	iq->instr_pending -= instr_processed;
+}
+
+static inline void
+set_sg_size(struct otx_ep_sg_entry *sg_entry, uint16_t size, uint32_t pos)
+{
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+	sg_entry->u.size[pos] = size;
+#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	sg_entry->u.size[(OTX_EP_NUM_SG_PTRS - 1) - pos] = size;
+#endif
+}
+
+static __rte_always_inline void
+cnxk_ep_xmit_pkts_scalar(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq, uint16_t nb_pkts)
+{
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len;
+	uint32_t tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		m = tx_pkts[pkts];
+		iq->mbuf_list[write_idx] = m;
+		pkt_len = rte_pktmbuf_data_len(m);
+
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->ih.u64 = iq->partial_ih | pkt_len;
+		iqcmd->dptr = rte_mbuf_data_iova(m); /*dptr*/
+		tx_bytes += pkt_len;
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+	}
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+}
+
+static __rte_always_inline uint16_t
+cnxk_ep_xmit_pkts_scalar_mseg(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq,
+			      uint16_t nb_pkts)
+{
+	uint16_t frags, num_sg, mask = OTX_EP_NUM_SG_PTRS - 1;
+	struct otx_ep_buf_free_info *finfo;
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len, tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		uint16_t j = 0;
+
+		m = tx_pkts[pkts];
+		frags = m->nb_segs;
+
+		pkt_len = rte_pktmbuf_pkt_len(m);
+		num_sg = (frags + mask) / OTX_EP_NUM_SG_PTRS;
+
+		if (unlikely(pkt_len > OTX_EP_MAX_PKT_SZ && num_sg > OTX_EP_MAX_SG_LISTS)) {
+			otx_ep_err("Failed to xmit the pkt, pkt_len is higher or pkt has more segments\n");
+			goto exit;
+		}
+
+		finfo = &iq->req_list[write_idx].finfo;
+
+		iq->mbuf_list[write_idx] = m;
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->dptr = rte_mem_virt2iova(finfo->g.sg);
+		iqcmd->ih.u64 = iq->partial_ih | (1ULL << 62) | ((uint64_t)frags << 48) | pkt_len;
+
+		while (frags--) {
+			finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
+			set_sg_size(&finfo->g.sg[(j >> 2)], m->data_len, (j & mask));
+			j++;
+			m = m->next;
+		}
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+		tx_bytes += pkt_len;
+	}
+exit:
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	cnxk_ep_xmit_pkts_scalar(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	pkts = cnxk_ep_xmit_pkts_scalar_mseg(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 7b3669fe0c..ef275703c3 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -156,6 +156,8 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (void *)iq->inst_cnt_ism, ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
+
 	return 0;
 }
 
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.h b/drivers/net/octeon_ep/cnxk_ep_vf.h
index 86277449ea..41d8fbbb3a 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.h
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.h
@@ -6,6 +6,8 @@
 
 #include <rte_io.h>
 
+#include "otx_ep_common.h"
+
 #define CNXK_CONFIG_XPANSION_BAR             0x38
 #define CNXK_CONFIG_PCIE_CAP                 0x70
 #define CNXK_CONFIG_PCIE_DEVCAP              0x74
@@ -178,6 +180,17 @@ struct cnxk_ep_instr_64B {
 	uint64_t exhdr[4];
 };
 
+struct cnxk_ep_instr_32B {
+	/* Pointer where the input data is available. */
+	uint64_t dptr;
+
+	/* OTX_EP Instruction Header. */
+	union otx_ep_instr_ih ih;
+
+	/* Misc data bytes that can be passed as front data */
+	uint64_t rsvd[2];
+};
+
 #define CNXK_EP_IQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue) + 4)
 #define CNXK_EP_OQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue))
 #define CNXK_EP_ISM_EN                  (0x1)
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e698bf9792..749776d70c 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -9,4 +9,6 @@ sources = files(
         'otx2_ep_vf.c',
         'cnxk_ep_vf.c',
         'otx_ep_mbox.c',
+        'cnxk_ep_rx.c',
+        'cnxk_ep_tx.c',
 )
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index f72b8d25d7..7f4edf8dcf 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -307,6 +307,7 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (unsigned int)ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
 
 	return 0;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index 90e059cad0..82e57520d3 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -4,7 +4,20 @@
 #ifndef _OTX_EP_COMMON_H_
 #define _OTX_EP_COMMON_H_
 
+#include <rte_bitops.h>
 #include <rte_spinlock.h>
+#include <unistd.h>
+#include <assert.h>
+#include <rte_eal.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_io.h>
+#include <rte_net.h>
+#include <ethdev_pci.h>
+
+#define OTX_EP_CN8XX  RTE_BIT32(0)
+#define OTX_EP_CN9XX  RTE_BIT32(1)
+#define OTX_EP_CN10XX RTE_BIT32(2)
 
 #define OTX_EP_NW_PKT_OP               0x1220
 #define OTX_EP_NW_CMD_OP               0x1221
@@ -38,7 +51,7 @@
 #define OTX_EP_NORESP_OHSM_SEND     (4)
 #define OTX_EP_NORESP_LAST          (4)
 #define OTX_EP_PCI_RING_ALIGN   65536
-#define OTX_EP_MAX_SG_LISTS 4
+#define OTX_EP_MAX_SG_LISTS 6
 #define OTX_EP_NUM_SG_PTRS 4
 #define SDP_PKIND 40
 #define SDP_OTX2_PKIND 57
@@ -203,6 +216,38 @@ struct otx_ep_iq_config {
  *  such structure to represent it.
  */
 struct otx_ep_instr_queue {
+	/* Location in memory updated by SDP ISM */
+	uint32_t *inst_cnt_ism;
+	struct rte_mbuf **mbuf_list;
+	/* Pointer to the Virtual Base addr of the input ring. */
+	uint8_t *base_addr;
+
+	/* track inst count locally to consolidate HW counter updates */
+	uint32_t inst_cnt_ism_prev;
+
+	/* Input ring index, where the driver should write the next packet */
+	uint32_t host_write_index;
+
+	/* Input ring index, where the OCTEON 9 should read the next packet */
+	uint32_t otx_read_index;
+	/** This index aids in finding the window in the queue where OCTEON 9
+	 *  has read the commands.
+	 */
+	uint32_t flush_index;
+	/* This keeps track of the instructions pending in this queue. */
+	uint64_t instr_pending;
+
+	/* Memory zone */
+	const struct rte_memzone *iq_mz;
+	/* OTX_EP doorbell register for the ring. */
+	void *doorbell_reg;
+
+	/* OTX_EP instruction count register for this ring. */
+	void *inst_cnt_reg;
+
+	/* Number of instructions pending to be posted to OCTEON 9. */
+	uint32_t fill_cnt;
+
 	struct otx_ep_device *otx_ep_dev;
 
 	uint32_t q_no;
@@ -219,54 +264,21 @@ struct otx_ep_instr_queue {
 	/* Size of the descriptor. */
 	uint8_t desc_size;
 
-	/* Input ring index, where the driver should write the next packet */
-	uint32_t host_write_index;
-
-	/* Input ring index, where the OCTEON 9 should read the next packet */
-	uint32_t otx_read_index;
-
 	uint32_t reset_instr_cnt;
 
-	/** This index aids in finding the window in the queue where OCTEON 9
-	 *  has read the commands.
-	 */
-	uint32_t flush_index;
-
 	/* Free-running/wrapping instruction counter for IQ. */
 	uint32_t inst_cnt;
 
-	/* This keeps track of the instructions pending in this queue. */
-	uint64_t instr_pending;
-
-	/* Pointer to the Virtual Base addr of the input ring. */
-	uint8_t *base_addr;
+	uint64_t partial_ih;
 
 	/* This IQ request list */
 	struct otx_ep_instr_list *req_list;
 
-	/* OTX_EP doorbell register for the ring. */
-	void *doorbell_reg;
-
-	/* OTX_EP instruction count register for this ring. */
-	void *inst_cnt_reg;
-
-	/* Number of instructions pending to be posted to OCTEON 9. */
-	uint32_t fill_cnt;
-
 	/* Statistics for this input queue. */
 	struct otx_ep_iq_stats stats;
 
 	/* DMA mapped base address of the input descriptor ring. */
 	uint64_t base_addr_dma;
-
-	/* Memory zone */
-	const struct rte_memzone *iq_mz;
-
-	/* Location in memory updated by SDP ISM */
-	uint32_t *inst_cnt_ism;
-
-	/* track inst count locally to consolidate HW counter updates */
-	uint32_t inst_cnt_ism_prev;
 };
 
 /** Descriptor format.
@@ -344,14 +356,17 @@ struct otx_ep_oq_config {
 
 /* The Descriptor Ring Output Queue(DROQ) structure. */
 struct otx_ep_droq {
-	struct otx_ep_device *otx_ep_dev;
 	/* The 8B aligned descriptor ring starts at this address. */
 	struct otx_ep_droq_desc *desc_ring;
 
-	uint32_t q_no;
-	uint64_t last_pkt_count;
+	/* The 8B aligned info ptrs begin from this address. */
+	struct otx_ep_droq_info *info_list;
 
-	struct rte_mempool *mpool;
+	/* receive buffer list contains mbuf ptr list */
+	struct rte_mbuf **recv_buf_list;
+
+	/* Packets pending to be processed */
+	uint64_t pkts_pending;
 
 	/* Driver should read the next packet at this index */
 	uint32_t read_idx;
@@ -362,22 +377,17 @@ struct otx_ep_droq {
 	/* At this index, the driver will refill the descriptor's buffer */
 	uint32_t refill_idx;
 
-	/* Packets pending to be processed */
-	uint64_t pkts_pending;
+	/* The number of descriptors pending to refill. */
+	uint32_t refill_count;
 
 	/* Number of descriptors in this ring. */
 	uint32_t nb_desc;
 
-	/* The number of descriptors pending to refill. */
-	uint32_t refill_count;
-
 	uint32_t refill_threshold;
 
-	/* The 8B aligned info ptrs begin from this address. */
-	struct otx_ep_droq_info *info_list;
+	uint64_t last_pkt_count;
 
-	/* receive buffer list contains mbuf ptr list */
-	struct rte_mbuf **recv_buf_list;
+	struct rte_mempool *mpool;
 
 	/* The size of each buffer pointed by the buffer pointer. */
 	uint32_t buffer_size;
@@ -392,6 +402,13 @@ struct otx_ep_droq {
 	 */
 	void *pkts_sent_reg;
 
+	/* Pointer to host memory copy of output packet count, set by ISM */
+	uint32_t *pkts_sent_ism;
+	uint32_t pkts_sent_ism_prev;
+
+	/* Statistics for this DROQ. */
+	struct otx_ep_droq_stats stats;
+
 	/** Handle DMA incompletion during pkt reads.
 	 * This variable is used to initiate a sent_reg_read
 	 * that completes pending dma
@@ -400,8 +417,9 @@ struct otx_ep_droq {
 	 */
 	uint32_t sent_reg_val;
 
-	/* Statistics for this DROQ. */
-	struct otx_ep_droq_stats stats;
+	uint32_t q_no;
+
+	struct otx_ep_device *otx_ep_dev;
 
 	/* DMA mapped address of the DROQ descriptor ring. */
 	size_t desc_ring_dma;
@@ -419,10 +437,6 @@ struct otx_ep_droq {
 	const struct rte_memzone *desc_ring_mz;
 
 	const struct rte_memzone *info_mz;
-
-	/* Pointer to host memory copy of output packet count, set by ISM */
-	uint32_t *pkts_sent_ism;
-	uint32_t pkts_sent_ism_prev;
 };
 #define OTX_EP_DROQ_SIZE		(sizeof(struct otx_ep_droq))
 
@@ -545,6 +559,9 @@ struct otx_ep_device {
 
 	/* Negotiated Mbox version */
 	uint32_t mbox_neg_ver;
+
+	/* Generation */
+	uint32_t chip_gen;
 };
 
 int otx_ep_setup_iqs(struct otx_ep_device *otx_ep, uint32_t iq_no,
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 57b965ad06..e965cbaa16 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -27,6 +27,46 @@ static const struct rte_eth_desc_lim otx_ep_tx_desc_lim = {
 	.nb_align	= OTX_EP_TXD_ALIGN,
 };
 
+static void
+otx_ep_set_tx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX || otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
+		if (otx_epvf->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS)
+			eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts_mseg;
+	} else {
+		eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].tx_pkt_burst =
+			eth_dev->tx_pkt_burst;
+}
+
+static void
+otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX) {
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
+	} else if (otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
+	} else {
+		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].rx_pkt_burst =
+			eth_dev->rx_pkt_burst;
+}
+
 static int
 otx_ep_dev_info_get(struct rte_eth_dev *eth_dev,
 		    struct rte_eth_dev_info *devinfo)
@@ -154,6 +194,10 @@ otx_ep_dev_start(struct rte_eth_dev *eth_dev)
 	}
 
 	otx_ep_dev_link_update(eth_dev, 0);
+
+	otx_ep_set_tx_func(eth_dev);
+	otx_ep_set_rx_func(eth_dev);
+
 	otx_ep_info("dev started\n");
 
 	return 0;
@@ -255,18 +299,23 @@ otx_epdev_init(struct otx_ep_device *otx_epvf)
 
 	otx_epvf->fn_list.setup_device_regs(otx_epvf);
 
+	otx_epvf->eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
 	otx_epvf->eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF)
+	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF) {
 		otx_epvf->eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
-	else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
+		otx_epvf->chip_gen = OTX_EP_CN8XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CN98XX_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CNF95N_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
-		otx_epvf->eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN9XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN10XX;
 	} else {
 		otx_ep_err("Invalid chip_id\n");
 		ret = -EINVAL;
@@ -656,8 +705,8 @@ otx_ep_eth_dev_init(struct rte_eth_dev *eth_dev)
 	/* Single process support */
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
 		eth_dev->dev_ops = &otx_ep_eth_dev_ops;
-		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-		eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		otx_ep_set_tx_func(eth_dev);
+		otx_ep_set_rx_func(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index 4c509a419f..c421ef0a1c 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -13,15 +13,8 @@
 
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
-#include "otx2_ep_vf.h"
 #include "otx_ep_rxtx.h"
 
-/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
-#define OTX_EP_INFO_SIZE 8
-#define OTX_EP_FSZ_FS0 0
-#define DROQ_REFILL_THRESHOLD 16
-#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
-
 static void
 otx_ep_dmazone_free(const struct rte_memzone *mz)
 {
@@ -144,6 +137,13 @@ otx_ep_init_instr_queue(struct otx_ep_device *otx_ep, int iq_no, int num_descs,
 		     iq_no, iq->base_addr, (unsigned long)iq->base_addr_dma,
 		     iq->nb_desc);
 
+	iq->mbuf_list = rte_zmalloc_socket("mbuf_list",	(iq->nb_desc * sizeof(struct rte_mbuf *)),
+					   RTE_CACHE_LINE_SIZE, rte_socket_id());
+	if (!iq->mbuf_list) {
+		otx_ep_err("IQ[%d] mbuf_list alloc failed\n", iq_no);
+		goto iq_init_fail;
+	}
+
 	iq->otx_ep_dev = otx_ep;
 	iq->q_no = iq_no;
 	iq->fill_cnt = 0;
@@ -675,85 +675,6 @@ otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 	return count;
 }
 
-/* Enqueue requests/packets to OTX_EP IQ queue.
- * returns number of requests enqueued successfully
- */
-uint16_t
-otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
-{
-	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
-	struct otx_ep_device *otx_ep = iq->otx_ep_dev;
-	struct otx2_ep_instr_64B iqcmd2;
-	uint32_t iqreq_type;
-	struct rte_mbuf *m;
-	uint32_t pkt_len;
-	int count = 0;
-	uint16_t i;
-	int dbell;
-	int index;
-
-	iqcmd2.ih.u64 = 0;
-	iqcmd2.irh.u64 = 0;
-
-	/* ih invars */
-	iqcmd2.ih.s.fsz = OTX_EP_FSZ_FS0;
-	iqcmd2.ih.s.pkind = otx_ep->pkind; /* The SDK decided PKIND value */
-	/* irh invars */
-	iqcmd2.irh.s.opcode = OTX_EP_NW_PKT_OP;
-
-	for (i = 0; i < nb_pkts; i++) {
-		m = pkts[i];
-		if (m->nb_segs == 1) {
-			pkt_len = rte_pktmbuf_data_len(m);
-			iqcmd2.ih.s.tlen = pkt_len + iqcmd2.ih.s.fsz;
-			iqcmd2.dptr = rte_mbuf_data_iova(m); /*dptr*/
-			iqcmd2.ih.s.gather = 0;
-			iqcmd2.ih.s.gsz = 0;
-			iqreq_type = OTX_EP_REQTYPE_NORESP_NET;
-		} else {
-			if (!(otx_ep->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS))
-				goto xmit_fail;
-
-			if (unlikely(prepare_xmit_gather_list(iq, m, &iqcmd2.dptr, &iqcmd2.ih) < 0))
-				goto xmit_fail;
-
-			pkt_len = rte_pktmbuf_pkt_len(m);
-			iqreq_type = OTX_EP_REQTYPE_NORESP_GATHER;
-		}
-
-		iqcmd2.irh.u64 = rte_bswap64(iqcmd2.irh.u64);
-
-#ifdef OTX_EP_IO_DEBUG
-		otx_ep_dbg("After swapping\n");
-		otx_ep_dbg("Word0 [dptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.dptr);
-		otx_ep_dbg("Word1 [ihtx]: 0x%016lx\n", (unsigned long)iqcmd.ih);
-		otx_ep_dbg("Word2 [pki_ih3]: 0x%016lx\n",
-			   (unsigned long)iqcmd.pki_ih3);
-		otx_ep_dbg("Word3 [rptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.rptr);
-		otx_ep_dbg("Word4 [irh]: 0x%016lx\n", (unsigned long)iqcmd.irh);
-		otx_ep_dbg("Word5 [exhdr[0]]: 0x%016lx\n",
-			   (unsigned long)iqcmd.exhdr[0]);
-#endif
-		index = iq->host_write_index;
-		dbell = (i == (unsigned int)(nb_pkts - 1)) ? 1 : 0;
-		if (otx_ep_send_data(otx_ep, iq, &iqcmd2, dbell))
-			goto xmit_fail;
-		otx_ep_iqreq_add(iq, m, iqreq_type, index);
-		iq->stats.tx_pkts++;
-		iq->stats.tx_bytes += pkt_len;
-		count++;
-	}
-
-xmit_fail:
-	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
-		otx_ep_flush_iq(iq);
-
-	/* Return no# of instructions posted successfully. */
-	return count;
-}
-
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 3f12527004..cb68ef3b41 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -7,29 +7,53 @@
 
 #include <rte_byteorder.h>
 
-#define OTX_EP_RXD_ALIGN 2
-#define OTX_EP_TXD_ALIGN 2
+#define OTX_EP_RXD_ALIGN 8
+#define OTX_EP_TXD_ALIGN 8
 
 #define OTX_EP_IQ_SEND_FAILED      (-1)
 #define OTX_EP_IQ_SEND_SUCCESS     (0)
 
-#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10000
+#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10
 
 #define OTX_EP_FSZ 28
 #define OTX2_EP_FSZ 24
-#define OTX_EP_MAX_INSTR 16
+#define OTX_EP_MAX_INSTR 128
+
+/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
+#define OTX_EP_INFO_SIZE 8
+#define DROQ_REFILL_THRESHOLD 16
+#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
 
 static inline uint32_t
 otx_ep_incr_index(uint32_t index, uint32_t count, uint32_t max)
 {
 	return ((index + count) & (max - 1));
 }
+
 uint16_t
 otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
 uint16_t
 otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget);
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 0/3] rewrite fastpath routines
  2023-10-11 12:53   ` [PATCH v3 0/3] rewrite " Vamsi Attunuru
                       ` (2 preceding siblings ...)
  2023-10-11 12:53     ` [PATCH v3 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
@ 2023-10-12  6:23     ` Vamsi Attunuru
  2023-10-12  6:23       ` [PATCH v4 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
                         ` (3 more replies)
  3 siblings, 4 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-12  6:23 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

This series adds new fastpath routines for cn10k & cn9k endpoint
devices and supports the 32B Tx descriptor format, which improves
performance.

v4 changes:
- Use rte_atomic_xxx instead of __atomic_xxx built-ins (see the sketch after this list)

v2 & v3 changes:
- Fixed CI
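
The atomic conversion is mechanical; roughly (sketch, using the DROQ
packet counter from the 2/3 patch as the example):

    /* v3: compiler built-ins */
    val = __atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED);

    /* v4: DPDK stdatomic wrappers */
    val = rte_atomic_load_explicit(droq->pkts_sent_ism, rte_memory_order_relaxed);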

Shijith Thotton (1):
  net/octeon_ep: support 32B IQ descriptor size

Vamsi Attunuru (2):
  net/octeon_ep: clean up receive routine
  net/octeon_ep: add new fastpath routines

 drivers/net/octeon_ep/cnxk_ep_rx.c    | 310 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 210 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |  12 +-
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |  11 +-
 drivers/net/octeon_ep/otx_ep_common.h | 127 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   | 257 +++++++--------------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 drivers/net/octeon_ep/otx_ep_vf.c     |   8 +
 11 files changed, 805 insertions(+), 252 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx.c
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_tx.c

-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 1/3] net/octeon_ep: support 32B IQ descriptor size
  2023-10-12  6:23     ` [PATCH v4 0/3] rewrite " Vamsi Attunuru
@ 2023-10-12  6:23       ` Vamsi Attunuru
  2023-10-12  6:23       ` [PATCH v4 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-12  6:23 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton

From: Shijith Thotton <sthotton@marvell.com>

Update input queue setup to consider descriptor size in driver conf.
The default instruction size for otx2 and cnxk devices has been updated
to 32 bytes.
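
For context, the 32B command format (added by the 3/3 patch in
cnxk_ep_vf.h) is half the size of the 64B one and carries just the data
pointer, the instruction header and front data:

    struct cnxk_ep_instr_32B {
            /* Pointer where the input data is available. */
            uint64_t dptr;
            /* OTX_EP Instruction Header. */
            union otx_ep_instr_ih ih;
            /* Misc data bytes that can be passed as front data */
            uint64_t rsvd[2];
    };      /* 32 bytes, vs the 64-byte cnxk_ep_instr_64B */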

Signed-off-by: Shijith Thotton <sthotton@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx2_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx_ep_common.h |  4 ++++
 drivers/net/octeon_ep/otx_ep_vf.c     |  8 ++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 92c2d2ca5c..7b3669fe0c 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -106,6 +106,14 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(CNXK_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= CNXK_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_RSIZE(iq_no));
@@ -354,7 +362,7 @@ static const struct otx_ep_config default_cnxk_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index ced3a415a5..f72b8d25d7 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -256,6 +256,14 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(SDP_VF_R_IN_CTL_IS_64B);
+	else
+		reg_val |= SDP_VF_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + SDP_VF_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_RSIZE(iq_no));
@@ -500,7 +508,7 @@ static const struct otx_ep_config default_otx2_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index c150cbe619..90e059cad0 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -11,6 +11,7 @@
 
 #define OTX_EP_MAX_RINGS_PER_VF        (8)
 #define OTX_EP_CFG_IO_QUEUES        OTX_EP_MAX_RINGS_PER_VF
+#define OTX_EP_32BYTE_INSTR         (32)
 #define OTX_EP_64BYTE_INSTR         (64)
 /*
  * Backpressure for SDP is configured on Octeon, and the minimum queue sizes
@@ -215,6 +216,9 @@ struct otx_ep_instr_queue {
 	/* Number of  descriptors in this ring. */
 	uint32_t nb_desc;
 
+	/* Size of the descriptor. */
+	uint8_t desc_size;
+
 	/* Input ring index, where the driver should write the next packet */
 	uint32_t host_write_index;
 
diff --git a/drivers/net/octeon_ep/otx_ep_vf.c b/drivers/net/octeon_ep/otx_ep_vf.c
index 4f3538146b..236b7a874c 100644
--- a/drivers/net/octeon_ep/otx_ep_vf.c
+++ b/drivers/net/octeon_ep/otx_ep_vf.c
@@ -120,6 +120,14 @@ otx_ep_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 			return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (iq->desc_size == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(OTX_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= OTX_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + OTX_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	otx_ep_write64(iq->base_addr_dma, otx_ep->hw_addr,
 		       OTX_EP_R_IN_INSTR_BADDR(iq_no));
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 2/3] net/octeon_ep: clean up receive routine
  2023-10-12  6:23     ` [PATCH v4 0/3] rewrite " Vamsi Attunuru
  2023-10-12  6:23       ` [PATCH v4 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
@ 2023-10-12  6:23       ` Vamsi Attunuru
  2023-10-12  6:23       ` [PATCH v4 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
  2023-10-18  8:07       ` [PATCH v5 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-12  6:23 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Patch improves the Rx routine and the packet count update routines;
the packet count update routines need to drain in-flight ISM memory
updates while decrementing the packet count register.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/otx_ep_rxtx.c | 164 ++++++++++++----------------
 1 file changed, 70 insertions(+), 94 deletions(-)

diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index b37fc8109f..2654e13e18 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -442,7 +442,15 @@ otx_vf_update_read_index(struct otx_ep_instr_queue *iq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, iq->inst_cnt_reg);
-		*iq->inst_cnt_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (rte_atomic_load_explicit(iq->inst_cnt_ism, rte_memory_order_relaxed) >=
+		       val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
 		iq->inst_cnt_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
@@ -567,9 +575,7 @@ prepare_xmit_gather_list(struct otx_ep_instr_queue *iq, struct rte_mbuf *m, uint
 
 	finfo = &iq->req_list[iq->host_write_index].finfo;
 	*dptr = rte_mem_virt2iova(finfo->g.sg);
-	ih->s.tlen = pkt_len + ih->s.fsz;
-	ih->s.gsz = frags;
-	ih->s.gather = 1;
+	ih->u64 |= ((1ULL << 62) | ((uint64_t)frags << 48) | (pkt_len + ih->s.fsz));
 
 	while (frags--) {
 		finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
@@ -752,36 +758,26 @@ otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
-	struct otx_ep_droq_desc *desc_ring;
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
 	struct otx_ep_droq_info *info;
 	struct rte_mbuf *buf = NULL;
 	uint32_t desc_refilled = 0;
 
-	desc_ring = droq->desc_ring;
-
 	while (droq->refill_count && (desc_refilled < droq->nb_desc)) {
-		/* If a valid buffer exists (happens if there is no dispatch),
-		 * reuse the buffer, else allocate.
-		 */
-		if (droq->recv_buf_list[droq->refill_idx] != NULL)
-			break;
-
 		buf = rte_pktmbuf_alloc(droq->mpool);
 		/* If a buffer could not be allocated, no point in
 		 * continuing
 		 */
-		if (buf == NULL) {
+		if (unlikely(!buf)) {
 			droq->stats.rx_alloc_failure++;
 			break;
 		}
 		info = rte_pktmbuf_mtod(buf, struct otx_ep_droq_info *);
-		memset(info, 0, sizeof(*info));
+		info->length = 0;
 
 		droq->recv_buf_list[droq->refill_idx] = buf;
 		desc_ring[droq->refill_idx].buffer_ptr =
 					rte_mbuf_data_iova_default(buf);
-
-
 		droq->refill_idx = otx_ep_incr_index(droq->refill_idx, 1,
 				droq->nb_desc);
 
@@ -793,21 +789,18 @@ otx_ep_droq_refill(struct otx_ep_droq *droq)
 }
 
 static struct rte_mbuf *
-otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
-			struct otx_ep_droq *droq, int next_fetch)
+otx_ep_droq_read_packet(struct otx_ep_device *otx_ep, struct otx_ep_droq *droq, int next_fetch)
 {
 	volatile struct otx_ep_droq_info *info;
-	struct rte_mbuf *droq_pkt2 = NULL;
-	struct rte_mbuf *droq_pkt = NULL;
-	struct rte_net_hdr_lens hdr_lens;
-	struct otx_ep_droq_info *info2;
+	struct rte_mbuf *mbuf_next = NULL;
+	struct rte_mbuf *mbuf = NULL;
 	uint64_t total_pkt_len;
 	uint32_t pkt_len = 0;
 	int next_idx;
 
-	droq_pkt  = droq->recv_buf_list[droq->read_idx];
-	droq_pkt2  = droq->recv_buf_list[droq->read_idx];
-	info = rte_pktmbuf_mtod(droq_pkt, struct otx_ep_droq_info *);
+	mbuf = droq->recv_buf_list[droq->read_idx];
+	info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
 	/* make sure info is available */
 	rte_rmb();
 	if (unlikely(!info->length)) {
@@ -828,32 +821,25 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 			assert(0);
 		}
 	}
+
 	if (next_fetch) {
 		next_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
-		droq_pkt2  = droq->recv_buf_list[next_idx];
-		info2 = rte_pktmbuf_mtod(droq_pkt2, struct otx_ep_droq_info *);
-		rte_prefetch_non_temporal((const void *)info2);
+		mbuf_next = droq->recv_buf_list[next_idx];
+		rte_prefetch0(rte_pktmbuf_mtod(mbuf_next, void *));
 	}
 
-	info->length = rte_bswap64(info->length);
+	info->length = rte_bswap16(info->length >> 48);
 	/* Deduce the actual data size */
 	total_pkt_len = info->length + OTX_EP_INFO_SIZE;
 	if (total_pkt_len <= droq->buffer_size) {
-		droq_pkt  = droq->recv_buf_list[droq->read_idx];
-		if (likely(droq_pkt != NULL)) {
-			droq_pkt->data_off += OTX_EP_INFO_SIZE;
-			/* otx_ep_dbg("OQ: pkt_len[%ld], buffer_size %d\n",
-			 * (long)info->length, droq->buffer_size);
-			 */
-			pkt_len = (uint32_t)info->length;
-			droq_pkt->pkt_len  = pkt_len;
-			droq_pkt->data_len  = pkt_len;
-			droq_pkt->port = otx_ep->port_id;
-			droq->recv_buf_list[droq->read_idx] = NULL;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
-			droq->refill_count++;
-		}
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		pkt_len = (uint32_t)info->length;
+		mbuf->pkt_len  = pkt_len;
+		mbuf->data_len  = pkt_len;
+		mbuf->port = otx_ep->port_id;
+		droq->recv_buf_list[droq->read_idx] = NULL;
+		droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
+		droq->refill_count++;
 	} else {
 		struct rte_mbuf *first_buf = NULL;
 		struct rte_mbuf *last_buf = NULL;
@@ -865,61 +851,50 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 		while (pkt_len < total_pkt_len) {
 			int cpy_len = 0;
 
-			cpy_len = ((pkt_len + droq->buffer_size) >
-					total_pkt_len)
-					? ((uint32_t)total_pkt_len -
-						pkt_len)
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len)
 					: droq->buffer_size;
 
-			droq_pkt = droq->recv_buf_list[droq->read_idx];
+			mbuf = droq->recv_buf_list[droq->read_idx];
 			droq->recv_buf_list[droq->read_idx] = NULL;
 
-			if (likely(droq_pkt != NULL)) {
+			if (likely(mbuf)) {
 				/* Note the first seg */
 				if (!pkt_len)
-					first_buf = droq_pkt;
+					first_buf = mbuf;
 
-				droq_pkt->port = otx_ep->port_id;
+				mbuf->port = otx_ep->port_id;
 				if (!pkt_len) {
-					droq_pkt->data_off +=
-						OTX_EP_INFO_SIZE;
-					droq_pkt->pkt_len =
-						cpy_len - OTX_EP_INFO_SIZE;
-					droq_pkt->data_len =
-						cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_off += OTX_EP_INFO_SIZE;
+					mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
 				} else {
-					droq_pkt->pkt_len = cpy_len;
-					droq_pkt->data_len = cpy_len;
+					mbuf->pkt_len = cpy_len;
+					mbuf->data_len = cpy_len;
 				}
 
 				if (pkt_len) {
 					first_buf->nb_segs++;
-					first_buf->pkt_len += droq_pkt->pkt_len;
+					first_buf->pkt_len += mbuf->pkt_len;
 				}
 
 				if (last_buf)
-					last_buf->next = droq_pkt;
+					last_buf->next = mbuf;
 
-				last_buf = droq_pkt;
+				last_buf = mbuf;
 			} else {
 				otx_ep_err("no buf\n");
 				assert(0);
 			}
 
 			pkt_len += cpy_len;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
 			droq->refill_count++;
 		}
-		droq_pkt = first_buf;
+		mbuf = first_buf;
 	}
-	droq_pkt->packet_type = rte_net_get_ptype(droq_pkt, &hdr_lens,
-					RTE_PTYPE_ALL_MASK);
-	droq_pkt->l2_len = hdr_lens.l2_len;
-	droq_pkt->l3_len = hdr_lens.l3_len;
-	droq_pkt->l4_len = hdr_lens.l4_len;
 
-	return droq_pkt;
+	return mbuf;
 }
 
 static inline uint32_t
@@ -943,7 +918,15 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, droq->pkts_sent_reg);
-		*droq->pkts_sent_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (rte_atomic_load_explicit(droq->pkts_sent_ism, rte_memory_order_relaxed) >=
+		       val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
 		droq->pkts_sent_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
@@ -952,36 +935,30 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 	return new_pkts;
 }
 
+static inline int32_t __rte_hot
+otx_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (unlikely(droq->pkts_pending < nb_pkts))
+		otx_ep_check_droq_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
 /* Check for response arrival from OCTEON 9
  * returns number of requests completed
  */
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget)
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	struct otx_ep_droq *droq = rx_queue;
 	struct otx_ep_device *otx_ep;
 	struct rte_mbuf *oq_pkt;
-
-	uint32_t pkts = 0;
+	uint16_t pkts, new_pkts;
 	uint32_t valid_pkts = 0;
-	uint32_t new_pkts = 0;
 	int next_fetch;
 
 	otx_ep = droq->otx_ep_dev;
-
-	if (droq->pkts_pending > budget) {
-		new_pkts = budget;
-	} else {
-		new_pkts = droq->pkts_pending;
-		new_pkts += otx_ep_check_droq_pkts(droq);
-		if (new_pkts > budget)
-			new_pkts = budget;
-	}
-
-	if (!new_pkts)
-		goto update_credit; /* No pkts at this moment */
+	new_pkts = otx_ep_rx_pkts_to_process(droq, nb_pkts);
 
 	for (pkts = 0; pkts < new_pkts; pkts++) {
 		/* Push the received pkt to application */
@@ -1006,7 +983,6 @@ otx_ep_recv_pkts(void *rx_queue,
 	droq->pkts_pending -= pkts;
 
 	/* Refill DROQ buffers */
-update_credit:
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		int desc_refilled = otx_ep_droq_refill(droq);
 
@@ -1014,7 +990,7 @@ otx_ep_recv_pkts(void *rx_queue,
 		 * that when we update the credits the data in memory is
 		 * accurate.
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		rte_write32(desc_refilled, droq->pkts_credit_reg);
 	} else {
 		/*
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v4 3/3] net/octeon_ep: add new fastpath routines
  2023-10-12  6:23     ` [PATCH v4 0/3] rewrite " Vamsi Attunuru
  2023-10-12  6:23       ` [PATCH v4 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
  2023-10-12  6:23       ` [PATCH v4 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
@ 2023-10-12  6:23       ` Vamsi Attunuru
  2023-10-18  3:48         ` Jerin Jacob
  2023-10-18  8:07       ` [PATCH v5 0/3] rewrite " Vamsi Attunuru
  3 siblings, 1 reply; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-12  6:23 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Adds new fastpath routines for cn10k & cn9k endpoint
devices and assigns the fastpath routines based on
the offload flags.

Patch also adds miscellaneous changes to improve performance
and code readability.
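
For reference, the offload-based Rx burst selection added in
otx_ep_ethdev.c (full diff below) reduces to roughly the following
sketch; names are taken from the patch, only the condensed form is
illustrative:

	if (otx_epvf->chip_gen == OTX_EP_CN10XX) {
		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
	} else if (otx_epvf->chip_gen == OTX_EP_CN9XX) {
		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
	} else {
		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
	}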

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx.c    | 310 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 210 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |   2 +
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |   1 +
 drivers/net/octeon_ep/otx_ep_common.h | 125 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   |  93 +-------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 10 files changed, 706 insertions(+), 157 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
new file mode 100644
index 0000000000..22bf3ce7a7
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -0,0 +1,310 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "otx_ep_common.h"
+#include "otx2_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static inline int
+cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
+{
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t refill_idx = droq->refill_idx;
+	struct rte_mbuf *buf;
+	uint32_t i;
+	int rc;
+
+	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	if (unlikely(rc)) {
+		droq->stats.rx_alloc_failure++;
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		buf = recv_buf_list[refill_idx];
+		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
+		refill_idx++;
+	}
+
+	droq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc);
+	droq->refill_count -= count;
+
+	return 0;
+}
+
+static inline void
+cnxk_ep_rx_refill(struct otx_ep_droq *droq)
+{
+	uint32_t desc_refilled = 0, count;
+	uint32_t nb_desc = droq->nb_desc;
+	uint32_t refill_idx = droq->refill_idx;
+	int rc;
+
+	if (unlikely(droq->read_idx == refill_idx))
+		return;
+
+	if (refill_idx < droq->read_idx) {
+		count = droq->read_idx - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled = count;
+	} else {
+		count = nb_desc - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+
+		desc_refilled = count;
+		count = droq->read_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled += count;
+	}
+
+	/* Flush the droq descriptor data to memory to be sure
+	 * that when we update the credits the data in memory is
+	 * accurate.
+	 */
+	rte_io_wmb();
+	rte_write32(desc_refilled, droq->pkts_credit_reg);
+}
+
+static inline uint32_t
+cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq)
+{
+	uint32_t new_pkts;
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = rte_atomic_load_explicit(droq->pkts_sent_ism, rte_memory_order_relaxed);
+	new_pkts = val - droq->pkts_sent_ism_prev;
+	droq->pkts_sent_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, droq->pkts_sent_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (rte_atomic_load_explicit(droq->pkts_sent_ism, rte_memory_order_relaxed) >=
+		       val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
+		droq->pkts_sent_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+	droq->pkts_pending += new_pkts;
+
+	return new_pkts;
+}
+
+static inline int16_t __rte_hot
+cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (droq->pkts_pending < nb_pkts)
+		cnxk_ep_check_rx_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *mbuf;
+		uint16_t pkt_len;
+
+		mbuf = recv_buf_list[read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
+		pkt_len = rte_bswap16(info->length >> 48);
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		mbuf->pkt_len = pkt_len;
+		mbuf->data_len = pkt_len;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+	droq->read_idx = read_idx;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar_mseg(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+				 uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t total_pkt_len, bytes_rsvd = 0;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *first_buf = NULL;
+		struct rte_mbuf *last_buf = NULL;
+		struct rte_mbuf *mbuf;
+		uint32_t pkt_len = 0;
+
+		mbuf = recv_buf_list[droq->read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
+		total_pkt_len = rte_bswap16(info->length >> 48) + OTX_EP_INFO_SIZE;
+
+		while (pkt_len < total_pkt_len) {
+			int cpy_len;
+
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len) : droq->buffer_size;
+
+			mbuf = droq->recv_buf_list[droq->read_idx];
+
+			if (!pkt_len) {
+				/* Note the first seg */
+				first_buf = mbuf;
+				mbuf->data_off += OTX_EP_INFO_SIZE;
+				mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+				mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
+			} else {
+				mbuf->pkt_len = cpy_len;
+				mbuf->data_len = cpy_len;
+				first_buf->nb_segs++;
+				first_buf->pkt_len += mbuf->pkt_len;
+			}
+
+			if (last_buf)
+				last_buf->next = mbuf;
+
+			last_buf = mbuf;
+
+			pkt_len += cpy_len;
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, nb_desc);
+			droq->refill_count++;
+		}
+		mbuf = first_buf;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= pkts;
+	/* Stats */
+	droq->stats.pkts_received += pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_tx.c b/drivers/net/octeon_ep/cnxk_ep_tx.c
new file mode 100644
index 0000000000..86f771ca7e
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_tx.c
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static uint32_t
+cnxk_vf_update_read_index(struct otx_ep_instr_queue *iq)
+{
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = rte_atomic_load_explicit(iq->inst_cnt_ism, rte_memory_order_relaxed);
+	iq->inst_cnt += val - iq->inst_cnt_ism_prev;
+	iq->inst_cnt_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, iq->inst_cnt_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (rte_atomic_load_explicit(iq->inst_cnt_ism, rte_memory_order_relaxed) >=
+		       val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
+		iq->inst_cnt_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+
+	/* Modulo of the new index with the IQ size will give us
+	 * the new index.
+	 */
+	return iq->inst_cnt & (iq->nb_desc - 1);
+}
+
+static inline void
+cnxk_ep_flush_iq(struct otx_ep_instr_queue *iq)
+{
+	uint32_t instr_processed = 0;
+	uint32_t cnt = 0;
+
+	iq->otx_read_index = cnxk_vf_update_read_index(iq);
+
+	if (unlikely(iq->flush_index == iq->otx_read_index))
+		return;
+
+	if (iq->flush_index < iq->otx_read_index) {
+		instr_processed = iq->otx_read_index - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+	} else {
+		cnt = iq->nb_desc - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], cnt);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, cnt, iq->nb_desc);
+
+		instr_processed = iq->otx_read_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+
+		instr_processed += cnt;
+	}
+
+	iq->stats.instr_processed = instr_processed;
+	iq->instr_pending -= instr_processed;
+}
+
+static inline void
+set_sg_size(struct otx_ep_sg_entry *sg_entry, uint16_t size, uint32_t pos)
+{
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+	sg_entry->u.size[pos] = size;
+#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	sg_entry->u.size[(OTX_EP_NUM_SG_PTRS - 1) - pos] = size;
+#endif
+}
+
+static __rte_always_inline void
+cnxk_ep_xmit_pkts_scalar(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq, uint16_t nb_pkts)
+{
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len;
+	uint32_t tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		m = tx_pkts[pkts];
+		iq->mbuf_list[write_idx] = m;
+		pkt_len = rte_pktmbuf_data_len(m);
+
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->ih.u64 = iq->partial_ih | pkt_len;
+		iqcmd->dptr = rte_mbuf_data_iova(m); /*dptr*/
+		tx_bytes += pkt_len;
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+	}
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+}
+
+static __rte_always_inline uint16_t
+cnxk_ep_xmit_pkts_scalar_mseg(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq,
+			      uint16_t nb_pkts)
+{
+	uint16_t frags, num_sg, mask = OTX_EP_NUM_SG_PTRS - 1;
+	struct otx_ep_buf_free_info *finfo;
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len, tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		uint16_t j = 0;
+
+		m = tx_pkts[pkts];
+		frags = m->nb_segs;
+
+		pkt_len = rte_pktmbuf_pkt_len(m);
+		num_sg = (frags + mask) / OTX_EP_NUM_SG_PTRS;
+
+		if (unlikely(pkt_len > OTX_EP_MAX_PKT_SZ && num_sg > OTX_EP_MAX_SG_LISTS)) {
+			otx_ep_err("Failed to xmit the pkt, pkt_len is higher or pkt has more segments\n");
+			goto exit;
+		}
+
+		finfo = &iq->req_list[write_idx].finfo;
+
+		iq->mbuf_list[write_idx] = m;
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->dptr = rte_mem_virt2iova(finfo->g.sg);
+		iqcmd->ih.u64 = iq->partial_ih | (1ULL << 62) | ((uint64_t)frags << 48) | pkt_len;
+
+		while (frags--) {
+			finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
+			set_sg_size(&finfo->g.sg[(j >> 2)], m->data_len, (j & mask));
+			j++;
+			m = m->next;
+		}
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+		tx_bytes += pkt_len;
+	}
+exit:
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	cnxk_ep_xmit_pkts_scalar(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	pkts = cnxk_ep_xmit_pkts_scalar_mseg(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 7b3669fe0c..ef275703c3 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -156,6 +156,8 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (void *)iq->inst_cnt_ism, ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
+
 	return 0;
 }
 
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.h b/drivers/net/octeon_ep/cnxk_ep_vf.h
index 86277449ea..41d8fbbb3a 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.h
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.h
@@ -6,6 +6,8 @@
 
 #include <rte_io.h>
 
+#include "otx_ep_common.h"
+
 #define CNXK_CONFIG_XPANSION_BAR             0x38
 #define CNXK_CONFIG_PCIE_CAP                 0x70
 #define CNXK_CONFIG_PCIE_DEVCAP              0x74
@@ -178,6 +180,17 @@ struct cnxk_ep_instr_64B {
 	uint64_t exhdr[4];
 };
 
+struct cnxk_ep_instr_32B {
+	/* Pointer where the input data is available. */
+	uint64_t dptr;
+
+	/* OTX_EP Instruction Header. */
+	union otx_ep_instr_ih ih;
+
+	/* Misc data bytes that can be passed as front data */
+	uint64_t rsvd[2];
+};
+
 #define CNXK_EP_IQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue) + 4)
 #define CNXK_EP_OQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue))
 #define CNXK_EP_ISM_EN                  (0x1)
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e698bf9792..749776d70c 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -9,4 +9,6 @@ sources = files(
         'otx2_ep_vf.c',
         'cnxk_ep_vf.c',
         'otx_ep_mbox.c',
+        'cnxk_ep_rx.c',
+        'cnxk_ep_tx.c',
 )
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index f72b8d25d7..7f4edf8dcf 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -307,6 +307,7 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (unsigned int)ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
 
 	return 0;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index 90e059cad0..82e57520d3 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -4,7 +4,20 @@
 #ifndef _OTX_EP_COMMON_H_
 #define _OTX_EP_COMMON_H_
 
+#include <rte_bitops.h>
 #include <rte_spinlock.h>
+#include <unistd.h>
+#include <assert.h>
+#include <rte_eal.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_io.h>
+#include <rte_net.h>
+#include <ethdev_pci.h>
+
+#define OTX_EP_CN8XX  RTE_BIT32(0)
+#define OTX_EP_CN9XX  RTE_BIT32(1)
+#define OTX_EP_CN10XX RTE_BIT32(2)
 
 #define OTX_EP_NW_PKT_OP               0x1220
 #define OTX_EP_NW_CMD_OP               0x1221
@@ -38,7 +51,7 @@
 #define OTX_EP_NORESP_OHSM_SEND     (4)
 #define OTX_EP_NORESP_LAST          (4)
 #define OTX_EP_PCI_RING_ALIGN   65536
-#define OTX_EP_MAX_SG_LISTS 4
+#define OTX_EP_MAX_SG_LISTS 6
 #define OTX_EP_NUM_SG_PTRS 4
 #define SDP_PKIND 40
 #define SDP_OTX2_PKIND 57
@@ -203,6 +216,38 @@ struct otx_ep_iq_config {
  *  such structure to represent it.
  */
 struct otx_ep_instr_queue {
+	/* Location in memory updated by SDP ISM */
+	uint32_t *inst_cnt_ism;
+	struct rte_mbuf **mbuf_list;
+	/* Pointer to the Virtual Base addr of the input ring. */
+	uint8_t *base_addr;
+
+	/* track inst count locally to consolidate HW counter updates */
+	uint32_t inst_cnt_ism_prev;
+
+	/* Input ring index, where the driver should write the next packet */
+	uint32_t host_write_index;
+
+	/* Input ring index, where the OCTEON 9 should read the next packet */
+	uint32_t otx_read_index;
+	/** This index aids in finding the window in the queue where OCTEON 9
+	 *  has read the commands.
+	 */
+	uint32_t flush_index;
+	/* This keeps track of the instructions pending in this queue. */
+	uint64_t instr_pending;
+
+	/* Memory zone */
+	const struct rte_memzone *iq_mz;
+	/* OTX_EP doorbell register for the ring. */
+	void *doorbell_reg;
+
+	/* OTX_EP instruction count register for this ring. */
+	void *inst_cnt_reg;
+
+	/* Number of instructions pending to be posted to OCTEON 9. */
+	uint32_t fill_cnt;
+
 	struct otx_ep_device *otx_ep_dev;
 
 	uint32_t q_no;
@@ -219,54 +264,21 @@ struct otx_ep_instr_queue {
 	/* Size of the descriptor. */
 	uint8_t desc_size;
 
-	/* Input ring index, where the driver should write the next packet */
-	uint32_t host_write_index;
-
-	/* Input ring index, where the OCTEON 9 should read the next packet */
-	uint32_t otx_read_index;
-
 	uint32_t reset_instr_cnt;
 
-	/** This index aids in finding the window in the queue where OCTEON 9
-	 *  has read the commands.
-	 */
-	uint32_t flush_index;
-
 	/* Free-running/wrapping instruction counter for IQ. */
 	uint32_t inst_cnt;
 
-	/* This keeps track of the instructions pending in this queue. */
-	uint64_t instr_pending;
-
-	/* Pointer to the Virtual Base addr of the input ring. */
-	uint8_t *base_addr;
+	uint64_t partial_ih;
 
 	/* This IQ request list */
 	struct otx_ep_instr_list *req_list;
 
-	/* OTX_EP doorbell register for the ring. */
-	void *doorbell_reg;
-
-	/* OTX_EP instruction count register for this ring. */
-	void *inst_cnt_reg;
-
-	/* Number of instructions pending to be posted to OCTEON 9. */
-	uint32_t fill_cnt;
-
 	/* Statistics for this input queue. */
 	struct otx_ep_iq_stats stats;
 
 	/* DMA mapped base address of the input descriptor ring. */
 	uint64_t base_addr_dma;
-
-	/* Memory zone */
-	const struct rte_memzone *iq_mz;
-
-	/* Location in memory updated by SDP ISM */
-	uint32_t *inst_cnt_ism;
-
-	/* track inst count locally to consolidate HW counter updates */
-	uint32_t inst_cnt_ism_prev;
 };
 
 /** Descriptor format.
@@ -344,14 +356,17 @@ struct otx_ep_oq_config {
 
 /* The Descriptor Ring Output Queue(DROQ) structure. */
 struct otx_ep_droq {
-	struct otx_ep_device *otx_ep_dev;
 	/* The 8B aligned descriptor ring starts at this address. */
 	struct otx_ep_droq_desc *desc_ring;
 
-	uint32_t q_no;
-	uint64_t last_pkt_count;
+	/* The 8B aligned info ptrs begin from this address. */
+	struct otx_ep_droq_info *info_list;
 
-	struct rte_mempool *mpool;
+	/* receive buffer list contains mbuf ptr list */
+	struct rte_mbuf **recv_buf_list;
+
+	/* Packets pending to be processed */
+	uint64_t pkts_pending;
 
 	/* Driver should read the next packet at this index */
 	uint32_t read_idx;
@@ -362,22 +377,17 @@ struct otx_ep_droq {
 	/* At this index, the driver will refill the descriptor's buffer */
 	uint32_t refill_idx;
 
-	/* Packets pending to be processed */
-	uint64_t pkts_pending;
+	/* The number of descriptors pending to refill. */
+	uint32_t refill_count;
 
 	/* Number of descriptors in this ring. */
 	uint32_t nb_desc;
 
-	/* The number of descriptors pending to refill. */
-	uint32_t refill_count;
-
 	uint32_t refill_threshold;
 
-	/* The 8B aligned info ptrs begin from this address. */
-	struct otx_ep_droq_info *info_list;
+	uint64_t last_pkt_count;
 
-	/* receive buffer list contains mbuf ptr list */
-	struct rte_mbuf **recv_buf_list;
+	struct rte_mempool *mpool;
 
 	/* The size of each buffer pointed by the buffer pointer. */
 	uint32_t buffer_size;
@@ -392,6 +402,13 @@ struct otx_ep_droq {
 	 */
 	void *pkts_sent_reg;
 
+	/* Pointer to host memory copy of output packet count, set by ISM */
+	uint32_t *pkts_sent_ism;
+	uint32_t pkts_sent_ism_prev;
+
+	/* Statistics for this DROQ. */
+	struct otx_ep_droq_stats stats;
+
 	/** Handle DMA incompletion during pkt reads.
 	 * This variable is used to initiate a sent_reg_read
 	 * that completes pending dma
@@ -400,8 +417,9 @@ struct otx_ep_droq {
 	 */
 	uint32_t sent_reg_val;
 
-	/* Statistics for this DROQ. */
-	struct otx_ep_droq_stats stats;
+	uint32_t q_no;
+
+	struct otx_ep_device *otx_ep_dev;
 
 	/* DMA mapped address of the DROQ descriptor ring. */
 	size_t desc_ring_dma;
@@ -419,10 +437,6 @@ struct otx_ep_droq {
 	const struct rte_memzone *desc_ring_mz;
 
 	const struct rte_memzone *info_mz;
-
-	/* Pointer to host memory copy of output packet count, set by ISM */
-	uint32_t *pkts_sent_ism;
-	uint32_t pkts_sent_ism_prev;
 };
 #define OTX_EP_DROQ_SIZE		(sizeof(struct otx_ep_droq))
 
@@ -545,6 +559,9 @@ struct otx_ep_device {
 
 	/* Negotiated Mbox version */
 	uint32_t mbox_neg_ver;
+
+	/* Generation */
+	uint32_t chip_gen;
 };
 
 int otx_ep_setup_iqs(struct otx_ep_device *otx_ep, uint32_t iq_no,
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 57b965ad06..e965cbaa16 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -27,6 +27,46 @@ static const struct rte_eth_desc_lim otx_ep_tx_desc_lim = {
 	.nb_align	= OTX_EP_TXD_ALIGN,
 };
 
+static void
+otx_ep_set_tx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX || otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
+		if (otx_epvf->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS)
+			eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts_mseg;
+	} else {
+		eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].tx_pkt_burst =
+			eth_dev->tx_pkt_burst;
+}
+
+static void
+otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX) {
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
+	} else if (otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
+	} else {
+		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].rx_pkt_burst =
+			eth_dev->rx_pkt_burst;
+}
+
 static int
 otx_ep_dev_info_get(struct rte_eth_dev *eth_dev,
 		    struct rte_eth_dev_info *devinfo)
@@ -154,6 +194,10 @@ otx_ep_dev_start(struct rte_eth_dev *eth_dev)
 	}
 
 	otx_ep_dev_link_update(eth_dev, 0);
+
+	otx_ep_set_tx_func(eth_dev);
+	otx_ep_set_rx_func(eth_dev);
+
 	otx_ep_info("dev started\n");
 
 	return 0;
@@ -255,18 +299,23 @@ otx_epdev_init(struct otx_ep_device *otx_epvf)
 
 	otx_epvf->fn_list.setup_device_regs(otx_epvf);
 
+	otx_epvf->eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
 	otx_epvf->eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF)
+	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF) {
 		otx_epvf->eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
-	else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
+		otx_epvf->chip_gen = OTX_EP_CN8XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CN98XX_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CNF95N_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
-		otx_epvf->eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN9XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN10XX;
 	} else {
 		otx_ep_err("Invalid chip_id\n");
 		ret = -EINVAL;
@@ -656,8 +705,8 @@ otx_ep_eth_dev_init(struct rte_eth_dev *eth_dev)
 	/* Single process support */
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
 		eth_dev->dev_ops = &otx_ep_eth_dev_ops;
-		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-		eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		otx_ep_set_tx_func(eth_dev);
+		otx_ep_set_rx_func(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index 2654e13e18..f53f0578ef 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -13,15 +13,8 @@
 
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
-#include "otx2_ep_vf.h"
 #include "otx_ep_rxtx.h"
 
-/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
-#define OTX_EP_INFO_SIZE 8
-#define OTX_EP_FSZ_FS0 0
-#define DROQ_REFILL_THRESHOLD 16
-#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
-
 static void
 otx_ep_dmazone_free(const struct rte_memzone *mz)
 {
@@ -144,6 +137,13 @@ otx_ep_init_instr_queue(struct otx_ep_device *otx_ep, int iq_no, int num_descs,
 		     iq_no, iq->base_addr, (unsigned long)iq->base_addr_dma,
 		     iq->nb_desc);
 
+	iq->mbuf_list = rte_zmalloc_socket("mbuf_list",	(iq->nb_desc * sizeof(struct rte_mbuf *)),
+					   RTE_CACHE_LINE_SIZE, rte_socket_id());
+	if (!iq->mbuf_list) {
+		otx_ep_err("IQ[%d] mbuf_list alloc failed\n", iq_no);
+		goto iq_init_fail;
+	}
+
 	iq->otx_ep_dev = otx_ep;
 	iq->q_no = iq_no;
 	iq->fill_cnt = 0;
@@ -676,85 +676,6 @@ otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 	return count;
 }
 
-/* Enqueue requests/packets to OTX_EP IQ queue.
- * returns number of requests enqueued successfully
- */
-uint16_t
-otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
-{
-	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
-	struct otx_ep_device *otx_ep = iq->otx_ep_dev;
-	struct otx2_ep_instr_64B iqcmd2;
-	uint32_t iqreq_type;
-	struct rte_mbuf *m;
-	uint32_t pkt_len;
-	int count = 0;
-	uint16_t i;
-	int dbell;
-	int index;
-
-	iqcmd2.ih.u64 = 0;
-	iqcmd2.irh.u64 = 0;
-
-	/* ih invars */
-	iqcmd2.ih.s.fsz = OTX_EP_FSZ_FS0;
-	iqcmd2.ih.s.pkind = otx_ep->pkind; /* The SDK decided PKIND value */
-	/* irh invars */
-	iqcmd2.irh.s.opcode = OTX_EP_NW_PKT_OP;
-
-	for (i = 0; i < nb_pkts; i++) {
-		m = pkts[i];
-		if (m->nb_segs == 1) {
-			pkt_len = rte_pktmbuf_data_len(m);
-			iqcmd2.ih.s.tlen = pkt_len + iqcmd2.ih.s.fsz;
-			iqcmd2.dptr = rte_mbuf_data_iova(m); /*dptr*/
-			iqcmd2.ih.s.gather = 0;
-			iqcmd2.ih.s.gsz = 0;
-			iqreq_type = OTX_EP_REQTYPE_NORESP_NET;
-		} else {
-			if (!(otx_ep->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS))
-				goto xmit_fail;
-
-			if (unlikely(prepare_xmit_gather_list(iq, m, &iqcmd2.dptr, &iqcmd2.ih) < 0))
-				goto xmit_fail;
-
-			pkt_len = rte_pktmbuf_pkt_len(m);
-			iqreq_type = OTX_EP_REQTYPE_NORESP_GATHER;
-		}
-
-		iqcmd2.irh.u64 = rte_bswap64(iqcmd2.irh.u64);
-
-#ifdef OTX_EP_IO_DEBUG
-		otx_ep_dbg("After swapping\n");
-		otx_ep_dbg("Word0 [dptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.dptr);
-		otx_ep_dbg("Word1 [ihtx]: 0x%016lx\n", (unsigned long)iqcmd.ih);
-		otx_ep_dbg("Word2 [pki_ih3]: 0x%016lx\n",
-			   (unsigned long)iqcmd.pki_ih3);
-		otx_ep_dbg("Word3 [rptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.rptr);
-		otx_ep_dbg("Word4 [irh]: 0x%016lx\n", (unsigned long)iqcmd.irh);
-		otx_ep_dbg("Word5 [exhdr[0]]: 0x%016lx\n",
-			   (unsigned long)iqcmd.exhdr[0]);
-#endif
-		index = iq->host_write_index;
-		dbell = (i == (unsigned int)(nb_pkts - 1)) ? 1 : 0;
-		if (otx_ep_send_data(otx_ep, iq, &iqcmd2, dbell))
-			goto xmit_fail;
-		otx_ep_iqreq_add(iq, m, iqreq_type, index);
-		iq->stats.tx_pkts++;
-		iq->stats.tx_bytes += pkt_len;
-		count++;
-	}
-
-xmit_fail:
-	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
-		otx_ep_flush_iq(iq);
-
-	/* Return no# of instructions posted successfully. */
-	return count;
-}
-
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 3f12527004..cb68ef3b41 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -7,29 +7,53 @@
 
 #include <rte_byteorder.h>
 
-#define OTX_EP_RXD_ALIGN 2
-#define OTX_EP_TXD_ALIGN 2
+#define OTX_EP_RXD_ALIGN 8
+#define OTX_EP_TXD_ALIGN 8
 
 #define OTX_EP_IQ_SEND_FAILED      (-1)
 #define OTX_EP_IQ_SEND_SUCCESS     (0)
 
-#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10000
+#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10
 
 #define OTX_EP_FSZ 28
 #define OTX2_EP_FSZ 24
-#define OTX_EP_MAX_INSTR 16
+#define OTX_EP_MAX_INSTR 128
+
+/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
+#define OTX_EP_INFO_SIZE 8
+#define DROQ_REFILL_THRESHOLD 16
+#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
 
 static inline uint32_t
 otx_ep_incr_index(uint32_t index, uint32_t count, uint32_t max)
 {
 	return ((index + count) & (max - 1));
 }
+
 uint16_t
 otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
 uint16_t
 otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget);
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v4 3/3] net/octeon_ep: add new fastpath routines
  2023-10-12  6:23       ` [PATCH v4 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
@ 2023-10-18  3:48         ` Jerin Jacob
  0 siblings, 0 replies; 26+ messages in thread
From: Jerin Jacob @ 2023-10-18  3:48 UTC (permalink / raw)
  To: Vamsi Attunuru; +Cc: dev, jerinj, sthotton

On Thu, Oct 12, 2023 at 4:41 PM Vamsi Attunuru <vattunuru@marvell.com> wrote:
>
> Adds new fastpath routines for cn10k & cn9k endpoint
> devices and assigns the fastpath routines based on
> the offload flags.
>
> Patch also adds misc changes to improve performance
> and code-readability.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>

Could you rebase to next-net-mrvl? There is a build issue, either due to
the latest clang or new changes in main.

[for-next-net][dpdk-next-net-mrvl] $ clang -v
clang version 16.0.6
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-pc-linux-gnu/13.2.1
Found candidate GCC installation:
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/13.2.1
Selected GCC installation: /usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/13.2.1
Candidate multilib: .;@m64
Candidate multilib: 32;@m32
Selected multilib: .;@m64

ccache clang -Idrivers/libtmp_rte_net_octeon_ep.a.p -Idrivers
-I../drivers -Idrivers/net/octeon_ep -I../drivers/net/octeon_ep
-Ilib/ethdev -I../lib/ethdev -I. -I.. -Iconfig -I../config
-Ilib/eal/include -I../lib/eal/include -Ilib/eal/linux/
include -I../lib/eal/linux/include -Ilib/eal/x86/include
-I../lib/eal/x86/include -Ilib/eal/common -I../lib/eal/common
-Ilib/eal -I../lib/eal -Ilib/kvargs -I../lib/kvargs -Ilib/log
-I../lib/log -Ilib/metrics -I../lib/metrics -Ilib/telemetry
 -I../lib/telemetry -Ilib/net -I../lib/net -Ilib/mbuf -I../lib/mbuf
-Ilib/mempool -I../lib/mempool -Ilib/ring -I../lib/ring -Ilib/meter
-I../lib/meter -Idrivers/bus/pci -I../drivers/bus/pci
-I../drivers/bus/pci/linux -Ilib/pci -I../lib/pci
-Idrivers/bus/vdev -I../drivers/bus/vdev -fcolor-diagnostics
-D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -Werror -std=c11
-O2 -g -include rte_config.h -Wcast-qual -Wdeprecated -Wformat
-Wformat-nonliteral -Wformat-security -Wmissing-
declarations -Wmissing-prototypes -Wnested-externs
-Wold-style-definition -Wpointer-arith -Wsign-compare
-Wstrict-prototypes -Wundef -Wwrite-strings
-Wno-address-of-packed-member -Wno-missing-field-initializers
-D_GNU_SOURCE -fPIC -march=na
tive -mrtm -DALLOW_EXPERIMENTAL_API -DALLOW_INTERNAL_API
-DRTE_LOG_DEFAULT_LOGTYPE=pmd.net.octeon_ep -DRTE_ANNOTATE_LOCKS
-Wthread-safety -MD -MQ
drivers/libtmp_rte_net_octeon_ep.a.p/net_octeon_ep_otx_ep_rxtx.c.o -MF
drivers/libtmp_rte_net_
octeon_ep.a.p/net_octeon_ep_otx_ep_rxtx.c.o.d -o
drivers/libtmp_rte_net_octeon_ep.a.p/net_octeon_ep_otx_ep_rxtx.c.o -c
../drivers/net/octeon_ep/otx_ep_rxtx.c
../drivers/net/octeon_ep/otx_ep_rxtx.c:448:10: error: address argument
to atomic operation must be a pointer to _Atomic type ('uint32_t *'
(aka 'unsigned int *') invalid)
                while (rte_atomic_load_explicit(iq->inst_cnt_ism,
rte_memory_order_relaxed) >=
                       ^                        ~~~~~~~~~~~~~~~~
../lib/eal/include/rte_stdatomic.h:71:2: note: expanded from macro
'rte_atomic_load_explicit'
        atomic_load_explicit(ptr, memorder)
        ^                    ~~~
/usr/lib/clang/16/include/stdatomic.h:134:30: note: expanded from
macro 'atomic_load_explicit'
#define atomic_load_explicit __c11_atomic_load
                             ^
../drivers/net/octeon_ep/otx_ep_rxtx.c:924:10: error: address argument
to atomic operation must be a pointer to _Atomic type ('uint32_t *'
(aka 'unsigned int *') invalid)
                while (rte_atomic_load_explicit(droq->pkts_sent_ism,
rte_memory_order_relaxed) >=
                       ^                        ~~~~~~~~~~~~~~~~~~~
../lib/eal/include/rte_stdatomic.h:71:2: note: expanded from macro
'rte_atomic_load_explicit'
        atomic_load_explicit(ptr, memorder)
        ^                    ~~~
/usr/lib/clang/16/include/stdatomic.h:134:30: note: expanded from
macro 'atomic_load_explicit'
#define atomic_load_explicit __c11_atomic_load
                             ^
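
A minimal sketch of one way to satisfy clang's C11 atomic check, assuming
the ISM counter fields are re-declared with RTE_ATOMIC() (hypothetical
here; the fix actually taken by the series may differ):

	/* Hypothetical field declarations in otx_ep_common.h: marking the
	 * ISM locations as C11 atomics lets rte_atomic_load_explicit()
	 * expand cleanly under clang 16.
	 */
	RTE_ATOMIC(uint32_t) *inst_cnt_ism;   /* struct otx_ep_instr_queue */
	RTE_ATOMIC(uint32_t) *pkts_sent_ism;  /* struct otx_ep_droq */

	/* The fastpath load is then accepted by both GCC and clang: */
	uint32_t val = rte_atomic_load_explicit(iq->inst_cnt_ism,
						rte_memory_order_relaxed);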

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v5 0/3] rewrite fastpath routines
  2023-10-12  6:23     ` [PATCH v4 0/3] rewrite " Vamsi Attunuru
                         ` (2 preceding siblings ...)
  2023-10-12  6:23       ` [PATCH v4 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
@ 2023-10-18  8:07       ` Vamsi Attunuru
  2023-10-18  8:07         ` [PATCH v5 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
                           ` (3 more replies)
  3 siblings, 4 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-18  8:07 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

This series adds new fastpath routines for cn10k & cn9k endpoint
devices and adds support for the 32B Tx descriptor format, which
improves performance.
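
For reference, the 32B Tx command introduced in patch 3/3 carries only a
data pointer and an instruction header (a simplified view of the
cnxk_ep_instr_32B structure added by the series):

	struct cnxk_ep_instr_32B {
		/* Pointer where the input data is available. */
		uint64_t dptr;
		/* Instruction header: pkind, gather flag, length. */
		union otx_ep_instr_ih ih;
		/* Misc data bytes that can be passed as front data. */
		uint64_t rsvd[2];
	};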

V5 changes:
- Series rebased

v4 changes:
- Use rte_atomic_xxx instead of __atomic_xxx built-ins

v2 & v3 changes:
- Fixed CI

Shijith Thotton (1):
  net/octeon_ep: support 32B IQ descriptor size

Vamsi Attunuru (2):
  net/octeon_ep: clean up receive routine
  net/octeon_ep: add new fastpath routines

 drivers/net/octeon_ep/cnxk_ep_rx.c    | 310 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 210 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |  12 +-
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |  11 +-
 drivers/net/octeon_ep/otx_ep_common.h | 127 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   | 257 +++++++--------------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 drivers/net/octeon_ep/otx_ep_vf.c     |   8 +
 11 files changed, 805 insertions(+), 252 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx.c
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_tx.c

-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v5 1/3] net/octeon_ep: support 32B IQ descriptor size
  2023-10-18  8:07       ` [PATCH v5 0/3] rewrite " Vamsi Attunuru
@ 2023-10-18  8:07         ` Vamsi Attunuru
  2023-10-18  8:07         ` [PATCH v5 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-18  8:07 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton

From: Shijith Thotton <sthotton@marvell.com>

Update input queue setup to consider descriptor size in driver conf.
The default instruction size for otx2 and cnxk devices has been updated
to 32 bytes.

Signed-off-by: Shijith Thotton <sthotton@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx2_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx_ep_common.h |  4 ++++
 drivers/net/octeon_ep/otx_ep_vf.c     |  8 ++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 92c2d2ca5c..7b3669fe0c 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -106,6 +106,14 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(CNXK_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= CNXK_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_RSIZE(iq_no));
@@ -354,7 +362,7 @@ static const struct otx_ep_config default_cnxk_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index ced3a415a5..f72b8d25d7 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -256,6 +256,14 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(SDP_VF_R_IN_CTL_IS_64B);
+	else
+		reg_val |= SDP_VF_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + SDP_VF_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_RSIZE(iq_no));
@@ -500,7 +508,7 @@ static const struct otx_ep_config default_otx2_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index c150cbe619..90e059cad0 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -11,6 +11,7 @@
 
 #define OTX_EP_MAX_RINGS_PER_VF        (8)
 #define OTX_EP_CFG_IO_QUEUES        OTX_EP_MAX_RINGS_PER_VF
+#define OTX_EP_32BYTE_INSTR         (32)
 #define OTX_EP_64BYTE_INSTR         (64)
 /*
  * Backpressure for SDP is configured on Octeon, and the minimum queue sizes
@@ -215,6 +216,9 @@ struct otx_ep_instr_queue {
 	/* Number of  descriptors in this ring. */
 	uint32_t nb_desc;
 
+	/* Size of the descriptor. */
+	uint8_t desc_size;
+
 	/* Input ring index, where the driver should write the next packet */
 	uint32_t host_write_index;
 
diff --git a/drivers/net/octeon_ep/otx_ep_vf.c b/drivers/net/octeon_ep/otx_ep_vf.c
index 4f3538146b..236b7a874c 100644
--- a/drivers/net/octeon_ep/otx_ep_vf.c
+++ b/drivers/net/octeon_ep/otx_ep_vf.c
@@ -120,6 +120,14 @@ otx_ep_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 			return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (iq->desc_size == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(OTX_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= OTX_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + OTX_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	otx_ep_write64(iq->base_addr_dma, otx_ep->hw_addr,
 		       OTX_EP_R_IN_INSTR_BADDR(iq_no));
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v5 2/3] net/octeon_ep: clean up receive routine
  2023-10-18  8:07       ` [PATCH v5 0/3] rewrite " Vamsi Attunuru
  2023-10-18  8:07         ` [PATCH v5 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
@ 2023-10-18  8:07         ` Vamsi Attunuru
  2023-10-18  8:07         ` [PATCH v5 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
  2023-10-18 11:14         ` [PATCH v6 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-18  8:07 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Patch improves the Rx routine and the packet count update routines;
the packet count update routines need to drain in-flight ISM
memory updates while decrementing the packet count register.
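
A condensed sketch of the drain sequence applied in this patch (taken
from the diff below, simplified for illustration):

	/* Decrement the HW counter, then keep requesting ISM updates
	 * until the ISM memory location reflects the new value.
	 */
	rte_write32(val, droq->pkts_sent_reg);
	rte_mb();

	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
	while (rte_atomic_load_explicit(droq->pkts_sent_ism,
					rte_memory_order_relaxed) >= val) {
		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
		rte_mb();
	}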

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/otx_ep_rxtx.c | 164 ++++++++++++----------------
 1 file changed, 70 insertions(+), 94 deletions(-)

diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index b37fc8109f..2654e13e18 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -442,7 +442,15 @@ otx_vf_update_read_index(struct otx_ep_instr_queue *iq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, iq->inst_cnt_reg);
-		*iq->inst_cnt_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (rte_atomic_load_explicit(iq->inst_cnt_ism, rte_memory_order_relaxed) >=
+		       val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
 		iq->inst_cnt_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
@@ -567,9 +575,7 @@ prepare_xmit_gather_list(struct otx_ep_instr_queue *iq, struct rte_mbuf *m, uint
 
 	finfo = &iq->req_list[iq->host_write_index].finfo;
 	*dptr = rte_mem_virt2iova(finfo->g.sg);
-	ih->s.tlen = pkt_len + ih->s.fsz;
-	ih->s.gsz = frags;
-	ih->s.gather = 1;
+	ih->u64 |= ((1ULL << 62) | ((uint64_t)frags << 48) | (pkt_len + ih->s.fsz));
 
 	while (frags--) {
 		finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
@@ -752,36 +758,26 @@ otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
-	struct otx_ep_droq_desc *desc_ring;
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
 	struct otx_ep_droq_info *info;
 	struct rte_mbuf *buf = NULL;
 	uint32_t desc_refilled = 0;
 
-	desc_ring = droq->desc_ring;
-
 	while (droq->refill_count && (desc_refilled < droq->nb_desc)) {
-		/* If a valid buffer exists (happens if there is no dispatch),
-		 * reuse the buffer, else allocate.
-		 */
-		if (droq->recv_buf_list[droq->refill_idx] != NULL)
-			break;
-
 		buf = rte_pktmbuf_alloc(droq->mpool);
 		/* If a buffer could not be allocated, no point in
 		 * continuing
 		 */
-		if (buf == NULL) {
+		if (unlikely(!buf)) {
 			droq->stats.rx_alloc_failure++;
 			break;
 		}
 		info = rte_pktmbuf_mtod(buf, struct otx_ep_droq_info *);
-		memset(info, 0, sizeof(*info));
+		info->length = 0;
 
 		droq->recv_buf_list[droq->refill_idx] = buf;
 		desc_ring[droq->refill_idx].buffer_ptr =
 					rte_mbuf_data_iova_default(buf);
-
-
 		droq->refill_idx = otx_ep_incr_index(droq->refill_idx, 1,
 				droq->nb_desc);
 
@@ -793,21 +789,18 @@ otx_ep_droq_refill(struct otx_ep_droq *droq)
 }
 
 static struct rte_mbuf *
-otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
-			struct otx_ep_droq *droq, int next_fetch)
+otx_ep_droq_read_packet(struct otx_ep_device *otx_ep, struct otx_ep_droq *droq, int next_fetch)
 {
 	volatile struct otx_ep_droq_info *info;
-	struct rte_mbuf *droq_pkt2 = NULL;
-	struct rte_mbuf *droq_pkt = NULL;
-	struct rte_net_hdr_lens hdr_lens;
-	struct otx_ep_droq_info *info2;
+	struct rte_mbuf *mbuf_next = NULL;
+	struct rte_mbuf *mbuf = NULL;
 	uint64_t total_pkt_len;
 	uint32_t pkt_len = 0;
 	int next_idx;
 
-	droq_pkt  = droq->recv_buf_list[droq->read_idx];
-	droq_pkt2  = droq->recv_buf_list[droq->read_idx];
-	info = rte_pktmbuf_mtod(droq_pkt, struct otx_ep_droq_info *);
+	mbuf = droq->recv_buf_list[droq->read_idx];
+	info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
 	/* make sure info is available */
 	rte_rmb();
 	if (unlikely(!info->length)) {
@@ -828,32 +821,25 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 			assert(0);
 		}
 	}
+
 	if (next_fetch) {
 		next_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
-		droq_pkt2  = droq->recv_buf_list[next_idx];
-		info2 = rte_pktmbuf_mtod(droq_pkt2, struct otx_ep_droq_info *);
-		rte_prefetch_non_temporal((const void *)info2);
+		mbuf_next = droq->recv_buf_list[next_idx];
+		rte_prefetch0(rte_pktmbuf_mtod(mbuf_next, void *));
 	}
 
-	info->length = rte_bswap64(info->length);
+	info->length = rte_bswap16(info->length >> 48);
 	/* Deduce the actual data size */
 	total_pkt_len = info->length + OTX_EP_INFO_SIZE;
 	if (total_pkt_len <= droq->buffer_size) {
-		droq_pkt  = droq->recv_buf_list[droq->read_idx];
-		if (likely(droq_pkt != NULL)) {
-			droq_pkt->data_off += OTX_EP_INFO_SIZE;
-			/* otx_ep_dbg("OQ: pkt_len[%ld], buffer_size %d\n",
-			 * (long)info->length, droq->buffer_size);
-			 */
-			pkt_len = (uint32_t)info->length;
-			droq_pkt->pkt_len  = pkt_len;
-			droq_pkt->data_len  = pkt_len;
-			droq_pkt->port = otx_ep->port_id;
-			droq->recv_buf_list[droq->read_idx] = NULL;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
-			droq->refill_count++;
-		}
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		pkt_len = (uint32_t)info->length;
+		mbuf->pkt_len  = pkt_len;
+		mbuf->data_len  = pkt_len;
+		mbuf->port = otx_ep->port_id;
+		droq->recv_buf_list[droq->read_idx] = NULL;
+		droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
+		droq->refill_count++;
 	} else {
 		struct rte_mbuf *first_buf = NULL;
 		struct rte_mbuf *last_buf = NULL;
@@ -865,61 +851,50 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 		while (pkt_len < total_pkt_len) {
 			int cpy_len = 0;
 
-			cpy_len = ((pkt_len + droq->buffer_size) >
-					total_pkt_len)
-					? ((uint32_t)total_pkt_len -
-						pkt_len)
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len)
 					: droq->buffer_size;
 
-			droq_pkt = droq->recv_buf_list[droq->read_idx];
+			mbuf = droq->recv_buf_list[droq->read_idx];
 			droq->recv_buf_list[droq->read_idx] = NULL;
 
-			if (likely(droq_pkt != NULL)) {
+			if (likely(mbuf)) {
 				/* Note the first seg */
 				if (!pkt_len)
-					first_buf = droq_pkt;
+					first_buf = mbuf;
 
-				droq_pkt->port = otx_ep->port_id;
+				mbuf->port = otx_ep->port_id;
 				if (!pkt_len) {
-					droq_pkt->data_off +=
-						OTX_EP_INFO_SIZE;
-					droq_pkt->pkt_len =
-						cpy_len - OTX_EP_INFO_SIZE;
-					droq_pkt->data_len =
-						cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_off += OTX_EP_INFO_SIZE;
+					mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
 				} else {
-					droq_pkt->pkt_len = cpy_len;
-					droq_pkt->data_len = cpy_len;
+					mbuf->pkt_len = cpy_len;
+					mbuf->data_len = cpy_len;
 				}
 
 				if (pkt_len) {
 					first_buf->nb_segs++;
-					first_buf->pkt_len += droq_pkt->pkt_len;
+					first_buf->pkt_len += mbuf->pkt_len;
 				}
 
 				if (last_buf)
-					last_buf->next = droq_pkt;
+					last_buf->next = mbuf;
 
-				last_buf = droq_pkt;
+				last_buf = mbuf;
 			} else {
 				otx_ep_err("no buf\n");
 				assert(0);
 			}
 
 			pkt_len += cpy_len;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
 			droq->refill_count++;
 		}
-		droq_pkt = first_buf;
+		mbuf = first_buf;
 	}
-	droq_pkt->packet_type = rte_net_get_ptype(droq_pkt, &hdr_lens,
-					RTE_PTYPE_ALL_MASK);
-	droq_pkt->l2_len = hdr_lens.l2_len;
-	droq_pkt->l3_len = hdr_lens.l3_len;
-	droq_pkt->l4_len = hdr_lens.l4_len;
 
-	return droq_pkt;
+	return mbuf;
 }
 
 static inline uint32_t
@@ -943,7 +918,15 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, droq->pkts_sent_reg);
-		*droq->pkts_sent_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (rte_atomic_load_explicit(droq->pkts_sent_ism, rte_memory_order_relaxed) >=
+		       val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
 		droq->pkts_sent_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
@@ -952,36 +935,30 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 	return new_pkts;
 }
 
+static inline int32_t __rte_hot
+otx_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (unlikely(droq->pkts_pending < nb_pkts))
+		otx_ep_check_droq_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
 /* Check for response arrival from OCTEON 9
  * returns number of requests completed
  */
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget)
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	struct otx_ep_droq *droq = rx_queue;
 	struct otx_ep_device *otx_ep;
 	struct rte_mbuf *oq_pkt;
-
-	uint32_t pkts = 0;
+	uint16_t pkts, new_pkts;
 	uint32_t valid_pkts = 0;
-	uint32_t new_pkts = 0;
 	int next_fetch;
 
 	otx_ep = droq->otx_ep_dev;
-
-	if (droq->pkts_pending > budget) {
-		new_pkts = budget;
-	} else {
-		new_pkts = droq->pkts_pending;
-		new_pkts += otx_ep_check_droq_pkts(droq);
-		if (new_pkts > budget)
-			new_pkts = budget;
-	}
-
-	if (!new_pkts)
-		goto update_credit; /* No pkts at this moment */
+	new_pkts = otx_ep_rx_pkts_to_process(droq, nb_pkts);
 
 	for (pkts = 0; pkts < new_pkts; pkts++) {
 		/* Push the received pkt to application */
@@ -1006,7 +983,6 @@ otx_ep_recv_pkts(void *rx_queue,
 	droq->pkts_pending -= pkts;
 
 	/* Refill DROQ buffers */
-update_credit:
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		int desc_refilled = otx_ep_droq_refill(droq);
 
@@ -1014,7 +990,7 @@ otx_ep_recv_pkts(void *rx_queue,
 		 * that when we update the credits the data in memory is
 		 * accurate.
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		rte_write32(desc_refilled, droq->pkts_credit_reg);
 	} else {
 		/*
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v5 3/3] net/octeon_ep: add new fastpath routines
  2023-10-18  8:07       ` [PATCH v5 0/3] rewrite " Vamsi Attunuru
  2023-10-18  8:07         ` [PATCH v5 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
  2023-10-18  8:07         ` [PATCH v5 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
@ 2023-10-18  8:07         ` Vamsi Attunuru
  2023-10-18 11:14         ` [PATCH v6 0/3] rewrite " Vamsi Attunuru
  3 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-18  8:07 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Adds new fastpath routines for cn10k & cn9k endpoint
devices and selects them at runtime based on the
offload flags.

The patch also adds miscellaneous changes to improve
performance and code readability.
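
One of those performance changes is precomputing the invariant part of
the Tx instruction header per queue. A condensed sketch is below; the
field names follow this patch, but ep_iq_fill_cmd() is a hypothetical
helper used only for illustration, not a function added by the patch.

    /* Queue setup stores the bits that never change for this ring,
     * e.g. iq->partial_ih = (uint64_t)otx_ep->pkind << 36; the Tx
     * hot loop then only merges in the per-packet length.
     */
    static inline void
    ep_iq_fill_cmd(struct otx_ep_instr_queue *iq,
                   struct cnxk_ep_instr_32B *cmd, struct rte_mbuf *m)
    {
            cmd->ih.u64 = iq->partial_ih | rte_pktmbuf_data_len(m);
            cmd->dptr = rte_mbuf_data_iova(m);
    }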

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx.c    | 310 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 210 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |   2 +
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |   1 +
 drivers/net/octeon_ep/otx_ep_common.h | 125 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   |  93 +-------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 10 files changed, 706 insertions(+), 157 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
new file mode 100644
index 0000000000..22bf3ce7a7
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -0,0 +1,310 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "otx_ep_common.h"
+#include "otx2_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static inline int
+cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
+{
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t refill_idx = droq->refill_idx;
+	struct rte_mbuf *buf;
+	uint32_t i;
+	int rc;
+
+	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	if (unlikely(rc)) {
+		droq->stats.rx_alloc_failure++;
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		buf = recv_buf_list[refill_idx];
+		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
+		refill_idx++;
+	}
+
+	droq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc);
+	droq->refill_count -= count;
+
+	return 0;
+}
+
+static inline void
+cnxk_ep_rx_refill(struct otx_ep_droq *droq)
+{
+	uint32_t desc_refilled = 0, count;
+	uint32_t nb_desc = droq->nb_desc;
+	uint32_t refill_idx = droq->refill_idx;
+	int rc;
+
+	if (unlikely(droq->read_idx == refill_idx))
+		return;
+
+	if (refill_idx < droq->read_idx) {
+		count = droq->read_idx - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled = count;
+	} else {
+		count = nb_desc - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+
+		desc_refilled = count;
+		count = droq->read_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled += count;
+	}
+
+	/* Flush the droq descriptor data to memory to be sure
+	 * that when we update the credits the data in memory is
+	 * accurate.
+	 */
+	rte_io_wmb();
+	rte_write32(desc_refilled, droq->pkts_credit_reg);
+}
+
+static inline uint32_t
+cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq)
+{
+	uint32_t new_pkts;
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = rte_atomic_load_explicit(droq->pkts_sent_ism, rte_memory_order_relaxed);
+	new_pkts = val - droq->pkts_sent_ism_prev;
+	droq->pkts_sent_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, droq->pkts_sent_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (rte_atomic_load_explicit(droq->pkts_sent_ism, rte_memory_order_relaxed) >=
+		       val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
+		droq->pkts_sent_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+	droq->pkts_pending += new_pkts;
+
+	return new_pkts;
+}
+
+static inline int16_t __rte_hot
+cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (droq->pkts_pending < nb_pkts)
+		cnxk_ep_check_rx_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *mbuf;
+		uint16_t pkt_len;
+
+		mbuf = recv_buf_list[read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
+		pkt_len = rte_bswap16(info->length >> 48);
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		mbuf->pkt_len = pkt_len;
+		mbuf->data_len = pkt_len;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+	droq->read_idx = read_idx;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar_mseg(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+				 uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t total_pkt_len, bytes_rsvd = 0;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *first_buf = NULL;
+		struct rte_mbuf *last_buf = NULL;
+		struct rte_mbuf *mbuf;
+		uint32_t pkt_len = 0;
+
+		mbuf = recv_buf_list[droq->read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
+		total_pkt_len = rte_bswap16(info->length >> 48) + OTX_EP_INFO_SIZE;
+
+		while (pkt_len < total_pkt_len) {
+			int cpy_len;
+
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len) : droq->buffer_size;
+
+			mbuf = droq->recv_buf_list[droq->read_idx];
+
+			if (!pkt_len) {
+				/* Note the first seg */
+				first_buf = mbuf;
+				mbuf->data_off += OTX_EP_INFO_SIZE;
+				mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+				mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
+			} else {
+				mbuf->pkt_len = cpy_len;
+				mbuf->data_len = cpy_len;
+				first_buf->nb_segs++;
+				first_buf->pkt_len += mbuf->pkt_len;
+			}
+
+			if (last_buf)
+				last_buf->next = mbuf;
+
+			last_buf = mbuf;
+
+			pkt_len += cpy_len;
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, nb_desc);
+			droq->refill_count++;
+		}
+		mbuf = first_buf;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= pkts;
+	/* Stats */
+	droq->stats.pkts_received += pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_tx.c b/drivers/net/octeon_ep/cnxk_ep_tx.c
new file mode 100644
index 0000000000..86f771ca7e
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_tx.c
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static uint32_t
+cnxk_vf_update_read_index(struct otx_ep_instr_queue *iq)
+{
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = rte_atomic_load_explicit(iq->inst_cnt_ism, rte_memory_order_relaxed);
+	iq->inst_cnt += val - iq->inst_cnt_ism_prev;
+	iq->inst_cnt_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, iq->inst_cnt_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (rte_atomic_load_explicit(iq->inst_cnt_ism, rte_memory_order_relaxed) >=
+		       val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
+		iq->inst_cnt_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+
+	/* Modulo of the new index with the IQ size will give us
+	 * the new index.
+	 */
+	return iq->inst_cnt & (iq->nb_desc - 1);
+}
+
+static inline void
+cnxk_ep_flush_iq(struct otx_ep_instr_queue *iq)
+{
+	uint32_t instr_processed = 0;
+	uint32_t cnt = 0;
+
+	iq->otx_read_index = cnxk_vf_update_read_index(iq);
+
+	if (unlikely(iq->flush_index == iq->otx_read_index))
+		return;
+
+	if (iq->flush_index < iq->otx_read_index) {
+		instr_processed = iq->otx_read_index - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+	} else {
+		cnt = iq->nb_desc - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], cnt);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, cnt, iq->nb_desc);
+
+		instr_processed = iq->otx_read_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+
+		instr_processed += cnt;
+	}
+
+	iq->stats.instr_processed = instr_processed;
+	iq->instr_pending -= instr_processed;
+}
+
+static inline void
+set_sg_size(struct otx_ep_sg_entry *sg_entry, uint16_t size, uint32_t pos)
+{
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+	sg_entry->u.size[pos] = size;
+#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	sg_entry->u.size[(OTX_EP_NUM_SG_PTRS - 1) - pos] = size;
+#endif
+}
+
+static __rte_always_inline void
+cnxk_ep_xmit_pkts_scalar(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq, uint16_t nb_pkts)
+{
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len;
+	uint32_t tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		m = tx_pkts[pkts];
+		iq->mbuf_list[write_idx] = m;
+		pkt_len = rte_pktmbuf_data_len(m);
+
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->ih.u64 = iq->partial_ih | pkt_len;
+		iqcmd->dptr = rte_mbuf_data_iova(m); /*dptr*/
+		tx_bytes += pkt_len;
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+	}
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+}
+
+static __rte_always_inline uint16_t
+cnxk_ep_xmit_pkts_scalar_mseg(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq,
+			      uint16_t nb_pkts)
+{
+	uint16_t frags, num_sg, mask = OTX_EP_NUM_SG_PTRS - 1;
+	struct otx_ep_buf_free_info *finfo;
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len, tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		uint16_t j = 0;
+
+		m = tx_pkts[pkts];
+		frags = m->nb_segs;
+
+		pkt_len = rte_pktmbuf_pkt_len(m);
+		num_sg = (frags + mask) / OTX_EP_NUM_SG_PTRS;
+
+		if (unlikely(pkt_len > OTX_EP_MAX_PKT_SZ && num_sg > OTX_EP_MAX_SG_LISTS)) {
+			otx_ep_err("Failed to xmit the pkt, pkt_len is higher or pkt has more segments\n");
+			goto exit;
+		}
+
+		finfo = &iq->req_list[write_idx].finfo;
+
+		iq->mbuf_list[write_idx] = m;
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->dptr = rte_mem_virt2iova(finfo->g.sg);
+		iqcmd->ih.u64 = iq->partial_ih | (1ULL << 62) | ((uint64_t)frags << 48) | pkt_len;
+
+		while (frags--) {
+			finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
+			set_sg_size(&finfo->g.sg[(j >> 2)], m->data_len, (j & mask));
+			j++;
+			m = m->next;
+		}
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+		tx_bytes += pkt_len;
+	}
+exit:
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	cnxk_ep_xmit_pkts_scalar(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	pkts = cnxk_ep_xmit_pkts_scalar_mseg(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 7b3669fe0c..ef275703c3 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -156,6 +156,8 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (void *)iq->inst_cnt_ism, ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
+
 	return 0;
 }
 
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.h b/drivers/net/octeon_ep/cnxk_ep_vf.h
index 86277449ea..41d8fbbb3a 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.h
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.h
@@ -6,6 +6,8 @@
 
 #include <rte_io.h>
 
+#include "otx_ep_common.h"
+
 #define CNXK_CONFIG_XPANSION_BAR             0x38
 #define CNXK_CONFIG_PCIE_CAP                 0x70
 #define CNXK_CONFIG_PCIE_DEVCAP              0x74
@@ -178,6 +180,17 @@ struct cnxk_ep_instr_64B {
 	uint64_t exhdr[4];
 };
 
+struct cnxk_ep_instr_32B {
+	/* Pointer where the input data is available. */
+	uint64_t dptr;
+
+	/* OTX_EP Instruction Header. */
+	union otx_ep_instr_ih ih;
+
+	/* Misc data bytes that can be passed as front data */
+	uint64_t rsvd[2];
+};
+
 #define CNXK_EP_IQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue) + 4)
 #define CNXK_EP_OQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue))
 #define CNXK_EP_ISM_EN                  (0x1)
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e698bf9792..749776d70c 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -9,4 +9,6 @@ sources = files(
         'otx2_ep_vf.c',
         'cnxk_ep_vf.c',
         'otx_ep_mbox.c',
+        'cnxk_ep_rx.c',
+        'cnxk_ep_tx.c',
 )
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index f72b8d25d7..7f4edf8dcf 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -307,6 +307,7 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (unsigned int)ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
 
 	return 0;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index 90e059cad0..82e57520d3 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -4,7 +4,20 @@
 #ifndef _OTX_EP_COMMON_H_
 #define _OTX_EP_COMMON_H_
 
+#include <rte_bitops.h>
 #include <rte_spinlock.h>
+#include <unistd.h>
+#include <assert.h>
+#include <rte_eal.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_io.h>
+#include <rte_net.h>
+#include <ethdev_pci.h>
+
+#define OTX_EP_CN8XX  RTE_BIT32(0)
+#define OTX_EP_CN9XX  RTE_BIT32(1)
+#define OTX_EP_CN10XX RTE_BIT32(2)
 
 #define OTX_EP_NW_PKT_OP               0x1220
 #define OTX_EP_NW_CMD_OP               0x1221
@@ -38,7 +51,7 @@
 #define OTX_EP_NORESP_OHSM_SEND     (4)
 #define OTX_EP_NORESP_LAST          (4)
 #define OTX_EP_PCI_RING_ALIGN   65536
-#define OTX_EP_MAX_SG_LISTS 4
+#define OTX_EP_MAX_SG_LISTS 6
 #define OTX_EP_NUM_SG_PTRS 4
 #define SDP_PKIND 40
 #define SDP_OTX2_PKIND 57
@@ -203,6 +216,38 @@ struct otx_ep_iq_config {
  *  such structure to represent it.
  */
 struct otx_ep_instr_queue {
+	/* Location in memory updated by SDP ISM */
+	uint32_t *inst_cnt_ism;
+	struct rte_mbuf **mbuf_list;
+	/* Pointer to the Virtual Base addr of the input ring. */
+	uint8_t *base_addr;
+
+	/* track inst count locally to consolidate HW counter updates */
+	uint32_t inst_cnt_ism_prev;
+
+	/* Input ring index, where the driver should write the next packet */
+	uint32_t host_write_index;
+
+	/* Input ring index, where the OCTEON 9 should read the next packet */
+	uint32_t otx_read_index;
+	/** This index aids in finding the window in the queue where OCTEON 9
+	 *  has read the commands.
+	 */
+	uint32_t flush_index;
+	/* This keeps track of the instructions pending in this queue. */
+	uint64_t instr_pending;
+
+	/* Memory zone */
+	const struct rte_memzone *iq_mz;
+	/* OTX_EP doorbell register for the ring. */
+	void *doorbell_reg;
+
+	/* OTX_EP instruction count register for this ring. */
+	void *inst_cnt_reg;
+
+	/* Number of instructions pending to be posted to OCTEON 9. */
+	uint32_t fill_cnt;
+
 	struct otx_ep_device *otx_ep_dev;
 
 	uint32_t q_no;
@@ -219,54 +264,21 @@ struct otx_ep_instr_queue {
 	/* Size of the descriptor. */
 	uint8_t desc_size;
 
-	/* Input ring index, where the driver should write the next packet */
-	uint32_t host_write_index;
-
-	/* Input ring index, where the OCTEON 9 should read the next packet */
-	uint32_t otx_read_index;
-
 	uint32_t reset_instr_cnt;
 
-	/** This index aids in finding the window in the queue where OCTEON 9
-	 *  has read the commands.
-	 */
-	uint32_t flush_index;
-
 	/* Free-running/wrapping instruction counter for IQ. */
 	uint32_t inst_cnt;
 
-	/* This keeps track of the instructions pending in this queue. */
-	uint64_t instr_pending;
-
-	/* Pointer to the Virtual Base addr of the input ring. */
-	uint8_t *base_addr;
+	uint64_t partial_ih;
 
 	/* This IQ request list */
 	struct otx_ep_instr_list *req_list;
 
-	/* OTX_EP doorbell register for the ring. */
-	void *doorbell_reg;
-
-	/* OTX_EP instruction count register for this ring. */
-	void *inst_cnt_reg;
-
-	/* Number of instructions pending to be posted to OCTEON 9. */
-	uint32_t fill_cnt;
-
 	/* Statistics for this input queue. */
 	struct otx_ep_iq_stats stats;
 
 	/* DMA mapped base address of the input descriptor ring. */
 	uint64_t base_addr_dma;
-
-	/* Memory zone */
-	const struct rte_memzone *iq_mz;
-
-	/* Location in memory updated by SDP ISM */
-	uint32_t *inst_cnt_ism;
-
-	/* track inst count locally to consolidate HW counter updates */
-	uint32_t inst_cnt_ism_prev;
 };
 
 /** Descriptor format.
@@ -344,14 +356,17 @@ struct otx_ep_oq_config {
 
 /* The Descriptor Ring Output Queue(DROQ) structure. */
 struct otx_ep_droq {
-	struct otx_ep_device *otx_ep_dev;
 	/* The 8B aligned descriptor ring starts at this address. */
 	struct otx_ep_droq_desc *desc_ring;
 
-	uint32_t q_no;
-	uint64_t last_pkt_count;
+	/* The 8B aligned info ptrs begin from this address. */
+	struct otx_ep_droq_info *info_list;
 
-	struct rte_mempool *mpool;
+	/* receive buffer list contains mbuf ptr list */
+	struct rte_mbuf **recv_buf_list;
+
+	/* Packets pending to be processed */
+	uint64_t pkts_pending;
 
 	/* Driver should read the next packet at this index */
 	uint32_t read_idx;
@@ -362,22 +377,17 @@ struct otx_ep_droq {
 	/* At this index, the driver will refill the descriptor's buffer */
 	uint32_t refill_idx;
 
-	/* Packets pending to be processed */
-	uint64_t pkts_pending;
+	/* The number of descriptors pending to refill. */
+	uint32_t refill_count;
 
 	/* Number of descriptors in this ring. */
 	uint32_t nb_desc;
 
-	/* The number of descriptors pending to refill. */
-	uint32_t refill_count;
-
 	uint32_t refill_threshold;
 
-	/* The 8B aligned info ptrs begin from this address. */
-	struct otx_ep_droq_info *info_list;
+	uint64_t last_pkt_count;
 
-	/* receive buffer list contains mbuf ptr list */
-	struct rte_mbuf **recv_buf_list;
+	struct rte_mempool *mpool;
 
 	/* The size of each buffer pointed by the buffer pointer. */
 	uint32_t buffer_size;
@@ -392,6 +402,13 @@ struct otx_ep_droq {
 	 */
 	void *pkts_sent_reg;
 
+	/* Pointer to host memory copy of output packet count, set by ISM */
+	uint32_t *pkts_sent_ism;
+	uint32_t pkts_sent_ism_prev;
+
+	/* Statistics for this DROQ. */
+	struct otx_ep_droq_stats stats;
+
 	/** Handle DMA incompletion during pkt reads.
 	 * This variable is used to initiate a sent_reg_read
 	 * that completes pending dma
@@ -400,8 +417,9 @@ struct otx_ep_droq {
 	 */
 	uint32_t sent_reg_val;
 
-	/* Statistics for this DROQ. */
-	struct otx_ep_droq_stats stats;
+	uint32_t q_no;
+
+	struct otx_ep_device *otx_ep_dev;
 
 	/* DMA mapped address of the DROQ descriptor ring. */
 	size_t desc_ring_dma;
@@ -419,10 +437,6 @@ struct otx_ep_droq {
 	const struct rte_memzone *desc_ring_mz;
 
 	const struct rte_memzone *info_mz;
-
-	/* Pointer to host memory copy of output packet count, set by ISM */
-	uint32_t *pkts_sent_ism;
-	uint32_t pkts_sent_ism_prev;
 };
 #define OTX_EP_DROQ_SIZE		(sizeof(struct otx_ep_droq))
 
@@ -545,6 +559,9 @@ struct otx_ep_device {
 
 	/* Negotiated Mbox version */
 	uint32_t mbox_neg_ver;
+
+	/* Generation */
+	uint32_t chip_gen;
 };
 
 int otx_ep_setup_iqs(struct otx_ep_device *otx_ep, uint32_t iq_no,
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 970372bbd7..615cbbb648 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -27,6 +27,46 @@ static const struct rte_eth_desc_lim otx_ep_tx_desc_lim = {
 	.nb_align	= OTX_EP_TXD_ALIGN,
 };
 
+static void
+otx_ep_set_tx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX || otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
+		if (otx_epvf->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS)
+			eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts_mseg;
+	} else {
+		eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].tx_pkt_burst =
+			eth_dev->tx_pkt_burst;
+}
+
+static void
+otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX) {
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
+	} else if (otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
+	} else {
+		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].rx_pkt_burst =
+			eth_dev->rx_pkt_burst;
+}
+
 static int
 otx_ep_dev_info_get(struct rte_eth_dev *eth_dev,
 		    struct rte_eth_dev_info *devinfo)
@@ -154,6 +194,10 @@ otx_ep_dev_start(struct rte_eth_dev *eth_dev)
 	}
 
 	otx_ep_dev_link_update(eth_dev, 0);
+
+	otx_ep_set_tx_func(eth_dev);
+	otx_ep_set_rx_func(eth_dev);
+
 	otx_ep_info("dev started\n");
 
 	for (q = 0; q < eth_dev->data->nb_rx_queues; q++)
@@ -266,18 +310,23 @@ otx_epdev_init(struct otx_ep_device *otx_epvf)
 
 	otx_epvf->fn_list.setup_device_regs(otx_epvf);
 
+	otx_epvf->eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
 	otx_epvf->eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF)
+	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF) {
 		otx_epvf->eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
-	else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
+		otx_epvf->chip_gen = OTX_EP_CN8XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CN98XX_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CNF95N_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
-		otx_epvf->eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN9XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN10XX;
 	} else {
 		otx_ep_err("Invalid chip_id\n");
 		ret = -EINVAL;
@@ -667,8 +716,8 @@ otx_ep_eth_dev_init(struct rte_eth_dev *eth_dev)
 	/* Single process support */
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
 		eth_dev->dev_ops = &otx_ep_eth_dev_ops;
-		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-		eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		otx_ep_set_tx_func(eth_dev);
+		otx_ep_set_rx_func(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index 2654e13e18..f53f0578ef 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -13,15 +13,8 @@
 
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
-#include "otx2_ep_vf.h"
 #include "otx_ep_rxtx.h"
 
-/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
-#define OTX_EP_INFO_SIZE 8
-#define OTX_EP_FSZ_FS0 0
-#define DROQ_REFILL_THRESHOLD 16
-#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
-
 static void
 otx_ep_dmazone_free(const struct rte_memzone *mz)
 {
@@ -144,6 +137,13 @@ otx_ep_init_instr_queue(struct otx_ep_device *otx_ep, int iq_no, int num_descs,
 		     iq_no, iq->base_addr, (unsigned long)iq->base_addr_dma,
 		     iq->nb_desc);
 
+	iq->mbuf_list = rte_zmalloc_socket("mbuf_list",	(iq->nb_desc * sizeof(struct rte_mbuf *)),
+					   RTE_CACHE_LINE_SIZE, rte_socket_id());
+	if (!iq->mbuf_list) {
+		otx_ep_err("IQ[%d] mbuf_list alloc failed\n", iq_no);
+		goto iq_init_fail;
+	}
+
 	iq->otx_ep_dev = otx_ep;
 	iq->q_no = iq_no;
 	iq->fill_cnt = 0;
@@ -676,85 +676,6 @@ otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 	return count;
 }
 
-/* Enqueue requests/packets to OTX_EP IQ queue.
- * returns number of requests enqueued successfully
- */
-uint16_t
-otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
-{
-	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
-	struct otx_ep_device *otx_ep = iq->otx_ep_dev;
-	struct otx2_ep_instr_64B iqcmd2;
-	uint32_t iqreq_type;
-	struct rte_mbuf *m;
-	uint32_t pkt_len;
-	int count = 0;
-	uint16_t i;
-	int dbell;
-	int index;
-
-	iqcmd2.ih.u64 = 0;
-	iqcmd2.irh.u64 = 0;
-
-	/* ih invars */
-	iqcmd2.ih.s.fsz = OTX_EP_FSZ_FS0;
-	iqcmd2.ih.s.pkind = otx_ep->pkind; /* The SDK decided PKIND value */
-	/* irh invars */
-	iqcmd2.irh.s.opcode = OTX_EP_NW_PKT_OP;
-
-	for (i = 0; i < nb_pkts; i++) {
-		m = pkts[i];
-		if (m->nb_segs == 1) {
-			pkt_len = rte_pktmbuf_data_len(m);
-			iqcmd2.ih.s.tlen = pkt_len + iqcmd2.ih.s.fsz;
-			iqcmd2.dptr = rte_mbuf_data_iova(m); /*dptr*/
-			iqcmd2.ih.s.gather = 0;
-			iqcmd2.ih.s.gsz = 0;
-			iqreq_type = OTX_EP_REQTYPE_NORESP_NET;
-		} else {
-			if (!(otx_ep->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS))
-				goto xmit_fail;
-
-			if (unlikely(prepare_xmit_gather_list(iq, m, &iqcmd2.dptr, &iqcmd2.ih) < 0))
-				goto xmit_fail;
-
-			pkt_len = rte_pktmbuf_pkt_len(m);
-			iqreq_type = OTX_EP_REQTYPE_NORESP_GATHER;
-		}
-
-		iqcmd2.irh.u64 = rte_bswap64(iqcmd2.irh.u64);
-
-#ifdef OTX_EP_IO_DEBUG
-		otx_ep_dbg("After swapping\n");
-		otx_ep_dbg("Word0 [dptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.dptr);
-		otx_ep_dbg("Word1 [ihtx]: 0x%016lx\n", (unsigned long)iqcmd.ih);
-		otx_ep_dbg("Word2 [pki_ih3]: 0x%016lx\n",
-			   (unsigned long)iqcmd.pki_ih3);
-		otx_ep_dbg("Word3 [rptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.rptr);
-		otx_ep_dbg("Word4 [irh]: 0x%016lx\n", (unsigned long)iqcmd.irh);
-		otx_ep_dbg("Word5 [exhdr[0]]: 0x%016lx\n",
-			   (unsigned long)iqcmd.exhdr[0]);
-#endif
-		index = iq->host_write_index;
-		dbell = (i == (unsigned int)(nb_pkts - 1)) ? 1 : 0;
-		if (otx_ep_send_data(otx_ep, iq, &iqcmd2, dbell))
-			goto xmit_fail;
-		otx_ep_iqreq_add(iq, m, iqreq_type, index);
-		iq->stats.tx_pkts++;
-		iq->stats.tx_bytes += pkt_len;
-		count++;
-	}
-
-xmit_fail:
-	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
-		otx_ep_flush_iq(iq);
-
-	/* Return no# of instructions posted successfully. */
-	return count;
-}
-
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 3f12527004..cb68ef3b41 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -7,29 +7,53 @@
 
 #include <rte_byteorder.h>
 
-#define OTX_EP_RXD_ALIGN 2
-#define OTX_EP_TXD_ALIGN 2
+#define OTX_EP_RXD_ALIGN 8
+#define OTX_EP_TXD_ALIGN 8
 
 #define OTX_EP_IQ_SEND_FAILED      (-1)
 #define OTX_EP_IQ_SEND_SUCCESS     (0)
 
-#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10000
+#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10
 
 #define OTX_EP_FSZ 28
 #define OTX2_EP_FSZ 24
-#define OTX_EP_MAX_INSTR 16
+#define OTX_EP_MAX_INSTR 128
+
+/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
+#define OTX_EP_INFO_SIZE 8
+#define DROQ_REFILL_THRESHOLD 16
+#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
 
 static inline uint32_t
 otx_ep_incr_index(uint32_t index, uint32_t count, uint32_t max)
 {
 	return ((index + count) & (max - 1));
 }
+
 uint16_t
 otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
 uint16_t
 otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget);
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v6 0/3] rewrite fastpath routines
  2023-10-18  8:07       ` [PATCH v5 0/3] rewrite " Vamsi Attunuru
                           ` (2 preceding siblings ...)
  2023-10-18  8:07         ` [PATCH v5 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
@ 2023-10-18 11:14         ` Vamsi Attunuru
  2023-10-18 11:14           ` [PATCH v6 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
                             ` (2 more replies)
  3 siblings, 3 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-18 11:14 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

This series adds new fastpath routines for cn10k & cn9k endpoint
devices and supports the 32B Tx descriptor format, which improves
performance.
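
For context, the 32B instruction introduced for the Tx path (see patch
3/3) is sketched below; compared with the existing 64B format it drops
the four extension-header words that the fast path does not use.

    /* Sketch of the 32B Tx instruction, as defined in cnxk_ep_vf.h
     * by patch 3/3 of this series.
     */
    struct cnxk_ep_instr_32B {
            uint64_t dptr;             /* IOVA of the packet data  */
            union otx_ep_instr_ih ih;  /* instruction header       */
            uint64_t rsvd[2];          /* optional front data      */
    };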

V6 changes:
- Use __atomic_xxx built-ins to fix CI build

V5 changes:
- Series rebased

v4 changes:
- Use rte_atomic_xxx instead of __atomic_xxx built-ins

v2 & v3 changes:
- Fixed CI

Shijith Thotton (1):
  net/octeon_ep: support 32B IQ descriptor size

Vamsi Attunuru (2):
  net/octeon_ep: clean up receive routine
  net/octeon_ep: add new fastpath routines

 drivers/net/octeon_ep/cnxk_ep_rx.c    | 309 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 209 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |  12 +-
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |  11 +-
 drivers/net/octeon_ep/otx_ep_common.h | 127 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   | 255 +++++++--------------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 drivers/net/octeon_ep/otx_ep_vf.c     |   8 +
 11 files changed, 801 insertions(+), 252 deletions(-)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx.c
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_tx.c

-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v6 1/3] net/octeon_ep: support 32B IQ descriptor size
  2023-10-18 11:14         ` [PATCH v6 0/3] rewrite " Vamsi Attunuru
@ 2023-10-18 11:14           ` Vamsi Attunuru
  2023-10-18 11:14           ` [PATCH v6 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
  2023-10-18 11:14           ` [PATCH v6 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
  2 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-18 11:14 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton

From: Shijith Thotton <sthotton@marvell.com>

Update the input queue setup to take the descriptor size from the
driver configuration. The default instruction size for otx2 and cnxk
devices has been updated to 32 bytes.
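
With the descriptor size carried in the queue structure, ring entries
can be addressed uniformly for both 32B and 64B instructions. The
helper below is hypothetical and only illustrates the addressing
scheme (base address plus index times desc_size) that the Tx path
relies on.

    /* Illustrative only: iq->base_addr and iq->desc_size come from
     * the queue setup updated by this patch.
     */
    static inline void *
    iq_desc_addr(struct otx_ep_instr_queue *iq, uint32_t idx)
    {
            return (void *)(iq->base_addr + (idx * iq->desc_size));
    }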

Signed-off-by: Shijith Thotton <sthotton@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx2_ep_vf.c    | 10 +++++++++-
 drivers/net/octeon_ep/otx_ep_common.h |  4 ++++
 drivers/net/octeon_ep/otx_ep_vf.c     |  8 ++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 92c2d2ca5c..7b3669fe0c 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -106,6 +106,14 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(CNXK_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= CNXK_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + CNXK_EP_R_IN_INSTR_RSIZE(iq_no));
@@ -354,7 +362,7 @@ static const struct otx_ep_config default_cnxk_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index ced3a415a5..f72b8d25d7 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -256,6 +256,14 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (otx_ep->conf->iq.instr_type == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(SDP_VF_R_IN_CTL_IS_64B);
+	else
+		reg_val |= SDP_VF_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + SDP_VF_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_BADDR(iq_no));
 	oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + SDP_VF_R_IN_INSTR_RSIZE(iq_no));
@@ -500,7 +508,7 @@ static const struct otx_ep_config default_otx2_ep_conf = {
 	/* IQ attributes */
 	.iq                        = {
 		.max_iqs           = OTX_EP_CFG_IO_QUEUES,
-		.instr_type        = OTX_EP_64BYTE_INSTR,
+		.instr_type        = OTX_EP_32BYTE_INSTR,
 		.pending_list_size = (OTX_EP_MAX_IQ_DESCRIPTORS *
 				      OTX_EP_CFG_IO_QUEUES),
 	},
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index c150cbe619..90e059cad0 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -11,6 +11,7 @@
 
 #define OTX_EP_MAX_RINGS_PER_VF        (8)
 #define OTX_EP_CFG_IO_QUEUES        OTX_EP_MAX_RINGS_PER_VF
+#define OTX_EP_32BYTE_INSTR         (32)
 #define OTX_EP_64BYTE_INSTR         (64)
 /*
  * Backpressure for SDP is configured on Octeon, and the minimum queue sizes
@@ -215,6 +216,9 @@ struct otx_ep_instr_queue {
 	/* Number of  descriptors in this ring. */
 	uint32_t nb_desc;
 
+	/* Size of the descriptor. */
+	uint8_t desc_size;
+
 	/* Input ring index, where the driver should write the next packet */
 	uint32_t host_write_index;
 
diff --git a/drivers/net/octeon_ep/otx_ep_vf.c b/drivers/net/octeon_ep/otx_ep_vf.c
index 4f3538146b..236b7a874c 100644
--- a/drivers/net/octeon_ep/otx_ep_vf.c
+++ b/drivers/net/octeon_ep/otx_ep_vf.c
@@ -120,6 +120,14 @@ otx_ep_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 			return -EIO;
 	}
 
+	/* Configure input queue instruction size. */
+	if (iq->desc_size == OTX_EP_32BYTE_INSTR)
+		reg_val &= ~(OTX_EP_R_IN_CTL_IS_64B);
+	else
+		reg_val |= OTX_EP_R_IN_CTL_IS_64B;
+	oct_ep_write64(reg_val, otx_ep->hw_addr + OTX_EP_R_IN_CONTROL(iq_no));
+	iq->desc_size = otx_ep->conf->iq.instr_type;
+
 	/* Write the start of the input queue's ring and its size  */
 	otx_ep_write64(iq->base_addr_dma, otx_ep->hw_addr,
 		       OTX_EP_R_IN_INSTR_BADDR(iq_no));
-- 
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v6 2/3] net/octeon_ep: clean up receive routine
  2023-10-18 11:14         ` [PATCH v6 0/3] rewrite " Vamsi Attunuru
  2023-10-18 11:14           ` [PATCH v6 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
@ 2023-10-18 11:14           ` Vamsi Attunuru
  2023-10-18 11:14           ` [PATCH v6 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
  2 siblings, 0 replies; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-18 11:14 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

This patch improves the Rx routine and the packet count update
routines; the packet count update routines need to drain in-flight
ISM memory updates while decrementing the packet count register.
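
The core of the drain logic, extracted from the diff below for
readability: after the count is written back to the hardware register,
the driver keeps re-requesting an ISM update until the host-memory
copy drops below the written value, so a stale in-flight ISM write
cannot resurrect the old count.

    rte_write32(val, droq->pkts_sent_reg);
    rte_mb();

    rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
    while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
            rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
            rte_mb();
    }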

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/otx_ep_rxtx.c | 162 ++++++++++++----------------
 1 file changed, 68 insertions(+), 94 deletions(-)

diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index b37fc8109f..4c509a419f 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -442,7 +442,14 @@ otx_vf_update_read_index(struct otx_ep_instr_queue *iq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, iq->inst_cnt_reg);
-		*iq->inst_cnt_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (__atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
 		iq->inst_cnt_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
@@ -567,9 +574,7 @@ prepare_xmit_gather_list(struct otx_ep_instr_queue *iq, struct rte_mbuf *m, uint
 
 	finfo = &iq->req_list[iq->host_write_index].finfo;
 	*dptr = rte_mem_virt2iova(finfo->g.sg);
-	ih->s.tlen = pkt_len + ih->s.fsz;
-	ih->s.gsz = frags;
-	ih->s.gather = 1;
+	ih->u64 |= ((1ULL << 62) | ((uint64_t)frags << 48) | (pkt_len + ih->s.fsz));
 
 	while (frags--) {
 		finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
@@ -752,36 +757,26 @@ otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
-	struct otx_ep_droq_desc *desc_ring;
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
 	struct otx_ep_droq_info *info;
 	struct rte_mbuf *buf = NULL;
 	uint32_t desc_refilled = 0;
 
-	desc_ring = droq->desc_ring;
-
 	while (droq->refill_count && (desc_refilled < droq->nb_desc)) {
-		/* If a valid buffer exists (happens if there is no dispatch),
-		 * reuse the buffer, else allocate.
-		 */
-		if (droq->recv_buf_list[droq->refill_idx] != NULL)
-			break;
-
 		buf = rte_pktmbuf_alloc(droq->mpool);
 		/* If a buffer could not be allocated, no point in
 		 * continuing
 		 */
-		if (buf == NULL) {
+		if (unlikely(!buf)) {
 			droq->stats.rx_alloc_failure++;
 			break;
 		}
 		info = rte_pktmbuf_mtod(buf, struct otx_ep_droq_info *);
-		memset(info, 0, sizeof(*info));
+		info->length = 0;
 
 		droq->recv_buf_list[droq->refill_idx] = buf;
 		desc_ring[droq->refill_idx].buffer_ptr =
 					rte_mbuf_data_iova_default(buf);
-
-
 		droq->refill_idx = otx_ep_incr_index(droq->refill_idx, 1,
 				droq->nb_desc);
 
@@ -793,21 +788,18 @@ otx_ep_droq_refill(struct otx_ep_droq *droq)
 }
 
 static struct rte_mbuf *
-otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
-			struct otx_ep_droq *droq, int next_fetch)
+otx_ep_droq_read_packet(struct otx_ep_device *otx_ep, struct otx_ep_droq *droq, int next_fetch)
 {
 	volatile struct otx_ep_droq_info *info;
-	struct rte_mbuf *droq_pkt2 = NULL;
-	struct rte_mbuf *droq_pkt = NULL;
-	struct rte_net_hdr_lens hdr_lens;
-	struct otx_ep_droq_info *info2;
+	struct rte_mbuf *mbuf_next = NULL;
+	struct rte_mbuf *mbuf = NULL;
 	uint64_t total_pkt_len;
 	uint32_t pkt_len = 0;
 	int next_idx;
 
-	droq_pkt  = droq->recv_buf_list[droq->read_idx];
-	droq_pkt2  = droq->recv_buf_list[droq->read_idx];
-	info = rte_pktmbuf_mtod(droq_pkt, struct otx_ep_droq_info *);
+	mbuf = droq->recv_buf_list[droq->read_idx];
+	info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
 	/* make sure info is available */
 	rte_rmb();
 	if (unlikely(!info->length)) {
@@ -828,32 +820,25 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 			assert(0);
 		}
 	}
+
 	if (next_fetch) {
 		next_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
-		droq_pkt2  = droq->recv_buf_list[next_idx];
-		info2 = rte_pktmbuf_mtod(droq_pkt2, struct otx_ep_droq_info *);
-		rte_prefetch_non_temporal((const void *)info2);
+		mbuf_next = droq->recv_buf_list[next_idx];
+		rte_prefetch0(rte_pktmbuf_mtod(mbuf_next, void *));
 	}
 
-	info->length = rte_bswap64(info->length);
+	info->length = rte_bswap16(info->length >> 48);
 	/* Deduce the actual data size */
 	total_pkt_len = info->length + OTX_EP_INFO_SIZE;
 	if (total_pkt_len <= droq->buffer_size) {
-		droq_pkt  = droq->recv_buf_list[droq->read_idx];
-		if (likely(droq_pkt != NULL)) {
-			droq_pkt->data_off += OTX_EP_INFO_SIZE;
-			/* otx_ep_dbg("OQ: pkt_len[%ld], buffer_size %d\n",
-			 * (long)info->length, droq->buffer_size);
-			 */
-			pkt_len = (uint32_t)info->length;
-			droq_pkt->pkt_len  = pkt_len;
-			droq_pkt->data_len  = pkt_len;
-			droq_pkt->port = otx_ep->port_id;
-			droq->recv_buf_list[droq->read_idx] = NULL;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
-			droq->refill_count++;
-		}
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		pkt_len = (uint32_t)info->length;
+		mbuf->pkt_len  = pkt_len;
+		mbuf->data_len  = pkt_len;
+		mbuf->port = otx_ep->port_id;
+		droq->recv_buf_list[droq->read_idx] = NULL;
+		droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
+		droq->refill_count++;
 	} else {
 		struct rte_mbuf *first_buf = NULL;
 		struct rte_mbuf *last_buf = NULL;
@@ -865,61 +850,50 @@ otx_ep_droq_read_packet(struct otx_ep_device *otx_ep,
 		while (pkt_len < total_pkt_len) {
 			int cpy_len = 0;
 
-			cpy_len = ((pkt_len + droq->buffer_size) >
-					total_pkt_len)
-					? ((uint32_t)total_pkt_len -
-						pkt_len)
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len)
 					: droq->buffer_size;
 
-			droq_pkt = droq->recv_buf_list[droq->read_idx];
+			mbuf = droq->recv_buf_list[droq->read_idx];
 			droq->recv_buf_list[droq->read_idx] = NULL;
 
-			if (likely(droq_pkt != NULL)) {
+			if (likely(mbuf)) {
 				/* Note the first seg */
 				if (!pkt_len)
-					first_buf = droq_pkt;
+					first_buf = mbuf;
 
-				droq_pkt->port = otx_ep->port_id;
+				mbuf->port = otx_ep->port_id;
 				if (!pkt_len) {
-					droq_pkt->data_off +=
-						OTX_EP_INFO_SIZE;
-					droq_pkt->pkt_len =
-						cpy_len - OTX_EP_INFO_SIZE;
-					droq_pkt->data_len =
-						cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_off += OTX_EP_INFO_SIZE;
+					mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+					mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
 				} else {
-					droq_pkt->pkt_len = cpy_len;
-					droq_pkt->data_len = cpy_len;
+					mbuf->pkt_len = cpy_len;
+					mbuf->data_len = cpy_len;
 				}
 
 				if (pkt_len) {
 					first_buf->nb_segs++;
-					first_buf->pkt_len += droq_pkt->pkt_len;
+					first_buf->pkt_len += mbuf->pkt_len;
 				}
 
 				if (last_buf)
-					last_buf->next = droq_pkt;
+					last_buf->next = mbuf;
 
-				last_buf = droq_pkt;
+				last_buf = mbuf;
 			} else {
 				otx_ep_err("no buf\n");
 				assert(0);
 			}
 
 			pkt_len += cpy_len;
-			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1,
-							   droq->nb_desc);
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, droq->nb_desc);
 			droq->refill_count++;
 		}
-		droq_pkt = first_buf;
+		mbuf = first_buf;
 	}
-	droq_pkt->packet_type = rte_net_get_ptype(droq_pkt, &hdr_lens,
-					RTE_PTYPE_ALL_MASK);
-	droq_pkt->l2_len = hdr_lens.l2_len;
-	droq_pkt->l3_len = hdr_lens.l3_len;
-	droq_pkt->l4_len = hdr_lens.l4_len;
 
-	return droq_pkt;
+	return mbuf;
 }
 
 static inline uint32_t
@@ -943,7 +917,14 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 		 * when count above halfway to saturation.
 		 */
 		rte_write32(val, droq->pkts_sent_reg);
-		*droq->pkts_sent_ism = 0;
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
 		droq->pkts_sent_ism_prev = 0;
 	}
 	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
@@ -952,36 +933,30 @@ otx_ep_check_droq_pkts(struct otx_ep_droq *droq)
 	return new_pkts;
 }
 
+static inline int32_t __rte_hot
+otx_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (unlikely(droq->pkts_pending < nb_pkts))
+		otx_ep_check_droq_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
 /* Check for response arrival from OCTEON 9
  * returns number of requests completed
  */
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget)
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	struct otx_ep_droq *droq = rx_queue;
 	struct otx_ep_device *otx_ep;
 	struct rte_mbuf *oq_pkt;
-
-	uint32_t pkts = 0;
+	uint16_t pkts, new_pkts;
 	uint32_t valid_pkts = 0;
-	uint32_t new_pkts = 0;
 	int next_fetch;
 
 	otx_ep = droq->otx_ep_dev;
-
-	if (droq->pkts_pending > budget) {
-		new_pkts = budget;
-	} else {
-		new_pkts = droq->pkts_pending;
-		new_pkts += otx_ep_check_droq_pkts(droq);
-		if (new_pkts > budget)
-			new_pkts = budget;
-	}
-
-	if (!new_pkts)
-		goto update_credit; /* No pkts at this moment */
+	new_pkts = otx_ep_rx_pkts_to_process(droq, nb_pkts);
 
 	for (pkts = 0; pkts < new_pkts; pkts++) {
 		/* Push the received pkt to application */
@@ -1006,7 +981,6 @@ otx_ep_recv_pkts(void *rx_queue,
 	droq->pkts_pending -= pkts;
 
 	/* Refill DROQ buffers */
-update_credit:
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		int desc_refilled = otx_ep_droq_refill(droq);
 
@@ -1014,7 +988,7 @@ otx_ep_recv_pkts(void *rx_queue,
 		 * that when we update the credits the data in memory is
 		 * accurate.
 		 */
-		rte_wmb();
+		rte_io_wmb();
 		rte_write32(desc_refilled, droq->pkts_credit_reg);
 	} else {
 		/*
-- 
2.25.1
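
The hunk above replaces the simple ISM counter clear with a write-back-and-re-arm sequence: the driver batches subtractions against the hardware packet counter that the SDP ISM block mirrors into host memory, and only writes the counter back once it passes the halfway point to saturation. The plain-C sketch below is an illustration only, not part of the patch; all names are made up. It models the same bookkeeping with a mock counter: the unsigned delta against the previous snapshot naturally handles wraparound, and the snapshot is reset to zero whenever the counter is cleared.

#include <stdint.h>
#include <stdio.h>

static uint32_t hw_counter;      /* stands in for the ISM-updated memory location */
static uint32_t prev_snapshot;   /* driver-local copy, like pkts_sent_ism_prev */

static uint32_t
poll_new_pkts(void)
{
	uint32_t val = hw_counter;               /* the driver uses a relaxed atomic load */
	uint32_t new_pkts = val - prev_snapshot; /* unsigned subtraction handles wrap */

	prev_snapshot = val;

	if (val > (uint32_t)(1u << 31)) {
		/* The driver writes 'val' back to the register and re-arms the
		 * ISM request; the mock counter is simply decremented here.
		 */
		hw_counter -= val;
		prev_snapshot = 0;
	}

	return new_pkts;
}

int
main(void)
{
	hw_counter = 10;
	printf("first poll:  %u new packets\n", poll_new_pkts());
	hw_counter += 5;
	printf("second poll: %u new packets\n", poll_new_pkts());
	return 0;
}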


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v6 3/3] net/octeon_ep: add new fastpath routines
  2023-10-18 11:14         ` [PATCH v6 0/3] rewrite " Vamsi Attunuru
  2023-10-18 11:14           ` [PATCH v6 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
  2023-10-18 11:14           ` [PATCH v6 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
@ 2023-10-18 11:14           ` Vamsi Attunuru
  2023-10-19  3:03             ` Jerin Jacob
  2 siblings, 1 reply; 26+ messages in thread
From: Vamsi Attunuru @ 2023-10-18 11:14 UTC (permalink / raw)
  To: dev, jerinj; +Cc: sthotton, Vamsi Attunuru

Adds new fastpath routines for cn10k & cn9k endpoint
devices and assigns them based on the offload flags.

The patch also adds miscellaneous changes to improve
performance and code readability.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/net/octeon_ep/cnxk_ep_rx.c    | 309 ++++++++++++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_tx.c    | 209 +++++++++++++++++
 drivers/net/octeon_ep/cnxk_ep_vf.c    |   2 +
 drivers/net/octeon_ep/cnxk_ep_vf.h    |  13 ++
 drivers/net/octeon_ep/meson.build     |   2 +
 drivers/net/octeon_ep/otx2_ep_vf.c    |   1 +
 drivers/net/octeon_ep/otx_ep_common.h | 125 ++++++-----
 drivers/net/octeon_ep/otx_ep_ethdev.c |  69 +++++-
 drivers/net/octeon_ep/otx_ep_rxtx.c   |  93 +-------
 drivers/net/octeon_ep/otx_ep_rxtx.h   |  38 +++-
 10 files changed, 704 insertions(+), 157 deletions(-)
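
The ethdev changes in this patch select burst routines per port: otx_ep_set_rx_func() and otx_ep_set_tx_func() pick the *_mseg variants only when the application enables the scatter/multi-segment offloads. As a rough usage sketch (not part of the patch; port_id and the queue counts are placeholders, and queue setup, start and error handling are elided), an application opting into those paths would configure the port along these lines:

#include <string.h>
#include <rte_ethdev.h>

static int
configure_ep_port_for_mseg(uint16_t port_id, uint16_t nb_rxq, uint16_t nb_txq)
{
	struct rte_eth_conf conf;

	memset(&conf, 0, sizeof(conf));
	/* RX scatter steers the RX path to the *_recv_pkts_mseg routines */
	conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_SCATTER;
	/* TX multi-seg steers the TX path to cnxk_ep_xmit_pkts_mseg */
	conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS;

	return rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &conf);
}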

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
new file mode 100644
index 0000000000..74f0011283
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "otx_ep_common.h"
+#include "otx2_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static inline int
+cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
+{
+	struct otx_ep_droq_desc *desc_ring = droq->desc_ring;
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t refill_idx = droq->refill_idx;
+	struct rte_mbuf *buf;
+	uint32_t i;
+	int rc;
+
+	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	if (unlikely(rc)) {
+		droq->stats.rx_alloc_failure++;
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		buf = recv_buf_list[refill_idx];
+		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
+		refill_idx++;
+	}
+
+	droq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc);
+	droq->refill_count -= count;
+
+	return 0;
+}
+
+static inline void
+cnxk_ep_rx_refill(struct otx_ep_droq *droq)
+{
+	uint32_t desc_refilled = 0, count;
+	uint32_t nb_desc = droq->nb_desc;
+	uint32_t refill_idx = droq->refill_idx;
+	int rc;
+
+	if (unlikely(droq->read_idx == refill_idx))
+		return;
+
+	if (refill_idx < droq->read_idx) {
+		count = droq->read_idx - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled = count;
+	} else {
+		count = nb_desc - refill_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+
+		desc_refilled = count;
+		count = droq->read_idx;
+		rc = cnxk_ep_rx_refill_mbuf(droq, count);
+		if (unlikely(rc)) {
+			droq->stats.rx_alloc_failure++;
+			return;
+		}
+		desc_refilled += count;
+	}
+
+	/* Flush the droq descriptor data to memory to be sure
+	 * that when we update the credits the data in memory is
+	 * accurate.
+	 */
+	rte_io_wmb();
+	rte_write32(desc_refilled, droq->pkts_credit_reg);
+}
+
+static inline uint32_t
+cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq)
+{
+	uint32_t new_pkts;
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = __atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED);
+	new_pkts = val - droq->pkts_sent_ism_prev;
+	droq->pkts_sent_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, droq->pkts_sent_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+		while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+			rte_mb();
+		}
+
+		droq->pkts_sent_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);
+	droq->pkts_pending += new_pkts;
+
+	return new_pkts;
+}
+
+static inline int16_t __rte_hot
+cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
+{
+	if (droq->pkts_pending < nb_pkts)
+		cnxk_ep_check_rx_pkts(droq);
+
+	return RTE_MIN(nb_pkts, droq->pkts_pending);
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *mbuf;
+		uint16_t pkt_len;
+
+		mbuf = recv_buf_list[read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
+		pkt_len = rte_bswap16(info->length >> 48);
+		mbuf->data_off += OTX_EP_INFO_SIZE;
+		mbuf->pkt_len = pkt_len;
+		mbuf->data_len = pkt_len;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+	droq->read_idx = read_idx;
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_scalar_mseg(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
+				 uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t total_pkt_len, bytes_rsvd = 0;
+	uint16_t port_id = droq->otx_ep_dev->port_id;
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts;
+
+	for (pkts = 0; pkts < new_pkts; pkts++) {
+		struct otx_ep_droq_info *info;
+		struct rte_mbuf *first_buf = NULL;
+		struct rte_mbuf *last_buf = NULL;
+		struct rte_mbuf *mbuf;
+		uint32_t pkt_len = 0;
+
+		mbuf = recv_buf_list[droq->read_idx];
+		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+
+		total_pkt_len = rte_bswap16(info->length >> 48) + OTX_EP_INFO_SIZE;
+
+		while (pkt_len < total_pkt_len) {
+			int cpy_len;
+
+			cpy_len = ((pkt_len + droq->buffer_size) > total_pkt_len)
+					? ((uint32_t)total_pkt_len - pkt_len) : droq->buffer_size;
+
+			mbuf = droq->recv_buf_list[droq->read_idx];
+
+			if (!pkt_len) {
+				/* Note the first seg */
+				first_buf = mbuf;
+				mbuf->data_off += OTX_EP_INFO_SIZE;
+				mbuf->pkt_len = cpy_len - OTX_EP_INFO_SIZE;
+				mbuf->data_len = cpy_len - OTX_EP_INFO_SIZE;
+			} else {
+				mbuf->pkt_len = cpy_len;
+				mbuf->data_len = cpy_len;
+				first_buf->nb_segs++;
+				first_buf->pkt_len += mbuf->pkt_len;
+			}
+
+			if (last_buf)
+				last_buf->next = mbuf;
+
+			last_buf = mbuf;
+
+			pkt_len += cpy_len;
+			droq->read_idx = otx_ep_incr_index(droq->read_idx, 1, nb_desc);
+			droq->refill_count++;
+		}
+		mbuf = first_buf;
+		mbuf->port = port_id;
+		rx_pkts[pkts] = mbuf;
+		bytes_rsvd += pkt_len;
+	}
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= pkts;
+	/* Stats */
+	droq->stats.pkts_received += pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	cnxk_ep_process_pkts_scalar_mseg(rx_pkts, droq, new_pkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_tx.c b/drivers/net/octeon_ep/cnxk_ep_tx.c
new file mode 100644
index 0000000000..9f11a2f317
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_tx.c
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_vf.h"
+#include "otx_ep_rxtx.h"
+
+static uint32_t
+cnxk_vf_update_read_index(struct otx_ep_instr_queue *iq)
+{
+	uint32_t val;
+
+	/* Batch subtractions from the HW counter to reduce PCIe traffic
+	 * This adds an extra local variable, but almost halves the
+	 * number of PCIe writes.
+	 */
+	val = __atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED);
+	iq->inst_cnt += val - iq->inst_cnt_ism_prev;
+	iq->inst_cnt_ism_prev = val;
+
+	if (val > (uint32_t)(1 << 31)) {
+		/* Only subtract the packet count in the HW counter
+		 * when count above halfway to saturation.
+		 */
+		rte_write64((uint64_t)val, iq->inst_cnt_reg);
+		rte_mb();
+
+		rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+		while (__atomic_load_n(iq->inst_cnt_ism, __ATOMIC_RELAXED) >= val) {
+			rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+			rte_mb();
+		}
+
+		iq->inst_cnt_ism_prev = 0;
+	}
+	rte_write64(OTX2_SDP_REQUEST_ISM, iq->inst_cnt_reg);
+
+	/* Modulo of the new index with the IQ size will give us
+	 * the new index.
+	 */
+	return iq->inst_cnt & (iq->nb_desc - 1);
+}
+
+static inline void
+cnxk_ep_flush_iq(struct otx_ep_instr_queue *iq)
+{
+	uint32_t instr_processed = 0;
+	uint32_t cnt = 0;
+
+	iq->otx_read_index = cnxk_vf_update_read_index(iq);
+
+	if (unlikely(iq->flush_index == iq->otx_read_index))
+		return;
+
+	if (iq->flush_index < iq->otx_read_index) {
+		instr_processed = iq->otx_read_index - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+	} else {
+		cnt = iq->nb_desc - iq->flush_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], cnt);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, cnt, iq->nb_desc);
+
+		instr_processed = iq->otx_read_index;
+		rte_pktmbuf_free_bulk(&iq->mbuf_list[iq->flush_index], instr_processed);
+		iq->flush_index = otx_ep_incr_index(iq->flush_index, instr_processed, iq->nb_desc);
+
+		instr_processed += cnt;
+	}
+
+	iq->stats.instr_processed = instr_processed;
+	iq->instr_pending -= instr_processed;
+}
+
+static inline void
+set_sg_size(struct otx_ep_sg_entry *sg_entry, uint16_t size, uint32_t pos)
+{
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+	sg_entry->u.size[pos] = size;
+#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	sg_entry->u.size[(OTX_EP_NUM_SG_PTRS - 1) - pos] = size;
+#endif
+}
+
+static __rte_always_inline void
+cnxk_ep_xmit_pkts_scalar(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq, uint16_t nb_pkts)
+{
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len;
+	uint32_t tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		m = tx_pkts[pkts];
+		iq->mbuf_list[write_idx] = m;
+		pkt_len = rte_pktmbuf_data_len(m);
+
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->ih.u64 = iq->partial_ih | pkt_len;
+		iqcmd->dptr = rte_mbuf_data_iova(m); /*dptr*/
+		tx_bytes += pkt_len;
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+	}
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+}
+
+static __rte_always_inline uint16_t
+cnxk_ep_xmit_pkts_scalar_mseg(struct rte_mbuf **tx_pkts, struct otx_ep_instr_queue *iq,
+			      uint16_t nb_pkts)
+{
+	uint16_t frags, num_sg, mask = OTX_EP_NUM_SG_PTRS - 1;
+	struct otx_ep_buf_free_info *finfo;
+	struct cnxk_ep_instr_32B *iqcmd;
+	struct rte_mbuf *m;
+	uint32_t pkt_len, tx_bytes = 0;
+	uint32_t write_idx = iq->host_write_index;
+	uint16_t pkts, nb_desc = iq->nb_desc;
+	uint8_t desc_size = iq->desc_size;
+
+	for (pkts = 0; pkts < nb_pkts; pkts++) {
+		uint16_t j = 0;
+
+		m = tx_pkts[pkts];
+		frags = m->nb_segs;
+
+		pkt_len = rte_pktmbuf_pkt_len(m);
+		num_sg = (frags + mask) / OTX_EP_NUM_SG_PTRS;
+
+		if (unlikely(pkt_len > OTX_EP_MAX_PKT_SZ && num_sg > OTX_EP_MAX_SG_LISTS)) {
+			otx_ep_err("Failed to xmit the pkt, pkt_len is higher or pkt has more segments\n");
+			goto exit;
+		}
+
+		finfo = &iq->req_list[write_idx].finfo;
+
+		iq->mbuf_list[write_idx] = m;
+		iqcmd = (struct cnxk_ep_instr_32B *)(iq->base_addr + (write_idx * desc_size));
+		iqcmd->dptr = rte_mem_virt2iova(finfo->g.sg);
+		iqcmd->ih.u64 = iq->partial_ih | (1ULL << 62) | ((uint64_t)frags << 48) | pkt_len;
+
+		while (frags--) {
+			finfo->g.sg[(j >> 2)].ptr[(j & mask)] = rte_mbuf_data_iova(m);
+			set_sg_size(&finfo->g.sg[(j >> 2)], m->data_len, (j & mask));
+			j++;
+			m = m->next;
+		}
+
+		/* Increment the host write index */
+		write_idx = otx_ep_incr_index(write_idx, 1, nb_desc);
+		tx_bytes += pkt_len;
+	}
+exit:
+	iq->host_write_index = write_idx;
+
+	/* ring dbell */
+	rte_io_wmb();
+	rte_write64(pkts, iq->doorbell_reg);
+	iq->instr_pending += pkts;
+	iq->stats.tx_pkts += pkts;
+	iq->stats.tx_bytes += tx_bytes;
+
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	cnxk_ep_xmit_pkts_scalar(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
+	uint16_t pkts;
+
+	pkts = RTE_MIN(nb_pkts, iq->nb_desc - iq->instr_pending);
+
+	pkts = cnxk_ep_xmit_pkts_scalar_mseg(tx_pkts, iq, pkts);
+
+	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
+		cnxk_ep_flush_iq(iq);
+
+	/* Return no# of instructions posted successfully. */
+	return pkts;
+}
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c b/drivers/net/octeon_ep/cnxk_ep_vf.c
index 7b3669fe0c..ef275703c3 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.c
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
@@ -156,6 +156,8 @@ cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (void *)iq->inst_cnt_ism, ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
+
 	return 0;
 }
 
diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.h b/drivers/net/octeon_ep/cnxk_ep_vf.h
index 86277449ea..41d8fbbb3a 100644
--- a/drivers/net/octeon_ep/cnxk_ep_vf.h
+++ b/drivers/net/octeon_ep/cnxk_ep_vf.h
@@ -6,6 +6,8 @@
 
 #include <rte_io.h>
 
+#include "otx_ep_common.h"
+
 #define CNXK_CONFIG_XPANSION_BAR             0x38
 #define CNXK_CONFIG_PCIE_CAP                 0x70
 #define CNXK_CONFIG_PCIE_DEVCAP              0x74
@@ -178,6 +180,17 @@ struct cnxk_ep_instr_64B {
 	uint64_t exhdr[4];
 };
 
+struct cnxk_ep_instr_32B {
+	/* Pointer where the input data is available. */
+	uint64_t dptr;
+
+	/* OTX_EP Instruction Header. */
+	union otx_ep_instr_ih ih;
+
+	/* Misc data bytes that can be passed as front data */
+	uint64_t rsvd[2];
+};
+
 #define CNXK_EP_IQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue) + 4)
 #define CNXK_EP_OQ_ISM_OFFSET(queue)    (RTE_CACHE_LINE_SIZE * (queue))
 #define CNXK_EP_ISM_EN                  (0x1)
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index e698bf9792..749776d70c 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -9,4 +9,6 @@ sources = files(
         'otx2_ep_vf.c',
         'cnxk_ep_vf.c',
         'otx_ep_mbox.c',
+        'cnxk_ep_rx.c',
+        'cnxk_ep_tx.c',
 )
diff --git a/drivers/net/octeon_ep/otx2_ep_vf.c b/drivers/net/octeon_ep/otx2_ep_vf.c
index f72b8d25d7..7f4edf8dcf 100644
--- a/drivers/net/octeon_ep/otx2_ep_vf.c
+++ b/drivers/net/octeon_ep/otx2_ep_vf.c
@@ -307,6 +307,7 @@ otx2_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
 		   (unsigned int)ism_addr);
 	*iq->inst_cnt_ism = 0;
 	iq->inst_cnt_ism_prev = 0;
+	iq->partial_ih = ((uint64_t)otx_ep->pkind) << 36;
 
 	return 0;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h
index 90e059cad0..82e57520d3 100644
--- a/drivers/net/octeon_ep/otx_ep_common.h
+++ b/drivers/net/octeon_ep/otx_ep_common.h
@@ -4,7 +4,20 @@
 #ifndef _OTX_EP_COMMON_H_
 #define _OTX_EP_COMMON_H_
 
+#include <rte_bitops.h>
 #include <rte_spinlock.h>
+#include <unistd.h>
+#include <assert.h>
+#include <rte_eal.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_io.h>
+#include <rte_net.h>
+#include <ethdev_pci.h>
+
+#define OTX_EP_CN8XX  RTE_BIT32(0)
+#define OTX_EP_CN9XX  RTE_BIT32(1)
+#define OTX_EP_CN10XX RTE_BIT32(2)
 
 #define OTX_EP_NW_PKT_OP               0x1220
 #define OTX_EP_NW_CMD_OP               0x1221
@@ -38,7 +51,7 @@
 #define OTX_EP_NORESP_OHSM_SEND     (4)
 #define OTX_EP_NORESP_LAST          (4)
 #define OTX_EP_PCI_RING_ALIGN   65536
-#define OTX_EP_MAX_SG_LISTS 4
+#define OTX_EP_MAX_SG_LISTS 6
 #define OTX_EP_NUM_SG_PTRS 4
 #define SDP_PKIND 40
 #define SDP_OTX2_PKIND 57
@@ -203,6 +216,38 @@ struct otx_ep_iq_config {
  *  such structure to represent it.
  */
 struct otx_ep_instr_queue {
+	/* Location in memory updated by SDP ISM */
+	uint32_t *inst_cnt_ism;
+	struct rte_mbuf **mbuf_list;
+	/* Pointer to the Virtual Base addr of the input ring. */
+	uint8_t *base_addr;
+
+	/* track inst count locally to consolidate HW counter updates */
+	uint32_t inst_cnt_ism_prev;
+
+	/* Input ring index, where the driver should write the next packet */
+	uint32_t host_write_index;
+
+	/* Input ring index, where the OCTEON 9 should read the next packet */
+	uint32_t otx_read_index;
+	/** This index aids in finding the window in the queue where OCTEON 9
+	 *  has read the commands.
+	 */
+	uint32_t flush_index;
+	/* This keeps track of the instructions pending in this queue. */
+	uint64_t instr_pending;
+
+	/* Memory zone */
+	const struct rte_memzone *iq_mz;
+	/* OTX_EP doorbell register for the ring. */
+	void *doorbell_reg;
+
+	/* OTX_EP instruction count register for this ring. */
+	void *inst_cnt_reg;
+
+	/* Number of instructions pending to be posted to OCTEON 9. */
+	uint32_t fill_cnt;
+
 	struct otx_ep_device *otx_ep_dev;
 
 	uint32_t q_no;
@@ -219,54 +264,21 @@ struct otx_ep_instr_queue {
 	/* Size of the descriptor. */
 	uint8_t desc_size;
 
-	/* Input ring index, where the driver should write the next packet */
-	uint32_t host_write_index;
-
-	/* Input ring index, where the OCTEON 9 should read the next packet */
-	uint32_t otx_read_index;
-
 	uint32_t reset_instr_cnt;
 
-	/** This index aids in finding the window in the queue where OCTEON 9
-	 *  has read the commands.
-	 */
-	uint32_t flush_index;
-
 	/* Free-running/wrapping instruction counter for IQ. */
 	uint32_t inst_cnt;
 
-	/* This keeps track of the instructions pending in this queue. */
-	uint64_t instr_pending;
-
-	/* Pointer to the Virtual Base addr of the input ring. */
-	uint8_t *base_addr;
+	uint64_t partial_ih;
 
 	/* This IQ request list */
 	struct otx_ep_instr_list *req_list;
 
-	/* OTX_EP doorbell register for the ring. */
-	void *doorbell_reg;
-
-	/* OTX_EP instruction count register for this ring. */
-	void *inst_cnt_reg;
-
-	/* Number of instructions pending to be posted to OCTEON 9. */
-	uint32_t fill_cnt;
-
 	/* Statistics for this input queue. */
 	struct otx_ep_iq_stats stats;
 
 	/* DMA mapped base address of the input descriptor ring. */
 	uint64_t base_addr_dma;
-
-	/* Memory zone */
-	const struct rte_memzone *iq_mz;
-
-	/* Location in memory updated by SDP ISM */
-	uint32_t *inst_cnt_ism;
-
-	/* track inst count locally to consolidate HW counter updates */
-	uint32_t inst_cnt_ism_prev;
 };
 
 /** Descriptor format.
@@ -344,14 +356,17 @@ struct otx_ep_oq_config {
 
 /* The Descriptor Ring Output Queue(DROQ) structure. */
 struct otx_ep_droq {
-	struct otx_ep_device *otx_ep_dev;
 	/* The 8B aligned descriptor ring starts at this address. */
 	struct otx_ep_droq_desc *desc_ring;
 
-	uint32_t q_no;
-	uint64_t last_pkt_count;
+	/* The 8B aligned info ptrs begin from this address. */
+	struct otx_ep_droq_info *info_list;
 
-	struct rte_mempool *mpool;
+	/* receive buffer list contains mbuf ptr list */
+	struct rte_mbuf **recv_buf_list;
+
+	/* Packets pending to be processed */
+	uint64_t pkts_pending;
 
 	/* Driver should read the next packet at this index */
 	uint32_t read_idx;
@@ -362,22 +377,17 @@ struct otx_ep_droq {
 	/* At this index, the driver will refill the descriptor's buffer */
 	uint32_t refill_idx;
 
-	/* Packets pending to be processed */
-	uint64_t pkts_pending;
+	/* The number of descriptors pending to refill. */
+	uint32_t refill_count;
 
 	/* Number of descriptors in this ring. */
 	uint32_t nb_desc;
 
-	/* The number of descriptors pending to refill. */
-	uint32_t refill_count;
-
 	uint32_t refill_threshold;
 
-	/* The 8B aligned info ptrs begin from this address. */
-	struct otx_ep_droq_info *info_list;
+	uint64_t last_pkt_count;
 
-	/* receive buffer list contains mbuf ptr list */
-	struct rte_mbuf **recv_buf_list;
+	struct rte_mempool *mpool;
 
 	/* The size of each buffer pointed by the buffer pointer. */
 	uint32_t buffer_size;
@@ -392,6 +402,13 @@ struct otx_ep_droq {
 	 */
 	void *pkts_sent_reg;
 
+	/* Pointer to host memory copy of output packet count, set by ISM */
+	uint32_t *pkts_sent_ism;
+	uint32_t pkts_sent_ism_prev;
+
+	/* Statistics for this DROQ. */
+	struct otx_ep_droq_stats stats;
+
 	/** Handle DMA incompletion during pkt reads.
 	 * This variable is used to initiate a sent_reg_read
 	 * that completes pending dma
@@ -400,8 +417,9 @@ struct otx_ep_droq {
 	 */
 	uint32_t sent_reg_val;
 
-	/* Statistics for this DROQ. */
-	struct otx_ep_droq_stats stats;
+	uint32_t q_no;
+
+	struct otx_ep_device *otx_ep_dev;
 
 	/* DMA mapped address of the DROQ descriptor ring. */
 	size_t desc_ring_dma;
@@ -419,10 +437,6 @@ struct otx_ep_droq {
 	const struct rte_memzone *desc_ring_mz;
 
 	const struct rte_memzone *info_mz;
-
-	/* Pointer to host memory copy of output packet count, set by ISM */
-	uint32_t *pkts_sent_ism;
-	uint32_t pkts_sent_ism_prev;
 };
 #define OTX_EP_DROQ_SIZE		(sizeof(struct otx_ep_droq))
 
@@ -545,6 +559,9 @@ struct otx_ep_device {
 
 	/* Negotiated Mbox version */
 	uint32_t mbox_neg_ver;
+
+	/* Generation */
+	uint32_t chip_gen;
 };
 
 int otx_ep_setup_iqs(struct otx_ep_device *otx_ep, uint32_t iq_no,
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 970372bbd7..615cbbb648 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -27,6 +27,46 @@ static const struct rte_eth_desc_lim otx_ep_tx_desc_lim = {
 	.nb_align	= OTX_EP_TXD_ALIGN,
 };
 
+static void
+otx_ep_set_tx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX || otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
+		if (otx_epvf->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS)
+			eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts_mseg;
+	} else {
+		eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].tx_pkt_burst =
+			eth_dev->tx_pkt_burst;
+}
+
+static void
+otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
+{
+	struct otx_ep_device *otx_epvf = OTX_EP_DEV(eth_dev);
+
+	if (otx_epvf->chip_gen == OTX_EP_CN10XX) {
+		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
+	} else if (otx_epvf->chip_gen == OTX_EP_CN9XX) {
+		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
+			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;
+	} else {
+		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
+	}
+
+	if (eth_dev->data->dev_started)
+		rte_eth_fp_ops[eth_dev->data->port_id].rx_pkt_burst =
+			eth_dev->rx_pkt_burst;
+}
+
 static int
 otx_ep_dev_info_get(struct rte_eth_dev *eth_dev,
 		    struct rte_eth_dev_info *devinfo)
@@ -154,6 +194,10 @@ otx_ep_dev_start(struct rte_eth_dev *eth_dev)
 	}
 
 	otx_ep_dev_link_update(eth_dev, 0);
+
+	otx_ep_set_tx_func(eth_dev);
+	otx_ep_set_rx_func(eth_dev);
+
 	otx_ep_info("dev started\n");
 
 	for (q = 0; q < eth_dev->data->nb_rx_queues; q++)
@@ -266,18 +310,23 @@ otx_epdev_init(struct otx_ep_device *otx_epvf)
 
 	otx_epvf->fn_list.setup_device_regs(otx_epvf);
 
+	otx_epvf->eth_dev->tx_pkt_burst = &cnxk_ep_xmit_pkts;
 	otx_epvf->eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF)
+	if (otx_epvf->chip_id == PCI_DEVID_OCTEONTX_EP_VF) {
 		otx_epvf->eth_dev->tx_pkt_burst = &otx_ep_xmit_pkts;
-	else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
+		otx_epvf->chip_gen = OTX_EP_CN8XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN9K_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CN98XX_EP_NET_VF ||
 		 otx_epvf->chip_id == PCI_DEVID_CNF95N_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
-		 otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
-		otx_epvf->eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		 otx_epvf->chip_id == PCI_DEVID_CNF95O_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN9XX;
+	} else if (otx_epvf->chip_id == PCI_DEVID_CN10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CN10KB_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KA_EP_NET_VF ||
+		   otx_epvf->chip_id == PCI_DEVID_CNF10KB_EP_NET_VF) {
+		otx_epvf->eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
+		otx_epvf->chip_gen = OTX_EP_CN10XX;
 	} else {
 		otx_ep_err("Invalid chip_id\n");
 		ret = -EINVAL;
@@ -667,8 +716,8 @@ otx_ep_eth_dev_init(struct rte_eth_dev *eth_dev)
 	/* Single process support */
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
 		eth_dev->dev_ops = &otx_ep_eth_dev_ops;
-		eth_dev->rx_pkt_burst = &otx_ep_recv_pkts;
-		eth_dev->tx_pkt_burst = &otx2_ep_xmit_pkts;
+		otx_ep_set_tx_func(eth_dev);
+		otx_ep_set_rx_func(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c
index 4c509a419f..c421ef0a1c 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.c
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.c
@@ -13,15 +13,8 @@
 
 #include "otx_ep_common.h"
 #include "otx_ep_vf.h"
-#include "otx2_ep_vf.h"
 #include "otx_ep_rxtx.h"
 
-/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
-#define OTX_EP_INFO_SIZE 8
-#define OTX_EP_FSZ_FS0 0
-#define DROQ_REFILL_THRESHOLD 16
-#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
-
 static void
 otx_ep_dmazone_free(const struct rte_memzone *mz)
 {
@@ -144,6 +137,13 @@ otx_ep_init_instr_queue(struct otx_ep_device *otx_ep, int iq_no, int num_descs,
 		     iq_no, iq->base_addr, (unsigned long)iq->base_addr_dma,
 		     iq->nb_desc);
 
+	iq->mbuf_list = rte_zmalloc_socket("mbuf_list",	(iq->nb_desc * sizeof(struct rte_mbuf *)),
+					   RTE_CACHE_LINE_SIZE, rte_socket_id());
+	if (!iq->mbuf_list) {
+		otx_ep_err("IQ[%d] mbuf_list alloc failed\n", iq_no);
+		goto iq_init_fail;
+	}
+
 	iq->otx_ep_dev = otx_ep;
 	iq->q_no = iq_no;
 	iq->fill_cnt = 0;
@@ -675,85 +675,6 @@ otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
 	return count;
 }
 
-/* Enqueue requests/packets to OTX_EP IQ queue.
- * returns number of requests enqueued successfully
- */
-uint16_t
-otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
-{
-	struct otx_ep_instr_queue *iq = (struct otx_ep_instr_queue *)tx_queue;
-	struct otx_ep_device *otx_ep = iq->otx_ep_dev;
-	struct otx2_ep_instr_64B iqcmd2;
-	uint32_t iqreq_type;
-	struct rte_mbuf *m;
-	uint32_t pkt_len;
-	int count = 0;
-	uint16_t i;
-	int dbell;
-	int index;
-
-	iqcmd2.ih.u64 = 0;
-	iqcmd2.irh.u64 = 0;
-
-	/* ih invars */
-	iqcmd2.ih.s.fsz = OTX_EP_FSZ_FS0;
-	iqcmd2.ih.s.pkind = otx_ep->pkind; /* The SDK decided PKIND value */
-	/* irh invars */
-	iqcmd2.irh.s.opcode = OTX_EP_NW_PKT_OP;
-
-	for (i = 0; i < nb_pkts; i++) {
-		m = pkts[i];
-		if (m->nb_segs == 1) {
-			pkt_len = rte_pktmbuf_data_len(m);
-			iqcmd2.ih.s.tlen = pkt_len + iqcmd2.ih.s.fsz;
-			iqcmd2.dptr = rte_mbuf_data_iova(m); /*dptr*/
-			iqcmd2.ih.s.gather = 0;
-			iqcmd2.ih.s.gsz = 0;
-			iqreq_type = OTX_EP_REQTYPE_NORESP_NET;
-		} else {
-			if (!(otx_ep->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS))
-				goto xmit_fail;
-
-			if (unlikely(prepare_xmit_gather_list(iq, m, &iqcmd2.dptr, &iqcmd2.ih) < 0))
-				goto xmit_fail;
-
-			pkt_len = rte_pktmbuf_pkt_len(m);
-			iqreq_type = OTX_EP_REQTYPE_NORESP_GATHER;
-		}
-
-		iqcmd2.irh.u64 = rte_bswap64(iqcmd2.irh.u64);
-
-#ifdef OTX_EP_IO_DEBUG
-		otx_ep_dbg("After swapping\n");
-		otx_ep_dbg("Word0 [dptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.dptr);
-		otx_ep_dbg("Word1 [ihtx]: 0x%016lx\n", (unsigned long)iqcmd.ih);
-		otx_ep_dbg("Word2 [pki_ih3]: 0x%016lx\n",
-			   (unsigned long)iqcmd.pki_ih3);
-		otx_ep_dbg("Word3 [rptr]: 0x%016lx\n",
-			   (unsigned long)iqcmd.rptr);
-		otx_ep_dbg("Word4 [irh]: 0x%016lx\n", (unsigned long)iqcmd.irh);
-		otx_ep_dbg("Word5 [exhdr[0]]: 0x%016lx\n",
-			   (unsigned long)iqcmd.exhdr[0]);
-#endif
-		index = iq->host_write_index;
-		dbell = (i == (unsigned int)(nb_pkts - 1)) ? 1 : 0;
-		if (otx_ep_send_data(otx_ep, iq, &iqcmd2, dbell))
-			goto xmit_fail;
-		otx_ep_iqreq_add(iq, m, iqreq_type, index);
-		iq->stats.tx_pkts++;
-		iq->stats.tx_bytes += pkt_len;
-		count++;
-	}
-
-xmit_fail:
-	if (iq->instr_pending >= OTX_EP_MAX_INSTR)
-		otx_ep_flush_iq(iq);
-
-	/* Return no# of instructions posted successfully. */
-	return count;
-}
-
 static uint32_t
 otx_ep_droq_refill(struct otx_ep_droq *droq)
 {
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 3f12527004..cb68ef3b41 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -7,29 +7,53 @@
 
 #include <rte_byteorder.h>
 
-#define OTX_EP_RXD_ALIGN 2
-#define OTX_EP_TXD_ALIGN 2
+#define OTX_EP_RXD_ALIGN 8
+#define OTX_EP_TXD_ALIGN 8
 
 #define OTX_EP_IQ_SEND_FAILED      (-1)
 #define OTX_EP_IQ_SEND_SUCCESS     (0)
 
-#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10000
+#define OTX_EP_MAX_DELAYED_PKT_RETRIES 10
 
 #define OTX_EP_FSZ 28
 #define OTX2_EP_FSZ 24
-#define OTX_EP_MAX_INSTR 16
+#define OTX_EP_MAX_INSTR 128
+
+/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
+#define OTX_EP_INFO_SIZE 8
+#define DROQ_REFILL_THRESHOLD 16
+#define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)
 
 static inline uint32_t
 otx_ep_incr_index(uint32_t index, uint32_t count, uint32_t max)
 {
 	return ((index + count) & (max - 1));
 }
+
 uint16_t
 otx_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
 uint16_t
 otx2_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+otx_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_xmit_pkts(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts);
+
+uint16_t
+cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
+uint16_t
+cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
-otx_ep_recv_pkts(void *rx_queue,
-		  struct rte_mbuf **rx_pkts,
-		  uint16_t budget);
+cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1
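
The series keeps using otx_ep_incr_index() for all ring-index arithmetic; it wraps by masking with (ring size - 1), which is only correct when the ring size is a power of two. A tiny standalone example (illustration only, not driver code) of that wrap behavior:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
incr_index(uint32_t index, uint32_t count, uint32_t ring_size)
{
	/* The mask only wraps correctly for power-of-two ring sizes */
	assert((ring_size & (ring_size - 1)) == 0);
	return (index + count) & (ring_size - 1);
}

int
main(void)
{
	/* A 1024-entry ring: index 1020 advanced by 8 wraps around to 4 */
	printf("%u\n", incr_index(1020, 8, 1024));
	return 0;
}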


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v6 3/3] net/octeon_ep: add new fastpath routines
  2023-10-18 11:14           ` [PATCH v6 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
@ 2023-10-19  3:03             ` Jerin Jacob
  0 siblings, 0 replies; 26+ messages in thread
From: Jerin Jacob @ 2023-10-19  3:03 UTC (permalink / raw)
  To: Vamsi Attunuru; +Cc: dev, jerinj, sthotton

On Thu, Oct 19, 2023 at 6:52 AM Vamsi Attunuru <vattunuru@marvell.com> wrote:
>
> Adds new fastpath routines for cn10k & cn9k endpoint
> devices and assigns the fastpath routines based on
> the offload flags.
>
> Patch also adds misc changes to improve performance
> and code-readability.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>

Series applied to dpdk-next-net-mrvl/for-next-net. Thanks

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2023-10-19  3:04 UTC | newest]

Thread overview: 26+ messages
2023-10-11  1:50 [PATCH 0/3] rewrite fastpath routines Vamsi Attunuru
2023-10-11  1:50 ` [PATCH 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
2023-10-11  1:50 ` [PATCH 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
2023-10-11  1:50 ` [PATCH 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
2023-10-11  8:36 ` [PATCH v2 0/3] rewrite " Vamsi Attunuru
2023-10-11  8:36   ` [PATCH v2 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
2023-10-11  8:36   ` [PATCH v2 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
2023-10-11  8:36   ` [PATCH v2 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
2023-10-11 12:53   ` [PATCH v3 0/3] rewrite " Vamsi Attunuru
2023-10-11 12:53     ` [PATCH v3 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
2023-10-11 12:53     ` [PATCH v3 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
2023-10-11 12:53     ` [PATCH v3 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
2023-10-12  6:23     ` [PATCH v4 0/3] rewrite " Vamsi Attunuru
2023-10-12  6:23       ` [PATCH v4 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
2023-10-12  6:23       ` [PATCH v4 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
2023-10-12  6:23       ` [PATCH v4 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
2023-10-18  3:48         ` Jerin Jacob
2023-10-18  8:07       ` [PATCH v5 0/3] rewrite " Vamsi Attunuru
2023-10-18  8:07         ` [PATCH v5 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
2023-10-18  8:07         ` [PATCH v5 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
2023-10-18  8:07         ` [PATCH v5 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
2023-10-18 11:14         ` [PATCH v6 0/3] rewrite " Vamsi Attunuru
2023-10-18 11:14           ` [PATCH v6 1/3] net/octeon_ep: support 32B IQ descriptor size Vamsi Attunuru
2023-10-18 11:14           ` [PATCH v6 2/3] net/octeon_ep: clean up receive routine Vamsi Attunuru
2023-10-18 11:14           ` [PATCH v6 3/3] net/octeon_ep: add new fastpath routines Vamsi Attunuru
2023-10-19  3:03             ` Jerin Jacob
