DPDK patches and discussions
* [PATCH 1/2] dma/cnxk: use mempool for DMA chunk pool
@ 2023-08-30  7:56 pbhagavatula
  2023-08-30  7:56 ` [PATCH 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
  2023-08-30 14:30 ` [PATCH v2 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  0 siblings, 2 replies; 18+ messages in thread
From: pbhagavatula @ 2023-08-30  7:56 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Vamsi Attunuru
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use rte_mempool for the DMA chunk pool to allow using the mempool cache.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
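For context, the chunk-pool creation added below follows the standard
rte_mempool pattern. A minimal standalone sketch of that pattern (the
names and sizes here are illustrative, not the driver's actual ones):

#include <stdio.h>

#include <rte_lcore.h>
#include <rte_mbuf_pool_ops.h>
#include <rte_mempool.h>

static struct rte_mempool *
chunk_pool_create_sketch(int dev_id)
{
	char name[RTE_MEMPOOL_NAMESIZE];
	struct rte_mempool *mp;
	unsigned int nb_chunks;

	/* Oversize the pool so it cannot starve when every lcore holds
	 * a full cache of chunks.
	 */
	nb_chunks = 1024 + 16 * rte_lcore_count();

	snprintf(name, sizeof(name), "sketch_chunk_pool%d", dev_id);
	mp = rte_mempool_create_empty(name, nb_chunks, 4096 /* chunk size */,
				      16 /* cache size */, 0, rte_socket_id(), 0);
	if (mp == NULL)
		return NULL;

	/* Bind the platform mempool ops (NPA on cnxk) before populating,
	 * so objects are backed by a hardware-managed aura.
	 */
	if (rte_mempool_set_ops_byname(mp, rte_mbuf_platform_mempool_ops(), NULL) < 0 ||
	    rte_mempool_populate_default(mp) < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	return mp;
}
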
 drivers/common/cnxk/roc_dpi.c      |  95 +++++--------------------
 drivers/common/cnxk/roc_dpi.h      |  28 +-------
 drivers/common/cnxk/roc_dpi_priv.h |   3 -
 drivers/common/cnxk/roc_platform.c |   1 +
 drivers/common/cnxk/roc_platform.h |   2 +
 drivers/common/cnxk/version.map    |   1 +
 drivers/dma/cnxk/cnxk_dmadev.c     | 108 +++++++++++++++++++++--------
 drivers/dma/cnxk/cnxk_dmadev.h     |  10 ++-
 8 files changed, 110 insertions(+), 138 deletions(-)

diff --git a/drivers/common/cnxk/roc_dpi.c b/drivers/common/cnxk/roc_dpi.c
index 0e2f803077..9cb479371a 100644
--- a/drivers/common/cnxk/roc_dpi.c
+++ b/drivers/common/cnxk/roc_dpi.c
@@ -1,14 +1,14 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(C) 2021 Marvell.
  */
+
+#include "roc_api.h"
+#include "roc_priv.h"
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "roc_api.h"
-#include "roc_priv.h"
-
 #define DPI_PF_MBOX_SYSFS_ENTRY "dpi_device_config"
 
 static inline int
@@ -52,17 +52,12 @@ roc_dpi_disable(struct roc_dpi *dpi)
 }
 
 int
-roc_dpi_configure(struct roc_dpi *roc_dpi)
+roc_dpi_configure(struct roc_dpi *roc_dpi, uint32_t chunk_sz, uint64_t aura, uint64_t chunk_base)
 {
 	struct plt_pci_device *pci_dev;
-	const struct plt_memzone *dpi_mz;
 	dpi_mbox_msg_t mbox_msg;
-	struct npa_pool_s pool;
-	struct npa_aura_s aura;
-	int rc, count, buflen;
-	uint64_t aura_handle;
-	plt_iova_t iova;
-	char name[32];
+	uint64_t reg;
+	int rc;
 
 	if (!roc_dpi) {
 		plt_err("roc_dpi is NULL");
@@ -70,80 +65,31 @@ roc_dpi_configure(struct roc_dpi *roc_dpi)
 	}
 
 	pci_dev = roc_dpi->pci_dev;
-	memset(&pool, 0, sizeof(struct npa_pool_s));
-	pool.nat_align = 1;
-
-	memset(&aura, 0, sizeof(aura));
-	rc = roc_npa_pool_create(&aura_handle, DPI_CMD_QUEUE_SIZE,
-				 DPI_CMD_QUEUE_BUFS, &aura, &pool, 0);
-	if (rc) {
-		plt_err("Failed to create NPA pool, err %d\n", rc);
-		return rc;
-	}
-
-	snprintf(name, sizeof(name), "dpimem%d:%d:%d:%d", pci_dev->addr.domain, pci_dev->addr.bus,
-		 pci_dev->addr.devid, pci_dev->addr.function);
-	buflen = DPI_CMD_QUEUE_SIZE * DPI_CMD_QUEUE_BUFS;
-	dpi_mz = plt_memzone_reserve_aligned(name, buflen, 0, DPI_CMD_QUEUE_SIZE);
-	if (dpi_mz == NULL) {
-		plt_err("dpi memzone reserve failed");
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	roc_dpi->mz = dpi_mz;
-	iova = dpi_mz->iova;
-	for (count = 0; count < DPI_CMD_QUEUE_BUFS; count++) {
-		roc_npa_aura_op_free(aura_handle, 0, iova);
-		iova += DPI_CMD_QUEUE_SIZE;
-	}
-
-	roc_dpi->chunk_base = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_base) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
 
-	roc_dpi->chunk_next = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_next) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
-
-	roc_dpi->aura_handle = aura_handle;
-	/* subtract 2 as they have already been alloc'ed above */
-	roc_dpi->pool_size_m1 = (DPI_CMD_QUEUE_SIZE >> 3) - 2;
+	roc_dpi_disable(roc_dpi);
+	reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
 
 	plt_write64(0x0, roc_dpi->rbase + DPI_VDMA_REQQ_CTL);
-	plt_write64(((uint64_t)(roc_dpi->chunk_base) >> 7) << 7,
-		    roc_dpi->rbase + DPI_VDMA_SADDR);
+	plt_write64(chunk_base, roc_dpi->rbase + DPI_VDMA_SADDR);
 	mbox_msg.u[0] = 0;
 	mbox_msg.u[1] = 0;
 	/* DPI PF driver expects vfid starts from index 0 */
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_OPEN;
-	mbox_msg.s.csize = DPI_CMD_QUEUE_SIZE;
-	mbox_msg.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	mbox_msg.s.csize = chunk_sz;
+	mbox_msg.s.aura = aura;
 	mbox_msg.s.sso_pf_func = idev_sso_pffunc_get();
 	mbox_msg.s.npa_pf_func = idev_npa_pffunc_get();
 
 	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
 			    sizeof(dpi_mbox_msg_t));
-	if (rc < 0) {
+	if (rc < 0)
 		plt_err("Failed to send mbox message %d to DPI PF, err %d",
 			mbox_msg.s.cmd, rc);
-		goto err2;
-	}
 
 	return rc;
-
-err2:
-	plt_memzone_free(dpi_mz);
-err1:
-	roc_npa_pool_destroy(aura_handle);
-	return rc;
 }
 
 int
@@ -153,11 +99,9 @@ roc_dpi_dev_init(struct roc_dpi *roc_dpi)
 	uint16_t vfid;
 
 	roc_dpi->rbase = pci_dev->mem_resource[0].addr;
-	vfid = ((pci_dev->addr.devid & 0x1F) << 3) |
-	       (pci_dev->addr.function & 0x7);
+	vfid = ((pci_dev->addr.devid & 0x1F) << 3) | (pci_dev->addr.function & 0x7);
 	vfid -= 1;
 	roc_dpi->vfid = vfid;
-	plt_spinlock_init(&roc_dpi->chunk_lock);
 
 	return 0;
 }
@@ -180,14 +124,9 @@ roc_dpi_dev_fini(struct roc_dpi *roc_dpi)
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_CLOSE;
 
-	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
-			    sizeof(dpi_mbox_msg_t));
+	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg, sizeof(dpi_mbox_msg_t));
 	if (rc < 0)
-		plt_err("Failed to send mbox message %d to DPI PF, err %d",
-			mbox_msg.s.cmd, rc);
-
-	roc_npa_pool_destroy(roc_dpi->aura_handle);
-	plt_memzone_free(roc_dpi->mz);
+		plt_err("Failed to send mbox message %d to DPI PF, err %d", mbox_msg.s.cmd, rc);
 
 	return rc;
 }
diff --git a/drivers/common/cnxk/roc_dpi.h b/drivers/common/cnxk/roc_dpi.h
index 2f061b07c5..4ebde5b8a6 100644
--- a/drivers/common/cnxk/roc_dpi.h
+++ b/drivers/common/cnxk/roc_dpi.h
@@ -5,41 +5,17 @@
 #ifndef _ROC_DPI_H_
 #define _ROC_DPI_H_
 
-struct roc_dpi_args {
-	uint8_t num_ssegs;
-	uint8_t num_dsegs;
-	uint8_t comp_type;
-	uint8_t direction;
-	uint8_t sdevice;
-	uint8_t ddevice;
-	uint8_t swap;
-	uint8_t use_lock : 1;
-	uint8_t tt : 7;
-	uint16_t func;
-	uint16_t grp;
-	uint32_t tag;
-	uint64_t comp_ptr;
-};
-
 struct roc_dpi {
-	/* Input parameters */
 	struct plt_pci_device *pci_dev;
-	/* End of Input parameters */
-	const struct plt_memzone *mz;
 	uint8_t *rbase;
 	uint16_t vfid;
-	uint16_t pool_size_m1;
-	uint16_t chunk_head;
-	uint64_t *chunk_base;
-	uint64_t *chunk_next;
-	uint64_t aura_handle;
-	plt_spinlock_t chunk_lock;
 } __plt_cache_aligned;
 
 int __roc_api roc_dpi_dev_init(struct roc_dpi *roc_dpi);
 int __roc_api roc_dpi_dev_fini(struct roc_dpi *roc_dpi);
 
-int __roc_api roc_dpi_configure(struct roc_dpi *dpi);
+int __roc_api roc_dpi_configure(struct roc_dpi *dpi, uint32_t chunk_sz, uint64_t aura,
+				uint64_t chunk_base);
 int __roc_api roc_dpi_enable(struct roc_dpi *dpi);
 int __roc_api roc_dpi_disable(struct roc_dpi *dpi);
 
diff --git a/drivers/common/cnxk/roc_dpi_priv.h b/drivers/common/cnxk/roc_dpi_priv.h
index 1fa1a715d3..518a3e7351 100644
--- a/drivers/common/cnxk/roc_dpi_priv.h
+++ b/drivers/common/cnxk/roc_dpi_priv.h
@@ -16,9 +16,6 @@
 #define DPI_REG_DUMP	0x3
 #define DPI_GET_REG_CFG 0x4
 
-#define DPI_CMD_QUEUE_SIZE 4096
-#define DPI_CMD_QUEUE_BUFS 1024
-
 typedef union dpi_mbox_msg_t {
 	uint64_t u[2];
 	struct dpi_mbox_message_s {
diff --git a/drivers/common/cnxk/roc_platform.c b/drivers/common/cnxk/roc_platform.c
index f91b95ceab..f8287bcf6b 100644
--- a/drivers/common/cnxk/roc_platform.c
+++ b/drivers/common/cnxk/roc_platform.c
@@ -70,4 +70,5 @@ RTE_LOG_REGISTER(cnxk_logtype_npc, pmd.net.cnxk.flow, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_sso, pmd.event.cnxk, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_tim, pmd.event.cnxk.timer, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_tm, pmd.net.cnxk.tm, NOTICE);
+RTE_LOG_REGISTER(cnxk_logtype_dpi, pmd.dma.cnxk.dpi, NOTICE);
 RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ree, NOTICE);
diff --git a/drivers/common/cnxk/roc_platform.h b/drivers/common/cnxk/roc_platform.h
index 08f83aba12..dfd4da21b6 100644
--- a/drivers/common/cnxk/roc_platform.h
+++ b/drivers/common/cnxk/roc_platform.h
@@ -242,6 +242,7 @@ extern int cnxk_logtype_sso;
 extern int cnxk_logtype_tim;
 extern int cnxk_logtype_tm;
 extern int cnxk_logtype_ree;
+extern int cnxk_logtype_dpi;
 
 #define plt_err(fmt, args...)                                                  \
 	RTE_LOG(ERR, PMD, "%s():%u " fmt "\n", __func__, __LINE__, ##args)
@@ -270,6 +271,7 @@ extern int cnxk_logtype_ree;
 #define plt_tim_dbg(fmt, ...)	plt_dbg(tim, fmt, ##__VA_ARGS__)
 #define plt_tm_dbg(fmt, ...)	plt_dbg(tm, fmt, ##__VA_ARGS__)
 #define plt_ree_dbg(fmt, ...)	plt_dbg(ree, fmt, ##__VA_ARGS__)
+#define plt_dpi_dbg(fmt, ...)	plt_dbg(dpi, fmt, ##__VA_ARGS__)
 
 /* Datapath logs */
 #define plt_dp_err(fmt, args...)                                               \
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8c71497df8..1540dfadf9 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -7,6 +7,7 @@ INTERNAL {
 	cnxk_ipsec_outb_roundup_byte;
 	cnxk_logtype_base;
 	cnxk_logtype_cpt;
+	cnxk_logtype_dpi;
 	cnxk_logtype_mbox;
 	cnxk_logtype_ml;
 	cnxk_logtype_nix;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index eec6a897e2..35c2b79156 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -11,6 +11,7 @@
 #include <rte_dmadev_pmd.h>
 #include <rte_eal.h>
 #include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_mempool.h>
 #include <rte_pci.h>
 
@@ -70,10 +71,54 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 	return 0;
 }
 
+static int
+cnxk_dmadev_chunk_pool_create(struct rte_dma_dev *dev)
+{
+	char pool_name[RTE_MEMPOOL_NAMESIZE];
+	struct cnxk_dpi_vf_s *dpivf = NULL;
+	uint64_t nb_chunks;
+	int rc;
+
+	dpivf = dev->fp_obj->dev_private;
+	/* Create chunk pool. */
+	snprintf(pool_name, sizeof(pool_name), "cnxk_dma_chunk_pool%d", dev->data->dev_id);
+
+	nb_chunks = DPI_CMD_QUEUE_BUFS;
+	nb_chunks += (CNXK_DMA_POOL_MAX_CACHE_SZ * rte_lcore_count());
+	dpivf->chunk_pool =
+		rte_mempool_create_empty(pool_name, nb_chunks, DPI_CMD_QUEUE_BUF_SIZE,
+					 CNXK_DMA_POOL_MAX_CACHE_SZ, 0, rte_socket_id(), 0);
+
+	if (dpivf->chunk_pool == NULL) {
+		plt_err("Unable to create chunkpool.");
+		return -ENOMEM;
+	}
+
+	rc = rte_mempool_set_ops_byname(dpivf->chunk_pool, rte_mbuf_platform_mempool_ops(), NULL);
+	if (rc < 0) {
+		plt_err("Unable to set chunkpool ops");
+		goto free;
+	}
+
+	rc = rte_mempool_populate_default(dpivf->chunk_pool);
+	if (rc < 0) {
+		plt_err("Unable to set populate chunkpool.");
+		goto free;
+	}
+	dpivf->aura = roc_npa_aura_handle_to_aura(dpivf->chunk_pool->pool_id);
+
+	return 0;
+
+free:
+	rte_mempool_free(dpivf->chunk_pool);
+	return rc;
+}
+
 static int
 cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
 {
 	struct cnxk_dpi_vf_s *dpivf = NULL;
+	void *chunk;
 	int rc = 0;
 
 	RTE_SET_USED(conf_sz);
@@ -92,12 +137,29 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	if (dpivf->flag & CNXK_DPI_DEV_CONFIG)
 		return rc;
 
-	rc = roc_dpi_configure(&dpivf->rdpi);
+	rc = cnxk_dmadev_chunk_pool_create(dev);
+	if (rc < 0) {
+		plt_err("DMA pool configure failed err = %d", rc);
+		goto done;
+	}
+
+	rc = rte_mempool_get(dpivf->chunk_pool, &chunk);
+	if (rc < 0) {
+		plt_err("DMA failed to get chunk pointer err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}
+
+	rc = roc_dpi_configure(&dpivf->rdpi, DPI_CMD_QUEUE_BUF_SIZE, dpivf->aura, (uint64_t)chunk);
 	if (rc < 0) {
 		plt_err("DMA configure failed err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
 		goto done;
 	}
 
+	dpivf->chunk_base = chunk;
+	dpivf->chunk_head = 0;
+	dpivf->chunk_size_m1 = (DPI_CMD_QUEUE_BUF_SIZE >> 3) - 2;
 	dpivf->flag |= CNXK_DPI_DEV_CONFIG;
 
 done:
@@ -335,7 +397,7 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 }
 
 static inline int
-__dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
+__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
 {
 	uint64_t *ptr = dpi->chunk_base;
 
@@ -346,31 +408,25 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 	 * Normally there is plenty of room in the current buffer for the
 	 * command
 	 */
-	if (dpi->chunk_head + cmd_count < dpi->pool_size_m1) {
+	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
 		ptr += dpi->chunk_head;
 		dpi->chunk_head += cmd_count;
 		while (cmd_count--)
 			*ptr++ = *cmds++;
 	} else {
+		uint64_t *new_buff = NULL;
 		int count;
-		uint64_t *new_buff = dpi->chunk_next;
-
-		dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-		if (!dpi->chunk_next) {
-			plt_dp_dbg("Failed to alloc next buffer from NPA");
 
-			/* NPA failed to allocate a buffer. Restoring chunk_next
-			 * to its original address.
-			 */
-			dpi->chunk_next = new_buff;
-			return -ENOSPC;
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOMEM;
 		}
 
 		/*
 		 * Figure out how many cmd words will fit in this buffer.
 		 * One location will be needed for the next buffer pointer.
 		 */
-		count = dpi->pool_size_m1 - dpi->chunk_head;
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
 		ptr += dpi->chunk_head;
 		cmd_count -= count;
 		while (count--)
@@ -395,19 +451,11 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 			*ptr++ = *cmds++;
 
 		/* queue index may be greater than pool size */
-		if (dpi->chunk_head >= dpi->pool_size_m1) {
-			new_buff = dpi->chunk_next;
-			dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-			if (!dpi->chunk_next) {
-				plt_dp_dbg("Failed to alloc next buffer from NPA");
-
-				/* NPA failed to allocate a buffer. Restoring chunk_next
-				 * to its original address.
-				 */
-				dpi->chunk_next = new_buff;
-				return -ENOSPC;
+		if (dpi->chunk_head == dpi->chunk_size_m1) {
+			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+				plt_dpi_dbg("Failed to alloc next buffer from NPA");
+				return -ENOMEM;
 			}
-
 			/* Write next buffer address */
 			*ptr = (uint64_t)new_buff;
 			dpi->chunk_base = new_buff;
@@ -465,7 +513,7 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;
 
-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -537,7 +585,7 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}
 
-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -593,7 +641,7 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;
 
-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -656,7 +704,7 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}
 
-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 254e7fea20..65f12d844d 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -12,12 +12,15 @@
 #define DPI_MAX_DESC	     2048
 #define DPI_MIN_DESC	     2
 #define MAX_VCHANS_PER_QUEUE 4
+#define DPI_CMD_QUEUE_BUF_SIZE 4096
+#define DPI_CMD_QUEUE_BUFS     1024
 
 /* Set Completion data to 0xFF when request submitted,
  * upon successful request completion engine reset to completion status
  */
 #define DPI_REQ_CDATA 0xFF
 
+#define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
 #define CNXK_DPI_DEV_CONFIG (1ULL << 0)
 #define CNXK_DPI_DEV_START  (1ULL << 1)
 
@@ -45,8 +48,13 @@ struct cnxk_dpi_conf {
 };
 
 struct cnxk_dpi_vf_s {
-	struct roc_dpi rdpi;
+	uint64_t *chunk_base;
+	uint16_t chunk_head;
+	uint16_t chunk_size_m1;
+	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	struct roc_dpi rdpi;
+	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
 } __plt_cache_aligned;
-- 
2.25.1



* [PATCH 2/2] dma/cnxk: rewrite DMA fastpath
  2023-08-30  7:56 [PATCH 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
@ 2023-08-30  7:56 ` pbhagavatula
  2023-08-30 14:30 ` [PATCH v2 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  1 sibling, 0 replies; 18+ messages in thread
From: pbhagavatula @ 2023-08-30  7:56 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the configuration.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
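For context, the heart of the NEON rewrite is a pair-wise copy of
64-bit command words. A reduced sketch of that helper, with the scalar
fallback the patch keeps for non-arm64 builds (the scatter-gather
variants in the patch additionally swap and mask each length/address
pair):

#include <stdint.h>
#if defined(__aarch64__)
#include <arm_neon.h>
#endif

/* Copy n 64-bit command words (n even on arm64), two per iteration
 * using 128-bit NEON loads/stores when available.
 */
static inline void
dpi_cpy_sketch(const uint64_t *src, uint64_t *dst, uint8_t n)
{
	uint8_t i;

#if defined(__aarch64__)
	for (i = 0; i < n; i += 2)
		vst1q_u64(&dst[i], vld1q_u64(&src[i]));
#else
	for (i = 0; i < n; i++)
		dst[i] = src[i];
#endif
}
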
 drivers/dma/cnxk/cnxk_dmadev.c    | 454 +++--------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  89 +++++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 455 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   2 +-
 4 files changed, 570 insertions(+), 430 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index 35c2b79156..465290ce7a 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,19 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */
 
-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mbuf_pool_ops.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>
 
 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -166,22 +153,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return rc;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -217,57 +191,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -303,6 +231,29 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+	if (dpivf->flag & CNXK_DPI_DEV_START)
+		return 0;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -329,6 +280,7 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
+
 		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
 	}
 
@@ -374,6 +326,11 @@ static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
 
 	roc_dpi_disable(&dpivf->rdpi);
 	dpivf->flag &= ~CNXK_DPI_DEV_START;
@@ -396,332 +353,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -880,17 +511,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -941,12 +561,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 65f12d844d..c9032de779 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,14 +4,27 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H
 
+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>
 
-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define DPI_MAX_POINTER	       15
+#define STRM_INC(s, var)       ((s).var = ((s).var + 1) & (s).max_cnt)
+#define STRM_DEC(s, var)       ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
+#define DPI_MAX_DESC	       2048
+#define DPI_MIN_DESC	       2
+#define MAX_VCHANS_PER_QUEUE   4
 #define DPI_CMD_QUEUE_BUF_SIZE 4096
 #define DPI_CMD_QUEUE_BUFS     1024
 
@@ -21,8 +34,51 @@
 #define DPI_REQ_CDATA 0xFF
 
 #define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_DEV_CONFIG	   (1ULL << 0)
+#define CNXK_DPI_DEV_START	   (1ULL << 1)
+
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
 
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -37,26 +93,39 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
 
 struct cnxk_dpi_vf_s {
+	/* Fast path*/
 	uint64_t *chunk_base;
 	uint16_t chunk_head;
 	uint16_t chunk_size_m1;
 	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..db1e57bf51
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+#define DMA_DW_PER_SINGLE_CMD 8
+#define DMA_HDR_LEN	      4
+#define DMA_CMD_LEN(src, dst) (DMA_HDR_LEN + (src << 1) + (dst << 1))
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + DMA_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, DMA_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += DMA_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		__dpi_cpy_scalar(cmd + count, ptr, DMA_DW_PER_SINGLE_CMD - count);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = DMA_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = DMA_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, DMA_HDR_LEN);
+		ptr += DMA_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = (1UL << 54) | (1UL << 48);
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
+	hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += 8;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..a35b3a3b70 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -2,5 +2,5 @@
 # Copyright(C) 2021 Marvell International Ltd.
 
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1



* [PATCH v2 1/2] dma/cnxk: use mempool for DMA chunk pool
  2023-08-30  7:56 [PATCH 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  2023-08-30  7:56 ` [PATCH 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-08-30 14:30 ` pbhagavatula
  2023-08-30 14:30   ` [PATCH v2 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
  2023-08-30 16:54   ` [PATCH v3 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  1 sibling, 2 replies; 18+ messages in thread
From: pbhagavatula @ 2023-08-30 14:30 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Vamsi Attunuru
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use rte_mempool for the DMA chunk pool to allow using the mempool cache.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
Depends-on: 29324
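
For context, both patches append commands to a linked list of
fixed-size chunks allocated from this mempool. A simplified model of
that write path (field names mirror the driver's, but this sketch
omits the exact-fit corner case the real code also handles):

#include <errno.h>
#include <stdint.h>

#include <rte_mempool.h>

struct chunk_q {
	uint64_t *chunk_base;           /* current chunk */
	uint16_t chunk_head;            /* next free word index */
	uint16_t chunk_size_m1;         /* usable words per chunk */
	struct rte_mempool *chunk_pool;
};

static int
queue_write_sketch(struct chunk_q *q, const uint64_t *cmds, int n)
{
	uint64_t *ptr = q->chunk_base + q->chunk_head;
	uint64_t *next;
	int room;

	if (q->chunk_head + n < q->chunk_size_m1) {
		/* Fast case: the whole command fits in this chunk. */
		q->chunk_head += n;
		while (n--)
			*ptr++ = *cmds++;
		return 0;
	}

	/* Slow case: fill the tail of this chunk, link in a fresh chunk
	 * from the mempool and continue writing there.
	 */
	if (rte_mempool_get(q->chunk_pool, (void **)&next) < 0)
		return -ENOMEM;

	room = q->chunk_size_m1 - q->chunk_head;
	n -= room;
	while (room--)
		*ptr++ = *cmds++;
	*ptr = (uint64_t)next;          /* last word links to the next chunk */

	q->chunk_base = next;
	q->chunk_head = n;
	ptr = next;
	while (n--)
		*ptr++ = *cmds++;

	return 0;
}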

 drivers/common/cnxk/roc_dpi.c      |  95 +++++--------------------
 drivers/common/cnxk/roc_dpi.h      |  28 +-------
 drivers/common/cnxk/roc_dpi_priv.h |   3 -
 drivers/common/cnxk/roc_platform.c |   1 +
 drivers/common/cnxk/roc_platform.h |   2 +
 drivers/common/cnxk/version.map    |   1 +
 drivers/dma/cnxk/cnxk_dmadev.c     | 108 +++++++++++++++++++++--------
 drivers/dma/cnxk/cnxk_dmadev.h     |  10 ++-
 8 files changed, 110 insertions(+), 138 deletions(-)

diff --git a/drivers/common/cnxk/roc_dpi.c b/drivers/common/cnxk/roc_dpi.c
index 0e2f803077..9cb479371a 100644
--- a/drivers/common/cnxk/roc_dpi.c
+++ b/drivers/common/cnxk/roc_dpi.c
@@ -1,14 +1,14 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(C) 2021 Marvell.
  */
+
+#include "roc_api.h"
+#include "roc_priv.h"
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>

-#include "roc_api.h"
-#include "roc_priv.h"
-
 #define DPI_PF_MBOX_SYSFS_ENTRY "dpi_device_config"

 static inline int
@@ -52,17 +52,12 @@ roc_dpi_disable(struct roc_dpi *dpi)
 }

 int
-roc_dpi_configure(struct roc_dpi *roc_dpi)
+roc_dpi_configure(struct roc_dpi *roc_dpi, uint32_t chunk_sz, uint64_t aura, uint64_t chunk_base)
 {
 	struct plt_pci_device *pci_dev;
-	const struct plt_memzone *dpi_mz;
 	dpi_mbox_msg_t mbox_msg;
-	struct npa_pool_s pool;
-	struct npa_aura_s aura;
-	int rc, count, buflen;
-	uint64_t aura_handle;
-	plt_iova_t iova;
-	char name[32];
+	uint64_t reg;
+	int rc;

 	if (!roc_dpi) {
 		plt_err("roc_dpi is NULL");
@@ -70,80 +65,31 @@ roc_dpi_configure(struct roc_dpi *roc_dpi)
 	}

 	pci_dev = roc_dpi->pci_dev;
-	memset(&pool, 0, sizeof(struct npa_pool_s));
-	pool.nat_align = 1;
-
-	memset(&aura, 0, sizeof(aura));
-	rc = roc_npa_pool_create(&aura_handle, DPI_CMD_QUEUE_SIZE,
-				 DPI_CMD_QUEUE_BUFS, &aura, &pool, 0);
-	if (rc) {
-		plt_err("Failed to create NPA pool, err %d\n", rc);
-		return rc;
-	}
-
-	snprintf(name, sizeof(name), "dpimem%d:%d:%d:%d", pci_dev->addr.domain, pci_dev->addr.bus,
-		 pci_dev->addr.devid, pci_dev->addr.function);
-	buflen = DPI_CMD_QUEUE_SIZE * DPI_CMD_QUEUE_BUFS;
-	dpi_mz = plt_memzone_reserve_aligned(name, buflen, 0, DPI_CMD_QUEUE_SIZE);
-	if (dpi_mz == NULL) {
-		plt_err("dpi memzone reserve failed");
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	roc_dpi->mz = dpi_mz;
-	iova = dpi_mz->iova;
-	for (count = 0; count < DPI_CMD_QUEUE_BUFS; count++) {
-		roc_npa_aura_op_free(aura_handle, 0, iova);
-		iova += DPI_CMD_QUEUE_SIZE;
-	}
-
-	roc_dpi->chunk_base = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_base) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}

-	roc_dpi->chunk_next = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_next) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
-
-	roc_dpi->aura_handle = aura_handle;
-	/* subtract 2 as they have already been alloc'ed above */
-	roc_dpi->pool_size_m1 = (DPI_CMD_QUEUE_SIZE >> 3) - 2;
+	roc_dpi_disable(roc_dpi);
+	reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);

 	plt_write64(0x0, roc_dpi->rbase + DPI_VDMA_REQQ_CTL);
-	plt_write64(((uint64_t)(roc_dpi->chunk_base) >> 7) << 7,
-		    roc_dpi->rbase + DPI_VDMA_SADDR);
+	plt_write64(chunk_base, roc_dpi->rbase + DPI_VDMA_SADDR);
 	mbox_msg.u[0] = 0;
 	mbox_msg.u[1] = 0;
 	/* DPI PF driver expects vfid starts from index 0 */
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_OPEN;
-	mbox_msg.s.csize = DPI_CMD_QUEUE_SIZE;
-	mbox_msg.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	mbox_msg.s.csize = chunk_sz;
+	mbox_msg.s.aura = aura;
 	mbox_msg.s.sso_pf_func = idev_sso_pffunc_get();
 	mbox_msg.s.npa_pf_func = idev_npa_pffunc_get();

 	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
 			    sizeof(dpi_mbox_msg_t));
-	if (rc < 0) {
+	if (rc < 0)
 		plt_err("Failed to send mbox message %d to DPI PF, err %d",
 			mbox_msg.s.cmd, rc);
-		goto err2;
-	}

 	return rc;
-
-err2:
-	plt_memzone_free(dpi_mz);
-err1:
-	roc_npa_pool_destroy(aura_handle);
-	return rc;
 }

 int
@@ -153,11 +99,9 @@ roc_dpi_dev_init(struct roc_dpi *roc_dpi)
 	uint16_t vfid;

 	roc_dpi->rbase = pci_dev->mem_resource[0].addr;
-	vfid = ((pci_dev->addr.devid & 0x1F) << 3) |
-	       (pci_dev->addr.function & 0x7);
+	vfid = ((pci_dev->addr.devid & 0x1F) << 3) | (pci_dev->addr.function & 0x7);
 	vfid -= 1;
 	roc_dpi->vfid = vfid;
-	plt_spinlock_init(&roc_dpi->chunk_lock);

 	return 0;
 }
@@ -180,14 +124,9 @@ roc_dpi_dev_fini(struct roc_dpi *roc_dpi)
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_CLOSE;

-	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
-			    sizeof(dpi_mbox_msg_t));
+	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg, sizeof(dpi_mbox_msg_t));
 	if (rc < 0)
-		plt_err("Failed to send mbox message %d to DPI PF, err %d",
-			mbox_msg.s.cmd, rc);
-
-	roc_npa_pool_destroy(roc_dpi->aura_handle);
-	plt_memzone_free(roc_dpi->mz);
+		plt_err("Failed to send mbox message %d to DPI PF, err %d", mbox_msg.s.cmd, rc);

 	return rc;
 }
diff --git a/drivers/common/cnxk/roc_dpi.h b/drivers/common/cnxk/roc_dpi.h
index 2f061b07c5..4ebde5b8a6 100644
--- a/drivers/common/cnxk/roc_dpi.h
+++ b/drivers/common/cnxk/roc_dpi.h
@@ -5,41 +5,17 @@
 #ifndef _ROC_DPI_H_
 #define _ROC_DPI_H_

-struct roc_dpi_args {
-	uint8_t num_ssegs;
-	uint8_t num_dsegs;
-	uint8_t comp_type;
-	uint8_t direction;
-	uint8_t sdevice;
-	uint8_t ddevice;
-	uint8_t swap;
-	uint8_t use_lock : 1;
-	uint8_t tt : 7;
-	uint16_t func;
-	uint16_t grp;
-	uint32_t tag;
-	uint64_t comp_ptr;
-};
-
 struct roc_dpi {
-	/* Input parameters */
 	struct plt_pci_device *pci_dev;
-	/* End of Input parameters */
-	const struct plt_memzone *mz;
 	uint8_t *rbase;
 	uint16_t vfid;
-	uint16_t pool_size_m1;
-	uint16_t chunk_head;
-	uint64_t *chunk_base;
-	uint64_t *chunk_next;
-	uint64_t aura_handle;
-	plt_spinlock_t chunk_lock;
 } __plt_cache_aligned;

 int __roc_api roc_dpi_dev_init(struct roc_dpi *roc_dpi);
 int __roc_api roc_dpi_dev_fini(struct roc_dpi *roc_dpi);

-int __roc_api roc_dpi_configure(struct roc_dpi *dpi);
+int __roc_api roc_dpi_configure(struct roc_dpi *dpi, uint32_t chunk_sz, uint64_t aura,
+				uint64_t chunk_base);
 int __roc_api roc_dpi_enable(struct roc_dpi *dpi);
 int __roc_api roc_dpi_disable(struct roc_dpi *dpi);

diff --git a/drivers/common/cnxk/roc_dpi_priv.h b/drivers/common/cnxk/roc_dpi_priv.h
index 1fa1a715d3..518a3e7351 100644
--- a/drivers/common/cnxk/roc_dpi_priv.h
+++ b/drivers/common/cnxk/roc_dpi_priv.h
@@ -16,9 +16,6 @@
 #define DPI_REG_DUMP	0x3
 #define DPI_GET_REG_CFG 0x4

-#define DPI_CMD_QUEUE_SIZE 4096
-#define DPI_CMD_QUEUE_BUFS 1024
-
 typedef union dpi_mbox_msg_t {
 	uint64_t u[2];
 	struct dpi_mbox_message_s {
diff --git a/drivers/common/cnxk/roc_platform.c b/drivers/common/cnxk/roc_platform.c
index f91b95ceab..f8287bcf6b 100644
--- a/drivers/common/cnxk/roc_platform.c
+++ b/drivers/common/cnxk/roc_platform.c
@@ -70,4 +70,5 @@ RTE_LOG_REGISTER(cnxk_logtype_npc, pmd.net.cnxk.flow, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_sso, pmd.event.cnxk, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_tim, pmd.event.cnxk.timer, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_tm, pmd.net.cnxk.tm, NOTICE);
+RTE_LOG_REGISTER(cnxk_logtype_dpi, pmd.dma.cnxk.dpi, NOTICE);
 RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ree, NOTICE);
diff --git a/drivers/common/cnxk/roc_platform.h b/drivers/common/cnxk/roc_platform.h
index 08f83aba12..dfd4da21b6 100644
--- a/drivers/common/cnxk/roc_platform.h
+++ b/drivers/common/cnxk/roc_platform.h
@@ -242,6 +242,7 @@ extern int cnxk_logtype_sso;
 extern int cnxk_logtype_tim;
 extern int cnxk_logtype_tm;
 extern int cnxk_logtype_ree;
+extern int cnxk_logtype_dpi;

 #define plt_err(fmt, args...)                                                  \
 	RTE_LOG(ERR, PMD, "%s():%u " fmt "\n", __func__, __LINE__, ##args)
@@ -270,6 +271,7 @@ extern int cnxk_logtype_ree;
 #define plt_tim_dbg(fmt, ...)	plt_dbg(tim, fmt, ##__VA_ARGS__)
 #define plt_tm_dbg(fmt, ...)	plt_dbg(tm, fmt, ##__VA_ARGS__)
 #define plt_ree_dbg(fmt, ...)	plt_dbg(ree, fmt, ##__VA_ARGS__)
+#define plt_dpi_dbg(fmt, ...)	plt_dbg(dpi, fmt, ##__VA_ARGS__)

 /* Datapath logs */
 #define plt_dp_err(fmt, args...)                                               \
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8c71497df8..1540dfadf9 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -7,6 +7,7 @@ INTERNAL {
 	cnxk_ipsec_outb_roundup_byte;
 	cnxk_logtype_base;
 	cnxk_logtype_cpt;
+	cnxk_logtype_dpi;
 	cnxk_logtype_mbox;
 	cnxk_logtype_ml;
 	cnxk_logtype_nix;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index eec6a897e2..35c2b79156 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -11,6 +11,7 @@
 #include <rte_dmadev_pmd.h>
 #include <rte_eal.h>
 #include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_mempool.h>
 #include <rte_pci.h>

@@ -70,10 +71,54 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 	return 0;
 }

+static int
+cnxk_dmadev_chunk_pool_create(struct rte_dma_dev *dev)
+{
+	char pool_name[RTE_MEMPOOL_NAMESIZE];
+	struct cnxk_dpi_vf_s *dpivf = NULL;
+	uint64_t nb_chunks;
+	int rc;
+
+	dpivf = dev->fp_obj->dev_private;
+	/* Create chunk pool. */
+	snprintf(pool_name, sizeof(pool_name), "cnxk_dma_chunk_pool%d", dev->data->dev_id);
+
+	nb_chunks = DPI_CMD_QUEUE_BUFS;
+	nb_chunks += (CNXK_DMA_POOL_MAX_CACHE_SZ * rte_lcore_count());
+	dpivf->chunk_pool =
+		rte_mempool_create_empty(pool_name, nb_chunks, DPI_CMD_QUEUE_BUF_SIZE,
+					 CNXK_DMA_POOL_MAX_CACHE_SZ, 0, rte_socket_id(), 0);
+
+	if (dpivf->chunk_pool == NULL) {
+		plt_err("Unable to create chunkpool.");
+		return -ENOMEM;
+	}
+
+	rc = rte_mempool_set_ops_byname(dpivf->chunk_pool, rte_mbuf_platform_mempool_ops(), NULL);
+	if (rc < 0) {
+		plt_err("Unable to set chunkpool ops");
+		goto free;
+	}
+
+	rc = rte_mempool_populate_default(dpivf->chunk_pool);
+	if (rc < 0) {
+		plt_err("Unable to populate chunkpool.");
+		goto free;
+	}
+	dpivf->aura = roc_npa_aura_handle_to_aura(dpivf->chunk_pool->pool_id);
+
+	return 0;
+
+free:
+	rte_mempool_free(dpivf->chunk_pool);
+	return rc;
+}
+
 static int
 cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
 {
 	struct cnxk_dpi_vf_s *dpivf = NULL;
+	void *chunk;
 	int rc = 0;

 	RTE_SET_USED(conf_sz);
@@ -92,12 +137,29 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	if (dpivf->flag & CNXK_DPI_DEV_CONFIG)
 		return rc;

-	rc = roc_dpi_configure(&dpivf->rdpi);
+	rc = cnxk_dmadev_chunk_pool_create(dev);
+	if (rc < 0) {
+		plt_err("DMA pool configure failed err = %d", rc);
+		goto done;
+	}
+
+	rc = rte_mempool_get(dpivf->chunk_pool, &chunk);
+	if (rc < 0) {
+		plt_err("DMA failed to get chunk pointer err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}
+
+	rc = roc_dpi_configure(&dpivf->rdpi, DPI_CMD_QUEUE_BUF_SIZE, dpivf->aura, (uint64_t)chunk);
 	if (rc < 0) {
 		plt_err("DMA configure failed err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
 		goto done;
 	}

+	dpivf->chunk_base = chunk;
+	dpivf->chunk_head = 0;
+	dpivf->chunk_size_m1 = (DPI_CMD_QUEUE_BUF_SIZE >> 3) - 2;
 	dpivf->flag |= CNXK_DPI_DEV_CONFIG;

 done:
@@ -335,7 +397,7 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 }

 static inline int
-__dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
+__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
 {
 	uint64_t *ptr = dpi->chunk_base;

@@ -346,31 +408,25 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 	 * Normally there is plenty of room in the current buffer for the
 	 * command
 	 */
-	if (dpi->chunk_head + cmd_count < dpi->pool_size_m1) {
+	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
 		ptr += dpi->chunk_head;
 		dpi->chunk_head += cmd_count;
 		while (cmd_count--)
 			*ptr++ = *cmds++;
 	} else {
+		uint64_t *new_buff = NULL;
 		int count;
-		uint64_t *new_buff = dpi->chunk_next;
-
-		dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-		if (!dpi->chunk_next) {
-			plt_dp_dbg("Failed to alloc next buffer from NPA");

-			/* NPA failed to allocate a buffer. Restoring chunk_next
-			 * to its original address.
-			 */
-			dpi->chunk_next = new_buff;
-			return -ENOSPC;
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOMEM;
 		}

 		/*
 		 * Figure out how many cmd words will fit in this buffer.
 		 * One location will be needed for the next buffer pointer.
 		 */
-		count = dpi->pool_size_m1 - dpi->chunk_head;
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
 		ptr += dpi->chunk_head;
 		cmd_count -= count;
 		while (count--)
@@ -395,19 +451,11 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 			*ptr++ = *cmds++;

 		/* queue index may be greater than pool size */
-		if (dpi->chunk_head >= dpi->pool_size_m1) {
-			new_buff = dpi->chunk_next;
-			dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-			if (!dpi->chunk_next) {
-				plt_dp_dbg("Failed to alloc next buffer from NPA");
-
-				/* NPA failed to allocate a buffer. Restoring chunk_next
-				 * to its original address.
-				 */
-				dpi->chunk_next = new_buff;
-				return -ENOSPC;
+		if (dpi->chunk_head == dpi->chunk_size_m1) {
+			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+				plt_dpi_dbg("Failed to alloc next buffer from NPA");
+				return -ENOMEM;
 			}
-
 			/* Write next buffer address */
 			*ptr = (uint64_t)new_buff;
 			dpi->chunk_base = new_buff;
@@ -465,7 +513,7 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -537,7 +585,7 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -593,7 +641,7 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -656,7 +704,7 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 254e7fea20..65f12d844d 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -12,12 +12,15 @@
 #define DPI_MAX_DESC	     2048
 #define DPI_MIN_DESC	     2
 #define MAX_VCHANS_PER_QUEUE 4
+#define DPI_CMD_QUEUE_BUF_SIZE 4096
+#define DPI_CMD_QUEUE_BUFS     1024

 /* Completion data is set to 0xFF when a request is submitted; upon
  * successful completion the engine resets it to the completion status.
  */
 #define DPI_REQ_CDATA 0xFF

+#define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
 #define CNXK_DPI_DEV_CONFIG (1ULL << 0)
 #define CNXK_DPI_DEV_START  (1ULL << 1)

@@ -45,8 +48,13 @@ struct cnxk_dpi_conf {
 };

 struct cnxk_dpi_vf_s {
-	struct roc_dpi rdpi;
+	uint64_t *chunk_base;
+	uint16_t chunk_head;
+	uint16_t chunk_size_m1;
+	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	struct roc_dpi rdpi;
+	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
 } __plt_cache_aligned;
--
2.25.1


* [PATCH v2 2/2] dma/cnxk: rewrite DMA fastpath
  2023-08-30 14:30 ` [PATCH v2 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
@ 2023-08-30 14:30   ` pbhagavatula
  2023-08-30 16:54   ` [PATCH v3 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  1 sibling, 0 replies; 18+ messages in thread
From: pbhagavatula @ 2023-08-30 14:30 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the per-vchan config.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
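Note for reviewers: on arm64 the fastpath copies DPI command words two at
a time with 128-bit NEON loads/stores. A minimal standalone sketch of the
idea (the helper name is illustrative and not part of this patch; it
assumes n is even, which holds since the header and each sge entry are
written as 4- and 2-word units):

	#include <arm_neon.h>
	#include <stdint.h>

	/* Copy n 64-bit command words, two per iteration. */
	static inline void
	dpi_cpy_vector_sketch(const uint64_t *src, uint64_t *dst, uint8_t n)
	{
		uint8_t i;

		for (i = 0; i < n; i += 2) {
			uint64x2_t v = vld1q_u64(&src[i]); /* load two words */
			vst1q_u64(&dst[i], v);             /* store two words */
		}
	}
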
 drivers/dma/cnxk/cnxk_dmadev.c    | 454 +++--------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  89 +++++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 455 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   2 +-
 4 files changed, 570 insertions(+), 430 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index 35c2b79156..465290ce7a 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,19 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */
 
-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mbuf_pool_ops.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>
 
 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -166,22 +153,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return rc;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -217,57 +191,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -303,6 +231,29 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+	if (dpivf->flag & CNXK_DPI_DEV_START)
+		return 0;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -329,6 +280,7 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
+
 		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
 	}
 
@@ -374,6 +326,11 @@ static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
+	/* Wait for the queue to drain; VDMA_SADDR bit 63 is set once idle. */
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
 
 	roc_dpi_disable(&dpivf->rdpi);
 	dpivf->flag &= ~CNXK_DPI_DEV_START;
@@ -396,332 +353,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -880,17 +511,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -941,12 +561,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 65f12d844d..c9032de779 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,14 +4,27 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H
 
+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>
 
-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define DPI_MAX_POINTER	       15
+#define STRM_INC(s, var)       ((s).var = ((s).var + 1) & (s).max_cnt)
+#define STRM_DEC(s, var)       ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
+#define DPI_MAX_DESC	       2048
+#define DPI_MIN_DESC	       2
+#define MAX_VCHANS_PER_QUEUE   4
 #define DPI_CMD_QUEUE_BUF_SIZE 4096
 #define DPI_CMD_QUEUE_BUFS     1024
 
@@ -21,8 +34,51 @@
 #define DPI_REQ_CDATA 0xFF
 
 #define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_DEV_CONFIG	   (1ULL << 0)
+#define CNXK_DPI_DEV_START	   (1ULL << 1)
+
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
 
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -37,26 +93,39 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
 
 struct cnxk_dpi_vf_s {
+	/* Fast path*/
 	uint64_t *chunk_base;
 	uint16_t chunk_head;
 	uint16_t chunk_size_m1;
 	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..db1e57bf51
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+#define DMA_DW_PER_SINGLE_CMD 8
+#define DMA_HDR_LEN	      4
+#define DMA_CMD_LEN(src, dst) (DMA_HDR_LEN + ((src) << 1) + ((dst) << 1))
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + DMA_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, DMA_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += DMA_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		__dpi_cpy_scalar(cmd + count, ptr, DMA_DW_PER_SINGLE_CMD - count);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = DMA_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = DMA_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, DMA_HDR_LEN);
+		ptr += DMA_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = (1UL << 54) | (1UL << 48); /* one nfst (bit 48), one nlst (bit 54) */
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
+	hdr[0] = ((uint64_t)nb_dst << 54) | ((uint64_t)nb_src << 48);
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U; /* nlst = 1 (bit 6), nfst = 1 (bit 0) */
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..a35b3a3b70 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -2,5 +2,5 @@
 # Copyright(C) 2021 Marvell International Ltd.
 
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1


* [PATCH v3 1/2] dma/cnxk: use mempool for DMA chunk pool
  2023-08-30 14:30 ` [PATCH v2 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  2023-08-30 14:30   ` [PATCH v2 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-08-30 16:54   ` pbhagavatula
  2023-08-30 16:54     ` [PATCH v3 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
  2023-08-31  5:32     ` [PATCH v4 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  1 sibling, 2 replies; 18+ messages in thread
From: pbhagavatula @ 2023-08-30 16:54 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Vamsi Attunuru
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use rte_mempool for the DMA chunk pool to allow use of the mempool cache.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
Depends-on: 29324

 v3 Changes:
 - Fix build
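
For context, the chunk pool is now a plain rte_mempool backed by the
platform mempool ops, so chunk buffers live in an NPA aura whose handle
can be passed to the DPI PF in the DPI_QUEUE_OPEN mailbox, while software
gets the per-lcore mempool cache. A rough sketch of the setup performed
in cnxk_dmadev_chunk_pool_create() below (names and parameters here are
illustrative):

	#include <rte_lcore.h>
	#include <rte_mbuf_pool_ops.h>
	#include <rte_mempool.h>

	static struct rte_mempool *
	dma_chunk_pool_sketch(uint32_t nb_chunks, uint32_t chunk_sz,
			      uint32_t cache_sz)
	{
		struct rte_mempool *mp;

		mp = rte_mempool_create_empty("dma_chunks", nb_chunks,
					      chunk_sz, cache_sz, 0,
					      rte_socket_id(), 0);
		if (mp == NULL)
			return NULL;

		/* Select the platform ops (NPA backed on cnxk) before
		 * populating, so the buffers come from an NPA aura.
		 */
		if (rte_mempool_set_ops_byname(mp,
					       rte_mbuf_platform_mempool_ops(),
					       NULL) < 0 ||
		    rte_mempool_populate_default(mp) < 0) {
			rte_mempool_free(mp);
			return NULL;
		}

		return mp;
	}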

 drivers/common/cnxk/roc_dpi.c      |  95 +++++--------------------
 drivers/common/cnxk/roc_dpi.h      |  28 +-------
 drivers/common/cnxk/roc_dpi_priv.h |   3 -
 drivers/common/cnxk/roc_platform.c |   1 +
 drivers/common/cnxk/roc_platform.h |   2 +
 drivers/common/cnxk/version.map    |   1 +
 drivers/dma/cnxk/cnxk_dmadev.c     | 108 +++++++++++++++++++++--------
 drivers/dma/cnxk/cnxk_dmadev.h     |  10 ++-
 8 files changed, 110 insertions(+), 138 deletions(-)

diff --git a/drivers/common/cnxk/roc_dpi.c b/drivers/common/cnxk/roc_dpi.c
index 0e2f803077..9cb479371a 100644
--- a/drivers/common/cnxk/roc_dpi.c
+++ b/drivers/common/cnxk/roc_dpi.c
@@ -1,14 +1,14 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(C) 2021 Marvell.
  */
+
+#include "roc_api.h"
+#include "roc_priv.h"
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>

-#include "roc_api.h"
-#include "roc_priv.h"
-
 #define DPI_PF_MBOX_SYSFS_ENTRY "dpi_device_config"

 static inline int
@@ -52,17 +52,12 @@ roc_dpi_disable(struct roc_dpi *dpi)
 }

 int
-roc_dpi_configure(struct roc_dpi *roc_dpi)
+roc_dpi_configure(struct roc_dpi *roc_dpi, uint32_t chunk_sz, uint64_t aura, uint64_t chunk_base)
 {
 	struct plt_pci_device *pci_dev;
-	const struct plt_memzone *dpi_mz;
 	dpi_mbox_msg_t mbox_msg;
-	struct npa_pool_s pool;
-	struct npa_aura_s aura;
-	int rc, count, buflen;
-	uint64_t aura_handle;
-	plt_iova_t iova;
-	char name[32];
+	uint64_t reg;
+	int rc;

 	if (!roc_dpi) {
 		plt_err("roc_dpi is NULL");
@@ -70,80 +65,31 @@ roc_dpi_configure(struct roc_dpi *roc_dpi)
 	}

 	pci_dev = roc_dpi->pci_dev;
-	memset(&pool, 0, sizeof(struct npa_pool_s));
-	pool.nat_align = 1;
-
-	memset(&aura, 0, sizeof(aura));
-	rc = roc_npa_pool_create(&aura_handle, DPI_CMD_QUEUE_SIZE,
-				 DPI_CMD_QUEUE_BUFS, &aura, &pool, 0);
-	if (rc) {
-		plt_err("Failed to create NPA pool, err %d\n", rc);
-		return rc;
-	}
-
-	snprintf(name, sizeof(name), "dpimem%d:%d:%d:%d", pci_dev->addr.domain, pci_dev->addr.bus,
-		 pci_dev->addr.devid, pci_dev->addr.function);
-	buflen = DPI_CMD_QUEUE_SIZE * DPI_CMD_QUEUE_BUFS;
-	dpi_mz = plt_memzone_reserve_aligned(name, buflen, 0, DPI_CMD_QUEUE_SIZE);
-	if (dpi_mz == NULL) {
-		plt_err("dpi memzone reserve failed");
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	roc_dpi->mz = dpi_mz;
-	iova = dpi_mz->iova;
-	for (count = 0; count < DPI_CMD_QUEUE_BUFS; count++) {
-		roc_npa_aura_op_free(aura_handle, 0, iova);
-		iova += DPI_CMD_QUEUE_SIZE;
-	}
-
-	roc_dpi->chunk_base = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_base) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}

-	roc_dpi->chunk_next = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_next) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
-
-	roc_dpi->aura_handle = aura_handle;
-	/* subtract 2 as they have already been alloc'ed above */
-	roc_dpi->pool_size_m1 = (DPI_CMD_QUEUE_SIZE >> 3) - 2;
+	roc_dpi_disable(roc_dpi);
+	/* Wait for the queue to drain; VDMA_SADDR bit 63 is set once idle. */
+	reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);

 	plt_write64(0x0, roc_dpi->rbase + DPI_VDMA_REQQ_CTL);
-	plt_write64(((uint64_t)(roc_dpi->chunk_base) >> 7) << 7,
-		    roc_dpi->rbase + DPI_VDMA_SADDR);
+	plt_write64(chunk_base, roc_dpi->rbase + DPI_VDMA_SADDR);
 	mbox_msg.u[0] = 0;
 	mbox_msg.u[1] = 0;
 	/* DPI PF driver expects vfid starts from index 0 */
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_OPEN;
-	mbox_msg.s.csize = DPI_CMD_QUEUE_SIZE;
-	mbox_msg.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	mbox_msg.s.csize = chunk_sz;
+	mbox_msg.s.aura = aura;
 	mbox_msg.s.sso_pf_func = idev_sso_pffunc_get();
 	mbox_msg.s.npa_pf_func = idev_npa_pffunc_get();

 	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
 			    sizeof(dpi_mbox_msg_t));
-	if (rc < 0) {
+	if (rc < 0)
 		plt_err("Failed to send mbox message %d to DPI PF, err %d",
 			mbox_msg.s.cmd, rc);
-		goto err2;
-	}

 	return rc;
-
-err2:
-	plt_memzone_free(dpi_mz);
-err1:
-	roc_npa_pool_destroy(aura_handle);
-	return rc;
 }

 int
@@ -153,11 +99,9 @@ roc_dpi_dev_init(struct roc_dpi *roc_dpi)
 	uint16_t vfid;

 	roc_dpi->rbase = pci_dev->mem_resource[0].addr;
-	vfid = ((pci_dev->addr.devid & 0x1F) << 3) |
-	       (pci_dev->addr.function & 0x7);
+	vfid = ((pci_dev->addr.devid & 0x1F) << 3) | (pci_dev->addr.function & 0x7);
 	vfid -= 1;
 	roc_dpi->vfid = vfid;
-	plt_spinlock_init(&roc_dpi->chunk_lock);

 	return 0;
 }
@@ -180,14 +124,9 @@ roc_dpi_dev_fini(struct roc_dpi *roc_dpi)
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_CLOSE;

-	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
-			    sizeof(dpi_mbox_msg_t));
+	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg, sizeof(dpi_mbox_msg_t));
 	if (rc < 0)
-		plt_err("Failed to send mbox message %d to DPI PF, err %d",
-			mbox_msg.s.cmd, rc);
-
-	roc_npa_pool_destroy(roc_dpi->aura_handle);
-	plt_memzone_free(roc_dpi->mz);
+		plt_err("Failed to send mbox message %d to DPI PF, err %d", mbox_msg.s.cmd, rc);

 	return rc;
 }
diff --git a/drivers/common/cnxk/roc_dpi.h b/drivers/common/cnxk/roc_dpi.h
index 2f061b07c5..4ebde5b8a6 100644
--- a/drivers/common/cnxk/roc_dpi.h
+++ b/drivers/common/cnxk/roc_dpi.h
@@ -5,41 +5,17 @@
 #ifndef _ROC_DPI_H_
 #define _ROC_DPI_H_

-struct roc_dpi_args {
-	uint8_t num_ssegs;
-	uint8_t num_dsegs;
-	uint8_t comp_type;
-	uint8_t direction;
-	uint8_t sdevice;
-	uint8_t ddevice;
-	uint8_t swap;
-	uint8_t use_lock : 1;
-	uint8_t tt : 7;
-	uint16_t func;
-	uint16_t grp;
-	uint32_t tag;
-	uint64_t comp_ptr;
-};
-
 struct roc_dpi {
-	/* Input parameters */
 	struct plt_pci_device *pci_dev;
-	/* End of Input parameters */
-	const struct plt_memzone *mz;
 	uint8_t *rbase;
 	uint16_t vfid;
-	uint16_t pool_size_m1;
-	uint16_t chunk_head;
-	uint64_t *chunk_base;
-	uint64_t *chunk_next;
-	uint64_t aura_handle;
-	plt_spinlock_t chunk_lock;
 } __plt_cache_aligned;

 int __roc_api roc_dpi_dev_init(struct roc_dpi *roc_dpi);
 int __roc_api roc_dpi_dev_fini(struct roc_dpi *roc_dpi);

-int __roc_api roc_dpi_configure(struct roc_dpi *dpi);
+int __roc_api roc_dpi_configure(struct roc_dpi *dpi, uint32_t chunk_sz, uint64_t aura,
+				uint64_t chunk_base);
 int __roc_api roc_dpi_enable(struct roc_dpi *dpi);
 int __roc_api roc_dpi_disable(struct roc_dpi *dpi);

diff --git a/drivers/common/cnxk/roc_dpi_priv.h b/drivers/common/cnxk/roc_dpi_priv.h
index 1fa1a715d3..518a3e7351 100644
--- a/drivers/common/cnxk/roc_dpi_priv.h
+++ b/drivers/common/cnxk/roc_dpi_priv.h
@@ -16,9 +16,6 @@
 #define DPI_REG_DUMP	0x3
 #define DPI_GET_REG_CFG 0x4

-#define DPI_CMD_QUEUE_SIZE 4096
-#define DPI_CMD_QUEUE_BUFS 1024
-
 typedef union dpi_mbox_msg_t {
 	uint64_t u[2];
 	struct dpi_mbox_message_s {
diff --git a/drivers/common/cnxk/roc_platform.c b/drivers/common/cnxk/roc_platform.c
index f91b95ceab..f8287bcf6b 100644
--- a/drivers/common/cnxk/roc_platform.c
+++ b/drivers/common/cnxk/roc_platform.c
@@ -70,4 +70,5 @@ RTE_LOG_REGISTER(cnxk_logtype_npc, pmd.net.cnxk.flow, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_sso, pmd.event.cnxk, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_tim, pmd.event.cnxk.timer, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_tm, pmd.net.cnxk.tm, NOTICE);
+RTE_LOG_REGISTER(cnxk_logtype_dpi, pmd.dma.cnxk.dpi, NOTICE);
 RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ree, NOTICE);
diff --git a/drivers/common/cnxk/roc_platform.h b/drivers/common/cnxk/roc_platform.h
index 08f83aba12..dfd4da21b6 100644
--- a/drivers/common/cnxk/roc_platform.h
+++ b/drivers/common/cnxk/roc_platform.h
@@ -242,6 +242,7 @@ extern int cnxk_logtype_sso;
 extern int cnxk_logtype_tim;
 extern int cnxk_logtype_tm;
 extern int cnxk_logtype_ree;
+extern int cnxk_logtype_dpi;

 #define plt_err(fmt, args...)                                                  \
 	RTE_LOG(ERR, PMD, "%s():%u " fmt "\n", __func__, __LINE__, ##args)
@@ -270,6 +271,7 @@ extern int cnxk_logtype_ree;
 #define plt_tim_dbg(fmt, ...)	plt_dbg(tim, fmt, ##__VA_ARGS__)
 #define plt_tm_dbg(fmt, ...)	plt_dbg(tm, fmt, ##__VA_ARGS__)
 #define plt_ree_dbg(fmt, ...)	plt_dbg(ree, fmt, ##__VA_ARGS__)
+#define plt_dpi_dbg(fmt, ...)	plt_dbg(dpi, fmt, ##__VA_ARGS__)

 /* Datapath logs */
 #define plt_dp_err(fmt, args...)                                               \
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8c71497df8..1540dfadf9 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -7,6 +7,7 @@ INTERNAL {
 	cnxk_ipsec_outb_roundup_byte;
 	cnxk_logtype_base;
 	cnxk_logtype_cpt;
+	cnxk_logtype_dpi;
 	cnxk_logtype_mbox;
 	cnxk_logtype_ml;
 	cnxk_logtype_nix;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index eec6a897e2..35c2b79156 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -11,6 +11,7 @@
 #include <rte_dmadev_pmd.h>
 #include <rte_eal.h>
 #include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_mempool.h>
 #include <rte_pci.h>

@@ -70,10 +71,54 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 	return 0;
 }

+static int
+cnxk_dmadev_chunk_pool_create(struct rte_dma_dev *dev)
+{
+	char pool_name[RTE_MEMPOOL_NAMESIZE];
+	struct cnxk_dpi_vf_s *dpivf = NULL;
+	uint64_t nb_chunks;
+	int rc;
+
+	dpivf = dev->fp_obj->dev_private;
+	/* Create chunk pool. */
+	snprintf(pool_name, sizeof(pool_name), "cnxk_dma_chunk_pool%d", dev->data->dev_id);
+
+	nb_chunks = DPI_CMD_QUEUE_BUFS;
+	nb_chunks += (CNXK_DMA_POOL_MAX_CACHE_SZ * rte_lcore_count());
+	dpivf->chunk_pool =
+		rte_mempool_create_empty(pool_name, nb_chunks, DPI_CMD_QUEUE_BUF_SIZE,
+					 CNXK_DMA_POOL_MAX_CACHE_SZ, 0, rte_socket_id(), 0);
+
+	if (dpivf->chunk_pool == NULL) {
+		plt_err("Unable to create chunkpool.");
+		return -ENOMEM;
+	}
+
+	rc = rte_mempool_set_ops_byname(dpivf->chunk_pool, rte_mbuf_platform_mempool_ops(), NULL);
+	if (rc < 0) {
+		plt_err("Unable to set chunkpool ops");
+		goto free;
+	}
+
+	rc = rte_mempool_populate_default(dpivf->chunk_pool);
+	if (rc < 0) {
+		plt_err("Unable to populate chunkpool.");
+		goto free;
+	}
+	dpivf->aura = roc_npa_aura_handle_to_aura(dpivf->chunk_pool->pool_id);
+
+	return 0;
+
+free:
+	rte_mempool_free(dpivf->chunk_pool);
+	return rc;
+}
+
 static int
 cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
 {
 	struct cnxk_dpi_vf_s *dpivf = NULL;
+	void *chunk;
 	int rc = 0;

 	RTE_SET_USED(conf_sz);
@@ -92,12 +137,29 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	if (dpivf->flag & CNXK_DPI_DEV_CONFIG)
 		return rc;

-	rc = roc_dpi_configure(&dpivf->rdpi);
+	rc = cnxk_dmadev_chunk_pool_create(dev);
+	if (rc < 0) {
+		plt_err("DMA pool configure failed err = %d", rc);
+		goto done;
+	}
+
+	rc = rte_mempool_get(dpivf->chunk_pool, &chunk);
+	if (rc < 0) {
+		plt_err("DMA failed to get chunk pointer err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}
+
+	rc = roc_dpi_configure(&dpivf->rdpi, DPI_CMD_QUEUE_BUF_SIZE, dpivf->aura, (uint64_t)chunk);
 	if (rc < 0) {
 		plt_err("DMA configure failed err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
 		goto done;
 	}

+	dpivf->chunk_base = chunk;
+	dpivf->chunk_head = 0;
+	dpivf->chunk_size_m1 = (DPI_CMD_QUEUE_BUF_SIZE >> 3) - 2;
 	dpivf->flag |= CNXK_DPI_DEV_CONFIG;

 done:
@@ -335,7 +397,7 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 }

 static inline int
-__dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
+__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
 {
 	uint64_t *ptr = dpi->chunk_base;

@@ -346,31 +408,25 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 	 * Normally there is plenty of room in the current buffer for the
 	 * command
 	 */
-	if (dpi->chunk_head + cmd_count < dpi->pool_size_m1) {
+	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
 		ptr += dpi->chunk_head;
 		dpi->chunk_head += cmd_count;
 		while (cmd_count--)
 			*ptr++ = *cmds++;
 	} else {
+		uint64_t *new_buff = NULL;
 		int count;
-		uint64_t *new_buff = dpi->chunk_next;
-
-		dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-		if (!dpi->chunk_next) {
-			plt_dp_dbg("Failed to alloc next buffer from NPA");

-			/* NPA failed to allocate a buffer. Restoring chunk_next
-			 * to its original address.
-			 */
-			dpi->chunk_next = new_buff;
-			return -ENOSPC;
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOMEM;
 		}

 		/*
 		 * Figure out how many cmd words will fit in this buffer.
 		 * One location will be needed for the next buffer pointer.
 		 */
-		count = dpi->pool_size_m1 - dpi->chunk_head;
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
 		ptr += dpi->chunk_head;
 		cmd_count -= count;
 		while (count--)
@@ -395,19 +451,11 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 			*ptr++ = *cmds++;

 		/* queue index may be greater than pool size */
-		if (dpi->chunk_head >= dpi->pool_size_m1) {
-			new_buff = dpi->chunk_next;
-			dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-			if (!dpi->chunk_next) {
-				plt_dp_dbg("Failed to alloc next buffer from NPA");
-
-				/* NPA failed to allocate a buffer. Restoring chunk_next
-				 * to its original address.
-				 */
-				dpi->chunk_next = new_buff;
-				return -ENOSPC;
+		if (dpi->chunk_head == dpi->chunk_size_m1) {
+			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+				plt_dpi_dbg("Failed to alloc next buffer from NPA");
+				return -ENOMEM;
 			}
-
 			/* Write next buffer address */
 			*ptr = (uint64_t)new_buff;
 			dpi->chunk_base = new_buff;
@@ -465,7 +513,7 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -537,7 +585,7 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -593,7 +641,7 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -656,7 +704,7 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 254e7fea20..65f12d844d 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -12,12 +12,15 @@
 #define DPI_MAX_DESC	     2048
 #define DPI_MIN_DESC	     2
 #define MAX_VCHANS_PER_QUEUE 4
+#define DPI_CMD_QUEUE_BUF_SIZE 4096
+#define DPI_CMD_QUEUE_BUFS     1024

 /* Completion data is set to 0xFF when a request is submitted; upon
  * successful completion the engine resets it to the completion status.
  */
 #define DPI_REQ_CDATA 0xFF

+#define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
 #define CNXK_DPI_DEV_CONFIG (1ULL << 0)
 #define CNXK_DPI_DEV_START  (1ULL << 1)

@@ -45,8 +48,13 @@ struct cnxk_dpi_conf {
 };

 struct cnxk_dpi_vf_s {
-	struct roc_dpi rdpi;
+	uint64_t *chunk_base;
+	uint16_t chunk_head;
+	uint16_t chunk_size_m1;
+	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	struct roc_dpi rdpi;
+	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
 } __plt_cache_aligned;
--
2.25.1


* [PATCH v3 2/2] dma/cnxk: rewrite DMA fastpath
  2023-08-30 16:54   ` [PATCH v3 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
@ 2023-08-30 16:54     ` pbhagavatula
  2023-08-31  5:32     ` [PATCH v4 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  1 sibling, 0 replies; 18+ messages in thread
From: pbhagavatula @ 2023-08-30 16:54 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the configuration.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
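The core of this rewrite is copying descriptor words in 128-bit pairs
instead of one 64-bit word at a time. A minimal standalone sketch of the
idea (simplified from the __dpi_cpy_vector() helper added below; assumes
an arm64 target and an even word count):

#include <stdint.h>
#include <arm_neon.h>

static inline void
dpi_cpy_2dw(const uint64_t *src, uint64_t *dst, uint8_t n)
{
	uint8_t i;

	/* Move two 64-bit command words per iteration with 128-bit ld/st. */
	for (i = 0; i < n; i += 2)
		vst1q_u64(&dst[i], vld1q_u64(&src[i]));
}
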
 drivers/dma/cnxk/cnxk_dmadev.c    | 454 +++--------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  89 +++++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 455 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   9 +-
 4 files changed, 577 insertions(+), 430 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index 35c2b79156..465290ce7a 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,19 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */
 
-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mbuf_pool_ops.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>
 
 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -166,22 +153,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return rc;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -217,57 +191,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -303,6 +231,29 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+	if (dpivf->flag & CNXK_DPI_DEV_START)
+		return 0;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -329,6 +280,7 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
+
 		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
 	}
 
@@ -374,6 +326,11 @@ static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
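+	/* Drain in-flight commands: wait for bit 63 of VDMA_SADDR to be set. */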
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
 
 	roc_dpi_disable(&dpivf->rdpi);
 	dpivf->flag &= ~CNXK_DPI_DEV_START;
@@ -396,332 +353,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -880,17 +511,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -941,12 +561,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 65f12d844d..c9032de779 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,14 +4,27 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H
 
+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>
 
-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define DPI_MAX_POINTER	       15
+#define STRM_INC(s, var)       ((s).var = ((s).var + 1) & (s).max_cnt)
+#define STRM_DEC(s, var)       ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
+#define DPI_MAX_DESC	       2048
+#define DPI_MIN_DESC	       2
+#define MAX_VCHANS_PER_QUEUE   4
 #define DPI_CMD_QUEUE_BUF_SIZE 4096
 #define DPI_CMD_QUEUE_BUFS     1024
 
@@ -21,8 +34,51 @@
 #define DPI_REQ_CDATA 0xFF
 
 #define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_DEV_CONFIG	   (1ULL << 0)
+#define CNXK_DPI_DEV_START	   (1ULL << 1)
+
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
 
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -37,26 +93,39 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
 
 struct cnxk_dpi_vf_s {
+	/* Fast path */
 	uint64_t *chunk_base;
 	uint16_t chunk_head;
 	uint16_t chunk_size_m1;
 	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..db1e57bf51
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
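+/* A command is a 4-dword header plus two dwords (length, addr) per pointer. */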
+#define DMA_DW_PER_SINGLE_CMD 8
+#define DMA_HDR_LEN	      4
+#define DMA_CMD_LEN(src, dst) (DMA_HDR_LEN + (src << 1) + (dst << 1))
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + DMA_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, DMA_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += DMA_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		__dpi_cpy_scalar(cmd + count, ptr, DMA_DW_PER_SINGLE_CMD - count);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = DMA_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = DMA_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, DMA_HDR_LEN);
+		ptr += DMA_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
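+		/* Only "count" words remain in the old chunk; if even the
+		 * 4-dword header does not fit, split it across the two chunks.
+		 */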
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
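+	/* Bail out early if the completion ring is full. */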
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
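+	/* Word 0: one first pointer (nfst, bit 48) and one last pointer (nlst, bit 54). */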
+	cmd[0] = (1UL << 54) | (1UL << 48);
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
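+		/* A single doorbell submits this op and any previously deferred ops. */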
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
+	hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
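+	/* Word 0: header flags from cmd.u plus nfst = 1 (bit 0) and nlst = 1 (bit 6). */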
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += 8;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..e557349368 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -1,6 +1,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(C) 2021 Marvell International Ltd.
 
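+# cnxk_dmadev_fp.c leaves some command words unwritten; silence -Wuninitialized.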
+error_cflags = ['-Wno-uninitialized']
+foreach flag: error_cflags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1


* [PATCH v4 1/2] dma/cnxk: use mempool for DMA chunk pool
  2023-08-30 16:54   ` [PATCH v3 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  2023-08-30 16:54     ` [PATCH v3 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-08-31  5:32     ` pbhagavatula
  2023-08-31  5:32       ` [PATCH v4 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
  2023-09-05 16:19       ` [PATCH v5 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  1 sibling, 2 replies; 18+ messages in thread
From: pbhagavatula @ 2023-08-31  5:32 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Vamsi Attunuru
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use rte_mempool for the DMA chunk pool to allow use of the mempool cache.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
Depends-on: 29324

 v4 Changes:
 - Fix clang build.

 v3 Changes:
 - Fix build.
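
A minimal sketch of the new allocation scheme using the generic mempool
API (the names and sizes here are illustrative, not the driver's exact
code; the real helper is cnxk_dmadev_chunk_pool_create() below):

#include <rte_lcore.h>
#include <rte_mempool.h>

#define CHUNK_SZ  4096 /* one DMA command chunk */
#define NB_CHUNKS 1024
#define CACHE_SZ  16

/* Illustrative helper: create a pool of fixed-size command chunks with a
 * per-lcore cache; extra elements keep the caches from draining the pool.
 */
static struct rte_mempool *
chunk_pool_create(const char *name)
{
	unsigned int n = NB_CHUNKS + CACHE_SZ * rte_lcore_count();

	return rte_mempool_create(name, n, CHUNK_SZ, CACHE_SZ, 0,
				  NULL, NULL, NULL, NULL,
				  rte_socket_id(), 0);
}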

 drivers/common/cnxk/roc_dpi.c      |  95 +++++--------------------
 drivers/common/cnxk/roc_dpi.h      |  28 +-------
 drivers/common/cnxk/roc_dpi_priv.h |   3 -
 drivers/common/cnxk/roc_platform.c |   1 +
 drivers/common/cnxk/roc_platform.h |   2 +
 drivers/common/cnxk/version.map    |   1 +
 drivers/dma/cnxk/cnxk_dmadev.c     | 108 +++++++++++++++++++++--------
 drivers/dma/cnxk/cnxk_dmadev.h     |  10 ++-
 8 files changed, 110 insertions(+), 138 deletions(-)

diff --git a/drivers/common/cnxk/roc_dpi.c b/drivers/common/cnxk/roc_dpi.c
index 0e2f803077..9cb479371a 100644
--- a/drivers/common/cnxk/roc_dpi.c
+++ b/drivers/common/cnxk/roc_dpi.c
@@ -1,14 +1,14 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(C) 2021 Marvell.
  */
+
+#include "roc_api.h"
+#include "roc_priv.h"
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>

-#include "roc_api.h"
-#include "roc_priv.h"
-
 #define DPI_PF_MBOX_SYSFS_ENTRY "dpi_device_config"

 static inline int
@@ -52,17 +52,12 @@ roc_dpi_disable(struct roc_dpi *dpi)
 }

 int
-roc_dpi_configure(struct roc_dpi *roc_dpi)
+roc_dpi_configure(struct roc_dpi *roc_dpi, uint32_t chunk_sz, uint64_t aura, uint64_t chunk_base)
 {
 	struct plt_pci_device *pci_dev;
-	const struct plt_memzone *dpi_mz;
 	dpi_mbox_msg_t mbox_msg;
-	struct npa_pool_s pool;
-	struct npa_aura_s aura;
-	int rc, count, buflen;
-	uint64_t aura_handle;
-	plt_iova_t iova;
-	char name[32];
+	uint64_t reg;
+	int rc;

 	if (!roc_dpi) {
 		plt_err("roc_dpi is NULL");
@@ -70,80 +65,31 @@ roc_dpi_configure(struct roc_dpi *roc_dpi)
 	}

 	pci_dev = roc_dpi->pci_dev;
-	memset(&pool, 0, sizeof(struct npa_pool_s));
-	pool.nat_align = 1;
-
-	memset(&aura, 0, sizeof(aura));
-	rc = roc_npa_pool_create(&aura_handle, DPI_CMD_QUEUE_SIZE,
-				 DPI_CMD_QUEUE_BUFS, &aura, &pool, 0);
-	if (rc) {
-		plt_err("Failed to create NPA pool, err %d\n", rc);
-		return rc;
-	}
-
-	snprintf(name, sizeof(name), "dpimem%d:%d:%d:%d", pci_dev->addr.domain, pci_dev->addr.bus,
-		 pci_dev->addr.devid, pci_dev->addr.function);
-	buflen = DPI_CMD_QUEUE_SIZE * DPI_CMD_QUEUE_BUFS;
-	dpi_mz = plt_memzone_reserve_aligned(name, buflen, 0, DPI_CMD_QUEUE_SIZE);
-	if (dpi_mz == NULL) {
-		plt_err("dpi memzone reserve failed");
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	roc_dpi->mz = dpi_mz;
-	iova = dpi_mz->iova;
-	for (count = 0; count < DPI_CMD_QUEUE_BUFS; count++) {
-		roc_npa_aura_op_free(aura_handle, 0, iova);
-		iova += DPI_CMD_QUEUE_SIZE;
-	}
-
-	roc_dpi->chunk_base = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_base) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}

-	roc_dpi->chunk_next = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_next) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
-
-	roc_dpi->aura_handle = aura_handle;
-	/* subtract 2 as they have already been alloc'ed above */
-	roc_dpi->pool_size_m1 = (DPI_CMD_QUEUE_SIZE >> 3) - 2;
+	roc_dpi_disable(roc_dpi);
+	reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);

 	plt_write64(0x0, roc_dpi->rbase + DPI_VDMA_REQQ_CTL);
-	plt_write64(((uint64_t)(roc_dpi->chunk_base) >> 7) << 7,
-		    roc_dpi->rbase + DPI_VDMA_SADDR);
+	plt_write64(chunk_base, roc_dpi->rbase + DPI_VDMA_SADDR);
 	mbox_msg.u[0] = 0;
 	mbox_msg.u[1] = 0;
 	/* DPI PF driver expects vfid starts from index 0 */
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_OPEN;
-	mbox_msg.s.csize = DPI_CMD_QUEUE_SIZE;
-	mbox_msg.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	mbox_msg.s.csize = chunk_sz;
+	mbox_msg.s.aura = aura;
 	mbox_msg.s.sso_pf_func = idev_sso_pffunc_get();
 	mbox_msg.s.npa_pf_func = idev_npa_pffunc_get();

 	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
 			    sizeof(dpi_mbox_msg_t));
-	if (rc < 0) {
+	if (rc < 0)
 		plt_err("Failed to send mbox message %d to DPI PF, err %d",
 			mbox_msg.s.cmd, rc);
-		goto err2;
-	}

 	return rc;
-
-err2:
-	plt_memzone_free(dpi_mz);
-err1:
-	roc_npa_pool_destroy(aura_handle);
-	return rc;
 }

 int
@@ -153,11 +99,9 @@ roc_dpi_dev_init(struct roc_dpi *roc_dpi)
 	uint16_t vfid;

 	roc_dpi->rbase = pci_dev->mem_resource[0].addr;
-	vfid = ((pci_dev->addr.devid & 0x1F) << 3) |
-	       (pci_dev->addr.function & 0x7);
+	vfid = ((pci_dev->addr.devid & 0x1F) << 3) | (pci_dev->addr.function & 0x7);
 	vfid -= 1;
 	roc_dpi->vfid = vfid;
-	plt_spinlock_init(&roc_dpi->chunk_lock);

 	return 0;
 }
@@ -180,14 +124,9 @@ roc_dpi_dev_fini(struct roc_dpi *roc_dpi)
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_CLOSE;

-	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
-			    sizeof(dpi_mbox_msg_t));
+	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg, sizeof(dpi_mbox_msg_t));
 	if (rc < 0)
-		plt_err("Failed to send mbox message %d to DPI PF, err %d",
-			mbox_msg.s.cmd, rc);
-
-	roc_npa_pool_destroy(roc_dpi->aura_handle);
-	plt_memzone_free(roc_dpi->mz);
+		plt_err("Failed to send mbox message %d to DPI PF, err %d", mbox_msg.s.cmd, rc);

 	return rc;
 }
diff --git a/drivers/common/cnxk/roc_dpi.h b/drivers/common/cnxk/roc_dpi.h
index 2f061b07c5..4ebde5b8a6 100644
--- a/drivers/common/cnxk/roc_dpi.h
+++ b/drivers/common/cnxk/roc_dpi.h
@@ -5,41 +5,17 @@
 #ifndef _ROC_DPI_H_
 #define _ROC_DPI_H_

-struct roc_dpi_args {
-	uint8_t num_ssegs;
-	uint8_t num_dsegs;
-	uint8_t comp_type;
-	uint8_t direction;
-	uint8_t sdevice;
-	uint8_t ddevice;
-	uint8_t swap;
-	uint8_t use_lock : 1;
-	uint8_t tt : 7;
-	uint16_t func;
-	uint16_t grp;
-	uint32_t tag;
-	uint64_t comp_ptr;
-};
-
 struct roc_dpi {
-	/* Input parameters */
 	struct plt_pci_device *pci_dev;
-	/* End of Input parameters */
-	const struct plt_memzone *mz;
 	uint8_t *rbase;
 	uint16_t vfid;
-	uint16_t pool_size_m1;
-	uint16_t chunk_head;
-	uint64_t *chunk_base;
-	uint64_t *chunk_next;
-	uint64_t aura_handle;
-	plt_spinlock_t chunk_lock;
 } __plt_cache_aligned;

 int __roc_api roc_dpi_dev_init(struct roc_dpi *roc_dpi);
 int __roc_api roc_dpi_dev_fini(struct roc_dpi *roc_dpi);

-int __roc_api roc_dpi_configure(struct roc_dpi *dpi);
+int __roc_api roc_dpi_configure(struct roc_dpi *dpi, uint32_t chunk_sz, uint64_t aura,
+				uint64_t chunk_base);
 int __roc_api roc_dpi_enable(struct roc_dpi *dpi);
 int __roc_api roc_dpi_disable(struct roc_dpi *dpi);

diff --git a/drivers/common/cnxk/roc_dpi_priv.h b/drivers/common/cnxk/roc_dpi_priv.h
index 1fa1a715d3..518a3e7351 100644
--- a/drivers/common/cnxk/roc_dpi_priv.h
+++ b/drivers/common/cnxk/roc_dpi_priv.h
@@ -16,9 +16,6 @@
 #define DPI_REG_DUMP	0x3
 #define DPI_GET_REG_CFG 0x4

-#define DPI_CMD_QUEUE_SIZE 4096
-#define DPI_CMD_QUEUE_BUFS 1024
-
 typedef union dpi_mbox_msg_t {
 	uint64_t u[2];
 	struct dpi_mbox_message_s {
diff --git a/drivers/common/cnxk/roc_platform.c b/drivers/common/cnxk/roc_platform.c
index f91b95ceab..f8287bcf6b 100644
--- a/drivers/common/cnxk/roc_platform.c
+++ b/drivers/common/cnxk/roc_platform.c
@@ -70,4 +70,5 @@ RTE_LOG_REGISTER(cnxk_logtype_npc, pmd.net.cnxk.flow, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_sso, pmd.event.cnxk, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_tim, pmd.event.cnxk.timer, NOTICE);
 RTE_LOG_REGISTER(cnxk_logtype_tm, pmd.net.cnxk.tm, NOTICE);
+RTE_LOG_REGISTER(cnxk_logtype_dpi, pmd.dma.cnxk.dpi, NOTICE);
 RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ree, NOTICE);
diff --git a/drivers/common/cnxk/roc_platform.h b/drivers/common/cnxk/roc_platform.h
index 08f83aba12..dfd4da21b6 100644
--- a/drivers/common/cnxk/roc_platform.h
+++ b/drivers/common/cnxk/roc_platform.h
@@ -242,6 +242,7 @@ extern int cnxk_logtype_sso;
 extern int cnxk_logtype_tim;
 extern int cnxk_logtype_tm;
 extern int cnxk_logtype_ree;
+extern int cnxk_logtype_dpi;

 #define plt_err(fmt, args...)                                                  \
 	RTE_LOG(ERR, PMD, "%s():%u " fmt "\n", __func__, __LINE__, ##args)
@@ -270,6 +271,7 @@ extern int cnxk_logtype_ree;
 #define plt_tim_dbg(fmt, ...)	plt_dbg(tim, fmt, ##__VA_ARGS__)
 #define plt_tm_dbg(fmt, ...)	plt_dbg(tm, fmt, ##__VA_ARGS__)
 #define plt_ree_dbg(fmt, ...)	plt_dbg(ree, fmt, ##__VA_ARGS__)
+#define plt_dpi_dbg(fmt, ...)	plt_dbg(dpi, fmt, ##__VA_ARGS__)

 /* Datapath logs */
 #define plt_dp_err(fmt, args...)                                               \
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8c71497df8..1540dfadf9 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -7,6 +7,7 @@ INTERNAL {
 	cnxk_ipsec_outb_roundup_byte;
 	cnxk_logtype_base;
 	cnxk_logtype_cpt;
+	cnxk_logtype_dpi;
 	cnxk_logtype_mbox;
 	cnxk_logtype_ml;
 	cnxk_logtype_nix;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index eec6a897e2..35c2b79156 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -11,6 +11,7 @@
 #include <rte_dmadev_pmd.h>
 #include <rte_eal.h>
 #include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_mempool.h>
 #include <rte_pci.h>

@@ -70,10 +71,54 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 	return 0;
 }

+static int
+cnxk_dmadev_chunk_pool_create(struct rte_dma_dev *dev)
+{
+	char pool_name[RTE_MEMPOOL_NAMESIZE];
+	struct cnxk_dpi_vf_s *dpivf = NULL;
+	uint64_t nb_chunks;
+	int rc;
+
+	dpivf = dev->fp_obj->dev_private;
+	/* Create chunk pool. */
+	snprintf(pool_name, sizeof(pool_name), "cnxk_dma_chunk_pool%d", dev->data->dev_id);
+
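+	/* Provision extra chunks so per-lcore mempool caches cannot starve the queue. */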
+	nb_chunks = DPI_CMD_QUEUE_BUFS;
+	nb_chunks += (CNXK_DMA_POOL_MAX_CACHE_SZ * rte_lcore_count());
+	dpivf->chunk_pool =
+		rte_mempool_create_empty(pool_name, nb_chunks, DPI_CMD_QUEUE_BUF_SIZE,
+					 CNXK_DMA_POOL_MAX_CACHE_SZ, 0, rte_socket_id(), 0);
+
+	if (dpivf->chunk_pool == NULL) {
+		plt_err("Unable to create chunkpool.");
+		return -ENOMEM;
+	}
+
+	rc = rte_mempool_set_ops_byname(dpivf->chunk_pool, rte_mbuf_platform_mempool_ops(), NULL);
+	if (rc < 0) {
+		plt_err("Unable to set chunkpool ops");
+		goto free;
+	}
+
+	rc = rte_mempool_populate_default(dpivf->chunk_pool);
+	if (rc < 0) {
+		plt_err("Unable to populate chunkpool.");
+		goto free;
+	}
+	dpivf->aura = roc_npa_aura_handle_to_aura(dpivf->chunk_pool->pool_id);
+
+	return 0;
+
+free:
+	rte_mempool_free(dpivf->chunk_pool);
+	return rc;
+}
+
 static int
 cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
 {
 	struct cnxk_dpi_vf_s *dpivf = NULL;
+	void *chunk;
 	int rc = 0;

 	RTE_SET_USED(conf_sz);
@@ -92,12 +137,29 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	if (dpivf->flag & CNXK_DPI_DEV_CONFIG)
 		return rc;

-	rc = roc_dpi_configure(&dpivf->rdpi);
+	rc = cnxk_dmadev_chunk_pool_create(dev);
+	if (rc < 0) {
+		plt_err("DMA pool configure failed err = %d", rc);
+		goto done;
+	}
+
+	rc = rte_mempool_get(dpivf->chunk_pool, &chunk);
+	if (rc < 0) {
+		plt_err("DMA failed to get chunk pointer err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}
+
+	rc = roc_dpi_configure(&dpivf->rdpi, DPI_CMD_QUEUE_BUF_SIZE, dpivf->aura, (uint64_t)chunk);
 	if (rc < 0) {
 		plt_err("DMA configure failed err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
 		goto done;
 	}

+	dpivf->chunk_base = chunk;
+	dpivf->chunk_head = 0;
+	dpivf->chunk_size_m1 = (DPI_CMD_QUEUE_BUF_SIZE >> 3) - 2;
 	dpivf->flag |= CNXK_DPI_DEV_CONFIG;

 done:
@@ -335,7 +397,7 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 }

 static inline int
-__dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
+__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
 {
 	uint64_t *ptr = dpi->chunk_base;

@@ -346,31 +408,25 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 	 * Normally there is plenty of room in the current buffer for the
 	 * command
 	 */
-	if (dpi->chunk_head + cmd_count < dpi->pool_size_m1) {
+	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
 		ptr += dpi->chunk_head;
 		dpi->chunk_head += cmd_count;
 		while (cmd_count--)
 			*ptr++ = *cmds++;
 	} else {
+		uint64_t *new_buff = NULL;
 		int count;
-		uint64_t *new_buff = dpi->chunk_next;
-
-		dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-		if (!dpi->chunk_next) {
-			plt_dp_dbg("Failed to alloc next buffer from NPA");

-			/* NPA failed to allocate a buffer. Restoring chunk_next
-			 * to its original address.
-			 */
-			dpi->chunk_next = new_buff;
-			return -ENOSPC;
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOMEM;
 		}

 		/*
 		 * Figure out how many cmd words will fit in this buffer.
 		 * One location will be needed for the next buffer pointer.
 		 */
-		count = dpi->pool_size_m1 - dpi->chunk_head;
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
 		ptr += dpi->chunk_head;
 		cmd_count -= count;
 		while (count--)
@@ -395,19 +451,11 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 			*ptr++ = *cmds++;

 		/* queue index may be greater than pool size */
-		if (dpi->chunk_head >= dpi->pool_size_m1) {
-			new_buff = dpi->chunk_next;
-			dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-			if (!dpi->chunk_next) {
-				plt_dp_dbg("Failed to alloc next buffer from NPA");
-
-				/* NPA failed to allocate a buffer. Restoring chunk_next
-				 * to its original address.
-				 */
-				dpi->chunk_next = new_buff;
-				return -ENOSPC;
+		if (dpi->chunk_head == dpi->chunk_size_m1) {
+			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+				plt_dpi_dbg("Failed to alloc next buffer from NPA");
+				return -ENOMEM;
 			}
-
 			/* Write next buffer address */
 			*ptr = (uint64_t)new_buff;
 			dpi->chunk_base = new_buff;
@@ -465,7 +513,7 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -537,7 +585,7 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -593,7 +641,7 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -656,7 +704,7 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 254e7fea20..65f12d844d 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -12,12 +12,15 @@
 #define DPI_MAX_DESC	     2048
 #define DPI_MIN_DESC	     2
 #define MAX_VCHANS_PER_QUEUE 4
+#define DPI_CMD_QUEUE_BUF_SIZE 4096
+#define DPI_CMD_QUEUE_BUFS     1024

 /* Completion data is set to 0xFF when a request is submitted; upon
  * successful completion the engine resets it to the completion status.
  */
 #define DPI_REQ_CDATA 0xFF

+#define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
 #define CNXK_DPI_DEV_CONFIG (1ULL << 0)
 #define CNXK_DPI_DEV_START  (1ULL << 1)

@@ -45,8 +48,13 @@ struct cnxk_dpi_conf {
 };

 struct cnxk_dpi_vf_s {
-	struct roc_dpi rdpi;
+	uint64_t *chunk_base;
+	uint16_t chunk_head;
+	uint16_t chunk_size_m1;
+	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	struct roc_dpi rdpi;
+	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
 } __plt_cache_aligned;
--
2.25.1


* [PATCH v4 2/2] dma/cnxk: rewrite DMA fastpath
  2023-08-31  5:32     ` [PATCH v4 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
@ 2023-08-31  5:32       ` pbhagavatula
  2023-09-05 15:58         ` Jerin Jacob
  2023-09-05 16:19       ` [PATCH v5 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  1 sibling, 1 reply; 18+ messages in thread
From: pbhagavatula @ 2023-08-31  5:32 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the configuration.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
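On the application side, the deferred-doorbell path kept by this rewrite
corresponds to enqueuing several copies without RTE_DMA_OP_FLAG_SUBMIT and
ringing the doorbell once. A minimal usage sketch with the generic dmadev
API (dev_id, vchan and the address arrays are assumptions):

#include <rte_dmadev.h>

/* Enqueue a burst of copies, then submit them with one doorbell. */
static int
copy_burst(int16_t dev_id, uint16_t vchan, const rte_iova_t *src,
	   const rte_iova_t *dst, uint32_t len, int n)
{
	int i, rc;

	for (i = 0; i < n; i++) {
		/* No SUBMIT flag: the PMD only accumulates pending words. */
		rc = rte_dma_copy(dev_id, vchan, src[i], dst[i], len, 0);
		if (rc < 0)
			return rc;
	}

	return rte_dma_submit(dev_id, vchan); /* one doorbell for the burst */
}
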
 drivers/dma/cnxk/cnxk_dmadev.c    | 454 +++--------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  89 +++++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 455 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   9 +-
 4 files changed, 577 insertions(+), 430 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index 35c2b79156..465290ce7a 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,19 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */
 
-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mbuf_pool_ops.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>
 
 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -166,22 +153,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return rc;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -217,57 +191,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -303,6 +231,29 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+	if (dpivf->flag & CNXK_DPI_DEV_START)
+		return 0;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -329,6 +280,7 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
+
 		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
 	}
 
@@ -374,6 +326,11 @@ static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
 
 	roc_dpi_disable(&dpivf->rdpi);
 	dpivf->flag &= ~CNXK_DPI_DEV_START;
@@ -396,332 +353,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -880,17 +511,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -941,12 +561,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 65f12d844d..c9032de779 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,14 +4,27 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H
 
+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>
 
-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define DPI_MAX_POINTER	       15
+#define STRM_INC(s, var)       ((s).var = ((s).var + 1) & (s).max_cnt)
+#define STRM_DEC(s, var)       ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
+#define DPI_MAX_DESC	       2048
+#define DPI_MIN_DESC	       2
+#define MAX_VCHANS_PER_QUEUE   4
 #define DPI_CMD_QUEUE_BUF_SIZE 4096
 #define DPI_CMD_QUEUE_BUFS     1024
 
@@ -21,8 +34,51 @@
 #define DPI_REQ_CDATA 0xFF
 
 #define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_DEV_CONFIG	   (1ULL << 0)
+#define CNXK_DPI_DEV_START	   (1ULL << 1)
+
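+/*
+ * Cached copy of the single header word the fastpath needs: word 1 of
+ * the cn9k instruction header, word 0 of the cn10k one.
+ */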
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
 
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -37,26 +93,39 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
 
 struct cnxk_dpi_vf_s {
+	/* Fast path*/
 	uint64_t *chunk_base;
 	uint16_t chunk_head;
 	uint16_t chunk_size_m1;
 	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..d1f27ba2a6
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+#define DMA_DW_PER_SINGLE_CMD 8
+#define DMA_HDR_LEN	      4
+#define DMA_CMD_LEN(src, dst) (DMA_HDR_LEN + (src << 1) + (dst << 1))
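+/* A command is 4 header words plus a {length, addr} pair per pointer. */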
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
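+	/*
+	 * rte_dma_sge is laid out as {addr, length}; the command words
+	 * expect {length, addr}, so swap the lanes and mask the length
+	 * lane down to 24 bits.
+	 */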
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#else
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + DMA_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, DMA_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += DMA_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		__dpi_cpy_scalar(cmd + count, ptr, DMA_DW_PER_SINGLE_CMD - count);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = DMA_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = DMA_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, DMA_HDR_LEN);
+		ptr += DMA_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
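+	/* The ring is full when advancing tail would collide with head. */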
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
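+	/* Header word 0: nlst = 1 at bit 54, nfst = 1 at bit 48. */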
+	cmd[0] = (1UL << 54) | (1UL << 48);
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
+	hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += 8;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..e557349368 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -1,6 +1,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(C) 2021 Marvell International Ltd.
 
+error_cflags = ['-Wno-uninitialized']
+foreach flag: error_cflags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v4 2/2] dma/cnxk: rewrite DMA fastpath
  2023-08-31  5:32       ` [PATCH v4 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-09-05 15:58         ` Jerin Jacob
  0 siblings, 0 replies; 18+ messages in thread
From: Jerin Jacob @ 2023-09-05 15:58 UTC (permalink / raw)
  To: pbhagavatula; +Cc: jerinj, Vamsi Attunuru, dev

On Thu, Aug 31, 2023 at 11:15 AM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Rewrite DMA fastpath to use NEON instructions and reduce number
> of words read from config.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Please fix

### [PATCH] dma/cnxk: use mempool for DMA chunk pool

Warning in drivers/common/cnxk/roc_platform.c:
Using RTE_LOG_REGISTER, prefer RTE_LOG_REGISTER_(DEFAULT|SUFFIX)
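
For reference, the change the warning asks for looks like this (taken
from the v5 roc_platform.c diff):

    /* Flagged by checkpatch: explicit log name argument */
    RTE_LOG_REGISTER(cnxk_logtype_base, pmd.cnxk.base, NOTICE);

    /* Preferred: the log name is derived from the component */
    RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_base, NOTICE);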

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v5 1/2] dma/cnxk: use mempool for DMA chunk pool
  2023-08-31  5:32     ` [PATCH v4 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  2023-08-31  5:32       ` [PATCH v4 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-09-05 16:19       ` pbhagavatula
  2023-09-05 16:19         ` [PATCH v5 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
  2023-09-09 16:32         ` [PATCH v6 1/2] dma/cnxk: rework DMA driver pbhagavatula
  1 sibling, 2 replies; 18+ messages in thread
From: pbhagavatula @ 2023-09-05 16:19 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Vamsi Attunuru
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use rte_mempool for the DMA chunk pool so that the mempool cache can
be used. Convert the logtype registration macro for all cnxk drivers
to RTE_LOG_REGISTER_DEFAULT.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
Depends-on: 29324

 v5 Changes:
 - Use RTE_LOG_REGISTER_DEFAULT for registering logging.
 v4 Changes:
 - Fix clang build.
 v3 Changes:
 - Fix build.

 drivers/common/cnxk/roc_dpi.c      |  95 +++++--------------------
 drivers/common/cnxk/roc_dpi.h      |  28 +-------
 drivers/common/cnxk/roc_dpi_priv.h |   3 -
 drivers/common/cnxk/roc_platform.c |  21 +++---
 drivers/common/cnxk/roc_platform.h |   2 +
 drivers/common/cnxk/version.map    |   1 +
 drivers/dma/cnxk/cnxk_dmadev.c     | 108 +++++++++++++++++++++--------
 drivers/dma/cnxk/cnxk_dmadev.h     |  10 ++-
 8 files changed, 120 insertions(+), 148 deletions(-)

diff --git a/drivers/common/cnxk/roc_dpi.c b/drivers/common/cnxk/roc_dpi.c
index 0e2f803077..9cb479371a 100644
--- a/drivers/common/cnxk/roc_dpi.c
+++ b/drivers/common/cnxk/roc_dpi.c
@@ -1,14 +1,14 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(C) 2021 Marvell.
  */
+
+#include "roc_api.h"
+#include "roc_priv.h"
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>

-#include "roc_api.h"
-#include "roc_priv.h"
-
 #define DPI_PF_MBOX_SYSFS_ENTRY "dpi_device_config"

 static inline int
@@ -52,17 +52,12 @@ roc_dpi_disable(struct roc_dpi *dpi)
 }

 int
-roc_dpi_configure(struct roc_dpi *roc_dpi)
+roc_dpi_configure(struct roc_dpi *roc_dpi, uint32_t chunk_sz, uint64_t aura, uint64_t chunk_base)
 {
 	struct plt_pci_device *pci_dev;
-	const struct plt_memzone *dpi_mz;
 	dpi_mbox_msg_t mbox_msg;
-	struct npa_pool_s pool;
-	struct npa_aura_s aura;
-	int rc, count, buflen;
-	uint64_t aura_handle;
-	plt_iova_t iova;
-	char name[32];
+	uint64_t reg;
+	int rc;

 	if (!roc_dpi) {
 		plt_err("roc_dpi is NULL");
@@ -70,80 +65,31 @@ roc_dpi_configure(struct roc_dpi *roc_dpi)
 	}

 	pci_dev = roc_dpi->pci_dev;
-	memset(&pool, 0, sizeof(struct npa_pool_s));
-	pool.nat_align = 1;
-
-	memset(&aura, 0, sizeof(aura));
-	rc = roc_npa_pool_create(&aura_handle, DPI_CMD_QUEUE_SIZE,
-				 DPI_CMD_QUEUE_BUFS, &aura, &pool, 0);
-	if (rc) {
-		plt_err("Failed to create NPA pool, err %d\n", rc);
-		return rc;
-	}
-
-	snprintf(name, sizeof(name), "dpimem%d:%d:%d:%d", pci_dev->addr.domain, pci_dev->addr.bus,
-		 pci_dev->addr.devid, pci_dev->addr.function);
-	buflen = DPI_CMD_QUEUE_SIZE * DPI_CMD_QUEUE_BUFS;
-	dpi_mz = plt_memzone_reserve_aligned(name, buflen, 0, DPI_CMD_QUEUE_SIZE);
-	if (dpi_mz == NULL) {
-		plt_err("dpi memzone reserve failed");
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	roc_dpi->mz = dpi_mz;
-	iova = dpi_mz->iova;
-	for (count = 0; count < DPI_CMD_QUEUE_BUFS; count++) {
-		roc_npa_aura_op_free(aura_handle, 0, iova);
-		iova += DPI_CMD_QUEUE_SIZE;
-	}
-
-	roc_dpi->chunk_base = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_base) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}

-	roc_dpi->chunk_next = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_next) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
-
-	roc_dpi->aura_handle = aura_handle;
-	/* subtract 2 as they have already been alloc'ed above */
-	roc_dpi->pool_size_m1 = (DPI_CMD_QUEUE_SIZE >> 3) - 2;
+	roc_dpi_disable(roc_dpi);
+	reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);

 	plt_write64(0x0, roc_dpi->rbase + DPI_VDMA_REQQ_CTL);
-	plt_write64(((uint64_t)(roc_dpi->chunk_base) >> 7) << 7,
-		    roc_dpi->rbase + DPI_VDMA_SADDR);
+	plt_write64(chunk_base, roc_dpi->rbase + DPI_VDMA_SADDR);
 	mbox_msg.u[0] = 0;
 	mbox_msg.u[1] = 0;
 	/* DPI PF driver expects vfid starts from index 0 */
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_OPEN;
-	mbox_msg.s.csize = DPI_CMD_QUEUE_SIZE;
-	mbox_msg.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	mbox_msg.s.csize = chunk_sz;
+	mbox_msg.s.aura = aura;
 	mbox_msg.s.sso_pf_func = idev_sso_pffunc_get();
 	mbox_msg.s.npa_pf_func = idev_npa_pffunc_get();

 	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
 			    sizeof(dpi_mbox_msg_t));
-	if (rc < 0) {
+	if (rc < 0)
 		plt_err("Failed to send mbox message %d to DPI PF, err %d",
 			mbox_msg.s.cmd, rc);
-		goto err2;
-	}

 	return rc;
-
-err2:
-	plt_memzone_free(dpi_mz);
-err1:
-	roc_npa_pool_destroy(aura_handle);
-	return rc;
 }

 int
@@ -153,11 +99,9 @@ roc_dpi_dev_init(struct roc_dpi *roc_dpi)
 	uint16_t vfid;

 	roc_dpi->rbase = pci_dev->mem_resource[0].addr;
-	vfid = ((pci_dev->addr.devid & 0x1F) << 3) |
-	       (pci_dev->addr.function & 0x7);
+	vfid = ((pci_dev->addr.devid & 0x1F) << 3) | (pci_dev->addr.function & 0x7);
 	vfid -= 1;
 	roc_dpi->vfid = vfid;
-	plt_spinlock_init(&roc_dpi->chunk_lock);

 	return 0;
 }
@@ -180,14 +124,9 @@ roc_dpi_dev_fini(struct roc_dpi *roc_dpi)
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_CLOSE;

-	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
-			    sizeof(dpi_mbox_msg_t));
+	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg, sizeof(dpi_mbox_msg_t));
 	if (rc < 0)
-		plt_err("Failed to send mbox message %d to DPI PF, err %d",
-			mbox_msg.s.cmd, rc);
-
-	roc_npa_pool_destroy(roc_dpi->aura_handle);
-	plt_memzone_free(roc_dpi->mz);
+		plt_err("Failed to send mbox message %d to DPI PF, err %d", mbox_msg.s.cmd, rc);

 	return rc;
 }
diff --git a/drivers/common/cnxk/roc_dpi.h b/drivers/common/cnxk/roc_dpi.h
index 2f061b07c5..4ebde5b8a6 100644
--- a/drivers/common/cnxk/roc_dpi.h
+++ b/drivers/common/cnxk/roc_dpi.h
@@ -5,41 +5,17 @@
 #ifndef _ROC_DPI_H_
 #define _ROC_DPI_H_

-struct roc_dpi_args {
-	uint8_t num_ssegs;
-	uint8_t num_dsegs;
-	uint8_t comp_type;
-	uint8_t direction;
-	uint8_t sdevice;
-	uint8_t ddevice;
-	uint8_t swap;
-	uint8_t use_lock : 1;
-	uint8_t tt : 7;
-	uint16_t func;
-	uint16_t grp;
-	uint32_t tag;
-	uint64_t comp_ptr;
-};
-
 struct roc_dpi {
-	/* Input parameters */
 	struct plt_pci_device *pci_dev;
-	/* End of Input parameters */
-	const struct plt_memzone *mz;
 	uint8_t *rbase;
 	uint16_t vfid;
-	uint16_t pool_size_m1;
-	uint16_t chunk_head;
-	uint64_t *chunk_base;
-	uint64_t *chunk_next;
-	uint64_t aura_handle;
-	plt_spinlock_t chunk_lock;
 } __plt_cache_aligned;

 int __roc_api roc_dpi_dev_init(struct roc_dpi *roc_dpi);
 int __roc_api roc_dpi_dev_fini(struct roc_dpi *roc_dpi);

-int __roc_api roc_dpi_configure(struct roc_dpi *dpi);
+int __roc_api roc_dpi_configure(struct roc_dpi *dpi, uint32_t chunk_sz, uint64_t aura,
+				uint64_t chunk_base);
 int __roc_api roc_dpi_enable(struct roc_dpi *dpi);
 int __roc_api roc_dpi_disable(struct roc_dpi *dpi);

diff --git a/drivers/common/cnxk/roc_dpi_priv.h b/drivers/common/cnxk/roc_dpi_priv.h
index 1fa1a715d3..518a3e7351 100644
--- a/drivers/common/cnxk/roc_dpi_priv.h
+++ b/drivers/common/cnxk/roc_dpi_priv.h
@@ -16,9 +16,6 @@
 #define DPI_REG_DUMP	0x3
 #define DPI_GET_REG_CFG 0x4

-#define DPI_CMD_QUEUE_SIZE 4096
-#define DPI_CMD_QUEUE_BUFS 1024
-
 typedef union dpi_mbox_msg_t {
 	uint64_t u[2];
 	struct dpi_mbox_message_s {
diff --git a/drivers/common/cnxk/roc_platform.c b/drivers/common/cnxk/roc_platform.c
index f91b95ceab..63abdb2c86 100644
--- a/drivers/common/cnxk/roc_platform.c
+++ b/drivers/common/cnxk/roc_platform.c
@@ -60,14 +60,15 @@ roc_plt_init(void)
 	return 0;
 }

-RTE_LOG_REGISTER(cnxk_logtype_base, pmd.cnxk.base, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_mbox, pmd.cnxk.mbox, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_cpt, pmd.crypto.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_ml, pmd.ml.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_npa, pmd.mempool.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_nix, pmd.net.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_npc, pmd.net.cnxk.flow, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_sso, pmd.event.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_tim, pmd.event.cnxk.timer, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_tm, pmd.net.cnxk.tm, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_base, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_mbox, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_cpt, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ml, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npa, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_nix, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npc, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_sso, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_tim, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_tm, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_dpi, NOTICE);
 RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ree, NOTICE);
diff --git a/drivers/common/cnxk/roc_platform.h b/drivers/common/cnxk/roc_platform.h
index 08f83aba12..dfd4da21b6 100644
--- a/drivers/common/cnxk/roc_platform.h
+++ b/drivers/common/cnxk/roc_platform.h
@@ -242,6 +242,7 @@ extern int cnxk_logtype_sso;
 extern int cnxk_logtype_tim;
 extern int cnxk_logtype_tm;
 extern int cnxk_logtype_ree;
+extern int cnxk_logtype_dpi;

 #define plt_err(fmt, args...)                                                  \
 	RTE_LOG(ERR, PMD, "%s():%u " fmt "\n", __func__, __LINE__, ##args)
@@ -270,6 +271,7 @@ extern int cnxk_logtype_ree;
 #define plt_tim_dbg(fmt, ...)	plt_dbg(tim, fmt, ##__VA_ARGS__)
 #define plt_tm_dbg(fmt, ...)	plt_dbg(tm, fmt, ##__VA_ARGS__)
 #define plt_ree_dbg(fmt, ...)	plt_dbg(ree, fmt, ##__VA_ARGS__)
+#define plt_dpi_dbg(fmt, ...)	plt_dbg(dpi, fmt, ##__VA_ARGS__)

 /* Datapath logs */
 #define plt_dp_err(fmt, args...)                                               \
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8c71497df8..1540dfadf9 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -7,6 +7,7 @@ INTERNAL {
 	cnxk_ipsec_outb_roundup_byte;
 	cnxk_logtype_base;
 	cnxk_logtype_cpt;
+	cnxk_logtype_dpi;
 	cnxk_logtype_mbox;
 	cnxk_logtype_ml;
 	cnxk_logtype_nix;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index eec6a897e2..35c2b79156 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -11,6 +11,7 @@
 #include <rte_dmadev_pmd.h>
 #include <rte_eal.h>
 #include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_mempool.h>
 #include <rte_pci.h>

@@ -70,10 +71,54 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 	return 0;
 }

+static int
+cnxk_dmadev_chunk_pool_create(struct rte_dma_dev *dev)
+{
+	char pool_name[RTE_MEMPOOL_NAMESIZE];
+	struct cnxk_dpi_vf_s *dpivf = NULL;
+	uint64_t nb_chunks;
+	int rc;
+
+	dpivf = dev->fp_obj->dev_private;
+	/* Create chunk pool. */
+	snprintf(pool_name, sizeof(pool_name), "cnxk_dma_chunk_pool%d", dev->data->dev_id);
+
+	nb_chunks = DPI_CMD_QUEUE_BUFS;
+	nb_chunks += (CNXK_DMA_POOL_MAX_CACHE_SZ * rte_lcore_count());
+	dpivf->chunk_pool =
+		rte_mempool_create_empty(pool_name, nb_chunks, DPI_CMD_QUEUE_BUF_SIZE,
+					 CNXK_DMA_POOL_MAX_CACHE_SZ, 0, rte_socket_id(), 0);
+
+	if (dpivf->chunk_pool == NULL) {
+		plt_err("Unable to create chunkpool.");
+		return -ENOMEM;
+	}
+
+	rc = rte_mempool_set_ops_byname(dpivf->chunk_pool, rte_mbuf_platform_mempool_ops(), NULL);
+	if (rc < 0) {
+		plt_err("Unable to set chunkpool ops");
+		goto free;
+	}
+
+	rc = rte_mempool_populate_default(dpivf->chunk_pool);
+	if (rc < 0) {
+		plt_err("Unable to populate chunkpool.");
+		goto free;
+	}
+	dpivf->aura = roc_npa_aura_handle_to_aura(dpivf->chunk_pool->pool_id);
+
+	return 0;
+
+free:
+	rte_mempool_free(dpivf->chunk_pool);
+	return rc;
+}
+
 static int
 cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
 {
 	struct cnxk_dpi_vf_s *dpivf = NULL;
+	void *chunk;
 	int rc = 0;

 	RTE_SET_USED(conf_sz);
@@ -92,12 +137,29 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	if (dpivf->flag & CNXK_DPI_DEV_CONFIG)
 		return rc;

-	rc = roc_dpi_configure(&dpivf->rdpi);
+	rc = cnxk_dmadev_chunk_pool_create(dev);
+	if (rc < 0) {
+		plt_err("DMA pool configure failed err = %d", rc);
+		goto done;
+	}
+
+	rc = rte_mempool_get(dpivf->chunk_pool, &chunk);
+	if (rc < 0) {
+		plt_err("DMA failed to get chunk pointer err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}
+
+	rc = roc_dpi_configure(&dpivf->rdpi, DPI_CMD_QUEUE_BUF_SIZE, dpivf->aura, (uint64_t)chunk);
 	if (rc < 0) {
 		plt_err("DMA configure failed err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
 		goto done;
 	}

+	dpivf->chunk_base = chunk;
+	dpivf->chunk_head = 0;
+	dpivf->chunk_size_m1 = (DPI_CMD_QUEUE_BUF_SIZE >> 3) - 2;
 	dpivf->flag |= CNXK_DPI_DEV_CONFIG;

 done:
@@ -335,7 +397,7 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 }

 static inline int
-__dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
+__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
 {
 	uint64_t *ptr = dpi->chunk_base;

@@ -346,31 +408,25 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 	 * Normally there is plenty of room in the current buffer for the
 	 * command
 	 */
-	if (dpi->chunk_head + cmd_count < dpi->pool_size_m1) {
+	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
 		ptr += dpi->chunk_head;
 		dpi->chunk_head += cmd_count;
 		while (cmd_count--)
 			*ptr++ = *cmds++;
 	} else {
+		uint64_t *new_buff = NULL;
 		int count;
-		uint64_t *new_buff = dpi->chunk_next;
-
-		dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-		if (!dpi->chunk_next) {
-			plt_dp_dbg("Failed to alloc next buffer from NPA");

-			/* NPA failed to allocate a buffer. Restoring chunk_next
-			 * to its original address.
-			 */
-			dpi->chunk_next = new_buff;
-			return -ENOSPC;
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOMEM;
 		}

 		/*
 		 * Figure out how many cmd words will fit in this buffer.
 		 * One location will be needed for the next buffer pointer.
 		 */
-		count = dpi->pool_size_m1 - dpi->chunk_head;
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
 		ptr += dpi->chunk_head;
 		cmd_count -= count;
 		while (count--)
@@ -395,19 +451,11 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 			*ptr++ = *cmds++;

 		/* queue index may be greater than pool size */
-		if (dpi->chunk_head >= dpi->pool_size_m1) {
-			new_buff = dpi->chunk_next;
-			dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-			if (!dpi->chunk_next) {
-				plt_dp_dbg("Failed to alloc next buffer from NPA");
-
-				/* NPA failed to allocate a buffer. Restoring chunk_next
-				 * to its original address.
-				 */
-				dpi->chunk_next = new_buff;
-				return -ENOSPC;
+		if (dpi->chunk_head == dpi->chunk_size_m1) {
+			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+				plt_dpi_dbg("Failed to alloc next buffer from NPA");
+				return -ENOMEM;
 			}
-
 			/* Write next buffer address */
 			*ptr = (uint64_t)new_buff;
 			dpi->chunk_base = new_buff;
@@ -465,7 +513,7 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -537,7 +585,7 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -593,7 +641,7 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
@@ -656,7 +704,7 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
 		STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
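
The chunked command-queue write that both copy paths rely on follows
this pattern (a simplified standalone sketch, not the driver code;
alloc_chunk() stands in for rte_mempool_get() on the chunk pool):

    #include <stdint.h>
    #include <errno.h>

    struct chunk_q {
        uint64_t *base;   /* current chunk */
        uint16_t head;    /* next free word in the chunk */
        uint16_t size_m1; /* usable words; the tail slot holds the link */
    };

    extern uint64_t *alloc_chunk(void);

    static int
    queue_write(struct chunk_q *q, const uint64_t *cmd, int n)
    {
        uint64_t *ptr = q->base + q->head;
        uint64_t *next;
        int fit;

        if (q->head + n < q->size_m1) { /* fast path: command fits */
            q->head += n;
            while (n--)
                *ptr++ = *cmd++;
            return 0;
        }

        next = alloc_chunk();
        if (next == NULL)
            return -ENOMEM;

        fit = q->size_m1 - q->head; /* words left in this chunk */
        n -= fit;
        while (fit--)
            *ptr++ = *cmd++;
        *ptr = (uint64_t)next;      /* link old chunk to the new one */

        q->base = next;
        q->head = n;
        ptr = next;
        while (n--)                 /* spill the remainder */
            *ptr++ = *cmd++;
        return 0;
    }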
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 254e7fea20..65f12d844d 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -12,12 +12,15 @@
 #define DPI_MAX_DESC	     2048
 #define DPI_MIN_DESC	     2
 #define MAX_VCHANS_PER_QUEUE 4
+#define DPI_CMD_QUEUE_BUF_SIZE 4096
+#define DPI_CMD_QUEUE_BUFS     1024

 /* Set Completion data to 0xFF when request submitted,
  * upon successful request completion engine reset to completion status
  */
 #define DPI_REQ_CDATA 0xFF

+#define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
 #define CNXK_DPI_DEV_CONFIG (1ULL << 0)
 #define CNXK_DPI_DEV_START  (1ULL << 1)

@@ -45,8 +48,13 @@ struct cnxk_dpi_conf {
 };

 struct cnxk_dpi_vf_s {
-	struct roc_dpi rdpi;
+	uint64_t *chunk_base;
+	uint16_t chunk_head;
+	uint16_t chunk_size_m1;
+	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	struct roc_dpi rdpi;
+	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
 } __plt_cache_aligned;
--
2.25.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v5 2/2] dma/cnxk: rewrite DMA fastpath
  2023-09-05 16:19       ` [PATCH v5 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
@ 2023-09-05 16:19         ` pbhagavatula
  2023-09-09 16:32         ` [PATCH v6 1/2] dma/cnxk: rework DMA driver pbhagavatula
  1 sibling, 0 replies; 18+ messages in thread
From: pbhagavatula @ 2023-09-05 16:19 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the config.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/dma/cnxk/cnxk_dmadev.c    | 454 +++--------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  89 +++++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 455 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   9 +-
 4 files changed, 577 insertions(+), 430 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c
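
The core of the NEON rewrite is copying command words two at a time
with 128-bit loads and stores; a minimal sketch of the idea (assuming
an even word count):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Copy n 64-bit words, two per iteration, using 128-bit vectors. */
    static inline void
    cpy_vec(const uint64_t *src, uint64_t *dst, uint8_t n)
    {
        uint8_t i;

        for (i = 0; i < n; i += 2)
            vst1q_u64(&dst[i], vld1q_u64(&src[i]));
    }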

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index 35c2b79156..465290ce7a 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,19 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */
 
-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mbuf_pool_ops.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>
 
 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -166,22 +153,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return rc;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -217,57 +191,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -303,6 +231,29 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+	if (dpivf->flag & CNXK_DPI_DEV_START)
+		return 0;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -329,6 +280,7 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
+
 		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
 	}
 
@@ -374,6 +326,11 @@ static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
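+	/* Busy-poll SADDR until bit 63 is set before disabling the queue. */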
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
 
 	roc_dpi_disable(&dpivf->rdpi);
 	dpivf->flag &= ~CNXK_DPI_DEV_START;
@@ -396,332 +353,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -880,17 +511,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -941,12 +561,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 65f12d844d..c9032de779 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,14 +4,27 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H
 
+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>
 
-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define DPI_MAX_POINTER	       15
+#define STRM_INC(s, var)       ((s).var = ((s).var + 1) & (s).max_cnt)
+#define STRM_DEC(s, var)       ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
+#define DPI_MAX_DESC	       2048
+#define DPI_MIN_DESC	       2
+#define MAX_VCHANS_PER_QUEUE   4
 #define DPI_CMD_QUEUE_BUF_SIZE 4096
 #define DPI_CMD_QUEUE_BUFS     1024
 
@@ -21,8 +34,51 @@
 #define DPI_REQ_CDATA 0xFF
 
 #define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_DEV_CONFIG	   (1ULL << 0)
+#define CNXK_DPI_DEV_START	   (1ULL << 1)
+
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
 
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -37,26 +93,39 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
 
 struct cnxk_dpi_vf_s {
+	/* Fast path */
 	uint64_t *chunk_base;
 	uint16_t chunk_head;
 	uint16_t chunk_size_m1;
 	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..d1f27ba2a6
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+#define DMA_DW_PER_SINGLE_CMD 8
+#define DMA_HDR_LEN	      4
+#define DMA_CMD_LEN(src, dst) (DMA_HDR_LEN + (src << 1) + (dst << 1))
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#else
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + DMA_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, DMA_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += DMA_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		__dpi_cpy_scalar(cmd + count, ptr, DMA_DW_PER_SINGLE_CMD - count);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = DMA_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = DMA_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, DMA_HDR_LEN);
+		ptr += DMA_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = (1UL << 54) | (1UL << 48);
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
+	hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..e557349368 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -1,6 +1,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(C) 2021 Marvell International Ltd.
 
+error_cflags = ['-Wno-uninitialized']
+foreach flag: error_cflags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v6 1/2] dma/cnxk: rework DMA driver
  2023-09-05 16:19       ` [PATCH v5 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
  2023-09-05 16:19         ` [PATCH v5 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-09-09 16:32         ` pbhagavatula
  2023-09-09 16:32           ` [PATCH v6 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
  2023-09-09 16:37           ` [PATCH v7 1/2] dma/cnxk: rework DMA driver pbhagavatula
  1 sibling, 2 replies; 18+ messages in thread
From: pbhagavatula @ 2023-09-09 16:32 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Vamsi Attunuru
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

To use the mempool cache, use rte_mempool for the DMA chunk pool.
Move the mempool creation to device start to limit the number of
chunks allocated based on the total number of descriptors
configured across all the vchans.
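
For readers unfamiliar with the mempool flow, a minimal sketch of the
chunk-pool creation done at device start is shown below. The pool name,
chunk count and sizes here are illustrative placeholders, not the
driver's exact values, which are derived from the configured
descriptors:

#include <stdio.h>
#include <rte_lcore.h>
#include <rte_mbuf_pool_ops.h>
#include <rte_mempool.h>

/* Illustrative sizes; the driver derives the real values from the
 * descriptor counts configured across all vchans.
 */
#define CHUNK_SZ  16256 /* bytes per DMA command chunk */
#define NB_CHUNKS 64    /* chunks needed by the queue */
#define CACHE_SZ  16    /* per-lcore mempool cache */

static struct rte_mempool *
dma_chunk_pool_create(int dev_id)
{
	char name[RTE_MEMPOOL_NAMESIZE];
	struct rte_mempool *mp;

	snprintf(name, sizeof(name), "dma_chunk_pool%d", dev_id);
	/* Pad the pool so per-lcore caches cannot starve the queue. */
	mp = rte_mempool_create_empty(name,
				      NB_CHUNKS + CACHE_SZ * rte_lcore_count(),
				      CHUNK_SZ, CACHE_SZ, 0, rte_socket_id(), 0);
	if (mp == NULL)
		return NULL;

	/* Back the pool with the platform mempool ops (NPA on cnxk). */
	if (rte_mempool_set_ops_byname(mp, rte_mbuf_platform_mempool_ops(),
				       NULL) < 0 ||
	    rte_mempool_populate_default(mp) < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	return mp;
}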

Remove unnecessary state-tracking flags, as the library already
handles this, and add the `CNXK` prefix to driver macros.

Convert the log register macro for all cnxk drivers to
RTE_LOG_REGISTER_DEFAULT.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
Depends-on: 29324

 v6 Changes:
 - Rework device configuration and start logic.
 - Add CNXK prefix to driver macros.
 v5 Changes:
 - Use RTE_LOG_REGISTER_DEFAULT for registering logging.
 v4 Changes:
 - Fix clang build.
 v3 Changes:
 - Fix build.

 drivers/common/cnxk/roc_dpi.c      |  90 ++---------
 drivers/common/cnxk/roc_dpi.h      |  28 +---
 drivers/common/cnxk/roc_dpi_priv.h |   3 -
 drivers/common/cnxk/roc_platform.c |  21 +--
 drivers/common/cnxk/roc_platform.h |   2 +
 drivers/common/cnxk/version.map    |   1 +
 drivers/dma/cnxk/cnxk_dmadev.c     | 252 ++++++++++++++++-------------
 drivers/dma/cnxk/cnxk_dmadev.h     |  45 ++++--
 8 files changed, 203 insertions(+), 239 deletions(-)

diff --git a/drivers/common/cnxk/roc_dpi.c b/drivers/common/cnxk/roc_dpi.c
index 0e2f803077..c241168294 100644
--- a/drivers/common/cnxk/roc_dpi.c
+++ b/drivers/common/cnxk/roc_dpi.c
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(C) 2021 Marvell.
  */
+
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -52,17 +53,12 @@ roc_dpi_disable(struct roc_dpi *dpi)
 }

 int
-roc_dpi_configure(struct roc_dpi *roc_dpi)
+roc_dpi_configure(struct roc_dpi *roc_dpi, uint32_t chunk_sz, uint64_t aura, uint64_t chunk_base)
 {
 	struct plt_pci_device *pci_dev;
-	const struct plt_memzone *dpi_mz;
 	dpi_mbox_msg_t mbox_msg;
-	struct npa_pool_s pool;
-	struct npa_aura_s aura;
-	int rc, count, buflen;
-	uint64_t aura_handle;
-	plt_iova_t iova;
-	char name[32];
+	uint64_t reg;
+	int rc;

 	if (!roc_dpi) {
 		plt_err("roc_dpi is NULL");
@@ -70,79 +66,30 @@ roc_dpi_configure(struct roc_dpi *roc_dpi)
 	}

 	pci_dev = roc_dpi->pci_dev;
-	memset(&pool, 0, sizeof(struct npa_pool_s));
-	pool.nat_align = 1;
-
-	memset(&aura, 0, sizeof(aura));
-	rc = roc_npa_pool_create(&aura_handle, DPI_CMD_QUEUE_SIZE,
-				 DPI_CMD_QUEUE_BUFS, &aura, &pool, 0);
-	if (rc) {
-		plt_err("Failed to create NPA pool, err %d\n", rc);
-		return rc;
-	}
-
-	snprintf(name, sizeof(name), "dpimem%d:%d:%d:%d", pci_dev->addr.domain, pci_dev->addr.bus,
-		 pci_dev->addr.devid, pci_dev->addr.function);
-	buflen = DPI_CMD_QUEUE_SIZE * DPI_CMD_QUEUE_BUFS;
-	dpi_mz = plt_memzone_reserve_aligned(name, buflen, 0, DPI_CMD_QUEUE_SIZE);
-	if (dpi_mz == NULL) {
-		plt_err("dpi memzone reserve failed");
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	roc_dpi->mz = dpi_mz;
-	iova = dpi_mz->iova;
-	for (count = 0; count < DPI_CMD_QUEUE_BUFS; count++) {
-		roc_npa_aura_op_free(aura_handle, 0, iova);
-		iova += DPI_CMD_QUEUE_SIZE;
-	}
-
-	roc_dpi->chunk_base = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_base) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
-
-	roc_dpi->chunk_next = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_next) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}

-	roc_dpi->aura_handle = aura_handle;
-	/* subtract 2 as they have already been alloc'ed above */
-	roc_dpi->pool_size_m1 = (DPI_CMD_QUEUE_SIZE >> 3) - 2;
+	roc_dpi_disable(roc_dpi);
+	reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);

 	plt_write64(0x0, roc_dpi->rbase + DPI_VDMA_REQQ_CTL);
-	plt_write64(((uint64_t)(roc_dpi->chunk_base) >> 7) << 7,
-		    roc_dpi->rbase + DPI_VDMA_SADDR);
+	plt_write64(chunk_base, roc_dpi->rbase + DPI_VDMA_SADDR);
 	mbox_msg.u[0] = 0;
 	mbox_msg.u[1] = 0;
 	/* DPI PF driver expects vfid starts from index 0 */
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_OPEN;
-	mbox_msg.s.csize = DPI_CMD_QUEUE_SIZE;
-	mbox_msg.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	mbox_msg.s.csize = chunk_sz;
+	mbox_msg.s.aura = aura;
 	mbox_msg.s.sso_pf_func = idev_sso_pffunc_get();
 	mbox_msg.s.npa_pf_func = idev_npa_pffunc_get();

 	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
 			    sizeof(dpi_mbox_msg_t));
-	if (rc < 0) {
+	if (rc < 0)
 		plt_err("Failed to send mbox message %d to DPI PF, err %d",
 			mbox_msg.s.cmd, rc);
-		goto err2;
-	}
-
-	return rc;

-err2:
-	plt_memzone_free(dpi_mz);
-err1:
-	roc_npa_pool_destroy(aura_handle);
 	return rc;
 }

@@ -153,11 +100,9 @@ roc_dpi_dev_init(struct roc_dpi *roc_dpi)
 	uint16_t vfid;

 	roc_dpi->rbase = pci_dev->mem_resource[0].addr;
-	vfid = ((pci_dev->addr.devid & 0x1F) << 3) |
-	       (pci_dev->addr.function & 0x7);
+	vfid = ((pci_dev->addr.devid & 0x1F) << 3) | (pci_dev->addr.function & 0x7);
 	vfid -= 1;
 	roc_dpi->vfid = vfid;
-	plt_spinlock_init(&roc_dpi->chunk_lock);

 	return 0;
 }
@@ -180,14 +125,9 @@ roc_dpi_dev_fini(struct roc_dpi *roc_dpi)
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_CLOSE;

-	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
-			    sizeof(dpi_mbox_msg_t));
+	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg, sizeof(dpi_mbox_msg_t));
 	if (rc < 0)
-		plt_err("Failed to send mbox message %d to DPI PF, err %d",
-			mbox_msg.s.cmd, rc);
-
-	roc_npa_pool_destroy(roc_dpi->aura_handle);
-	plt_memzone_free(roc_dpi->mz);
+		plt_err("Failed to send mbox message %d to DPI PF, err %d", mbox_msg.s.cmd, rc);

 	return rc;
 }
diff --git a/drivers/common/cnxk/roc_dpi.h b/drivers/common/cnxk/roc_dpi.h
index 2f061b07c5..4ebde5b8a6 100644
--- a/drivers/common/cnxk/roc_dpi.h
+++ b/drivers/common/cnxk/roc_dpi.h
@@ -5,41 +5,17 @@
 #ifndef _ROC_DPI_H_
 #define _ROC_DPI_H_

-struct roc_dpi_args {
-	uint8_t num_ssegs;
-	uint8_t num_dsegs;
-	uint8_t comp_type;
-	uint8_t direction;
-	uint8_t sdevice;
-	uint8_t ddevice;
-	uint8_t swap;
-	uint8_t use_lock : 1;
-	uint8_t tt : 7;
-	uint16_t func;
-	uint16_t grp;
-	uint32_t tag;
-	uint64_t comp_ptr;
-};
-
 struct roc_dpi {
-	/* Input parameters */
 	struct plt_pci_device *pci_dev;
-	/* End of Input parameters */
-	const struct plt_memzone *mz;
 	uint8_t *rbase;
 	uint16_t vfid;
-	uint16_t pool_size_m1;
-	uint16_t chunk_head;
-	uint64_t *chunk_base;
-	uint64_t *chunk_next;
-	uint64_t aura_handle;
-	plt_spinlock_t chunk_lock;
 } __plt_cache_aligned;

 int __roc_api roc_dpi_dev_init(struct roc_dpi *roc_dpi);
 int __roc_api roc_dpi_dev_fini(struct roc_dpi *roc_dpi);

-int __roc_api roc_dpi_configure(struct roc_dpi *dpi);
+int __roc_api roc_dpi_configure(struct roc_dpi *dpi, uint32_t chunk_sz, uint64_t aura,
+				uint64_t chunk_base);
 int __roc_api roc_dpi_enable(struct roc_dpi *dpi);
 int __roc_api roc_dpi_disable(struct roc_dpi *dpi);

diff --git a/drivers/common/cnxk/roc_dpi_priv.h b/drivers/common/cnxk/roc_dpi_priv.h
index 1fa1a715d3..518a3e7351 100644
--- a/drivers/common/cnxk/roc_dpi_priv.h
+++ b/drivers/common/cnxk/roc_dpi_priv.h
@@ -16,9 +16,6 @@
 #define DPI_REG_DUMP	0x3
 #define DPI_GET_REG_CFG 0x4

-#define DPI_CMD_QUEUE_SIZE 4096
-#define DPI_CMD_QUEUE_BUFS 1024
-
 typedef union dpi_mbox_msg_t {
 	uint64_t u[2];
 	struct dpi_mbox_message_s {
diff --git a/drivers/common/cnxk/roc_platform.c b/drivers/common/cnxk/roc_platform.c
index f91b95ceab..a8a83a3723 100644
--- a/drivers/common/cnxk/roc_platform.c
+++ b/drivers/common/cnxk/roc_platform.c
@@ -60,14 +60,15 @@ roc_plt_init(void)
 	return 0;
 }

-RTE_LOG_REGISTER(cnxk_logtype_base, pmd.cnxk.base, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_mbox, pmd.cnxk.mbox, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_cpt, pmd.crypto.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_ml, pmd.ml.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_npa, pmd.mempool.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_nix, pmd.net.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_npc, pmd.net.cnxk.flow, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_sso, pmd.event.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_tim, pmd.event.cnxk.timer, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_tm, pmd.net.cnxk.tm, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_base, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_mbox, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_cpt, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ml, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npa, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_nix, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npc, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_sso, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_tim, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_tm, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_dpi, NOTICE);
 RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ree, NOTICE);
diff --git a/drivers/common/cnxk/roc_platform.h b/drivers/common/cnxk/roc_platform.h
index 08f83aba12..dfd4da21b6 100644
--- a/drivers/common/cnxk/roc_platform.h
+++ b/drivers/common/cnxk/roc_platform.h
@@ -242,6 +242,7 @@ extern int cnxk_logtype_sso;
 extern int cnxk_logtype_tim;
 extern int cnxk_logtype_tm;
 extern int cnxk_logtype_ree;
+extern int cnxk_logtype_dpi;

 #define plt_err(fmt, args...)                                                  \
 	RTE_LOG(ERR, PMD, "%s():%u " fmt "\n", __func__, __LINE__, ##args)
@@ -270,6 +271,7 @@ extern int cnxk_logtype_ree;
 #define plt_tim_dbg(fmt, ...)	plt_dbg(tim, fmt, ##__VA_ARGS__)
 #define plt_tm_dbg(fmt, ...)	plt_dbg(tm, fmt, ##__VA_ARGS__)
 #define plt_ree_dbg(fmt, ...)	plt_dbg(ree, fmt, ##__VA_ARGS__)
+#define plt_dpi_dbg(fmt, ...)	plt_dbg(dpi, fmt, ##__VA_ARGS__)

 /* Datapath logs */
 #define plt_dp_err(fmt, args...)                                               \
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8c71497df8..1540dfadf9 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -7,6 +7,7 @@ INTERNAL {
 	cnxk_ipsec_outb_roundup_byte;
 	cnxk_logtype_base;
 	cnxk_logtype_cpt;
+	cnxk_logtype_dpi;
 	cnxk_logtype_mbox;
 	cnxk_logtype_ml;
 	cnxk_logtype_nix;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index eec6a897e2..f58bb92dbc 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,18 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */

-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>

 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -24,14 +12,14 @@ cnxk_dmadev_info_get(const struct rte_dma_dev *dev, struct rte_dma_info *dev_inf
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
 	RTE_SET_USED(size);

-	dev_info->max_vchans = MAX_VCHANS_PER_QUEUE;
+	dev_info->max_vchans = CNXK_DPI_MAX_VCHANS_PER_QUEUE;
 	dev_info->nb_vchans = dpivf->num_vchans;
 	dev_info->dev_capa = RTE_DMA_CAPA_MEM_TO_MEM | RTE_DMA_CAPA_MEM_TO_DEV |
 			     RTE_DMA_CAPA_DEV_TO_MEM | RTE_DMA_CAPA_DEV_TO_DEV |
 			     RTE_DMA_CAPA_OPS_COPY | RTE_DMA_CAPA_OPS_COPY_SG;
-	dev_info->max_desc = DPI_MAX_DESC;
-	dev_info->min_desc = DPI_MIN_DESC;
-	dev_info->max_sges = DPI_MAX_POINTER;
+	dev_info->max_desc = CNXK_DPI_MAX_DESC;
+	dev_info->min_desc = CNXK_DPI_MIN_DESC;
+	dev_info->max_sges = CNXK_DPI_MAX_POINTER;

 	return 0;
 }
@@ -48,7 +36,7 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 		num_vchans = dpivf->num_vchans;
 		i = 0;
 	} else {
-		if (vchan >= MAX_VCHANS_PER_QUEUE)
+		if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 			return -EINVAL;

 		num_vchans = vchan + 1;
@@ -57,7 +45,7 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)

 	for (; i < num_vchans; i++) {
 		dpi_conf = &dpivf->conf[i];
-		max_desc = dpi_conf->c_desc.max_cnt;
+		max_desc = dpi_conf->c_desc.max_cnt + 1;
 		if (dpi_conf->c_desc.compl_ptr) {
 			for (j = 0; j < max_desc; j++)
 				rte_free(dpi_conf->c_desc.compl_ptr[j]);
@@ -71,39 +59,62 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 }

 static int
-cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
+cnxk_dmadev_chunk_pool_create(struct rte_dma_dev *dev, uint32_t nb_chunks, uint32_t chunk_sz)
 {
+	char pool_name[RTE_MEMPOOL_NAMESIZE];
 	struct cnxk_dpi_vf_s *dpivf = NULL;
-	int rc = 0;
-
-	RTE_SET_USED(conf_sz);
+	int rc;

 	dpivf = dev->fp_obj->dev_private;
+	/* Create chunk pool. */
+	snprintf(pool_name, sizeof(pool_name), "cnxk_dma_chunk_pool%d", dev->data->dev_id);

-	/* Accept only number of vchans as config from application. */
-	if (!(dpivf->flag & CNXK_DPI_DEV_START)) {
-		/* After config function, vchan setup function has to be called.
-		 * Free up vchan memory if any, before configuring num_vchans.
-		 */
-		cnxk_dmadev_vchan_free(dpivf, RTE_DMA_ALL_VCHAN);
-		dpivf->num_vchans = conf->nb_vchans;
+	nb_chunks += (CNXK_DPI_POOL_MAX_CACHE_SZ * rte_lcore_count());
+	dpivf->chunk_pool = rte_mempool_create_empty(
+		pool_name, nb_chunks, chunk_sz, CNXK_DPI_POOL_MAX_CACHE_SZ, 0, rte_socket_id(), 0);
+
+	if (dpivf->chunk_pool == NULL) {
+		plt_err("Unable to create chunkpool.");
+		return -ENOMEM;
 	}

-	if (dpivf->flag & CNXK_DPI_DEV_CONFIG)
-		return rc;
+	rc = rte_mempool_set_ops_byname(dpivf->chunk_pool, rte_mbuf_platform_mempool_ops(), NULL);
+	if (rc < 0) {
+		plt_err("Unable to set chunkpool ops");
+		goto free;
+	}

-	rc = roc_dpi_configure(&dpivf->rdpi);
+	rc = rte_mempool_populate_default(dpivf->chunk_pool);
 	if (rc < 0) {
-		plt_err("DMA configure failed err = %d", rc);
-		goto done;
+		plt_err("Unable to set populate chunkpool.");
+		goto free;
 	}
+	dpivf->aura = roc_npa_aura_handle_to_aura(dpivf->chunk_pool->pool_id);

-	dpivf->flag |= CNXK_DPI_DEV_CONFIG;
+	return 0;

-done:
+free:
+	rte_mempool_free(dpivf->chunk_pool);
 	return rc;
 }

+static int
+cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = NULL;
+
+	RTE_SET_USED(conf_sz);
+	dpivf = dev->fp_obj->dev_private;
+
+	/* After config function, vchan setup function has to be called.
+	 * Free up vchan memory if any, before configuring num_vchans.
+	 */
+	cnxk_dmadev_vchan_free(dpivf, RTE_DMA_ALL_VCHAN);
+	dpivf->num_vchans = conf->nb_vchans;
+
+	return 0;
+}
+
 static int
 cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
@@ -117,9 +128,6 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,

 	RTE_SET_USED(conf_sz);

-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;

 	switch (conf->direction) {
@@ -163,8 +171,8 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 	if (!rte_is_power_of_2(max_desc))
 		max_desc = rte_align32pow2(max_desc);

-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
+	if (max_desc > CNXK_DPI_MAX_DESC)
+		max_desc = CNXK_DPI_MAX_DESC;

 	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
 	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
@@ -182,7 +190,7 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			return -ENOMEM;
 		}

-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
+		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
 	}

 	dpi_conf->c_desc.max_cnt = (max_desc - 1);
@@ -203,9 +211,6 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,

 	RTE_SET_USED(conf_sz);

-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;

 	switch (conf->direction) {
@@ -249,8 +254,8 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 	if (!rte_is_power_of_2(max_desc))
 		max_desc = rte_align32pow2(max_desc);

-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
+	if (max_desc > CNXK_DPI_MAX_DESC)
+		max_desc = CNXK_DPI_MAX_DESC;

 	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
 	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
@@ -267,7 +272,8 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
+
+		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
 	}

 	dpi_conf->c_desc.max_cnt = (max_desc - 1);
@@ -280,10 +286,9 @@ cnxk_dmadev_start(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
 	struct cnxk_dpi_conf *dpi_conf;
-	int i, j;
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
+	uint32_t chunks, nb_desc = 0;
+	int i, j, rc = 0;
+	void *chunk;

 	for (i = 0; i < dpivf->num_vchans; i++) {
 		dpi_conf = &dpivf->conf[i];
@@ -292,29 +297,61 @@ cnxk_dmadev_start(struct rte_dma_dev *dev)
 		dpi_conf->pnum_words = 0;
 		dpi_conf->pending = 0;
 		dpi_conf->desc_idx = 0;
-		for (j = 0; j < dpi_conf->c_desc.max_cnt; j++) {
+		for (j = 0; j < dpi_conf->c_desc.max_cnt + 1; j++) {
 			if (dpi_conf->c_desc.compl_ptr[j])
-				dpi_conf->c_desc.compl_ptr[j]->cdata = DPI_REQ_CDATA;
+				dpi_conf->c_desc.compl_ptr[j]->cdata = CNXK_DPI_REQ_CDATA;
 		}
-
+		nb_desc += dpi_conf->c_desc.max_cnt + 1;
 		cnxk_stats_reset(dev, i);
 		dpi_conf->completed_offset = 0;
 	}

-	roc_dpi_enable(&dpivf->rdpi);
+	chunks = CNXK_DPI_CHUNKS_FROM_DESC(CNXK_DPI_QUEUE_BUF_SIZE, nb_desc);
+	rc = cnxk_dmadev_chunk_pool_create(dev, chunks, CNXK_DPI_QUEUE_BUF_SIZE);
+	if (rc < 0) {
+		plt_err("DMA pool configure failed err = %d", rc);
+		goto done;
+	}

-	dpivf->flag |= CNXK_DPI_DEV_START;
+	rc = rte_mempool_get(dpivf->chunk_pool, &chunk);
+	if (rc < 0) {
+		plt_err("DMA failed to get chunk pointer err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}

-	return 0;
+	rc = roc_dpi_configure(&dpivf->rdpi, CNXK_DPI_QUEUE_BUF_SIZE, dpivf->aura, (uint64_t)chunk);
+	if (rc < 0) {
+		plt_err("DMA configure failed err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}
+
+	dpivf->chunk_base = chunk;
+	dpivf->chunk_head = 0;
+	dpivf->chunk_size_m1 = (CNXK_DPI_QUEUE_BUF_SIZE >> 3) - 2;
+
+	roc_dpi_enable(&dpivf->rdpi);
+
+done:
+	return rc;
 }

 static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);

 	roc_dpi_disable(&dpivf->rdpi);
-	dpivf->flag &= ~CNXK_DPI_DEV_START;
+	rte_mempool_free(dpivf->chunk_pool);
+	dpivf->chunk_pool = NULL;
+	dpivf->chunk_base = NULL;
+	dpivf->chunk_size_m1 = 0;

 	return 0;
 }
@@ -335,7 +372,7 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 }

 static inline int
-__dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
+__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
 {
 	uint64_t *ptr = dpi->chunk_base;

@@ -346,31 +383,25 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 	 * Normally there is plenty of room in the current buffer for the
 	 * command
 	 */
-	if (dpi->chunk_head + cmd_count < dpi->pool_size_m1) {
+	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
 		ptr += dpi->chunk_head;
 		dpi->chunk_head += cmd_count;
 		while (cmd_count--)
 			*ptr++ = *cmds++;
 	} else {
+		uint64_t *new_buff = NULL;
 		int count;
-		uint64_t *new_buff = dpi->chunk_next;
-
-		dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-		if (!dpi->chunk_next) {
-			plt_dp_dbg("Failed to alloc next buffer from NPA");

-			/* NPA failed to allocate a buffer. Restoring chunk_next
-			 * to its original address.
-			 */
-			dpi->chunk_next = new_buff;
-			return -ENOSPC;
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOMEM;
 		}

 		/*
 		 * Figure out how many cmd words will fit in this buffer.
 		 * One location will be needed for the next buffer pointer.
 		 */
-		count = dpi->pool_size_m1 - dpi->chunk_head;
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
 		ptr += dpi->chunk_head;
 		cmd_count -= count;
 		while (count--)
@@ -395,17 +426,10 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 			*ptr++ = *cmds++;

 		/* queue index may be greater than pool size */
-		if (dpi->chunk_head >= dpi->pool_size_m1) {
-			new_buff = dpi->chunk_next;
-			dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-			if (!dpi->chunk_next) {
-				plt_dp_dbg("Failed to alloc next buffer from NPA");
-
-				/* NPA failed to allocate a buffer. Restoring chunk_next
-				 * to its original address.
-				 */
-				dpi->chunk_next = new_buff;
-				return -ENOSPC;
+		if (dpi->chunk_head == dpi->chunk_size_m1) {
+			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+				plt_dpi_dbg("Failed to alloc next buffer from NPA");
+				return -ENOMEM;
 			}

 			/* Write next buffer address */
@@ -433,10 +457,10 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -465,9 +489,9 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -498,10 +522,10 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -510,13 +534,13 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 	 * For all other cases, src pointers are first pointers.
 	 */
 	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
+		header->cn9k.nfst = nb_dst & CNXK_DPI_MAX_POINTER;
+		header->cn9k.nlst = nb_src & CNXK_DPI_MAX_POINTER;
 		fptr = &dst[0];
 		lptr = &src[0];
 	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
+		header->cn9k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
+		header->cn9k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
 		fptr = &src[0];
 		lptr = &dst[0];
 	}
@@ -537,9 +561,9 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -570,10 +594,10 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -593,9 +617,9 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -627,15 +651,15 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
+	header->cn10k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
+	header->cn10k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
 	fptr = &src[0];
 	lptr = &dst[0];

@@ -656,9 +680,9 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -688,16 +712,16 @@ cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls,
 		comp_ptr = c_desc->compl_ptr[c_desc->head];

 		if (comp_ptr->cdata) {
-			if (comp_ptr->cdata == DPI_REQ_CDATA)
+			if (comp_ptr->cdata == CNXK_DPI_REQ_CDATA)
 				break;
 			*has_error = 1;
 			dpi_conf->stats.errors++;
-			STRM_INC(*c_desc, head);
+			CNXK_DPI_STRM_INC(*c_desc, head);
 			break;
 		}

-		comp_ptr->cdata = DPI_REQ_CDATA;
-		STRM_INC(*c_desc, head);
+		comp_ptr->cdata = CNXK_DPI_REQ_CDATA;
+		CNXK_DPI_STRM_INC(*c_desc, head);
 	}

 	dpi_conf->stats.completed += cnt;
@@ -720,13 +744,13 @@ cnxk_dmadev_completed_status(void *dev_private, uint16_t vchan, const uint16_t n
 		comp_ptr = c_desc->compl_ptr[c_desc->head];
 		status[cnt] = comp_ptr->cdata;
 		if (status[cnt]) {
-			if (status[cnt] == DPI_REQ_CDATA)
+			if (status[cnt] == CNXK_DPI_REQ_CDATA)
 				break;

 			dpi_conf->stats.errors++;
 		}
-		comp_ptr->cdata = DPI_REQ_CDATA;
-		STRM_INC(*c_desc, head);
+		comp_ptr->cdata = CNXK_DPI_REQ_CDATA;
+		CNXK_DPI_STRM_INC(*c_desc, head);
 	}

 	dpi_conf->stats.completed += cnt;
@@ -794,7 +818,7 @@ cnxk_stats_get(const struct rte_dma_dev *dev, uint16_t vchan, struct rte_dma_sta
 		goto done;
 	}

-	if (vchan >= MAX_VCHANS_PER_QUEUE)
+	if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 		return -EINVAL;

 	dpi_conf = &dpivf->conf[vchan];
@@ -822,7 +846,7 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 		return 0;
 	}

-	if (vchan >= MAX_VCHANS_PER_QUEUE)
+	if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 		return -EINVAL;

 	dpi_conf = &dpivf->conf[vchan];
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 254e7fea20..d691f5fba2 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,22 +4,38 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H

+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>

-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define CNXK_DPI_MAX_POINTER		    15
+#define CNXK_DPI_STRM_INC(s, var)	    ((s).var = ((s).var + 1) & (s).max_cnt)
+#define CNXK_DPI_STRM_DEC(s, var)	    ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
+#define CNXK_DPI_MAX_DESC		    32768
+#define CNXK_DPI_MIN_DESC		    2
+#define CNXK_DPI_MAX_VCHANS_PER_QUEUE	    4
+#define CNXK_DPI_QUEUE_BUF_SIZE		    16256
+#define CNXK_DPI_POOL_MAX_CACHE_SZ	    (16)
+#define CNXK_DPI_HDR_LEN		    4
+#define CNXK_DPI_CMD_LEN(src, dst)	    (CNXK_DPI_HDR_LEN + (src << 1) + (dst << 1))
+#define CNXK_DPI_MAX_CMD_SZ		    CNXK_DPI_CMD_LEN(CNXK_DPI_MAX_POINTER, CNXK_DPI_MAX_POINTER)
+#define CNXK_DPI_CHUNKS_FROM_DESC(cz, desc) ((((desc) / (((cz) / 8) / CNXK_DPI_MAX_CMD_SZ))) + 1)

 /* Set Completion data to 0xFF when request submitted,
  * upon successful request completion engine reset to completion status
  */
-#define DPI_REQ_CDATA 0xFF
-
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_REQ_CDATA 0xFF

 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -45,8 +61,15 @@ struct cnxk_dpi_conf {
 };

 struct cnxk_dpi_vf_s {
+	/* Fast path */
+	uint64_t *chunk_base;
+	uint16_t chunk_head;
+	uint16_t chunk_size_m1;
+	struct rte_mempool *chunk_pool;
+	struct cnxk_dpi_conf conf[CNXK_DPI_MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
-	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
 } __plt_cache_aligned;
--
2.25.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v6 2/2] dma/cnxk: rewrite DMA fastpath
  2023-09-09 16:32         ` [PATCH v6 1/2] dma/cnxk: rework DMA driver pbhagavatula
@ 2023-09-09 16:32           ` pbhagavatula
  2023-09-09 16:37           ` [PATCH v7 1/2] dma/cnxk: rework DMA driver pbhagavatula
  1 sibling, 0 replies; 18+ messages in thread
From: pbhagavatula @ 2023-09-09 16:32 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the configuration.
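
As background, the gain comes from moving command words in 128-bit
units instead of scalar loads and stores. A standalone sketch of such
a NEON doubleword copy, simplified from the driver's routine and
assuming an even word count, would be:

#include <stdint.h>
#if defined(__aarch64__)
#include <arm_neon.h>

/* Copy n 64-bit command words two at a time with 128-bit loads/stores. */
static inline void
dma_cmd_cpy(const uint64_t *src, uint64_t *dst, uint8_t n)
{
	uint8_t i;

	for (i = 0; i < n; i += 2)
		vst1q_u64(&dst[i], vld1q_u64(&src[i]));
}
#endif

The scatter-gather variant in this patch additionally loads each
source/destination entry as one 128-bit lane, reorders it with
vextq_u64 and masks the length field, avoiding per-field scalar loads.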

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/dma/cnxk/cnxk_dmadev.c    | 428 ++---------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  59 +++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 436 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   9 +-
 4 files changed, 528 insertions(+), 404 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index f58bb92dbc..26680edfde 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -115,19 +115,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return 0;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -163,54 +153,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > CNXK_DPI_MAX_DESC)
-		max_desc = CNXK_DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -246,6 +193,27 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -371,333 +339,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & CNXK_DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & CNXK_DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -856,17 +497,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -917,12 +547,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index d691f5fba2..6479434de9 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -27,6 +27,7 @@
 #define CNXK_DPI_MAX_VCHANS_PER_QUEUE	    4
 #define CNXK_DPI_QUEUE_BUF_SIZE		    16256
 #define CNXK_DPI_POOL_MAX_CACHE_SZ	    (16)
+#define CNXK_DPI_DW_PER_SINGLE_CMD	    8
 #define CNXK_DPI_HDR_LEN		    4
 #define CNXK_DPI_CMD_LEN(src, dst)	    (CNXK_DPI_HDR_LEN + (src << 1) + (dst << 1))
 #define CNXK_DPI_MAX_CMD_SZ		    CNXK_DPI_CMD_LEN(CNXK_DPI_MAX_POINTER, CNXK_DPI_MAX_POINTER)
@@ -37,6 +38,49 @@
  */
 #define CNXK_DPI_REQ_CDATA 0xFF
 
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
+
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
 	void *cb_data;
@@ -50,12 +94,11 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
@@ -72,6 +115,18 @@ struct cnxk_dpi_vf_s {
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..16d7b5426b
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,436 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#else
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/* Check if command fits in the current chunk. */
+	if (dpi->chunk_head + CNXK_DPI_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, CNXK_DPI_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += CNXK_DPI_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in the current chunk
+		 * and copy them.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		/* Copy the remaining cmd words to new chunk. */
+		__dpi_cpy_scalar(cmd + count, ptr, CNXK_DPI_DW_PER_SINGLE_CMD - count);
+
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = CNXK_DPI_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/* Check if command fits in the current chunk. */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, CNXK_DPI_HDR_LEN);
+		ptr += CNXK_DPI_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in the current chunk
+		 * and copy them, copy the rest to the new buffer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[CNXK_DPI_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = (1UL << 54) | (1UL << 48);
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
+	hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[CNXK_DPI_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += 8;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..e557349368 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -1,6 +1,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(C) 2021 Marvell International Ltd.
 
+error_cflags = ['-Wno-uninitialized']
+foreach flag: error_cflags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1


* [PATCH v7 1/2] dma/cnxk: rework DMA driver
  2023-09-09 16:32         ` [PATCH v6 1/2] dma/cnxk: rework DMA driver pbhagavatula
  2023-09-09 16:32           ` [PATCH v6 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-09-09 16:37           ` pbhagavatula
  2023-09-09 16:37             ` [PATCH v7 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
  2023-09-09 16:54             ` [PATCH v8 1/2] dma/cnxk: rework DMA driver pbhagavatula
  1 sibling, 2 replies; 18+ messages in thread
From: pbhagavatula @ 2023-09-09 16:37 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Vamsi Attunuru
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use rte_mempool for the DMA chunk pool to allow use of the mempool
cache. Move mempool creation to device start so that the number of
chunks allocated is bounded by the total number of descriptors
configured across all the vchans.

Remove unnecessary state-tracking flags, as the library already
handles this, and add the `CNXK` prefix to driver macros.

Convert the log registration macros for all cnxk drivers to
RTE_LOG_REGISTER_DEFAULT.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
Depends-on: 29324

 v7 Changes:
 - Fix checkpatch warnings.
 v6 Changes:
 - Rework device configuration and start logic.
 - Add CNXK prefix to driver macros.
 v5 Changes:
 - Use RTE_LOG_REGISTER_DEFAULT for registering logging.
 v4 Changes:
 - Fix clang build.
 v3 Changes:
 - Fix build.
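
 For reference, the chunk-pool setup in this patch reduces to the
 following minimal sketch (it mirrors cnxk_dmadev_chunk_pool_create()
 from the diff below; the standalone helper name and the hard-coded
 cache size of 16, i.e. CNXK_DPI_POOL_MAX_CACHE_SZ, are illustrative,
 and error logging is trimmed):

#include <rte_lcore.h>
#include <rte_mbuf_pool_ops.h>
#include <rte_mempool.h>

static struct rte_mempool *
dma_chunk_pool_create(const char *name, uint32_t nb_chunks, uint32_t chunk_sz)
{
	struct rte_mempool *mp;

	/* Grow the pool so that per-lcore mempool caches cannot starve it. */
	nb_chunks += 16 * rte_lcore_count();
	mp = rte_mempool_create_empty(name, nb_chunks, chunk_sz, 16, 0,
				      rte_socket_id(), 0);
	if (mp == NULL)
		return NULL;

	/* Back the pool with the platform mempool ops (NPA on cnxk) and
	 * populate it.
	 */
	if (rte_mempool_set_ops_byname(mp, rte_mbuf_platform_mempool_ops(), NULL) < 0 ||
	    rte_mempool_populate_default(mp) < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	return mp;
}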

 drivers/common/cnxk/roc_dpi.c      |  90 ++---------
 drivers/common/cnxk/roc_dpi.h      |  28 +---
 drivers/common/cnxk/roc_dpi_priv.h |   3 -
 drivers/common/cnxk/roc_platform.c |  21 +--
 drivers/common/cnxk/roc_platform.h |   2 +
 drivers/common/cnxk/version.map    |   1 +
 drivers/dma/cnxk/cnxk_dmadev.c     | 252 ++++++++++++++++-------------
 drivers/dma/cnxk/cnxk_dmadev.h     |  45 ++++--
 8 files changed, 203 insertions(+), 239 deletions(-)

diff --git a/drivers/common/cnxk/roc_dpi.c b/drivers/common/cnxk/roc_dpi.c
index 0e2f803077..c241168294 100644
--- a/drivers/common/cnxk/roc_dpi.c
+++ b/drivers/common/cnxk/roc_dpi.c
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(C) 2021 Marvell.
  */
+
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -52,17 +53,12 @@ roc_dpi_disable(struct roc_dpi *dpi)
 }

 int
-roc_dpi_configure(struct roc_dpi *roc_dpi)
+roc_dpi_configure(struct roc_dpi *roc_dpi, uint32_t chunk_sz, uint64_t aura, uint64_t chunk_base)
 {
 	struct plt_pci_device *pci_dev;
-	const struct plt_memzone *dpi_mz;
 	dpi_mbox_msg_t mbox_msg;
-	struct npa_pool_s pool;
-	struct npa_aura_s aura;
-	int rc, count, buflen;
-	uint64_t aura_handle;
-	plt_iova_t iova;
-	char name[32];
+	uint64_t reg;
+	int rc;

 	if (!roc_dpi) {
 		plt_err("roc_dpi is NULL");
@@ -70,79 +66,30 @@ roc_dpi_configure(struct roc_dpi *roc_dpi)
 	}

 	pci_dev = roc_dpi->pci_dev;
-	memset(&pool, 0, sizeof(struct npa_pool_s));
-	pool.nat_align = 1;
-
-	memset(&aura, 0, sizeof(aura));
-	rc = roc_npa_pool_create(&aura_handle, DPI_CMD_QUEUE_SIZE,
-				 DPI_CMD_QUEUE_BUFS, &aura, &pool, 0);
-	if (rc) {
-		plt_err("Failed to create NPA pool, err %d\n", rc);
-		return rc;
-	}
-
-	snprintf(name, sizeof(name), "dpimem%d:%d:%d:%d", pci_dev->addr.domain, pci_dev->addr.bus,
-		 pci_dev->addr.devid, pci_dev->addr.function);
-	buflen = DPI_CMD_QUEUE_SIZE * DPI_CMD_QUEUE_BUFS;
-	dpi_mz = plt_memzone_reserve_aligned(name, buflen, 0, DPI_CMD_QUEUE_SIZE);
-	if (dpi_mz == NULL) {
-		plt_err("dpi memzone reserve failed");
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	roc_dpi->mz = dpi_mz;
-	iova = dpi_mz->iova;
-	for (count = 0; count < DPI_CMD_QUEUE_BUFS; count++) {
-		roc_npa_aura_op_free(aura_handle, 0, iova);
-		iova += DPI_CMD_QUEUE_SIZE;
-	}
-
-	roc_dpi->chunk_base = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_base) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
-
-	roc_dpi->chunk_next = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_next) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}

-	roc_dpi->aura_handle = aura_handle;
-	/* subtract 2 as they have already been alloc'ed above */
-	roc_dpi->pool_size_m1 = (DPI_CMD_QUEUE_SIZE >> 3) - 2;
+	roc_dpi_disable(roc_dpi);
+	reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);

 	plt_write64(0x0, roc_dpi->rbase + DPI_VDMA_REQQ_CTL);
-	plt_write64(((uint64_t)(roc_dpi->chunk_base) >> 7) << 7,
-		    roc_dpi->rbase + DPI_VDMA_SADDR);
+	plt_write64(chunk_base, roc_dpi->rbase + DPI_VDMA_SADDR);
 	mbox_msg.u[0] = 0;
 	mbox_msg.u[1] = 0;
 	/* DPI PF driver expects vfid starts from index 0 */
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_OPEN;
-	mbox_msg.s.csize = DPI_CMD_QUEUE_SIZE;
-	mbox_msg.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	mbox_msg.s.csize = chunk_sz;
+	mbox_msg.s.aura = aura;
 	mbox_msg.s.sso_pf_func = idev_sso_pffunc_get();
 	mbox_msg.s.npa_pf_func = idev_npa_pffunc_get();

 	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
 			    sizeof(dpi_mbox_msg_t));
-	if (rc < 0) {
+	if (rc < 0)
 		plt_err("Failed to send mbox message %d to DPI PF, err %d",
 			mbox_msg.s.cmd, rc);
-		goto err2;
-	}
-
-	return rc;

-err2:
-	plt_memzone_free(dpi_mz);
-err1:
-	roc_npa_pool_destroy(aura_handle);
 	return rc;
 }

@@ -153,11 +100,9 @@ roc_dpi_dev_init(struct roc_dpi *roc_dpi)
 	uint16_t vfid;

 	roc_dpi->rbase = pci_dev->mem_resource[0].addr;
-	vfid = ((pci_dev->addr.devid & 0x1F) << 3) |
-	       (pci_dev->addr.function & 0x7);
+	vfid = ((pci_dev->addr.devid & 0x1F) << 3) | (pci_dev->addr.function & 0x7);
 	vfid -= 1;
 	roc_dpi->vfid = vfid;
-	plt_spinlock_init(&roc_dpi->chunk_lock);

 	return 0;
 }
@@ -180,14 +125,9 @@ roc_dpi_dev_fini(struct roc_dpi *roc_dpi)
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_CLOSE;

-	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
-			    sizeof(dpi_mbox_msg_t));
+	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg, sizeof(dpi_mbox_msg_t));
 	if (rc < 0)
-		plt_err("Failed to send mbox message %d to DPI PF, err %d",
-			mbox_msg.s.cmd, rc);
-
-	roc_npa_pool_destroy(roc_dpi->aura_handle);
-	plt_memzone_free(roc_dpi->mz);
+		plt_err("Failed to send mbox message %d to DPI PF, err %d", mbox_msg.s.cmd, rc);

 	return rc;
 }
diff --git a/drivers/common/cnxk/roc_dpi.h b/drivers/common/cnxk/roc_dpi.h
index 2f061b07c5..4ebde5b8a6 100644
--- a/drivers/common/cnxk/roc_dpi.h
+++ b/drivers/common/cnxk/roc_dpi.h
@@ -5,41 +5,17 @@
 #ifndef _ROC_DPI_H_
 #define _ROC_DPI_H_

-struct roc_dpi_args {
-	uint8_t num_ssegs;
-	uint8_t num_dsegs;
-	uint8_t comp_type;
-	uint8_t direction;
-	uint8_t sdevice;
-	uint8_t ddevice;
-	uint8_t swap;
-	uint8_t use_lock : 1;
-	uint8_t tt : 7;
-	uint16_t func;
-	uint16_t grp;
-	uint32_t tag;
-	uint64_t comp_ptr;
-};
-
 struct roc_dpi {
-	/* Input parameters */
 	struct plt_pci_device *pci_dev;
-	/* End of Input parameters */
-	const struct plt_memzone *mz;
 	uint8_t *rbase;
 	uint16_t vfid;
-	uint16_t pool_size_m1;
-	uint16_t chunk_head;
-	uint64_t *chunk_base;
-	uint64_t *chunk_next;
-	uint64_t aura_handle;
-	plt_spinlock_t chunk_lock;
 } __plt_cache_aligned;

 int __roc_api roc_dpi_dev_init(struct roc_dpi *roc_dpi);
 int __roc_api roc_dpi_dev_fini(struct roc_dpi *roc_dpi);

-int __roc_api roc_dpi_configure(struct roc_dpi *dpi);
+int __roc_api roc_dpi_configure(struct roc_dpi *dpi, uint32_t chunk_sz, uint64_t aura,
+				uint64_t chunk_base);
 int __roc_api roc_dpi_enable(struct roc_dpi *dpi);
 int __roc_api roc_dpi_disable(struct roc_dpi *dpi);

diff --git a/drivers/common/cnxk/roc_dpi_priv.h b/drivers/common/cnxk/roc_dpi_priv.h
index 1fa1a715d3..518a3e7351 100644
--- a/drivers/common/cnxk/roc_dpi_priv.h
+++ b/drivers/common/cnxk/roc_dpi_priv.h
@@ -16,9 +16,6 @@
 #define DPI_REG_DUMP	0x3
 #define DPI_GET_REG_CFG 0x4

-#define DPI_CMD_QUEUE_SIZE 4096
-#define DPI_CMD_QUEUE_BUFS 1024
-
 typedef union dpi_mbox_msg_t {
 	uint64_t u[2];
 	struct dpi_mbox_message_s {
diff --git a/drivers/common/cnxk/roc_platform.c b/drivers/common/cnxk/roc_platform.c
index f91b95ceab..a8a83a3723 100644
--- a/drivers/common/cnxk/roc_platform.c
+++ b/drivers/common/cnxk/roc_platform.c
@@ -60,14 +60,15 @@ roc_plt_init(void)
 	return 0;
 }

-RTE_LOG_REGISTER(cnxk_logtype_base, pmd.cnxk.base, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_mbox, pmd.cnxk.mbox, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_cpt, pmd.crypto.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_ml, pmd.ml.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_npa, pmd.mempool.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_nix, pmd.net.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_npc, pmd.net.cnxk.flow, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_sso, pmd.event.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_tim, pmd.event.cnxk.timer, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_tm, pmd.net.cnxk.tm, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_base, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_mbox, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_cpt, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ml, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npa, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_nix, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npc, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_sso, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_tim, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_tm, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_dpi, NOTICE);
 RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ree, NOTICE);
diff --git a/drivers/common/cnxk/roc_platform.h b/drivers/common/cnxk/roc_platform.h
index 08f83aba12..dfd4da21b6 100644
--- a/drivers/common/cnxk/roc_platform.h
+++ b/drivers/common/cnxk/roc_platform.h
@@ -242,6 +242,7 @@ extern int cnxk_logtype_sso;
 extern int cnxk_logtype_tim;
 extern int cnxk_logtype_tm;
 extern int cnxk_logtype_ree;
+extern int cnxk_logtype_dpi;

 #define plt_err(fmt, args...)                                                  \
 	RTE_LOG(ERR, PMD, "%s():%u " fmt "\n", __func__, __LINE__, ##args)
@@ -270,6 +271,7 @@ extern int cnxk_logtype_ree;
 #define plt_tim_dbg(fmt, ...)	plt_dbg(tim, fmt, ##__VA_ARGS__)
 #define plt_tm_dbg(fmt, ...)	plt_dbg(tm, fmt, ##__VA_ARGS__)
 #define plt_ree_dbg(fmt, ...)	plt_dbg(ree, fmt, ##__VA_ARGS__)
+#define plt_dpi_dbg(fmt, ...)	plt_dbg(dpi, fmt, ##__VA_ARGS__)

 /* Datapath logs */
 #define plt_dp_err(fmt, args...)                                               \
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8c71497df8..1540dfadf9 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -7,6 +7,7 @@ INTERNAL {
 	cnxk_ipsec_outb_roundup_byte;
 	cnxk_logtype_base;
 	cnxk_logtype_cpt;
+	cnxk_logtype_dpi;
 	cnxk_logtype_mbox;
 	cnxk_logtype_ml;
 	cnxk_logtype_nix;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index eec6a897e2..f58bb92dbc 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,18 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */

-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>

 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -24,14 +12,14 @@ cnxk_dmadev_info_get(const struct rte_dma_dev *dev, struct rte_dma_info *dev_inf
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
 	RTE_SET_USED(size);

-	dev_info->max_vchans = MAX_VCHANS_PER_QUEUE;
+	dev_info->max_vchans = CNXK_DPI_MAX_VCHANS_PER_QUEUE;
 	dev_info->nb_vchans = dpivf->num_vchans;
 	dev_info->dev_capa = RTE_DMA_CAPA_MEM_TO_MEM | RTE_DMA_CAPA_MEM_TO_DEV |
 			     RTE_DMA_CAPA_DEV_TO_MEM | RTE_DMA_CAPA_DEV_TO_DEV |
 			     RTE_DMA_CAPA_OPS_COPY | RTE_DMA_CAPA_OPS_COPY_SG;
-	dev_info->max_desc = DPI_MAX_DESC;
-	dev_info->min_desc = DPI_MIN_DESC;
-	dev_info->max_sges = DPI_MAX_POINTER;
+	dev_info->max_desc = CNXK_DPI_MAX_DESC;
+	dev_info->min_desc = CNXK_DPI_MIN_DESC;
+	dev_info->max_sges = CNXK_DPI_MAX_POINTER;

 	return 0;
 }
@@ -48,7 +36,7 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 		num_vchans = dpivf->num_vchans;
 		i = 0;
 	} else {
-		if (vchan >= MAX_VCHANS_PER_QUEUE)
+		if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 			return -EINVAL;

 		num_vchans = vchan + 1;
@@ -57,7 +45,7 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)

 	for (; i < num_vchans; i++) {
 		dpi_conf = &dpivf->conf[i];
-		max_desc = dpi_conf->c_desc.max_cnt;
+		max_desc = dpi_conf->c_desc.max_cnt + 1;
 		if (dpi_conf->c_desc.compl_ptr) {
 			for (j = 0; j < max_desc; j++)
 				rte_free(dpi_conf->c_desc.compl_ptr[j]);
@@ -71,39 +59,62 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 }

 static int
-cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
+cnxk_dmadev_chunk_pool_create(struct rte_dma_dev *dev, uint32_t nb_chunks, uint32_t chunk_sz)
 {
+	char pool_name[RTE_MEMPOOL_NAMESIZE];
 	struct cnxk_dpi_vf_s *dpivf = NULL;
-	int rc = 0;
-
-	RTE_SET_USED(conf_sz);
+	int rc;

 	dpivf = dev->fp_obj->dev_private;
+	/* Create chunk pool. */
+	snprintf(pool_name, sizeof(pool_name), "cnxk_dma_chunk_pool%d", dev->data->dev_id);

-	/* Accept only number of vchans as config from application. */
-	if (!(dpivf->flag & CNXK_DPI_DEV_START)) {
-		/* After config function, vchan setup function has to be called.
-		 * Free up vchan memory if any, before configuring num_vchans.
-		 */
-		cnxk_dmadev_vchan_free(dpivf, RTE_DMA_ALL_VCHAN);
-		dpivf->num_vchans = conf->nb_vchans;
+	nb_chunks += (CNXK_DPI_POOL_MAX_CACHE_SZ * rte_lcore_count());
+	dpivf->chunk_pool = rte_mempool_create_empty(
+		pool_name, nb_chunks, chunk_sz, CNXK_DPI_POOL_MAX_CACHE_SZ, 0, rte_socket_id(), 0);
+
+	if (dpivf->chunk_pool == NULL) {
+		plt_err("Unable to create chunkpool.");
+		return -ENOMEM;
 	}

-	if (dpivf->flag & CNXK_DPI_DEV_CONFIG)
-		return rc;
+	rc = rte_mempool_set_ops_byname(dpivf->chunk_pool, rte_mbuf_platform_mempool_ops(), NULL);
+	if (rc < 0) {
+		plt_err("Unable to set chunkpool ops");
+		goto free;
+	}

-	rc = roc_dpi_configure(&dpivf->rdpi);
+	rc = rte_mempool_populate_default(dpivf->chunk_pool);
 	if (rc < 0) {
-		plt_err("DMA configure failed err = %d", rc);
-		goto done;
+		plt_err("Unable to set populate chunkpool.");
+		goto free;
 	}
+	dpivf->aura = roc_npa_aura_handle_to_aura(dpivf->chunk_pool->pool_id);

-	dpivf->flag |= CNXK_DPI_DEV_CONFIG;
+	return 0;

-done:
+free:
+	rte_mempool_free(dpivf->chunk_pool);
 	return rc;
 }

+static int
+cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = NULL;
+
+	RTE_SET_USED(conf_sz);
+	dpivf = dev->fp_obj->dev_private;
+
+	/* After config function, vchan setup function has to be called.
+	 * Free up vchan memory if any, before configuring num_vchans.
+	 */
+	cnxk_dmadev_vchan_free(dpivf, RTE_DMA_ALL_VCHAN);
+	dpivf->num_vchans = conf->nb_vchans;
+
+	return 0;
+}
+
 static int
 cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
@@ -117,9 +128,6 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,

 	RTE_SET_USED(conf_sz);

-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;

 	switch (conf->direction) {
@@ -163,8 +171,8 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 	if (!rte_is_power_of_2(max_desc))
 		max_desc = rte_align32pow2(max_desc);

-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
+	if (max_desc > CNXK_DPI_MAX_DESC)
+		max_desc = CNXK_DPI_MAX_DESC;

 	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
 	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
@@ -182,7 +190,7 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			return -ENOMEM;
 		}

-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
+		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
 	}

 	dpi_conf->c_desc.max_cnt = (max_desc - 1);
@@ -203,9 +211,6 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,

 	RTE_SET_USED(conf_sz);

-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;

 	switch (conf->direction) {
@@ -249,8 +254,8 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 	if (!rte_is_power_of_2(max_desc))
 		max_desc = rte_align32pow2(max_desc);

-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
+	if (max_desc > CNXK_DPI_MAX_DESC)
+		max_desc = CNXK_DPI_MAX_DESC;

 	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
 	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
@@ -267,7 +272,8 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
+
+		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
 	}

 	dpi_conf->c_desc.max_cnt = (max_desc - 1);
@@ -280,10 +286,9 @@ cnxk_dmadev_start(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
 	struct cnxk_dpi_conf *dpi_conf;
-	int i, j;
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
+	uint32_t chunks, nb_desc = 0;
+	int i, j, rc = 0;
+	void *chunk;

 	for (i = 0; i < dpivf->num_vchans; i++) {
 		dpi_conf = &dpivf->conf[i];
@@ -292,29 +297,61 @@ cnxk_dmadev_start(struct rte_dma_dev *dev)
 		dpi_conf->pnum_words = 0;
 		dpi_conf->pending = 0;
 		dpi_conf->desc_idx = 0;
-		for (j = 0; j < dpi_conf->c_desc.max_cnt; j++) {
+		for (j = 0; j < dpi_conf->c_desc.max_cnt + 1; j++) {
 			if (dpi_conf->c_desc.compl_ptr[j])
-				dpi_conf->c_desc.compl_ptr[j]->cdata = DPI_REQ_CDATA;
+				dpi_conf->c_desc.compl_ptr[j]->cdata = CNXK_DPI_REQ_CDATA;
 		}
-
+		nb_desc += dpi_conf->c_desc.max_cnt + 1;
 		cnxk_stats_reset(dev, i);
 		dpi_conf->completed_offset = 0;
 	}

-	roc_dpi_enable(&dpivf->rdpi);
+	chunks = CNXK_DPI_CHUNKS_FROM_DESC(CNXK_DPI_QUEUE_BUF_SIZE, nb_desc);
+	rc = cnxk_dmadev_chunk_pool_create(dev, chunks, CNXK_DPI_QUEUE_BUF_SIZE);
+	if (rc < 0) {
+		plt_err("DMA pool configure failed err = %d", rc);
+		goto done;
+	}

-	dpivf->flag |= CNXK_DPI_DEV_START;
+	rc = rte_mempool_get(dpivf->chunk_pool, &chunk);
+	if (rc < 0) {
+		plt_err("DMA failed to get chunk pointer err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}

-	return 0;
+	rc = roc_dpi_configure(&dpivf->rdpi, CNXK_DPI_QUEUE_BUF_SIZE, dpivf->aura, (uint64_t)chunk);
+	if (rc < 0) {
+		plt_err("DMA configure failed err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}
+
+	dpivf->chunk_base = chunk;
+	dpivf->chunk_head = 0;
+	dpivf->chunk_size_m1 = (CNXK_DPI_QUEUE_BUF_SIZE >> 3) - 2;
+
+	roc_dpi_enable(&dpivf->rdpi);
+
+done:
+	return rc;
 }

 static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);

 	roc_dpi_disable(&dpivf->rdpi);
-	dpivf->flag &= ~CNXK_DPI_DEV_START;
+	rte_mempool_free(dpivf->chunk_pool);
+	dpivf->chunk_pool = NULL;
+	dpivf->chunk_base = NULL;
+	dpivf->chunk_size_m1 = 0;

 	return 0;
 }
@@ -335,7 +372,7 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 }

 static inline int
-__dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
+__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
 {
 	uint64_t *ptr = dpi->chunk_base;

@@ -346,31 +383,25 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 	 * Normally there is plenty of room in the current buffer for the
 	 * command
 	 */
-	if (dpi->chunk_head + cmd_count < dpi->pool_size_m1) {
+	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
 		ptr += dpi->chunk_head;
 		dpi->chunk_head += cmd_count;
 		while (cmd_count--)
 			*ptr++ = *cmds++;
 	} else {
+		uint64_t *new_buff = NULL;
 		int count;
-		uint64_t *new_buff = dpi->chunk_next;
-
-		dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-		if (!dpi->chunk_next) {
-			plt_dp_dbg("Failed to alloc next buffer from NPA");

-			/* NPA failed to allocate a buffer. Restoring chunk_next
-			 * to its original address.
-			 */
-			dpi->chunk_next = new_buff;
-			return -ENOSPC;
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOMEM;
 		}

 		/*
 		 * Figure out how many cmd words will fit in this buffer.
 		 * One location will be needed for the next buffer pointer.
 		 */
-		count = dpi->pool_size_m1 - dpi->chunk_head;
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
 		ptr += dpi->chunk_head;
 		cmd_count -= count;
 		while (count--)
@@ -395,17 +426,10 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 			*ptr++ = *cmds++;

 		/* queue index may be greater than pool size */
-		if (dpi->chunk_head >= dpi->pool_size_m1) {
-			new_buff = dpi->chunk_next;
-			dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-			if (!dpi->chunk_next) {
-				plt_dp_dbg("Failed to alloc next buffer from NPA");
-
-				/* NPA failed to allocate a buffer. Restoring chunk_next
-				 * to its original address.
-				 */
-				dpi->chunk_next = new_buff;
-				return -ENOSPC;
+		if (dpi->chunk_head == dpi->chunk_size_m1) {
+			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+				plt_dpi_dbg("Failed to alloc next buffer from NPA");
+				return -ENOMEM;
 			}

 			/* Write next buffer address */
@@ -433,10 +457,10 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -465,9 +489,9 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -498,10 +522,10 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -510,13 +534,13 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 	 * For all other cases, src pointers are first pointers.
 	 */
 	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
+		header->cn9k.nfst = nb_dst & CNXK_DPI_MAX_POINTER;
+		header->cn9k.nlst = nb_src & CNXK_DPI_MAX_POINTER;
 		fptr = &dst[0];
 		lptr = &src[0];
 	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
+		header->cn9k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
+		header->cn9k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
 		fptr = &src[0];
 		lptr = &dst[0];
 	}
@@ -537,9 +561,9 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -570,10 +594,10 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -593,9 +617,9 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -627,15 +651,15 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
+	header->cn10k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
+	header->cn10k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
 	fptr = &src[0];
 	lptr = &dst[0];

@@ -656,9 +680,9 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -688,16 +712,16 @@ cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls,
 		comp_ptr = c_desc->compl_ptr[c_desc->head];

 		if (comp_ptr->cdata) {
-			if (comp_ptr->cdata == DPI_REQ_CDATA)
+			if (comp_ptr->cdata == CNXK_DPI_REQ_CDATA)
 				break;
 			*has_error = 1;
 			dpi_conf->stats.errors++;
-			STRM_INC(*c_desc, head);
+			CNXK_DPI_STRM_INC(*c_desc, head);
 			break;
 		}

-		comp_ptr->cdata = DPI_REQ_CDATA;
-		STRM_INC(*c_desc, head);
+		comp_ptr->cdata = CNXK_DPI_REQ_CDATA;
+		CNXK_DPI_STRM_INC(*c_desc, head);
 	}

 	dpi_conf->stats.completed += cnt;
@@ -720,13 +744,13 @@ cnxk_dmadev_completed_status(void *dev_private, uint16_t vchan, const uint16_t n
 		comp_ptr = c_desc->compl_ptr[c_desc->head];
 		status[cnt] = comp_ptr->cdata;
 		if (status[cnt]) {
-			if (status[cnt] == DPI_REQ_CDATA)
+			if (status[cnt] == CNXK_DPI_REQ_CDATA)
 				break;

 			dpi_conf->stats.errors++;
 		}
-		comp_ptr->cdata = DPI_REQ_CDATA;
-		STRM_INC(*c_desc, head);
+		comp_ptr->cdata = CNXK_DPI_REQ_CDATA;
+		CNXK_DPI_STRM_INC(*c_desc, head);
 	}

 	dpi_conf->stats.completed += cnt;
@@ -794,7 +818,7 @@ cnxk_stats_get(const struct rte_dma_dev *dev, uint16_t vchan, struct rte_dma_sta
 		goto done;
 	}

-	if (vchan >= MAX_VCHANS_PER_QUEUE)
+	if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 		return -EINVAL;

 	dpi_conf = &dpivf->conf[vchan];
@@ -822,7 +846,7 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 		return 0;
 	}

-	if (vchan >= MAX_VCHANS_PER_QUEUE)
+	if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 		return -EINVAL;

 	dpi_conf = &dpivf->conf[vchan];
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 254e7fea20..d691f5fba2 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,22 +4,38 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H

+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>

-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define CNXK_DPI_MAX_POINTER		    15
+#define CNXK_DPI_STRM_INC(s, var)	    ((s).var = ((s).var + 1) & (s).max_cnt)
+#define CNXK_DPI_STRM_DEC(s, var)	    ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
+#define CNXK_DPI_MAX_DESC		    32768
+#define CNXK_DPI_MIN_DESC		    2
+#define CNXK_DPI_MAX_VCHANS_PER_QUEUE	    4
+#define CNXK_DPI_QUEUE_BUF_SIZE		    16256
+#define CNXK_DPI_POOL_MAX_CACHE_SZ	    (16)
+#define CNXK_DPI_HDR_LEN		    4
+#define CNXK_DPI_CMD_LEN(src, dst)	    (CNXK_DPI_HDR_LEN + (src << 1) + (dst << 1))
+#define CNXK_DPI_MAX_CMD_SZ		    CNXK_DPI_CMD_LEN(CNXK_DPI_MAX_POINTER, CNXK_DPI_MAX_POINTER)
+#define CNXK_DPI_CHUNKS_FROM_DESC(cz, desc) (desc / ((cz / 8) / CNXK_DPI_MAX_CMD_SZ)) + 1

 /* Set Completion data to 0xFF when request submitted,
  * upon successful request completion engine reset to completion status
  */
-#define DPI_REQ_CDATA 0xFF
-
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_REQ_CDATA 0xFF

 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -45,8 +61,15 @@ struct cnxk_dpi_conf {
 };

 struct cnxk_dpi_vf_s {
+	/* Fast path */
+	uint64_t *chunk_base;
+	uint16_t chunk_head;
+	uint16_t chunk_size_m1;
+	struct rte_mempool *chunk_pool;
+	struct cnxk_dpi_conf conf[CNXK_DPI_MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
-	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
 } __plt_cache_aligned;
--
2.25.1


* [PATCH v7 2/2] dma/cnxk: rewrite DMA fastpath
  2023-09-09 16:37           ` [PATCH v7 1/2] dma/cnxk: rework DMA driver pbhagavatula
@ 2023-09-09 16:37             ` pbhagavatula
  2023-09-20  4:17               ` Jerin Jacob
  2023-09-09 16:54             ` [PATCH v8 1/2] dma/cnxk: rework DMA driver pbhagavatula
  1 sibling, 1 reply; 18+ messages in thread
From: pbhagavatula @ 2023-09-09 16:37 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the configuration.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
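For reference, the core NEON primitive the rewrite is built around
reduces to this minimal sketch (equivalent to __dpi_cpy_vector() in
the new cnxk_dmadev_fp.c below; the standalone function name is
illustrative, and n is assumed even since DPI command words are copied
in pairs). On non-arm64 builds the driver falls back to the plain
scalar word-copy loop (__dpi_cpy_scalar()).

#include <stdint.h>
#include <arm_neon.h>

static inline void
dpi_cpy_vector(const uint64_t *src, uint64_t *dst, uint8_t n)
{
	uint8_t i;

	/* Copy two 64-bit command words per iteration with 128-bit
	 * NEON loads/stores.
	 */
	for (i = 0; i < n; i += 2) {
		uint64x2_t vec = vld1q_u64(&src[i]);
		vst1q_u64(&dst[i], vec);
	}
}
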
 drivers/dma/cnxk/cnxk_dmadev.c    | 428 ++---------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  59 +++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 436 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   9 +-
 4 files changed, 528 insertions(+), 404 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index f58bb92dbc..26680edfde 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -115,19 +115,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return 0;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -163,54 +153,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > CNXK_DPI_MAX_DESC)
-		max_desc = CNXK_DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -246,6 +193,27 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -371,333 +339,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & CNXK_DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & CNXK_DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -856,17 +497,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -917,12 +547,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index d691f5fba2..6479434de9 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -27,6 +27,7 @@
 #define CNXK_DPI_MAX_VCHANS_PER_QUEUE	    4
 #define CNXK_DPI_QUEUE_BUF_SIZE		    16256
 #define CNXK_DPI_POOL_MAX_CACHE_SZ	    (16)
+#define CNXK_DPI_DW_PER_SINGLE_CMD	    8
 #define CNXK_DPI_HDR_LEN		    4
 #define CNXK_DPI_CMD_LEN(src, dst)	    (CNXK_DPI_HDR_LEN + (src << 1) + (dst << 1))
 #define CNXK_DPI_MAX_CMD_SZ		    CNXK_DPI_CMD_LEN(CNXK_DPI_MAX_POINTER, CNXK_DPI_MAX_POINTER)
@@ -37,6 +38,49 @@
  */
 #define CNXK_DPI_REQ_CDATA 0xFF
 
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
+
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
 	void *cb_data;
@@ -50,12 +94,11 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
@@ -72,6 +115,18 @@ struct cnxk_dpi_vf_s {
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..16d7b5426b
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,436 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#else
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/* Check if command fits in the current chunk. */
+	if (dpi->chunk_head + CNXK_DPI_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, CNXK_DPI_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += CNXK_DPI_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in the current chunk
+		 * and copy them.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		/* Copy the remaining cmd words to new chunk. */
+		__dpi_cpy_scalar(cmd + count, ptr, CNXK_DPI_DW_PER_SINGLE_CMD - count);
+
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = CNXK_DPI_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/* Check if command fits in the current chunk. */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, CNXK_DPI_HDR_LEN);
+		ptr += CNXK_DPI_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in the current chunk
+		 * and copy them, copy the rest to the new buffer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[CNXK_DPI_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = (1UL << 54) | (1UL << 48);
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
+	hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[CNXK_DPI_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..e557349368 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -1,6 +1,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(C) 2021 Marvell International Ltd.
 
+error_cflags = ['-Wno-uninitialized']
+foreach flag: error_cflags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v8 1/2] dma/cnxk: rework DMA driver
  2023-09-09 16:37           ` [PATCH v7 1/2] dma/cnxk: rework DMA driver pbhagavatula
  2023-09-09 16:37             ` [PATCH v7 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-09-09 16:54             ` pbhagavatula
  2023-09-09 16:54               ` [PATCH v8 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
  1 sibling, 1 reply; 18+ messages in thread
From: pbhagavatula @ 2023-09-09 16:54 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Vamsi Attunuru
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

To make use of the mempool cache, switch the DMA chunk pool over to
rte_mempool. Move mempool creation to device start so that the number
of chunks allocated is derived from the total number of descriptors
configured across all vchans.
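
For reference, a condensed standalone sketch of the pool-creation flow
this moves into device start (illustrative only; the helper name here
is hypothetical, the real code is cnxk_dmadev_chunk_pool_create() in
the diff below):

  #include <stdio.h>
  #include <stdint.h>

  #include <rte_lcore.h>
  #include <rte_mbuf_pool_ops.h>
  #include <rte_mempool.h>

  /* Create an empty mempool for DMA command chunks, bind it to the
   * platform (NPA-backed) mempool ops and populate it.
   */
  static struct rte_mempool *
  dma_chunk_pool_create(int dev_id, uint32_t nb_chunks, uint32_t chunk_sz)
  {
  	char name[RTE_MEMPOOL_NAMESIZE];
  	struct rte_mempool *mp;

  	snprintf(name, sizeof(name), "cnxk_dma_chunk_pool%d", dev_id);
  	/* Add headroom so per-lcore caches cannot starve the queue. */
  	nb_chunks += 16 * rte_lcore_count();
  	mp = rte_mempool_create_empty(name, nb_chunks, chunk_sz, 16, 0,
  				      rte_socket_id(), 0);
  	if (mp == NULL)
  		return NULL;
  	if (rte_mempool_set_ops_byname(mp, rte_mbuf_platform_mempool_ops(),
  				       NULL) < 0 ||
  	    rte_mempool_populate_default(mp) < 0) {
  		rte_mempool_free(mp);
  		return NULL;
  	}
  	return mp;
  }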

Remove the unnecessary state-tracking flags, as the library already
handles this, and add the `CNXK` prefix to driver macros.

Convert the log register macro for all cnxk drivers to
RTE_LOG_REGISTER_DEFAULT.
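
For reference, each conversion in roc_platform.c has the shape shown
below (pair taken from the diff that follows); the explicit dynamic
log name is dropped and derived from the component instead:

  /* Before: dynamic log name spelled out per component. */
  RTE_LOG_REGISTER(cnxk_logtype_npa, pmd.mempool.cnxk, NOTICE);

  /* After: default name derived automatically. */
  RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npa, NOTICE);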

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
Depends-on: 29324

 v8 Changes:
 - Actually include the fixes to checkpatch.
 v7 Changes:
 - Fix checkpatch warnings.
 v6 Changes:
 - Rework device configuration and start logic.
 - Add CNXK prefix to driver macros.
 v5 Changes:
 - Use RTE_LOG_REGISTER_DEFAULT for registering logging.
 v4 Changes:
 - Fix clang build.
 v3 Changes:
 - Fix build.

 drivers/common/cnxk/roc_dpi.c      |  90 ++---------
 drivers/common/cnxk/roc_dpi.h      |  28 +---
 drivers/common/cnxk/roc_dpi_priv.h |   3 -
 drivers/common/cnxk/roc_platform.c |  21 +--
 drivers/common/cnxk/roc_platform.h |   2 +
 drivers/common/cnxk/version.map    |   1 +
 drivers/dma/cnxk/cnxk_dmadev.c     | 252 ++++++++++++++++-------------
 drivers/dma/cnxk/cnxk_dmadev.h     |  47 ++++--
 8 files changed, 205 insertions(+), 239 deletions(-)

diff --git a/drivers/common/cnxk/roc_dpi.c b/drivers/common/cnxk/roc_dpi.c
index 0e2f803077..c241168294 100644
--- a/drivers/common/cnxk/roc_dpi.c
+++ b/drivers/common/cnxk/roc_dpi.c
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(C) 2021 Marvell.
  */
+
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -52,17 +53,12 @@ roc_dpi_disable(struct roc_dpi *dpi)
 }

 int
-roc_dpi_configure(struct roc_dpi *roc_dpi)
+roc_dpi_configure(struct roc_dpi *roc_dpi, uint32_t chunk_sz, uint64_t aura, uint64_t chunk_base)
 {
 	struct plt_pci_device *pci_dev;
-	const struct plt_memzone *dpi_mz;
 	dpi_mbox_msg_t mbox_msg;
-	struct npa_pool_s pool;
-	struct npa_aura_s aura;
-	int rc, count, buflen;
-	uint64_t aura_handle;
-	plt_iova_t iova;
-	char name[32];
+	uint64_t reg;
+	int rc;

 	if (!roc_dpi) {
 		plt_err("roc_dpi is NULL");
@@ -70,79 +66,30 @@ roc_dpi_configure(struct roc_dpi *roc_dpi)
 	}

 	pci_dev = roc_dpi->pci_dev;
-	memset(&pool, 0, sizeof(struct npa_pool_s));
-	pool.nat_align = 1;
-
-	memset(&aura, 0, sizeof(aura));
-	rc = roc_npa_pool_create(&aura_handle, DPI_CMD_QUEUE_SIZE,
-				 DPI_CMD_QUEUE_BUFS, &aura, &pool, 0);
-	if (rc) {
-		plt_err("Failed to create NPA pool, err %d\n", rc);
-		return rc;
-	}
-
-	snprintf(name, sizeof(name), "dpimem%d:%d:%d:%d", pci_dev->addr.domain, pci_dev->addr.bus,
-		 pci_dev->addr.devid, pci_dev->addr.function);
-	buflen = DPI_CMD_QUEUE_SIZE * DPI_CMD_QUEUE_BUFS;
-	dpi_mz = plt_memzone_reserve_aligned(name, buflen, 0, DPI_CMD_QUEUE_SIZE);
-	if (dpi_mz == NULL) {
-		plt_err("dpi memzone reserve failed");
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	roc_dpi->mz = dpi_mz;
-	iova = dpi_mz->iova;
-	for (count = 0; count < DPI_CMD_QUEUE_BUFS; count++) {
-		roc_npa_aura_op_free(aura_handle, 0, iova);
-		iova += DPI_CMD_QUEUE_SIZE;
-	}
-
-	roc_dpi->chunk_base = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_base) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}
-
-	roc_dpi->chunk_next = (void *)roc_npa_aura_op_alloc(aura_handle, 0);
-	if (!roc_dpi->chunk_next) {
-		plt_err("Failed to alloc buffer from NPA aura");
-		rc = -ENOMEM;
-		goto err2;
-	}

-	roc_dpi->aura_handle = aura_handle;
-	/* subtract 2 as they have already been alloc'ed above */
-	roc_dpi->pool_size_m1 = (DPI_CMD_QUEUE_SIZE >> 3) - 2;
+	roc_dpi_disable(roc_dpi);
+	reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(roc_dpi->rbase + DPI_VDMA_SADDR);

 	plt_write64(0x0, roc_dpi->rbase + DPI_VDMA_REQQ_CTL);
-	plt_write64(((uint64_t)(roc_dpi->chunk_base) >> 7) << 7,
-		    roc_dpi->rbase + DPI_VDMA_SADDR);
+	plt_write64(chunk_base, roc_dpi->rbase + DPI_VDMA_SADDR);
 	mbox_msg.u[0] = 0;
 	mbox_msg.u[1] = 0;
 	/* DPI PF driver expects vfid starts from index 0 */
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_OPEN;
-	mbox_msg.s.csize = DPI_CMD_QUEUE_SIZE;
-	mbox_msg.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	mbox_msg.s.csize = chunk_sz;
+	mbox_msg.s.aura = aura;
 	mbox_msg.s.sso_pf_func = idev_sso_pffunc_get();
 	mbox_msg.s.npa_pf_func = idev_npa_pffunc_get();

 	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
 			    sizeof(dpi_mbox_msg_t));
-	if (rc < 0) {
+	if (rc < 0)
 		plt_err("Failed to send mbox message %d to DPI PF, err %d",
 			mbox_msg.s.cmd, rc);
-		goto err2;
-	}
-
-	return rc;

-err2:
-	plt_memzone_free(dpi_mz);
-err1:
-	roc_npa_pool_destroy(aura_handle);
 	return rc;
 }

@@ -153,11 +100,9 @@ roc_dpi_dev_init(struct roc_dpi *roc_dpi)
 	uint16_t vfid;

 	roc_dpi->rbase = pci_dev->mem_resource[0].addr;
-	vfid = ((pci_dev->addr.devid & 0x1F) << 3) |
-	       (pci_dev->addr.function & 0x7);
+	vfid = ((pci_dev->addr.devid & 0x1F) << 3) | (pci_dev->addr.function & 0x7);
 	vfid -= 1;
 	roc_dpi->vfid = vfid;
-	plt_spinlock_init(&roc_dpi->chunk_lock);

 	return 0;
 }
@@ -180,14 +125,9 @@ roc_dpi_dev_fini(struct roc_dpi *roc_dpi)
 	mbox_msg.s.vfid = roc_dpi->vfid;
 	mbox_msg.s.cmd = DPI_QUEUE_CLOSE;

-	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg,
-			    sizeof(dpi_mbox_msg_t));
+	rc = send_msg_to_pf(&pci_dev->addr, (const char *)&mbox_msg, sizeof(dpi_mbox_msg_t));
 	if (rc < 0)
-		plt_err("Failed to send mbox message %d to DPI PF, err %d",
-			mbox_msg.s.cmd, rc);
-
-	roc_npa_pool_destroy(roc_dpi->aura_handle);
-	plt_memzone_free(roc_dpi->mz);
+		plt_err("Failed to send mbox message %d to DPI PF, err %d", mbox_msg.s.cmd, rc);

 	return rc;
 }
diff --git a/drivers/common/cnxk/roc_dpi.h b/drivers/common/cnxk/roc_dpi.h
index 2f061b07c5..4ebde5b8a6 100644
--- a/drivers/common/cnxk/roc_dpi.h
+++ b/drivers/common/cnxk/roc_dpi.h
@@ -5,41 +5,17 @@
 #ifndef _ROC_DPI_H_
 #define _ROC_DPI_H_

-struct roc_dpi_args {
-	uint8_t num_ssegs;
-	uint8_t num_dsegs;
-	uint8_t comp_type;
-	uint8_t direction;
-	uint8_t sdevice;
-	uint8_t ddevice;
-	uint8_t swap;
-	uint8_t use_lock : 1;
-	uint8_t tt : 7;
-	uint16_t func;
-	uint16_t grp;
-	uint32_t tag;
-	uint64_t comp_ptr;
-};
-
 struct roc_dpi {
-	/* Input parameters */
 	struct plt_pci_device *pci_dev;
-	/* End of Input parameters */
-	const struct plt_memzone *mz;
 	uint8_t *rbase;
 	uint16_t vfid;
-	uint16_t pool_size_m1;
-	uint16_t chunk_head;
-	uint64_t *chunk_base;
-	uint64_t *chunk_next;
-	uint64_t aura_handle;
-	plt_spinlock_t chunk_lock;
 } __plt_cache_aligned;

 int __roc_api roc_dpi_dev_init(struct roc_dpi *roc_dpi);
 int __roc_api roc_dpi_dev_fini(struct roc_dpi *roc_dpi);

-int __roc_api roc_dpi_configure(struct roc_dpi *dpi);
+int __roc_api roc_dpi_configure(struct roc_dpi *dpi, uint32_t chunk_sz, uint64_t aura,
+				uint64_t chunk_base);
 int __roc_api roc_dpi_enable(struct roc_dpi *dpi);
 int __roc_api roc_dpi_disable(struct roc_dpi *dpi);

diff --git a/drivers/common/cnxk/roc_dpi_priv.h b/drivers/common/cnxk/roc_dpi_priv.h
index 1fa1a715d3..518a3e7351 100644
--- a/drivers/common/cnxk/roc_dpi_priv.h
+++ b/drivers/common/cnxk/roc_dpi_priv.h
@@ -16,9 +16,6 @@
 #define DPI_REG_DUMP	0x3
 #define DPI_GET_REG_CFG 0x4

-#define DPI_CMD_QUEUE_SIZE 4096
-#define DPI_CMD_QUEUE_BUFS 1024
-
 typedef union dpi_mbox_msg_t {
 	uint64_t u[2];
 	struct dpi_mbox_message_s {
diff --git a/drivers/common/cnxk/roc_platform.c b/drivers/common/cnxk/roc_platform.c
index f91b95ceab..a8a83a3723 100644
--- a/drivers/common/cnxk/roc_platform.c
+++ b/drivers/common/cnxk/roc_platform.c
@@ -60,14 +60,15 @@ roc_plt_init(void)
 	return 0;
 }

-RTE_LOG_REGISTER(cnxk_logtype_base, pmd.cnxk.base, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_mbox, pmd.cnxk.mbox, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_cpt, pmd.crypto.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_ml, pmd.ml.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_npa, pmd.mempool.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_nix, pmd.net.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_npc, pmd.net.cnxk.flow, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_sso, pmd.event.cnxk, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_tim, pmd.event.cnxk.timer, NOTICE);
-RTE_LOG_REGISTER(cnxk_logtype_tm, pmd.net.cnxk.tm, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_base, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_mbox, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_cpt, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ml, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npa, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_nix, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_npc, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_sso, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_tim, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_tm, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_dpi, NOTICE);
 RTE_LOG_REGISTER_DEFAULT(cnxk_logtype_ree, NOTICE);
diff --git a/drivers/common/cnxk/roc_platform.h b/drivers/common/cnxk/roc_platform.h
index 08f83aba12..dfd4da21b6 100644
--- a/drivers/common/cnxk/roc_platform.h
+++ b/drivers/common/cnxk/roc_platform.h
@@ -242,6 +242,7 @@ extern int cnxk_logtype_sso;
 extern int cnxk_logtype_tim;
 extern int cnxk_logtype_tm;
 extern int cnxk_logtype_ree;
+extern int cnxk_logtype_dpi;

 #define plt_err(fmt, args...)                                                  \
 	RTE_LOG(ERR, PMD, "%s():%u " fmt "\n", __func__, __LINE__, ##args)
@@ -270,6 +271,7 @@ extern int cnxk_logtype_ree;
 #define plt_tim_dbg(fmt, ...)	plt_dbg(tim, fmt, ##__VA_ARGS__)
 #define plt_tm_dbg(fmt, ...)	plt_dbg(tm, fmt, ##__VA_ARGS__)
 #define plt_ree_dbg(fmt, ...)	plt_dbg(ree, fmt, ##__VA_ARGS__)
+#define plt_dpi_dbg(fmt, ...)	plt_dbg(dpi, fmt, ##__VA_ARGS__)

 /* Datapath logs */
 #define plt_dp_err(fmt, args...)                                               \
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8c71497df8..1540dfadf9 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -7,6 +7,7 @@ INTERNAL {
 	cnxk_ipsec_outb_roundup_byte;
 	cnxk_logtype_base;
 	cnxk_logtype_cpt;
+	cnxk_logtype_dpi;
 	cnxk_logtype_mbox;
 	cnxk_logtype_ml;
 	cnxk_logtype_nix;
diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index eec6a897e2..f58bb92dbc 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,18 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */

-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>

 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -24,14 +12,14 @@ cnxk_dmadev_info_get(const struct rte_dma_dev *dev, struct rte_dma_info *dev_inf
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
 	RTE_SET_USED(size);

-	dev_info->max_vchans = MAX_VCHANS_PER_QUEUE;
+	dev_info->max_vchans = CNXK_DPI_MAX_VCHANS_PER_QUEUE;
 	dev_info->nb_vchans = dpivf->num_vchans;
 	dev_info->dev_capa = RTE_DMA_CAPA_MEM_TO_MEM | RTE_DMA_CAPA_MEM_TO_DEV |
 			     RTE_DMA_CAPA_DEV_TO_MEM | RTE_DMA_CAPA_DEV_TO_DEV |
 			     RTE_DMA_CAPA_OPS_COPY | RTE_DMA_CAPA_OPS_COPY_SG;
-	dev_info->max_desc = DPI_MAX_DESC;
-	dev_info->min_desc = DPI_MIN_DESC;
-	dev_info->max_sges = DPI_MAX_POINTER;
+	dev_info->max_desc = CNXK_DPI_MAX_DESC;
+	dev_info->min_desc = CNXK_DPI_MIN_DESC;
+	dev_info->max_sges = CNXK_DPI_MAX_POINTER;

 	return 0;
 }
@@ -48,7 +36,7 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 		num_vchans = dpivf->num_vchans;
 		i = 0;
 	} else {
-		if (vchan >= MAX_VCHANS_PER_QUEUE)
+		if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 			return -EINVAL;

 		num_vchans = vchan + 1;
@@ -57,7 +45,7 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)

 	for (; i < num_vchans; i++) {
 		dpi_conf = &dpivf->conf[i];
-		max_desc = dpi_conf->c_desc.max_cnt;
+		max_desc = dpi_conf->c_desc.max_cnt + 1;
 		if (dpi_conf->c_desc.compl_ptr) {
 			for (j = 0; j < max_desc; j++)
 				rte_free(dpi_conf->c_desc.compl_ptr[j]);
@@ -71,39 +59,62 @@ cnxk_dmadev_vchan_free(struct cnxk_dpi_vf_s *dpivf, uint16_t vchan)
 }

 static int
-cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
+cnxk_dmadev_chunk_pool_create(struct rte_dma_dev *dev, uint32_t nb_chunks, uint32_t chunk_sz)
 {
+	char pool_name[RTE_MEMPOOL_NAMESIZE];
 	struct cnxk_dpi_vf_s *dpivf = NULL;
-	int rc = 0;
-
-	RTE_SET_USED(conf_sz);
+	int rc;

 	dpivf = dev->fp_obj->dev_private;
+	/* Create chunk pool. */
+	snprintf(pool_name, sizeof(pool_name), "cnxk_dma_chunk_pool%d", dev->data->dev_id);

-	/* Accept only number of vchans as config from application. */
-	if (!(dpivf->flag & CNXK_DPI_DEV_START)) {
-		/* After config function, vchan setup function has to be called.
-		 * Free up vchan memory if any, before configuring num_vchans.
-		 */
-		cnxk_dmadev_vchan_free(dpivf, RTE_DMA_ALL_VCHAN);
-		dpivf->num_vchans = conf->nb_vchans;
+	nb_chunks += (CNXK_DPI_POOL_MAX_CACHE_SZ * rte_lcore_count());
+	dpivf->chunk_pool = rte_mempool_create_empty(
+		pool_name, nb_chunks, chunk_sz, CNXK_DPI_POOL_MAX_CACHE_SZ, 0, rte_socket_id(), 0);
+
+	if (dpivf->chunk_pool == NULL) {
+		plt_err("Unable to create chunkpool.");
+		return -ENOMEM;
 	}

-	if (dpivf->flag & CNXK_DPI_DEV_CONFIG)
-		return rc;
+	rc = rte_mempool_set_ops_byname(dpivf->chunk_pool, rte_mbuf_platform_mempool_ops(), NULL);
+	if (rc < 0) {
+		plt_err("Unable to set chunkpool ops");
+		goto free;
+	}

-	rc = roc_dpi_configure(&dpivf->rdpi);
+	rc = rte_mempool_populate_default(dpivf->chunk_pool);
 	if (rc < 0) {
-		plt_err("DMA configure failed err = %d", rc);
-		goto done;
+		plt_err("Unable to set populate chunkpool.");
+		goto free;
 	}
+	dpivf->aura = roc_npa_aura_handle_to_aura(dpivf->chunk_pool->pool_id);

-	dpivf->flag |= CNXK_DPI_DEV_CONFIG;
+	return 0;

-done:
+free:
+	rte_mempool_free(dpivf->chunk_pool);
 	return rc;
 }

+static int
+cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = NULL;
+
+	RTE_SET_USED(conf_sz);
+	dpivf = dev->fp_obj->dev_private;
+
+	/* After config function, vchan setup function has to be called.
+	 * Free up vchan memory if any, before configuring num_vchans.
+	 */
+	cnxk_dmadev_vchan_free(dpivf, RTE_DMA_ALL_VCHAN);
+	dpivf->num_vchans = conf->nb_vchans;
+
+	return 0;
+}
+
 static int
 cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
@@ -117,9 +128,6 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,

 	RTE_SET_USED(conf_sz);

-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;

 	switch (conf->direction) {
@@ -163,8 +171,8 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 	if (!rte_is_power_of_2(max_desc))
 		max_desc = rte_align32pow2(max_desc);

-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
+	if (max_desc > CNXK_DPI_MAX_DESC)
+		max_desc = CNXK_DPI_MAX_DESC;

 	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
 	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
@@ -182,7 +190,7 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			return -ENOMEM;
 		}

-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
+		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
 	}

 	dpi_conf->c_desc.max_cnt = (max_desc - 1);
@@ -203,9 +211,6 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,

 	RTE_SET_USED(conf_sz);

-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;

 	switch (conf->direction) {
@@ -249,8 +254,8 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 	if (!rte_is_power_of_2(max_desc))
 		max_desc = rte_align32pow2(max_desc);

-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
+	if (max_desc > CNXK_DPI_MAX_DESC)
+		max_desc = CNXK_DPI_MAX_DESC;

 	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
 	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
@@ -267,7 +272,8 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
+
+		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
 	}

 	dpi_conf->c_desc.max_cnt = (max_desc - 1);
@@ -280,10 +286,9 @@ cnxk_dmadev_start(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
 	struct cnxk_dpi_conf *dpi_conf;
-	int i, j;
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
+	uint32_t chunks, nb_desc = 0;
+	int i, j, rc = 0;
+	void *chunk;

 	for (i = 0; i < dpivf->num_vchans; i++) {
 		dpi_conf = &dpivf->conf[i];
@@ -292,29 +297,61 @@ cnxk_dmadev_start(struct rte_dma_dev *dev)
 		dpi_conf->pnum_words = 0;
 		dpi_conf->pending = 0;
 		dpi_conf->desc_idx = 0;
-		for (j = 0; j < dpi_conf->c_desc.max_cnt; j++) {
+		for (j = 0; j < dpi_conf->c_desc.max_cnt + 1; j++) {
 			if (dpi_conf->c_desc.compl_ptr[j])
-				dpi_conf->c_desc.compl_ptr[j]->cdata = DPI_REQ_CDATA;
+				dpi_conf->c_desc.compl_ptr[j]->cdata = CNXK_DPI_REQ_CDATA;
 		}
-
+		nb_desc += dpi_conf->c_desc.max_cnt + 1;
 		cnxk_stats_reset(dev, i);
 		dpi_conf->completed_offset = 0;
 	}

-	roc_dpi_enable(&dpivf->rdpi);
+	chunks = CNXK_DPI_CHUNKS_FROM_DESC(CNXK_DPI_QUEUE_BUF_SIZE, nb_desc);
+	rc = cnxk_dmadev_chunk_pool_create(dev, chunks, CNXK_DPI_QUEUE_BUF_SIZE);
+	if (rc < 0) {
+		plt_err("DMA pool configure failed err = %d", rc);
+		goto done;
+	}

-	dpivf->flag |= CNXK_DPI_DEV_START;
+	rc = rte_mempool_get(dpivf->chunk_pool, &chunk);
+	if (rc < 0) {
+		plt_err("DMA failed to get chunk pointer err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}

-	return 0;
+	rc = roc_dpi_configure(&dpivf->rdpi, CNXK_DPI_QUEUE_BUF_SIZE, dpivf->aura, (uint64_t)chunk);
+	if (rc < 0) {
+		plt_err("DMA configure failed err = %d", rc);
+		rte_mempool_free(dpivf->chunk_pool);
+		goto done;
+	}
+
+	dpivf->chunk_base = chunk;
+	dpivf->chunk_head = 0;
+	dpivf->chunk_size_m1 = (CNXK_DPI_QUEUE_BUF_SIZE >> 3) - 2;
+
+	roc_dpi_enable(&dpivf->rdpi);
+
+done:
+	return rc;
 }

 static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);

 	roc_dpi_disable(&dpivf->rdpi);
-	dpivf->flag &= ~CNXK_DPI_DEV_START;
+	rte_mempool_free(dpivf->chunk_pool);
+	dpivf->chunk_pool = NULL;
+	dpivf->chunk_base = NULL;
+	dpivf->chunk_size_m1 = 0;

 	return 0;
 }
@@ -335,7 +372,7 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 }

 static inline int
-__dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
+__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
 {
 	uint64_t *ptr = dpi->chunk_base;

@@ -346,31 +383,25 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 	 * Normally there is plenty of room in the current buffer for the
 	 * command
 	 */
-	if (dpi->chunk_head + cmd_count < dpi->pool_size_m1) {
+	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
 		ptr += dpi->chunk_head;
 		dpi->chunk_head += cmd_count;
 		while (cmd_count--)
 			*ptr++ = *cmds++;
 	} else {
+		uint64_t *new_buff = NULL;
 		int count;
-		uint64_t *new_buff = dpi->chunk_next;
-
-		dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-		if (!dpi->chunk_next) {
-			plt_dp_dbg("Failed to alloc next buffer from NPA");

-			/* NPA failed to allocate a buffer. Restoring chunk_next
-			 * to its original address.
-			 */
-			dpi->chunk_next = new_buff;
-			return -ENOSPC;
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOMEM;
 		}

 		/*
 		 * Figure out how many cmd words will fit in this buffer.
 		 * One location will be needed for the next buffer pointer.
 		 */
-		count = dpi->pool_size_m1 - dpi->chunk_head;
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
 		ptr += dpi->chunk_head;
 		cmd_count -= count;
 		while (count--)
@@ -395,17 +426,10 @@ __dpi_queue_write(struct roc_dpi *dpi, uint64_t *cmds, int cmd_count)
 			*ptr++ = *cmds++;

 		/* queue index may be greater than pool size */
-		if (dpi->chunk_head >= dpi->pool_size_m1) {
-			new_buff = dpi->chunk_next;
-			dpi->chunk_next = (void *)roc_npa_aura_op_alloc(dpi->aura_handle, 0);
-			if (!dpi->chunk_next) {
-				plt_dp_dbg("Failed to alloc next buffer from NPA");
-
-				/* NPA failed to allocate a buffer. Restoring chunk_next
-				 * to its original address.
-				 */
-				dpi->chunk_next = new_buff;
-				return -ENOSPC;
+		if (dpi->chunk_head == dpi->chunk_size_m1) {
+			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+				plt_dpi_dbg("Failed to alloc next buffer from NPA");
+				return -ENOMEM;
 			}

 			/* Write next buffer address */
@@ -433,10 +457,10 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -465,9 +489,9 @@ cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t d
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -498,10 +522,10 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -510,13 +534,13 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 	 * For all other cases, src pointers are first pointers.
 	 */
 	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
+		header->cn9k.nfst = nb_dst & CNXK_DPI_MAX_POINTER;
+		header->cn9k.nlst = nb_src & CNXK_DPI_MAX_POINTER;
 		fptr = &dst[0];
 		lptr = &src[0];
 	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
+		header->cn9k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
+		header->cn9k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
 		fptr = &src[0];
 		lptr = &dst[0];
 	}
@@ -537,9 +561,9 @@ cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -570,10 +594,10 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

@@ -593,9 +617,9 @@ cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t
 	cmd[num_words++] = length;
 	cmd[num_words++] = lptr;

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -627,15 +651,15 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge

 	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
 	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);

 	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return -ENOSPC;
 	}

-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
+	header->cn10k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
+	header->cn10k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
 	fptr = &src[0];
 	lptr = &dst[0];

@@ -656,9 +680,9 @@ cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge
 		lptr++;
 	}

-	rc = __dpi_queue_write(&dpivf->rdpi, cmd, num_words);
+	rc = __dpi_queue_write(dpivf, cmd, num_words);
 	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
 		return rc;
 	}

@@ -688,16 +712,16 @@ cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls,
 		comp_ptr = c_desc->compl_ptr[c_desc->head];

 		if (comp_ptr->cdata) {
-			if (comp_ptr->cdata == DPI_REQ_CDATA)
+			if (comp_ptr->cdata == CNXK_DPI_REQ_CDATA)
 				break;
 			*has_error = 1;
 			dpi_conf->stats.errors++;
-			STRM_INC(*c_desc, head);
+			CNXK_DPI_STRM_INC(*c_desc, head);
 			break;
 		}

-		comp_ptr->cdata = DPI_REQ_CDATA;
-		STRM_INC(*c_desc, head);
+		comp_ptr->cdata = CNXK_DPI_REQ_CDATA;
+		CNXK_DPI_STRM_INC(*c_desc, head);
 	}

 	dpi_conf->stats.completed += cnt;
@@ -720,13 +744,13 @@ cnxk_dmadev_completed_status(void *dev_private, uint16_t vchan, const uint16_t n
 		comp_ptr = c_desc->compl_ptr[c_desc->head];
 		status[cnt] = comp_ptr->cdata;
 		if (status[cnt]) {
-			if (status[cnt] == DPI_REQ_CDATA)
+			if (status[cnt] == CNXK_DPI_REQ_CDATA)
 				break;

 			dpi_conf->stats.errors++;
 		}
-		comp_ptr->cdata = DPI_REQ_CDATA;
-		STRM_INC(*c_desc, head);
+		comp_ptr->cdata = CNXK_DPI_REQ_CDATA;
+		CNXK_DPI_STRM_INC(*c_desc, head);
 	}

 	dpi_conf->stats.completed += cnt;
@@ -794,7 +818,7 @@ cnxk_stats_get(const struct rte_dma_dev *dev, uint16_t vchan, struct rte_dma_sta
 		goto done;
 	}

-	if (vchan >= MAX_VCHANS_PER_QUEUE)
+	if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 		return -EINVAL;

 	dpi_conf = &dpivf->conf[vchan];
@@ -822,7 +846,7 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 		return 0;
 	}

-	if (vchan >= MAX_VCHANS_PER_QUEUE)
+	if (vchan >= CNXK_DPI_MAX_VCHANS_PER_QUEUE)
 		return -EINVAL;

 	dpi_conf = &dpivf->conf[vchan];
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 254e7fea20..72663e44af 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,22 +4,40 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H

+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>

-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define CNXK_DPI_MAX_POINTER		    15
+#define CNXK_DPI_STRM_INC(s, var)	    ((s).var = ((s).var + 1) & (s).max_cnt)
+#define CNXK_DPI_STRM_DEC(s, var)	    ((s).var = ((s).var - 1) == -1 ? (s).max_cnt :	\
+						((s).var - 1))
+#define CNXK_DPI_MAX_DESC		    32768
+#define CNXK_DPI_MIN_DESC		    2
+#define CNXK_DPI_MAX_VCHANS_PER_QUEUE	    4
+#define CNXK_DPI_QUEUE_BUF_SIZE		    16256
+#define CNXK_DPI_POOL_MAX_CACHE_SZ	    (16)
+#define CNXK_DPI_HDR_LEN		    4
+#define CNXK_DPI_CMD_LEN(src, dst)	    (CNXK_DPI_HDR_LEN + ((src) << 1) + ((dst) << 1))
+#define CNXK_DPI_MAX_CMD_SZ		    CNXK_DPI_CMD_LEN(CNXK_DPI_MAX_POINTER,		\
+							     CNXK_DPI_MAX_POINTER)
+#define CNXK_DPI_CHUNKS_FROM_DESC(cz, desc) (((desc) / (((cz) / 8) / CNXK_DPI_MAX_CMD_SZ)) + 1)

 /* Set Completion data to 0xFF when request submitted,
  * upon successful request completion engine reset to completion status
  */
-#define DPI_REQ_CDATA 0xFF
-
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_REQ_CDATA 0xFF

 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -45,8 +63,15 @@ struct cnxk_dpi_conf {
 };

 struct cnxk_dpi_vf_s {
+	/* Fast path */
+	uint64_t *chunk_base;
+	uint16_t chunk_head;
+	uint16_t chunk_size_m1;
+	struct rte_mempool *chunk_pool;
+	struct cnxk_dpi_conf conf[CNXK_DPI_MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
-	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
 } __plt_cache_aligned;
--
2.25.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v8 2/2] dma/cnxk: rewrite DMA fastpath
  2023-09-09 16:54             ` [PATCH v8 1/2] dma/cnxk: rework DMA driver pbhagavatula
@ 2023-09-09 16:54               ` pbhagavatula
  0 siblings, 0 replies; 18+ messages in thread
From: pbhagavatula @ 2023-09-09 16:54 UTC (permalink / raw)
  To: jerinj, Vamsi Attunuru; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the config.
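
As a minimal sketch of the idiom (assuming aarch64; the driver's
version is __dpi_cpy_vector() in the new cnxk_dmadev_fp.c), each NEON
iteration moves two 64-bit command words with one 128-bit load/store
pair:

  #include <stdint.h>
  #include <arm_neon.h>

  /* Copy n 64-bit words, two per iteration, via NEON. n is assumed
   * even, as it is for the DPI command layout.
   */
  static inline void
  dpi_cpy_neon(const uint64_t *src, uint64_t *dst, uint8_t n)
  {
  	uint8_t i;

  	for (i = 0; i < n; i += 2)
  		vst1q_u64(&dst[i], vld1q_u64(&src[i]));
  }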

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/dma/cnxk/cnxk_dmadev.c    | 428 ++---------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  59 +++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 436 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   9 +-
 4 files changed, 528 insertions(+), 404 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index f58bb92dbc..26680edfde 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -115,19 +115,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return 0;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -163,54 +153,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > CNXK_DPI_MAX_DESC)
-		max_desc = CNXK_DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = CNXK_DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -246,6 +193,27 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -371,333 +339,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & CNXK_DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & CNXK_DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & CNXK_DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & CNXK_DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -856,17 +497,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -917,12 +547,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 72663e44af..350ae73b5c 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -28,6 +28,7 @@
 #define CNXK_DPI_MAX_VCHANS_PER_QUEUE	    4
 #define CNXK_DPI_QUEUE_BUF_SIZE		    16256
 #define CNXK_DPI_POOL_MAX_CACHE_SZ	    (16)
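+/* A single (non-SG) command: 4 header dwords plus two (length, addr) pairs. */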
+#define CNXK_DPI_DW_PER_SINGLE_CMD	    8
 #define CNXK_DPI_HDR_LEN		    4
 #define CNXK_DPI_CMD_LEN(src, dst)	    (CNXK_DPI_HDR_LEN + ((src) << 1) + ((dst) << 1))
 #define CNXK_DPI_MAX_CMD_SZ		    CNXK_DPI_CMD_LEN(CNXK_DPI_MAX_POINTER,		\
@@ -39,6 +40,49 @@
  */
 #define CNXK_DPI_REQ_CDATA 0xFF
 
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
+
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
 	void *cb_data;
@@ -52,12 +96,11 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
@@ -74,6 +117,18 @@ struct cnxk_dpi_vf_s {
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..16d7b5426b
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,436 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
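+	/* Each 16 B rte_dma_sge loads as {addr, length}; rotate the lanes to
+	 * get {length, addr} and mask the length lane to 24 bits, clearing
+	 * the struct padding above it.
+	 */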
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#else
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/* Check if command fits in the current chunk. */
+	if (dpi->chunk_head + CNXK_DPI_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, CNXK_DPI_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += CNXK_DPI_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in the current chunk
+		 * and copy them.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		/* Copy the remaining cmd words to new chunk. */
+		__dpi_cpy_scalar(cmd + count, ptr, CNXK_DPI_DW_PER_SINGLE_CMD - count);
+
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = CNXK_DPI_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/* Check if command fits in the current chunk. */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, CNXK_DPI_HDR_LEN);
+		ptr += CNXK_DPI_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in the current chunk
+		 * and copy them, copy the rest to the new buffer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
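+		/* count <= 4: the split falls inside the 4-dword header;
+		 * otherwise the header fits and the split falls among the
+		 * pointer pairs.
+		 */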
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[CNXK_DPI_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
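+	/* cn9k word 0: one first and one last pointer (nfst at bit 48, nlst at bit 54). */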
+	cmd[0] = (1UL << 54) | (1UL << 48);
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
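+	/* cn9k word 0: nfst at bit 48, nlst at bit 54 (counts swapped above for inbound). */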
+	hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[CNXK_DPI_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
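+	/* cn10k word 0: nfst = 1 (bits 0:3), nlst = 1 (bits 6:9); see the header union. */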
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	CNXK_DPI_STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		CNXK_DPI_STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + CNXK_DPI_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += CNXK_DPI_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..e557349368 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -1,6 +1,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(C) 2021 Marvell International Ltd.
 
+error_cflags = ['-Wno-uninitialized']
+foreach flag: error_cflags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v7 2/2] dma/cnxk: rewrite DMA fastpath
  2023-09-09 16:37             ` [PATCH v7 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
@ 2023-09-20  4:17               ` Jerin Jacob
  0 siblings, 0 replies; 18+ messages in thread
From: Jerin Jacob @ 2023-09-20  4:17 UTC (permalink / raw)
  To: pbhagavatula; +Cc: jerinj, Vamsi Attunuru, dev

On Sat, Sep 9, 2023 at 11:26 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Rewrite DMA fastpath to use NEON instructions and reduce number
> of words read from config.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>


Series applied to dpdk-next-net-mrvl/for-next-net. Thanks.

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2023-09-20  4:18 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-30  7:56 [PATCH 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
2023-08-30  7:56 ` [PATCH 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
2023-08-30 14:30 ` [PATCH v2 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
2023-08-30 14:30   ` [PATCH v2 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
2023-08-30 16:54   ` [PATCH v3 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
2023-08-30 16:54     ` [PATCH v3 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
2023-08-31  5:32     ` [PATCH v4 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
2023-08-31  5:32       ` [PATCH v4 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
2023-09-05 15:58         ` Jerin Jacob
2023-09-05 16:19       ` [PATCH v5 1/2] dma/cnxk: use mempool for DMA chunk pool pbhagavatula
2023-09-05 16:19         ` [PATCH v5 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
2023-09-09 16:32         ` [PATCH v6 1/2] dma/cnxk: rework DMA driver pbhagavatula
2023-09-09 16:32           ` [PATCH v6 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
2023-09-09 16:37           ` [PATCH v7 1/2] dma/cnxk: rework DMA driver pbhagavatula
2023-09-09 16:37             ` [PATCH v7 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
2023-09-20  4:17               ` Jerin Jacob
2023-09-09 16:54             ` [PATCH v8 1/2] dma/cnxk: rework DMA driver pbhagavatula
2023-09-09 16:54               ` [PATCH v8 2/2] dma/cnxk: rewrite DMA fastpath pbhagavatula
