DPDK patches and discussions
From: xuan.ding@intel.com
To: maxime.coquelin@redhat.com, chenbo.xia@intel.com
Cc: dev@dpdk.org, jiayu.hu@intel.com, sunil.pai.g@intel.com,
	bruce.richardson@intel.com, harry.van.haaren@intel.com,
	yong.liu@intel.com, Xuan Ding <xuan.ding@intel.com>
Subject: [dpdk-dev] [PATCH v3] vhost: enable IOMMU for async vhost
Date: Thu,  3 Jun 2021 17:30:23 +0000	[thread overview]
Message-ID: <20210603173023.10487-1-xuan.ding@intel.com> (raw)
In-Reply-To: <20210531150629.35020-1-xuan.ding@intel.com>

From: Xuan Ding <xuan.ding@intel.com>

For async copy, it is unsafe to use the physical address directly.
Moreover, the current software translation from GPA to HPA costs CPU
cycles. Both issues can be addressed by the IOMMU.

Since the existing DMA engine supports the platform IOMMU, this patch
enables IOMMU for async vhost and defines IOAT devices to use virtual
addresses instead of physical addresses.

When the memory table is set, the frontend's memory is mapped into the
default DPDK VFIO container, to which the IOAT devices have been added.
Should a DMA copy fail, the virtual addresses provided to the IOAT
devices also allow a fallback to SW copy or PA copy.
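
For illustration only (not part of this patch), a minimal fallback
sketch in a user-implemented async channel could look as follows;
dma_copy_va() and dma_copy_pa() are hypothetical submit helpers:

  #include <string.h>
  #include <rte_memory.h>

  /* Hypothetical DMA submit helpers provided by the application. */
  extern int dma_copy_va(void *dst, const void *src, size_t len);
  extern int dma_copy_pa(rte_iova_t dst, rte_iova_t src, size_t len);

  /* Try a VA-based DMA submit first, then retry with IOVAs obtained
   * from rte_mem_virt2iova(), and finally fall back to a CPU copy.
   */
  static int
  copy_with_fallback(void *dst, const void *src, size_t len)
  {
      rte_iova_t dst_iova, src_iova;

      if (dma_copy_va(dst, src, len) == 0)
          return 0;

      dst_iova = rte_mem_virt2iova(dst);
      src_iova = rte_mem_virt2iova(src);

      if (dst_iova != RTE_BAD_IOVA && src_iova != RTE_BAD_IOVA &&
              dma_copy_pa(dst_iova, src_iova, len) == 0)
          return 0;

      memcpy(dst, src, len); /* SW copy as the last resort */
      return 0;
  }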

With IOMMU enabled, to use IOAT devices (see the sketch below):
1. IOAT devices must be bound to vfio-pci rather than igb_uio.
2. DPDK must use "--iova-mode=va".
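
As a usage illustration (assumed setup, not taken from this patch), an
application forcing VA IOVA mode through its EAL arguments might look
like this; "0000:80:04.0" stands for a hypothetical IOAT device that is
already bound to vfio-pci:

  #include <rte_eal.h>
  #include <rte_common.h>
  #include <rte_debug.h>

  int
  main(void)
  {
      /* --iova-mode=va makes DMA engines work on IOMMU-translated
       * virtual addresses; the PCI address below is hypothetical.
       */
      char *eal_args[] = {
          "vhost-app", "--iova-mode=va",
          "-a", "0000:80:04.0",
      };

      if (rte_eal_init(RTE_DIM(eal_args), eal_args) < 0)
          rte_panic("Cannot init EAL\n");

      /* ... register the vhost driver and async channels as usual ... */
      return 0;
  }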

Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---

v3:
* Fixed some typos.

v2:
* Fixed a format issue.
* Added the DMA unmap logic when the device is closed.
---
 doc/guides/prog_guide/vhost_lib.rst |  20 +++++
 lib/vhost/vhost_user.c              | 125 +++++++++-------------------
 lib/vhost/virtio_net.c              |  30 +++----
 3 files changed, 69 insertions(+), 106 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..5777f0da96 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -420,3 +420,23 @@ Finally, a set of device ops is defined for device specific operations:
 * ``get_notify_area``
 
   Called to get the notify area info of the queue.
+
+Vhost async data path
+---------------------
+
+* Address mode
+
+  Modern IOAT devices support using the IOMMU, which avoids relying on
+  the unsafe HPA. Besides, the CPU cycles spent by SW translating from
+  GPA to HPA are also saved. IOAT devices are therefore defined to use
+  virtual addresses instead of physical addresses.
+
+  With IOMMU enabled, to use IOAT devices:
+  1. IOAT devices must be bound to vfio-pci, rather than igb_uio.
+  2. DPDK must use ``--iova-mode=va``.
+
+* Fallback
+
+  When the DMA copy fails, the user who implements the transfer_data
+  callback can fall back to SW copy, or fall back to PA copy through
+  rte_mem_virt2iova().
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 8f0eba6412..c33fa784ff 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,7 @@
 #include <rte_common.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
+#include <rte_vfio.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -141,6 +142,34 @@ get_blk_size(int fd)
 	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
 }
 
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
+{
+	int ret = 0;
+	if (do_map) {
+		/* Add mapped region into the default container of DPDK. */
+		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						 region->host_user_addr,
+						 region->host_user_addr,
+						 region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+			return ret;
+		}
+	} else {
+		/* Remove mapped region from the default container of DPDK. */
+		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						   region->host_user_addr,
+						   region->host_user_addr,
+						   region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+			return ret;
+		}
+	}
+	return ret;
+}
+
 static void
 free_mem_region(struct virtio_net *dev)
 {
@@ -155,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
 		if (reg->host_user_addr) {
 			munmap(reg->mmap_addr, reg->mmap_size);
 			close(reg->fd);
+
+			if (dev->async_copy)
+				async_dma_map(reg, false);
 		}
 	}
 }
@@ -866,87 +898,6 @@ vhost_user_set_vring_base(struct virtio_net **pdev,
 	return RTE_VHOST_MSG_RESULT_OK;
 }
 
-static int
-add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
-		   uint64_t host_phys_addr, uint64_t size)
-{
-	struct guest_page *page, *last_page;
-	struct guest_page *old_pages;
-
-	if (dev->nr_guest_pages == dev->max_guest_pages) {
-		dev->max_guest_pages *= 2;
-		old_pages = dev->guest_pages;
-		dev->guest_pages = rte_realloc(dev->guest_pages,
-					dev->max_guest_pages * sizeof(*page),
-					RTE_CACHE_LINE_SIZE);
-		if (dev->guest_pages == NULL) {
-			VHOST_LOG_CONFIG(ERR, "cannot realloc guest_pages\n");
-			rte_free(old_pages);
-			return -1;
-		}
-	}
-
-	if (dev->nr_guest_pages > 0) {
-		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
-		/* merge if the two pages are continuous */
-		if (host_phys_addr == last_page->host_phys_addr +
-				      last_page->size) {
-			last_page->size += size;
-			return 0;
-		}
-	}
-
-	page = &dev->guest_pages[dev->nr_guest_pages++];
-	page->guest_phys_addr = guest_phys_addr;
-	page->host_phys_addr  = host_phys_addr;
-	page->size = size;
-
-	return 0;
-}
-
-static int
-add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
-		uint64_t page_size)
-{
-	uint64_t reg_size = reg->size;
-	uint64_t host_user_addr  = reg->host_user_addr;
-	uint64_t guest_phys_addr = reg->guest_phys_addr;
-	uint64_t host_phys_addr;
-	uint64_t size;
-
-	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
-	size = page_size - (guest_phys_addr & (page_size - 1));
-	size = RTE_MIN(size, reg_size);
-
-	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
-		return -1;
-
-	host_user_addr  += size;
-	guest_phys_addr += size;
-	reg_size -= size;
-
-	while (reg_size > 0) {
-		size = RTE_MIN(reg_size, page_size);
-		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
-						  host_user_addr);
-		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
-				size) < 0)
-			return -1;
-
-		host_user_addr  += size;
-		guest_phys_addr += size;
-		reg_size -= size;
-	}
-
-	/* sort guest page array if over binary search threshold */
-	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
-		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
-			sizeof(struct guest_page), guest_page_addrcmp);
-	}
-
-	return 0;
-}
-
 #ifdef RTE_LIBRTE_VHOST_DEBUG
 /* TODO: enable it only in debug mode? */
 static void
@@ -1105,6 +1056,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	uint64_t mmap_size;
 	uint64_t alignment;
 	int populate;
+	int ret;
 
 	/* Check for memory_size + mmap_offset overflow */
 	if (mmap_offset >= -region->size) {
@@ -1158,12 +1110,13 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	region->mmap_size = mmap_size;
 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
 
-	if (dev->async_copy)
-		if (add_guest_pages(dev, region, alignment) < 0) {
-			VHOST_LOG_CONFIG(ERR,
-					"adding guest pages to region failed.\n");
+	if (dev->async_copy) {
+		ret = async_dma_map(region, true);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed\n");
 			return -1;
-		}
+		}
+	}
 
 	VHOST_LOG_CONFIG(INFO,
 			"guest memory region size: 0x%" PRIx64 "\n"
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 8da8a86a10..88110d2cb3 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 	int error = 0;
-	uint64_t mapped_len;
 
 	uint32_t tlen = 0;
 	int tvec_idx = 0;
-	void *hpa;
 
 	if (unlikely(m == NULL)) {
 		error = -1;
@@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
 
-		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
-			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
-					buf_iova + buf_offset,
-					cpy_len, &mapped_len);
-
-			if (unlikely(!hpa || mapped_len < cpy_threshold))
-				break;
-
+		if (unlikely(cpy_len >= cpy_threshold)) {
 			async_fill_vec(src_iovec + tvec_idx,
-				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
-				mbuf_offset), (size_t)mapped_len);
+				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), (size_t)cpy_len);
 
 			async_fill_vec(dst_iovec + tvec_idx,
-					hpa, (size_t)mapped_len);
-
-			tlen += (uint32_t)mapped_len;
-			cpy_len -= (uint32_t)mapped_len;
-			mbuf_avail  -= (uint32_t)mapped_len;
-			mbuf_offset += (uint32_t)mapped_len;
-			buf_avail  -= (uint32_t)mapped_len;
-			buf_offset += (uint32_t)mapped_len;
+				(void *)((uintptr_t)(buf_addr + buf_offset)), (size_t)cpy_len);
+
+			tlen += cpy_len;
+			mbuf_avail  -= cpy_len;
+			mbuf_offset += cpy_len;
+			buf_avail  -= cpy_len;
+			buf_offset += cpy_len;
+			cpy_len = 0;
 			tvec_idx++;
 		}
 
-- 
2.17.1


Thread overview: 25+ messages
2021-05-31 15:06 [dpdk-dev] [PATCH v1] lib/vhost: " xuan.ding
2021-06-02 14:26 ` [dpdk-dev] [PATCH v2] " xuan.ding
2021-06-03 17:30 ` xuan.ding [this message]
2021-06-18 16:17   ` [dpdk-dev] [PATCH v3] vhost: " Maxime Coquelin
2021-06-21  3:57     ` Hu, Jiayu
2021-06-22  6:18     ` Ding, Xuan
2021-06-29  9:23       ` Maxime Coquelin
2021-07-01  5:12         ` Ding, Xuan
2021-07-05  8:19 ` [dpdk-dev] [PATCH v4 0/2] vhost: add IOMMU support in async data path Xuan Ding
2021-07-05  8:19   ` [dpdk-dev] [PATCH v4 1/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-07-05  8:19   ` [dpdk-dev] [PATCH v4 2/2] example/vhost: add dma vfio parsing Xuan Ding
2021-07-05  8:40 ` [dpdk-dev] [PATCH v5 0/2] vhost: add IOMMU support in async data path Xuan Ding
2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-07-05 12:16     ` Burakov, Anatoly
2021-07-05 12:45       ` Maxime Coquelin
2021-07-06  8:31         ` Ding, Xuan
2021-07-06  9:16           ` Burakov, Anatoly
2021-07-06  9:32             ` Maxime Coquelin
2021-07-07  6:25               ` Ding, Xuan
2021-07-07 12:17                 ` Burakov, Anatoly
2021-07-07 12:54                   ` Ding, Xuan
2021-07-07 14:33                     ` Burakov, Anatoly
2021-07-07 15:00                       ` Bruce Richardson
2021-07-07 15:09                       ` Ding, Xuan
2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 2/2] example/vhost: add dma vfio parsing Xuan Ding
