DPDK patches and discussions
* [dpdk-dev] [PATCH v1] lib/vhost: enable IOMMU for async vhost
@ 2021-05-31 15:06 xuan.ding
  2021-06-02 14:26 ` [dpdk-dev] [PATCH v2] " xuan.ding
                   ` (3 more replies)
  0 siblings, 4 replies; 25+ messages in thread
From: xuan.ding @ 2021-05-31 15:06 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, Xuan Ding

From: Xuan Ding <xuan.ding@intel.com>

For async copy, it is unsafe to directly use the physical address,
and the current address translation from GPA to HPA via SW also takes
CPU cycles. Both can benefit from the IOMMU.

Since the existing DMA engine supports the platform IOMMU, this
patch enables IOMMU for async vhost and defines IOAT devices to use
virtual addresses instead of physical addresses.

When the memory table is set, the frontend's memory is mapped into
the default VFIO container of DPDK, to which the IOAT devices have
been added. When a DMA copy fails, the virtual addresses provided to
the IOAT devices also allow us to fall back to SW copy or PA copy.

With IOMMU enabled, to use IOAT devices:
1. IOAT devices must be bound to vfio-pci, rather than igb_uio.
2. DPDK must use "--iova-mode=va".

Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |  17 +++++
 lib/vhost/vhost_user.c              | 102 ++++------------------------
 lib/vhost/virtio_net.c              |  30 +++-----
 3 files changed, 41 insertions(+), 108 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..6b7206bc1d 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -420,3 +420,20 @@ Finally, a set of device ops is defined for device specific operations:
 * ``get_notify_area``
 
   Called to get the notify area info of the queue.
+
+  Vhost async data path
+  -----------------------------------
+* Address mode
+    Modern IOAT devices supports to use the IOMMU, which can avoid using
+    the unsafe HPA. Besides, the CPU cycles took by SW to translate from
+    GPA to HPA can also be saved. So IOAT devices are defined to use
+    virtual address instead of physical address.
+
+    With IOMMU enabled, to use IOAT devices:
+    1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
+    2. DPDK must use ``--iova-mode=va``.
+
+* Fallback
+    When the DMA copy fails, the user who implements the transfer_data
+    callback can fallback to SW copy or fallback to PA through
+    rte_mem_virt2iova().
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 8f0eba6412..4d562e0091 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,7 @@
 #include <rte_common.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
+#include <rte_vfio.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -866,87 +867,6 @@ vhost_user_set_vring_base(struct virtio_net **pdev,
 	return RTE_VHOST_MSG_RESULT_OK;
 }
 
-static int
-add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
-		   uint64_t host_phys_addr, uint64_t size)
-{
-	struct guest_page *page, *last_page;
-	struct guest_page *old_pages;
-
-	if (dev->nr_guest_pages == dev->max_guest_pages) {
-		dev->max_guest_pages *= 2;
-		old_pages = dev->guest_pages;
-		dev->guest_pages = rte_realloc(dev->guest_pages,
-					dev->max_guest_pages * sizeof(*page),
-					RTE_CACHE_LINE_SIZE);
-		if (dev->guest_pages == NULL) {
-			VHOST_LOG_CONFIG(ERR, "cannot realloc guest_pages\n");
-			rte_free(old_pages);
-			return -1;
-		}
-	}
-
-	if (dev->nr_guest_pages > 0) {
-		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
-		/* merge if the two pages are continuous */
-		if (host_phys_addr == last_page->host_phys_addr +
-				      last_page->size) {
-			last_page->size += size;
-			return 0;
-		}
-	}
-
-	page = &dev->guest_pages[dev->nr_guest_pages++];
-	page->guest_phys_addr = guest_phys_addr;
-	page->host_phys_addr  = host_phys_addr;
-	page->size = size;
-
-	return 0;
-}
-
-static int
-add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
-		uint64_t page_size)
-{
-	uint64_t reg_size = reg->size;
-	uint64_t host_user_addr  = reg->host_user_addr;
-	uint64_t guest_phys_addr = reg->guest_phys_addr;
-	uint64_t host_phys_addr;
-	uint64_t size;
-
-	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
-	size = page_size - (guest_phys_addr & (page_size - 1));
-	size = RTE_MIN(size, reg_size);
-
-	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
-		return -1;
-
-	host_user_addr  += size;
-	guest_phys_addr += size;
-	reg_size -= size;
-
-	while (reg_size > 0) {
-		size = RTE_MIN(reg_size, page_size);
-		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
-						  host_user_addr);
-		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
-				size) < 0)
-			return -1;
-
-		host_user_addr  += size;
-		guest_phys_addr += size;
-		reg_size -= size;
-	}
-
-	/* sort guest page array if over binary search threshold */
-	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
-		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
-			sizeof(struct guest_page), guest_page_addrcmp);
-	}
-
-	return 0;
-}
-
 #ifdef RTE_LIBRTE_VHOST_DEBUG
 /* TODO: enable it only in debug mode? */
 static void
@@ -1158,13 +1078,6 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	region->mmap_size = mmap_size;
 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
 
-	if (dev->async_copy)
-		if (add_guest_pages(dev, region, alignment) < 0) {
-			VHOST_LOG_CONFIG(ERR,
-					"adding guest pages to region failed.\n");
-			return -1;
-		}
-
 	VHOST_LOG_CONFIG(INFO,
 			"guest memory region size: 0x%" PRIx64 "\n"
 			"\t guest physical addr: 0x%" PRIx64 "\n"
@@ -1196,6 +1109,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 
 	uint64_t mmap_offset;
 	uint32_t i;
+	int ret;
 
 	if (validate_msg_fds(msg, memory->nregions) != 0)
 		return RTE_VHOST_MSG_RESULT_ERR;
@@ -1280,6 +1194,18 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 		}
 
 		dev->mem->nregions++;
+
+		if (dev->async_copy) {
+			/* Add mapped region into the default container of DPDK. */
+			ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+							 reg->host_user_addr,
+							 reg->host_user_addr,
+							 reg->size);
+			if (ret < 0) {
+				VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed");
+				goto free_mem_table;
+			}
+		}
 	}
 
 	if (vhost_user_postcopy_register(dev, main_fd, msg) < 0)
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 8da8a86a10..88110d2cb3 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 	int error = 0;
-	uint64_t mapped_len;
 
 	uint32_t tlen = 0;
 	int tvec_idx = 0;
-	void *hpa;
 
 	if (unlikely(m == NULL)) {
 		error = -1;
@@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
 
-		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
-			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
-					buf_iova + buf_offset,
-					cpy_len, &mapped_len);
-
-			if (unlikely(!hpa || mapped_len < cpy_threshold))
-				break;
-
+		if (unlikely(cpy_len >= cpy_threshold)) {
 			async_fill_vec(src_iovec + tvec_idx,
-				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
-				mbuf_offset), (size_t)mapped_len);
+				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), (size_t)cpy_len);
 
 			async_fill_vec(dst_iovec + tvec_idx,
-					hpa, (size_t)mapped_len);
-
-			tlen += (uint32_t)mapped_len;
-			cpy_len -= (uint32_t)mapped_len;
-			mbuf_avail  -= (uint32_t)mapped_len;
-			mbuf_offset += (uint32_t)mapped_len;
-			buf_avail  -= (uint32_t)mapped_len;
-			buf_offset += (uint32_t)mapped_len;
+				(void *)((uintptr_t)(buf_addr + buf_offset)), (size_t)cpy_len);
+
+			tlen += cpy_len;
+			mbuf_avail  -= cpy_len;
+			mbuf_offset += cpy_len;
+			buf_avail  -= cpy_len;
+			buf_offset += cpy_len;
+			cpy_len = 0;
 			tvec_idx++;
 		}
 
-- 
2.17.1



* [dpdk-dev] [PATCH v2] lib/vhost: enable IOMMU for async vhost
  2021-05-31 15:06 [dpdk-dev] [PATCH v1] lib/vhost: enable IOMMU for async vhost xuan.ding
@ 2021-06-02 14:26 ` xuan.ding
  2021-06-03 17:30 ` [dpdk-dev] [PATCH v3] vhost: " xuan.ding
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 25+ messages in thread
From: xuan.ding @ 2021-06-02 14:26 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, Xuan Ding

From: Xuan Ding <xuan.ding@intel.com>

For async copy, it is unsafe to directly use the physical address,
and the current address translation from GPA to HPA via SW also takes
CPU cycles. Both can benefit from the IOMMU.

Since the existing DMA engine supports the platform IOMMU, this
patch enables IOMMU for async vhost and defines IOAT devices to use
virtual addresses instead of physical addresses.

When the memory table is set, the frontend's memory is mapped into
the default VFIO container of DPDK, to which the IOAT devices have
been added. When a DMA copy fails, the virtual addresses provided to
the IOAT devices also allow us to fall back to SW copy or PA copy.

With IOMMU enabled, to use IOAT devices:
1. IOAT devices must be bound to vfio-pci, rather than igb_uio.
2. DPDK must use "--iova-mode=va".

Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---

v2:
* Fixed a format issue.
* Added the DMA unmap logic when the device is closed.
---
 doc/guides/prog_guide/vhost_lib.rst |  20 +++++
 lib/vhost/vhost_user.c              | 125 +++++++++-------------------
 lib/vhost/virtio_net.c              |  30 +++----
 3 files changed, 69 insertions(+), 106 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..9891394e50 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -420,3 +420,23 @@ Finally, a set of device ops is defined for device specific operations:
 * ``get_notify_area``
 
   Called to get the notify area info of the queue.
+
+  Vhost async data path
+  -----------------------------------
+
+* Address mode
+
+    Modern IOAT devices supports to use the IOMMU, which can avoid using
+    the unsafe HPA. Besides, the CPU cycles took by SW to translate from
+    GPA to HPA can also be saved. So IOAT devices are defined to use
+    virtual address instead of physical address.
+
+    With IOMMU enabled, to use IOAT devices:
+    1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
+    2. DPDK must use ``--iova-mode=va``.
+
+* Fallback
+
+    When the DMA copy fails, the user who implements the transfer_data
+    callback can fallback to SW copy or fallback to PA copy through
+    rte_mem_virt2iova().
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 8f0eba6412..1154c7ee24 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,7 @@
 #include <rte_common.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
+#include <rte_vfio.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -141,6 +142,34 @@ get_blk_size(int fd)
 	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
 }
 
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
+{
+	int ret = 0;
+	if (do_map) {
+		/* Add mapped region into the default container of DPDK. */
+		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						region->host_user_addr,
+						region->host_user_addr,
+						region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+			return ret;
+		}
+	} else {
+		/* Remove mapped region from the default container of DPDK. */
+		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						region->host_user_addr,
+						region->host_user_addr,
+						region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+			return ret;
+		}
+	}
+	return ret;
+}
+
 static void
 free_mem_region(struct virtio_net *dev)
 {
@@ -155,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
 		if (reg->host_user_addr) {
 			munmap(reg->mmap_addr, reg->mmap_size);
 			close(reg->fd);
+
+			if (dev->async_copy)
+				async_dma_map(reg, false);
 		}
 	}
 }
@@ -866,87 +898,6 @@ vhost_user_set_vring_base(struct virtio_net **pdev,
 	return RTE_VHOST_MSG_RESULT_OK;
 }
 
-static int
-add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
-		   uint64_t host_phys_addr, uint64_t size)
-{
-	struct guest_page *page, *last_page;
-	struct guest_page *old_pages;
-
-	if (dev->nr_guest_pages == dev->max_guest_pages) {
-		dev->max_guest_pages *= 2;
-		old_pages = dev->guest_pages;
-		dev->guest_pages = rte_realloc(dev->guest_pages,
-					dev->max_guest_pages * sizeof(*page),
-					RTE_CACHE_LINE_SIZE);
-		if (dev->guest_pages == NULL) {
-			VHOST_LOG_CONFIG(ERR, "cannot realloc guest_pages\n");
-			rte_free(old_pages);
-			return -1;
-		}
-	}
-
-	if (dev->nr_guest_pages > 0) {
-		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
-		/* merge if the two pages are continuous */
-		if (host_phys_addr == last_page->host_phys_addr +
-				      last_page->size) {
-			last_page->size += size;
-			return 0;
-		}
-	}
-
-	page = &dev->guest_pages[dev->nr_guest_pages++];
-	page->guest_phys_addr = guest_phys_addr;
-	page->host_phys_addr  = host_phys_addr;
-	page->size = size;
-
-	return 0;
-}
-
-static int
-add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
-		uint64_t page_size)
-{
-	uint64_t reg_size = reg->size;
-	uint64_t host_user_addr  = reg->host_user_addr;
-	uint64_t guest_phys_addr = reg->guest_phys_addr;
-	uint64_t host_phys_addr;
-	uint64_t size;
-
-	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
-	size = page_size - (guest_phys_addr & (page_size - 1));
-	size = RTE_MIN(size, reg_size);
-
-	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
-		return -1;
-
-	host_user_addr  += size;
-	guest_phys_addr += size;
-	reg_size -= size;
-
-	while (reg_size > 0) {
-		size = RTE_MIN(reg_size, page_size);
-		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
-						  host_user_addr);
-		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
-				size) < 0)
-			return -1;
-
-		host_user_addr  += size;
-		guest_phys_addr += size;
-		reg_size -= size;
-	}
-
-	/* sort guest page array if over binary search threshold */
-	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
-		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
-			sizeof(struct guest_page), guest_page_addrcmp);
-	}
-
-	return 0;
-}
-
 #ifdef RTE_LIBRTE_VHOST_DEBUG
 /* TODO: enable it only in debug mode? */
 static void
@@ -1105,6 +1056,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	uint64_t mmap_size;
 	uint64_t alignment;
 	int populate;
+	int ret;
 
 	/* Check for memory_size + mmap_offset overflow */
 	if (mmap_offset >= -region->size) {
@@ -1158,12 +1110,13 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	region->mmap_size = mmap_size;
 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
 
-	if (dev->async_copy)
-		if (add_guest_pages(dev, region, alignment) < 0) {
-			VHOST_LOG_CONFIG(ERR,
-					"adding guest pages to region failed.\n");
+	if (dev->async_copy) {
+		ret = async_dma_map(region, true);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed\n");
 			return -1;
-		}
+			}
+	}
 
 	VHOST_LOG_CONFIG(INFO,
 			"guest memory region size: 0x%" PRIx64 "\n"
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 8da8a86a10..88110d2cb3 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 	int error = 0;
-	uint64_t mapped_len;
 
 	uint32_t tlen = 0;
 	int tvec_idx = 0;
-	void *hpa;
 
 	if (unlikely(m == NULL)) {
 		error = -1;
@@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
 
-		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
-			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
-					buf_iova + buf_offset,
-					cpy_len, &mapped_len);
-
-			if (unlikely(!hpa || mapped_len < cpy_threshold))
-				break;
-
+		if (unlikely(cpy_len >= cpy_threshold)) {
 			async_fill_vec(src_iovec + tvec_idx,
-				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
-				mbuf_offset), (size_t)mapped_len);
+				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), (size_t)cpy_len);
 
 			async_fill_vec(dst_iovec + tvec_idx,
-					hpa, (size_t)mapped_len);
-
-			tlen += (uint32_t)mapped_len;
-			cpy_len -= (uint32_t)mapped_len;
-			mbuf_avail  -= (uint32_t)mapped_len;
-			mbuf_offset += (uint32_t)mapped_len;
-			buf_avail  -= (uint32_t)mapped_len;
-			buf_offset += (uint32_t)mapped_len;
+				(void *)((uintptr_t)(buf_addr + buf_offset)), (size_t)cpy_len);
+
+			tlen += cpy_len;
+			mbuf_avail  -= cpy_len;
+			mbuf_offset += cpy_len;
+			buf_avail  -= cpy_len;
+			buf_offset += cpy_len;
+			cpy_len = 0;
 			tvec_idx++;
 		}
 
-- 
2.17.1



* [dpdk-dev] [PATCH v3] vhost: enable IOMMU for async vhost
  2021-05-31 15:06 [dpdk-dev] [PATCH v1] lib/vhost: enable IOMMU for async vhost xuan.ding
  2021-06-02 14:26 ` [dpdk-dev] [PATCH v2] " xuan.ding
@ 2021-06-03 17:30 ` xuan.ding
  2021-06-18 16:17   ` Maxime Coquelin
  2021-07-05  8:19 ` [dpdk-dev] [PATCH v4 0/2] vhost: add IOMMU support in async data path Xuan Ding
  2021-07-05  8:40 ` [dpdk-dev] [PATCH v5 0/2] vhost: add IOMMU support in async data path Xuan Ding
  3 siblings, 1 reply; 25+ messages in thread
From: xuan.ding @ 2021-06-03 17:30 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, Xuan Ding

From: Xuan Ding <xuan.ding@intel.com>

For async copy, it is unsafe to directly use the physical address.
And the current address translation from GPA to HPA via SW also takes
CPU cycles. Both can benefit from the IOMMU.

Since the existing DMA engine supports the platform IOMMU, this
patch enables IOMMU for async vhost and defines IOAT devices to use
virtual addresses instead of physical addresses.

When the memory table is set, the frontend's memory is mapped into
the default VFIO container of DPDK, to which the IOAT devices have
been added. When a DMA copy fails, the virtual addresses provided to
the IOAT devices also allow us to fall back to SW copy or PA copy.

With IOMMU enabled, to use IOAT devices:
1. IOAT devices must be bound to vfio-pci, rather than igb_uio.
2. DPDK must use "--iova-mode=va".

Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---

v3:
* Fixed some typos.

v2:
* Fixed a format issue.
* Added the DMA unmap logic when the device is closed.
---
 doc/guides/prog_guide/vhost_lib.rst |  20 +++++
 lib/vhost/vhost_user.c              | 125 +++++++++-------------------
 lib/vhost/virtio_net.c              |  30 +++----
 3 files changed, 69 insertions(+), 106 deletions(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index d18fb98910..5777f0da96 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -420,3 +420,23 @@ Finally, a set of device ops is defined for device specific operations:
 * ``get_notify_area``
 
   Called to get the notify area info of the queue.
+
+Vhost async data path
+---------------------
+
+* Address mode
+
+  Modern IOAT devices support to use the IOMMU, which can avoid using
+  the unsafe HPA. Besides, the CPU cycles took by SW to translate from
+  GPA to HPA can also be saved. So IOAT devices are defined to use
+  virtual address instead of physical address.
+
+  With IOMMU enabled, to use IOAT devices:
+  1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
+  2. DPDK must use ``--iova-mode=va``.
+
+* Fallback
+
+  When the DMA copy fails, the user who implements the transfer_data
+  callback can fallback to SW copy or fallback to PA copy through
+  rte_mem_virt2iova().
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 8f0eba6412..c33fa784ff 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,7 @@
 #include <rte_common.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
+#include <rte_vfio.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -141,6 +142,34 @@ get_blk_size(int fd)
 	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
 }
 
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
+{
+	int ret = 0;
+	if (do_map) {
+		/* Add mapped region into the default container of DPDK. */
+		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						 region->host_user_addr,
+						 region->host_user_addr,
+						 region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+			return ret;
+		}
+	} else {
+		/* Remove mapped region from the default container of DPDK. */
+		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						   region->host_user_addr,
+						   region->host_user_addr,
+						   region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+			return ret;
+		}
+	}
+	return ret;
+}
+
 static void
 free_mem_region(struct virtio_net *dev)
 {
@@ -155,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
 		if (reg->host_user_addr) {
 			munmap(reg->mmap_addr, reg->mmap_size);
 			close(reg->fd);
+
+			if (dev->async_copy)
+				async_dma_map(reg, false);
 		}
 	}
 }
@@ -866,87 +898,6 @@ vhost_user_set_vring_base(struct virtio_net **pdev,
 	return RTE_VHOST_MSG_RESULT_OK;
 }
 
-static int
-add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
-		   uint64_t host_phys_addr, uint64_t size)
-{
-	struct guest_page *page, *last_page;
-	struct guest_page *old_pages;
-
-	if (dev->nr_guest_pages == dev->max_guest_pages) {
-		dev->max_guest_pages *= 2;
-		old_pages = dev->guest_pages;
-		dev->guest_pages = rte_realloc(dev->guest_pages,
-					dev->max_guest_pages * sizeof(*page),
-					RTE_CACHE_LINE_SIZE);
-		if (dev->guest_pages == NULL) {
-			VHOST_LOG_CONFIG(ERR, "cannot realloc guest_pages\n");
-			rte_free(old_pages);
-			return -1;
-		}
-	}
-
-	if (dev->nr_guest_pages > 0) {
-		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
-		/* merge if the two pages are continuous */
-		if (host_phys_addr == last_page->host_phys_addr +
-				      last_page->size) {
-			last_page->size += size;
-			return 0;
-		}
-	}
-
-	page = &dev->guest_pages[dev->nr_guest_pages++];
-	page->guest_phys_addr = guest_phys_addr;
-	page->host_phys_addr  = host_phys_addr;
-	page->size = size;
-
-	return 0;
-}
-
-static int
-add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
-		uint64_t page_size)
-{
-	uint64_t reg_size = reg->size;
-	uint64_t host_user_addr  = reg->host_user_addr;
-	uint64_t guest_phys_addr = reg->guest_phys_addr;
-	uint64_t host_phys_addr;
-	uint64_t size;
-
-	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
-	size = page_size - (guest_phys_addr & (page_size - 1));
-	size = RTE_MIN(size, reg_size);
-
-	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
-		return -1;
-
-	host_user_addr  += size;
-	guest_phys_addr += size;
-	reg_size -= size;
-
-	while (reg_size > 0) {
-		size = RTE_MIN(reg_size, page_size);
-		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
-						  host_user_addr);
-		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
-				size) < 0)
-			return -1;
-
-		host_user_addr  += size;
-		guest_phys_addr += size;
-		reg_size -= size;
-	}
-
-	/* sort guest page array if over binary search threshold */
-	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
-		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
-			sizeof(struct guest_page), guest_page_addrcmp);
-	}
-
-	return 0;
-}
-
 #ifdef RTE_LIBRTE_VHOST_DEBUG
 /* TODO: enable it only in debug mode? */
 static void
@@ -1105,6 +1056,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	uint64_t mmap_size;
 	uint64_t alignment;
 	int populate;
+	int ret;
 
 	/* Check for memory_size + mmap_offset overflow */
 	if (mmap_offset >= -region->size) {
@@ -1158,12 +1110,13 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	region->mmap_size = mmap_size;
 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
 
-	if (dev->async_copy)
-		if (add_guest_pages(dev, region, alignment) < 0) {
-			VHOST_LOG_CONFIG(ERR,
-					"adding guest pages to region failed.\n");
+	if (dev->async_copy) {
+		ret = async_dma_map(region, true);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed\n");
 			return -1;
-		}
+			}
+	}
 
 	VHOST_LOG_CONFIG(INFO,
 			"guest memory region size: 0x%" PRIx64 "\n"
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 8da8a86a10..88110d2cb3 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 	int error = 0;
-	uint64_t mapped_len;
 
 	uint32_t tlen = 0;
 	int tvec_idx = 0;
-	void *hpa;
 
 	if (unlikely(m == NULL)) {
 		error = -1;
@@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
 
-		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
-			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
-					buf_iova + buf_offset,
-					cpy_len, &mapped_len);
-
-			if (unlikely(!hpa || mapped_len < cpy_threshold))
-				break;
-
+		if (unlikely(cpy_len >= cpy_threshold)) {
 			async_fill_vec(src_iovec + tvec_idx,
-				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
-				mbuf_offset), (size_t)mapped_len);
+				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), (size_t)cpy_len);
 
 			async_fill_vec(dst_iovec + tvec_idx,
-					hpa, (size_t)mapped_len);
-
-			tlen += (uint32_t)mapped_len;
-			cpy_len -= (uint32_t)mapped_len;
-			mbuf_avail  -= (uint32_t)mapped_len;
-			mbuf_offset += (uint32_t)mapped_len;
-			buf_avail  -= (uint32_t)mapped_len;
-			buf_offset += (uint32_t)mapped_len;
+				(void *)((uintptr_t)(buf_addr + buf_offset)), (size_t)cpy_len);
+
+			tlen += cpy_len;
+			mbuf_avail  -= cpy_len;
+			mbuf_offset += cpy_len;
+			buf_avail  -= cpy_len;
+			buf_offset += cpy_len;
+			cpy_len = 0;
 			tvec_idx++;
 		}
 
-- 
2.17.1



* Re: [dpdk-dev] [PATCH v3] vhost: enable IOMMU for async vhost
  2021-06-03 17:30 ` [dpdk-dev] [PATCH v3] vhost: " xuan.ding
@ 2021-06-18 16:17   ` Maxime Coquelin
  2021-06-21  3:57     ` Hu, Jiayu
  2021-06-22  6:18     ` Ding, Xuan
  0 siblings, 2 replies; 25+ messages in thread
From: Maxime Coquelin @ 2021-06-18 16:17 UTC (permalink / raw)
  To: xuan.ding, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren, yong.liu

Hi Xuan,

On 6/3/21 7:30 PM, xuan.ding@intel.com wrote:
> From: Xuan Ding <xuan.ding@intel.com>
> 
> For async copy, it is unsafe to directly use the physical address.
> And current address translation from GPA to HPA via SW also takes
> CPU cycles, these can all benefit from IOMMU.
> 
> Since the existing DMA engine supports to use platform IOMMU,
> this patch enables IOMMU for async vhost, which defines IOAT
> devices to use virtual address instead of physical address.

We have to keep in mind that a generic DMA API is coming, and maybe we
want a SW implementation of a dmadev based on memcpy, at least for
testing/debugging purposes.
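
Something like the minimal sketch below is what I have in mind; the
names and signature are purely illustrative and not the actual dmadev
or async vhost API, it only shows that VA-based vectors can be
completed with a plain memcpy:

#include <string.h>
#include <sys/uio.h>

/* Hypothetical SW "DMA" completion: copy every segment with memcpy.
 * This only works because both vectors carry virtual addresses. */
static int
sw_copy_transfer(const struct iovec *src, const struct iovec *dst,
		int nr_segs)
{
	int i;

	for (i = 0; i < nr_segs; i++)
		memcpy(dst[i].iov_base, src[i].iov_base, src[i].iov_len);

	return nr_segs;
}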

> When set memory table, the frontend's memory will be mapped
> to the default container of DPDK where IOAT devices have been
> added into. When DMA copy fails, the virtual address provided
> to IOAT devices also allow us fallback to SW copy or PA copy.
> 
> With IOMMU enabled, to use IOAT devices:
> 1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
> 2. DPDK must use "--iova-mode=va".

I think this is problematic; at least we need to check that the right
IOVA mode has been selected, but even with that check it is limiting.

What prevents us from reusing the add_guest_pages() algorithm to
implement IOVA_AS_PA?

> 
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
> 
> v3:
> * Fixed some typos.
> 
> v2:
> * Fixed a format issue.
> * Added the dma unmap logic when device is closed.
> ---
>  doc/guides/prog_guide/vhost_lib.rst |  20 +++++
>  lib/vhost/vhost_user.c              | 125 +++++++++-------------------
>  lib/vhost/virtio_net.c              |  30 +++----
>  3 files changed, 69 insertions(+), 106 deletions(-)
> 
> diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
> index d18fb98910..5777f0da96 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -420,3 +420,23 @@ Finally, a set of device ops is defined for device specific operations:
>  * ``get_notify_area``
>  
>    Called to get the notify area info of the queue.
> +
> +Vhost async data path
> +---------------------
> +
> +* Address mode
> +
> +  Modern IOAT devices support to use the IOMMU, which can avoid using
> +  the unsafe HPA. Besides, the CPU cycles took by SW to translate from
> +  GPA to HPA can also be saved. So IOAT devices are defined to use
> +  virtual address instead of physical address.
> +
> +  With IOMMU enabled, to use IOAT devices:
> +  1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
> +  2. DPDK must use ``--iova-mode=va``.
> +
> +* Fallback
> +
> +  When the DMA copy fails, the user who implements the transfer_data
> +  callback can fallback to SW copy or fallback to PA copy through
> +  rte_mem_virt2iova().
> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> index 8f0eba6412..c33fa784ff 100644
> --- a/lib/vhost/vhost_user.c
> +++ b/lib/vhost/vhost_user.c
> @@ -45,6 +45,7 @@
>  #include <rte_common.h>
>  #include <rte_malloc.h>
>  #include <rte_log.h>
> +#include <rte_vfio.h>
>  
>  #include "iotlb.h"
>  #include "vhost.h"
> @@ -141,6 +142,34 @@ get_blk_size(int fd)
>  	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
>  }
>  
> +static int
> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
> +{
> +	int ret = 0;
> +	if (do_map) {
> +		/* Add mapped region into the default container of DPDK. */
> +		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> +						 region->host_user_addr,
> +						 region->host_user_addr,
> +						 region->size);
> +		if (ret) {
> +			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> +			return ret;
> +		}
> +	} else {
> +		/* Remove mapped region from the default container of DPDK. */
> +		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> +						   region->host_user_addr,
> +						   region->host_user_addr,
> +						   region->size);
> +		if (ret) {
> +			VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
> +			return ret;
> +		}
> +	}
> +	return ret;
> +}
> +
>  static void
>  free_mem_region(struct virtio_net *dev)
>  {
> @@ -155,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
>  		if (reg->host_user_addr) {
>  			munmap(reg->mmap_addr, reg->mmap_size);
>  			close(reg->fd);
> +
> +			if (dev->async_copy)
> +				async_dma_map(reg, false);
>  		}
>  	}
>  }
> @@ -866,87 +898,6 @@ vhost_user_set_vring_base(struct virtio_net **pdev,
>  	return RTE_VHOST_MSG_RESULT_OK;
>  }
>  
> -static int
> -add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
> -		   uint64_t host_phys_addr, uint64_t size)
> -{
> -	struct guest_page *page, *last_page;
> -	struct guest_page *old_pages;
> -
> -	if (dev->nr_guest_pages == dev->max_guest_pages) {
> -		dev->max_guest_pages *= 2;
> -		old_pages = dev->guest_pages;
> -		dev->guest_pages = rte_realloc(dev->guest_pages,
> -					dev->max_guest_pages * sizeof(*page),
> -					RTE_CACHE_LINE_SIZE);
> -		if (dev->guest_pages == NULL) {
> -			VHOST_LOG_CONFIG(ERR, "cannot realloc guest_pages\n");
> -			rte_free(old_pages);
> -			return -1;
> -		}
> -	}
> -
> -	if (dev->nr_guest_pages > 0) {
> -		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
> -		/* merge if the two pages are continuous */
> -		if (host_phys_addr == last_page->host_phys_addr +
> -				      last_page->size) {
> -			last_page->size += size;
> -			return 0;
> -		}
> -	}
> -
> -	page = &dev->guest_pages[dev->nr_guest_pages++];
> -	page->guest_phys_addr = guest_phys_addr;
> -	page->host_phys_addr  = host_phys_addr;
> -	page->size = size;
> -
> -	return 0;
> -}
> -
> -static int
> -add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
> -		uint64_t page_size)
> -{
> -	uint64_t reg_size = reg->size;
> -	uint64_t host_user_addr  = reg->host_user_addr;
> -	uint64_t guest_phys_addr = reg->guest_phys_addr;
> -	uint64_t host_phys_addr;
> -	uint64_t size;
> -
> -	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
> -	size = page_size - (guest_phys_addr & (page_size - 1));
> -	size = RTE_MIN(size, reg_size);
> -
> -	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
> -		return -1;
> -
> -	host_user_addr  += size;
> -	guest_phys_addr += size;
> -	reg_size -= size;
> -
> -	while (reg_size > 0) {
> -		size = RTE_MIN(reg_size, page_size);
> -		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
> -						  host_user_addr);
> -		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
> -				size) < 0)
> -			return -1;
> -
> -		host_user_addr  += size;
> -		guest_phys_addr += size;
> -		reg_size -= size;
> -	}
> -
> -	/* sort guest page array if over binary search threshold */
> -	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
> -		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
> -			sizeof(struct guest_page), guest_page_addrcmp);
> -	}
> -
> -	return 0;
> -}
> -
>  #ifdef RTE_LIBRTE_VHOST_DEBUG
>  /* TODO: enable it only in debug mode? */
>  static void
> @@ -1105,6 +1056,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
>  	uint64_t mmap_size;
>  	uint64_t alignment;
>  	int populate;
> +	int ret;
>  
>  	/* Check for memory_size + mmap_offset overflow */
>  	if (mmap_offset >= -region->size) {
> @@ -1158,12 +1110,13 @@ vhost_user_mmap_region(struct virtio_net *dev,
>  	region->mmap_size = mmap_size;
>  	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
>  
> -	if (dev->async_copy)
> -		if (add_guest_pages(dev, region, alignment) < 0) {
> -			VHOST_LOG_CONFIG(ERR,
> -					"adding guest pages to region failed.\n");
> +	if (dev->async_copy) {
> +		ret = async_dma_map(region, true);
> +		if (ret) {
> +			VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed\n");
>  			return -1;

Maybe we're too late in the init already, but I would think we may want
to fall back to the SW implementation instead.

> -		}
> +			}
> +	}
>  
>  	VHOST_LOG_CONFIG(INFO,
>  			"guest memory region size: 0x%" PRIx64 "\n"
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index 8da8a86a10..88110d2cb3 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
>  	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
>  	int error = 0;
> -	uint64_t mapped_len;
>  
>  	uint32_t tlen = 0;
>  	int tvec_idx = 0;
> -	void *hpa;
>  
>  	if (unlikely(m == NULL)) {
>  		error = -1;
> @@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  
>  		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
>  
> -		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> -			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> -					buf_iova + buf_offset,
> -					cpy_len, &mapped_len);
> -
> -			if (unlikely(!hpa || mapped_len < cpy_threshold))
> -				break;
> -
> +		if (unlikely(cpy_len >= cpy_threshold)) {
>  			async_fill_vec(src_iovec + tvec_idx,
> -				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> -				mbuf_offset), (size_t)mapped_len);
> +				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), (size_t)cpy_len);
>  
>  			async_fill_vec(dst_iovec + tvec_idx,
> -					hpa, (size_t)mapped_len);
> -
> -			tlen += (uint32_t)mapped_len;
> -			cpy_len -= (uint32_t)mapped_len;
> -			mbuf_avail  -= (uint32_t)mapped_len;
> -			mbuf_offset += (uint32_t)mapped_len;
> -			buf_avail  -= (uint32_t)mapped_len;
> -			buf_offset += (uint32_t)mapped_len;
> +				(void *)((uintptr_t)(buf_addr + buf_offset)), (size_t)cpy_len);
> +
> +			tlen += cpy_len;
> +			mbuf_avail  -= cpy_len;
> +			mbuf_offset += cpy_len;
> +			buf_avail  -= cpy_len;
> +			buf_offset += cpy_len;
> +			cpy_len = 0;
>  			tvec_idx++;
>  		}
>  
> 



* Re: [dpdk-dev] [PATCH v3] vhost: enable IOMMU for async vhost
  2021-06-18 16:17   ` Maxime Coquelin
@ 2021-06-21  3:57     ` Hu, Jiayu
  2021-06-22  6:18     ` Ding, Xuan
  1 sibling, 0 replies; 25+ messages in thread
From: Hu, Jiayu @ 2021-06-21  3:57 UTC (permalink / raw)
  To: Maxime Coquelin, Ding, Xuan, Xia, Chenbo
  Cc: dev, Pai G, Sunil, Richardson, Bruce, Van Haaren, Harry, Liu, Yong

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Saturday, June 19, 2021 12:18 AM
> To: Ding, Xuan <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> <yong.liu@intel.com>
> Subject: Re: [PATCH v3] vhost: enable IOMMU for async vhost
> 
> Hi Xuan,
> 
> On 6/3/21 7:30 PM, xuan.ding@intel.com wrote:
> > From: Xuan Ding <xuan.ding@intel.com>
> >
> > For async copy, it is unsafe to directly use the physical address.
> > And current address translation from GPA to HPA via SW also takes CPU
> > cycles, these can all benefit from IOMMU.
> >
> > Since the existing DMA engine supports to use platform IOMMU, this
> > patch enables IOMMU for async vhost, which defines IOAT devices to use
> > virtual address instead of physical address.
> 
> We have to keep in mind a generic DMA api is coming, and maybe we want a
> SW implementation of a dmadev based on memcpy at least for
> testing/debugging purpose.

Agree, we need to support SW fallback, and I think this is also what this
patch wants to do. Originally, vhost passes an IOVA to the DMA callbacks; if
DPDK is in PA mode, we cannot fall back to SW copy. In this patch, vhost
passes VAs for both the pktmbuf and the guest's buffer to the DMA callbacks,
which makes SW fallback possible.

In terms of the generic DMA API, no matter whether it uses VA or IOVA as
buffer addresses, I think this design can work, as the DMA callback
implementations can do address translation anyway.
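
Purely as an illustration (the helper below is hypothetical, not part of
any DPDK API), a callback that receives VAs can pick whichever address
type the engine needs:

#include <stdint.h>
#include <rte_eal.h>
#include <rte_memory.h>

/* Hypothetical helper: choose the bus address handed to the DMA engine.
 * With IOVA as VA the virtual address is used directly and the IOMMU
 * does the rest; otherwise translate it to a physical address. */
static rte_iova_t
buf_to_dma_addr(void *va)
{
	if (rte_eal_iova_mode() == RTE_IOVA_VA)
		return (rte_iova_t)(uintptr_t)va;

	return rte_mem_virt2iova(va);
}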

> 
> > When set memory table, the frontend's memory will be mapped to the
> > default container of DPDK where IOAT devices have been added into.
> > When DMA copy fails, the virtual address provided to IOAT devices also
> > allow us fallback to SW copy or PA copy.
> >
> > With IOMMU enabled, to use IOAT devices:
> > 1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
> > 2. DPDK must use "--iova-mode=va".
> 
> I think this is problematic, at least we need to check the right iova mode has
> been selected, but even with doing that it is limiting.
> 
> What prevent us to reuse add_guest_pages() alogrithm to implement
> IOVA_AS_PA?

In the original design, vfio doesn't work, as vhost doesn't program the IOMMU
table with the guest's memory. Specifically, if DPDK is in VA mode, the IOVA
passed to the DMA callback is a VA, but the IOMMU cannot find the corresponding
PA for guest buffers; if DPDK is in PA mode, the IOVA passed to the DMA callback
is a PA. In this case, there are random errors for guest buffers when VT-d is
enabled, as the IOMMU behavior is uncertain. I think supporting vfio is one of
the motivations of this patch.

One concern about this patch is how to handle the case where IOVA is PA. If
IOVA is PA, the IOMMU cannot find the correct PA for the pktmbuf from the VA
passed by vhost. But can the DMA callback translate the VA to a PA before
calling the ioat/dmadev API? IMHO, IOVA as PA with vfio is not a recommended
configuration. Do you think it's a must for vhost to support this case?

Thanks,
Jiayu

> 
> >
> > Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> > ---
> >
> > v3:
> > * Fixed some typos.
> >
> > v2:
> > * Fixed a format issue.
> > * Added the dma unmap logic when device is closed.
> > ---
> >  doc/guides/prog_guide/vhost_lib.rst |  20 +++++
> >  lib/vhost/vhost_user.c              | 125 +++++++++-------------------
> >  lib/vhost/virtio_net.c              |  30 +++----
> >  3 files changed, 69 insertions(+), 106 deletions(-)
> >
> > diff --git a/doc/guides/prog_guide/vhost_lib.rst
> > b/doc/guides/prog_guide/vhost_lib.rst
> > index d18fb98910..5777f0da96 100644
> > --- a/doc/guides/prog_guide/vhost_lib.rst
> > +++ b/doc/guides/prog_guide/vhost_lib.rst
> > @@ -420,3 +420,23 @@ Finally, a set of device ops is defined for device
> specific operations:
> >  * ``get_notify_area``
> >
> >    Called to get the notify area info of the queue.
> > +
> > +Vhost async data path
> > +---------------------
> > +
> > +* Address mode
> > +
> > +  Modern IOAT devices support to use the IOMMU, which can avoid using
> > + the unsafe HPA. Besides, the CPU cycles took by SW to translate from
> > + GPA to HPA can also be saved. So IOAT devices are defined to use
> > + virtual address instead of physical address.
> > +
> > +  With IOMMU enabled, to use IOAT devices:
> > +  1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
> > +  2. DPDK must use ``--iova-mode=va``.
> > +
> > +* Fallback
> > +
> > +  When the DMA copy fails, the user who implements the transfer_data
> > + callback can fallback to SW copy or fallback to PA copy through
> > + rte_mem_virt2iova().
> > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index
> > 8f0eba6412..c33fa784ff 100644
> > --- a/lib/vhost/vhost_user.c
> > +++ b/lib/vhost/vhost_user.c
> > @@ -45,6 +45,7 @@
> >  #include <rte_common.h>
> >  #include <rte_malloc.h>
> >  #include <rte_log.h>
> > +#include <rte_vfio.h>
> >
> >  #include "iotlb.h"
> >  #include "vhost.h"
> > @@ -141,6 +142,34 @@ get_blk_size(int fd)
> >  	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;  }
> >
> > +static int
> > +async_dma_map(struct rte_vhost_mem_region *region, bool do_map) {
> > +	int ret = 0;
> > +	if (do_map) {
> > +		/* Add mapped region into the default container of DPDK. */
> > +		ret =
> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> > +						 region->host_user_addr,
> > +						 region->host_user_addr,
> > +						 region->size);
> > +		if (ret) {
> > +			VHOST_LOG_CONFIG(ERR, "DMA engine map
> failed\n");
> > +			return ret;
> > +		}
> > +	} else {
> > +		/* Remove mapped region from the default container of
> DPDK. */
> > +		ret =
> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> > +						   region->host_user_addr,
> > +						   region->host_user_addr,
> > +						   region->size);
> > +		if (ret) {
> > +			VHOST_LOG_CONFIG(ERR, "DMA engine unmap
> failed\n");
> > +			return ret;
> > +		}
> > +	}
> > +	return ret;
> > +}
> > +
> >  static void
> >  free_mem_region(struct virtio_net *dev)  { @@ -155,6 +184,9 @@
> > free_mem_region(struct virtio_net *dev)
> >  		if (reg->host_user_addr) {
> >  			munmap(reg->mmap_addr, reg->mmap_size);
> >  			close(reg->fd);
> > +
> > +			if (dev->async_copy)
> > +				async_dma_map(reg, false);
> >  		}
> >  	}
> >  }
> > @@ -866,87 +898,6 @@ vhost_user_set_vring_base(struct virtio_net
> **pdev,
> >  	return RTE_VHOST_MSG_RESULT_OK;
> >  }
> >
> > -static int
> > -add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
> > -		   uint64_t host_phys_addr, uint64_t size)
> > -{
> > -	struct guest_page *page, *last_page;
> > -	struct guest_page *old_pages;
> > -
> > -	if (dev->nr_guest_pages == dev->max_guest_pages) {
> > -		dev->max_guest_pages *= 2;
> > -		old_pages = dev->guest_pages;
> > -		dev->guest_pages = rte_realloc(dev->guest_pages,
> > -					dev->max_guest_pages *
> sizeof(*page),
> > -					RTE_CACHE_LINE_SIZE);
> > -		if (dev->guest_pages == NULL) {
> > -			VHOST_LOG_CONFIG(ERR, "cannot realloc
> guest_pages\n");
> > -			rte_free(old_pages);
> > -			return -1;
> > -		}
> > -	}
> > -
> > -	if (dev->nr_guest_pages > 0) {
> > -		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
> > -		/* merge if the two pages are continuous */
> > -		if (host_phys_addr == last_page->host_phys_addr +
> > -				      last_page->size) {
> > -			last_page->size += size;
> > -			return 0;
> > -		}
> > -	}
> > -
> > -	page = &dev->guest_pages[dev->nr_guest_pages++];
> > -	page->guest_phys_addr = guest_phys_addr;
> > -	page->host_phys_addr  = host_phys_addr;
> > -	page->size = size;
> > -
> > -	return 0;
> > -}
> > -
> > -static int
> > -add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region
> *reg,
> > -		uint64_t page_size)
> > -{
> > -	uint64_t reg_size = reg->size;
> > -	uint64_t host_user_addr  = reg->host_user_addr;
> > -	uint64_t guest_phys_addr = reg->guest_phys_addr;
> > -	uint64_t host_phys_addr;
> > -	uint64_t size;
> > -
> > -	host_phys_addr = rte_mem_virt2iova((void
> *)(uintptr_t)host_user_addr);
> > -	size = page_size - (guest_phys_addr & (page_size - 1));
> > -	size = RTE_MIN(size, reg_size);
> > -
> > -	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size)
> < 0)
> > -		return -1;
> > -
> > -	host_user_addr  += size;
> > -	guest_phys_addr += size;
> > -	reg_size -= size;
> > -
> > -	while (reg_size > 0) {
> > -		size = RTE_MIN(reg_size, page_size);
> > -		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
> > -						  host_user_addr);
> > -		if (add_one_guest_page(dev, guest_phys_addr,
> host_phys_addr,
> > -				size) < 0)
> > -			return -1;
> > -
> > -		host_user_addr  += size;
> > -		guest_phys_addr += size;
> > -		reg_size -= size;
> > -	}
> > -
> > -	/* sort guest page array if over binary search threshold */
> > -	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
> > -		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
> > -			sizeof(struct guest_page), guest_page_addrcmp);
> > -	}
> > -
> > -	return 0;
> > -}
> > -
> >  #ifdef RTE_LIBRTE_VHOST_DEBUG
> >  /* TODO: enable it only in debug mode? */  static void @@ -1105,6
> > +1056,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
> >  	uint64_t mmap_size;
> >  	uint64_t alignment;
> >  	int populate;
> > +	int ret;
> >
> >  	/* Check for memory_size + mmap_offset overflow */
> >  	if (mmap_offset >= -region->size) {
> > @@ -1158,12 +1110,13 @@ vhost_user_mmap_region(struct virtio_net
> *dev,
> >  	region->mmap_size = mmap_size;
> >  	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
> > mmap_offset;
> >
> > -	if (dev->async_copy)
> > -		if (add_guest_pages(dev, region, alignment) < 0) {
> > -			VHOST_LOG_CONFIG(ERR,
> > -					"adding guest pages to region
> failed.\n");
> > +	if (dev->async_copy) {
> > +		ret = async_dma_map(region, true);
> > +		if (ret) {
> > +			VHOST_LOG_CONFIG(ERR, "Configure IOMMU for
> DMA engine failed\n");
> >  			return -1;
> 
> Maybe we're too late in the init already, but I would think we may want to
> fallback to SW implementation insea
> 
> > -		}
> > +			}
> > +	}
> >
> >  	VHOST_LOG_CONFIG(INFO,
> >  			"guest memory region size: 0x%" PRIx64 "\n"
> > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index
> > 8da8a86a10..88110d2cb3 100644
> > --- a/lib/vhost/virtio_net.c
> > +++ b/lib/vhost/virtio_net.c
> > @@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> >  	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
> >  	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
> >  	int error = 0;
> > -	uint64_t mapped_len;
> >
> >  	uint32_t tlen = 0;
> >  	int tvec_idx = 0;
> > -	void *hpa;
> >
> >  	if (unlikely(m == NULL)) {
> >  		error = -1;
> > @@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev,
> > struct vhost_virtqueue *vq,
> >
> >  		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
> >
> > -		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> > -			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> > -					buf_iova + buf_offset,
> > -					cpy_len, &mapped_len);
> > -
> > -			if (unlikely(!hpa || mapped_len < cpy_threshold))
> > -				break;
> > -
> > +		if (unlikely(cpy_len >= cpy_threshold)) {
> >  			async_fill_vec(src_iovec + tvec_idx,
> > -				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> > -				mbuf_offset), (size_t)mapped_len);
> > +				rte_pktmbuf_mtod_offset(m, void *,
> mbuf_offset),
> > +(size_t)cpy_len);
> >
> >  			async_fill_vec(dst_iovec + tvec_idx,
> > -					hpa, (size_t)mapped_len);
> > -
> > -			tlen += (uint32_t)mapped_len;
> > -			cpy_len -= (uint32_t)mapped_len;
> > -			mbuf_avail  -= (uint32_t)mapped_len;
> > -			mbuf_offset += (uint32_t)mapped_len;
> > -			buf_avail  -= (uint32_t)mapped_len;
> > -			buf_offset += (uint32_t)mapped_len;
> > +				(void *)((uintptr_t)(buf_addr + buf_offset)),
> (size_t)cpy_len);
> > +
> > +			tlen += cpy_len;
> > +			mbuf_avail  -= cpy_len;
> > +			mbuf_offset += cpy_len;
> > +			buf_avail  -= cpy_len;
> > +			buf_offset += cpy_len;
> > +			cpy_len = 0;
> >  			tvec_idx++;
> >  		}
> >
> >



* Re: [dpdk-dev] [PATCH v3] vhost: enable IOMMU for async vhost
  2021-06-18 16:17   ` Maxime Coquelin
  2021-06-21  3:57     ` Hu, Jiayu
@ 2021-06-22  6:18     ` Ding, Xuan
  2021-06-29  9:23       ` Maxime Coquelin
  1 sibling, 1 reply; 25+ messages in thread
From: Ding, Xuan @ 2021-06-22  6:18 UTC (permalink / raw)
  To: Maxime Coquelin, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Jiang, Cheng1

Hi Maxime,

Replies are inline.	

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Saturday, June 19, 2021 12:18 AM
> To: Ding, Xuan <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Van
> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>
> Subject: Re: [PATCH v3] vhost: enable IOMMU for async vhost
> 
> Hi Xuan,
> 
> On 6/3/21 7:30 PM, xuan.ding@intel.com wrote:
> > From: Xuan Ding <xuan.ding@intel.com>
> >
> > For async copy, it is unsafe to directly use the physical address.
> > And current address translation from GPA to HPA via SW also takes
> > CPU cycles, these can all benefit from IOMMU.
> >
> > Since the existing DMA engine supports to use platform IOMMU,
> > this patch enables IOMMU for async vhost, which defines IOAT
> > devices to use virtual address instead of physical address.
> 
> We have to keep in mind a generic DMA api is coming, and maybe we want
> a SW implementation of a dmadev based on memcpy at least for
> testing/debugging purpose.

I noticed the generic dmadev model is under discussion. To support a SW
implementation, VA mode support is needed; this is also the problem that
this patch hopes to solve. Traditionally, the DMA engine can only use
physical addresses in PA mode.

> 
> > When set memory table, the frontend's memory will be mapped
> > to the default container of DPDK where IOAT devices have been
> > added into. When DMA copy fails, the virtual address provided
> > to IOAT devices also allow us fallback to SW copy or PA copy.
> >
> > With IOMMU enabled, to use IOAT devices:
> > 1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
> > 2. DPDK must use "--iova-mode=va".
> 
> I think this is problematic, at least we need to check the right iova
> mode has been selected, but even with doing that it is limiting.

As a library, vhost is not aware of the address type selected by the device
(PA or VA) or of the current DPDK IOVA mode. To some extent, this patch is a
proposal: with the device fed VAs, SW fallback can be supported, and the VA
can also be translated to a PA through rte_mem_virt2iova(). In the end, the
address type used by the device is determined by the callback, not the other
way around.

If the DMA callback implementer follows this design, SW fallback can be
supported. I would be very grateful if you could provide some insights on
this design. :)
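
To make the idea concrete, below is a minimal sketch; the dma_copy_t
callback type is an assumption of mine, not an existing DPDK type. It
only shows how a transfer implementation could fall back to SW copy
when the engine rejects a job (a PA fallback would instead call
rte_mem_virt2iova() on the same VAs):

#include <stddef.h>
#include <string.h>

/* Hypothetical DMA enqueue callback: returns 0 on success, <0 on failure. */
typedef int (*dma_copy_t)(void *src, void *dst, size_t len);

/* Copy one segment, preferring the DMA engine and falling back to memcpy.
 * Both paths are possible only because src and dst are virtual addresses. */
static void
copy_with_fallback(dma_copy_t dma_copy, void *src, void *dst, size_t len)
{
	if (dma_copy != NULL && dma_copy(src, dst, len) == 0)
		return;

	memcpy(dst, src, len);
}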

> 
> What prevent us to reuse add_guest_pages() alogrithm to implement
> IOVA_AS_PA?

If IOVA is PA, it's not easy to translate a PA back to a VA to support a SW
implementation. So far, I don't have any good ideas for being compatible with
IOVA_AS_PA and IOVA_AS_VA at the same time, because it would require the vhost
library to select PA or VA for the DMA device according to the DPDK IOVA mode.

Thanks,
Xuan

> 
> >
> > Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> > ---
> >
> > v3:
> > * Fixed some typos.
> >
> > v2:
> > * Fixed a format issue.
> > * Added the dma unmap logic when device is closed.
> > ---
> >  doc/guides/prog_guide/vhost_lib.rst |  20 +++++
> >  lib/vhost/vhost_user.c              | 125 +++++++++-------------------
> >  lib/vhost/virtio_net.c              |  30 +++----
> >  3 files changed, 69 insertions(+), 106 deletions(-)
> >
> > diff --git a/doc/guides/prog_guide/vhost_lib.rst
> b/doc/guides/prog_guide/vhost_lib.rst
> > index d18fb98910..5777f0da96 100644
> > --- a/doc/guides/prog_guide/vhost_lib.rst
> > +++ b/doc/guides/prog_guide/vhost_lib.rst
> > @@ -420,3 +420,23 @@ Finally, a set of device ops is defined for device
> specific operations:
> >  * ``get_notify_area``
> >
> >    Called to get the notify area info of the queue.
> > +
> > +Vhost async data path
> > +---------------------
> > +
> > +* Address mode
> > +
> > +  Modern IOAT devices support to use the IOMMU, which can avoid using
> > +  the unsafe HPA. Besides, the CPU cycles took by SW to translate from
> > +  GPA to HPA can also be saved. So IOAT devices are defined to use
> > +  virtual address instead of physical address.
> > +
> > +  With IOMMU enabled, to use IOAT devices:
> > +  1. IOAT devices must be binded to vfio-pci, rather than igb_uio.
> > +  2. DPDK must use ``--iova-mode=va``.
> > +
> > +* Fallback
> > +
> > +  When the DMA copy fails, the user who implements the transfer_data
> > +  callback can fallback to SW copy or fallback to PA copy through
> > +  rte_mem_virt2iova().
> > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> > index 8f0eba6412..c33fa784ff 100644
> > --- a/lib/vhost/vhost_user.c
> > +++ b/lib/vhost/vhost_user.c
> > @@ -45,6 +45,7 @@
> >  #include <rte_common.h>
> >  #include <rte_malloc.h>
> >  #include <rte_log.h>
> > +#include <rte_vfio.h>
> >
> >  #include "iotlb.h"
> >  #include "vhost.h"
> > @@ -141,6 +142,34 @@ get_blk_size(int fd)
> >  	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
> >  }
> >
> > +static int
> > +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
> > +{
> > +	int ret = 0;
> > +	if (do_map) {
> > +		/* Add mapped region into the default container of DPDK. */
> > +		ret =
> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> > +						 region->host_user_addr,
> > +						 region->host_user_addr,
> > +						 region->size);
> > +		if (ret) {
> > +			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> > +			return ret;
> > +		}
> > +	} else {
> > +		/* Remove mapped region from the default container of DPDK.
> */
> > +		ret =
> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> > +						   region->host_user_addr,
> > +						   region->host_user_addr,
> > +						   region->size);
> > +		if (ret) {
> > +			VHOST_LOG_CONFIG(ERR, "DMA engine unmap
> failed\n");
> > +			return ret;
> > +		}
> > +	}
> > +	return ret;
> > +}
> > +
> >  static void
> >  free_mem_region(struct virtio_net *dev)
> >  {
> > @@ -155,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
> >  		if (reg->host_user_addr) {
> >  			munmap(reg->mmap_addr, reg->mmap_size);
> >  			close(reg->fd);
> > +
> > +			if (dev->async_copy)
> > +				async_dma_map(reg, false);
> >  		}
> >  	}
> >  }
> > @@ -866,87 +898,6 @@ vhost_user_set_vring_base(struct virtio_net **pdev,
> >  	return RTE_VHOST_MSG_RESULT_OK;
> >  }
> >
> > -static int
> > -add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
> > -		   uint64_t host_phys_addr, uint64_t size)
> > -{
> > -	struct guest_page *page, *last_page;
> > -	struct guest_page *old_pages;
> > -
> > -	if (dev->nr_guest_pages == dev->max_guest_pages) {
> > -		dev->max_guest_pages *= 2;
> > -		old_pages = dev->guest_pages;
> > -		dev->guest_pages = rte_realloc(dev->guest_pages,
> > -					dev->max_guest_pages * sizeof(*page),
> > -					RTE_CACHE_LINE_SIZE);
> > -		if (dev->guest_pages == NULL) {
> > -			VHOST_LOG_CONFIG(ERR, "cannot realloc
> guest_pages\n");
> > -			rte_free(old_pages);
> > -			return -1;
> > -		}
> > -	}
> > -
> > -	if (dev->nr_guest_pages > 0) {
> > -		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
> > -		/* merge if the two pages are continuous */
> > -		if (host_phys_addr == last_page->host_phys_addr +
> > -				      last_page->size) {
> > -			last_page->size += size;
> > -			return 0;
> > -		}
> > -	}
> > -
> > -	page = &dev->guest_pages[dev->nr_guest_pages++];
> > -	page->guest_phys_addr = guest_phys_addr;
> > -	page->host_phys_addr  = host_phys_addr;
> > -	page->size = size;
> > -
> > -	return 0;
> > -}
> > -
> > -static int
> > -add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
> > -		uint64_t page_size)
> > -{
> > -	uint64_t reg_size = reg->size;
> > -	uint64_t host_user_addr  = reg->host_user_addr;
> > -	uint64_t guest_phys_addr = reg->guest_phys_addr;
> > -	uint64_t host_phys_addr;
> > -	uint64_t size;
> > -
> > -	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
> > -	size = page_size - (guest_phys_addr & (page_size - 1));
> > -	size = RTE_MIN(size, reg_size);
> > -
> > -	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) <
> 0)
> > -		return -1;
> > -
> > -	host_user_addr  += size;
> > -	guest_phys_addr += size;
> > -	reg_size -= size;
> > -
> > -	while (reg_size > 0) {
> > -		size = RTE_MIN(reg_size, page_size);
> > -		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
> > -						  host_user_addr);
> > -		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
> > -				size) < 0)
> > -			return -1;
> > -
> > -		host_user_addr  += size;
> > -		guest_phys_addr += size;
> > -		reg_size -= size;
> > -	}
> > -
> > -	/* sort guest page array if over binary search threshold */
> > -	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
> > -		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
> > -			sizeof(struct guest_page), guest_page_addrcmp);
> > -	}
> > -
> > -	return 0;
> > -}
> > -
> >  #ifdef RTE_LIBRTE_VHOST_DEBUG
> >  /* TODO: enable it only in debug mode? */
> >  static void
> > @@ -1105,6 +1056,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
> >  	uint64_t mmap_size;
> >  	uint64_t alignment;
> >  	int populate;
> > +	int ret;
> >
> >  	/* Check for memory_size + mmap_offset overflow */
> >  	if (mmap_offset >= -region->size) {
> > @@ -1158,12 +1110,13 @@ vhost_user_mmap_region(struct virtio_net *dev,
> >  	region->mmap_size = mmap_size;
> >  	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
> mmap_offset;
> >
> > -	if (dev->async_copy)
> > -		if (add_guest_pages(dev, region, alignment) < 0) {
> > -			VHOST_LOG_CONFIG(ERR,
> > -					"adding guest pages to region
> failed.\n");
> > +	if (dev->async_copy) {
> > +		ret = async_dma_map(region, true);
> > +		if (ret) {
> > +			VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA
> engine failed\n");
> >  			return -1;
> 
> Maybe we're too late in the init already, but I would think we may want
> to fall back to the SW implementation instead.
> 
> > -		}
> > +			}
> > +	}
> >
> >  	VHOST_LOG_CONFIG(INFO,
> >  			"guest memory region size: 0x%" PRIx64 "\n"
> > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> > index 8da8a86a10..88110d2cb3 100644
> > --- a/lib/vhost/virtio_net.c
> > +++ b/lib/vhost/virtio_net.c
> > @@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> >  	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
> >  	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
> >  	int error = 0;
> > -	uint64_t mapped_len;
> >
> >  	uint32_t tlen = 0;
> >  	int tvec_idx = 0;
> > -	void *hpa;
> >
> >  	if (unlikely(m == NULL)) {
> >  		error = -1;
> > @@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
> >
> >  		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
> >
> > -		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> > -			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> > -					buf_iova + buf_offset,
> > -					cpy_len, &mapped_len);
> > -
> > -			if (unlikely(!hpa || mapped_len < cpy_threshold))
> > -				break;
> > -
> > +		if (unlikely(cpy_len >= cpy_threshold)) {
> >  			async_fill_vec(src_iovec + tvec_idx,
> > -				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> > -				mbuf_offset), (size_t)mapped_len);
> > +				rte_pktmbuf_mtod_offset(m, void *,
> mbuf_offset), (size_t)cpy_len);
> >
> >  			async_fill_vec(dst_iovec + tvec_idx,
> > -					hpa, (size_t)mapped_len);
> > -
> > -			tlen += (uint32_t)mapped_len;
> > -			cpy_len -= (uint32_t)mapped_len;
> > -			mbuf_avail  -= (uint32_t)mapped_len;
> > -			mbuf_offset += (uint32_t)mapped_len;
> > -			buf_avail  -= (uint32_t)mapped_len;
> > -			buf_offset += (uint32_t)mapped_len;
> > +				(void *)((uintptr_t)(buf_addr + buf_offset)),
> (size_t)cpy_len);
> > +
> > +			tlen += cpy_len;
> > +			mbuf_avail  -= cpy_len;
> > +			mbuf_offset += cpy_len;
> > +			buf_avail  -= cpy_len;
> > +			buf_offset += cpy_len;
> > +			cpy_len = 0;
> >  			tvec_idx++;
> >  		}
> >
> >


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3] vhost: enable IOMMU for async vhost
  2021-06-22  6:18     ` Ding, Xuan
@ 2021-06-29  9:23       ` Maxime Coquelin
  2021-07-01  5:12         ` Ding, Xuan
  0 siblings, 1 reply; 25+ messages in thread
From: Maxime Coquelin @ 2021-06-29  9:23 UTC (permalink / raw)
  To: Ding, Xuan, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Jiang, Cheng1

Hi Xuan,

On 6/22/21 8:18 AM, Ding, Xuan wrote:
> Hi Maxime,
> 
> Replies are inline.	
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Saturday, June 19, 2021 12:18 AM
>> To: Ding, Xuan <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>
>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
>> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Van
>> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>
>> Subject: Re: [PATCH v3] vhost: enable IOMMU for async vhost
>>
>> Hi Xuan,
>>
>> On 6/3/21 7:30 PM, xuan.ding@intel.com wrote:
>>> From: Xuan Ding <xuan.ding@intel.com>
>>>
>>> For async copy, it is unsafe to directly use the physical address.
>>> And the current address translation from GPA to HPA via SW also takes
>>> CPU cycles; both can benefit from the IOMMU.
>>>
>>> Since the existing DMA engine supports using the platform IOMMU,
>>> this patch enables IOMMU for async vhost, which defines IOAT
>>> devices to use virtual addresses instead of physical addresses.
>>
>> We have to keep in mind a generic DMA api is coming, and maybe we want
>> a SW implementation of a dmadev based on memcpy at least for
>> testing/debugging purpose.
> 
> I noticed the generic dmadev model is under discussion. To support a SW
> implementation, VA mode support is needed; this is also the problem
> that this patch hopes to solve. Traditionally, the DMA engine can only use
> physical addresses in PA mode.
> 
>>
>>> When the memory table is set, the frontend's memory will be mapped
>>> to the default container of DPDK, to which IOAT devices have been
>>> added. When a DMA copy fails, the virtual address provided
>>> to IOAT devices also allows us to fall back to SW copy or PA copy.
>>>
>>> With IOMMU enabled, to use IOAT devices:
>>> 1. IOAT devices must be bound to vfio-pci, rather than igb_uio.
>>> 2. DPDK must use "--iova-mode=va".
>>
>> I think this is problematic, at least we need to check the right iova
>> mode has been selected, but even with doing that it is limiting.
> 
> As a library, vhost is not aware of the address type (PA or VA) selected by
> the device, nor of the current DPDK IOVA mode. To some extent, this patch is a proposal.

If I'm not mistaken, the DMA device driver init should fail if it does
not support the DPDK IOVA mode.

Then, on Vhost lib side, you should be able to get the IOVA mode by
using the rte_eal_iova_mode() API.
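
For illustration only, a minimal sketch of the kind of run-time check I have
in mind (function name and placement are placeholders, not part of the patch):

#include <rte_eal.h>
#include <rte_memory.h>

/* Pick the address to hand to the DMA device based on the EAL IOVA mode. */
static inline rte_iova_t
async_region_iova(uint64_t host_user_addr)
{
	if (rte_eal_iova_mode() == RTE_IOVA_VA)
		/* IOVA is VA: the mapped virtual address can be used as-is. */
		return (rte_iova_t)host_user_addr;

	/* IOVA is PA (or DC): translate the virtual address to its IOVA. */
	return rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
}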

> With device fed with VA, SW fallback can be supported.
> And VA can also be translated to PA through rte_mem_virt2iova().
> Finally, the address selected by the device is determined by callback.
> Not vice versa.
> 
> If the DMA callback implementer follows this design, SW fallback can be supported.
> I would be very grateful if you could provide some insights for this design. :)

TBH, I find the async design overly complicated.
Having some descriptors handled by the DMA engine and others by the CPU
makes it extremely hard to debug. Also, it makes the Vhost library usage
less deterministic.

>>
>> What prevents us from reusing the add_guest_pages() algorithm to implement
>> IOVA_AS_PA?
> 
> If IOVA is PA, it's not easy to translate PA to VA to support SW implementation.

What prevents you from using dev->guest_pages[] in that case to do the
translations?

> Until now, I don't have any good ideas to be compatible with IOVA_AS_PA 
> and IOVA_AS_VA at the same time, because it requires vhost library to
> select PA/VA for DMA device according to different DPDK iova mode.

If the DMA device claims to support IOVA_AS_PA at probe time, it should
be usable by the vhost library. It might not be the most efficient
mode, but we cannot just have a comment in the documentation saying that
IOVA_AS_VA is the only supported mode, without any safety check in the
code itself.

Regards,
Maxime

> Thanks,
> Xuan
> 
>>
>>>
>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>>> ---
>>>
>>> v3:
>>> * Fixed some typos.
>>>
>>> v2:
>>> * Fixed a format issue.
>>> * Added the dma unmap logic when device is closed.
>>> ---
>>>  doc/guides/prog_guide/vhost_lib.rst |  20 +++++
>>>  lib/vhost/vhost_user.c              | 125 +++++++++-------------------
>>>  lib/vhost/virtio_net.c              |  30 +++----
>>>  3 files changed, 69 insertions(+), 106 deletions(-)
>>>
>>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
>> b/doc/guides/prog_guide/vhost_lib.rst
>>> index d18fb98910..5777f0da96 100644
>>> --- a/doc/guides/prog_guide/vhost_lib.rst
>>> +++ b/doc/guides/prog_guide/vhost_lib.rst
>>> @@ -420,3 +420,23 @@ Finally, a set of device ops is defined for device
>> specific operations:
>>>  * ``get_notify_area``
>>>
>>>    Called to get the notify area info of the queue.
>>> +
>>> +Vhost async data path
>>> +---------------------
>>> +
>>> +* Address mode
>>> +
>>> +  Modern IOAT devices support using the IOMMU, which avoids relying on
>>> +  the unsafe HPA. Besides, the CPU cycles taken by SW to translate from
>>> +  GPA to HPA can also be saved. So IOAT devices are defined to use
>>> +  virtual addresses instead of physical addresses.
>>> +
>>> +  With IOMMU enabled, to use IOAT devices:
>>> +  1. IOAT devices must be bound to vfio-pci, rather than igb_uio.
>>> +  2. DPDK must use ``--iova-mode=va``.
>>> +
>>> +* Fallback
>>> +
>>> +  When the DMA copy fails, the user who implements the transfer_data
>>> +  callback can fall back to SW copy, or fall back to PA copy through
>>> +  rte_mem_virt2iova().
>>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
>>> index 8f0eba6412..c33fa784ff 100644
>>> --- a/lib/vhost/vhost_user.c
>>> +++ b/lib/vhost/vhost_user.c
>>> @@ -45,6 +45,7 @@
>>>  #include <rte_common.h>
>>>  #include <rte_malloc.h>
>>>  #include <rte_log.h>
>>> +#include <rte_vfio.h>
>>>
>>>  #include "iotlb.h"
>>>  #include "vhost.h"
>>> @@ -141,6 +142,34 @@ get_blk_size(int fd)
>>>  	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
>>>  }
>>>
>>> +static int
>>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
>>> +{
>>> +	int ret = 0;
>>> +	if (do_map) {
>>> +		/* Add mapped region into the default container of DPDK. */
>>> +		ret =
>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>> +						 region->host_user_addr,
>>> +						 region->host_user_addr,
>>> +						 region->size);
>>> +		if (ret) {
>>> +			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
>>> +			return ret;
>>> +		}
>>> +	} else {
>>> +		/* Remove mapped region from the default container of DPDK.
>> */
>>> +		ret =
>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>> +						   region->host_user_addr,
>>> +						   region->host_user_addr,
>>> +						   region->size);
>>> +		if (ret) {
>>> +			VHOST_LOG_CONFIG(ERR, "DMA engine unmap
>> failed\n");
>>> +			return ret;
>>> +		}
>>> +	}
>>> +	return ret;
>>> +}
>>> +
>>>  static void
>>>  free_mem_region(struct virtio_net *dev)
>>>  {
>>> @@ -155,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
>>>  		if (reg->host_user_addr) {
>>>  			munmap(reg->mmap_addr, reg->mmap_size);
>>>  			close(reg->fd);
>>> +
>>> +			if (dev->async_copy)
>>> +				async_dma_map(reg, false);
>>>  		}
>>>  	}
>>>  }
>>> @@ -866,87 +898,6 @@ vhost_user_set_vring_base(struct virtio_net **pdev,
>>>  	return RTE_VHOST_MSG_RESULT_OK;
>>>  }
>>>
>>> -static int
>>> -add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
>>> -		   uint64_t host_phys_addr, uint64_t size)
>>> -{
>>> -	struct guest_page *page, *last_page;
>>> -	struct guest_page *old_pages;
>>> -
>>> -	if (dev->nr_guest_pages == dev->max_guest_pages) {
>>> -		dev->max_guest_pages *= 2;
>>> -		old_pages = dev->guest_pages;
>>> -		dev->guest_pages = rte_realloc(dev->guest_pages,
>>> -					dev->max_guest_pages * sizeof(*page),
>>> -					RTE_CACHE_LINE_SIZE);
>>> -		if (dev->guest_pages == NULL) {
>>> -			VHOST_LOG_CONFIG(ERR, "cannot realloc
>> guest_pages\n");
>>> -			rte_free(old_pages);
>>> -			return -1;
>>> -		}
>>> -	}
>>> -
>>> -	if (dev->nr_guest_pages > 0) {
>>> -		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
>>> -		/* merge if the two pages are continuous */
>>> -		if (host_phys_addr == last_page->host_phys_addr +
>>> -				      last_page->size) {
>>> -			last_page->size += size;
>>> -			return 0;
>>> -		}
>>> -	}
>>> -
>>> -	page = &dev->guest_pages[dev->nr_guest_pages++];
>>> -	page->guest_phys_addr = guest_phys_addr;
>>> -	page->host_phys_addr  = host_phys_addr;
>>> -	page->size = size;
>>> -
>>> -	return 0;
>>> -}
>>> -
>>> -static int
>>> -add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
>>> -		uint64_t page_size)
>>> -{
>>> -	uint64_t reg_size = reg->size;
>>> -	uint64_t host_user_addr  = reg->host_user_addr;
>>> -	uint64_t guest_phys_addr = reg->guest_phys_addr;
>>> -	uint64_t host_phys_addr;
>>> -	uint64_t size;
>>> -
>>> -	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
>>> -	size = page_size - (guest_phys_addr & (page_size - 1));
>>> -	size = RTE_MIN(size, reg_size);
>>> -
>>> -	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) <
>> 0)
>>> -		return -1;
>>> -
>>> -	host_user_addr  += size;
>>> -	guest_phys_addr += size;
>>> -	reg_size -= size;
>>> -
>>> -	while (reg_size > 0) {
>>> -		size = RTE_MIN(reg_size, page_size);
>>> -		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
>>> -						  host_user_addr);
>>> -		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
>>> -				size) < 0)
>>> -			return -1;
>>> -
>>> -		host_user_addr  += size;
>>> -		guest_phys_addr += size;
>>> -		reg_size -= size;
>>> -	}
>>> -
>>> -	/* sort guest page array if over binary search threshold */
>>> -	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
>>> -		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
>>> -			sizeof(struct guest_page), guest_page_addrcmp);
>>> -	}
>>> -
>>> -	return 0;
>>> -}
>>> -
>>>  #ifdef RTE_LIBRTE_VHOST_DEBUG
>>>  /* TODO: enable it only in debug mode? */
>>>  static void
>>> @@ -1105,6 +1056,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
>>>  	uint64_t mmap_size;
>>>  	uint64_t alignment;
>>>  	int populate;
>>> +	int ret;
>>>
>>>  	/* Check for memory_size + mmap_offset overflow */
>>>  	if (mmap_offset >= -region->size) {
>>> @@ -1158,12 +1110,13 @@ vhost_user_mmap_region(struct virtio_net *dev,
>>>  	region->mmap_size = mmap_size;
>>>  	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
>> mmap_offset;
>>>
>>> -	if (dev->async_copy)
>>> -		if (add_guest_pages(dev, region, alignment) < 0) {
>>> -			VHOST_LOG_CONFIG(ERR,
>>> -					"adding guest pages to region
>> failed.\n");
>>> +	if (dev->async_copy) {
>>> +		ret = async_dma_map(region, true);
>>> +		if (ret) {
>>> +			VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA
>> engine failed\n");
>>>  			return -1;
>>
>> Maybe we're too late in the init already, but I would think we may want
>> to fall back to the SW implementation instead.
>>
>>> -		}
>>> +			}
>>> +	}
>>>
>>>  	VHOST_LOG_CONFIG(INFO,
>>>  			"guest memory region size: 0x%" PRIx64 "\n"
>>> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
>>> index 8da8a86a10..88110d2cb3 100644
>>> --- a/lib/vhost/virtio_net.c
>>> +++ b/lib/vhost/virtio_net.c
>>> @@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct
>> vhost_virtqueue *vq,
>>>  	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
>>>  	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
>>>  	int error = 0;
>>> -	uint64_t mapped_len;
>>>
>>>  	uint32_t tlen = 0;
>>>  	int tvec_idx = 0;
>>> -	void *hpa;
>>>
>>>  	if (unlikely(m == NULL)) {
>>>  		error = -1;
>>> @@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev,
>> struct vhost_virtqueue *vq,
>>>
>>>  		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
>>>
>>> -		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
>>> -			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
>>> -					buf_iova + buf_offset,
>>> -					cpy_len, &mapped_len);
>>> -
>>> -			if (unlikely(!hpa || mapped_len < cpy_threshold))
>>> -				break;
>>> -
>>> +		if (unlikely(cpy_len >= cpy_threshold)) {
>>>  			async_fill_vec(src_iovec + tvec_idx,
>>> -				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
>>> -				mbuf_offset), (size_t)mapped_len);
>>> +				rte_pktmbuf_mtod_offset(m, void *,
>> mbuf_offset), (size_t)cpy_len);
>>>
>>>  			async_fill_vec(dst_iovec + tvec_idx,
>>> -					hpa, (size_t)mapped_len);
>>> -
>>> -			tlen += (uint32_t)mapped_len;
>>> -			cpy_len -= (uint32_t)mapped_len;
>>> -			mbuf_avail  -= (uint32_t)mapped_len;
>>> -			mbuf_offset += (uint32_t)mapped_len;
>>> -			buf_avail  -= (uint32_t)mapped_len;
>>> -			buf_offset += (uint32_t)mapped_len;
>>> +				(void *)((uintptr_t)(buf_addr + buf_offset)),
>> (size_t)cpy_len);
>>> +
>>> +			tlen += cpy_len;
>>> +			mbuf_avail  -= cpy_len;
>>> +			mbuf_offset += cpy_len;
>>> +			buf_avail  -= cpy_len;
>>> +			buf_offset += cpy_len;
>>> +			cpy_len = 0;
>>>  			tvec_idx++;
>>>  		}
>>>
>>>
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3] vhost: enable IOMMU for async vhost
  2021-06-29  9:23       ` Maxime Coquelin
@ 2021-07-01  5:12         ` Ding, Xuan
  0 siblings, 0 replies; 25+ messages in thread
From: Ding, Xuan @ 2021-07-01  5:12 UTC (permalink / raw)
  To: Maxime Coquelin, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Jiang, Cheng1

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, June 29, 2021 5:23 PM
> To: Ding, Xuan <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Van
> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>;
> Jiang, Cheng1 <cheng1.jiang@intel.com>
> Subject: Re: [PATCH v3] vhost: enable IOMMU for async vhost
> 
> Hi Xuan,
> 
> On 6/22/21 8:18 AM, Ding, Xuan wrote:
> > Hi Maxime,
> >
> > Replies are inline.
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Saturday, June 19, 2021 12:18 AM
> >> To: Ding, Xuan <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>
> >> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> >> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
> Van
> >> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>
> >> Subject: Re: [PATCH v3] vhost: enable IOMMU for async vhost
> >>
> >> Hi Xuan,
> >>
> >> On 6/3/21 7:30 PM, xuan.ding@intel.com wrote:
> >>> From: Xuan Ding <xuan.ding@intel.com>
> >>>
> >>> For async copy, it is unsafe to directly use the physical address.
> >>> And the current address translation from GPA to HPA via SW also takes
> >>> CPU cycles; both can benefit from the IOMMU.
> >>>
> >>> Since the existing DMA engine supports using the platform IOMMU,
> >>> this patch enables IOMMU for async vhost, which defines IOAT
> >>> devices to use virtual addresses instead of physical addresses.
> >>
> >> We have to keep in mind a generic DMA api is coming, and maybe we want
> >> a SW implementation of a dmadev based on memcpy at least for
> >> testing/debugging purpose.
> >
> > I noticed the generic dmadev model is under discussion. To support a SW
> > implementation, VA mode support is needed; this is also the problem
> > that this patch hopes to solve. Traditionally, the DMA engine can only use
> > physical addresses in PA mode.
> >
> >>
> >>> When the memory table is set, the frontend's memory will be mapped
> >>> to the default container of DPDK, to which IOAT devices have been
> >>> added. When a DMA copy fails, the virtual address provided
> >>> to IOAT devices also allows us to fall back to SW copy or PA copy.
> >>>
> >>> With IOMMU enabled, to use IOAT devices:
> >>> 1. IOAT devices must be bound to vfio-pci, rather than igb_uio.
> >>> 2. DPDK must use "--iova-mode=va".
> >>
> >> I think this is problematic, at least we need to check the right iova
> >> mode has been selected, but even with doing that it is limiting.
> >
> > As a library, vhost is not aware of the address type (PA or VA) selected by
> > the device, nor of the current DPDK IOVA mode. To some extent, this patch is a proposal.
> 
> If I'm not mistaken, the DMA device driver init should fail if it does
> not support the DPDK IOVA mode.
> 
> Then, on Vhost lib side, you should be able to get the IOVA mode by
> using the rte_eal_iova_mode() API.

I get your point. If the Vhost lib is able to get the IOVA mode, I think it is
possible to be compatible with the different DPDK IOVA modes. I will work out
a new patch to pass the IOVA to the callback instead of the virtual address only.

> 
> > With device fed with VA, SW fallback can be supported.
> > And VA can also be translated to PA through rte_mem_virt2iova().
> > Finally, the address selected by the device is determined by callback.
> > Not vice versa.
> >
> > If the DMA callback implementer follows this design, SW fallback can be
> supported.
> > I would be very grateful if you could provide some insights for this design. :)
> 
> TBH, I find the async design overly complicated.
> Having some descriptors handled by the DMA engine and others by the CPU
> makes it extremely hard to debug. Also, it makes the Vhost library usage
> less deterministic.

The point here is the difference in copy efficiency. For small packets
(below the threshold), CPU copy performs better. When the packet length
exceeds the threshold, the DMA engine copy brings a significant performance
improvement.
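
Roughly, the split looks like the following (a simplified, self-contained
sketch; the names are placeholders, not the exact vhost code):

#include <string.h>
#include <sys/uio.h>

/* Queue a segment for the DMA engine if it is large enough to amortize the
 * offload overhead, otherwise copy it synchronously on the CPU. */
static void
dispatch_copy(struct iovec *src_iov, struct iovec *dst_iov, int *n_vec,
	      void *src, void *dst, size_t len, size_t threshold)
{
	if (len >= threshold) {
		/* Large segment: hand it to the async/DMA path. */
		src_iov[*n_vec].iov_base = src;
		src_iov[*n_vec].iov_len = len;
		dst_iov[*n_vec].iov_base = dst;
		dst_iov[*n_vec].iov_len = len;
		(*n_vec)++;
	} else {
		/* Small segment: a plain memcpy beats the DMA setup cost. */
		memcpy(dst, src, len);
	}
}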

> 
> >>
> >> What prevents us from reusing the add_guest_pages() algorithm to implement
> >> IOVA_AS_PA?
> >
> > If IOVA is PA, it's not easy to translate PA to VA to support SW implementation.
> 
> What prevents you from using dev->guest_pages[] in that case to do the
> translations?

Yes, you are right, using rte_mem_iova2virt() can help to do so.
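
Something along these lines (just a sketch of the fallback idea, assuming the
destination VA is already known and the IOVA belongs to memory known to DPDK):

#include <string.h>
#include <rte_memory.h>

/* SW fallback: recover a virtual address from the IOVA that was handed to
 * the DMA device, and do the copy on the CPU instead. */
static int
sw_fallback_copy(void *dst_va, rte_iova_t src_iova, size_t len)
{
	void *src_va = rte_mem_iova2virt(src_iova);

	if (src_va == NULL)
		return -1;

	memcpy(dst_va, src_va, len);
	return 0;
}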

> 
> > Until now, I don't have any good ideas to be compatible with IOVA_AS_PA
> > and IOVA_AS_VA at the same time, because it requires vhost library to
> > select PA/VA for DMA device according to different DPDK iova mode.
> 
> If the DMA device claims to support IOVA_AS_PA at probe time, it should
> be usable by the vhost library. It might not be the most efficient
> mode, but we cannot just have a comment in the documentation saying that
> IOVA_AS_VA is the only supported mode, without any safety check in the
> code itself.

Being compatible with IOVA_AS_PA is possible; the reason for the IOVA_AS_VA
design is that VA is easier to work with, so some compatibility was sacrificed.

I will adopt your suggestion, thanks very much!

Regards,
Xuan

> 
> Regards,
> Maxime
> 
> > Thanks,
> > Xuan
> >
> >>
> >>>
> >>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> >>> ---
> >>>
> >>> v3:
> >>> * Fixed some typos.
> >>>
> >>> v2:
> >>> * Fixed a format issue.
> >>> * Added the dma unmap logic when device is closed.
> >>> ---
> >>>  doc/guides/prog_guide/vhost_lib.rst |  20 +++++
> >>>  lib/vhost/vhost_user.c              | 125 +++++++++-------------------
> >>>  lib/vhost/virtio_net.c              |  30 +++----
> >>>  3 files changed, 69 insertions(+), 106 deletions(-)
> >>>
> >>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
> >> b/doc/guides/prog_guide/vhost_lib.rst
> >>> index d18fb98910..5777f0da96 100644
> >>> --- a/doc/guides/prog_guide/vhost_lib.rst
> >>> +++ b/doc/guides/prog_guide/vhost_lib.rst
> >>> @@ -420,3 +420,23 @@ Finally, a set of device ops is defined for device
> >> specific operations:
> >>>  * ``get_notify_area``
> >>>
> >>>    Called to get the notify area info of the queue.
> >>> +
> >>> +Vhost async data path
> >>> +---------------------
> >>> +
> >>> +* Address mode
> >>> +
> >>> +  Modern IOAT devices support using the IOMMU, which avoids relying on
> >>> +  the unsafe HPA. Besides, the CPU cycles taken by SW to translate from
> >>> +  GPA to HPA can also be saved. So IOAT devices are defined to use
> >>> +  virtual addresses instead of physical addresses.
> >>> +
> >>> +  With IOMMU enabled, to use IOAT devices:
> >>> +  1. IOAT devices must be bound to vfio-pci, rather than igb_uio.
> >>> +  2. DPDK must use ``--iova-mode=va``.
> >>> +
> >>> +* Fallback
> >>> +
> >>> +  When the DMA copy fails, the user who implements the transfer_data
> >>> +  callback can fall back to SW copy, or fall back to PA copy through
> >>> +  rte_mem_virt2iova().
> >>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> >>> index 8f0eba6412..c33fa784ff 100644
> >>> --- a/lib/vhost/vhost_user.c
> >>> +++ b/lib/vhost/vhost_user.c
> >>> @@ -45,6 +45,7 @@
> >>>  #include <rte_common.h>
> >>>  #include <rte_malloc.h>
> >>>  #include <rte_log.h>
> >>> +#include <rte_vfio.h>
> >>>
> >>>  #include "iotlb.h"
> >>>  #include "vhost.h"
> >>> @@ -141,6 +142,34 @@ get_blk_size(int fd)
> >>>  	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
> >>>  }
> >>>
> >>> +static int
> >>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
> >>> +{
> >>> +	int ret = 0;
> >>> +	if (do_map) {
> >>> +		/* Add mapped region into the default container of DPDK. */
> >>> +		ret =
> >> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >>> +						 region->host_user_addr,
> >>> +						 region->host_user_addr,
> >>> +						 region->size);
> >>> +		if (ret) {
> >>> +			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> >>> +			return ret;
> >>> +		}
> >>> +	} else {
> >>> +		/* Remove mapped region from the default container of DPDK.
> >> */
> >>> +		ret =
> >> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >>> +						   region->host_user_addr,
> >>> +						   region->host_user_addr,
> >>> +						   region->size);
> >>> +		if (ret) {
> >>> +			VHOST_LOG_CONFIG(ERR, "DMA engine unmap
> >> failed\n");
> >>> +			return ret;
> >>> +		}
> >>> +	}
> >>> +	return ret;
> >>> +}
> >>> +
> >>>  static void
> >>>  free_mem_region(struct virtio_net *dev)
> >>>  {
> >>> @@ -155,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
> >>>  		if (reg->host_user_addr) {
> >>>  			munmap(reg->mmap_addr, reg->mmap_size);
> >>>  			close(reg->fd);
> >>> +
> >>> +			if (dev->async_copy)
> >>> +				async_dma_map(reg, false);
> >>>  		}
> >>>  	}
> >>>  }
> >>> @@ -866,87 +898,6 @@ vhost_user_set_vring_base(struct virtio_net
> **pdev,
> >>>  	return RTE_VHOST_MSG_RESULT_OK;
> >>>  }
> >>>
> >>> -static int
> >>> -add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
> >>> -		   uint64_t host_phys_addr, uint64_t size)
> >>> -{
> >>> -	struct guest_page *page, *last_page;
> >>> -	struct guest_page *old_pages;
> >>> -
> >>> -	if (dev->nr_guest_pages == dev->max_guest_pages) {
> >>> -		dev->max_guest_pages *= 2;
> >>> -		old_pages = dev->guest_pages;
> >>> -		dev->guest_pages = rte_realloc(dev->guest_pages,
> >>> -					dev->max_guest_pages * sizeof(*page),
> >>> -					RTE_CACHE_LINE_SIZE);
> >>> -		if (dev->guest_pages == NULL) {
> >>> -			VHOST_LOG_CONFIG(ERR, "cannot realloc
> >> guest_pages\n");
> >>> -			rte_free(old_pages);
> >>> -			return -1;
> >>> -		}
> >>> -	}
> >>> -
> >>> -	if (dev->nr_guest_pages > 0) {
> >>> -		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
> >>> -		/* merge if the two pages are continuous */
> >>> -		if (host_phys_addr == last_page->host_phys_addr +
> >>> -				      last_page->size) {
> >>> -			last_page->size += size;
> >>> -			return 0;
> >>> -		}
> >>> -	}
> >>> -
> >>> -	page = &dev->guest_pages[dev->nr_guest_pages++];
> >>> -	page->guest_phys_addr = guest_phys_addr;
> >>> -	page->host_phys_addr  = host_phys_addr;
> >>> -	page->size = size;
> >>> -
> >>> -	return 0;
> >>> -}
> >>> -
> >>> -static int
> >>> -add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region
> *reg,
> >>> -		uint64_t page_size)
> >>> -{
> >>> -	uint64_t reg_size = reg->size;
> >>> -	uint64_t host_user_addr  = reg->host_user_addr;
> >>> -	uint64_t guest_phys_addr = reg->guest_phys_addr;
> >>> -	uint64_t host_phys_addr;
> >>> -	uint64_t size;
> >>> -
> >>> -	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
> >>> -	size = page_size - (guest_phys_addr & (page_size - 1));
> >>> -	size = RTE_MIN(size, reg_size);
> >>> -
> >>> -	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) <
> >> 0)
> >>> -		return -1;
> >>> -
> >>> -	host_user_addr  += size;
> >>> -	guest_phys_addr += size;
> >>> -	reg_size -= size;
> >>> -
> >>> -	while (reg_size > 0) {
> >>> -		size = RTE_MIN(reg_size, page_size);
> >>> -		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
> >>> -						  host_user_addr);
> >>> -		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
> >>> -				size) < 0)
> >>> -			return -1;
> >>> -
> >>> -		host_user_addr  += size;
> >>> -		guest_phys_addr += size;
> >>> -		reg_size -= size;
> >>> -	}
> >>> -
> >>> -	/* sort guest page array if over binary search threshold */
> >>> -	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
> >>> -		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
> >>> -			sizeof(struct guest_page), guest_page_addrcmp);
> >>> -	}
> >>> -
> >>> -	return 0;
> >>> -}
> >>> -
> >>>  #ifdef RTE_LIBRTE_VHOST_DEBUG
> >>>  /* TODO: enable it only in debug mode? */
> >>>  static void
> >>> @@ -1105,6 +1056,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
> >>>  	uint64_t mmap_size;
> >>>  	uint64_t alignment;
> >>>  	int populate;
> >>> +	int ret;
> >>>
> >>>  	/* Check for memory_size + mmap_offset overflow */
> >>>  	if (mmap_offset >= -region->size) {
> >>> @@ -1158,12 +1110,13 @@ vhost_user_mmap_region(struct virtio_net
> *dev,
> >>>  	region->mmap_size = mmap_size;
> >>>  	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
> >> mmap_offset;
> >>>
> >>> -	if (dev->async_copy)
> >>> -		if (add_guest_pages(dev, region, alignment) < 0) {
> >>> -			VHOST_LOG_CONFIG(ERR,
> >>> -					"adding guest pages to region
> >> failed.\n");
> >>> +	if (dev->async_copy) {
> >>> +		ret = async_dma_map(region, true);
> >>> +		if (ret) {
> >>> +			VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA
> >> engine failed\n");
> >>>  			return -1;
> >>
> >> Maybe we're too late in the init already, but I would think we may want
> >> to fall back to the SW implementation instead.
> >>
> >>> -		}
> >>> +			}
> >>> +	}
> >>>
> >>>  	VHOST_LOG_CONFIG(INFO,
> >>>  			"guest memory region size: 0x%" PRIx64 "\n"
> >>> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> >>> index 8da8a86a10..88110d2cb3 100644
> >>> --- a/lib/vhost/virtio_net.c
> >>> +++ b/lib/vhost/virtio_net.c
> >>> @@ -980,11 +980,9 @@ async_mbuf_to_desc(struct virtio_net *dev, struct
> >> vhost_virtqueue *vq,
> >>>  	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
> >>>  	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
> >>>  	int error = 0;
> >>> -	uint64_t mapped_len;
> >>>
> >>>  	uint32_t tlen = 0;
> >>>  	int tvec_idx = 0;
> >>> -	void *hpa;
> >>>
> >>>  	if (unlikely(m == NULL)) {
> >>>  		error = -1;
> >>> @@ -1074,27 +1072,19 @@ async_mbuf_to_desc(struct virtio_net *dev,
> >> struct vhost_virtqueue *vq,
> >>>
> >>>  		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
> >>>
> >>> -		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> >>> -			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> >>> -					buf_iova + buf_offset,
> >>> -					cpy_len, &mapped_len);
> >>> -
> >>> -			if (unlikely(!hpa || mapped_len < cpy_threshold))
> >>> -				break;
> >>> -
> >>> +		if (unlikely(cpy_len >= cpy_threshold)) {
> >>>  			async_fill_vec(src_iovec + tvec_idx,
> >>> -				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> >>> -				mbuf_offset), (size_t)mapped_len);
> >>> +				rte_pktmbuf_mtod_offset(m, void *,
> >> mbuf_offset), (size_t)cpy_len);
> >>>
> >>>  			async_fill_vec(dst_iovec + tvec_idx,
> >>> -					hpa, (size_t)mapped_len);
> >>> -
> >>> -			tlen += (uint32_t)mapped_len;
> >>> -			cpy_len -= (uint32_t)mapped_len;
> >>> -			mbuf_avail  -= (uint32_t)mapped_len;
> >>> -			mbuf_offset += (uint32_t)mapped_len;
> >>> -			buf_avail  -= (uint32_t)mapped_len;
> >>> -			buf_offset += (uint32_t)mapped_len;
> >>> +				(void *)((uintptr_t)(buf_addr + buf_offset)),
> >> (size_t)cpy_len);
> >>> +
> >>> +			tlen += cpy_len;
> >>> +			mbuf_avail  -= cpy_len;
> >>> +			mbuf_offset += cpy_len;
> >>> +			buf_avail  -= cpy_len;
> >>> +			buf_offset += cpy_len;
> >>> +			cpy_len = 0;
> >>>  			tvec_idx++;
> >>>  		}
> >>>
> >>>
> >


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v4 0/2] vhost: add IOMMU support in async data path
  2021-05-31 15:06 [dpdk-dev] [PATCH v1] lib/vhost: enable IOMMU for async vhost xuan.ding
  2021-06-02 14:26 ` [dpdk-dev] [PATCH v2] " xuan.ding
  2021-06-03 17:30 ` [dpdk-dev] [PATCH v3] vhost: " xuan.ding
@ 2021-07-05  8:19 ` Xuan Ding
  2021-07-05  8:19   ` [dpdk-dev] [PATCH v4 1/2] vhost: enable IOMMU for async vhost Xuan Ding
  2021-07-05  8:19   ` [dpdk-dev] [PATCH v4 2/2] example/vhost: add dma vfio parsing Xuan Ding
  2021-07-05  8:40 ` [dpdk-dev] [PATCH v5 0/2] vhost: add IOMMU support in async data path Xuan Ding
  3 siblings, 2 replies; 25+ messages in thread
From: Xuan Ding @ 2021-07-05  8:19 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, wenwux.ma, Xuan Ding

This patch series expands the capability of DMA devices to use IOMMU.
When the application tells the library that the DMA device is bound to vfio,
the IOMMU will be programmed for guest memory.
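
From the application's point of view, the usage is just an extra registration
flag (an illustrative sketch only; the socket path is a placeholder):

#include <rte_vhost.h>

/* Register a vhost-user socket with async copy enabled and IOMMU
 * programming requested for the DMA device. */
static int
register_async_vfio_socket(const char *path)
{
	uint64_t flags = RTE_VHOST_USER_ASYNC_COPY |
			 RTE_VHOST_USER_ASYNC_USE_VFIO;

	return rte_vhost_driver_register(path, flags);
}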

v4:
* Added the compatibility with IOVA_AS_PA.
* Revised the commit log to state that the IOMMU capability is added for the DMA device.
* Changed single patch to patchset, added a new flag parsing in example.

v3:
* Fixed some typos.

v2:
* Fixed a format issue.
* Added the dma unmap logic when device is closed.

Xuan Ding (2):
  vhost: enable IOMMU for async vhost
  example/vhost: add dma vfio parsing

 doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
 doc/guides/sample_app_ug/vhost.rst  |  7 +++++
 examples/vhost/main.c               | 16 +++++++++-
 lib/vhost/rte_vhost.h               |  1 +
 lib/vhost/socket.c                  |  9 ++++++
 lib/vhost/vhost.h                   |  1 +
 lib/vhost/vhost_user.c              | 46 ++++++++++++++++++++++++++++-
 7 files changed, 87 insertions(+), 2 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v4 1/2] vhost: enable IOMMU for async vhost
  2021-07-05  8:19 ` [dpdk-dev] [PATCH v4 0/2] vhost: add IOMMU support in async data path Xuan Ding
@ 2021-07-05  8:19   ` Xuan Ding
  2021-07-05  8:19   ` [dpdk-dev] [PATCH v4 2/2] example/vhost: add dma vfio parsing Xuan Ding
  1 sibling, 0 replies; 25+ messages in thread
From: Xuan Ding @ 2021-07-05  8:19 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, wenwux.ma, Xuan Ding

The use of the IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of the DMA engine to use
the IOMMU if the DMA device is bound to vfio.

When the memory table is set, the guest memory will be mapped
into the default container of DPDK.

Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
 lib/vhost/rte_vhost.h               |  1 +
 lib/vhost/socket.c                  |  9 ++++++
 lib/vhost/vhost.h                   |  1 +
 lib/vhost/vhost_user.c              | 46 ++++++++++++++++++++++++++++-
 5 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index 05c42c9b11..c3beda23d9 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -118,6 +118,15 @@ The following is an overview of some key Vhost API functions:
 
     It is disabled by default.
 
+  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
+
+    In the asynchronous data path, the vhost library is not aware of which
+    driver (igb_uio/vfio) the DMA device is bound to. The application should
+    pass this flag to tell the vhost library whether the IOMMU should be
+    programmed for guest memory.
+
+    It is disabled by default.
+
   - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
 
     Since v16.04, the vhost library forwards checksum and gso requests for
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 8d875e9322..a766ea7b6b 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -37,6 +37,7 @@ extern "C" {
 #define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
 #define RTE_VHOST_USER_ASYNC_COPY	(1ULL << 7)
 #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS	(1ULL << 8)
+#define RTE_VHOST_USER_ASYNC_USE_VFIO	(1ULL << 9)
 
 /* Features. */
 #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
index 5d0d728d52..77c722c86b 100644
--- a/lib/vhost/socket.c
+++ b/lib/vhost/socket.c
@@ -42,6 +42,7 @@ struct vhost_user_socket {
 	bool extbuf;
 	bool linearbuf;
 	bool async_copy;
+	bool async_use_vfio;
 	bool net_compliant_ol_flags;
 
 	/*
@@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 			dev->async_copy = 1;
 	}
 
+	if (vsocket->async_use_vfio) {
+		dev = get_device(vid);
+
+		if (dev)
+			dev->async_use_vfio = 1;
+	}
+
 	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
 
 	if (vsocket->notify_ops->new_connection) {
@@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
 	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
 	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
+	vsocket->async_use_vfio = flags & RTE_VHOST_USER_ASYNC_USE_VFIO;
 	vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
 
 	if (vsocket->async_copy &&
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 8078ddff79..fb775ce4ed 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -370,6 +370,7 @@ struct virtio_net {
 	int16_t			broadcast_rarp;
 	uint32_t		nr_vring;
 	int			async_copy;
+	int			async_use_vfio;
 	int			extbuf;
 	int			linearbuf;
 	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 8f0eba6412..72459e192f 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,7 @@
 #include <rte_common.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
+#include <rte_vfio.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -141,6 +142,36 @@ get_blk_size(int fd)
 	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
 }
 
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
+{
+	int ret = 0;
+	uint64_t host_iova;
+	host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+	if (do_map) {
+		/* Add mapped region into the default container of DPDK. */
+		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						 region->host_user_addr,
+						 host_iova,
+						 region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+			return ret;
+		}
+	} else {
+		/* Remove mapped region from the default container of DPDK. */
+		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						   region->host_user_addr,
+						   host_iova,
+						   region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+			return ret;
+		}
+	}
+	return ret;
+}
+
 static void
 free_mem_region(struct virtio_net *dev)
 {
@@ -155,6 +186,9 @@ free_mem_region(struct virtio_net *dev)
 		if (reg->host_user_addr) {
 			munmap(reg->mmap_addr, reg->mmap_size);
 			close(reg->fd);
+
+			if (dev->async_copy && dev->async_use_vfio)
+				async_dma_map(reg, false);
 		}
 	}
 }
@@ -1105,6 +1139,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	uint64_t mmap_size;
 	uint64_t alignment;
 	int populate;
+	int ret;
 
 	/* Check for memory_size + mmap_offset overflow */
 	if (mmap_offset >= -region->size) {
@@ -1158,13 +1193,22 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	region->mmap_size = mmap_size;
 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
 
-	if (dev->async_copy)
+	if (dev->async_copy) {
 		if (add_guest_pages(dev, region, alignment) < 0) {
 			VHOST_LOG_CONFIG(ERR,
 					"adding guest pages to region failed.\n");
 			return -1;
 		}
 
+		if (dev->async_use_vfio) {
+			ret = async_dma_map(region, true);
+			if (ret) {
+				VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed\n");
+				return -1;
+			}
+		}
+	}
+
 	VHOST_LOG_CONFIG(INFO,
 			"guest memory region size: 0x%" PRIx64 "\n"
 			"\t guest physical addr: 0x%" PRIx64 "\n"
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v4 2/2] example/vhost: add dma vfio parsing
  2021-07-05  8:19 ` [dpdk-dev] [PATCH v4 0/2] vhost: add IOMMU support in async data path Xuan Ding
  2021-07-05  8:19   ` [dpdk-dev] [PATCH v4 1/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-07-05  8:19   ` Xuan Ding
  1 sibling, 0 replies; 25+ messages in thread
From: Xuan Ding @ 2021-07-05  8:19 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, wenwux.ma, Xuan Ding

This patch adds dma-vfio argument parsing for the async vhost driver.
This argument helps to determine whether the IOMMU needs to be
programmed for guest memory.
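
For reference, an illustrative invocation of the sample application with the
new flag (the binary name, EAL options, and the DMA device address are
placeholders; see the sample application guide for the full --dmas syntax):

./dpdk-vhost -l 0-3 -n 4 -- \
	-p 0x1 --socket-file /tmp/vhost-async.sock --client \
	--dma-type ioat --dmas [txd0@0000:00:04.0] \
	--dma-vfio 1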

Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
 doc/guides/sample_app_ug/vhost.rst |  7 +++++++
 examples/vhost/main.c              | 16 +++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 63dcf181e1..c54aebc504 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -176,6 +176,13 @@ operation. The index of the device corresponds to the socket file in order,
 that means vhost device 0 is created through the first socket file, vhost
 device 1 is created through the second socket file, and so on.
 
+**--dma-vfio**
+This parameter is used to specify whether the IOMMU needs to be programmed.
+If the DMA device is bound to vfio, IOMMU DMA mapping will be set up for
+guest memory. If the DMA device is bound to igb_uio, there is no need to do
+IOMMU DMA mapping. It is a supplementary parameter for the async vhost-user
+driver and it is disabled by default.
+
 Common Issues
 -------------
 
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 81d7e4cbd3..80a1f41326 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -89,6 +89,8 @@ static uint32_t enable_tx_csum;
 /* Disable TSO offload */
 static uint32_t enable_tso;
 
+static uint32_t dma_use_vfio = 0;
+
 static int client_mode;
 
 static int builtin_net_driver;
@@ -472,7 +474,8 @@ us_vhost_usage(const char *prgname)
 	"		--tso [0|1] disable/enable TCP segment offload.\n"
 	"		--client register a vhost-user socket as client mode.\n"
 	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
-	"		--dmas register dma channel for specific vhost device.\n",
+	"		--dmas register dma channel for specific vhost device.\n"
+	"		--dma-vfio [0|1]: 0: DMA device uses igb_uio, 1: DMA device uses vfio\n",
 	       prgname);
 }
 
@@ -503,6 +506,8 @@ enum {
 	OPT_DMA_TYPE_NUM,
 #define OPT_DMAS                "dmas"
 	OPT_DMAS_NUM,
+#define OPT_DMA_VFIO            "dma-vfio"
+	OPT_DMA_VFIO_NUM,
 };
 
 /*
@@ -542,6 +547,8 @@ us_vhost_parse_args(int argc, char **argv)
 				NULL, OPT_DMA_TYPE_NUM},
 		{OPT_DMAS, required_argument,
 				NULL, OPT_DMAS_NUM},
+		{OPT_DMA_VFIO, required_argument,
+				NULL, OPT_DMA_VFIO_NUM},
 		{NULL, 0, 0, 0},
 	};
 
@@ -679,6 +686,10 @@ us_vhost_parse_args(int argc, char **argv)
 			}
 			break;
 
+		case OPT_DMA_VFIO_NUM:
+			dma_use_vfio = 1;
+			break;
+
 		case OPT_CLIENT_NUM:
 			client_mode = 1;
 			break;
@@ -1788,6 +1799,9 @@ main(int argc, char *argv[])
 	if (client_mode)
 		flags |= RTE_VHOST_USER_CLIENT;
 
+	if (dma_use_vfio)
+		flags |= RTE_VHOST_USER_ASYNC_USE_VFIO;
+
 	/* Register vhost user driver to handle vhost messages. */
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v5 0/2] vhost: add IOMMU support in async data path
  2021-05-31 15:06 [dpdk-dev] [PATCH v1] lib/vhost: enable IOMMU for async vhost xuan.ding
                   ` (2 preceding siblings ...)
  2021-07-05  8:19 ` [dpdk-dev] [PATCH v4 0/2] vhost: add IOMMU support in async data path Xuan Ding
@ 2021-07-05  8:40 ` Xuan Ding
  2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost Xuan Ding
  2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 2/2] example/vhost: add dma vfio parsing Xuan Ding
  3 siblings, 2 replies; 25+ messages in thread
From: Xuan Ding @ 2021-07-05  8:40 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, wenwux.ma, Xuan Ding

This patch series expands the capability of DMA devices to use IOMMU.
When the application informs the library that the DMA device is bound to vfio,
the IOMMU will be programmed for guest memory.

v5:
* Fixed a coding style issue.

v4:
* Added the compatibility with IOVA_AS_PA.
* Revised the commit log to state that the IOMMU capability is added for the DMA device.
* Changed single patch to patchset, added a new flag parsing in example.

v3:
* Fixed some typos.

v2:
* Fixed a format issue.
* Added the dma unmap logic when device is closed.

Xuan Ding (2):
  vhost: enable IOMMU for async vhost
  example/vhost: add dma vfio parsing

 doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
 doc/guides/sample_app_ug/vhost.rst  |  7 +++++
 examples/vhost/main.c               | 16 +++++++++-
 lib/vhost/rte_vhost.h               |  1 +
 lib/vhost/socket.c                  |  9 ++++++
 lib/vhost/vhost.h                   |  1 +
 lib/vhost/vhost_user.c              | 46 ++++++++++++++++++++++++++++-
 7 files changed, 87 insertions(+), 2 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-05  8:40 ` [dpdk-dev] [PATCH v5 0/2] vhost: add IOMMU support in async data path Xuan Ding
@ 2021-07-05  8:40   ` Xuan Ding
  2021-07-05 12:16     ` Burakov, Anatoly
  2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 2/2] example/vhost: add dma vfio parsing Xuan Ding
  1 sibling, 1 reply; 25+ messages in thread
From: Xuan Ding @ 2021-07-05  8:40 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, wenwux.ma, Xuan Ding

The use of the IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of the DMA engine to use
the IOMMU if the DMA device is bound to vfio.

When the memory table is set, the guest memory will be mapped
into the default container of DPDK.

Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
 doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
 lib/vhost/rte_vhost.h               |  1 +
 lib/vhost/socket.c                  |  9 ++++++
 lib/vhost/vhost.h                   |  1 +
 lib/vhost/vhost_user.c              | 46 ++++++++++++++++++++++++++++-
 5 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index 05c42c9b11..c3beda23d9 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -118,6 +118,15 @@ The following is an overview of some key Vhost API functions:
 
     It is disabled by default.
 
+  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
+
+    In the asynchronous data path, the vhost library is not aware of which
+    driver (igb_uio/vfio) the DMA device is bound to. The application should
+    pass this flag to tell the vhost library whether the IOMMU should be
+    programmed for guest memory.
+
+    It is disabled by default.
+
   - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
 
     Since v16.04, the vhost library forwards checksum and gso requests for
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 8d875e9322..a766ea7b6b 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -37,6 +37,7 @@ extern "C" {
 #define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
 #define RTE_VHOST_USER_ASYNC_COPY	(1ULL << 7)
 #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS	(1ULL << 8)
+#define RTE_VHOST_USER_ASYNC_USE_VFIO	(1ULL << 9)
 
 /* Features. */
 #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
index 5d0d728d52..77c722c86b 100644
--- a/lib/vhost/socket.c
+++ b/lib/vhost/socket.c
@@ -42,6 +42,7 @@ struct vhost_user_socket {
 	bool extbuf;
 	bool linearbuf;
 	bool async_copy;
+	bool async_use_vfio;
 	bool net_compliant_ol_flags;
 
 	/*
@@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 			dev->async_copy = 1;
 	}
 
+	if (vsocket->async_use_vfio) {
+		dev = get_device(vid);
+
+		if (dev)
+			dev->async_use_vfio = 1;
+	}
+
 	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
 
 	if (vsocket->notify_ops->new_connection) {
@@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
 	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
 	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
 	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
+	vsocket->async_use_vfio = flags & RTE_VHOST_USER_ASYNC_USE_VFIO;
 	vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
 
 	if (vsocket->async_copy &&
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 8078ddff79..fb775ce4ed 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -370,6 +370,7 @@ struct virtio_net {
 	int16_t			broadcast_rarp;
 	uint32_t		nr_vring;
 	int			async_copy;
+	int			async_use_vfio;
 	int			extbuf;
 	int			linearbuf;
 	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 8f0eba6412..f3703f2e72 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,7 @@
 #include <rte_common.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
+#include <rte_vfio.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -141,6 +142,36 @@ get_blk_size(int fd)
 	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
 }
 
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
+{
+	int ret = 0;
+	uint64_t host_iova;
+	host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+	if (do_map) {
+		/* Add mapped region into the default container of DPDK. */
+		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						 region->host_user_addr,
+						 host_iova,
+						 region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+			return ret;
+		}
+	} else {
+		/* Remove mapped region from the default container of DPDK. */
+		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+						   region->host_user_addr,
+						   host_iova,
+						   region->size);
+		if (ret) {
+			VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+			return ret;
+		}
+	}
+	return ret;
+}
+
 static void
 free_mem_region(struct virtio_net *dev)
 {
@@ -153,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
 	for (i = 0; i < dev->mem->nregions; i++) {
 		reg = &dev->mem->regions[i];
 		if (reg->host_user_addr) {
+			if (dev->async_copy && dev->async_use_vfio)
+				async_dma_map(reg, false);
+
 			munmap(reg->mmap_addr, reg->mmap_size);
 			close(reg->fd);
 		}
@@ -1105,6 +1139,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	uint64_t mmap_size;
 	uint64_t alignment;
 	int populate;
+	int ret;
 
 	/* Check for memory_size + mmap_offset overflow */
 	if (mmap_offset >= -region->size) {
@@ -1158,13 +1193,22 @@ vhost_user_mmap_region(struct virtio_net *dev,
 	region->mmap_size = mmap_size;
 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
 
-	if (dev->async_copy)
+	if (dev->async_copy) {
 		if (add_guest_pages(dev, region, alignment) < 0) {
 			VHOST_LOG_CONFIG(ERR,
 					"adding guest pages to region failed.\n");
 			return -1;
 		}
 
+		if (dev->async_use_vfio) {
+			ret = async_dma_map(region, true);
+			if (ret) {
+				VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed\n");
+				return -1;
+			}
+		}
+	}
+
 	VHOST_LOG_CONFIG(INFO,
 			"guest memory region size: 0x%" PRIx64 "\n"
 			"\t guest physical addr: 0x%" PRIx64 "\n"
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v5 2/2] example/vhost: add dma vfio parsing
  2021-07-05  8:40 ` [dpdk-dev] [PATCH v5 0/2] vhost: add IOMMU support in async data path Xuan Ding
  2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-07-05  8:40   ` Xuan Ding
  1 sibling, 0 replies; 25+ messages in thread
From: Xuan Ding @ 2021-07-05  8:40 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, wenwux.ma, Xuan Ding

This patch adds dma-vfio argument parsing for the async vhost driver.
This argument helps to determine whether the IOMMU needs to be
programmed for guest memory.

Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
 doc/guides/sample_app_ug/vhost.rst |  7 +++++++
 examples/vhost/main.c              | 16 +++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 63dcf181e1..c54aebc504 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -176,6 +176,13 @@ operation. The index of the device corresponds to the socket file in order,
 that means vhost device 0 is created through the first socket file, vhost
 device 1 is created through the second socket file, and so on.
 
+**--dma-vfio**
+This parameter is used to specify whether the IOMMU needs to be programmed.
+If the DMA device is bound to vfio, IOMMU DMA mapping will be set up for
+guest memory. If the DMA device is bound to igb_uio, there is no need to do
+IOMMU DMA mapping. It is a supplementary parameter for the async vhost-user
+driver and it is disabled by default.
+
 Common Issues
 -------------
 
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 81d7e4cbd3..53bb8cfe80 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -89,6 +89,8 @@ static uint32_t enable_tx_csum;
 /* Disable TSO offload */
 static uint32_t enable_tso;
 
+static uint32_t dma_use_vfio;
+
 static int client_mode;
 
 static int builtin_net_driver;
@@ -472,7 +474,8 @@ us_vhost_usage(const char *prgname)
 	"		--tso [0|1] disable/enable TCP segment offload.\n"
 	"		--client register a vhost-user socket as client mode.\n"
 	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
-	"		--dmas register dma channel for specific vhost device.\n",
+	"		--dmas register dma channel for specific vhost device.\n"
+	"		--dma-vfio [0|1]: 0: DMA device uses igb_uio, 1: DMA device uses vfio\n",
 	       prgname);
 }
 
@@ -503,6 +506,8 @@ enum {
 	OPT_DMA_TYPE_NUM,
 #define OPT_DMAS                "dmas"
 	OPT_DMAS_NUM,
+#define OPT_DMA_VFIO            "dma-vfio"
+	OPT_DMA_VFIO_NUM,
 };
 
 /*
@@ -542,6 +547,8 @@ us_vhost_parse_args(int argc, char **argv)
 				NULL, OPT_DMA_TYPE_NUM},
 		{OPT_DMAS, required_argument,
 				NULL, OPT_DMAS_NUM},
+		{OPT_DMA_VFIO, required_argument,
+				NULL, OPT_DMA_VFIO_NUM},
 		{NULL, 0, 0, 0},
 	};
 
@@ -679,6 +686,10 @@ us_vhost_parse_args(int argc, char **argv)
 			}
 			break;
 
+		case OPT_DMA_VFIO_NUM:
+			dma_use_vfio = 1;
+			break;
+
 		case OPT_CLIENT_NUM:
 			client_mode = 1;
 			break;
@@ -1788,6 +1799,9 @@ main(int argc, char *argv[])
 	if (client_mode)
 		flags |= RTE_VHOST_USER_CLIENT;
 
+	if (dma_use_vfio)
+		flags |= RTE_VHOST_USER_ASYNC_USE_VFIO;
+
 	/* Register vhost user driver to handle vhost messages. */
 	for (i = 0; i < nb_sockets; i++) {
 		char *file = socket_files + i * PATH_MAX;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-07-05 12:16     ` Burakov, Anatoly
  2021-07-05 12:45       ` Maxime Coquelin
  0 siblings, 1 reply; 25+ messages in thread
From: Burakov, Anatoly @ 2021-07-05 12:16 UTC (permalink / raw)
  To: Xuan Ding, maxime.coquelin, chenbo.xia, Thomas Monjalon, David Marchand
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, wenwux.ma

On 05-Jul-21 9:40 AM, Xuan Ding wrote:
> The use of IOMMU has many advantages, such as isolation and address
> translation. This patch extends the capbility of DMA engine to use
> IOMMU if the DMA device is bound to vfio.
> 
> When set memory table, the guest memory will be mapped
> into the default container of DPDK.
> 
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
>   doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
>   lib/vhost/rte_vhost.h               |  1 +
>   lib/vhost/socket.c                  |  9 ++++++
>   lib/vhost/vhost.h                   |  1 +
>   lib/vhost/vhost_user.c              | 46 ++++++++++++++++++++++++++++-
>   5 files changed, 65 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
> index 05c42c9b11..c3beda23d9 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -118,6 +118,15 @@ The following is an overview of some key Vhost API functions:
>   
>       It is disabled by default.
>   
> +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
> +
> +    In asynchronous data path, vhost liarary is not aware of which driver
> +    (igb_uio/vfio) the DMA device is bound to. Application should pass
> +    this flag to tell vhost library whether IOMMU should be programmed
> +    for guest memory.
> +
> +    It is disabled by default.
> +
>     - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
>   
>       Since v16.04, the vhost library forwards checksum and gso requests for
> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> index 8d875e9322..a766ea7b6b 100644
> --- a/lib/vhost/rte_vhost.h
> +++ b/lib/vhost/rte_vhost.h
> @@ -37,6 +37,7 @@ extern "C" {
>   #define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
>   #define RTE_VHOST_USER_ASYNC_COPY	(1ULL << 7)
>   #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS	(1ULL << 8)
> +#define RTE_VHOST_USER_ASYNC_USE_VFIO	(1ULL << 9)
>   
>   /* Features. */
>   #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
> index 5d0d728d52..77c722c86b 100644
> --- a/lib/vhost/socket.c
> +++ b/lib/vhost/socket.c
> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>   	bool extbuf;
>   	bool linearbuf;
>   	bool async_copy;
> +	bool async_use_vfio;
>   	bool net_compliant_ol_flags;
>   
>   	/*
> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
>   			dev->async_copy = 1;
>   	}
>   
> +	if (vsocket->async_use_vfio) {
> +		dev = get_device(vid);
> +
> +		if (dev)
> +			dev->async_use_vfio = 1;
> +	}
> +
>   	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>   
>   	if (vsocket->notify_ops->new_connection) {
> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
>   	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
>   	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
>   	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> +	vsocket->async_use_vfio = flags & RTE_VHOST_USER_ASYNC_USE_VFIO;
>   	vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
>   
>   	if (vsocket->async_copy &&
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index 8078ddff79..fb775ce4ed 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -370,6 +370,7 @@ struct virtio_net {
>   	int16_t			broadcast_rarp;
>   	uint32_t		nr_vring;
>   	int			async_copy;
> +	int			async_use_vfio;
>   	int			extbuf;
>   	int			linearbuf;
>   	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> index 8f0eba6412..f3703f2e72 100644
> --- a/lib/vhost/vhost_user.c
> +++ b/lib/vhost/vhost_user.c
> @@ -45,6 +45,7 @@
>   #include <rte_common.h>
>   #include <rte_malloc.h>
>   #include <rte_log.h>
> +#include <rte_vfio.h>
>   
>   #include "iotlb.h"
>   #include "vhost.h"
> @@ -141,6 +142,36 @@ get_blk_size(int fd)
>   	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
>   }
>   
> +static int
> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
> +{
> +	int ret = 0;
> +	uint64_t host_iova;
> +	host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
> +	if (do_map) {
> +		/* Add mapped region into the default container of DPDK. */
> +		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> +						 region->host_user_addr,
> +						 host_iova,
> +						 region->size);
> +		if (ret) {
> +			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> +			return ret;
> +		}
> +	} else {
> +		/* Remove mapped region from the default container of DPDK. */
> +		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> +						   region->host_user_addr,
> +						   host_iova,
> +						   region->size);
> +		if (ret) {
> +			VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
> +			return ret;
> +		}
> +	}
> +	return ret;
> +}

We've been discussing this off list with Xuan, and unfortunately this is 
a blocker for now.

Currently, the x86 IOMMU does not support partial unmap - the segments 
have to be unmapped with exactly the same addr/len as they were mapped. 
We also concatenate adjacent mappings to prevent filling up the DMA 
mapping entry table with superfluous entries.

This means that, when two unrelated mappings are contiguous in memory 
(e.g. if you map regions 1 and 2 independently, but they happen to be 
sitting right next to each other in virtual memory), we cannot later 
unmap one of them because, even though these are two separate mappings 
as far as kernel VFIO infrastructure is concerned, the mapping gets 
compacted and looks like one single mapping to VFIO, so DPDK API will 
not let us unmap region 1 without also unmapping region 2.

The proper fix for this problem would be to always map memory 
page-by-page regardless of where it comes from (we already do that for 
internal memory, but not for external). However, the reason this works 
for internal memory is that when mapping internal memory segments, 
*we know the page size*. For external memory segments, there is no such 
guarantee, so we cannot deduce the page size for a given memory segment, 
and thus can't map things page-by-page.

So, the proper fix for it would be to add page size to the VFIO DMA API. 
Unfortunately, it probably has to wait until 21.11 because it is an API 
change.

The slightly hacky fix for this would be to forego user mem map 
concatenation and trust that the user is not going to do anything stupid, 
and will not spam the VFIO DMA API without reason. I would rather not go 
down this road, but this could be an option in this case.

Thoughts?
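
To make the failure mode concrete, here is a minimal sketch (editor's
illustration, not code from the series; va, iova, len and ret are
illustrative values) assuming two regions that end up virtually
contiguous in the default VFIO container:

	/* Map two independent regions that happen to be adjacent in VA space;
	 * with user mem map concatenation DPDK tracks them as one entry. */
	rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
			va, iova, len);			/* region 1 */
	rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
			va + len, iova + len, len);	/* region 2 */

	/* On reconnect only region 1 should go away, but the unmap is
	 * rejected because it would be a partial unmap of the merged entry. */
	ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
			va, iova, len);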

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-05 12:16     ` Burakov, Anatoly
@ 2021-07-05 12:45       ` Maxime Coquelin
  2021-07-06  8:31         ` Ding, Xuan
  0 siblings, 1 reply; 25+ messages in thread
From: Maxime Coquelin @ 2021-07-05 12:45 UTC (permalink / raw)
  To: Burakov, Anatoly, Xuan Ding, chenbo.xia, Thomas Monjalon, David Marchand
  Cc: dev, jiayu.hu, sunil.pai.g, bruce.richardson, harry.van.haaren,
	yong.liu, wenwux.ma



On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
> On 05-Jul-21 9:40 AM, Xuan Ding wrote:
>> The use of IOMMU has many advantages, such as isolation and address
>> translation. This patch extends the capbility of DMA engine to use
>> IOMMU if the DMA device is bound to vfio.
>>
>> When set memory table, the guest memory will be mapped
>> into the default container of DPDK.
>>
>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>> ---
>>   doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
>>   lib/vhost/rte_vhost.h               |  1 +
>>   lib/vhost/socket.c                  |  9 ++++++
>>   lib/vhost/vhost.h                   |  1 +
>>   lib/vhost/vhost_user.c              | 46 ++++++++++++++++++++++++++++-
>>   5 files changed, 65 insertions(+), 1 deletion(-)
>>
>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
>> b/doc/guides/prog_guide/vhost_lib.rst
>> index 05c42c9b11..c3beda23d9 100644
>> --- a/doc/guides/prog_guide/vhost_lib.rst
>> +++ b/doc/guides/prog_guide/vhost_lib.rst
>> @@ -118,6 +118,15 @@ The following is an overview of some key Vhost
>> API functions:
>>         It is disabled by default.
>>   +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
>> +
>> +    In asynchronous data path, vhost liarary is not aware of which
>> driver
>> +    (igb_uio/vfio) the DMA device is bound to. Application should pass
>> +    this flag to tell vhost library whether IOMMU should be programmed
>> +    for guest memory.
>> +
>> +    It is disabled by default.
>> +
>>     - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
>>         Since v16.04, the vhost library forwards checksum and gso
>> requests for
>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>> index 8d875e9322..a766ea7b6b 100644
>> --- a/lib/vhost/rte_vhost.h
>> +++ b/lib/vhost/rte_vhost.h
>> @@ -37,6 +37,7 @@ extern "C" {
>>   #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
>>   #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
>>   #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL << 8)
>> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
>>     /* Features. */
>>   #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>> index 5d0d728d52..77c722c86b 100644
>> --- a/lib/vhost/socket.c
>> +++ b/lib/vhost/socket.c
>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>>       bool extbuf;
>>       bool linearbuf;
>>       bool async_copy;
>> +    bool async_use_vfio;
>>       bool net_compliant_ol_flags;
>>         /*
>> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
>> vhost_user_socket *vsocket)
>>               dev->async_copy = 1;
>>       }
>>   +    if (vsocket->async_use_vfio) {
>> +        dev = get_device(vid);
>> +
>> +        if (dev)
>> +            dev->async_use_vfio = 1;
>> +    }
>> +
>>       VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>>         if (vsocket->notify_ops->new_connection) {
>> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
>> uint64_t flags)
>>       vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
>>       vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
>>       vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
>> +    vsocket->async_use_vfio = flags & RTE_VHOST_USER_ASYNC_USE_VFIO;
>>       vsocket->net_compliant_ol_flags = flags &
>> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
>>         if (vsocket->async_copy &&
>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>> index 8078ddff79..fb775ce4ed 100644
>> --- a/lib/vhost/vhost.h
>> +++ b/lib/vhost/vhost.h
>> @@ -370,6 +370,7 @@ struct virtio_net {
>>       int16_t            broadcast_rarp;
>>       uint32_t        nr_vring;
>>       int            async_copy;
>> +    int            async_use_vfio;
>>       int            extbuf;
>>       int            linearbuf;
>>       struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
>> index 8f0eba6412..f3703f2e72 100644
>> --- a/lib/vhost/vhost_user.c
>> +++ b/lib/vhost/vhost_user.c
>> @@ -45,6 +45,7 @@
>>   #include <rte_common.h>
>>   #include <rte_malloc.h>
>>   #include <rte_log.h>
>> +#include <rte_vfio.h>
>>     #include "iotlb.h"
>>   #include "vhost.h"
>> @@ -141,6 +142,36 @@ get_blk_size(int fd)
>>       return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
>>   }
>>   +static int
>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
>> +{
>> +    int ret = 0;
>> +    uint64_t host_iova;
>> +    host_iova = rte_mem_virt2iova((void
>> *)(uintptr_t)region->host_user_addr);
>> +    if (do_map) {
>> +        /* Add mapped region into the default container of DPDK. */
>> +        ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
>> +                         region->host_user_addr,
>> +                         host_iova,
>> +                         region->size);
>> +        if (ret) {
>> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
>> +            return ret;
>> +        }
>> +    } else {
>> +        /* Remove mapped region from the default container of DPDK. */
>> +        ret =
>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
>> +                           region->host_user_addr,
>> +                           host_iova,
>> +                           region->size);
>> +        if (ret) {
>> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
>> +            return ret;
>> +        }
>> +    }
>> +    return ret;
>> +}
> 
> We've been discussing this off list with Xuan, and unfortunately this is
> a blocker for now.
> 
> Currently, the x86 IOMMU does not support partial unmap - the segments
> have to be unmapped exactly the same addr/len as they were mapped. We
> also concatenate adjacent mappings to prevent filling up the DMA mapping
> entry table with superfluous entries.
> 
> This means that, when two unrelated mappings are contiguous in memory
> (e.g. if you map regions 1 and 2 independently, but they happen to be
> sitting right next to each other in virtual memory), we cannot later
> unmap one of them because, even though these are two separate mappings
> as far as kernel VFIO infrastructure is concerned, the mapping gets
> compacted and looks like one single mapping to VFIO, so DPDK API will
> not let us unmap region 1 without also unmapping region 2.
> 
> The proper fix for this problem would be to always map memory
> page-by-page regardless of where it comes from (we already do that for
> internal memory, but not for external). However, the reason this works
> for internal memory is because when mapping internal memory segments,
> *we know the page size*. For external memory segments, there is no such
> guarantee, so we cannot deduce page size for a given memory segment, and
> thus can't map things page-by-page.
> 
> So, the proper fix for it would be to add page size to the VFIO DMA API.
> Unfortunately, it probably has to wait until 21.11 because it is an API
> change.
> 
> The slightly hacky fix for this would be to forego user mem map
> concatenation and trust that user is not going to do anything stupid,
> and will not spam the VFIO DMA API without reason. I would rather not go
> down this road, but this could be an option in this case.
> 
> Thoughts?
> 

Thanks Anatoly for the detailed description of the issue.
It may be possible to either create a versioned symbol for this API
change, or maybe even to have a temporary internal API.

But I think this series in its current form is not acceptable, so
waiting for v21.11 would be the best option (we may want to send the
deprecation notice in this release though).

In this series, I don't like that the user application has to pass a flag to
state whether the DMA engine uses VFIO or not. AFAICT, this new revision
does not implement what was discussed in the previous one, i.e.
supporting both IOVA_AS_VA and IOVA_AS_PA.

Regards,
Maxime


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-05 12:45       ` Maxime Coquelin
@ 2021-07-06  8:31         ` Ding, Xuan
  2021-07-06  9:16           ` Burakov, Anatoly
  0 siblings, 1 reply; 25+ messages in thread
From: Ding, Xuan @ 2021-07-06  8:31 UTC (permalink / raw)
  To: Maxime Coquelin, Burakov, Anatoly, Xia, Chenbo, Thomas Monjalon,
	David Marchand
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Ma, WenwuX

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Monday, July 5, 2021 8:46 PM
> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
> Monjalon <thomas@monjalon.net>; David Marchand
> <david.marchand@redhat.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
> 
> 
> 
> On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
> > On 05-Jul-21 9:40 AM, Xuan Ding wrote:
> >> The use of IOMMU has many advantages, such as isolation and address
> >> translation. This patch extends the capbility of DMA engine to use
> >> IOMMU if the DMA device is bound to vfio.
> >>
> >> When set memory table, the guest memory will be mapped
> >> into the default container of DPDK.
> >>
> >> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> >> ---
> >>   doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
> >>   lib/vhost/rte_vhost.h               |  1 +
> >>   lib/vhost/socket.c                  |  9 ++++++
> >>   lib/vhost/vhost.h                   |  1 +
> >>   lib/vhost/vhost_user.c              | 46 ++++++++++++++++++++++++++++-
> >>   5 files changed, 65 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/doc/guides/prog_guide/vhost_lib.rst
> >> b/doc/guides/prog_guide/vhost_lib.rst
> >> index 05c42c9b11..c3beda23d9 100644
> >> --- a/doc/guides/prog_guide/vhost_lib.rst
> >> +++ b/doc/guides/prog_guide/vhost_lib.rst
> >> @@ -118,6 +118,15 @@ The following is an overview of some key Vhost
> >> API functions:
> >>         It is disabled by default.
> >>   +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
> >> +
> >> +    In asynchronous data path, vhost liarary is not aware of which
> >> driver
> >> +    (igb_uio/vfio) the DMA device is bound to. Application should pass
> >> +    this flag to tell vhost library whether IOMMU should be programmed
> >> +    for guest memory.
> >> +
> >> +    It is disabled by default.
> >> +
> >>     - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
> >>         Since v16.04, the vhost library forwards checksum and gso
> >> requests for
> >> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> >> index 8d875e9322..a766ea7b6b 100644
> >> --- a/lib/vhost/rte_vhost.h
> >> +++ b/lib/vhost/rte_vhost.h
> >> @@ -37,6 +37,7 @@ extern "C" {
> >>   #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
> >>   #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
> >>   #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL << 8)
> >> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
> >>     /* Features. */
> >>   #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
> >> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
> >> index 5d0d728d52..77c722c86b 100644
> >> --- a/lib/vhost/socket.c
> >> +++ b/lib/vhost/socket.c
> >> @@ -42,6 +42,7 @@ struct vhost_user_socket {
> >>       bool extbuf;
> >>       bool linearbuf;
> >>       bool async_copy;
> >> +    bool async_use_vfio;
> >>       bool net_compliant_ol_flags;
> >>         /*
> >> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
> >> vhost_user_socket *vsocket)
> >>               dev->async_copy = 1;
> >>       }
> >>   +    if (vsocket->async_use_vfio) {
> >> +        dev = get_device(vid);
> >> +
> >> +        if (dev)
> >> +            dev->async_use_vfio = 1;
> >> +    }
> >> +
> >>       VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
> >>         if (vsocket->notify_ops->new_connection) {
> >> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
> >> uint64_t flags)
> >>       vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> >>       vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
> >>       vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> >> +    vsocket->async_use_vfio = flags &
> RTE_VHOST_USER_ASYNC_USE_VFIO;
> >>       vsocket->net_compliant_ol_flags = flags &
> >> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
> >>         if (vsocket->async_copy &&
> >> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> >> index 8078ddff79..fb775ce4ed 100644
> >> --- a/lib/vhost/vhost.h
> >> +++ b/lib/vhost/vhost.h
> >> @@ -370,6 +370,7 @@ struct virtio_net {
> >>       int16_t            broadcast_rarp;
> >>       uint32_t        nr_vring;
> >>       int            async_copy;
> >> +    int            async_use_vfio;
> >>       int            extbuf;
> >>       int            linearbuf;
> >>       struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> >> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> >> index 8f0eba6412..f3703f2e72 100644
> >> --- a/lib/vhost/vhost_user.c
> >> +++ b/lib/vhost/vhost_user.c
> >> @@ -45,6 +45,7 @@
> >>   #include <rte_common.h>
> >>   #include <rte_malloc.h>
> >>   #include <rte_log.h>
> >> +#include <rte_vfio.h>
> >>     #include "iotlb.h"
> >>   #include "vhost.h"
> >> @@ -141,6 +142,36 @@ get_blk_size(int fd)
> >>       return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
> >>   }
> >>   +static int
> >> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
> >> +{
> >> +    int ret = 0;
> >> +    uint64_t host_iova;
> >> +    host_iova = rte_mem_virt2iova((void
> >> *)(uintptr_t)region->host_user_addr);
> >> +    if (do_map) {
> >> +        /* Add mapped region into the default container of DPDK. */
> >> +        ret =
> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >> +                         region->host_user_addr,
> >> +                         host_iova,
> >> +                         region->size);
> >> +        if (ret) {
> >> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> >> +            return ret;
> >> +        }
> >> +    } else {
> >> +        /* Remove mapped region from the default container of DPDK. */
> >> +        ret =
> >> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >> +                           region->host_user_addr,
> >> +                           host_iova,
> >> +                           region->size);
> >> +        if (ret) {
> >> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
> >> +            return ret;
> >> +        }
> >> +    }
> >> +    return ret;
> >> +}
> >
> > We've been discussing this off list with Xuan, and unfortunately this is
> > a blocker for now.
> >
> > Currently, the x86 IOMMU does not support partial unmap - the segments
> > have to be unmapped exactly the same addr/len as they were mapped. We
> > also concatenate adjacent mappings to prevent filling up the DMA mapping
> > entry table with superfluous entries.
> >
> > This means that, when two unrelated mappings are contiguous in memory
> > (e.g. if you map regions 1 and 2 independently, but they happen to be
> > sitting right next to each other in virtual memory), we cannot later
> > unmap one of them because, even though these are two separate
> mappings
> > as far as kernel VFIO infrastructure is concerned, the mapping gets
> > compacted and looks like one single mapping to VFIO, so DPDK API will
> > not let us unmap region 1 without also unmapping region 2.
> >
> > The proper fix for this problem would be to always map memory
> > page-by-page regardless of where it comes from (we already do that for
> > internal memory, but not for external). However, the reason this works
> > for internal memory is because when mapping internal memory segments,
> > *we know the page size*. For external memory segments, there is no such
> > guarantee, so we cannot deduce page size for a given memory segment,
> and
> > thus can't map things page-by-page.
> >
> > So, the proper fix for it would be to add page size to the VFIO DMA API.
> > Unfortunately, it probably has to wait until 21.11 because it is an API
> > change.
> >
> > The slightly hacky fix for this would be to forego user mem map
> > concatenation and trust that user is not going to do anything stupid,
> > and will not spam the VFIO DMA API without reason. I would rather not go
> > down this road, but this could be an option in this case.
> >
> > Thoughts?
> >
> 
> Thanks Anatoly for the detailed description of the issue.
> It may be possible to either create a versioned symbol for this API
> change, or maybe even to have a temporary internal API.
> 
> But I think this series in its current form is not acceptable, so
> waiting for v21.11 would be the best option (we may want to send the
> deprecation notice in this release though).
> 
> In this series, I don't like the user application has to pass a flag to
> state whether the DMA engine uses VFIO or not. AFAICT, this new revision
> does not implement what was discussed in the previous one, i.e.
> supporting both IOVA_AS_VA and IOVA_AS_PA.

Thanks for your comments. Here I hope to clarify a few points:
1. Are both IOVA_AS_VA and IOVA_AS_PA supported now?
A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this version, the
virtual address is replaced with the IOVA of the mapped region, and that
IOVA is used to program the IOMMU instead of the virtual address only.

2. Why is a flag chosen to be passed by the application?
A: Yes, as we discussed before, the rte_eal_iova_mode() API can be used to
get the IOVA mode and so determine whether the IOMMU should be programmed.
However, during implementation I found a problem: how to distinguish
VFIO PA from IGB_UIO PA. For VFIO cases, we should always program the
IOMMU, while for IGB_UIO cases it depends on the IOMMU capability of the
platform.

So a flag was selected, but this requires the application to do extra work.
I found another solution, which is to use:

#ifdef VFIO_PRESENT
	if (rte_vfio_is_enabled("vfio"))
		program_iommu;
#endif

Because all the devices are managed by DPDK, we can let DPDK make the
decision. Does this make sense to you, or do you have any suggestions?
(A fuller sketch of this idea follows after point 3 below.)

3. The partial unmap issue
A: Thanks Anatoly for the detailed explanation. This problem was found in
reconnection cases. After our off-list discussion, the solution requires an
rte_vfio_container_dma_map/unmap API change. Here I want to ask whether
there is any hope for a versioned symbol or a temporary internal API to
be used in this release.
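
A minimal sketch of the idea in point 2 above (editor's illustration, not
code from this series; async_dma_map() is the helper added in patch 1/2,
and rte_vfio_is_enabled() is assumed to simply return 0 when VFIO support
is not compiled in):

	/* Decide inside the vhost library, so the application passes no flag. */
	static bool
	async_iommu_needed(void)
	{
		return rte_vfio_is_enabled("vfio") != 0;
	}

	/* ... in the SET_MEM_TABLE handling path ... */
	if (dev->async_copy && async_iommu_needed()) {
		if (async_dma_map(region, true) != 0)
			return -1;
	}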

Thanks for your time!

Regards,
Xuan

> 
> Regards,
> Maxime


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-06  8:31         ` Ding, Xuan
@ 2021-07-06  9:16           ` Burakov, Anatoly
  2021-07-06  9:32             ` Maxime Coquelin
  0 siblings, 1 reply; 25+ messages in thread
From: Burakov, Anatoly @ 2021-07-06  9:16 UTC (permalink / raw)
  To: Ding, Xuan, Maxime Coquelin, Xia, Chenbo, Thomas Monjalon,
	David Marchand
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Ma, WenwuX

On 06-Jul-21 9:31 AM, Ding, Xuan wrote:
> Hi Maxime,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Monday, July 5, 2021 8:46 PM
>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
>> Monjalon <thomas@monjalon.net>; David Marchand
>> <david.marchand@redhat.com>
>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
>> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
>> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
>> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
>>
>>
>>
>> On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
>>> On 05-Jul-21 9:40 AM, Xuan Ding wrote:
>>>> The use of IOMMU has many advantages, such as isolation and address
>>>> translation. This patch extends the capbility of DMA engine to use
>>>> IOMMU if the DMA device is bound to vfio.
>>>>
>>>> When set memory table, the guest memory will be mapped
>>>> into the default container of DPDK.
>>>>
>>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>>>> ---
>>>>    doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
>>>>    lib/vhost/rte_vhost.h               |  1 +
>>>>    lib/vhost/socket.c                  |  9 ++++++
>>>>    lib/vhost/vhost.h                   |  1 +
>>>>    lib/vhost/vhost_user.c              | 46 ++++++++++++++++++++++++++++-
>>>>    5 files changed, 65 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
>>>> b/doc/guides/prog_guide/vhost_lib.rst
>>>> index 05c42c9b11..c3beda23d9 100644
>>>> --- a/doc/guides/prog_guide/vhost_lib.rst
>>>> +++ b/doc/guides/prog_guide/vhost_lib.rst
>>>> @@ -118,6 +118,15 @@ The following is an overview of some key Vhost
>>>> API functions:
>>>>          It is disabled by default.
>>>>    +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
>>>> +
>>>> +    In asynchronous data path, vhost liarary is not aware of which
>>>> driver
>>>> +    (igb_uio/vfio) the DMA device is bound to. Application should pass
>>>> +    this flag to tell vhost library whether IOMMU should be programmed
>>>> +    for guest memory.
>>>> +
>>>> +    It is disabled by default.
>>>> +
>>>>      - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
>>>>          Since v16.04, the vhost library forwards checksum and gso
>>>> requests for
>>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>>>> index 8d875e9322..a766ea7b6b 100644
>>>> --- a/lib/vhost/rte_vhost.h
>>>> +++ b/lib/vhost/rte_vhost.h
>>>> @@ -37,6 +37,7 @@ extern "C" {
>>>>    #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
>>>>    #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
>>>>    #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL << 8)
>>>> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
>>>>      /* Features. */
>>>>    #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
>>>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>>>> index 5d0d728d52..77c722c86b 100644
>>>> --- a/lib/vhost/socket.c
>>>> +++ b/lib/vhost/socket.c
>>>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>>>>        bool extbuf;
>>>>        bool linearbuf;
>>>>        bool async_copy;
>>>> +    bool async_use_vfio;
>>>>        bool net_compliant_ol_flags;
>>>>          /*
>>>> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
>>>> vhost_user_socket *vsocket)
>>>>                dev->async_copy = 1;
>>>>        }
>>>>    +    if (vsocket->async_use_vfio) {
>>>> +        dev = get_device(vid);
>>>> +
>>>> +        if (dev)
>>>> +            dev->async_use_vfio = 1;
>>>> +    }
>>>> +
>>>>        VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>>>>          if (vsocket->notify_ops->new_connection) {
>>>> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
>>>> uint64_t flags)
>>>>        vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
>>>>        vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
>>>>        vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
>>>> +    vsocket->async_use_vfio = flags &
>> RTE_VHOST_USER_ASYNC_USE_VFIO;
>>>>        vsocket->net_compliant_ol_flags = flags &
>>>> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
>>>>          if (vsocket->async_copy &&
>>>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>>>> index 8078ddff79..fb775ce4ed 100644
>>>> --- a/lib/vhost/vhost.h
>>>> +++ b/lib/vhost/vhost.h
>>>> @@ -370,6 +370,7 @@ struct virtio_net {
>>>>        int16_t            broadcast_rarp;
>>>>        uint32_t        nr_vring;
>>>>        int            async_copy;
>>>> +    int            async_use_vfio;
>>>>        int            extbuf;
>>>>        int            linearbuf;
>>>>        struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
>>>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
>>>> index 8f0eba6412..f3703f2e72 100644
>>>> --- a/lib/vhost/vhost_user.c
>>>> +++ b/lib/vhost/vhost_user.c
>>>> @@ -45,6 +45,7 @@
>>>>    #include <rte_common.h>
>>>>    #include <rte_malloc.h>
>>>>    #include <rte_log.h>
>>>> +#include <rte_vfio.h>
>>>>      #include "iotlb.h"
>>>>    #include "vhost.h"
>>>> @@ -141,6 +142,36 @@ get_blk_size(int fd)
>>>>        return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
>>>>    }
>>>>    +static int
>>>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
>>>> +{
>>>> +    int ret = 0;
>>>> +    uint64_t host_iova;
>>>> +    host_iova = rte_mem_virt2iova((void
>>>> *)(uintptr_t)region->host_user_addr);
>>>> +    if (do_map) {
>>>> +        /* Add mapped region into the default container of DPDK. */
>>>> +        ret =
>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>>> +                         region->host_user_addr,
>>>> +                         host_iova,
>>>> +                         region->size);
>>>> +        if (ret) {
>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
>>>> +            return ret;
>>>> +        }
>>>> +    } else {
>>>> +        /* Remove mapped region from the default container of DPDK. */
>>>> +        ret =
>>>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>>> +                           region->host_user_addr,
>>>> +                           host_iova,
>>>> +                           region->size);
>>>> +        if (ret) {
>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
>>>> +            return ret;
>>>> +        }
>>>> +    }
>>>> +    return ret;
>>>> +}
>>>
>>> We've been discussing this off list with Xuan, and unfortunately this is
>>> a blocker for now.
>>>
>>> Currently, the x86 IOMMU does not support partial unmap - the segments
>>> have to be unmapped exactly the same addr/len as they were mapped. We
>>> also concatenate adjacent mappings to prevent filling up the DMA mapping
>>> entry table with superfluous entries.
>>>
>>> This means that, when two unrelated mappings are contiguous in memory
>>> (e.g. if you map regions 1 and 2 independently, but they happen to be
>>> sitting right next to each other in virtual memory), we cannot later
>>> unmap one of them because, even though these are two separate
>> mappings
>>> as far as kernel VFIO infrastructure is concerned, the mapping gets
>>> compacted and looks like one single mapping to VFIO, so DPDK API will
>>> not let us unmap region 1 without also unmapping region 2.
>>>
>>> The proper fix for this problem would be to always map memory
>>> page-by-page regardless of where it comes from (we already do that for
>>> internal memory, but not for external). However, the reason this works
>>> for internal memory is because when mapping internal memory segments,
>>> *we know the page size*. For external memory segments, there is no such
>>> guarantee, so we cannot deduce page size for a given memory segment,
>> and
>>> thus can't map things page-by-page.
>>>
>>> So, the proper fix for it would be to add page size to the VFIO DMA API.
>>> Unfortunately, it probably has to wait until 21.11 because it is an API
>>> change.
>>>
>>> The slightly hacky fix for this would be to forego user mem map
>>> concatenation and trust that user is not going to do anything stupid,
>>> and will not spam the VFIO DMA API without reason. I would rather not go
>>> down this road, but this could be an option in this case.
>>>
>>> Thoughts?
>>>
>>
>> Thanks Anatoly for the detailed description of the issue.
>> It may be possible to either create a versioned symbol for this API
>> change, or maybe even to have a temporary internal API.
>>
>> But I think this series in its current form is not acceptable, so
>> waiting for v21.11 would be the best option (we may want to send the
>> deprecation notice in this release though).
>>
>> In this series, I don't like the user application has to pass a flag to
>> state whether the DMA engine uses VFIO or not. AFAICT, this new revision
>> does not implement what was discussed in the previous one, i.e.
>> supporting both IOVA_AS_VA and IOVA_AS_PA.
> 
> Thanks for your comments. Here I hope to explain some questions:
> 1. Whether both IOVA_AS_VA and IOVA_AS_PA are supported now?
> A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this version, the
> virtual address is replaced with iova address of mapped region, and the iova
> address is selected to program the IOMMU instead of virtual address only.
> 
> 2. Why a flag is chosen to be passed by application?
> A: Yes, as we discussed before, the rte_eal_iova_mode() API can be used to
> get the IOVA mode, so as to determine whether IOMMU should be programmed.
> However, in the implementation process, I found a problem. That is how to
> distinguish the VFIO PA and IGB_UIO PA. Because for VFIO cases, we should
> always program the IOMMU. While in IGB_UIO cases, it depends on IOMMU
> capability of platform.

How does one program the IOMMU with igb_uio? I was under the impression 
that igb_uio (and uio_pci_generic for that matter) does not provide such 
facilities.

> 
> So a flag is selected, but this requires the application to do extra things.
> I find another solution, is to use
> #ifdef VFIO_PRESENT
>          If(rte_vfio_is_enabled("vfio"))
>                  program_iommu;
> #endif
> 
> Because all the devices are managed by DPDK, we can follow DPDK to do the
> decision. Does this make sense for you, or any some suggestions?

IMO the #ifdef is not needed. The API will always work; it's just that 
if VFIO is not compiled in, it compiles down to no-ops.

> 
> 3.  The partial unmap issue
> A: Thanks Anatoly for the detailed explanation. This problem was found in
> reconnection cases. After our off list discussion, the solution requires
> rte_vfio_container_dma_map/unmap API change. Here I want to consult
> if there are some hope for versioned symbol or a temporary internal API
> be used in this release.

I don't think we can add a versioned symbol in this release unless 
there's an exception to the rc1 feature freeze. I also don't like the idea 
of a temporary internal API, because vhost is not in EAL, it's a library 
- meaning the "internal" API has to in fact be an external API, added to 
the .map file etc., otherwise it won't work with shared library builds.

That said, I'm not an expert on versioning, so maybe there are other 
ways I'm not aware of, or I have some misconceptions about how it works :)

> 
> Thanks for your time!
> 
> Regards,
> Xuan
> 
>>
>> Regards,
>> Maxime
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-06  9:16           ` Burakov, Anatoly
@ 2021-07-06  9:32             ` Maxime Coquelin
  2021-07-07  6:25               ` Ding, Xuan
  0 siblings, 1 reply; 25+ messages in thread
From: Maxime Coquelin @ 2021-07-06  9:32 UTC (permalink / raw)
  To: Burakov, Anatoly, Ding, Xuan, Xia, Chenbo, Thomas Monjalon,
	David Marchand
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Ma, WenwuX



On 7/6/21 11:16 AM, Burakov, Anatoly wrote:
> On 06-Jul-21 9:31 AM, Ding, Xuan wrote:
>> Hi Maxime,
>>
>>> -----Original Message-----
>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>> Sent: Monday, July 5, 2021 8:46 PM
>>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
>>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
>>> Monjalon <thomas@monjalon.net>; David Marchand
>>> <david.marchand@redhat.com>
>>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
>>> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
>>> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
>>> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
>>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async
>>> vhost
>>>
>>>
>>>
>>> On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
>>>> On 05-Jul-21 9:40 AM, Xuan Ding wrote:
>>>>> The use of IOMMU has many advantages, such as isolation and address
>>>>> translation. This patch extends the capbility of DMA engine to use
>>>>> IOMMU if the DMA device is bound to vfio.
>>>>>
>>>>> When set memory table, the guest memory will be mapped
>>>>> into the default container of DPDK.
>>>>>
>>>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>>>>> ---
>>>>>    doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
>>>>>    lib/vhost/rte_vhost.h               |  1 +
>>>>>    lib/vhost/socket.c                  |  9 ++++++
>>>>>    lib/vhost/vhost.h                   |  1 +
>>>>>    lib/vhost/vhost_user.c              | 46
>>>>> ++++++++++++++++++++++++++++-
>>>>>    5 files changed, 65 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
>>>>> b/doc/guides/prog_guide/vhost_lib.rst
>>>>> index 05c42c9b11..c3beda23d9 100644
>>>>> --- a/doc/guides/prog_guide/vhost_lib.rst
>>>>> +++ b/doc/guides/prog_guide/vhost_lib.rst
>>>>> @@ -118,6 +118,15 @@ The following is an overview of some key Vhost
>>>>> API functions:
>>>>>          It is disabled by default.
>>>>>    +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
>>>>> +
>>>>> +    In asynchronous data path, vhost liarary is not aware of which
>>>>> driver
>>>>> +    (igb_uio/vfio) the DMA device is bound to. Application should
>>>>> pass
>>>>> +    this flag to tell vhost library whether IOMMU should be
>>>>> programmed
>>>>> +    for guest memory.
>>>>> +
>>>>> +    It is disabled by default.
>>>>> +
>>>>>      - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
>>>>>          Since v16.04, the vhost library forwards checksum and gso
>>>>> requests for
>>>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>>>>> index 8d875e9322..a766ea7b6b 100644
>>>>> --- a/lib/vhost/rte_vhost.h
>>>>> +++ b/lib/vhost/rte_vhost.h
>>>>> @@ -37,6 +37,7 @@ extern "C" {
>>>>>    #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
>>>>>    #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
>>>>>    #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL << 8)
>>>>> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
>>>>>      /* Features. */
>>>>>    #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
>>>>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>>>>> index 5d0d728d52..77c722c86b 100644
>>>>> --- a/lib/vhost/socket.c
>>>>> +++ b/lib/vhost/socket.c
>>>>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>>>>>        bool extbuf;
>>>>>        bool linearbuf;
>>>>>        bool async_copy;
>>>>> +    bool async_use_vfio;
>>>>>        bool net_compliant_ol_flags;
>>>>>          /*
>>>>> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
>>>>> vhost_user_socket *vsocket)
>>>>>                dev->async_copy = 1;
>>>>>        }
>>>>>    +    if (vsocket->async_use_vfio) {
>>>>> +        dev = get_device(vid);
>>>>> +
>>>>> +        if (dev)
>>>>> +            dev->async_use_vfio = 1;
>>>>> +    }
>>>>> +
>>>>>        VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>>>>>          if (vsocket->notify_ops->new_connection) {
>>>>> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
>>>>> uint64_t flags)
>>>>>        vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
>>>>>        vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
>>>>>        vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
>>>>> +    vsocket->async_use_vfio = flags &
>>> RTE_VHOST_USER_ASYNC_USE_VFIO;
>>>>>        vsocket->net_compliant_ol_flags = flags &
>>>>> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
>>>>>          if (vsocket->async_copy &&
>>>>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>>>>> index 8078ddff79..fb775ce4ed 100644
>>>>> --- a/lib/vhost/vhost.h
>>>>> +++ b/lib/vhost/vhost.h
>>>>> @@ -370,6 +370,7 @@ struct virtio_net {
>>>>>        int16_t            broadcast_rarp;
>>>>>        uint32_t        nr_vring;
>>>>>        int            async_copy;
>>>>> +    int            async_use_vfio;
>>>>>        int            extbuf;
>>>>>        int            linearbuf;
>>>>>        struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS *
>>>>> 2];
>>>>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
>>>>> index 8f0eba6412..f3703f2e72 100644
>>>>> --- a/lib/vhost/vhost_user.c
>>>>> +++ b/lib/vhost/vhost_user.c
>>>>> @@ -45,6 +45,7 @@
>>>>>    #include <rte_common.h>
>>>>>    #include <rte_malloc.h>
>>>>>    #include <rte_log.h>
>>>>> +#include <rte_vfio.h>
>>>>>      #include "iotlb.h"
>>>>>    #include "vhost.h"
>>>>> @@ -141,6 +142,36 @@ get_blk_size(int fd)
>>>>>        return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
>>>>>    }
>>>>>    +static int
>>>>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
>>>>> +{
>>>>> +    int ret = 0;
>>>>> +    uint64_t host_iova;
>>>>> +    host_iova = rte_mem_virt2iova((void
>>>>> *)(uintptr_t)region->host_user_addr);
>>>>> +    if (do_map) {
>>>>> +        /* Add mapped region into the default container of DPDK. */
>>>>> +        ret =
>>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>>>> +                         region->host_user_addr,
>>>>> +                         host_iova,
>>>>> +                         region->size);
>>>>> +        if (ret) {
>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
>>>>> +            return ret;
>>>>> +        }
>>>>> +    } else {
>>>>> +        /* Remove mapped region from the default container of
>>>>> DPDK. */
>>>>> +        ret =
>>>>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>>>> +                           region->host_user_addr,
>>>>> +                           host_iova,
>>>>> +                           region->size);
>>>>> +        if (ret) {
>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
>>>>> +            return ret;
>>>>> +        }
>>>>> +    }
>>>>> +    return ret;
>>>>> +}
>>>>
>>>> We've been discussing this off list with Xuan, and unfortunately
>>>> this is
>>>> a blocker for now.
>>>>
>>>> Currently, the x86 IOMMU does not support partial unmap - the segments
>>>> have to be unmapped exactly the same addr/len as they were mapped. We
>>>> also concatenate adjacent mappings to prevent filling up the DMA
>>>> mapping
>>>> entry table with superfluous entries.
>>>>
>>>> This means that, when two unrelated mappings are contiguous in memory
>>>> (e.g. if you map regions 1 and 2 independently, but they happen to be
>>>> sitting right next to each other in virtual memory), we cannot later
>>>> unmap one of them because, even though these are two separate
>>> mappings
>>>> as far as kernel VFIO infrastructure is concerned, the mapping gets
>>>> compacted and looks like one single mapping to VFIO, so DPDK API will
>>>> not let us unmap region 1 without also unmapping region 2.
>>>>
>>>> The proper fix for this problem would be to always map memory
>>>> page-by-page regardless of where it comes from (we already do that for
>>>> internal memory, but not for external). However, the reason this works
>>>> for internal memory is because when mapping internal memory segments,
>>>> *we know the page size*. For external memory segments, there is no such
>>>> guarantee, so we cannot deduce page size for a given memory segment,
>>> and
>>>> thus can't map things page-by-page.
>>>>
>>>> So, the proper fix for it would be to add page size to the VFIO DMA
>>>> API.
>>>> Unfortunately, it probably has to wait until 21.11 because it is an API
>>>> change.
>>>>
>>>> The slightly hacky fix for this would be to forego user mem map
>>>> concatenation and trust that user is not going to do anything stupid,
>>>> and will not spam the VFIO DMA API without reason. I would rather
>>>> not go
>>>> down this road, but this could be an option in this case.
>>>>
>>>> Thoughts?
>>>>
>>>
>>> Thanks Anatoly for the detailed description of the issue.
>>> It may be possible to either create a versioned symbol for this API
>>> change, or maybe even to have a temporary internal API.
>>>
>>> But I think this series in its current form is not acceptable, so
>>> waiting for v21.11 would be the best option (we may want to send the
>>> deprecation notice in this release though).
>>>
>>> In this series, I don't like the user application has to pass a flag to
>>> state whether the DMA engine uses VFIO or not. AFAICT, this new revision
>>> does not implement what was discussed in the previous one, i.e.
>>> supporting both IOVA_AS_VA and IOVA_AS_PA.
>>
>> Thanks for your comments. Here I hope to explain some questions:
>> 1. Whether both IOVA_AS_VA and IOVA_AS_PA are supported now?
>> A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this version, the
>> virtual address is replaced with iova address of mapped region, and
>> the iova
>> address is selected to program the IOMMU instead of virtual address only.

Good!

>>
>> 2. Why a flag is chosen to be passed by application?
>> A: Yes, as we discussed before, the rte_eal_iova_mode() API can be
>> used to
>> get the IOVA mode, so as to determine whether IOMMU should be programmed.
>> However, in the implementation process, I found a problem. That is how to
>> distinguish the VFIO PA and IGB_UIO PA. Because for VFIO cases, we should
>> always program the IOMMU. While in IGB_UIO cases, it depends on IOMMU
>> capability of platform.
> 
> How does one program IOMMU with igb_uio? I was under impression that
> igb_uio (and uio_pci_generic for that matter) does not provide such
> facilities.

+1

>>
>> So a flag is selected, but this requires the application to do extra
>> things.
>> I find another solution, is to use
>> #ifdef VFIO_PRESENT
>>          If(rte_vfio_is_enabled("vfio"))
>>                  program_iommu;
>> #endif
>>
>> Because all the devices are managed by DPDK, we can follow DPDK to do the
>> decision. Does this make sense for you, or any some suggestions?
> 
> IMO the #ifdef is not needed. The API will always work, it's just that
> if VFIO is not compiled, it'll just compile down to noops.

Agreed, the #ifdef is not necessary.

To be clear, the rte_vfio_is_enabled() check is going to be done in the
Vhost library, making this transparent to the application?

>>
>> 3.  The partial unmap issue
>> A: Thanks Anatoly for the detailed explanation. This problem was found in
>> reconnection cases. After our off list discussion, the solution requires
>> rte_vfio_container_dma_map/unmap API change. Here I want to consult
>> if there are some hope for versioned symbol or a temporary internal API
>> be used in this release.
> 
> I don't think we can add a versioned symbol in this release unless
> there's an exception to rc1 feature freeze. I also don't like the idea
> of a temporary internal API because vhost is not in EAL, it's a library
> - meaning, the "internal" API has to in fact be external API, added to
> the .map file etc., otherwise it won't work with shared library builds.
> 
> That said, i'm not an expert on versioning, so maybe there are other
> ways i'm not aware of, or i have some misconceptions about how it works :)

OK, it may indeed be better to wait for v21.11; it is too late for
this release.

Thanks,
Maxime

>>
>> Thanks for your time!
>>
>> Regards,
>> Xuan
>>
>>>
>>> Regards,
>>> Maxime
>>
> 
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-06  9:32             ` Maxime Coquelin
@ 2021-07-07  6:25               ` Ding, Xuan
  2021-07-07 12:17                 ` Burakov, Anatoly
  0 siblings, 1 reply; 25+ messages in thread
From: Ding, Xuan @ 2021-07-07  6:25 UTC (permalink / raw)
  To: Maxime Coquelin, Burakov, Anatoly, Xia, Chenbo, Thomas Monjalon,
	David Marchand
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Ma, WenwuX

Hi,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, July 6, 2021 5:32 PM
> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
> Monjalon <thomas@monjalon.net>; David Marchand
> <david.marchand@redhat.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Van
> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>;
> Ma, WenwuX <wenwux.ma@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
> 
> 
> 
> On 7/6/21 11:16 AM, Burakov, Anatoly wrote:
> > On 06-Jul-21 9:31 AM, Ding, Xuan wrote:
> >> Hi Maxime,
> >>
> >>> -----Original Message-----
> >>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>> Sent: Monday, July 5, 2021 8:46 PM
> >>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> >>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
> >>> Monjalon <thomas@monjalon.net>; David Marchand
> >>> <david.marchand@redhat.com>
> >>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> >>> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
> >>> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> >>> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
> >>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async
> >>> vhost
> >>>
> >>>
> >>>
> >>> On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
> >>>> On 05-Jul-21 9:40 AM, Xuan Ding wrote:
> >>>>> The use of IOMMU has many advantages, such as isolation and address
> >>>>> translation. This patch extends the capbility of DMA engine to use
> >>>>> IOMMU if the DMA device is bound to vfio.
> >>>>>
> >>>>> When set memory table, the guest memory will be mapped
> >>>>> into the default container of DPDK.
> >>>>>
> >>>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> >>>>> ---
> >>>>>    doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
> >>>>>    lib/vhost/rte_vhost.h               |  1 +
> >>>>>    lib/vhost/socket.c                  |  9 ++++++
> >>>>>    lib/vhost/vhost.h                   |  1 +
> >>>>>    lib/vhost/vhost_user.c              | 46
> >>>>> ++++++++++++++++++++++++++++-
> >>>>>    5 files changed, 65 insertions(+), 1 deletion(-)
> >>>>>
> >>>>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
> >>>>> b/doc/guides/prog_guide/vhost_lib.rst
> >>>>> index 05c42c9b11..c3beda23d9 100644
> >>>>> --- a/doc/guides/prog_guide/vhost_lib.rst
> >>>>> +++ b/doc/guides/prog_guide/vhost_lib.rst
> >>>>> @@ -118,6 +118,15 @@ The following is an overview of some key Vhost
> >>>>> API functions:
> >>>>>          It is disabled by default.
> >>>>>    +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
> >>>>> +
> >>>>> +    In asynchronous data path, vhost liarary is not aware of which
> >>>>> driver
> >>>>> +    (igb_uio/vfio) the DMA device is bound to. Application should
> >>>>> pass
> >>>>> +    this flag to tell vhost library whether IOMMU should be
> >>>>> programmed
> >>>>> +    for guest memory.
> >>>>> +
> >>>>> +    It is disabled by default.
> >>>>> +
> >>>>>      - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
> >>>>>          Since v16.04, the vhost library forwards checksum and gso
> >>>>> requests for
> >>>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> >>>>> index 8d875e9322..a766ea7b6b 100644
> >>>>> --- a/lib/vhost/rte_vhost.h
> >>>>> +++ b/lib/vhost/rte_vhost.h
> >>>>> @@ -37,6 +37,7 @@ extern "C" {
> >>>>>    #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
> >>>>>    #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
> >>>>>    #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL << 8)
> >>>>> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
> >>>>>      /* Features. */
> >>>>>    #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
> >>>>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
> >>>>> index 5d0d728d52..77c722c86b 100644
> >>>>> --- a/lib/vhost/socket.c
> >>>>> +++ b/lib/vhost/socket.c
> >>>>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
> >>>>>        bool extbuf;
> >>>>>        bool linearbuf;
> >>>>>        bool async_copy;
> >>>>> +    bool async_use_vfio;
> >>>>>        bool net_compliant_ol_flags;
> >>>>>          /*
> >>>>> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
> >>>>> vhost_user_socket *vsocket)
> >>>>>                dev->async_copy = 1;
> >>>>>        }
> >>>>>    +    if (vsocket->async_use_vfio) {
> >>>>> +        dev = get_device(vid);
> >>>>> +
> >>>>> +        if (dev)
> >>>>> +            dev->async_use_vfio = 1;
> >>>>> +    }
> >>>>> +
> >>>>>        VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
> >>>>>          if (vsocket->notify_ops->new_connection) {
> >>>>> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
> >>>>> uint64_t flags)
> >>>>>        vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> >>>>>        vsocket->linearbuf = flags &
> RTE_VHOST_USER_LINEARBUF_SUPPORT;
> >>>>>        vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> >>>>> +    vsocket->async_use_vfio = flags &
> >>> RTE_VHOST_USER_ASYNC_USE_VFIO;
> >>>>>        vsocket->net_compliant_ol_flags = flags &
> >>>>> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
> >>>>>          if (vsocket->async_copy &&
> >>>>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> >>>>> index 8078ddff79..fb775ce4ed 100644
> >>>>> --- a/lib/vhost/vhost.h
> >>>>> +++ b/lib/vhost/vhost.h
> >>>>> @@ -370,6 +370,7 @@ struct virtio_net {
> >>>>>        int16_t            broadcast_rarp;
> >>>>>        uint32_t        nr_vring;
> >>>>>        int            async_copy;
> >>>>> +    int            async_use_vfio;
> >>>>>        int            extbuf;
> >>>>>        int            linearbuf;
> >>>>>        struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS *
> >>>>> 2];
> >>>>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> >>>>> index 8f0eba6412..f3703f2e72 100644
> >>>>> --- a/lib/vhost/vhost_user.c
> >>>>> +++ b/lib/vhost/vhost_user.c
> >>>>> @@ -45,6 +45,7 @@
> >>>>>    #include <rte_common.h>
> >>>>>    #include <rte_malloc.h>
> >>>>>    #include <rte_log.h>
> >>>>> +#include <rte_vfio.h>
> >>>>>      #include "iotlb.h"
> >>>>>    #include "vhost.h"
> >>>>> @@ -141,6 +142,36 @@ get_blk_size(int fd)
> >>>>>        return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
> >>>>>    }
> >>>>>    +static int
> >>>>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
> >>>>> +{
> >>>>> +    int ret = 0;
> >>>>> +    uint64_t host_iova;
> >>>>> +    host_iova = rte_mem_virt2iova((void
> >>>>> *)(uintptr_t)region->host_user_addr);
> >>>>> +    if (do_map) {
> >>>>> +        /* Add mapped region into the default container of DPDK. */
> >>>>> +        ret =
> >>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >>>>> +                         region->host_user_addr,
> >>>>> +                         host_iova,
> >>>>> +                         region->size);
> >>>>> +        if (ret) {
> >>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> >>>>> +            return ret;
> >>>>> +        }
> >>>>> +    } else {
> >>>>> +        /* Remove mapped region from the default container of
> >>>>> DPDK. */
> >>>>> +        ret =
> >>>>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >>>>> +                           region->host_user_addr,
> >>>>> +                           host_iova,
> >>>>> +                           region->size);
> >>>>> +        if (ret) {
> >>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
> >>>>> +            return ret;
> >>>>> +        }
> >>>>> +    }
> >>>>> +    return ret;
> >>>>> +}
> >>>>
> >>>> We've been discussing this off list with Xuan, and unfortunately
> >>>> this is
> >>>> a blocker for now.
> >>>>
> >>>> Currently, the x86 IOMMU does not support partial unmap - the segments
> >>>> have to be unmapped exactly the same addr/len as they were mapped. We
> >>>> also concatenate adjacent mappings to prevent filling up the DMA
> >>>> mapping
> >>>> entry table with superfluous entries.
> >>>>
> >>>> This means that, when two unrelated mappings are contiguous in memory
> >>>> (e.g. if you map regions 1 and 2 independently, but they happen to be
> >>>> sitting right next to each other in virtual memory), we cannot later
> >>>> unmap one of them because, even though these are two separate
> >>> mappings
> >>>> as far as kernel VFIO infrastructure is concerned, the mapping gets
> >>>> compacted and looks like one single mapping to VFIO, so DPDK API will
> >>>> not let us unmap region 1 without also unmapping region 2.
> >>>>
> >>>> The proper fix for this problem would be to always map memory
> >>>> page-by-page regardless of where it comes from (we already do that for
> >>>> internal memory, but not for external). However, the reason this works
> >>>> for internal memory is because when mapping internal memory segments,
> >>>> *we know the page size*. For external memory segments, there is no such
> >>>> guarantee, so we cannot deduce page size for a given memory segment,
> >>> and
> >>>> thus can't map things page-by-page.
> >>>>
> >>>> So, the proper fix for it would be to add page size to the VFIO DMA
> >>>> API.
> >>>> Unfortunately, it probably has to wait until 21.11 because it is an API
> >>>> change.
> >>>>
> >>>> The slightly hacky fix for this would be to forego user mem map
> >>>> concatenation and trust that user is not going to do anything stupid,
> >>>> and will not spam the VFIO DMA API without reason. I would rather
> >>>> not go
> >>>> down this road, but this could be an option in this case.
> >>>>
> >>>> Thoughts?
> >>>>
> >>>
> >>> Thanks Anatoly for the detailed description of the issue.
> >>> It may be possible to either create a versioned symbol for this API
> >>> change, or maybe even to have a temporary internal API.
> >>>
> >>> But I think this series in its current form is not acceptable, so
> >>> waiting for v21.11 would be the best option (we may want to send the
> >>> deprecation notice in this release though).
> >>>
> >>> In this series, I don't like the user application has to pass a flag to
> >>> state whether the DMA engine uses VFIO or not. AFAICT, this new revision
> >>> does not implement what was discussed in the previous one, i.e.
> >>> supporting both IOVA_AS_VA and IOVA_AS_PA.
> >>
> >> Thanks for your comments. Here I hope to explain some questions:
> >> 1. Whether both IOVA_AS_VA and IOVA_AS_PA are supported now?
> >> A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this version, the
> >> virtual address is replaced with iova address of mapped region, and
> >> the iova
> >> address is selected to program the IOMMU instead of virtual address only.
> 
> Good!
> 
> >>
> >> 2. Why a flag is chosen to be passed by application?
> >> A: Yes, as we discussed before, the rte_eal_iova_mode() API can be
> >> used to
> >> get the IOVA mode, so as to determine whether IOMMU should be
> programmed.
> >> However, in the implementation process, I found a problem. That is how to
> >> distinguish the VFIO PA and IGB_UIO PA. Because for VFIO cases, we should
> >> always program the IOMMU. While in IGB_UIO cases, it depends on IOMMU
> >> capability of platform.
> >
> > How does one program IOMMU with igb_uio? I was under impression that
> > igb_uio (and uio_pci_generic for that matter) does not provide such
> > facilities.
> 
> +1

Maybe there is some misunderstanding in this sentence.
In our design, the IOMMU will be programmed if rte_eal_vfio_is_enabled("vfio")
returns true, i.e. if the vfio module has been modprobed.

But there is an exception: even if the vfio module is modprobed, the DPDK
user may still bind all the devices to igb_uio.

This situation can be distinguished during DPDK EAL initialization, because
the resource mapping is done according to the driver loaded by each device
(rte_pci_map_device).

In our scenario, however, this judgment is somewhat weak, because we cannot
get the device driver info in the vhost library, and I also think it is
unreasonable for vhost to do so. We can only trust that users will not use it
like this. Any thoughts on this scenario?

> 
> >>
> >> So a flag is selected, but this requires the application to do extra
> >> things.
> >> I find another solution, is to use
> >> #ifdef VFIO_PRESENT
> >>          If(rte_vfio_is_enabled("vfio"))
> >>                  program_iommu;
> >> #endif
> >>
> >> Because all the devices are managed by DPDK, we can follow DPDK to do the
> >> decision. Does this make sense for you, or any some suggestions?
> >
> > IMO the #ifdef is not needed. The API will always work, it's just that
> > if VFIO is not compiled, it'll just compile down to noops.
> 
> Agree the #ifdef is not necessary.

Thanks, I will remove the #ifdef in the next version and only use
rte_vfio_is_enabled("vfio").
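
For illustration, a minimal sketch of how the helper quoted above could look
without the #ifdef (naming and placement are assumptions, not the final patch;
this relies on the <rte_vfio.h> include already added in vhost_user.c):

static int
async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
{
	uint64_t host_iova;

	/* When VFIO is not available or not in use, there is nothing to
	 * program, so simply skip the mapping.
	 */
	if (!rte_vfio_is_enabled("vfio"))
		return 0;

	host_iova = rte_mem_virt2iova(
			(void *)(uintptr_t)region->host_user_addr);
	if (do_map)
		return rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
				region->host_user_addr, host_iova,
				region->size);

	return rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
			region->host_user_addr, host_iova, region->size);
}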

> 
> To be clear, rte_vfio_is_enabled() check is going to be done in the
> Vhost library, making this transparent to the application?

Yes, you are right :)
The check will be done in the vhost library, so the application does not need
to do anything additional.
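
For example, the application side would then look the same as today, with no
new VFIO-related flag (just an illustration; the socket path is a placeholder):

	/* The vhost library decides internally whether to program the IOMMU
	 * for the async data path; the application only enables async copy.
	 */
	ret = rte_vhost_driver_register("/tmp/vhost-async.sock",
			RTE_VHOST_USER_ASYNC_COPY);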

> 
> >>
> >> 3.  The partial unmap issue
> >> A: Thanks Anatoly for the detailed explanation. This problem was found in
> >> reconnection cases. After our off list discussion, the solution requires
> >> rte_vfio_container_dma_map/unmap API change. Here I want to consult
> >> if there are some hope for versioned symbol or a temporary internal API
> >> be used in this release.
> >
> > I don't think we can add a versioned symbol in this release unless
> > there's an exception to rc1 feature freeze. I also don't like the idea
> > of a temporary internal API because vhost is not in EAL, it's a library
> > - meaning, the "internal" API has to in fact be external API, added to
> > the .map file etc., otherwise it won't work with shared library builds.

Got it, thanks for your suggestion.

> >
> > That said, i'm not an expert on versioning, so maybe there are other
> > ways i'm not aware of, or i have some misconceptions about how it works :)
> 
> Ok, it maybe indeed be better to wait for v21.11, it is too late for
> this release.

Agreed, so I will send a new version for v21.11.

Thanks,
Xuan

> 
> Thanks,
> Maxime
> 
> >>
> >> Thanks for your time!
> >>
> >> Regards,
> >> Xuan
> >>
> >>>
> >>> Regards,
> >>> Maxime
> >>
> >
> >


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-07  6:25               ` Ding, Xuan
@ 2021-07-07 12:17                 ` Burakov, Anatoly
  2021-07-07 12:54                   ` Ding, Xuan
  0 siblings, 1 reply; 25+ messages in thread
From: Burakov, Anatoly @ 2021-07-07 12:17 UTC (permalink / raw)
  To: Ding, Xuan, Maxime Coquelin, Xia, Chenbo, Thomas Monjalon,
	David Marchand
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Ma, WenwuX

On 07-Jul-21 7:25 AM, Ding, Xuan wrote:
> Hi,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Tuesday, July 6, 2021 5:32 PM
>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
>> Monjalon <thomas@monjalon.net>; David Marchand
>> <david.marchand@redhat.com>
>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
>> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Van
>> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>;
>> Ma, WenwuX <wenwux.ma@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
>>
>>
>>
>> On 7/6/21 11:16 AM, Burakov, Anatoly wrote:
>>> On 06-Jul-21 9:31 AM, Ding, Xuan wrote:
>>>> Hi Maxime,
>>>>
>>>>> -----Original Message-----
>>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>>> Sent: Monday, July 5, 2021 8:46 PM
>>>>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
>>>>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
>>>>> Monjalon <thomas@monjalon.net>; David Marchand
>>>>> <david.marchand@redhat.com>
>>>>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
>>>>> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
>>>>> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
>>>>> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
>>>>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async
>>>>> vhost
>>>>>
>>>>>
>>>>>
>>>>> On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
>>>>>> On 05-Jul-21 9:40 AM, Xuan Ding wrote:
>>>>>>> The use of IOMMU has many advantages, such as isolation and address
>>>>>>> translation. This patch extends the capbility of DMA engine to use
>>>>>>> IOMMU if the DMA device is bound to vfio.
>>>>>>>
>>>>>>> When set memory table, the guest memory will be mapped
>>>>>>> into the default container of DPDK.
>>>>>>>
>>>>>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>>>>>>> ---
>>>>>>>     doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
>>>>>>>     lib/vhost/rte_vhost.h               |  1 +
>>>>>>>     lib/vhost/socket.c                  |  9 ++++++
>>>>>>>     lib/vhost/vhost.h                   |  1 +
>>>>>>>     lib/vhost/vhost_user.c              | 46
>>>>>>> ++++++++++++++++++++++++++++-
>>>>>>>     5 files changed, 65 insertions(+), 1 deletion(-)
>>>>>>>
>>>>>>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
>>>>>>> b/doc/guides/prog_guide/vhost_lib.rst
>>>>>>> index 05c42c9b11..c3beda23d9 100644
>>>>>>> --- a/doc/guides/prog_guide/vhost_lib.rst
>>>>>>> +++ b/doc/guides/prog_guide/vhost_lib.rst
>>>>>>> @@ -118,6 +118,15 @@ The following is an overview of some key Vhost
>>>>>>> API functions:
>>>>>>>           It is disabled by default.
>>>>>>>     +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
>>>>>>> +
>>>>>>> +    In asynchronous data path, vhost liarary is not aware of which
>>>>>>> driver
>>>>>>> +    (igb_uio/vfio) the DMA device is bound to. Application should
>>>>>>> pass
>>>>>>> +    this flag to tell vhost library whether IOMMU should be
>>>>>>> programmed
>>>>>>> +    for guest memory.
>>>>>>> +
>>>>>>> +    It is disabled by default.
>>>>>>> +
>>>>>>>       - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
>>>>>>>           Since v16.04, the vhost library forwards checksum and gso
>>>>>>> requests for
>>>>>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>>>>>>> index 8d875e9322..a766ea7b6b 100644
>>>>>>> --- a/lib/vhost/rte_vhost.h
>>>>>>> +++ b/lib/vhost/rte_vhost.h
>>>>>>> @@ -37,6 +37,7 @@ extern "C" {
>>>>>>>     #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
>>>>>>>     #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
>>>>>>>     #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL << 8)
>>>>>>> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
>>>>>>>       /* Features. */
>>>>>>>     #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
>>>>>>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>>>>>>> index 5d0d728d52..77c722c86b 100644
>>>>>>> --- a/lib/vhost/socket.c
>>>>>>> +++ b/lib/vhost/socket.c
>>>>>>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>>>>>>>         bool extbuf;
>>>>>>>         bool linearbuf;
>>>>>>>         bool async_copy;
>>>>>>> +    bool async_use_vfio;
>>>>>>>         bool net_compliant_ol_flags;
>>>>>>>           /*
>>>>>>> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
>>>>>>> vhost_user_socket *vsocket)
>>>>>>>                 dev->async_copy = 1;
>>>>>>>         }
>>>>>>>     +    if (vsocket->async_use_vfio) {
>>>>>>> +        dev = get_device(vid);
>>>>>>> +
>>>>>>> +        if (dev)
>>>>>>> +            dev->async_use_vfio = 1;
>>>>>>> +    }
>>>>>>> +
>>>>>>>         VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>>>>>>>           if (vsocket->notify_ops->new_connection) {
>>>>>>> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
>>>>>>> uint64_t flags)
>>>>>>>         vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
>>>>>>>         vsocket->linearbuf = flags &
>> RTE_VHOST_USER_LINEARBUF_SUPPORT;
>>>>>>>         vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
>>>>>>> +    vsocket->async_use_vfio = flags &
>>>>> RTE_VHOST_USER_ASYNC_USE_VFIO;
>>>>>>>         vsocket->net_compliant_ol_flags = flags &
>>>>>>> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
>>>>>>>           if (vsocket->async_copy &&
>>>>>>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>>>>>>> index 8078ddff79..fb775ce4ed 100644
>>>>>>> --- a/lib/vhost/vhost.h
>>>>>>> +++ b/lib/vhost/vhost.h
>>>>>>> @@ -370,6 +370,7 @@ struct virtio_net {
>>>>>>>         int16_t            broadcast_rarp;
>>>>>>>         uint32_t        nr_vring;
>>>>>>>         int            async_copy;
>>>>>>> +    int            async_use_vfio;
>>>>>>>         int            extbuf;
>>>>>>>         int            linearbuf;
>>>>>>>         struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS *
>>>>>>> 2];
>>>>>>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
>>>>>>> index 8f0eba6412..f3703f2e72 100644
>>>>>>> --- a/lib/vhost/vhost_user.c
>>>>>>> +++ b/lib/vhost/vhost_user.c
>>>>>>> @@ -45,6 +45,7 @@
>>>>>>>     #include <rte_common.h>
>>>>>>>     #include <rte_malloc.h>
>>>>>>>     #include <rte_log.h>
>>>>>>> +#include <rte_vfio.h>
>>>>>>>       #include "iotlb.h"
>>>>>>>     #include "vhost.h"
>>>>>>> @@ -141,6 +142,36 @@ get_blk_size(int fd)
>>>>>>>         return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
>>>>>>>     }
>>>>>>>     +static int
>>>>>>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
>>>>>>> +{
>>>>>>> +    int ret = 0;
>>>>>>> +    uint64_t host_iova;
>>>>>>> +    host_iova = rte_mem_virt2iova((void
>>>>>>> *)(uintptr_t)region->host_user_addr);
>>>>>>> +    if (do_map) {
>>>>>>> +        /* Add mapped region into the default container of DPDK. */
>>>>>>> +        ret =
>>>>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>>>>>> +                         region->host_user_addr,
>>>>>>> +                         host_iova,
>>>>>>> +                         region->size);
>>>>>>> +        if (ret) {
>>>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
>>>>>>> +            return ret;
>>>>>>> +        }
>>>>>>> +    } else {
>>>>>>> +        /* Remove mapped region from the default container of
>>>>>>> DPDK. */
>>>>>>> +        ret =
>>>>>>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>>>>>> +                           region->host_user_addr,
>>>>>>> +                           host_iova,
>>>>>>> +                           region->size);
>>>>>>> +        if (ret) {
>>>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
>>>>>>> +            return ret;
>>>>>>> +        }
>>>>>>> +    }
>>>>>>> +    return ret;
>>>>>>> +}
>>>>>>
>>>>>> We've been discussing this off list with Xuan, and unfortunately
>>>>>> this is
>>>>>> a blocker for now.
>>>>>>
>>>>>> Currently, the x86 IOMMU does not support partial unmap - the segments
>>>>>> have to be unmapped exactly the same addr/len as they were mapped. We
>>>>>> also concatenate adjacent mappings to prevent filling up the DMA
>>>>>> mapping
>>>>>> entry table with superfluous entries.
>>>>>>
>>>>>> This means that, when two unrelated mappings are contiguous in memory
>>>>>> (e.g. if you map regions 1 and 2 independently, but they happen to be
>>>>>> sitting right next to each other in virtual memory), we cannot later
>>>>>> unmap one of them because, even though these are two separate
>>>>> mappings
>>>>>> as far as kernel VFIO infrastructure is concerned, the mapping gets
>>>>>> compacted and looks like one single mapping to VFIO, so DPDK API will
>>>>>> not let us unmap region 1 without also unmapping region 2.
>>>>>>
>>>>>> The proper fix for this problem would be to always map memory
>>>>>> page-by-page regardless of where it comes from (we already do that for
>>>>>> internal memory, but not for external). However, the reason this works
>>>>>> for internal memory is because when mapping internal memory segments,
>>>>>> *we know the page size*. For external memory segments, there is no such
>>>>>> guarantee, so we cannot deduce page size for a given memory segment,
>>>>> and
>>>>>> thus can't map things page-by-page.
>>>>>>
>>>>>> So, the proper fix for it would be to add page size to the VFIO DMA
>>>>>> API.
>>>>>> Unfortunately, it probably has to wait until 21.11 because it is an API
>>>>>> change.
>>>>>>
>>>>>> The slightly hacky fix for this would be to forego user mem map
>>>>>> concatenation and trust that user is not going to do anything stupid,
>>>>>> and will not spam the VFIO DMA API without reason. I would rather
>>>>>> not go
>>>>>> down this road, but this could be an option in this case.
>>>>>>
>>>>>> Thoughts?
>>>>>>
>>>>>
>>>>> Thanks Anatoly for the detailed description of the issue.
>>>>> It may be possible to either create a versioned symbol for this API
>>>>> change, or maybe even to have a temporary internal API.
>>>>>
>>>>> But I think this series in its current form is not acceptable, so
>>>>> waiting for v21.11 would be the best option (we may want to send the
>>>>> deprecation notice in this release though).
>>>>>
>>>>> In this series, I don't like the user application has to pass a flag to
>>>>> state whether the DMA engine uses VFIO or not. AFAICT, this new revision
>>>>> does not implement what was discussed in the previous one, i.e.
>>>>> supporting both IOVA_AS_VA and IOVA_AS_PA.
>>>>
>>>> Thanks for your comments. Here I hope to explain some questions:
>>>> 1. Whether both IOVA_AS_VA and IOVA_AS_PA are supported now?
>>>> A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this version, the
>>>> virtual address is replaced with iova address of mapped region, and
>>>> the iova
>>>> address is selected to program the IOMMU instead of virtual address only.
>>
>> Good!
>>
>>>>
>>>> 2. Why a flag is chosen to be passed by application?
>>>> A: Yes, as we discussed before, the rte_eal_iova_mode() API can be
>>>> used to
>>>> get the IOVA mode, so as to determine whether IOMMU should be
>> programmed.
>>>> However, in the implementation process, I found a problem. That is how to
>>>> distinguish the VFIO PA and IGB_UIO PA. Because for VFIO cases, we should
>>>> always program the IOMMU. While in IGB_UIO cases, it depends on IOMMU
>>>> capability of platform.
>>>
>>> How does one program IOMMU with igb_uio? I was under impression that
>>> igb_uio (and uio_pci_generic for that matter) does not provide such
>>> facilities.
>>
>> +1
> 
> Maybe some misunderstanding in this sentence here.
> In our design, if rte_eal_vfio_is_enabled("vfio") is true, iommu will be programmed.
> True means vfio module is modprobed.
> 
> But there is an exception here, that is, even if vfio module is modprobed,
> DPDK user still bind all the devices to igb_uio.
> 
> This situation can be distinguished in DPDK eal initialization, because the resource mapping
> is according to the driver loaded by each device(rte_pci_map_device).
> 
> But in our scenario, this judgment is somewhat weak. Because we cannot get
> the device driver info in vhost library. I also think it is unreasonable for vhost to
> do this. Only trust that users will not use it like this. Thoughts for this scenario?

I don't see how igb_uio would make any difference at all. If you are 
using igb_uio, you *don't have DMA mapping at all* and will use raw 
physical addresses. Assuming your code supports this, that's all you're 
ever going to get. The point of VFIO is to have memory regions that are 
mapped for DMA *because real physical addresses are assumed to be not 
available*. When you're using igb_uio, you effectively do have DMA 
access to the entire memory, and thus can bypass IOMMU altogether 
(assuming you're using passthrough mode).

Bottom line: do VFIO DMA mapping unconditionally. If VFIO is active - 
great, the memory will be DMA mapped. If it's not active - no harm will 
ever be done by mapping the memory for DMA anyway.
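
In code terms, roughly (only a sketch of the idea, not the actual patch;
it reuses the region variable from the helper quoted above, and the log level
on failure is just an example):

	/* Always attempt to DMA-map the guest memory region in the default
	 * VFIO container. With igb_uio, or on a platform without an IOMMU,
	 * the call will not do anything useful, which is fine.
	 */
	uint64_t host_iova = rte_mem_virt2iova(
			(void *)(uintptr_t)region->host_user_addr);

	if (rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
			region->host_user_addr, host_iova, region->size) < 0)
		VHOST_LOG_CONFIG(INFO, "VFIO DMA map had no effect\n");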

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-07 12:17                 ` Burakov, Anatoly
@ 2021-07-07 12:54                   ` Ding, Xuan
  2021-07-07 14:33                     ` Burakov, Anatoly
  0 siblings, 1 reply; 25+ messages in thread
From: Ding, Xuan @ 2021-07-07 12:54 UTC (permalink / raw)
  To: Burakov, Anatoly, Maxime Coquelin, Xia, Chenbo, Thomas Monjalon,
	David Marchand
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Ma, WenwuX

Hi Anatoly,

> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> Sent: Wednesday, July 7, 2021 8:18 PM
> To: Ding, Xuan <xuan.ding@intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; Xia, Chenbo <chenbo.xia@intel.com>;
> Thomas Monjalon <thomas@monjalon.net>; David Marchand
> <david.marchand@redhat.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Van
> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>;
> Ma, WenwuX <wenwux.ma@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
> 
> On 07-Jul-21 7:25 AM, Ding, Xuan wrote:
> > Hi,
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Tuesday, July 6, 2021 5:32 PM
> >> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> >> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
> >> Monjalon <thomas@monjalon.net>; David Marchand
> >> <david.marchand@redhat.com>
> >> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> >> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
> Van
> >> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> <yong.liu@intel.com>;
> >> Ma, WenwuX <wenwux.ma@intel.com>
> >> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
> >>
> >>
> >>
> >> On 7/6/21 11:16 AM, Burakov, Anatoly wrote:
> >>> On 06-Jul-21 9:31 AM, Ding, Xuan wrote:
> >>>> Hi Maxime,
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>>> Sent: Monday, July 5, 2021 8:46 PM
> >>>>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> >>>>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
> >>>>> Monjalon <thomas@monjalon.net>; David Marchand
> >>>>> <david.marchand@redhat.com>
> >>>>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> >>>>> <sunil.pai.g@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>;
> >>>>> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> >>>>> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
> >>>>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async
> >>>>> vhost
> >>>>>
> >>>>>
> >>>>>
> >>>>> On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
> >>>>>> On 05-Jul-21 9:40 AM, Xuan Ding wrote:
> >>>>>>> The use of IOMMU has many advantages, such as isolation and address
> >>>>>>> translation. This patch extends the capbility of DMA engine to use
> >>>>>>> IOMMU if the DMA device is bound to vfio.
> >>>>>>>
> >>>>>>> When set memory table, the guest memory will be mapped
> >>>>>>> into the default container of DPDK.
> >>>>>>>
> >>>>>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> >>>>>>> ---
> >>>>>>>     doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
> >>>>>>>     lib/vhost/rte_vhost.h               |  1 +
> >>>>>>>     lib/vhost/socket.c                  |  9 ++++++
> >>>>>>>     lib/vhost/vhost.h                   |  1 +
> >>>>>>>     lib/vhost/vhost_user.c              | 46
> >>>>>>> ++++++++++++++++++++++++++++-
> >>>>>>>     5 files changed, 65 insertions(+), 1 deletion(-)
> >>>>>>>
> >>>>>>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
> >>>>>>> b/doc/guides/prog_guide/vhost_lib.rst
> >>>>>>> index 05c42c9b11..c3beda23d9 100644
> >>>>>>> --- a/doc/guides/prog_guide/vhost_lib.rst
> >>>>>>> +++ b/doc/guides/prog_guide/vhost_lib.rst
> >>>>>>> @@ -118,6 +118,15 @@ The following is an overview of some key
> Vhost
> >>>>>>> API functions:
> >>>>>>>           It is disabled by default.
> >>>>>>>     +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
> >>>>>>> +
> >>>>>>> +    In asynchronous data path, vhost liarary is not aware of which
> >>>>>>> driver
> >>>>>>> +    (igb_uio/vfio) the DMA device is bound to. Application should
> >>>>>>> pass
> >>>>>>> +    this flag to tell vhost library whether IOMMU should be
> >>>>>>> programmed
> >>>>>>> +    for guest memory.
> >>>>>>> +
> >>>>>>> +    It is disabled by default.
> >>>>>>> +
> >>>>>>>       - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
> >>>>>>>           Since v16.04, the vhost library forwards checksum and gso
> >>>>>>> requests for
> >>>>>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> >>>>>>> index 8d875e9322..a766ea7b6b 100644
> >>>>>>> --- a/lib/vhost/rte_vhost.h
> >>>>>>> +++ b/lib/vhost/rte_vhost.h
> >>>>>>> @@ -37,6 +37,7 @@ extern "C" {
> >>>>>>>     #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
> >>>>>>>     #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
> >>>>>>>     #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL <<
> 8)
> >>>>>>> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
> >>>>>>>       /* Features. */
> >>>>>>>     #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
> >>>>>>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
> >>>>>>> index 5d0d728d52..77c722c86b 100644
> >>>>>>> --- a/lib/vhost/socket.c
> >>>>>>> +++ b/lib/vhost/socket.c
> >>>>>>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
> >>>>>>>         bool extbuf;
> >>>>>>>         bool linearbuf;
> >>>>>>>         bool async_copy;
> >>>>>>> +    bool async_use_vfio;
> >>>>>>>         bool net_compliant_ol_flags;
> >>>>>>>           /*
> >>>>>>> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
> >>>>>>> vhost_user_socket *vsocket)
> >>>>>>>                 dev->async_copy = 1;
> >>>>>>>         }
> >>>>>>>     +    if (vsocket->async_use_vfio) {
> >>>>>>> +        dev = get_device(vid);
> >>>>>>> +
> >>>>>>> +        if (dev)
> >>>>>>> +            dev->async_use_vfio = 1;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>>         VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
> >>>>>>>           if (vsocket->notify_ops->new_connection) {
> >>>>>>> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
> >>>>>>> uint64_t flags)
> >>>>>>>         vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> >>>>>>>         vsocket->linearbuf = flags &
> >> RTE_VHOST_USER_LINEARBUF_SUPPORT;
> >>>>>>>         vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> >>>>>>> +    vsocket->async_use_vfio = flags &
> >>>>> RTE_VHOST_USER_ASYNC_USE_VFIO;
> >>>>>>>         vsocket->net_compliant_ol_flags = flags &
> >>>>>>> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
> >>>>>>>           if (vsocket->async_copy &&
> >>>>>>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> >>>>>>> index 8078ddff79..fb775ce4ed 100644
> >>>>>>> --- a/lib/vhost/vhost.h
> >>>>>>> +++ b/lib/vhost/vhost.h
> >>>>>>> @@ -370,6 +370,7 @@ struct virtio_net {
> >>>>>>>         int16_t            broadcast_rarp;
> >>>>>>>         uint32_t        nr_vring;
> >>>>>>>         int            async_copy;
> >>>>>>> +    int            async_use_vfio;
> >>>>>>>         int            extbuf;
> >>>>>>>         int            linearbuf;
> >>>>>>>         struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS *
> >>>>>>> 2];
> >>>>>>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> >>>>>>> index 8f0eba6412..f3703f2e72 100644
> >>>>>>> --- a/lib/vhost/vhost_user.c
> >>>>>>> +++ b/lib/vhost/vhost_user.c
> >>>>>>> @@ -45,6 +45,7 @@
> >>>>>>>     #include <rte_common.h>
> >>>>>>>     #include <rte_malloc.h>
> >>>>>>>     #include <rte_log.h>
> >>>>>>> +#include <rte_vfio.h>
> >>>>>>>       #include "iotlb.h"
> >>>>>>>     #include "vhost.h"
> >>>>>>> @@ -141,6 +142,36 @@ get_blk_size(int fd)
> >>>>>>>         return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
> >>>>>>>     }
> >>>>>>>     +static int
> >>>>>>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
> >>>>>>> +{
> >>>>>>> +    int ret = 0;
> >>>>>>> +    uint64_t host_iova;
> >>>>>>> +    host_iova = rte_mem_virt2iova((void
> >>>>>>> *)(uintptr_t)region->host_user_addr);
> >>>>>>> +    if (do_map) {
> >>>>>>> +        /* Add mapped region into the default container of DPDK. */
> >>>>>>> +        ret =
> >>>>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >>>>>>> +                         region->host_user_addr,
> >>>>>>> +                         host_iova,
> >>>>>>> +                         region->size);
> >>>>>>> +        if (ret) {
> >>>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> >>>>>>> +            return ret;
> >>>>>>> +        }
> >>>>>>> +    } else {
> >>>>>>> +        /* Remove mapped region from the default container of
> >>>>>>> DPDK. */
> >>>>>>> +        ret =
> >>>>>>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >>>>>>> +                           region->host_user_addr,
> >>>>>>> +                           host_iova,
> >>>>>>> +                           region->size);
> >>>>>>> +        if (ret) {
> >>>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
> >>>>>>> +            return ret;
> >>>>>>> +        }
> >>>>>>> +    }
> >>>>>>> +    return ret;
> >>>>>>> +}
> >>>>>>
> >>>>>> We've been discussing this off list with Xuan, and unfortunately
> >>>>>> this is
> >>>>>> a blocker for now.
> >>>>>>
> >>>>>> Currently, the x86 IOMMU does not support partial unmap - the
> segments
> >>>>>> have to be unmapped exactly the same addr/len as they were mapped.
> We
> >>>>>> also concatenate adjacent mappings to prevent filling up the DMA
> >>>>>> mapping
> >>>>>> entry table with superfluous entries.
> >>>>>>
> >>>>>> This means that, when two unrelated mappings are contiguous in
> memory
> >>>>>> (e.g. if you map regions 1 and 2 independently, but they happen to be
> >>>>>> sitting right next to each other in virtual memory), we cannot later
> >>>>>> unmap one of them because, even though these are two separate
> >>>>> mappings
> >>>>>> as far as kernel VFIO infrastructure is concerned, the mapping gets
> >>>>>> compacted and looks like one single mapping to VFIO, so DPDK API will
> >>>>>> not let us unmap region 1 without also unmapping region 2.
> >>>>>>
> >>>>>> The proper fix for this problem would be to always map memory
> >>>>>> page-by-page regardless of where it comes from (we already do that for
> >>>>>> internal memory, but not for external). However, the reason this works
> >>>>>> for internal memory is because when mapping internal memory
> segments,
> >>>>>> *we know the page size*. For external memory segments, there is no
> such
> >>>>>> guarantee, so we cannot deduce page size for a given memory segment,
> >>>>> and
> >>>>>> thus can't map things page-by-page.
> >>>>>>
> >>>>>> So, the proper fix for it would be to add page size to the VFIO DMA
> >>>>>> API.
> >>>>>> Unfortunately, it probably has to wait until 21.11 because it is an API
> >>>>>> change.
> >>>>>>
> >>>>>> The slightly hacky fix for this would be to forego user mem map
> >>>>>> concatenation and trust that user is not going to do anything stupid,
> >>>>>> and will not spam the VFIO DMA API without reason. I would rather
> >>>>>> not go
> >>>>>> down this road, but this could be an option in this case.
> >>>>>>
> >>>>>> Thoughts?
> >>>>>>
> >>>>>
> >>>>> Thanks Anatoly for the detailed description of the issue.
> >>>>> It may be possible to either create a versioned symbol for this API
> >>>>> change, or maybe even to have a temporary internal API.
> >>>>>
> >>>>> But I think this series in its current form is not acceptable, so
> >>>>> waiting for v21.11 would be the best option (we may want to send the
> >>>>> deprecation notice in this release though).
> >>>>>
> >>>>> In this series, I don't like the user application has to pass a flag to
> >>>>> state whether the DMA engine uses VFIO or not. AFAICT, this new revision
> >>>>> does not implement what was discussed in the previous one, i.e.
> >>>>> supporting both IOVA_AS_VA and IOVA_AS_PA.
> >>>>
> >>>> Thanks for your comments. Here I hope to explain some questions:
> >>>> 1. Whether both IOVA_AS_VA and IOVA_AS_PA are supported now?
> >>>> A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this version,
> the
> >>>> virtual address is replaced with iova address of mapped region, and
> >>>> the iova
> >>>> address is selected to program the IOMMU instead of virtual address only.
> >>
> >> Good!
> >>
> >>>>
> >>>> 2. Why a flag is chosen to be passed by application?
> >>>> A: Yes, as we discussed before, the rte_eal_iova_mode() API can be
> >>>> used to
> >>>> get the IOVA mode, so as to determine whether IOMMU should be
> >> programmed.
> >>>> However, in the implementation process, I found a problem. That is how to
> >>>> distinguish the VFIO PA and IGB_UIO PA. Because for VFIO cases, we
> should
> >>>> always program the IOMMU. While in IGB_UIO cases, it depends on
> IOMMU
> >>>> capability of platform.
> >>>
> >>> How does one program IOMMU with igb_uio? I was under impression that
> >>> igb_uio (and uio_pci_generic for that matter) does not provide such
> >>> facilities.
> >>
> >> +1
> >
> > Maybe some misunderstanding in this sentence here.
> > In our design, if rte_eal_vfio_is_enabled("vfio") is true, iommu will be
> programmed.
> > True means vfio module is modprobed.
> >
> > But there is an exception here, that is, even if vfio module is modprobed,
> > DPDK user still bind all the devices to igb_uio.
> >
> > This situation can be distinguished in DPDK eal initialization, because the
> resource mapping
> > is according to the driver loaded by each device(rte_pci_map_device).
> >
> > But in our scenario, this judgment is somewhat weak. Because we cannot get
> > the device driver info in vhost library. I also think it is unreasonable for vhost to
> > do this. Only trust that users will not use it like this. Thoughts for this scenario?
> 
> I don't see how igb_uio would make any difference at all. If you are
> using igb_uio, you *don't have DMA mapping at all* and will use raw
> physical addresses. Assuming your code supports this, that's all you're
> ever going to get. The point of VFIO is to have memory regions that are
> mapped for DMA *because real physical addresses are assumed to be not
> available*. When you're using igb_uio, you effectively do have DMA
> access to the entire memory, and thus can bypass IOMMU altogether
> (assuming you're using passthrough mode).

My concern is exactly here.
In igb_uio cases, although the devices are not added to the default container
during EAL init, the "IOMMU programming" still happens when
rte_vfio_container_dma_map() is called. It does no harm, but it is also
unnecessary.

> 
> Bottom line: do VFIO DMA mapping unconditionally. If VFIO is active -
> great, the memory will be DMA mapped. If it's not active - no harm will
> ever be done by mapping the memory for DMA anyway.

By "do VFIO DMA mapping unconditionally", do you mean the
rte_eal_vfio_is_enabled() check is unnecessary?
What if the platform does not have an IOMMU?

Thanks very much.

Regards,
Xuan

> 
> --
> Thanks,
> Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-07 12:54                   ` Ding, Xuan
@ 2021-07-07 14:33                     ` Burakov, Anatoly
  2021-07-07 15:00                       ` Bruce Richardson
  2021-07-07 15:09                       ` Ding, Xuan
  0 siblings, 2 replies; 25+ messages in thread
From: Burakov, Anatoly @ 2021-07-07 14:33 UTC (permalink / raw)
  To: Ding, Xuan, Maxime Coquelin, Xia, Chenbo, Thomas Monjalon,
	David Marchand
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Ma, WenwuX

On 07-Jul-21 1:54 PM, Ding, Xuan wrote:
> Hi Anatoly,
> 
>> -----Original Message-----
>> From: Burakov, Anatoly <anatoly.burakov@intel.com>
>> Sent: Wednesday, July 7, 2021 8:18 PM
>> To: Ding, Xuan <xuan.ding@intel.com>; Maxime Coquelin
>> <maxime.coquelin@redhat.com>; Xia, Chenbo <chenbo.xia@intel.com>;
>> Thomas Monjalon <thomas@monjalon.net>; David Marchand
>> <david.marchand@redhat.com>
>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
>> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Van
>> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>;
>> Ma, WenwuX <wenwux.ma@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
>>
>> On 07-Jul-21 7:25 AM, Ding, Xuan wrote:
>>> Hi,
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> Sent: Tuesday, July 6, 2021 5:32 PM
>>>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
>>>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
>>>> Monjalon <thomas@monjalon.net>; David Marchand
>>>> <david.marchand@redhat.com>
>>>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
>>>> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
>> Van
>>>> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
>> <yong.liu@intel.com>;
>>>> Ma, WenwuX <wenwux.ma@intel.com>
>>>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
>>>>
>>>>
>>>>
>>>> On 7/6/21 11:16 AM, Burakov, Anatoly wrote:
>>>>> On 06-Jul-21 9:31 AM, Ding, Xuan wrote:
>>>>>> Hi Maxime,
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>>>>> Sent: Monday, July 5, 2021 8:46 PM
>>>>>>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
>>>>>>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
>>>>>>> Monjalon <thomas@monjalon.net>; David Marchand
>>>>>>> <david.marchand@redhat.com>
>>>>>>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
>>>>>>> <sunil.pai.g@intel.com>; Richardson, Bruce
>> <bruce.richardson@intel.com>;
>>>>>>> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
>>>>>>> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
>>>>>>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async
>>>>>>> vhost
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
>>>>>>>> On 05-Jul-21 9:40 AM, Xuan Ding wrote:
>>>>>>>>> The use of IOMMU has many advantages, such as isolation and address
>>>>>>>>> translation. This patch extends the capbility of DMA engine to use
>>>>>>>>> IOMMU if the DMA device is bound to vfio.
>>>>>>>>>
>>>>>>>>> When set memory table, the guest memory will be mapped
>>>>>>>>> into the default container of DPDK.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>>>>>>>>> ---
>>>>>>>>>      doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
>>>>>>>>>      lib/vhost/rte_vhost.h               |  1 +
>>>>>>>>>      lib/vhost/socket.c                  |  9 ++++++
>>>>>>>>>      lib/vhost/vhost.h                   |  1 +
>>>>>>>>>      lib/vhost/vhost_user.c              | 46
>>>>>>>>> ++++++++++++++++++++++++++++-
>>>>>>>>>      5 files changed, 65 insertions(+), 1 deletion(-)
>>>>>>>>>
>>>>>>>>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
>>>>>>>>> b/doc/guides/prog_guide/vhost_lib.rst
>>>>>>>>> index 05c42c9b11..c3beda23d9 100644
>>>>>>>>> --- a/doc/guides/prog_guide/vhost_lib.rst
>>>>>>>>> +++ b/doc/guides/prog_guide/vhost_lib.rst
>>>>>>>>> @@ -118,6 +118,15 @@ The following is an overview of some key
>> Vhost
>>>>>>>>> API functions:
>>>>>>>>>            It is disabled by default.
>>>>>>>>>      +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
>>>>>>>>> +
>>>>>>>>> +    In asynchronous data path, vhost liarary is not aware of which
>>>>>>>>> driver
>>>>>>>>> +    (igb_uio/vfio) the DMA device is bound to. Application should
>>>>>>>>> pass
>>>>>>>>> +    this flag to tell vhost library whether IOMMU should be
>>>>>>>>> programmed
>>>>>>>>> +    for guest memory.
>>>>>>>>> +
>>>>>>>>> +    It is disabled by default.
>>>>>>>>> +
>>>>>>>>>        - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
>>>>>>>>>            Since v16.04, the vhost library forwards checksum and gso
>>>>>>>>> requests for
>>>>>>>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>>>>>>>>> index 8d875e9322..a766ea7b6b 100644
>>>>>>>>> --- a/lib/vhost/rte_vhost.h
>>>>>>>>> +++ b/lib/vhost/rte_vhost.h
>>>>>>>>> @@ -37,6 +37,7 @@ extern "C" {
>>>>>>>>>      #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
>>>>>>>>>      #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
>>>>>>>>>      #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL <<
>> 8)
>>>>>>>>> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
>>>>>>>>>        /* Features. */
>>>>>>>>>      #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
>>>>>>>>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>>>>>>>>> index 5d0d728d52..77c722c86b 100644
>>>>>>>>> --- a/lib/vhost/socket.c
>>>>>>>>> +++ b/lib/vhost/socket.c
>>>>>>>>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>>>>>>>>>          bool extbuf;
>>>>>>>>>          bool linearbuf;
>>>>>>>>>          bool async_copy;
>>>>>>>>> +    bool async_use_vfio;
>>>>>>>>>          bool net_compliant_ol_flags;
>>>>>>>>>            /*
>>>>>>>>> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
>>>>>>>>> vhost_user_socket *vsocket)
>>>>>>>>>                  dev->async_copy = 1;
>>>>>>>>>          }
>>>>>>>>>      +    if (vsocket->async_use_vfio) {
>>>>>>>>> +        dev = get_device(vid);
>>>>>>>>> +
>>>>>>>>> +        if (dev)
>>>>>>>>> +            dev->async_use_vfio = 1;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>>          VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>>>>>>>>>            if (vsocket->notify_ops->new_connection) {
>>>>>>>>> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
>>>>>>>>> uint64_t flags)
>>>>>>>>>          vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
>>>>>>>>>          vsocket->linearbuf = flags &
>>>> RTE_VHOST_USER_LINEARBUF_SUPPORT;
>>>>>>>>>          vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
>>>>>>>>> +    vsocket->async_use_vfio = flags &
>>>>>>> RTE_VHOST_USER_ASYNC_USE_VFIO;
>>>>>>>>>          vsocket->net_compliant_ol_flags = flags &
>>>>>>>>> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
>>>>>>>>>            if (vsocket->async_copy &&
>>>>>>>>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>>>>>>>>> index 8078ddff79..fb775ce4ed 100644
>>>>>>>>> --- a/lib/vhost/vhost.h
>>>>>>>>> +++ b/lib/vhost/vhost.h
>>>>>>>>> @@ -370,6 +370,7 @@ struct virtio_net {
>>>>>>>>>          int16_t            broadcast_rarp;
>>>>>>>>>          uint32_t        nr_vring;
>>>>>>>>>          int            async_copy;
>>>>>>>>> +    int            async_use_vfio;
>>>>>>>>>          int            extbuf;
>>>>>>>>>          int            linearbuf;
>>>>>>>>>          struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS *
>>>>>>>>> 2];
>>>>>>>>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
>>>>>>>>> index 8f0eba6412..f3703f2e72 100644
>>>>>>>>> --- a/lib/vhost/vhost_user.c
>>>>>>>>> +++ b/lib/vhost/vhost_user.c
>>>>>>>>> @@ -45,6 +45,7 @@
>>>>>>>>>      #include <rte_common.h>
>>>>>>>>>      #include <rte_malloc.h>
>>>>>>>>>      #include <rte_log.h>
>>>>>>>>> +#include <rte_vfio.h>
>>>>>>>>>        #include "iotlb.h"
>>>>>>>>>      #include "vhost.h"
>>>>>>>>> @@ -141,6 +142,36 @@ get_blk_size(int fd)
>>>>>>>>>          return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
>>>>>>>>>      }
>>>>>>>>>      +static int
>>>>>>>>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
>>>>>>>>> +{
>>>>>>>>> +    int ret = 0;
>>>>>>>>> +    uint64_t host_iova;
>>>>>>>>> +    host_iova = rte_mem_virt2iova((void
>>>>>>>>> *)(uintptr_t)region->host_user_addr);
>>>>>>>>> +    if (do_map) {
>>>>>>>>> +        /* Add mapped region into the default container of DPDK. */
>>>>>>>>> +        ret =
>>>>>>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>>>>>>>> +                         region->host_user_addr,
>>>>>>>>> +                         host_iova,
>>>>>>>>> +                         region->size);
>>>>>>>>> +        if (ret) {
>>>>>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
>>>>>>>>> +            return ret;
>>>>>>>>> +        }
>>>>>>>>> +    } else {
>>>>>>>>> +        /* Remove mapped region from the default container of
>>>>>>>>> DPDK. */
>>>>>>>>> +        ret =
>>>>>>>>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
>>>>>>>>> +                           region->host_user_addr,
>>>>>>>>> +                           host_iova,
>>>>>>>>> +                           region->size);
>>>>>>>>> +        if (ret) {
>>>>>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
>>>>>>>>> +            return ret;
>>>>>>>>> +        }
>>>>>>>>> +    }
>>>>>>>>> +    return ret;
>>>>>>>>> +}
>>>>>>>>
>>>>>>>> We've been discussing this off list with Xuan, and unfortunately
>>>>>>>> this is
>>>>>>>> a blocker for now.
>>>>>>>>
>>>>>>>> Currently, the x86 IOMMU does not support partial unmap - the
>> segments
>>>>>>>> have to be unmapped exactly the same addr/len as they were mapped.
>> We
>>>>>>>> also concatenate adjacent mappings to prevent filling up the DMA
>>>>>>>> mapping
>>>>>>>> entry table with superfluous entries.
>>>>>>>>
>>>>>>>> This means that, when two unrelated mappings are contiguous in
>> memory
>>>>>>>> (e.g. if you map regions 1 and 2 independently, but they happen to be
>>>>>>>> sitting right next to each other in virtual memory), we cannot later
>>>>>>>> unmap one of them because, even though these are two separate
>>>>>>> mappings
>>>>>>>> as far as kernel VFIO infrastructure is concerned, the mapping gets
>>>>>>>> compacted and looks like one single mapping to VFIO, so DPDK API will
>>>>>>>> not let us unmap region 1 without also unmapping region 2.
>>>>>>>>
>>>>>>>> The proper fix for this problem would be to always map memory
>>>>>>>> page-by-page regardless of where it comes from (we already do that for
>>>>>>>> internal memory, but not for external). However, the reason this works
>>>>>>>> for internal memory is because when mapping internal memory
>> segments,
>>>>>>>> *we know the page size*. For external memory segments, there is no
>> such
>>>>>>>> guarantee, so we cannot deduce page size for a given memory segment,
>>>>>>> and
>>>>>>>> thus can't map things page-by-page.
>>>>>>>>
>>>>>>>> So, the proper fix for it would be to add page size to the VFIO DMA
>>>>>>>> API.
>>>>>>>> Unfortunately, it probably has to wait until 21.11 because it is an API
>>>>>>>> change.
>>>>>>>>
>>>>>>>> The slightly hacky fix for this would be to forego user mem map
>>>>>>>> concatenation and trust that user is not going to do anything stupid,
>>>>>>>> and will not spam the VFIO DMA API without reason. I would rather
>>>>>>>> not go
>>>>>>>> down this road, but this could be an option in this case.
>>>>>>>>
>>>>>>>> Thoughts?
>>>>>>>>
>>>>>>>
>>>>>>> Thanks Anatoly for the detailed description of the issue.
>>>>>>> It may be possible to either create a versioned symbol for this API
>>>>>>> change, or maybe even to have a temporary internal API.
>>>>>>>
>>>>>>> But I think this series in its current form is not acceptable, so
>>>>>>> waiting for v21.11 would be the best option (we may want to send the
>>>>>>> deprecation notice in this release though).
>>>>>>>
>>>>>>> In this series, I don't like the user application has to pass a flag to
>>>>>>> state whether the DMA engine uses VFIO or not. AFAICT, this new revision
>>>>>>> does not implement what was discussed in the previous one, i.e.
>>>>>>> supporting both IOVA_AS_VA and IOVA_AS_PA.
>>>>>>
>>>>>> Thanks for your comments. Here I hope to explain some questions:
>>>>>> 1. Whether both IOVA_AS_VA and IOVA_AS_PA are supported now?
>>>>>> A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this version,
>> the
>>>>>> virtual address is replaced with iova address of mapped region, and
>>>>>> the iova
>>>>>> address is selected to program the IOMMU instead of virtual address only.
>>>>
>>>> Good!
>>>>
>>>>>>
>>>>>> 2. Why a flag is chosen to be passed by application?
>>>>>> A: Yes, as we discussed before, the rte_eal_iova_mode() API can be
>>>>>> used to
>>>>>> get the IOVA mode, so as to determine whether IOMMU should be
>>>> programmed.
>>>>>> However, in the implementation process, I found a problem. That is how to
>>>>>> distinguish the VFIO PA and IGB_UIO PA. Because for VFIO cases, we
>> should
>>>>>> always program the IOMMU. While in IGB_UIO cases, it depends on
>> IOMMU
>>>>>> capability of platform.
>>>>>
>>>>> How does one program IOMMU with igb_uio? I was under impression that
>>>>> igb_uio (and uio_pci_generic for that matter) does not provide such
>>>>> facilities.
>>>>
>>>> +1
>>>
>>> Maybe some misunderstanding in this sentence here.
>>> In our design, if rte_eal_vfio_is_enabled("vfio") is true, iommu will be
>> programmed.
>>> True means vfio module is modprobed.
>>>
>>> But there is an exception here, that is, even if vfio module is modprobed,
>>> DPDK user still bind all the devices to igb_uio.
>>>
>>> This situation can be distinguished in DPDK eal initialization, because the
>> resource mapping
>>> is according to the driver loaded by each device(rte_pci_map_device).
>>>
>>> But in our scenario, this judgment is somewhat weak. Because we cannot get
>>> the device driver info in vhost library. I also think it is unreasonable for vhost to
>>> do this. Only trust that users will not use it like this. Thoughts for this scenario?
>>
>> I don't see how igb_uio would make any difference at all. If you are
>> using igb_uio, you *don't have DMA mapping at all* and will use raw
>> physical addresses. Assuming your code supports this, that's all you're
>> ever going to get. The point of VFIO is to have memory regions that are
>> mapped for DMA *because real physical addresses are assumed to be not
>> available*. When you're using igb_uio, you effectively do have DMA
>> access to the entire memory, and thus can bypass IOMMU altogether
>> (assuming you're using passthrough mode).
> 
> My concern is exactly here.
> In igb_uio cases, although devices are not added to the default container in eal init,
> but the "IOMMU programming" actually happens when the rte_vfio_container_dma_map() is called.
> It is no harm but it is also unnecessary.

Yes, it is unnecessary, but it's also not actively harmful, which means
you can do it without any regard as to whether or not you have an IOMMU :)

Think of a hybrid VFIO/igb_uio setup - some NICs will be bound to VFIO, some
to igb_uio. The igb_uio-bound NICs will not care whether you have mapped
anything for DMA because they don't go through the IOMMU, so things will
"just work". The VFIO-bound NICs will get the memory mapped, because they are
the ones that actually need the DMA mapping.

So, what you get is, if you do VFIO DMA mapping unconditionally, 1) NICs 
with igb_uio won't care about this, and 2) NICs with VFIO will benefit. 
You're not "mapping" the NICs, you're mapping the memory you're 
accessing with those NICs. You need it to be accessible to both, but 
since you have no way of knowing whether 1) any of the current HW needs 
VFIO, and 2) any of *future hotplugged* HW needs VFIO, the easiest way 
to solve this problem is just to map things regardless, and live with 
the "unnecessary" but harmless mapping in the worst case.

> 
>>
>> Bottom line: do VFIO DMA mapping unconditionally. If VFIO is active -
>> great, the memory will be DMA mapped. If it's not active - no harm will
>> ever be done by mapping the memory for DMA anyway.
> 
> Do VFIO DMA mapping unconditionally, do you mean the rte_eal_vfio_is_enabled() is unnecessary?
> What if the platform does not have IOMMU?
> 
> Thanks very much.
> 

If the platform has no IOMMU, the API call will just not do anything 
useful, so no harm done.
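
Expressed in code, the "no harm done" case could be handled roughly like this
(just a sketch; whether and at which level to log the failure is up to you,
and region/host_iova come from the mapping helper discussed earlier):

	int ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
			region->host_user_addr, host_iova, region->size);
	if (ret < 0 && rte_vfio_is_enabled("vfio")) {
		/* Only treat this as an error when VFIO is actually in use;
		 * otherwise the failed mapping is expected and harmless.
		 */
		VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
	}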

> Regards,
> Xuan
> 
>>
>> --
>> Thanks,
>> Anatoly


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-07 14:33                     ` Burakov, Anatoly
@ 2021-07-07 15:00                       ` Bruce Richardson
  2021-07-07 15:09                       ` Ding, Xuan
  1 sibling, 0 replies; 25+ messages in thread
From: Bruce Richardson @ 2021-07-07 15:00 UTC (permalink / raw)
  To: Burakov, Anatoly
  Cc: Ding, Xuan, Maxime Coquelin, Xia, Chenbo, Thomas Monjalon,
	David Marchand, dev, Hu, Jiayu, Pai G, Sunil, Van Haaren, Harry,
	Liu, Yong, Ma, WenwuX

On Wed, Jul 07, 2021 at 03:33:44PM +0100, Burakov, Anatoly wrote:
> On 07-Jul-21 1:54 PM, Ding, Xuan wrote:
> > Hi Anatoly,
> > 
> > > -----Original Message-----
> > > From: Burakov, Anatoly <anatoly.burakov@intel.com>
> > > Sent: Wednesday, July 7, 2021 8:18 PM
> > > To: Ding, Xuan <xuan.ding@intel.com>; Maxime Coquelin
> > > <maxime.coquelin@redhat.com>; Xia, Chenbo <chenbo.xia@intel.com>;
> > > Thomas Monjalon <thomas@monjalon.net>; David Marchand
> > > <david.marchand@redhat.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> > > <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Van
> > > Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong <yong.liu@intel.com>;
> > > Ma, WenwuX <wenwux.ma@intel.com>
> > > Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
> > > 
> > > On 07-Jul-21 7:25 AM, Ding, Xuan wrote:
> > > > Hi,
> > > > 
> > > > > -----Original Message-----
> > > > > From: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > > > Sent: Tuesday, July 6, 2021 5:32 PM
> > > > > To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> > > > > <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
> > > > > Monjalon <thomas@monjalon.net>; David Marchand
> > > > > <david.marchand@redhat.com>
> > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> > > > > <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
> > > Van
> > > > > Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> > > <yong.liu@intel.com>;
> > > > > Ma, WenwuX <wenwux.ma@intel.com>
> > > > > Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
> > > > > 
> > > > > 
> > > > > 
> > > > > On 7/6/21 11:16 AM, Burakov, Anatoly wrote:
> > > > > > On 06-Jul-21 9:31 AM, Ding, Xuan wrote:
> > > > > > > Hi Maxime,
> > > > > > > 
> > > > > > > > -----Original Message-----
> > > > > > > > From: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > > > > > > Sent: Monday, July 5, 2021 8:46 PM
> > > > > > > > To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> > > > > > > > <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
> > > > > > > > Monjalon <thomas@monjalon.net>; David Marchand
> > > > > > > > <david.marchand@redhat.com>
> > > > > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> > > > > > > > <sunil.pai.g@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>;
> > > > > > > > Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> > > > > > > > <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
> > > > > > > > Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async
> > > > > > > > vhost
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
> > > > > > > > > On 05-Jul-21 9:40 AM, Xuan Ding wrote:
> > > > > > > > > > The use of IOMMU has many advantages, such as isolation and address
> > > > > > > > > > translation. This patch extends the capbility of DMA engine to use
> > > > > > > > > > IOMMU if the DMA device is bound to vfio.
> > > > > > > > > > 
> > > > > > > > > > When set memory table, the guest memory will be mapped
> > > > > > > > > > into the default container of DPDK.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> > > > > > > > > > ---
> > > > > > > > > >      doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
> > > > > > > > > >      lib/vhost/rte_vhost.h               |  1 +
> > > > > > > > > >      lib/vhost/socket.c                  |  9 ++++++
> > > > > > > > > >      lib/vhost/vhost.h                   |  1 +
> > > > > > > > > >      lib/vhost/vhost_user.c              | 46
> > > > > > > > > > ++++++++++++++++++++++++++++-
> > > > > > > > > >      5 files changed, 65 insertions(+), 1 deletion(-)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/doc/guides/prog_guide/vhost_lib.rst
> > > > > > > > > > b/doc/guides/prog_guide/vhost_lib.rst
> > > > > > > > > > index 05c42c9b11..c3beda23d9 100644
> > > > > > > > > > --- a/doc/guides/prog_guide/vhost_lib.rst
> > > > > > > > > > +++ b/doc/guides/prog_guide/vhost_lib.rst
> > > > > > > > > > @@ -118,6 +118,15 @@ The following is an overview of some key
> > > Vhost
> > > > > > > > > > API functions:
> > > > > > > > > >            It is disabled by default.
> > > > > > > > > >      +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
> > > > > > > > > > +
> > > > > > > > > > +    In asynchronous data path, vhost liarary is not aware of which
> > > > > > > > > > driver
> > > > > > > > > > +    (igb_uio/vfio) the DMA device is bound to. Application should
> > > > > > > > > > pass
> > > > > > > > > > +    this flag to tell vhost library whether IOMMU should be
> > > > > > > > > > programmed
> > > > > > > > > > +    for guest memory.
> > > > > > > > > > +
> > > > > > > > > > +    It is disabled by default.
> > > > > > > > > > +
> > > > > > > > > >        - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
> > > > > > > > > >            Since v16.04, the vhost library forwards checksum and gso
> > > > > > > > > > requests for
> > > > > > > > > > diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> > > > > > > > > > index 8d875e9322..a766ea7b6b 100644
> > > > > > > > > > --- a/lib/vhost/rte_vhost.h
> > > > > > > > > > +++ b/lib/vhost/rte_vhost.h
> > > > > > > > > > @@ -37,6 +37,7 @@ extern "C" {
> > > > > > > > > >      #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
> > > > > > > > > >      #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
> > > > > > > > > >      #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS    (1ULL <<
> > > 8)
> > > > > > > > > > +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
> > > > > > > > > >        /* Features. */
> > > > > > > > > >      #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
> > > > > > > > > > diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
> > > > > > > > > > index 5d0d728d52..77c722c86b 100644
> > > > > > > > > > --- a/lib/vhost/socket.c
> > > > > > > > > > +++ b/lib/vhost/socket.c
> > > > > > > > > > @@ -42,6 +42,7 @@ struct vhost_user_socket {
> > > > > > > > > >          bool extbuf;
> > > > > > > > > >          bool linearbuf;
> > > > > > > > > >          bool async_copy;
> > > > > > > > > > +    bool async_use_vfio;
> > > > > > > > > >          bool net_compliant_ol_flags;
> > > > > > > > > >            /*
> > > > > > > > > > @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd, struct
> > > > > > > > > > vhost_user_socket *vsocket)
> > > > > > > > > >                  dev->async_copy = 1;
> > > > > > > > > >          }
> > > > > > > > > >      +    if (vsocket->async_use_vfio) {
> > > > > > > > > > +        dev = get_device(vid);
> > > > > > > > > > +
> > > > > > > > > > +        if (dev)
> > > > > > > > > > +            dev->async_use_vfio = 1;
> > > > > > > > > > +    }
> > > > > > > > > > +
> > > > > > > > > >          VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
> > > > > > > > > >            if (vsocket->notify_ops->new_connection) {
> > > > > > > > > > @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char *path,
> > > > > > > > > > uint64_t flags)
> > > > > > > > > >          vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
> > > > > > > > > >          vsocket->linearbuf = flags &
> > > > > RTE_VHOST_USER_LINEARBUF_SUPPORT;
> > > > > > > > > >          vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> > > > > > > > > > +    vsocket->async_use_vfio = flags &
> > > > > > > > RTE_VHOST_USER_ASYNC_USE_VFIO;
> > > > > > > > > >          vsocket->net_compliant_ol_flags = flags &
> > > > > > > > > > RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
> > > > > > > > > >            if (vsocket->async_copy &&
> > > > > > > > > > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> > > > > > > > > > index 8078ddff79..fb775ce4ed 100644
> > > > > > > > > > --- a/lib/vhost/vhost.h
> > > > > > > > > > +++ b/lib/vhost/vhost.h
> > > > > > > > > > @@ -370,6 +370,7 @@ struct virtio_net {
> > > > > > > > > >          int16_t            broadcast_rarp;
> > > > > > > > > >          uint32_t        nr_vring;
> > > > > > > > > >          int            async_copy;
> > > > > > > > > > +    int            async_use_vfio;
> > > > > > > > > >          int            extbuf;
> > > > > > > > > >          int            linearbuf;
> > > > > > > > > >          struct vhost_virtqueue    *virtqueue[VHOST_MAX_QUEUE_PAIRS *
> > > > > > > > > > 2];
> > > > > > > > > > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> > > > > > > > > > index 8f0eba6412..f3703f2e72 100644
> > > > > > > > > > --- a/lib/vhost/vhost_user.c
> > > > > > > > > > +++ b/lib/vhost/vhost_user.c
> > > > > > > > > > @@ -45,6 +45,7 @@
> > > > > > > > > >      #include <rte_common.h>
> > > > > > > > > >      #include <rte_malloc.h>
> > > > > > > > > >      #include <rte_log.h>
> > > > > > > > > > +#include <rte_vfio.h>
> > > > > > > > > >        #include "iotlb.h"
> > > > > > > > > >      #include "vhost.h"
> > > > > > > > > > @@ -141,6 +142,36 @@ get_blk_size(int fd)
> > > > > > > > > >          return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
> > > > > > > > > >      }
> > > > > > > > > >      +static int
> > > > > > > > > > +async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
> > > > > > > > > > +{
> > > > > > > > > > +    int ret = 0;
> > > > > > > > > > +    uint64_t host_iova;
> > > > > > > > > > +    host_iova = rte_mem_virt2iova((void
> > > > > > > > > > *)(uintptr_t)region->host_user_addr);
> > > > > > > > > > +    if (do_map) {
> > > > > > > > > > +        /* Add mapped region into the default container of DPDK. */
> > > > > > > > > > +        ret =
> > > > > > > > rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> > > > > > > > > > +                         region->host_user_addr,
> > > > > > > > > > +                         host_iova,
> > > > > > > > > > +                         region->size);
> > > > > > > > > > +        if (ret) {
> > > > > > > > > > +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> > > > > > > > > > +            return ret;
> > > > > > > > > > +        }
> > > > > > > > > > +    } else {
> > > > > > > > > > +        /* Remove mapped region from the default container of
> > > > > > > > > > DPDK. */
> > > > > > > > > > +        ret =
> > > > > > > > > > rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> > > > > > > > > > +                           region->host_user_addr,
> > > > > > > > > > +                           host_iova,
> > > > > > > > > > +                           region->size);
> > > > > > > > > > +        if (ret) {
> > > > > > > > > > +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
> > > > > > > > > > +            return ret;
> > > > > > > > > > +        }
> > > > > > > > > > +    }
> > > > > > > > > > +    return ret;
> > > > > > > > > > +}
> > > > > > > > > 
> > > > > > > > > We've been discussing this off list with Xuan, and unfortunately
> > > > > > > > > this is
> > > > > > > > > a blocker for now.
> > > > > > > > > 
> > > > > > > > > Currently, the x86 IOMMU does not support partial unmap - the
> > > segments
> > > > > > > > > have to be unmapped exactly the same addr/len as they were mapped.
> > > We
> > > > > > > > > also concatenate adjacent mappings to prevent filling up the DMA
> > > > > > > > > mapping
> > > > > > > > > entry table with superfluous entries.
> > > > > > > > > 
> > > > > > > > > This means that, when two unrelated mappings are contiguous in
> > > memory
> > > > > > > > > (e.g. if you map regions 1 and 2 independently, but they happen to be
> > > > > > > > > sitting right next to each other in virtual memory), we cannot later
> > > > > > > > > unmap one of them because, even though these are two separate
> > > > > > > > mappings
> > > > > > > > > as far as kernel VFIO infrastructure is concerned, the mapping gets
> > > > > > > > > compacted and looks like one single mapping to VFIO, so DPDK API will
> > > > > > > > > not let us unmap region 1 without also unmapping region 2.
> > > > > > > > > 
> > > > > > > > > The proper fix for this problem would be to always map memory
> > > > > > > > > page-by-page regardless of where it comes from (we already do that for
> > > > > > > > > internal memory, but not for external). However, the reason this works
> > > > > > > > > for internal memory is because when mapping internal memory
> > > segments,
> > > > > > > > > *we know the page size*. For external memory segments, there is no
> > > such
> > > > > > > > > guarantee, so we cannot deduce page size for a given memory segment,
> > > > > > > > and
> > > > > > > > > thus can't map things page-by-page.
> > > > > > > > > 
> > > > > > > > > So, the proper fix for it would be to add page size to the VFIO DMA
> > > > > > > > > API.
> > > > > > > > > Unfortunately, it probably has to wait until 21.11 because it is an API
> > > > > > > > > change.
> > > > > > > > > 
> > > > > > > > > The slightly hacky fix for this would be to forego user mem map
> > > > > > > > > concatenation and trust that user is not going to do anything stupid,
> > > > > > > > > and will not spam the VFIO DMA API without reason. I would rather
> > > > > > > > > not go
> > > > > > > > > down this road, but this could be an option in this case.
> > > > > > > > > 
> > > > > > > > > Thoughts?
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Thanks Anatoly for the detailed description of the issue.
> > > > > > > > It may be possible to either create a versioned symbol for this API
> > > > > > > > change, or maybe even to have a temporary internal API.
> > > > > > > > 
> > > > > > > > But I think this series in its current form is not acceptable, so
> > > > > > > > waiting for v21.11 would be the best option (we may want to send the
> > > > > > > > deprecation notice in this release though).
> > > > > > > > 
> > > > > > > > In this series, I don't like the user application has to pass a flag to
> > > > > > > > state whether the DMA engine uses VFIO or not. AFAICT, this new revision
> > > > > > > > does not implement what was discussed in the previous one, i.e.
> > > > > > > > supporting both IOVA_AS_VA and IOVA_AS_PA.
> > > > > > > 
> > > > > > > Thanks for your comments. Here I hope to explain some questions:
> > > > > > > 1. Whether both IOVA_AS_VA and IOVA_AS_PA are supported now?
> > > > > > > A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this version,
> > > the
> > > > > > > virtual address is replaced with iova address of mapped region, and
> > > > > > > the iova
> > > > > > > address is selected to program the IOMMU instead of virtual address only.
> > > > > 
> > > > > Good!
> > > > > 
> > > > > > > 
> > > > > > > 2. Why a flag is chosen to be passed by application?
> > > > > > > A: Yes, as we discussed before, the rte_eal_iova_mode() API can be
> > > > > > > used to
> > > > > > > get the IOVA mode, so as to determine whether IOMMU should be
> > > > > programmed.
> > > > > > > However, in the implementation process, I found a problem. That is how to
> > > > > > > distinguish the VFIO PA and IGB_UIO PA. Because for VFIO cases, we
> > > should
> > > > > > > always program the IOMMU. While in IGB_UIO cases, it depends on
> > > IOMMU
> > > > > > > capability of platform.
> > > > > > 
> > > > > > How does one program IOMMU with igb_uio? I was under impression that
> > > > > > igb_uio (and uio_pci_generic for that matter) does not provide such
> > > > > > facilities.
> > > > > 
> > > > > +1
> > > > 
> > > > Maybe some misunderstanding in this sentence here.
> > > > In our design, if rte_eal_vfio_is_enabled("vfio") is true, iommu will be
> > > programmed.
> > > > True means vfio module is modprobed.
> > > > 
> > > > But there is an exception here, that is, even if vfio module is modprobed,
> > > > DPDK user still bind all the devices to igb_uio.
> > > > 
> > > > This situation can be distinguished in DPDK eal initialization, because the
> > > resource mapping
> > > > is according to the driver loaded by each device(rte_pci_map_device).
> > > > 
> > > > But in our scenario, this judgment is somewhat weak. Because we cannot get
> > > > the device driver info in vhost library. I also think it is unreasonable for vhost to
> > > > do this. Only trust that users will not use it like this. Thoughts for this scenario?
> > > 
> > > I don't see how igb_uio would make any difference at all. If you are
> > > using igb_uio, you *don't have DMA mapping at all* and will use raw
> > > physical addresses. Assuming your code supports this, that's all you're
> > > ever going to get. The point of VFIO is to have memory regions that are
> > > mapped for DMA *because real physical addresses are assumed to be not
> > > available*. When you're using igb_uio, you effectively do have DMA
> > > access to the entire memory, and thus can bypass IOMMU altogether
> > > (assuming you're using passthrough mode).
> > 
> > My concern is exactly here.
> > In igb_uio cases, although devices are not added to the default container in eal init,
> > but the "IOMMU programming" actually happens when the rte_vfio_container_dma_map() is called.
> > It is no harm but it is also unnecessary.
> 
> Yes, it is unnecessary, but it's also not actively harmful, which means you
> can still do it without any regard as to whether you do or don't have IOMMU
> :)
> 
> Think of a hybrid VFIO/igb_uio setup - some NICs will have VFIO, some will
> have igb_uio. The igb_uio-bound NICs will not care if you have mapped
> anything for DMA because they don't go through IOMMU, things will "just
> work". The VFIO-bound NICs will get the memory mapped, because they are the
> ones who actually need the DMA mapping.
> 
Do we even support a hybrid setup? I would have thought that was just
asking for trouble and should be considered an unsupported configuration.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
  2021-07-07 14:33                     ` Burakov, Anatoly
  2021-07-07 15:00                       ` Bruce Richardson
@ 2021-07-07 15:09                       ` Ding, Xuan
  1 sibling, 0 replies; 25+ messages in thread
From: Ding, Xuan @ 2021-07-07 15:09 UTC (permalink / raw)
  To: Burakov, Anatoly, Maxime Coquelin, Xia, Chenbo, Thomas Monjalon,
	David Marchand
  Cc: dev, Hu, Jiayu, Pai G, Sunil, Richardson, Bruce, Van Haaren,
	Harry, Liu, Yong, Ma, WenwuX



> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> Sent: Wednesday, July 7, 2021 10:34 PM
> To: Ding, Xuan <xuan.ding@intel.com>; Maxime Coquelin
> <maxime.coquelin@redhat.com>; Xia, Chenbo <chenbo.xia@intel.com>;
> Thomas Monjalon <thomas@monjalon.net>; David Marchand
> <david.marchand@redhat.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>;
> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost
> 
> On 07-Jul-21 1:54 PM, Ding, Xuan wrote:
> > Hi Anatoly,
> >
> >> -----Original Message-----
> >> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> >> Sent: Wednesday, July 7, 2021 8:18 PM
> >> To: Ding, Xuan <xuan.ding@intel.com>; Maxime Coquelin
> >> <maxime.coquelin@redhat.com>; Xia, Chenbo <chenbo.xia@intel.com>;
> >> Thomas Monjalon <thomas@monjalon.net>; David Marchand
> >> <david.marchand@redhat.com>
> >> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> >> <sunil.pai.g@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Van
> >> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> <yong.liu@intel.com>;
> >> Ma, WenwuX <wenwux.ma@intel.com>
> >> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async
> vhost
> >>
> >> On 07-Jul-21 7:25 AM, Ding, Xuan wrote:
> >>> Hi,
> >>>
> >>>> -----Original Message-----
> >>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>> Sent: Tuesday, July 6, 2021 5:32 PM
> >>>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> >>>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>; Thomas
> >>>> Monjalon <thomas@monjalon.net>; David Marchand
> >>>> <david.marchand@redhat.com>
> >>>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> >>>> <sunil.pai.g@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>;
> >> Van
> >>>> Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> >> <yong.liu@intel.com>;
> >>>> Ma, WenwuX <wenwux.ma@intel.com>
> >>>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async
> vhost
> >>>>
> >>>>
> >>>>
> >>>> On 7/6/21 11:16 AM, Burakov, Anatoly wrote:
> >>>>> On 06-Jul-21 9:31 AM, Ding, Xuan wrote:
> >>>>>> Hi Maxime,
> >>>>>>
> >>>>>>> -----Original Message-----
> >>>>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>>>>> Sent: Monday, July 5, 2021 8:46 PM
> >>>>>>> To: Burakov, Anatoly <anatoly.burakov@intel.com>; Ding, Xuan
> >>>>>>> <xuan.ding@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>;
> Thomas
> >>>>>>> Monjalon <thomas@monjalon.net>; David Marchand
> >>>>>>> <david.marchand@redhat.com>
> >>>>>>> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Pai G, Sunil
> >>>>>>> <sunil.pai.g@intel.com>; Richardson, Bruce
> >> <bruce.richardson@intel.com>;
> >>>>>>> Van Haaren, Harry <harry.van.haaren@intel.com>; Liu, Yong
> >>>>>>> <yong.liu@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>
> >>>>>>> Subject: Re: [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for
> async
> >>>>>>> vhost
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> On 7/5/21 2:16 PM, Burakov, Anatoly wrote:
> >>>>>>>> On 05-Jul-21 9:40 AM, Xuan Ding wrote:
> >>>>>>>>> The use of IOMMU has many advantages, such as isolation and
> address
> >>>>>>>>> translation. This patch extends the capbility of DMA engine to use
> >>>>>>>>> IOMMU if the DMA device is bound to vfio.
> >>>>>>>>>
> >>>>>>>>> When set memory table, the guest memory will be mapped
> >>>>>>>>> into the default container of DPDK.
> >>>>>>>>>
> >>>>>>>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> >>>>>>>>> ---
> >>>>>>>>>      doc/guides/prog_guide/vhost_lib.rst |  9 ++++++
> >>>>>>>>>      lib/vhost/rte_vhost.h               |  1 +
> >>>>>>>>>      lib/vhost/socket.c                  |  9 ++++++
> >>>>>>>>>      lib/vhost/vhost.h                   |  1 +
> >>>>>>>>>      lib/vhost/vhost_user.c              | 46
> >>>>>>>>> ++++++++++++++++++++++++++++-
> >>>>>>>>>      5 files changed, 65 insertions(+), 1 deletion(-)
> >>>>>>>>>
> >>>>>>>>> diff --git a/doc/guides/prog_guide/vhost_lib.rst
> >>>>>>>>> b/doc/guides/prog_guide/vhost_lib.rst
> >>>>>>>>> index 05c42c9b11..c3beda23d9 100644
> >>>>>>>>> --- a/doc/guides/prog_guide/vhost_lib.rst
> >>>>>>>>> +++ b/doc/guides/prog_guide/vhost_lib.rst
> >>>>>>>>> @@ -118,6 +118,15 @@ The following is an overview of some
> key
> >> Vhost
> >>>>>>>>> API functions:
> >>>>>>>>>            It is disabled by default.
> >>>>>>>>>      +  - ``RTE_VHOST_USER_ASYNC_USE_VFIO``
> >>>>>>>>> +
> >>>>>>>>> +    In asynchronous data path, vhost liarary is not aware of which
> >>>>>>>>> driver
> >>>>>>>>> +    (igb_uio/vfio) the DMA device is bound to. Application should
> >>>>>>>>> pass
> >>>>>>>>> +    this flag to tell vhost library whether IOMMU should be
> >>>>>>>>> programmed
> >>>>>>>>> +    for guest memory.
> >>>>>>>>> +
> >>>>>>>>> +    It is disabled by default.
> >>>>>>>>> +
> >>>>>>>>>        - ``RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS``
> >>>>>>>>>            Since v16.04, the vhost library forwards checksum and gso
> >>>>>>>>> requests for
> >>>>>>>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> >>>>>>>>> index 8d875e9322..a766ea7b6b 100644
> >>>>>>>>> --- a/lib/vhost/rte_vhost.h
> >>>>>>>>> +++ b/lib/vhost/rte_vhost.h
> >>>>>>>>> @@ -37,6 +37,7 @@ extern "C" {
> >>>>>>>>>      #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
> >>>>>>>>>      #define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
> >>>>>>>>>      #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS
> (1ULL <<
> >> 8)
> >>>>>>>>> +#define RTE_VHOST_USER_ASYNC_USE_VFIO    (1ULL << 9)
> >>>>>>>>>        /* Features. */
> >>>>>>>>>      #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
> >>>>>>>>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
> >>>>>>>>> index 5d0d728d52..77c722c86b 100644
> >>>>>>>>> --- a/lib/vhost/socket.c
> >>>>>>>>> +++ b/lib/vhost/socket.c
> >>>>>>>>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
> >>>>>>>>>          bool extbuf;
> >>>>>>>>>          bool linearbuf;
> >>>>>>>>>          bool async_copy;
> >>>>>>>>> +    bool async_use_vfio;
> >>>>>>>>>          bool net_compliant_ol_flags;
> >>>>>>>>>            /*
> >>>>>>>>> @@ -243,6 +244,13 @@ vhost_user_add_connection(int fd,
> struct
> >>>>>>>>> vhost_user_socket *vsocket)
> >>>>>>>>>                  dev->async_copy = 1;
> >>>>>>>>>          }
> >>>>>>>>>      +    if (vsocket->async_use_vfio) {
> >>>>>>>>> +        dev = get_device(vid);
> >>>>>>>>> +
> >>>>>>>>> +        if (dev)
> >>>>>>>>> +            dev->async_use_vfio = 1;
> >>>>>>>>> +    }
> >>>>>>>>> +
> >>>>>>>>>          VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n",
> vid);
> >>>>>>>>>            if (vsocket->notify_ops->new_connection) {
> >>>>>>>>> @@ -879,6 +887,7 @@ rte_vhost_driver_register(const char
> *path,
> >>>>>>>>> uint64_t flags)
> >>>>>>>>>          vsocket->extbuf = flags &
> RTE_VHOST_USER_EXTBUF_SUPPORT;
> >>>>>>>>>          vsocket->linearbuf = flags &
> >>>> RTE_VHOST_USER_LINEARBUF_SUPPORT;
> >>>>>>>>>          vsocket->async_copy = flags &
> RTE_VHOST_USER_ASYNC_COPY;
> >>>>>>>>> +    vsocket->async_use_vfio = flags &
> >>>>>>> RTE_VHOST_USER_ASYNC_USE_VFIO;
> >>>>>>>>>          vsocket->net_compliant_ol_flags = flags &
> >>>>>>>>> RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
> >>>>>>>>>            if (vsocket->async_copy &&
> >>>>>>>>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> >>>>>>>>> index 8078ddff79..fb775ce4ed 100644
> >>>>>>>>> --- a/lib/vhost/vhost.h
> >>>>>>>>> +++ b/lib/vhost/vhost.h
> >>>>>>>>> @@ -370,6 +370,7 @@ struct virtio_net {
> >>>>>>>>>          int16_t            broadcast_rarp;
> >>>>>>>>>          uint32_t        nr_vring;
> >>>>>>>>>          int            async_copy;
> >>>>>>>>> +    int            async_use_vfio;
> >>>>>>>>>          int            extbuf;
> >>>>>>>>>          int            linearbuf;
> >>>>>>>>>          struct vhost_virtqueue
> *virtqueue[VHOST_MAX_QUEUE_PAIRS *
> >>>>>>>>> 2];
> >>>>>>>>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> >>>>>>>>> index 8f0eba6412..f3703f2e72 100644
> >>>>>>>>> --- a/lib/vhost/vhost_user.c
> >>>>>>>>> +++ b/lib/vhost/vhost_user.c
> >>>>>>>>> @@ -45,6 +45,7 @@
> >>>>>>>>>      #include <rte_common.h>
> >>>>>>>>>      #include <rte_malloc.h>
> >>>>>>>>>      #include <rte_log.h>
> >>>>>>>>> +#include <rte_vfio.h>
> >>>>>>>>>        #include "iotlb.h"
> >>>>>>>>>      #include "vhost.h"
> >>>>>>>>> @@ -141,6 +142,36 @@ get_blk_size(int fd)
> >>>>>>>>>          return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
> >>>>>>>>>      }
> >>>>>>>>>      +static int
> >>>>>>>>> +async_dma_map(struct rte_vhost_mem_region *region, bool
> do_map)
> >>>>>>>>> +{
> >>>>>>>>> +    int ret = 0;
> >>>>>>>>> +    uint64_t host_iova;
> >>>>>>>>> +    host_iova = rte_mem_virt2iova((void
> >>>>>>>>> *)(uintptr_t)region->host_user_addr);
> >>>>>>>>> +    if (do_map) {
> >>>>>>>>> +        /* Add mapped region into the default container of DPDK.
> */
> >>>>>>>>> +        ret =
> >>>>>>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >>>>>>>>> +                         region->host_user_addr,
> >>>>>>>>> +                         host_iova,
> >>>>>>>>> +                         region->size);
> >>>>>>>>> +        if (ret) {
> >>>>>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
> >>>>>>>>> +            return ret;
> >>>>>>>>> +        }
> >>>>>>>>> +    } else {
> >>>>>>>>> +        /* Remove mapped region from the default container of
> >>>>>>>>> DPDK. */
> >>>>>>>>> +        ret =
> >>>>>>>>>
> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >>>>>>>>> +                           region->host_user_addr,
> >>>>>>>>> +                           host_iova,
> >>>>>>>>> +                           region->size);
> >>>>>>>>> +        if (ret) {
> >>>>>>>>> +            VHOST_LOG_CONFIG(ERR, "DMA engine unmap
> failed\n");
> >>>>>>>>> +            return ret;
> >>>>>>>>> +        }
> >>>>>>>>> +    }
> >>>>>>>>> +    return ret;
> >>>>>>>>> +}
> >>>>>>>>
> >>>>>>>> We've been discussing this off list with Xuan, and unfortunately
> >>>>>>>> this is
> >>>>>>>> a blocker for now.
> >>>>>>>>
> >>>>>>>> Currently, the x86 IOMMU does not support partial unmap - the
> >> segments
> >>>>>>>> have to be unmapped exactly the same addr/len as they were
> mapped.
> >> We
> >>>>>>>> also concatenate adjacent mappings to prevent filling up the DMA
> >>>>>>>> mapping
> >>>>>>>> entry table with superfluous entries.
> >>>>>>>>
> >>>>>>>> This means that, when two unrelated mappings are contiguous in
> >> memory
> >>>>>>>> (e.g. if you map regions 1 and 2 independently, but they happen to
> be
> >>>>>>>> sitting right next to each other in virtual memory), we cannot later
> >>>>>>>> unmap one of them because, even though these are two separate
> >>>>>>> mappings
> >>>>>>>> as far as kernel VFIO infrastructure is concerned, the mapping gets
> >>>>>>>> compacted and looks like one single mapping to VFIO, so DPDK API
> will
> >>>>>>>> not let us unmap region 1 without also unmapping region 2.
> >>>>>>>>
> >>>>>>>> The proper fix for this problem would be to always map memory
> >>>>>>>> page-by-page regardless of where it comes from (we already do
> that for
> >>>>>>>> internal memory, but not for external). However, the reason this
> works
> >>>>>>>> for internal memory is because when mapping internal memory
> >> segments,
> >>>>>>>> *we know the page size*. For external memory segments, there is
> no
> >> such
> >>>>>>>> guarantee, so we cannot deduce page size for a given memory
> segment,
> >>>>>>> and
> >>>>>>>> thus can't map things page-by-page.
> >>>>>>>>
> >>>>>>>> So, the proper fix for it would be to add page size to the VFIO DMA
> >>>>>>>> API.
> >>>>>>>> Unfortunately, it probably has to wait until 21.11 because it is an
> API
> >>>>>>>> change.
> >>>>>>>>
> >>>>>>>> The slightly hacky fix for this would be to forego user mem map
> >>>>>>>> concatenation and trust that user is not going to do anything
> stupid,
> >>>>>>>> and will not spam the VFIO DMA API without reason. I would
> rather
> >>>>>>>> not go
> >>>>>>>> down this road, but this could be an option in this case.
> >>>>>>>>
> >>>>>>>> Thoughts?
> >>>>>>>>
> >>>>>>>
> >>>>>>> Thanks Anatoly for the detailed description of the issue.
> >>>>>>> It may be possible to either create a versioned symbol for this API
> >>>>>>> change, or maybe even to have a temporary internal API.
> >>>>>>>
> >>>>>>> But I think this series in its current form is not acceptable, so
> >>>>>>> waiting for v21.11 would be the best option (we may want to send
> the
> >>>>>>> deprecation notice in this release though).
> >>>>>>>
> >>>>>>> In this series, I don't like the user application has to pass a flag to
> >>>>>>> state whether the DMA engine uses VFIO or not. AFAICT, this new
> revision
> >>>>>>> does not implement what was discussed in the previous one, i.e.
> >>>>>>> supporting both IOVA_AS_VA and IOVA_AS_PA.
> >>>>>>
> >>>>>> Thanks for your comments. Here I hope to explain some questions:
> >>>>>> 1. Whether both IOVA_AS_VA and IOVA_AS_PA are supported now?
> >>>>>> A: Both IOVA_AS_PA and IOVA_AS_VA are supported now. In this
> version,
> >> the
> >>>>>> virtual address is replaced with iova address of mapped region, and
> >>>>>> the iova
> >>>>>> address is selected to program the IOMMU instead of virtual address
> only.
> >>>>
> >>>> Good!
> >>>>
> >>>>>>
> >>>>>> 2. Why a flag is chosen to be passed by application?
> >>>>>> A: Yes, as we discussed before, the rte_eal_iova_mode() API can be
> >>>>>> used to
> >>>>>> get the IOVA mode, so as to determine whether IOMMU should be
> >>>> programmed.
> >>>>>> However, in the implementation process, I found a problem. That is
> how to
> >>>>>> distinguish the VFIO PA and IGB_UIO PA. Because for VFIO cases, we
> >> should
> >>>>>> always program the IOMMU. While in IGB_UIO cases, it depends on
> >> IOMMU
> >>>>>> capability of platform.
> >>>>>
> >>>>> How does one program IOMMU with igb_uio? I was under impression
> that
> >>>>> igb_uio (and uio_pci_generic for that matter) does not provide such
> >>>>> facilities.
> >>>>
> >>>> +1
> >>>
> >>> Maybe some misunderstanding in this sentence here.
> >>> In our design, if rte_eal_vfio_is_enabled("vfio") is true, iommu will be
> >> programmed.
> >>> True means vfio module is modprobed.
> >>>
> >>> But there is an exception here, that is, even if vfio module is modprobed,
> >>> DPDK user still bind all the devices to igb_uio.
> >>>
> >>> This situation can be distinguished in DPDK eal initialization, because the
> >> resource mapping
> >>> is according to the driver loaded by each device(rte_pci_map_device).
> >>>
> >>> But in our scenario, this judgment is somewhat weak. Because we cannot
> get
> >>> the device driver info in vhost library. I also think it is unreasonable for
> vhost to
> >>> do this. Only trust that users will not use it like this. Thoughts for this
> scenario?
> >>
> >> I don't see how igb_uio would make any difference at all. If you are
> >> using igb_uio, you *don't have DMA mapping at all* and will use raw
> >> physical addresses. Assuming your code supports this, that's all you're
> >> ever going to get. The point of VFIO is to have memory regions that are
> >> mapped for DMA *because real physical addresses are assumed to be not
> >> available*. When you're using igb_uio, you effectively do have DMA
> >> access to the entire memory, and thus can bypass IOMMU altogether
> >> (assuming you're using passthrough mode).
> >
> > My concern is exactly here.
> > In igb_uio cases, although devices are not added to the default container in
> eal init,
> > but the "IOMMU programming" actually happens when the
> rte_vfio_container_dma_map() is called.
> > It is no harm but it is also unnecessary.
> 
> Yes, it is unnecessary, but it's also not actively harmful, which means
> you can still do it without any regard as to whether you do or don't
> have IOMMU :)
> 
> Think of a hybrid VFIO/igb_uio setup - some NICs will have VFIO, some
> will have igb_uio. The igb_uio-bound NICs will not care if you have
> mapped anything for DMA because they don't go through IOMMU, things
> will
> "just work". The VFIO-bound NICs will get the memory mapped, because
> they are the ones who actually need the DMA mapping.
> 
> So, what you get is, if you do VFIO DMA mapping unconditionally, 1) NICs
> with igb_uio won't care about this, and 2) NICs with VFIO will benefit.
> You're not "mapping" the NICs, you're mapping the memory you're
> accessing with those NICs. You need it to be accessible to both, but
> since you have no way of knowing whether 1) any of the current HW needs
> VFIO, and 2) any of *future hotplugged* HW needs VFIO, the easiest way
> to solve this problem is just to map things regardless, and live with
> the "unnecessary" but harmless mapping in the worst case.

I get your point! It is just that this worst case bothers me.
I have been thinking about how to avoid programming the IOMMU in the igb_uio case, but I cannot tell the cases apart with a simple check inside the vhost library.

Since the extra mapping is harmless in that case, and on a platform without an IOMMU the call simply does nothing useful, I think programming the IOMMU unconditionally works.

> 
> >
> >>
> >> Bottom line: do VFIO DMA mapping unconditionally. If VFIO is active -
> >> great, the memory will be DMA mapped. If it's not active - no harm will
> >> ever be done by mapping the memory for DMA anyway.
> >
> > Do VFIO DMA mapping unconditionally, do you mean the
> rte_eal_vfio_is_enabled() is unnecessary?
> > What if the platform does not have IOMMU?
> >
> > Thanks very much.
> >
> 
> If the platform has no IOMMU, the API call will just not do anything
> useful, so no harm done.

So the only thing remaining is the API change for page-by-page mapping
in the next release.
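
Just to confirm we mean the same thing, the rough idea would be something
like below. This is a hypothetical sketch only -- the helper name and the
pg_sz parameter do not exist today, which is exactly the gap; looping over
the current rte_vfio_container_dma_map() is not enough by itself, since EAL
still concatenates adjacent user mappings internally:

static int
dma_map_region_by_page(uint64_t vaddr, uint64_t iova, uint64_t len,
		uint64_t pg_sz)
{
	uint64_t off;

	/* With a known page size, an external region can be mapped page by
	 * page, so single pages can later be unmapped without hitting the
	 * partial-unmap limitation. */
	for (off = 0; off < len; off += pg_sz) {
		if (rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
				vaddr + off, iova + off, pg_sz) < 0)
			return -1;
	}

	return 0;
}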

Thanks,
Xuan

> 
> > Regards,
> > Xuan
> >
> >>
> >> --
> >> Thanks,
> >> Anatoly
> 
> 
> --
> Thanks,
> Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2021-07-07 15:09 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-31 15:06 [dpdk-dev] [PATCH v1] lib/vhost: enable IOMMU for async vhost xuan.ding
2021-06-02 14:26 ` [dpdk-dev] [PATCH v2] " xuan.ding
2021-06-03 17:30 ` [dpdk-dev] [PATCH v3] vhost: " xuan.ding
2021-06-18 16:17   ` Maxime Coquelin
2021-06-21  3:57     ` Hu, Jiayu
2021-06-22  6:18     ` Ding, Xuan
2021-06-29  9:23       ` Maxime Coquelin
2021-07-01  5:12         ` Ding, Xuan
2021-07-05  8:19 ` [dpdk-dev] [PATCH v4 0/2] vhost: add IOMMU support in async data path Xuan Ding
2021-07-05  8:19   ` [dpdk-dev] [PATCH v4 1/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-07-05  8:19   ` [dpdk-dev] [PATCH v4 2/2] example/vhost: add dma vfio parsing Xuan Ding
2021-07-05  8:40 ` [dpdk-dev] [PATCH v5 0/2] vhost: add IOMMU support in async data path Xuan Ding
2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 1/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-07-05 12:16     ` Burakov, Anatoly
2021-07-05 12:45       ` Maxime Coquelin
2021-07-06  8:31         ` Ding, Xuan
2021-07-06  9:16           ` Burakov, Anatoly
2021-07-06  9:32             ` Maxime Coquelin
2021-07-07  6:25               ` Ding, Xuan
2021-07-07 12:17                 ` Burakov, Anatoly
2021-07-07 12:54                   ` Ding, Xuan
2021-07-07 14:33                     ` Burakov, Anatoly
2021-07-07 15:00                       ` Bruce Richardson
2021-07-07 15:09                       ` Ding, Xuan
2021-07-05  8:40   ` [dpdk-dev] [PATCH v5 2/2] example/vhost: add dma vfio parsing Xuan Ding
