* [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device ***
@ 2021-09-01 5:30 Xuan Ding
2021-09-01 5:30 ` [dpdk-dev] [PATCH 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
` (7 more replies)
0 siblings, 8 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-01 5:30 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, bruce.richardson, sunil.pai.g, Xuan Ding
This series adds support for DMA device to use vfio.
The first patch is to extend current vfio dma mapping API to
allow partial unmapping for adjacent memory if the
platform does not support partial unmapping.
Xuan Ding (2):
vfio: allow partially unmapping adjacent memory
vhost: enable IOMMU for async vhost
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
lib/vhost/vhost_user.c | 46 +++++-
2 files changed, 273 insertions(+), 111 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH 1/2] vfio: allow partially unmapping adjacent memory
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
@ 2021-09-01 5:30 ` Xuan Ding
2021-09-01 5:30 ` [dpdk-dev] [PATCH 2/2] vhost: enable IOMMU for async vhost Xuan Ding
` (6 subsequent siblings)
7 siblings, 0 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-01 5:30 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, bruce.richardson, sunil.pai.g, Xuan Ding
Currently, if we map a memory area A, then map a separate memory area B
that by coincidence happens to be adjacent to A, current implementation
will merge these two segments into one, and if partial unmapping is not
supported, these segments will then be only allowed to be unmapped in
one go. In other words, given segments A and B that are adjacent, it
is currently not possible to map A, then map B, then unmap A.
Fix this by adding a notion of "chunk size", which will allow
subdividing segments into equally sized segments whenever we are dealing
with an IOMMU that does not support partial unmapping. With this change,
we will still be able to merge adjacent segments, but only if they are
of the same size. If we keep with our above example, adjacent segments A
and B will be stored as separate segments if they are of different
sizes.
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
1 file changed, 228 insertions(+), 110 deletions(-)
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 25add2fa5d..657c89ca58 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -31,9 +31,10 @@
*/
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
- uint64_t addr;
- uint64_t iova;
- uint64_t len;
+ uint64_t addr; /**< start VA */
+ uint64_t iova; /**< start IOVA */
+ uint64_t len; /**< total length of the mapping */
+ uint64_t chunk; /**< this mapping can be split in chunks of this size */
};
struct user_mem_maps {
@@ -95,7 +96,8 @@ static const struct vfio_iommu_type iommu_types[] = {
static int
is_null_map(const struct user_mem_map *map)
{
- return map->addr == 0 && map->iova == 0 && map->len == 0;
+ return map->addr == 0 && map->iova == 0 &&
+ map->len == 0 && map->chunk == 0;
}
/* we may need to merge user mem maps together in case of user mapping/unmapping
@@ -129,41 +131,90 @@ user_mem_map_cmp(const void *a, const void *b)
if (umm_a->len > umm_b->len)
return 1;
+ if (umm_a->chunk < umm_b->chunk)
+ return -1;
+ if (umm_a->chunk > umm_b->chunk)
+ return 1;
+
return 0;
}
-/* adjust user map entry. this may result in shortening of existing map, or in
- * splitting existing map in two pieces.
+/*
+ * Take in an address range and list of current mappings, and produce a list of
+ * mappings that will be kept.
*/
+static int
+process_maps(struct user_mem_map *src, size_t src_len,
+ struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
+{
+ struct user_mem_map *src_first = &src[0];
+ struct user_mem_map *src_last = &src[src_len - 1];
+ struct user_mem_map *dst_first = &newmap[0];
+ /* we can get at most two new segments */
+ struct user_mem_map *dst_last = &newmap[1];
+ uint64_t first_off = vaddr - src_first->addr;
+ uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
+ int newmap_len = 0;
+
+ if (first_off != 0) {
+ dst_first->addr = src_first->addr;
+ dst_first->iova = src_first->iova;
+ dst_first->len = first_off;
+ dst_first->chunk = src_first->chunk;
+
+ newmap_len++;
+ }
+ if (last_off != 0) {
+ /* if we had start offset, we have two segments */
+ struct user_mem_map *last =
+ first_off == 0 ? dst_first : dst_last;
+ last->addr = (src_last->addr + src_last->len) - last_off;
+ last->iova = (src_last->iova + src_last->len) - last_off;
+ last->len = last_off;
+ last->chunk = src_last->chunk;
+
+ newmap_len++;
+ }
+ return newmap_len;
+}
+
+/* erase certain maps from the list */
static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
- uint64_t remove_va_start, uint64_t remove_len)
-{
- /* if va start is same as start address, we're simply moving start */
- if (remove_va_start == src->addr) {
- src->addr += remove_len;
- src->iova += remove_len;
- src->len -= remove_len;
- } else if (remove_va_start + remove_len == src->addr + src->len) {
- /* we're shrinking mapping from the end */
- src->len -= remove_len;
- } else {
- /* we're blowing a hole in the middle */
- struct user_mem_map tmp;
- uint64_t total_len = src->len;
+delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
+ size_t n_del)
+{
+ int i;
+ size_t j;
+
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &del_maps[j];
- /* adjust source segment length */
- src->len = remove_va_start - src->addr;
+ if (user_mem_map_cmp(left, right) == 0) {
+ memset(left, 0, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps--;
+ }
+ }
+}
+
+static void
+copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
+ size_t n_add)
+{
+ int i;
+ size_t j;
- /* create temporary segment in the middle */
- tmp.addr = src->addr + src->len;
- tmp.iova = src->iova + src->len;
- tmp.len = remove_len;
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &add_maps[j];
- /* populate end segment - this one we will be keeping */
- end->addr = tmp.addr + tmp.len;
- end->iova = tmp.iova + tmp.len;
- end->len = total_len - src->len - tmp.len;
+ /* insert into empty space */
+ if (is_null_map(left)) {
+ memcpy(left, right, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps++;
+ }
}
}
@@ -179,7 +230,8 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 0;
if (left->iova + left->len != right->iova)
return 0;
-
+ if (left->chunk != right->chunk)
+ return 0;
left->len += right->len;
out:
@@ -188,51 +240,94 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 1;
}
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
- uint64_t iova, uint64_t len)
+static bool
+addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
+ uint64_t vaddr, uint64_t iova)
+{
+ unsigned int i;
+
+ for (i = 0; i < n_maps; i++) {
+ struct user_mem_map *map = &maps[i];
+ uint64_t map_va_end = map->addr + map->len;
+ uint64_t map_iova_end = map->iova + map->len;
+ uint64_t map_va_off = vaddr - map->addr;
+ uint64_t map_iova_off = iova - map->iova;
+
+ /* we include end of the segment in comparison as well */
+ bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
+ bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
+ /* chunk may not be power of two, so use modulo */
+ bool addr_is_aligned = (map_va_off % map->chunk) == 0;
+ bool iova_is_aligned = (map_iova_off % map->chunk) == 0;
+
+ if (addr_in_map && iova_in_map &&
+ addr_is_aligned && iova_is_aligned)
+ return true;
+ }
+ return false;
+}
+
+static int
+find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
+ uint64_t iova, uint64_t len, struct user_mem_map *dst,
+ size_t dst_len)
{
uint64_t va_end = addr + len;
uint64_t iova_end = iova + len;
- int i;
+ bool found = false;
+ size_t j;
+ int i, ret;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
+ for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
struct user_mem_map *map = &user_mem_maps->maps[i];
uint64_t map_va_end = map->addr + map->len;
uint64_t map_iova_end = map->iova + map->len;
- /* check start VA */
- if (addr < map->addr || addr >= map_va_end)
- continue;
- /* check if VA end is within boundaries */
- if (va_end <= map->addr || va_end > map_va_end)
- continue;
-
- /* check start IOVA */
- if (iova < map->iova || iova >= map_iova_end)
- continue;
- /* check if IOVA end is within boundaries */
- if (iova_end <= map->iova || iova_end > map_iova_end)
- continue;
-
- /* we've found our map */
- return map;
+ bool start_addr_in_map = (addr >= map->addr) &&
+ (addr < map_va_end);
+ bool end_addr_in_map = (va_end > map->addr) &&
+ (va_end <= map_va_end);
+ bool start_iova_in_map = (iova >= map->iova) &&
+ (iova < map_iova_end);
+ bool end_iova_in_map = (iova_end > map->iova) &&
+ (iova_end <= map_iova_end);
+
+ /* do we have space in temporary map? */
+ if (j == dst_len) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ /* check if current map is start of our segment */
+ if (!found && start_addr_in_map && start_iova_in_map)
+ found = true;
+ /* if we have previously found a segment, add it to the map */
+ if (found) {
+ /* copy the segment into our temporary map */
+ memcpy(&dst[j++], map, sizeof(*map));
+
+ /* if we match end of segment, quit */
+ if (end_addr_in_map && end_iova_in_map)
+ return j;
+ }
}
- return NULL;
+ /* we didn't find anything */
+ ret = -ENOENT;
+err:
+ memset(dst, 0, sizeof(*dst) * dst_len);
+ return ret;
}
/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
- int i, n_merged, cur_idx;
+ int i;
- qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
/* we'll go over the list backwards when merging */
- n_merged = 0;
- for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+ for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
struct user_mem_map *l, *r;
l = &user_mem_maps->maps[i];
@@ -241,30 +336,16 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
if (is_null_map(l) || is_null_map(r))
continue;
+ /* try and merge the maps */
if (merge_map(l, r))
- n_merged++;
+ user_mem_maps->n_maps--;
}
/* the entries are still sorted, but now they have holes in them, so
- * walk through the list and remove the holes
+ * sort the list again.
*/
- if (n_merged > 0) {
- cur_idx = 0;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- if (!is_null_map(&user_mem_maps->maps[i])) {
- struct user_mem_map *src, *dst;
-
- src = &user_mem_maps->maps[i];
- dst = &user_mem_maps->maps[cur_idx++];
-
- if (src != dst) {
- memcpy(dst, src, sizeof(*src));
- memset(src, 0, sizeof(*src));
- }
- }
- }
- user_mem_maps->n_maps = cur_idx;
- }
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
+ sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
static int
@@ -1795,6 +1876,7 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
{
struct user_mem_map *new_map;
struct user_mem_maps *user_mem_maps;
+ bool has_partial_unmap;
int ret = 0;
user_mem_maps = &vfio_cfg->mem_maps;
@@ -1818,11 +1900,16 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
ret = -1;
goto out;
}
+ /* do we have partial unmap support? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
/* create new user mem map entry */
new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
new_map->addr = vaddr;
new_map->iova = iova;
new_map->len = len;
+ /* for IOMMU types supporting partial unmap, we don't need chunking */
+ new_map->chunk = has_partial_unmap ? 0 : len;
compact_user_maps(user_mem_maps);
out:
@@ -1834,38 +1921,81 @@ static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
- struct user_mem_map *map, *new_map = NULL;
+ struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
+ struct user_mem_map new_maps[2]; /* can be at most 2 */
struct user_mem_maps *user_mem_maps;
- int ret = 0;
+ int n_orig, n_new, newlen, ret = 0;
+ bool has_partial_unmap;
user_mem_maps = &vfio_cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
- /* find our mapping */
- map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
- if (!map) {
+ /*
+ * Previously, we had adjacent mappings entirely contained within one
+ * mapping entry. Since we now store original mapping length in some
+ * cases, this is no longer the case, so unmapping can potentially go
+ * over multiple segments and split them in any number of ways.
+ *
+ * To complicate things further, some IOMMU types support arbitrary
+ * partial unmapping, while others will only support unmapping along the
+ * chunk size, so there are a lot of cases we need to handle. To make
+ * things easier code wise, instead of trying to adjust existing
+ * mappings, let's just rebuild them using information we have.
+ */
+
+ /* do we have partial unmap capability? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
+ /*
+ * first thing to do is check if there exists a mapping that includes
+ * the start and the end of our requested unmap. We need to collect all
+ * maps that include our unmapped region.
+ */
+ n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
+ orig_maps, RTE_DIM(orig_maps));
+ /* did we find anything? */
+ if (n_orig < 0) {
RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
rte_errno = EINVAL;
ret = -1;
goto out;
}
- if (map->addr != vaddr || map->iova != iova || map->len != len) {
- /* we're partially unmapping a previously mapped region, so we
- * need to split entry into two.
- */
- if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
+
+ /*
+ * if we don't support partial unmap, we must check if start and end of
+ * current unmap region are chunk-aligned.
+ */
+ if (!has_partial_unmap) {
+ bool start_aligned, end_aligned;
+
+ start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr, iova);
+ end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr + len, iova + len);
+
+ if (!start_aligned || !end_aligned) {
RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
rte_errno = ENOTSUP;
ret = -1;
goto out;
}
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ }
+
+ /*
+ * now we know we can potentially unmap the region, but we still have to
+ * figure out if there is enough space in our list to store remaining
+ * maps. for this, we will figure out how many segments we are going to
+ * remove, and how many new segments we are going to create.
+ */
+ n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);
+
+ /* can we store the new maps in our list? */
+ newlen = (user_mem_maps->n_maps - n_orig) + n_new;
+ if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
/* unmap the entry */
@@ -1886,23 +2016,11 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
}
}
- /* remove map from the list of active mappings */
- if (new_map != NULL) {
- adjust_map(map, new_map, vaddr, len);
-
- /* if we've created a new map by splitting, sort everything */
- if (!is_null_map(new_map)) {
- compact_user_maps(user_mem_maps);
- } else {
- /* we've created a new mapping, but it was unused */
- user_mem_maps->n_maps--;
- }
- } else {
- memset(map, 0, sizeof(*map));
- compact_user_maps(user_mem_maps);
- user_mem_maps->n_maps--;
- }
+ /* we have unmapped the region, so now update the maps */
+ delete_maps(user_mem_maps, orig_maps, n_orig);
+ copy_maps(user_mem_maps, new_maps, n_new);
+ compact_user_maps(user_mem_maps);
out:
rte_spinlock_recursive_unlock(&user_mem_maps->lock);
return ret;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH 2/2] vhost: enable IOMMU for async vhost
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
2021-09-01 5:30 ` [dpdk-dev] [PATCH 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
@ 2021-09-01 5:30 ` Xuan Ding
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 0/2] support IOMMU for DMA device Xuan Ding
` (5 subsequent siblings)
7 siblings, 0 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-01 5:30 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, bruce.richardson, sunil.pai.g, Xuan Ding
The use of IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of DMA engine to use
IOMMU if the DMA device is bound to vfio.
When the memory table is set, the guest memory will be mapped
into the default container of DPDK.
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/vhost/vhost_user.c | 46 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 45 insertions(+), 1 deletion(-)
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 031c578e54..48617fc708 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,7 @@
#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
+#include <rte_vfio.h>
#include "iotlb.h"
#include "vhost.h"
@@ -141,6 +142,36 @@ get_blk_size(int fd)
return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
+{
+ int ret = 0;
+ uint64_t host_iova;
+ host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+ if (do_map) {
+ /* Add mapped region into the default container of DPDK. */
+ ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+ return ret;
+ }
+ } else {
+ /* Remove mapped region from the default container of DPDK. */
+ ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+ return ret;
+ }
+ }
+ return ret;
+}
+
static void
free_mem_region(struct virtio_net *dev)
{
@@ -153,6 +184,9 @@ free_mem_region(struct virtio_net *dev)
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
if (reg->host_user_addr) {
+ if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+ async_dma_map(reg, false);
+
munmap(reg->mmap_addr, reg->mmap_size);
close(reg->fd);
}
@@ -1157,6 +1191,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
uint64_t mmap_size;
uint64_t alignment;
int populate;
+ int ret;
/* Check for memory_size + mmap_offset overflow */
if (mmap_offset >= -region->size) {
@@ -1210,13 +1245,22 @@ vhost_user_mmap_region(struct virtio_net *dev,
region->mmap_size = mmap_size;
region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
- if (dev->async_copy)
+ if (dev->async_copy) {
if (add_guest_pages(dev, region, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region failed.\n");
return -1;
}
+ if (rte_vfio_is_enabled("vfio")) {
+ ret = async_dma_map(region, true);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed\n");
+ return -1;
+ }
+ }
+ }
+
VHOST_LOG_CONFIG(INFO,
"guest memory region size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v2 0/2] support IOMMU for DMA device
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
2021-09-01 5:30 ` [dpdk-dev] [PATCH 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-01 5:30 ` [dpdk-dev] [PATCH 2/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-09-17 5:25 ` Xuan Ding
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 0/2] support IOMMU for DMA device Xuan Ding
` (4 subsequent siblings)
7 siblings, 2 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-17 5:25 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
This series supports DMA device to use vfio in async vhost.
The first patch extends the capability of current vfio dma mapping
API to allow partial unmapping for adjacent memory if the platform
does not support partial unmapping. The second patch involves the
IOMMU programming for guest memory in async vhost.
v2:
* Added rte_errno filtering for some devices bound in the kernel driver.
* Added a flag to check the status of region mapping.
* Fixed one typo.
Xuan Ding (2):
vfio: allow partially unmapping adjacent memory
vhost: enable IOMMU for async vhost
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
lib/vhost/rte_vhost.h | 1 +
lib/vhost/vhost_user.c | 57 ++++++-
3 files changed, 285 insertions(+), 111 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v2 1/2] vfio: allow partially unmapping adjacent memory
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-17 5:25 ` Xuan Ding
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost Xuan Ding
1 sibling, 0 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-17 5:25 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
Currently, if we map a memory area A, then map a separate memory area B
that by coincidence happens to be adjacent to A, current implementation
will merge these two segments into one, and if partial unmapping is not
supported, these segments will then be only allowed to be unmapped in
one go. In other words, given segments A and B that are adjacent, it
is currently not possible to map A, then map B, then unmap A.
Fix this by adding a notion of "chunk size", which will allow
subdividing segments into equally sized segments whenever we are dealing
with an IOMMU that does not support partial unmapping. With this change,
we will still be able to merge adjacent segments, but only if they are
of the same size. If we keep with our above example, adjacent segments A
and B will be stored as separate segments if they are of different
sizes.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
1 file changed, 228 insertions(+), 110 deletions(-)
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 25add2fa5d..657c89ca58 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -31,9 +31,10 @@
*/
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
- uint64_t addr;
- uint64_t iova;
- uint64_t len;
+ uint64_t addr; /**< start VA */
+ uint64_t iova; /**< start IOVA */
+ uint64_t len; /**< total length of the mapping */
+ uint64_t chunk; /**< this mapping can be split in chunks of this size */
};
struct user_mem_maps {
@@ -95,7 +96,8 @@ static const struct vfio_iommu_type iommu_types[] = {
static int
is_null_map(const struct user_mem_map *map)
{
- return map->addr == 0 && map->iova == 0 && map->len == 0;
+ return map->addr == 0 && map->iova == 0 &&
+ map->len == 0 && map->chunk == 0;
}
/* we may need to merge user mem maps together in case of user mapping/unmapping
@@ -129,41 +131,90 @@ user_mem_map_cmp(const void *a, const void *b)
if (umm_a->len > umm_b->len)
return 1;
+ if (umm_a->chunk < umm_b->chunk)
+ return -1;
+ if (umm_a->chunk > umm_b->chunk)
+ return 1;
+
return 0;
}
-/* adjust user map entry. this may result in shortening of existing map, or in
- * splitting existing map in two pieces.
+/*
+ * Take in an address range and list of current mappings, and produce a list of
+ * mappings that will be kept.
*/
+static int
+process_maps(struct user_mem_map *src, size_t src_len,
+ struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
+{
+ struct user_mem_map *src_first = &src[0];
+ struct user_mem_map *src_last = &src[src_len - 1];
+ struct user_mem_map *dst_first = &newmap[0];
+ /* we can get at most two new segments */
+ struct user_mem_map *dst_last = &newmap[1];
+ uint64_t first_off = vaddr - src_first->addr;
+ uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
+ int newmap_len = 0;
+
+ if (first_off != 0) {
+ dst_first->addr = src_first->addr;
+ dst_first->iova = src_first->iova;
+ dst_first->len = first_off;
+ dst_first->chunk = src_first->chunk;
+
+ newmap_len++;
+ }
+ if (last_off != 0) {
+ /* if we had start offset, we have two segments */
+ struct user_mem_map *last =
+ first_off == 0 ? dst_first : dst_last;
+ last->addr = (src_last->addr + src_last->len) - last_off;
+ last->iova = (src_last->iova + src_last->len) - last_off;
+ last->len = last_off;
+ last->chunk = src_last->chunk;
+
+ newmap_len++;
+ }
+ return newmap_len;
+}
+
+/* erase certain maps from the list */
static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
- uint64_t remove_va_start, uint64_t remove_len)
-{
- /* if va start is same as start address, we're simply moving start */
- if (remove_va_start == src->addr) {
- src->addr += remove_len;
- src->iova += remove_len;
- src->len -= remove_len;
- } else if (remove_va_start + remove_len == src->addr + src->len) {
- /* we're shrinking mapping from the end */
- src->len -= remove_len;
- } else {
- /* we're blowing a hole in the middle */
- struct user_mem_map tmp;
- uint64_t total_len = src->len;
+delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
+ size_t n_del)
+{
+ int i;
+ size_t j;
+
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &del_maps[j];
- /* adjust source segment length */
- src->len = remove_va_start - src->addr;
+ if (user_mem_map_cmp(left, right) == 0) {
+ memset(left, 0, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps--;
+ }
+ }
+}
+
+static void
+copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
+ size_t n_add)
+{
+ int i;
+ size_t j;
- /* create temporary segment in the middle */
- tmp.addr = src->addr + src->len;
- tmp.iova = src->iova + src->len;
- tmp.len = remove_len;
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &add_maps[j];
- /* populate end segment - this one we will be keeping */
- end->addr = tmp.addr + tmp.len;
- end->iova = tmp.iova + tmp.len;
- end->len = total_len - src->len - tmp.len;
+ /* insert into empty space */
+ if (is_null_map(left)) {
+ memcpy(left, right, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps++;
+ }
}
}
@@ -179,7 +230,8 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 0;
if (left->iova + left->len != right->iova)
return 0;
-
+ if (left->chunk != right->chunk)
+ return 0;
left->len += right->len;
out:
@@ -188,51 +240,94 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 1;
}
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
- uint64_t iova, uint64_t len)
+static bool
+addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
+ uint64_t vaddr, uint64_t iova)
+{
+ unsigned int i;
+
+ for (i = 0; i < n_maps; i++) {
+ struct user_mem_map *map = &maps[i];
+ uint64_t map_va_end = map->addr + map->len;
+ uint64_t map_iova_end = map->iova + map->len;
+ uint64_t map_va_off = vaddr - map->addr;
+ uint64_t map_iova_off = iova - map->iova;
+
+ /* we include end of the segment in comparison as well */
+ bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
+ bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
+ /* chunk may not be power of two, so use modulo */
+ bool addr_is_aligned = (map_va_off % map->chunk) == 0;
+ bool iova_is_aligned = (map_iova_off % map->chunk) == 0;
+
+ if (addr_in_map && iova_in_map &&
+ addr_is_aligned && iova_is_aligned)
+ return true;
+ }
+ return false;
+}
+
+static int
+find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
+ uint64_t iova, uint64_t len, struct user_mem_map *dst,
+ size_t dst_len)
{
uint64_t va_end = addr + len;
uint64_t iova_end = iova + len;
- int i;
+ bool found = false;
+ size_t j;
+ int i, ret;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
+ for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
struct user_mem_map *map = &user_mem_maps->maps[i];
uint64_t map_va_end = map->addr + map->len;
uint64_t map_iova_end = map->iova + map->len;
- /* check start VA */
- if (addr < map->addr || addr >= map_va_end)
- continue;
- /* check if VA end is within boundaries */
- if (va_end <= map->addr || va_end > map_va_end)
- continue;
-
- /* check start IOVA */
- if (iova < map->iova || iova >= map_iova_end)
- continue;
- /* check if IOVA end is within boundaries */
- if (iova_end <= map->iova || iova_end > map_iova_end)
- continue;
-
- /* we've found our map */
- return map;
+ bool start_addr_in_map = (addr >= map->addr) &&
+ (addr < map_va_end);
+ bool end_addr_in_map = (va_end > map->addr) &&
+ (va_end <= map_va_end);
+ bool start_iova_in_map = (iova >= map->iova) &&
+ (iova < map_iova_end);
+ bool end_iova_in_map = (iova_end > map->iova) &&
+ (iova_end <= map_iova_end);
+
+ /* do we have space in temporary map? */
+ if (j == dst_len) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ /* check if current map is start of our segment */
+ if (!found && start_addr_in_map && start_iova_in_map)
+ found = true;
+ /* if we have previously found a segment, add it to the map */
+ if (found) {
+ /* copy the segment into our temporary map */
+ memcpy(&dst[j++], map, sizeof(*map));
+
+ /* if we match end of segment, quit */
+ if (end_addr_in_map && end_iova_in_map)
+ return j;
+ }
}
- return NULL;
+ /* we didn't find anything */
+ ret = -ENOENT;
+err:
+ memset(dst, 0, sizeof(*dst) * dst_len);
+ return ret;
}
/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
- int i, n_merged, cur_idx;
+ int i;
- qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
/* we'll go over the list backwards when merging */
- n_merged = 0;
- for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+ for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
struct user_mem_map *l, *r;
l = &user_mem_maps->maps[i];
@@ -241,30 +336,16 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
if (is_null_map(l) || is_null_map(r))
continue;
+ /* try and merge the maps */
if (merge_map(l, r))
- n_merged++;
+ user_mem_maps->n_maps--;
}
/* the entries are still sorted, but now they have holes in them, so
- * walk through the list and remove the holes
+ * sort the list again.
*/
- if (n_merged > 0) {
- cur_idx = 0;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- if (!is_null_map(&user_mem_maps->maps[i])) {
- struct user_mem_map *src, *dst;
-
- src = &user_mem_maps->maps[i];
- dst = &user_mem_maps->maps[cur_idx++];
-
- if (src != dst) {
- memcpy(dst, src, sizeof(*src));
- memset(src, 0, sizeof(*src));
- }
- }
- }
- user_mem_maps->n_maps = cur_idx;
- }
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
+ sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
static int
@@ -1795,6 +1876,7 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
{
struct user_mem_map *new_map;
struct user_mem_maps *user_mem_maps;
+ bool has_partial_unmap;
int ret = 0;
user_mem_maps = &vfio_cfg->mem_maps;
@@ -1818,11 +1900,16 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
ret = -1;
goto out;
}
+ /* do we have partial unmap support? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
/* create new user mem map entry */
new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
new_map->addr = vaddr;
new_map->iova = iova;
new_map->len = len;
+ /* for IOMMU types supporting partial unmap, we don't need chunking */
+ new_map->chunk = has_partial_unmap ? 0 : len;
compact_user_maps(user_mem_maps);
out:
@@ -1834,38 +1921,81 @@ static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
- struct user_mem_map *map, *new_map = NULL;
+ struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
+ struct user_mem_map new_maps[2]; /* can be at most 2 */
struct user_mem_maps *user_mem_maps;
- int ret = 0;
+ int n_orig, n_new, newlen, ret = 0;
+ bool has_partial_unmap;
user_mem_maps = &vfio_cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
- /* find our mapping */
- map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
- if (!map) {
+ /*
+ * Previously, we had adjacent mappings entirely contained within one
+ * mapping entry. Since we now store original mapping length in some
+ * cases, this is no longer the case, so unmapping can potentially go
+ * over multiple segments and split them in any number of ways.
+ *
+ * To complicate things further, some IOMMU types support arbitrary
+ * partial unmapping, while others will only support unmapping along the
+ * chunk size, so there are a lot of cases we need to handle. To make
+ * things easier code wise, instead of trying to adjust existing
+ * mappings, let's just rebuild them using information we have.
+ */
+
+ /* do we have partial unmap capability? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
+ /*
+ * first thing to do is check if there exists a mapping that includes
+ * the start and the end of our requested unmap. We need to collect all
+ * maps that include our unmapped region.
+ */
+ n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
+ orig_maps, RTE_DIM(orig_maps));
+ /* did we find anything? */
+ if (n_orig < 0) {
RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
rte_errno = EINVAL;
ret = -1;
goto out;
}
- if (map->addr != vaddr || map->iova != iova || map->len != len) {
- /* we're partially unmapping a previously mapped region, so we
- * need to split entry into two.
- */
- if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
+
+ /*
+ * if we don't support partial unmap, we must check if start and end of
+ * current unmap region are chunk-aligned.
+ */
+ if (!has_partial_unmap) {
+ bool start_aligned, end_aligned;
+
+ start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr, iova);
+ end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr + len, iova + len);
+
+ if (!start_aligned || !end_aligned) {
RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
rte_errno = ENOTSUP;
ret = -1;
goto out;
}
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ }
+
+ /*
+ * now we know we can potentially unmap the region, but we still have to
+ * figure out if there is enough space in our list to store remaining
+ * maps. for this, we will figure out how many segments we are going to
+ * remove, and how many new segments we are going to create.
+ */
+ n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);
+
+ /* can we store the new maps in our list? */
+ newlen = (user_mem_maps->n_maps - n_orig) + n_new;
+ if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
/* unmap the entry */
@@ -1886,23 +2016,11 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
}
}
- /* remove map from the list of active mappings */
- if (new_map != NULL) {
- adjust_map(map, new_map, vaddr, len);
-
- /* if we've created a new map by splitting, sort everything */
- if (!is_null_map(new_map)) {
- compact_user_maps(user_mem_maps);
- } else {
- /* we've created a new mapping, but it was unused */
- user_mem_maps->n_maps--;
- }
- } else {
- memset(map, 0, sizeof(*map));
- compact_user_maps(user_mem_maps);
- user_mem_maps->n_maps--;
- }
+ /* we have unmapped the region, so now update the maps */
+ delete_maps(user_mem_maps, orig_maps, n_orig);
+ copy_maps(user_mem_maps, new_maps, n_new);
+ compact_user_maps(user_mem_maps);
out:
rte_spinlock_recursive_unlock(&user_mem_maps->lock);
return ret;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 0/2] support IOMMU for DMA device Xuan Ding
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
@ 2021-09-17 5:25 ` Xuan Ding
2021-09-23 14:39 ` Hu, Jiayu
1 sibling, 1 reply; 40+ messages in thread
From: Xuan Ding @ 2021-09-17 5:25 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
The use of IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of DMA engine to use
IOMMU if the DMA engine is bound to vfio.
When the memory table is set, the guest memory will be mapped
into the default container of DPDK.
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/vhost/rte_vhost.h | 1 +
lib/vhost/vhost_user.c | 57 +++++++++++++++++++++++++++++++++++++++++-
2 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 8d875e9322..e0537249f3 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -127,6 +127,7 @@ struct rte_vhost_mem_region {
void *mmap_addr;
uint64_t mmap_size;
int fd;
+ uint64_t dma_map_success;
};
/**
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 29a4c9af60..7d1d592b86 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,8 @@
#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
+#include <rte_vfio.h>
+#include <rte_errno.h>
#include "iotlb.h"
#include "vhost.h"
@@ -141,6 +143,46 @@ get_blk_size(int fd)
return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool do_map)
+{
+ int ret = 0;
+ uint64_t host_iova;
+ host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+ if (do_map) {
+ /* Add mapped region into the default container of DPDK. */
+ ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ region->dma_map_success = ret == 0;
+ if (ret) {
+ if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+ return ret;
+ }
+ return 0;
+ }
+ return ret;
+ } else {
+ /* No need to do vfio unmap if the map failed. */
+ if (!region->dma_map_success)
+ return 0;
+
+ /* Remove mapped region from the default container of DPDK. */
+ ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+ return ret;
+ }
+ region->dma_map_success = 0;
+ }
+ return ret;
+}
+
static void
free_mem_region(struct virtio_net *dev)
{
@@ -153,6 +195,9 @@ free_mem_region(struct virtio_net *dev)
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
if (reg->host_user_addr) {
+ if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+ async_dma_map(reg, false);
+
munmap(reg->mmap_addr, reg->mmap_size);
close(reg->fd);
}
@@ -1157,6 +1202,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
uint64_t mmap_size;
uint64_t alignment;
int populate;
+ int ret;
/* Check for memory_size + mmap_offset overflow */
if (mmap_offset >= -region->size) {
@@ -1210,13 +1256,22 @@ vhost_user_mmap_region(struct virtio_net *dev,
region->mmap_size = mmap_size;
region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
- if (dev->async_copy)
+ if (dev->async_copy) {
if (add_guest_pages(dev, region, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region failed.\n");
return -1;
}
+ if (rte_vfio_is_enabled("vfio")) {
+ ret = async_dma_map(region, true);
+ if (ret < 0) {
+ VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA engine failed\n");
+ return -1;
+ }
+ }
+ }
+
VHOST_LOG_CONFIG(INFO,
"guest memory region size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-09-23 14:39 ` Hu, Jiayu
2021-09-23 14:56 ` Maxime Coquelin
0 siblings, 1 reply; 40+ messages in thread
From: Hu, Jiayu @ 2021-09-23 14:39 UTC (permalink / raw)
To: Ding, Xuan, dev, Burakov, Anatoly, maxime.coquelin, Xia, Chenbo
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
Hi Xuan,
> -----Original Message-----
> From: Ding, Xuan <xuan.ding@intel.com>
> Sent: Friday, September 17, 2021 1:26 PM
> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> Subject: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
>
> The use of IOMMU has many advantages, such as isolation and address
> translation. This patch extends the capbility of DMA engine to use IOMMU if
> the DMA engine is bound to vfio.
>
> When set memory table, the guest memory will be mapped into the default
> container of DPDK.
>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
> lib/vhost/rte_vhost.h | 1 +
> lib/vhost/vhost_user.c | 57
> +++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index
> 8d875e9322..e0537249f3 100644
> --- a/lib/vhost/rte_vhost.h
> +++ b/lib/vhost/rte_vhost.h
> @@ -127,6 +127,7 @@ struct rte_vhost_mem_region {
> void *mmap_addr;
> uint64_t mmap_size;
> int fd;
> + uint64_t dma_map_success;
How about using bool for dma_map_success?
> };
>
> /**
> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index
> 29a4c9af60..7d1d592b86 100644
> --- a/lib/vhost/vhost_user.c
> +++ b/lib/vhost/vhost_user.c
> @@ -45,6 +45,8 @@
> #include <rte_common.h>
> #include <rte_malloc.h>
> #include <rte_log.h>
> +#include <rte_vfio.h>
> +#include <rte_errno.h>
>
> #include "iotlb.h"
> #include "vhost.h"
> @@ -141,6 +143,46 @@ get_blk_size(int fd)
> return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; }
>
> +static int
> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map) {
> + int ret = 0;
> + uint64_t host_iova;
> + host_iova = rte_mem_virt2iova((void *)(uintptr_t)region-
> >host_user_addr);
> + if (do_map) {
> + /* Add mapped region into the default container of DPDK. */
> + ret =
> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> + region->host_user_addr,
> + host_iova,
> + region->size);
> + region->dma_map_success = ret == 0;
> + if (ret) {
> + if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
> + VHOST_LOG_CONFIG(ERR, "DMA engine map
> failed\n");
> + return ret;
> + }
> + return 0;
Why return 0, if ret is -1 here?
Thanks,
Jiayu
> + }
> + return ret;
> + } else {
> + /* No need to do vfio unmap if the map failed. */
> + if (!region->dma_map_success)
> + return 0;
> +
> + /* Remove mapped region from the default container of
> DPDK. */
> + ret =
> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> + region->host_user_addr,
> + host_iova,
> + region->size);
> + if (ret) {
> + VHOST_LOG_CONFIG(ERR, "DMA engine unmap
> failed\n");
> + return ret;
> + }
> + region->dma_map_success = 0;
> + }
> + return ret;
> +}
> +
> static void
> free_mem_region(struct virtio_net *dev) { @@ -153,6 +195,9 @@
> free_mem_region(struct virtio_net *dev)
> for (i = 0; i < dev->mem->nregions; i++) {
> reg = &dev->mem->regions[i];
> if (reg->host_user_addr) {
> + if (dev->async_copy && rte_vfio_is_enabled("vfio"))
> + async_dma_map(reg, false);
> +
> munmap(reg->mmap_addr, reg->mmap_size);
> close(reg->fd);
> }
> @@ -1157,6 +1202,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
> uint64_t mmap_size;
> uint64_t alignment;
> int populate;
> + int ret;
>
> /* Check for memory_size + mmap_offset overflow */
> if (mmap_offset >= -region->size) {
> @@ -1210,13 +1256,22 @@ vhost_user_mmap_region(struct virtio_net *dev,
> region->mmap_size = mmap_size;
> region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
> mmap_offset;
>
> - if (dev->async_copy)
> + if (dev->async_copy) {
> if (add_guest_pages(dev, region, alignment) < 0) {
> VHOST_LOG_CONFIG(ERR,
> "adding guest pages to region
> failed.\n");
> return -1;
> }
>
> + if (rte_vfio_is_enabled("vfio")) {
> + ret = async_dma_map(region, true);
> + if (ret < 0) {
> + VHOST_LOG_CONFIG(ERR, "Configure
> IOMMU for DMA engine failed\n");
> + return -1;
> + }
> + }
> + }
> +
> VHOST_LOG_CONFIG(INFO,
> "guest memory region size: 0x%" PRIx64 "\n"
> "\t guest physical addr: 0x%" PRIx64 "\n"
> --
> 2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost
2021-09-23 14:39 ` Hu, Jiayu
@ 2021-09-23 14:56 ` Maxime Coquelin
2021-09-24 1:53 ` Xia, Chenbo
0 siblings, 1 reply; 40+ messages in thread
From: Maxime Coquelin @ 2021-09-23 14:56 UTC (permalink / raw)
To: Hu, Jiayu, Ding, Xuan, dev, Burakov, Anatoly, Xia, Chenbo
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
On 9/23/21 16:39, Hu, Jiayu wrote:
> Hi Xuan,
>
>> -----Original Message-----
>> From: Ding, Xuan <xuan.ding@intel.com>
>> Sent: Friday, September 17, 2021 1:26 PM
>> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
>> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
>> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
>> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
>> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
>> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
>> Subject: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
>>
>> The use of IOMMU has many advantages, such as isolation and address
>> translation. This patch extends the capbility of DMA engine to use IOMMU if
>> the DMA engine is bound to vfio.
>>
>> When set memory table, the guest memory will be mapped into the default
>> container of DPDK.
>>
>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>> ---
>> lib/vhost/rte_vhost.h | 1 +
>> lib/vhost/vhost_user.c | 57
>> +++++++++++++++++++++++++++++++++++++++++-
>> 2 files changed, 57 insertions(+), 1 deletion(-)
>>
>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index
>> 8d875e9322..e0537249f3 100644
>> --- a/lib/vhost/rte_vhost.h
>> +++ b/lib/vhost/rte_vhost.h
>> @@ -127,6 +127,7 @@ struct rte_vhost_mem_region {
>> void *mmap_addr;
>> uint64_t mmap_size;
>> int fd;
>> + uint64_t dma_map_success;
>
> How about using bool for dma_map_success?
The bigger problem here is that you are breaking the ABI.
>> };
>>
>> /**
>> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index
>> 29a4c9af60..7d1d592b86 100644
>> --- a/lib/vhost/vhost_user.c
>> +++ b/lib/vhost/vhost_user.c
>> @@ -45,6 +45,8 @@
>> #include <rte_common.h>
>> #include <rte_malloc.h>
>> #include <rte_log.h>
>> +#include <rte_vfio.h>
>> +#include <rte_errno.h>
>>
>> #include "iotlb.h"
>> #include "vhost.h"
>> @@ -141,6 +143,46 @@ get_blk_size(int fd)
>> return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; }
>>
>> +static int
>> +async_dma_map(struct rte_vhost_mem_region *region, bool do_map) {
>> + int ret = 0;
>> + uint64_t host_iova;
>> + host_iova = rte_mem_virt2iova((void *)(uintptr_t)region-
>>> host_user_addr);
>> + if (do_map) {
>> + /* Add mapped region into the default container of DPDK. */
>> + ret =
>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
>> + region->host_user_addr,
>> + host_iova,
>> + region->size);
>> + region->dma_map_success = ret == 0;
>> + if (ret) {
>> + if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
>> + VHOST_LOG_CONFIG(ERR, "DMA engine map
>> failed\n");
>> + return ret;
>> + }
>> + return 0;
>
> Why return 0, if ret is -1 here?
>
> Thanks,
> Jiayu
>
>> + }
>> + return ret;
>> + } else {
>> + /* No need to do vfio unmap if the map failed. */
>> + if (!region->dma_map_success)
>> + return 0;
>> +
>> + /* Remove mapped region from the default container of
>> DPDK. */
>> + ret =
>> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
>> + region->host_user_addr,
>> + host_iova,
>> + region->size);
>> + if (ret) {
>> + VHOST_LOG_CONFIG(ERR, "DMA engine unmap
>> failed\n");
>> + return ret;
>> + }
>> + region->dma_map_success = 0;
>> + }
>> + return ret;
>> +}
>> +
>> static void
>> free_mem_region(struct virtio_net *dev) { @@ -153,6 +195,9 @@
>> free_mem_region(struct virtio_net *dev)
>> for (i = 0; i < dev->mem->nregions; i++) {
>> reg = &dev->mem->regions[i];
>> if (reg->host_user_addr) {
>> + if (dev->async_copy && rte_vfio_is_enabled("vfio"))
>> + async_dma_map(reg, false);
>> +
>> munmap(reg->mmap_addr, reg->mmap_size);
>> close(reg->fd);
>> }
>> @@ -1157,6 +1202,7 @@ vhost_user_mmap_region(struct virtio_net *dev,
>> uint64_t mmap_size;
>> uint64_t alignment;
>> int populate;
>> + int ret;
>>
>> /* Check for memory_size + mmap_offset overflow */
>> if (mmap_offset >= -region->size) {
>> @@ -1210,13 +1256,22 @@ vhost_user_mmap_region(struct virtio_net *dev,
>> region->mmap_size = mmap_size;
>> region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
>> mmap_offset;
>>
>> - if (dev->async_copy)
>> + if (dev->async_copy) {
>> if (add_guest_pages(dev, region, alignment) < 0) {
>> VHOST_LOG_CONFIG(ERR,
>> "adding guest pages to region
>> failed.\n");
>> return -1;
>> }
>>
>> + if (rte_vfio_is_enabled("vfio")) {
>> + ret = async_dma_map(region, true);
>> + if (ret < 0) {
>> + VHOST_LOG_CONFIG(ERR, "Configure
>> IOMMU for DMA engine failed\n");
>> + return -1;
>> + }
>> + }
>> + }
>> +
>> VHOST_LOG_CONFIG(INFO,
>> "guest memory region size: 0x%" PRIx64 "\n"
>> "\t guest physical addr: 0x%" PRIx64 "\n"
>> --
>> 2.17.1
>
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost
2021-09-23 14:56 ` Maxime Coquelin
@ 2021-09-24 1:53 ` Xia, Chenbo
2021-09-24 7:13 ` Maxime Coquelin
0 siblings, 1 reply; 40+ messages in thread
From: Xia, Chenbo @ 2021-09-24 1:53 UTC (permalink / raw)
To: Maxime Coquelin, Hu, Jiayu, Ding, Xuan, dev, Burakov, Anatoly
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Thursday, September 23, 2021 10:56 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>;
> dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
> Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
> Subject: Re: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
>
>
>
> On 9/23/21 16:39, Hu, Jiayu wrote:
> > Hi Xuan,
> >
> >> -----Original Message-----
> >> From: Ding, Xuan <xuan.ding@intel.com>
> >> Sent: Friday, September 17, 2021 1:26 PM
> >> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> >> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> >> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> >> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> >> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> >> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> >> Subject: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
> >>
> >> The use of IOMMU has many advantages, such as isolation and address
> >> translation. This patch extends the capbility of DMA engine to use IOMMU if
> >> the DMA engine is bound to vfio.
> >>
> >> When set memory table, the guest memory will be mapped into the default
> >> container of DPDK.
> >>
> >> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> >> ---
> >> lib/vhost/rte_vhost.h | 1 +
> >> lib/vhost/vhost_user.c | 57
> >> +++++++++++++++++++++++++++++++++++++++++-
> >> 2 files changed, 57 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index
> >> 8d875e9322..e0537249f3 100644
> >> --- a/lib/vhost/rte_vhost.h
> >> +++ b/lib/vhost/rte_vhost.h
> >> @@ -127,6 +127,7 @@ struct rte_vhost_mem_region {
> >> void *mmap_addr;
> >> uint64_t mmap_size;
> >> int fd;
> >> + uint64_t dma_map_success;
> >
> > How about using bool for dma_map_success?
>
> The bigger problem here is that you are breaking the ABI.
Maybe this kind of driver-facing structs/functions should be removed
from ABI, since we are refactoring DPDK ABI recently.
/Chenbo
>
> >> };
> >>
> >> /**
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost
2021-09-24 1:53 ` Xia, Chenbo
@ 2021-09-24 7:13 ` Maxime Coquelin
2021-09-24 7:35 ` Xia, Chenbo
0 siblings, 1 reply; 40+ messages in thread
From: Maxime Coquelin @ 2021-09-24 7:13 UTC (permalink / raw)
To: Xia, Chenbo, Hu, Jiayu, Ding, Xuan, dev, Burakov, Anatoly
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
On 9/24/21 03:53, Xia, Chenbo wrote:
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Thursday, September 23, 2021 10:56 PM
>> To: Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>;
>> dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>; Xia, Chenbo
>> <chenbo.xia@intel.com>
>> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
>> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
>> Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
>> Subject: Re: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
>>
>>
>>
>> On 9/23/21 16:39, Hu, Jiayu wrote:
>>> Hi Xuan,
>>>
>>>> -----Original Message-----
>>>> From: Ding, Xuan <xuan.ding@intel.com>
>>>> Sent: Friday, September 17, 2021 1:26 PM
>>>> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
>>>> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
>>>> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
>>>> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
>>>> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
>>>> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
>>>> Subject: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
>>>>
>>>> The use of IOMMU has many advantages, such as isolation and address
>>>> translation. This patch extends the capbility of DMA engine to use IOMMU if
>>>> the DMA engine is bound to vfio.
>>>>
>>>> When set memory table, the guest memory will be mapped into the default
>>>> container of DPDK.
>>>>
>>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>>>> ---
>>>> lib/vhost/rte_vhost.h | 1 +
>>>> lib/vhost/vhost_user.c | 57
>>>> +++++++++++++++++++++++++++++++++++++++++-
>>>> 2 files changed, 57 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index
>>>> 8d875e9322..e0537249f3 100644
>>>> --- a/lib/vhost/rte_vhost.h
>>>> +++ b/lib/vhost/rte_vhost.h
>>>> @@ -127,6 +127,7 @@ struct rte_vhost_mem_region {
>>>> void *mmap_addr;
>>>> uint64_t mmap_size;
>>>> int fd;
>>>> + uint64_t dma_map_success;
>>>
>>> How about using bool for dma_map_success?
>>
>> The bigger problem here is that you are breaking the ABI.
>
> Maybe this kind of driver-facing structs/functions should be removed
> from ABI, since we are refactoring DPDK ABI recently.
It has actually been exposed for SPDK, we cannot just remove it from
API.
Maxime
> /Chenbo
>
>>
>>>> };
>>>>
>>>> /**
>
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost
2021-09-24 7:13 ` Maxime Coquelin
@ 2021-09-24 7:35 ` Xia, Chenbo
2021-09-24 8:18 ` Ding, Xuan
0 siblings, 1 reply; 40+ messages in thread
From: Xia, Chenbo @ 2021-09-24 7:35 UTC (permalink / raw)
To: Maxime Coquelin, Hu, Jiayu, Ding, Xuan, dev, Burakov, Anatoly
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Friday, September 24, 2021 3:14 PM
> To: Xia, Chenbo <chenbo.xia@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>; Ding,
> Xuan <xuan.ding@intel.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>
> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
> Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
> Subject: Re: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
>
>
>
> On 9/24/21 03:53, Xia, Chenbo wrote:
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Thursday, September 23, 2021 10:56 PM
> >> To: Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>;
> >> dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>; Xia, Chenbo
> >> <chenbo.xia@intel.com>
> >> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> >> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
> >> Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
> >> Subject: Re: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
> >>
> >>
> >>
> >> On 9/23/21 16:39, Hu, Jiayu wrote:
> >>> Hi Xuan,
> >>>
> >>>> -----Original Message-----
> >>>> From: Ding, Xuan <xuan.ding@intel.com>
> >>>> Sent: Friday, September 17, 2021 1:26 PM
> >>>> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> >>>> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> >>>> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>;
> >>>> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> >>>> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> >>>> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> >>>> Subject: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
> >>>>
> >>>> The use of IOMMU has many advantages, such as isolation and address
> >>>> translation. This patch extends the capbility of DMA engine to use IOMMU
> if
> >>>> the DMA engine is bound to vfio.
> >>>>
> >>>> When set memory table, the guest memory will be mapped into the default
> >>>> container of DPDK.
> >>>>
> >>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> >>>> ---
> >>>> lib/vhost/rte_vhost.h | 1 +
> >>>> lib/vhost/vhost_user.c | 57
> >>>> +++++++++++++++++++++++++++++++++++++++++-
> >>>> 2 files changed, 57 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index
> >>>> 8d875e9322..e0537249f3 100644
> >>>> --- a/lib/vhost/rte_vhost.h
> >>>> +++ b/lib/vhost/rte_vhost.h
> >>>> @@ -127,6 +127,7 @@ struct rte_vhost_mem_region {
> >>>> void *mmap_addr;
> >>>> uint64_t mmap_size;
> >>>> int fd;
> >>>> + uint64_t dma_map_success;
> >>>
> >>> How about using bool for dma_map_success?
> >>
> >> The bigger problem here is that you are breaking the ABI.
> >
> > Maybe this kind of driver-facing structs/functions should be removed
> > from ABI, since we are refactoring DPDK ABI recently.
>
> It has actually been exposed for SPDK, we cannot just remove it from
> API.
'exposed' does not mean it has to be ABI. Like 'driver_sdk_headers' in
ethdev lib, those headers can be exposed but do not include ABI. I see
SPDK is using that for building its lib. Not sure in this case, the SPDK
Vhost lib should be considered as application.
Thanks,
Chenbo
>
> Maxime
>
> > /Chenbo
> >
> >>
> >>>> };
> >>>>
> >>>> /**
> >
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost
2021-09-24 7:35 ` Xia, Chenbo
@ 2021-09-24 8:18 ` Ding, Xuan
0 siblings, 0 replies; 40+ messages in thread
From: Ding, Xuan @ 2021-09-24 8:18 UTC (permalink / raw)
To: Xia, Chenbo, Maxime Coquelin, Hu, Jiayu, dev, Burakov, Anatoly
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
> -----Original Message-----
> From: Xia, Chenbo <chenbo.xia@intel.com>
> Sent: Friday, September 24, 2021 3:36 PM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>; Hu, Jiayu
> <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; dev@dpdk.org;
> Burakov, Anatoly <anatoly.burakov@intel.com>
> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
> Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
> Subject: RE: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
>
> > -----Original Message-----
> > From: Maxime Coquelin <maxime.coquelin@redhat.com>
> > Sent: Friday, September 24, 2021 3:14 PM
> > To: Xia, Chenbo <chenbo.xia@intel.com>; Hu, Jiayu <jiayu.hu@intel.com>;
> Ding,
> > Xuan <xuan.ding@intel.com>; dev@dpdk.org; Burakov, Anatoly
> > <anatoly.burakov@intel.com>
> > Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
> > Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
> > Subject: Re: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
> >
> >
> >
> > On 9/24/21 03:53, Xia, Chenbo wrote:
> > >> -----Original Message-----
> > >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> > >> Sent: Thursday, September 23, 2021 10:56 PM
> > >> To: Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>;
> > >> dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>; Xia,
> Chenbo
> > >> <chenbo.xia@intel.com>
> > >> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> > >> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
> > >> Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
> > >> Subject: Re: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
> > >>
> > >>
> > >>
> > >> On 9/23/21 16:39, Hu, Jiayu wrote:
> > >>> Hi Xuan,
> > >>>
> > >>>> -----Original Message-----
> > >>>> From: Ding, Xuan <xuan.ding@intel.com>
> > >>>> Sent: Friday, September 17, 2021 1:26 PM
> > >>>> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > >>>> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> > >>>> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1
> > <cheng1.jiang@intel.com>;
> > >>>> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> > >>>> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> > >>>> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> > >>>> Subject: [PATCH v2 2/2] vhost: enable IOMMU for async vhost
> > >>>>
> > >>>> The use of IOMMU has many advantages, such as isolation and address
> > >>>> translation. This patch extends the capability of DMA engine to use
> IOMMU
> > if
> > >>>> the DMA engine is bound to vfio.
> > >>>>
> > >>>> When set memory table, the guest memory will be mapped into the
> default
> > >>>> container of DPDK.
> > >>>>
> > >>>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> > >>>> ---
> > >>>> lib/vhost/rte_vhost.h | 1 +
> > >>>> lib/vhost/vhost_user.c | 57
> > >>>> +++++++++++++++++++++++++++++++++++++++++-
> > >>>> 2 files changed, 57 insertions(+), 1 deletion(-)
> > >>>>
> > >>>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index
> > >>>> 8d875e9322..e0537249f3 100644
> > >>>> --- a/lib/vhost/rte_vhost.h
> > >>>> +++ b/lib/vhost/rte_vhost.h
> > >>>> @@ -127,6 +127,7 @@ struct rte_vhost_mem_region {
> > >>>> void *mmap_addr;
> > >>>> uint64_t mmap_size;
> > >>>> int fd;
> > >>>> + uint64_t dma_map_success;
> > >>>
> > >>> How about using bool for dma_map_success?
> > >>
> > >> The bigger problem here is that you are breaking the ABI.
> > >
> > > Maybe this kind of driver-facing structs/functions should be removed
> > > from ABI, since we are refactoring DPDK ABI recently.
> >
> > It has actually been exposed for SPDK, we cannot just remove it from
> > API.
>
> 'exposed' does not mean it has to be ABI. Like 'driver_sdk_headers' in
> ethdev lib, those headers can be exposed but do not include ABI. I see
> SPDK is using that for building its lib. Not sure in this case, the SPDK
> Vhost lib should be considered as application.
>
> Thanks,
> Chenbo
Thanks for the discussion. Since the possible ABI changing is in the future,
I consider adding the dma_map_success in the virtio_net structure, to indicate
the map status of each region. This flag can even be removed if we do not consider
the restrictions on the user (kernel driver support). Details can be provided in the next version of the patch.
Hope to get your insights. :)
Thanks,
Xuan
>
> >
> > Maxime
> >
> > > /Chenbo
> > >
> > >>
> > >>>> };
> > >>>>
> > >>>> /**
> > >
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v3 0/2] support IOMMU for DMA device
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
` (2 preceding siblings ...)
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-25 10:03 ` Xuan Ding
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 0/2] support IOMMU for DMA device Xuan Ding
` (3 subsequent siblings)
7 siblings, 2 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-25 10:03 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
This series supports DMA device to use vfio in async vhost.
The first patch extends the capability of current vfio dma mapping
API to allow partial unmapping for adjacent memory if the platform
does not support partial unmapping. The second patch involves the
IOMMU programming for guest memory in async vhost.
v3:
* Move the async_map_status flag to virtio_net structure to avoid
ABI breaking.
v2:
* Add rte_errno filtering for some devices bound in the kernel driver.
* Add a flag to check the status of region mapping.
* Fix one typo.
Xuan Ding (2):
vfio: allow partially unmapping adjacent memory
vhost: enable IOMMU for async vhost
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
lib/vhost/vhost.h | 4 +
lib/vhost/vhost_user.c | 112 ++++++++++++-
3 files changed, 342 insertions(+), 112 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v3 1/2] vfio: allow partially unmapping adjacent memory
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-25 10:03 ` Xuan Ding
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 2/2] vhost: enable IOMMU for async vhost Xuan Ding
1 sibling, 0 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-25 10:03 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
Currently, if we map a memory area A, then map a separate memory area B
that by coincidence happens to be adjacent to A, current implementation
will merge these two segments into one, and if partial unmapping is not
supported, these segments will then be only allowed to be unmapped in
one go. In other words, given segments A and B that are adjacent, it
is currently not possible to map A, then map B, then unmap A.
Fix this by adding a notion of "chunk size", which will allow
subdividing segments into equally sized segments whenever we are dealing
with an IOMMU that does not support partial unmapping. With this change,
we will still be able to merge adjacent segments, but only if they are
of the same size. If we keep with our above example, adjacent segments A
and B will be stored as separate segments if they are of different
sizes.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
1 file changed, 228 insertions(+), 110 deletions(-)
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 25add2fa5d..657c89ca58 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -31,9 +31,10 @@
*/
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
- uint64_t addr;
- uint64_t iova;
- uint64_t len;
+ uint64_t addr; /**< start VA */
+ uint64_t iova; /**< start IOVA */
+ uint64_t len; /**< total length of the mapping */
+ uint64_t chunk; /**< this mapping can be split in chunks of this size */
};
struct user_mem_maps {
@@ -95,7 +96,8 @@ static const struct vfio_iommu_type iommu_types[] = {
static int
is_null_map(const struct user_mem_map *map)
{
- return map->addr == 0 && map->iova == 0 && map->len == 0;
+ return map->addr == 0 && map->iova == 0 &&
+ map->len == 0 && map->chunk == 0;
}
/* we may need to merge user mem maps together in case of user mapping/unmapping
@@ -129,41 +131,90 @@ user_mem_map_cmp(const void *a, const void *b)
if (umm_a->len > umm_b->len)
return 1;
+ if (umm_a->chunk < umm_b->chunk)
+ return -1;
+ if (umm_a->chunk > umm_b->chunk)
+ return 1;
+
return 0;
}
-/* adjust user map entry. this may result in shortening of existing map, or in
- * splitting existing map in two pieces.
+/*
+ * Take in an address range and list of current mappings, and produce a list of
+ * mappings that will be kept.
*/
+static int
+process_maps(struct user_mem_map *src, size_t src_len,
+ struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
+{
+ struct user_mem_map *src_first = &src[0];
+ struct user_mem_map *src_last = &src[src_len - 1];
+ struct user_mem_map *dst_first = &newmap[0];
+ /* we can get at most two new segments */
+ struct user_mem_map *dst_last = &newmap[1];
+ uint64_t first_off = vaddr - src_first->addr;
+ uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
+ int newmap_len = 0;
+
+ if (first_off != 0) {
+ dst_first->addr = src_first->addr;
+ dst_first->iova = src_first->iova;
+ dst_first->len = first_off;
+ dst_first->chunk = src_first->chunk;
+
+ newmap_len++;
+ }
+ if (last_off != 0) {
+ /* if we had start offset, we have two segments */
+ struct user_mem_map *last =
+ first_off == 0 ? dst_first : dst_last;
+ last->addr = (src_last->addr + src_last->len) - last_off;
+ last->iova = (src_last->iova + src_last->len) - last_off;
+ last->len = last_off;
+ last->chunk = src_last->chunk;
+
+ newmap_len++;
+ }
+ return newmap_len;
+}
+
+/* erase certain maps from the list */
static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
- uint64_t remove_va_start, uint64_t remove_len)
-{
- /* if va start is same as start address, we're simply moving start */
- if (remove_va_start == src->addr) {
- src->addr += remove_len;
- src->iova += remove_len;
- src->len -= remove_len;
- } else if (remove_va_start + remove_len == src->addr + src->len) {
- /* we're shrinking mapping from the end */
- src->len -= remove_len;
- } else {
- /* we're blowing a hole in the middle */
- struct user_mem_map tmp;
- uint64_t total_len = src->len;
+delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
+ size_t n_del)
+{
+ int i;
+ size_t j;
+
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &del_maps[j];
- /* adjust source segment length */
- src->len = remove_va_start - src->addr;
+ if (user_mem_map_cmp(left, right) == 0) {
+ memset(left, 0, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps--;
+ }
+ }
+}
+
+static void
+copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
+ size_t n_add)
+{
+ int i;
+ size_t j;
- /* create temporary segment in the middle */
- tmp.addr = src->addr + src->len;
- tmp.iova = src->iova + src->len;
- tmp.len = remove_len;
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &add_maps[j];
- /* populate end segment - this one we will be keeping */
- end->addr = tmp.addr + tmp.len;
- end->iova = tmp.iova + tmp.len;
- end->len = total_len - src->len - tmp.len;
+ /* insert into empty space */
+ if (is_null_map(left)) {
+ memcpy(left, right, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps++;
+ }
}
}
@@ -179,7 +230,8 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 0;
if (left->iova + left->len != right->iova)
return 0;
-
+ if (left->chunk != right->chunk)
+ return 0;
left->len += right->len;
out:
@@ -188,51 +240,94 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 1;
}
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
- uint64_t iova, uint64_t len)
+static bool
+addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
+ uint64_t vaddr, uint64_t iova)
+{
+ unsigned int i;
+
+ for (i = 0; i < n_maps; i++) {
+ struct user_mem_map *map = &maps[i];
+ uint64_t map_va_end = map->addr + map->len;
+ uint64_t map_iova_end = map->iova + map->len;
+ uint64_t map_va_off = vaddr - map->addr;
+ uint64_t map_iova_off = iova - map->iova;
+
+ /* we include end of the segment in comparison as well */
+ bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
+ bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
+ /* chunk may not be power of two, so use modulo */
+ bool addr_is_aligned = (map_va_off % map->chunk) == 0;
+ bool iova_is_aligned = (map_iova_off % map->chunk) == 0;
+
+ if (addr_in_map && iova_in_map &&
+ addr_is_aligned && iova_is_aligned)
+ return true;
+ }
+ return false;
+}
+
+static int
+find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
+ uint64_t iova, uint64_t len, struct user_mem_map *dst,
+ size_t dst_len)
{
uint64_t va_end = addr + len;
uint64_t iova_end = iova + len;
- int i;
+ bool found = false;
+ size_t j;
+ int i, ret;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
+ for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
struct user_mem_map *map = &user_mem_maps->maps[i];
uint64_t map_va_end = map->addr + map->len;
uint64_t map_iova_end = map->iova + map->len;
- /* check start VA */
- if (addr < map->addr || addr >= map_va_end)
- continue;
- /* check if VA end is within boundaries */
- if (va_end <= map->addr || va_end > map_va_end)
- continue;
-
- /* check start IOVA */
- if (iova < map->iova || iova >= map_iova_end)
- continue;
- /* check if IOVA end is within boundaries */
- if (iova_end <= map->iova || iova_end > map_iova_end)
- continue;
-
- /* we've found our map */
- return map;
+ bool start_addr_in_map = (addr >= map->addr) &&
+ (addr < map_va_end);
+ bool end_addr_in_map = (va_end > map->addr) &&
+ (va_end <= map_va_end);
+ bool start_iova_in_map = (iova >= map->iova) &&
+ (iova < map_iova_end);
+ bool end_iova_in_map = (iova_end > map->iova) &&
+ (iova_end <= map_iova_end);
+
+ /* do we have space in temporary map? */
+ if (j == dst_len) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ /* check if current map is start of our segment */
+ if (!found && start_addr_in_map && start_iova_in_map)
+ found = true;
+ /* if we have previously found a segment, add it to the map */
+ if (found) {
+ /* copy the segment into our temporary map */
+ memcpy(&dst[j++], map, sizeof(*map));
+
+ /* if we match end of segment, quit */
+ if (end_addr_in_map && end_iova_in_map)
+ return j;
+ }
}
- return NULL;
+ /* we didn't find anything */
+ ret = -ENOENT;
+err:
+ memset(dst, 0, sizeof(*dst) * dst_len);
+ return ret;
}
/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
- int i, n_merged, cur_idx;
+ int i;
- qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
/* we'll go over the list backwards when merging */
- n_merged = 0;
- for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+ for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
struct user_mem_map *l, *r;
l = &user_mem_maps->maps[i];
@@ -241,30 +336,16 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
if (is_null_map(l) || is_null_map(r))
continue;
+ /* try and merge the maps */
if (merge_map(l, r))
- n_merged++;
+ user_mem_maps->n_maps--;
}
/* the entries are still sorted, but now they have holes in them, so
- * walk through the list and remove the holes
+ * sort the list again.
*/
- if (n_merged > 0) {
- cur_idx = 0;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- if (!is_null_map(&user_mem_maps->maps[i])) {
- struct user_mem_map *src, *dst;
-
- src = &user_mem_maps->maps[i];
- dst = &user_mem_maps->maps[cur_idx++];
-
- if (src != dst) {
- memcpy(dst, src, sizeof(*src));
- memset(src, 0, sizeof(*src));
- }
- }
- }
- user_mem_maps->n_maps = cur_idx;
- }
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
+ sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
static int
@@ -1795,6 +1876,7 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
{
struct user_mem_map *new_map;
struct user_mem_maps *user_mem_maps;
+ bool has_partial_unmap;
int ret = 0;
user_mem_maps = &vfio_cfg->mem_maps;
@@ -1818,11 +1900,16 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
ret = -1;
goto out;
}
+ /* do we have partial unmap support? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
/* create new user mem map entry */
new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
new_map->addr = vaddr;
new_map->iova = iova;
new_map->len = len;
+ /* for IOMMU types supporting partial unmap, we don't need chunking */
+ new_map->chunk = has_partial_unmap ? 0 : len;
compact_user_maps(user_mem_maps);
out:
@@ -1834,38 +1921,81 @@ static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
- struct user_mem_map *map, *new_map = NULL;
+ struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
+ struct user_mem_map new_maps[2]; /* can be at most 2 */
struct user_mem_maps *user_mem_maps;
- int ret = 0;
+ int n_orig, n_new, newlen, ret = 0;
+ bool has_partial_unmap;
user_mem_maps = &vfio_cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
- /* find our mapping */
- map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
- if (!map) {
+ /*
+ * Previously, we had adjacent mappings entirely contained within one
+ * mapping entry. Since we now store original mapping length in some
+ * cases, this is no longer the case, so unmapping can potentially go
+ * over multiple segments and split them in any number of ways.
+ *
+ * To complicate things further, some IOMMU types support arbitrary
+ * partial unmapping, while others will only support unmapping along the
+ * chunk size, so there are a lot of cases we need to handle. To make
+ * things easier code wise, instead of trying to adjust existing
+ * mappings, let's just rebuild them using information we have.
+ */
+
+ /* do we have partial unmap capability? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
+ /*
+ * first thing to do is check if there exists a mapping that includes
+ * the start and the end of our requested unmap. We need to collect all
+ * maps that include our unmapped region.
+ */
+ n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
+ orig_maps, RTE_DIM(orig_maps));
+ /* did we find anything? */
+ if (n_orig < 0) {
RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
rte_errno = EINVAL;
ret = -1;
goto out;
}
- if (map->addr != vaddr || map->iova != iova || map->len != len) {
- /* we're partially unmapping a previously mapped region, so we
- * need to split entry into two.
- */
- if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
+
+ /*
+ * if we don't support partial unmap, we must check if start and end of
+ * current unmap region are chunk-aligned.
+ */
+ if (!has_partial_unmap) {
+ bool start_aligned, end_aligned;
+
+ start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr, iova);
+ end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr + len, iova + len);
+
+ if (!start_aligned || !end_aligned) {
RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
rte_errno = ENOTSUP;
ret = -1;
goto out;
}
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ }
+
+ /*
+ * now we know we can potentially unmap the region, but we still have to
+ * figure out if there is enough space in our list to store remaining
+ * maps. for this, we will figure out how many segments we are going to
+ * remove, and how many new segments we are going to create.
+ */
+ n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);
+
+ /* can we store the new maps in our list? */
+ newlen = (user_mem_maps->n_maps - n_orig) + n_new;
+ if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
/* unmap the entry */
@@ -1886,23 +2016,11 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
}
}
- /* remove map from the list of active mappings */
- if (new_map != NULL) {
- adjust_map(map, new_map, vaddr, len);
-
- /* if we've created a new map by splitting, sort everything */
- if (!is_null_map(new_map)) {
- compact_user_maps(user_mem_maps);
- } else {
- /* we've created a new mapping, but it was unused */
- user_mem_maps->n_maps--;
- }
- } else {
- memset(map, 0, sizeof(*map));
- compact_user_maps(user_mem_maps);
- user_mem_maps->n_maps--;
- }
+ /* we have unmapped the region, so now update the maps */
+ delete_maps(user_mem_maps, orig_maps, n_orig);
+ copy_maps(user_mem_maps, new_maps, n_new);
+ compact_user_maps(user_mem_maps);
out:
rte_spinlock_recursive_unlock(&user_mem_maps->lock);
return ret;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v3 2/2] vhost: enable IOMMU for async vhost
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 0/2] support IOMMU for DMA device Xuan Ding
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
@ 2021-09-25 10:03 ` Xuan Ding
2021-09-27 4:17 ` Hu, Jiayu
1 sibling, 1 reply; 40+ messages in thread
From: Xuan Ding @ 2021-09-25 10:03 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
The use of IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of DMA engine to use
IOMMU if the DMA engine is bound to vfio.
When the memory table is set, the guest memory will be mapped
into the default container of DPDK.
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/vhost/vhost.h | 4 ++
lib/vhost/vhost_user.c | 112 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 114 insertions(+), 2 deletions(-)
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 89a31e4ca8..bc5695e899 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -370,6 +370,10 @@ struct virtio_net {
int16_t broadcast_rarp;
uint32_t nr_vring;
int async_copy;
+
+ /* Record the dma map status for each region. */
+ bool *async_map_status;
+
int extbuf;
int linearbuf;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 29a4c9af60..3990e9b057 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,8 @@
#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
+#include <rte_vfio.h>
+#include <rte_errno.h>
#include "iotlb.h"
#include "vhost.h"
@@ -141,6 +143,63 @@ get_blk_size(int fd)
return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
+/*
+ * Program (do_map == true) or tear down (do_map == false) the IOMMU mapping
+ * of one guest memory region in DPDK's default vfio container.
+ * *dma_map_success records whether the region was actually mapped, so that
+ * unmap is skipped for regions whose map attempt did not take effect.
+ * Returns 0 on success or when vfio has no device to map for, <0 on error.
+ */
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool *dma_map_success, bool do_map)
+{
+ uint64_t host_iova;
+ int ret = 0;
+
+ host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+ if (do_map) {
+ /* Add mapped region into the default container of DPDK. */
+ ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ *dma_map_success = ret == 0;
+
+ if (ret) {
+ /*
+ * DMA device may bind with kernel driver, in this case,
+ * we don't need to program IOMMU manually. However, if no
+ * device is bound with vfio/uio in DPDK, and vfio kernel
+ * module is loaded, the API will still be called and return
+ * with ENODEV/ENOTSUP.
+ *
+ * DPDK VFIO only returns ENODEV/ENOTSUP in very similar
+ * situations(VFIO either unsupported, or supported
+ * but no devices found). Either way, no mappings could be
+ * performed. We treat it as normal case in async path.
+ */
+ if (rte_errno == ENODEV || rte_errno == ENOTSUP) {
+ return 0;
+ } else {
+ VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+ return ret;
+ }
+ }
+
+ } else {
+ /* No need to do vfio unmap if the map failed. */
+ if (!*dma_map_success)
+ return 0;
+
+ /* Remove mapped region from the default container of DPDK. */
+ ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+ return ret;
+ }
+ /* Clear the flag once the unmap succeeds. */
+ *dma_map_success = 0;
+ }
+
+ return ret;
+}
+
static void
free_mem_region(struct virtio_net *dev)
{
@@ -153,6 +212,9 @@ free_mem_region(struct virtio_net *dev)
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
if (reg->host_user_addr) {
+ if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+ async_dma_map(reg, &dev->async_map_status[i], false);
+
munmap(reg->mmap_addr, reg->mmap_size);
close(reg->fd);
}
@@ -203,6 +265,11 @@ vhost_backend_cleanup(struct virtio_net *dev)
}
dev->postcopy_listening = 0;
+
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
}
static void
@@ -621,6 +688,17 @@ numa_realloc(struct virtio_net *dev, int index)
}
dev->mem = mem;
+ if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * dev->mem->nregions, 0, node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to realloc dma mapping status on node\n",
+ dev->vid);
+ return dev;
+ }
+ }
+
gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp),
RTE_CACHE_LINE_SIZE, node);
if (!gp) {
@@ -1151,12 +1229,14 @@ vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
static int
vhost_user_mmap_region(struct virtio_net *dev,
struct rte_vhost_mem_region *region,
+ uint32_t region_index,
uint64_t mmap_offset)
{
void *mmap_addr;
uint64_t mmap_size;
uint64_t alignment;
int populate;
+ int ret;
/* Check for memory_size + mmap_offset overflow */
if (mmap_offset >= -region->size) {
@@ -1210,13 +1290,25 @@ vhost_user_mmap_region(struct virtio_net *dev,
region->mmap_size = mmap_size;
region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
- if (dev->async_copy)
+ if (dev->async_copy) {
if (add_guest_pages(dev, region, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region failed.\n");
return -1;
}
+ if (rte_vfio_is_enabled("vfio")) {
+ ret = async_dma_map(region, &dev->async_map_status[region_index], true);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA "
+ "engine failed\n");
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ return -1;
+ }
+ }
+ }
+
VHOST_LOG_CONFIG(INFO,
"guest memory region size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
@@ -1291,6 +1383,11 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
dev->mem = NULL;
}
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
+
/* Flush IOTLB cache as previous HVAs are now invalid */
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
for (i = 0; i < dev->nr_vring; i++)
@@ -1329,6 +1426,17 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
goto free_guest_pages;
}
+ if (dev->async_copy) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * memory->nregions, 0, numa_node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to allocate memory for dma mapping status\n",
+ dev->vid);
+ goto free_guest_pages;
+ }
+ }
+
for (i = 0; i < memory->nregions; i++) {
reg = &dev->mem->regions[i];
@@ -1345,7 +1453,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
mmap_offset = memory->regions[i].mmap_offset;
- if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
+ if (vhost_user_mmap_region(dev, reg, i, mmap_offset) < 0) {
VHOST_LOG_CONFIG(ERR, "Failed to mmap region %u\n", i);
goto free_mem_table;
}
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v4 0/2] support IOMMU for DMA device
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
` (3 preceding siblings ...)
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-25 10:33 ` Xuan Ding
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 0/2] support IOMMU for DMA device Xuan Ding
` (2 subsequent siblings)
7 siblings, 2 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-25 10:33 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
This series supports DMA device to use vfio in async vhost.
The first patch extends the capability of current vfio dma mapping
API to allow partial unmapping for adjacent memory if the platform
does not support partial unmapping. The second patch involves the
IOMMU programming for guest memory in async vhost.
v4:
* Fix a format issue.
v3:
* Move the async_map_status flag to virtio_net structure to avoid
ABI breaking.
v2:
* Add rte_errno filtering for some devices bound in the kernel driver.
* Add a flag to check the status of region mapping.
* Fix one typo.
Xuan Ding (2):
vfio: allow partially unmapping adjacent memory
vhost: enable IOMMU for async vhost
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
lib/vhost/vhost.h | 4 +
lib/vhost/vhost_user.c | 112 ++++++++++++-
3 files changed, 342 insertions(+), 112 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v4 1/2] vfio: allow partially unmapping adjacent memory
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-25 10:33 ` Xuan Ding
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 2/2] vhost: enable IOMMU for async vhost Xuan Ding
1 sibling, 0 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-25 10:33 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
Currently, if we map a memory area A, then map a separate memory area B
that by coincidence happens to be adjacent to A, current implementation
will merge these two segments into one, and if partial unmapping is not
supported, these segments will then be only allowed to be unmapped in
one go. In other words, given segments A and B that are adjacent, it
is currently not possible to map A, then map B, then unmap A.
Fix this by adding a notion of "chunk size", which will allow
subdividing segments into equally sized segments whenever we are dealing
with an IOMMU that does not support partial unmapping. With this change,
we will still be able to merge adjacent segments, but only if they are
of the same size. If we keep with our above example, adjacent segments A
and B will be stored as separate segments if they are of different
sizes.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
1 file changed, 228 insertions(+), 110 deletions(-)
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 25add2fa5d..657c89ca58 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -31,9 +31,10 @@
*/
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
- uint64_t addr;
- uint64_t iova;
- uint64_t len;
+ uint64_t addr; /**< start VA */
+ uint64_t iova; /**< start IOVA */
+ uint64_t len; /**< total length of the mapping */
+ uint64_t chunk; /**< this mapping can be split in chunks of this size */
};
struct user_mem_maps {
@@ -95,7 +96,8 @@ static const struct vfio_iommu_type iommu_types[] = {
static int
is_null_map(const struct user_mem_map *map)
{
- return map->addr == 0 && map->iova == 0 && map->len == 0;
+ return map->addr == 0 && map->iova == 0 &&
+ map->len == 0 && map->chunk == 0;
}
/* we may need to merge user mem maps together in case of user mapping/unmapping
@@ -129,41 +131,90 @@ user_mem_map_cmp(const void *a, const void *b)
if (umm_a->len > umm_b->len)
return 1;
+ if (umm_a->chunk < umm_b->chunk)
+ return -1;
+ if (umm_a->chunk > umm_b->chunk)
+ return 1;
+
return 0;
}
-/* adjust user map entry. this may result in shortening of existing map, or in
- * splitting existing map in two pieces.
+/*
+ * Take in an address range and list of current mappings, and produce a list of
+ * mappings that will be kept.
*/
+static int
+process_maps(struct user_mem_map *src, size_t src_len,
+ struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
+{
+ struct user_mem_map *src_first = &src[0];
+ struct user_mem_map *src_last = &src[src_len - 1];
+ struct user_mem_map *dst_first = &newmap[0];
+ /* we can get at most two new segments */
+ struct user_mem_map *dst_last = &newmap[1];
+ uint64_t first_off = vaddr - src_first->addr;
+ uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
+ int newmap_len = 0;
+
+ if (first_off != 0) {
+ dst_first->addr = src_first->addr;
+ dst_first->iova = src_first->iova;
+ dst_first->len = first_off;
+ dst_first->chunk = src_first->chunk;
+
+ newmap_len++;
+ }
+ if (last_off != 0) {
+ /* if we had start offset, we have two segments */
+ struct user_mem_map *last =
+ first_off == 0 ? dst_first : dst_last;
+ last->addr = (src_last->addr + src_last->len) - last_off;
+ last->iova = (src_last->iova + src_last->len) - last_off;
+ last->len = last_off;
+ last->chunk = src_last->chunk;
+
+ newmap_len++;
+ }
+ return newmap_len;
+}
+
+/* erase certain maps from the list */
static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
- uint64_t remove_va_start, uint64_t remove_len)
-{
- /* if va start is same as start address, we're simply moving start */
- if (remove_va_start == src->addr) {
- src->addr += remove_len;
- src->iova += remove_len;
- src->len -= remove_len;
- } else if (remove_va_start + remove_len == src->addr + src->len) {
- /* we're shrinking mapping from the end */
- src->len -= remove_len;
- } else {
- /* we're blowing a hole in the middle */
- struct user_mem_map tmp;
- uint64_t total_len = src->len;
+delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
+ size_t n_del)
+{
+ int i;
+ size_t j;
+
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &del_maps[j];
- /* adjust source segment length */
- src->len = remove_va_start - src->addr;
+ if (user_mem_map_cmp(left, right) == 0) {
+ memset(left, 0, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps--;
+ }
+ }
+}
+
+static void
+copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
+ size_t n_add)
+{
+ int i;
+ size_t j;
- /* create temporary segment in the middle */
- tmp.addr = src->addr + src->len;
- tmp.iova = src->iova + src->len;
- tmp.len = remove_len;
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &add_maps[j];
- /* populate end segment - this one we will be keeping */
- end->addr = tmp.addr + tmp.len;
- end->iova = tmp.iova + tmp.len;
- end->len = total_len - src->len - tmp.len;
+ /* insert into empty space */
+ if (is_null_map(left)) {
+ memcpy(left, right, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps++;
+ }
}
}
@@ -179,7 +230,8 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 0;
if (left->iova + left->len != right->iova)
return 0;
-
+ if (left->chunk != right->chunk)
+ return 0;
left->len += right->len;
out:
@@ -188,51 +240,94 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 1;
}
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
- uint64_t iova, uint64_t len)
+static bool
+addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
+ uint64_t vaddr, uint64_t iova)
+{
+ unsigned int i;
+
+ for (i = 0; i < n_maps; i++) {
+ struct user_mem_map *map = &maps[i];
+ uint64_t map_va_end = map->addr + map->len;
+ uint64_t map_iova_end = map->iova + map->len;
+ uint64_t map_va_off = vaddr - map->addr;
+ uint64_t map_iova_off = iova - map->iova;
+
+ /* we include end of the segment in comparison as well */
+ bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
+ bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
+ /* chunk may not be power of two, so use modulo */
+ bool addr_is_aligned = (map_va_off % map->chunk) == 0;
+ bool iova_is_aligned = (map_iova_off % map->chunk) == 0;
+
+ if (addr_in_map && iova_in_map &&
+ addr_is_aligned && iova_is_aligned)
+ return true;
+ }
+ return false;
+}
+
+static int
+find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
+ uint64_t iova, uint64_t len, struct user_mem_map *dst,
+ size_t dst_len)
{
uint64_t va_end = addr + len;
uint64_t iova_end = iova + len;
- int i;
+ bool found = false;
+ size_t j;
+ int i, ret;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
+ for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
struct user_mem_map *map = &user_mem_maps->maps[i];
uint64_t map_va_end = map->addr + map->len;
uint64_t map_iova_end = map->iova + map->len;
- /* check start VA */
- if (addr < map->addr || addr >= map_va_end)
- continue;
- /* check if VA end is within boundaries */
- if (va_end <= map->addr || va_end > map_va_end)
- continue;
-
- /* check start IOVA */
- if (iova < map->iova || iova >= map_iova_end)
- continue;
- /* check if IOVA end is within boundaries */
- if (iova_end <= map->iova || iova_end > map_iova_end)
- continue;
-
- /* we've found our map */
- return map;
+ bool start_addr_in_map = (addr >= map->addr) &&
+ (addr < map_va_end);
+ bool end_addr_in_map = (va_end > map->addr) &&
+ (va_end <= map_va_end);
+ bool start_iova_in_map = (iova >= map->iova) &&
+ (iova < map_iova_end);
+ bool end_iova_in_map = (iova_end > map->iova) &&
+ (iova_end <= map_iova_end);
+
+ /* do we have space in temporary map? */
+ if (j == dst_len) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ /* check if current map is start of our segment */
+ if (!found && start_addr_in_map && start_iova_in_map)
+ found = true;
+ /* if we have previously found a segment, add it to the map */
+ if (found) {
+ /* copy the segment into our temporary map */
+ memcpy(&dst[j++], map, sizeof(*map));
+
+ /* if we match end of segment, quit */
+ if (end_addr_in_map && end_iova_in_map)
+ return j;
+ }
}
- return NULL;
+ /* we didn't find anything */
+ ret = -ENOENT;
+err:
+ memset(dst, 0, sizeof(*dst) * dst_len);
+ return ret;
}
/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
- int i, n_merged, cur_idx;
+ int i;
- qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
/* we'll go over the list backwards when merging */
- n_merged = 0;
- for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+ for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
struct user_mem_map *l, *r;
l = &user_mem_maps->maps[i];
@@ -241,30 +336,16 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
if (is_null_map(l) || is_null_map(r))
continue;
+ /* try and merge the maps */
if (merge_map(l, r))
- n_merged++;
+ user_mem_maps->n_maps--;
}
/* the entries are still sorted, but now they have holes in them, so
- * walk through the list and remove the holes
+ * sort the list again.
*/
- if (n_merged > 0) {
- cur_idx = 0;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- if (!is_null_map(&user_mem_maps->maps[i])) {
- struct user_mem_map *src, *dst;
-
- src = &user_mem_maps->maps[i];
- dst = &user_mem_maps->maps[cur_idx++];
-
- if (src != dst) {
- memcpy(dst, src, sizeof(*src));
- memset(src, 0, sizeof(*src));
- }
- }
- }
- user_mem_maps->n_maps = cur_idx;
- }
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
+ sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
static int
@@ -1795,6 +1876,7 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
{
struct user_mem_map *new_map;
struct user_mem_maps *user_mem_maps;
+ bool has_partial_unmap;
int ret = 0;
user_mem_maps = &vfio_cfg->mem_maps;
@@ -1818,11 +1900,16 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
ret = -1;
goto out;
}
+ /* do we have partial unmap support? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
/* create new user mem map entry */
new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
new_map->addr = vaddr;
new_map->iova = iova;
new_map->len = len;
+ /* for IOMMU types supporting partial unmap, we don't need chunking */
+ new_map->chunk = has_partial_unmap ? 0 : len;
compact_user_maps(user_mem_maps);
out:
@@ -1834,38 +1921,81 @@ static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
- struct user_mem_map *map, *new_map = NULL;
+ struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
+ struct user_mem_map new_maps[2]; /* can be at most 2 */
struct user_mem_maps *user_mem_maps;
- int ret = 0;
+ int n_orig, n_new, newlen, ret = 0;
+ bool has_partial_unmap;
user_mem_maps = &vfio_cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
- /* find our mapping */
- map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
- if (!map) {
+ /*
+ * Previously, we had adjacent mappings entirely contained within one
+ * mapping entry. Since we now store original mapping length in some
+ * cases, this is no longer the case, so unmapping can potentially go
+ * over multiple segments and split them in any number of ways.
+ *
+ * To complicate things further, some IOMMU types support arbitrary
+ * partial unmapping, while others will only support unmapping along the
+ * chunk size, so there are a lot of cases we need to handle. To make
+ * things easier code wise, instead of trying to adjust existing
+ * mappings, let's just rebuild them using information we have.
+ */
+
+ /* do we have partial unmap capability? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
+ /*
+ * first thing to do is check if there exists a mapping that includes
+ * the start and the end of our requested unmap. We need to collect all
+ * maps that include our unmapped region.
+ */
+ n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
+ orig_maps, RTE_DIM(orig_maps));
+ /* did we find anything? */
+ if (n_orig < 0) {
RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
rte_errno = EINVAL;
ret = -1;
goto out;
}
- if (map->addr != vaddr || map->iova != iova || map->len != len) {
- /* we're partially unmapping a previously mapped region, so we
- * need to split entry into two.
- */
- if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
+
+ /*
+ * if we don't support partial unmap, we must check if start and end of
+ * current unmap region are chunk-aligned.
+ */
+ if (!has_partial_unmap) {
+ bool start_aligned, end_aligned;
+
+ start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr, iova);
+ end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr + len, iova + len);
+
+ if (!start_aligned || !end_aligned) {
RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
rte_errno = ENOTSUP;
ret = -1;
goto out;
}
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ }
+
+ /*
+ * now we know we can potentially unmap the region, but we still have to
+ * figure out if there is enough space in our list to store remaining
+ * maps. for this, we will figure out how many segments we are going to
+ * remove, and how many new segments we are going to create.
+ */
+ n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);
+
+ /* can we store the new maps in our list? */
+ newlen = (user_mem_maps->n_maps - n_orig) + n_new;
+ if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
/* unmap the entry */
@@ -1886,23 +2016,11 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
}
}
- /* remove map from the list of active mappings */
- if (new_map != NULL) {
- adjust_map(map, new_map, vaddr, len);
-
- /* if we've created a new map by splitting, sort everything */
- if (!is_null_map(new_map)) {
- compact_user_maps(user_mem_maps);
- } else {
- /* we've created a new mapping, but it was unused */
- user_mem_maps->n_maps--;
- }
- } else {
- memset(map, 0, sizeof(*map));
- compact_user_maps(user_mem_maps);
- user_mem_maps->n_maps--;
- }
+ /* we have unmapped the region, so now update the maps */
+ delete_maps(user_mem_maps, orig_maps, n_orig);
+ copy_maps(user_mem_maps, new_maps, n_new);
+ compact_user_maps(user_mem_maps);
out:
rte_spinlock_recursive_unlock(&user_mem_maps->lock);
return ret;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v4 2/2] vhost: enable IOMMU for async vhost
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 0/2] support IOMMU for DMA device Xuan Ding
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
@ 2021-09-25 10:33 ` Xuan Ding
1 sibling, 0 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-25 10:33 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
The use of IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of DMA engine to use
IOMMU if the DMA engine is bound to vfio.
When set memory table, the guest memory will be mapped
into the default container of DPDK.
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/vhost/vhost.h | 4 ++
lib/vhost/vhost_user.c | 112 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 114 insertions(+), 2 deletions(-)
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 89a31e4ca8..bc5695e899 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -370,6 +370,10 @@ struct virtio_net {
int16_t broadcast_rarp;
uint32_t nr_vring;
int async_copy;
+
+ /* Record the dma map status for each region. */
+ bool *async_map_status;
+
int extbuf;
int linearbuf;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 29a4c9af60..10104be18f 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,8 @@
#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
+#include <rte_vfio.h>
+#include <rte_errno.h>
#include "iotlb.h"
#include "vhost.h"
@@ -141,6 +143,63 @@ get_blk_size(int fd)
return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool *dma_map_success, bool do_map)
+{
+ uint64_t host_iova;
+ int ret = 0;
+
+ host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+ if (do_map) {
+ /* Add mapped region into the default container of DPDK. */
+ ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ *dma_map_success = ret == 0;
+
+ if (ret) {
+ /*
+ * DMA device may bind with kernel driver, in this case,
+ * we don't need to program IOMMU manually. However, if no
+ * device is bound with vfio/uio in DPDK, and vfio kernel
+ * module is loaded, the API will still be called and return
+ * with ENODEV/ENOTSUP.
+ *
+ * DPDK VFIO only returns ENODEV/ENOTSUP in very similar
+ * situations (VFIO either unsupported, or supported
+ * but no devices found). Either way, no mappings could be
+ * performed. We treat it as normal case in async path.
+ */
+ if (rte_errno == ENODEV && rte_errno == ENOTSUP)
+ return 0;
+
+ VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+ return ret;
+
+ }
+
+ } else {
+ /* No need to do vfio unmap if the map failed. */
+ if (!*dma_map_success)
+ return 0;
+
+ /* Remove mapped region from the default container of DPDK. */
+ ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+ return ret;
+ }
+ /* Clear the flag once the unmap succeeds. */
+ *dma_map_success = 0;
+ }
+
+ return ret;
+}
+
static void
free_mem_region(struct virtio_net *dev)
{
@@ -153,6 +212,9 @@ free_mem_region(struct virtio_net *dev)
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
if (reg->host_user_addr) {
+ if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+ async_dma_map(reg, &dev->async_map_status[i], false);
+
munmap(reg->mmap_addr, reg->mmap_size);
close(reg->fd);
}
@@ -203,6 +265,11 @@ vhost_backend_cleanup(struct virtio_net *dev)
}
dev->postcopy_listening = 0;
+
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
}
static void
@@ -621,6 +688,17 @@ numa_realloc(struct virtio_net *dev, int index)
}
dev->mem = mem;
+ if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * dev->mem->nregions, 0, node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to realloc dma mapping status on node\n",
+ dev->vid);
+ return dev;
+ }
+ }
+
gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp),
RTE_CACHE_LINE_SIZE, node);
if (!gp) {
@@ -1151,12 +1229,14 @@ vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
static int
vhost_user_mmap_region(struct virtio_net *dev,
struct rte_vhost_mem_region *region,
+ uint32_t region_index,
uint64_t mmap_offset)
{
void *mmap_addr;
uint64_t mmap_size;
uint64_t alignment;
int populate;
+ int ret;
/* Check for memory_size + mmap_offset overflow */
if (mmap_offset >= -region->size) {
@@ -1210,13 +1290,25 @@ vhost_user_mmap_region(struct virtio_net *dev,
region->mmap_size = mmap_size;
region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
- if (dev->async_copy)
+ if (dev->async_copy) {
if (add_guest_pages(dev, region, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region failed.\n");
return -1;
}
+ if (rte_vfio_is_enabled("vfio")) {
+ ret = async_dma_map(region, &dev->async_map_status[region_index], true);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA "
+ "engine failed\n");
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ return -1;
+ }
+ }
+ }
+
VHOST_LOG_CONFIG(INFO,
"guest memory region size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
@@ -1291,6 +1383,11 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
dev->mem = NULL;
}
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
+
/* Flush IOTLB cache as previous HVAs are now invalid */
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
for (i = 0; i < dev->nr_vring; i++)
@@ -1329,6 +1426,17 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
goto free_guest_pages;
}
+ if (dev->async_copy) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * memory->nregions, 0, numa_node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to allocate memory for dma mapping status\n",
+ dev->vid);
+ goto free_guest_pages;
+ }
+ }
+
for (i = 0; i < memory->nregions; i++) {
reg = &dev->mem->regions[i];
@@ -1345,7 +1453,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
mmap_offset = memory->regions[i].mmap_offset;
- if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
+ if (vhost_user_mmap_region(dev, reg, i, mmap_offset) < 0) {
VHOST_LOG_CONFIG(ERR, "Failed to mmap region %u\n", i);
goto free_mem_table;
}
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v3 2/2] vhost: enable IOMMU for async vhost
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 2/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-09-27 4:17 ` Hu, Jiayu
2021-09-27 4:55 ` Ding, Xuan
0 siblings, 1 reply; 40+ messages in thread
From: Hu, Jiayu @ 2021-09-27 4:17 UTC (permalink / raw)
To: Ding, Xuan, dev, Burakov, Anatoly, maxime.coquelin, Xia, Chenbo
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
Hi Xuan,
> -----Original Message-----
> From: Ding, Xuan <xuan.ding@intel.com>
> Sent: Saturday, September 25, 2021 6:04 PM
> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> Subject: [PATCH v3 2/2] vhost: enable IOMMU for async vhost
>
> The use of IOMMU has many advantages, such as isolation and address
> translation. This patch extends the capability of DMA engine to use IOMMU if
> the DMA engine is bound to vfio.
>
> When set memory table, the guest memory will be mapped into the default
> container of DPDK.
>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
> lib/vhost/vhost.h | 4 ++
> lib/vhost/vhost_user.c | 112
> ++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 114 insertions(+), 2 deletions(-)
>
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index
> 89a31e4ca8..bc5695e899 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -370,6 +370,10 @@ struct virtio_net {
> int16_t broadcast_rarp;
> uint32_t nr_vring;
> int async_copy;
> +
> + /* Record the dma map status for each region. */
> + bool *async_map_status;
> +
> int extbuf;
> int linearbuf;
> struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index
> 29a4c9af60..3990e9b057 100644
> --- a/lib/vhost/vhost_user.c
> +++ b/lib/vhost/vhost_user.c
> @@ -45,6 +45,8 @@
> #include <rte_common.h>
> #include <rte_malloc.h>
> #include <rte_log.h>
> +#include <rte_vfio.h>
> +#include <rte_errno.h>
>
> #include "iotlb.h"
> #include "vhost.h"
> @@ -141,6 +143,63 @@ get_blk_size(int fd)
> return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; }
>
> +static int
> +async_dma_map(struct rte_vhost_mem_region *region, bool
> +*dma_map_success, bool do_map) {
> + uint64_t host_iova;
> + int ret = 0;
> +
> + host_iova = rte_mem_virt2iova((void *)(uintptr_t)region-
> >host_user_addr);
> + if (do_map) {
> + /* Add mapped region into the default container of DPDK. */
> + ret =
> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> + region->host_user_addr,
> + host_iova,
> + region->size);
> + *dma_map_success = ret == 0;
> +
> + if (ret) {
> + /*
> + * DMA device may bind with kernel driver, in this
> case,
> + * we don't need to program IOMMU manually.
> However, if no
> + * device is bound with vfio/uio in DPDK, and vfio
> kernel
> + * module is loaded, the API will still be called and
> return
> + * with ENODEV/ENOSUP.
> + *
> + * DPDK VFIO only returns ENODEV/ENOSUP in very
> similar
> + * situations(VFIO either unsupported, or supported
> + * but no devices found). Either way, no mappings
> could be
> + * performed. We treat it as normal case in async
> path.
> + */
> + if (rte_errno == ENODEV && rte_errno == ENOTSUP) {
> + return 0;
> + } else {
> + VHOST_LOG_CONFIG(ERR, "DMA engine map
> failed\n");
> + return ret;
> + }
> + }
> +
> + } else {
> + /* No need to do vfio unmap if the map failed. */
> + if (!*dma_map_success)
> + return 0;
> +
> + /* Remove mapped region from the default container of
> DPDK. */
> + ret =
> rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> + region->host_user_addr,
> + host_iova,
> + region->size);
> + if (ret) {
> + VHOST_LOG_CONFIG(ERR, "DMA engine unmap
> failed\n");
> + return ret;
> + }
> + /* Clear the flag once the unmap succeeds. */
> + *dma_map_success = 0;
> + }
> +
> + return ret;
> +}
> +
> static void
> free_mem_region(struct virtio_net *dev) { @@ -153,6 +212,9 @@
> free_mem_region(struct virtio_net *dev)
> for (i = 0; i < dev->mem->nregions; i++) {
> reg = &dev->mem->regions[i];
> if (reg->host_user_addr) {
> + if (dev->async_copy && rte_vfio_is_enabled("vfio"))
> + async_dma_map(reg, &dev-
> >async_map_status[i], false);
> +
> munmap(reg->mmap_addr, reg->mmap_size);
> close(reg->fd);
> }
> @@ -203,6 +265,11 @@ vhost_backend_cleanup(struct virtio_net *dev)
> }
>
> dev->postcopy_listening = 0;
> +
> + if (dev->async_map_status) {
> + rte_free(dev->async_map_status);
> + dev->async_map_status = NULL;
> + }
> }
>
> static void
> @@ -621,6 +688,17 @@ numa_realloc(struct virtio_net *dev, int index)
> }
> dev->mem = mem;
>
> + if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
> + dev->async_map_status = rte_zmalloc_socket("async-dma-
> map-status",
> + sizeof(bool) * dev->mem->nregions,
> 0, node);
> + if (!dev->async_map_status) {
> + VHOST_LOG_CONFIG(ERR,
> + "(%d) failed to realloc dma mapping status on
> node\n",
> + dev->vid);
> + return dev;
> + }
> + }
> +
> gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages *
> sizeof(*gp),
> RTE_CACHE_LINE_SIZE, node);
> if (!gp) {
> @@ -1151,12 +1229,14 @@ vhost_user_postcopy_register(struct virtio_net
> *dev, int main_fd, static int vhost_user_mmap_region(struct virtio_net *dev,
> struct rte_vhost_mem_region *region,
> + uint32_t region_index,
> uint64_t mmap_offset)
> {
> void *mmap_addr;
> uint64_t mmap_size;
> uint64_t alignment;
> int populate;
> + int ret;
>
> /* Check for memory_size + mmap_offset overflow */
> if (mmap_offset >= -region->size) {
> @@ -1210,13 +1290,25 @@ vhost_user_mmap_region(struct virtio_net *dev,
> region->mmap_size = mmap_size;
> region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
> mmap_offset;
>
> - if (dev->async_copy)
> + if (dev->async_copy) {
> if (add_guest_pages(dev, region, alignment) < 0) {
> VHOST_LOG_CONFIG(ERR,
> "adding guest pages to region
> failed.\n");
> return -1;
> }
>
> + if (rte_vfio_is_enabled("vfio")) {
> + ret = async_dma_map(region, &dev-
> >async_map_status[region_index], true);
> + if (ret) {
> + VHOST_LOG_CONFIG(ERR, "Configure
> IOMMU for DMA "
> + "engine failed\n");
> + rte_free(dev->async_map_status);
> + dev->async_map_status = NULL;
The freed dev->async_map_status is accessed in free_mem_region() later.
You need to free it after calling free_mem_region().
> + return -1;
> + }
> + }
> + }
> +
> VHOST_LOG_CONFIG(INFO,
> "guest memory region size: 0x%" PRIx64 "\n"
> "\t guest physical addr: 0x%" PRIx64 "\n"
> @@ -1291,6 +1383,11 @@ vhost_user_set_mem_table(struct virtio_net
> **pdev, struct VhostUserMsg *msg,
> dev->mem = NULL;
> }
>
> + if (dev->async_map_status) {
> + rte_free(dev->async_map_status);
> + dev->async_map_status = NULL;
> + }
To handle the gust memory hot-plug case, you need to un-map
iommu tables before program iommu for new memory. But you
seem only free the old dev->async_map_status.
Thanks,
Jiayu
> +
> /* Flush IOTLB cache as previous HVAs are now invalid */
> if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> for (i = 0; i < dev->nr_vring; i++)
> @@ -1329,6 +1426,17 @@ vhost_user_set_mem_table(struct virtio_net
> **pdev, struct VhostUserMsg *msg,
> goto free_guest_pages;
> }
>
> + if (dev->async_copy) {
> + dev->async_map_status = rte_zmalloc_socket("async-dma-
> map-status",
> + sizeof(bool) * memory->nregions, 0,
> numa_node);
> + if (!dev->async_map_status) {
> + VHOST_LOG_CONFIG(ERR,
> + "(%d) failed to allocate memory for dma
> mapping status\n",
> + dev->vid);
> + goto free_guest_pages;
> + }
> + }
> +
> for (i = 0; i < memory->nregions; i++) {
> reg = &dev->mem->regions[i];
>
> @@ -1345,7 +1453,7 @@ vhost_user_set_mem_table(struct virtio_net
> **pdev, struct VhostUserMsg *msg,
>
> mmap_offset = memory->regions[i].mmap_offset;
>
> - if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
> + if (vhost_user_mmap_region(dev, reg, i, mmap_offset) < 0) {
> VHOST_LOG_CONFIG(ERR, "Failed to mmap
> region %u\n", i);
> goto free_mem_table;
> }
> --
> 2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v3 2/2] vhost: enable IOMMU for async vhost
2021-09-27 4:17 ` Hu, Jiayu
@ 2021-09-27 4:55 ` Ding, Xuan
0 siblings, 0 replies; 40+ messages in thread
From: Ding, Xuan @ 2021-09-27 4:55 UTC (permalink / raw)
To: Hu, Jiayu, dev, Burakov, Anatoly, maxime.coquelin, Xia, Chenbo
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
Hi Jiayu,
> -----Original Message-----
> From: Hu, Jiayu <jiayu.hu@intel.com>
> Sent: Monday, September 27, 2021 12:18 PM
> To: Ding, Xuan <xuan.ding@intel.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>; maxime.coquelin@redhat.com; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
> Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
> Subject: RE: [PATCH v3 2/2] vhost: enable IOMMU for async vhost
>
> Hi Xuan,
>
> > -----Original Message-----
> > From: Ding, Xuan <xuan.ding@intel.com>
> > Sent: Saturday, September 25, 2021 6:04 PM
> > To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> > Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> > Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> > <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> > YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> > Subject: [PATCH v3 2/2] vhost: enable IOMMU for async vhost
> >
> > The use of IOMMU has many advantages, such as isolation and address
> > translation. This patch extends the capability of DMA engine to use IOMMU if
> > the DMA engine is bound to vfio.
> >
> > When set memory table, the guest memory will be mapped into the default
> > container of DPDK.
> >
> > Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> > ---
> > lib/vhost/vhost.h | 4 ++
> > lib/vhost/vhost_user.c | 112
> > ++++++++++++++++++++++++++++++++++++++++-
> > 2 files changed, 114 insertions(+), 2 deletions(-)
> >
> > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index
> > 89a31e4ca8..bc5695e899 100644
> > --- a/lib/vhost/vhost.h
> > +++ b/lib/vhost/vhost.h
> > @@ -370,6 +370,10 @@ struct virtio_net {
> > int16_tbroadcast_rarp;
> > uint32_tnr_vring;
> > intasync_copy;
> > +
> > +/* Record the dma map status for each region. */
> > +bool*async_map_status;
> > +
> > intextbuf;
> > intlinearbuf;
> > struct vhost_virtqueue*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index
> > 29a4c9af60..3990e9b057 100644
> > --- a/lib/vhost/vhost_user.c
> > +++ b/lib/vhost/vhost_user.c
> > @@ -45,6 +45,8 @@
> > #include <rte_common.h>
> > #include <rte_malloc.h>
> > #include <rte_log.h>
> > +#include <rte_vfio.h>
> > +#include <rte_errno.h>
> >
> > #include "iotlb.h"
> > #include "vhost.h"
> > @@ -141,6 +143,63 @@ get_blk_size(int fd)
> > return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; }
> >
> > +static int
> > +async_dma_map(struct rte_vhost_mem_region *region, bool
> > +*dma_map_success, bool do_map) {
> > +uint64_t host_iova;
> > +int ret = 0;
> > +
> > +host_iova = rte_mem_virt2iova((void *)(uintptr_t)region-
> > >host_user_addr);
> > +if (do_map) {
> > +/* Add mapped region into the default container of DPDK. */
> > +ret =
> > rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> > + region->host_user_addr,
> > + host_iova,
> > + region->size);
> > +*dma_map_success = ret == 0;
> > +
> > +if (ret) {
> > +/*
> > + * DMA device may bind with kernel driver, in this
> > case,
> > + * we don't need to program IOMMU manually.
> > However, if no
> > + * device is bound with vfio/uio in DPDK, and vfio
> > kernel
> > + * module is loaded, the API will still be called and
> > return
> > + * with ENODEV/ENOSUP.
> > + *
> > + * DPDK VFIO only returns ENODEV/ENOSUP in very
> > similar
> > + * situations(VFIO either unsupported, or supported
> > + * but no devices found). Either way, no mappings
> > could be
> > + * performed. We treat it as normal case in async
> > path.
> > + */
> > +if (rte_errno == ENODEV && rte_errno == ENOTSUP) {
> > +return 0;
> > +} else {
> > +VHOST_LOG_CONFIG(ERR, "DMA engine map
> > failed\n");
> > +return ret;
> > +}
> > +}
> > +
> > +} else {
> > +/* No need to do vfio unmap if the map failed. */
> > +if (!*dma_map_success)
> > +return 0;
> > +
> > +/* Remove mapped region from the default container of
> > DPDK. */
> > +ret =
> > rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
> > + region->host_user_addr,
> > + host_iova,
> > + region->size);
> > +if (ret) {
> > +VHOST_LOG_CONFIG(ERR, "DMA engine unmap
> > failed\n");
> > +return ret;
> > +}
> > +/* Clear the flag once the unmap succeeds. */
> > +*dma_map_success = 0;
> > +}
> > +
> > +return ret;
> > +}
> > +
> > static void
> > free_mem_region(struct virtio_net *dev) { @@ -153,6 +212,9 @@
> > free_mem_region(struct virtio_net *dev)
> > for (i = 0; i < dev->mem->nregions; i++) {
> > reg = &dev->mem->regions[i];
> > if (reg->host_user_addr) {
> > +if (dev->async_copy && rte_vfio_is_enabled("vfio"))
> > +async_dma_map(reg, &dev-
> > >async_map_status[i], false);
> > +
> > munmap(reg->mmap_addr, reg->mmap_size);
> > close(reg->fd);
> > }
> > @@ -203,6 +265,11 @@ vhost_backend_cleanup(struct virtio_net *dev)
> > }
> >
> > dev->postcopy_listening = 0;
> > +
> > +if (dev->async_map_status) {
> > +rte_free(dev->async_map_status);
> > +dev->async_map_status = NULL;
> > +}
> > }
> >
> > static void
> > @@ -621,6 +688,17 @@ numa_realloc(struct virtio_net *dev, int index)
> > }
> > dev->mem = mem;
> >
> > +if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
> > +dev->async_map_status = rte_zmalloc_socket("async-dma-
> > map-status",
> > +sizeof(bool) * dev->mem->nregions,
> > 0, node);
> > +if (!dev->async_map_status) {
> > +VHOST_LOG_CONFIG(ERR,
> > +"(%d) failed to realloc dma mapping status on
> > node\n",
> > +dev->vid);
> > +return dev;
> > +}
> > +}
> > +
> > gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages *
> > sizeof(*gp),
> > RTE_CACHE_LINE_SIZE, node);
> > if (!gp) {
> > @@ -1151,12 +1229,14 @@ vhost_user_postcopy_register(struct virtio_net
> > *dev, int main_fd, static int vhost_user_mmap_region(struct virtio_net *dev,
> > struct rte_vhost_mem_region *region,
> > +uint32_t region_index,
> > uint64_t mmap_offset)
> > {
> > void *mmap_addr;
> > uint64_t mmap_size;
> > uint64_t alignment;
> > int populate;
> > +int ret;
> >
> > /* Check for memory_size + mmap_offset overflow */
> > if (mmap_offset >= -region->size) {
> > @@ -1210,13 +1290,25 @@ vhost_user_mmap_region(struct virtio_net *dev,
> > region->mmap_size = mmap_size;
> > region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
> > mmap_offset;
> >
> > -if (dev->async_copy)
> > +if (dev->async_copy) {
> > if (add_guest_pages(dev, region, alignment) < 0) {
> > VHOST_LOG_CONFIG(ERR,
> > "adding guest pages to region
> > failed.\n");
> > return -1;
> > }
> >
> > +if (rte_vfio_is_enabled("vfio")) {
> > +ret = async_dma_map(region, &dev-
> > >async_map_status[region_index], true);
> > +if (ret) {
> > +VHOST_LOG_CONFIG(ERR, "Configure
> > IOMMU for DMA "
> > +"engine failed\n");
> > +rte_free(dev->async_map_status);
> > +dev->async_map_status = NULL;
>
> The freed dev->async_map_status is accessed in free_mem_region() later.
> You need to free it after calling free_mem_region().
Thanks for the catch! Will fix it in next version.
>
> > +return -1;
> > +}
> > +}
> > +}
> > +
> > VHOST_LOG_CONFIG(INFO,
> > "guest memory region size: 0x%" PRIx64 "\n"
> > "\t guest physical addr: 0x%" PRIx64 "\n"
> > @@ -1291,6 +1383,11 @@ vhost_user_set_mem_table(struct virtio_net
> > **pdev, struct VhostUserMsg *msg,
> > dev->mem = NULL;
> > }
> >
> > +if (dev->async_map_status) {
> > +rte_free(dev->async_map_status);
> > +dev->async_map_status = NULL;
> > +}
>
> To handle the guest memory hot-plug case, you need to un-map
> iommu tables before program iommu for new memory. But you
> seem only free the old dev->async_map_status.
Yes, you are right. Will unmap the region in iommu table in hot-plug scenario.
Regards,
Xuan
>
> Thanks,
> Jiayu
>
> > +
> > /* Flush IOTLB cache as previous HVAs are now invalid */
> > if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> > for (i = 0; i < dev->nr_vring; i++)
> > @@ -1329,6 +1426,17 @@ vhost_user_set_mem_table(struct virtio_net
> > **pdev, struct VhostUserMsg *msg,
> > goto free_guest_pages;
> > }
> >
> > +if (dev->async_copy) {
> > +dev->async_map_status = rte_zmalloc_socket("async-dma-
> > map-status",
> > +sizeof(bool) * memory->nregions, 0,
> > numa_node);
> > +if (!dev->async_map_status) {
> > +VHOST_LOG_CONFIG(ERR,
> > +"(%d) failed to allocate memory for dma
> > mapping status\n",
> > +dev->vid);
> > +goto free_guest_pages;
> > +}
> > +}
> > +
> > for (i = 0; i < memory->nregions; i++) {
> > reg = &dev->mem->regions[i];
> >
> > @@ -1345,7 +1453,7 @@ vhost_user_set_mem_table(struct virtio_net
> > **pdev, struct VhostUserMsg *msg,
> >
> > mmap_offset = memory->regions[i].mmap_offset;
> >
> > -if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
> > +if (vhost_user_mmap_region(dev, reg, i, mmap_offset) < 0) {
> > VHOST_LOG_CONFIG(ERR, "Failed to mmap
> > region %u\n", i);
> > goto free_mem_table;
> > }
> > --
> > 2.17.1
>
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v5 0/2] support IOMMU for DMA device
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
` (4 preceding siblings ...)
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-27 7:48 ` Xuan Ding
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 0/2] support IOMMU for DMA device Xuan Ding
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Xuan Ding
7 siblings, 2 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-27 7:48 UTC (permalink / raw)
To: dev, maxime.coquelin, chenbo.xia, anatoly.burakov
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, YvonneX.Yang, Xuan Ding
This series supports DMA device to use vfio in async vhost.
The first patch extends the capability of current vfio dma mapping
API to allow partial unmapping for adjacent memory if the platform
does not support partial unmapping. The second patch involves the
IOMMU programming for guest memory in async vhost.
v5:
* Fix issue of a pointer be freed early.
v4:
* Fix a format issue.
v3:
* Move the async_map_status flag to virtio_net structure to avoid
ABI breaking.
v2:
* Add rte_errno filtering for some devices bound in the kernel driver.
* Add a flag to check the status of region mapping.
* Fix one typo.
Xuan Ding (2):
vfio: allow partially unmapping adjacent memory
vhost: enable IOMMU for async vhost
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
lib/vhost/vhost.h | 4 +
lib/vhost/vhost_user.c | 114 ++++++++++++-
3 files changed, 344 insertions(+), 112 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v5 1/2] vfio: allow partially unmapping adjacent memory
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-27 7:48 ` Xuan Ding
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 2/2] vhost: enable IOMMU for async vhost Xuan Ding
1 sibling, 0 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-27 7:48 UTC (permalink / raw)
To: dev, maxime.coquelin, chenbo.xia, anatoly.burakov
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, YvonneX.Yang, Xuan Ding
Currently, if we map a memory area A, then map a separate memory area B
that by coincidence happens to be adjacent to A, current implementation
will merge these two segments into one, and if partial unmapping is not
supported, these segments will then be only allowed to be unmapped in
one go. In other words, given segments A and B that are adjacent, it
is currently not possible to map A, then map B, then unmap A.
Fix this by adding a notion of "chunk size", which will allow
subdividing segments into equally sized segments whenever we are dealing
with an IOMMU that does not support partial unmapping. With this change,
we will still be able to merge adjacent segments, but only if they are
of the same size. If we keep with our above example, adjacent segments A
and B will be stored as separate segments if they are of different
sizes.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
1 file changed, 228 insertions(+), 110 deletions(-)
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 25add2fa5d..657c89ca58 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -31,9 +31,10 @@
*/
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
- uint64_t addr;
- uint64_t iova;
- uint64_t len;
+ uint64_t addr; /**< start VA */
+ uint64_t iova; /**< start IOVA */
+ uint64_t len; /**< total length of the mapping */
+ uint64_t chunk; /**< this mapping can be split in chunks of this size */
};
struct user_mem_maps {
@@ -95,7 +96,8 @@ static const struct vfio_iommu_type iommu_types[] = {
static int
is_null_map(const struct user_mem_map *map)
{
- return map->addr == 0 && map->iova == 0 && map->len == 0;
+ return map->addr == 0 && map->iova == 0 &&
+ map->len == 0 && map->chunk == 0;
}
/* we may need to merge user mem maps together in case of user mapping/unmapping
@@ -129,41 +131,90 @@ user_mem_map_cmp(const void *a, const void *b)
if (umm_a->len > umm_b->len)
return 1;
+ if (umm_a->chunk < umm_b->chunk)
+ return -1;
+ if (umm_a->chunk > umm_b->chunk)
+ return 1;
+
return 0;
}
-/* adjust user map entry. this may result in shortening of existing map, or in
- * splitting existing map in two pieces.
+/*
+ * Take in an address range and list of current mappings, and produce a list of
+ * mappings that will be kept.
*/
+static int
+process_maps(struct user_mem_map *src, size_t src_len,
+ struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
+{
+ struct user_mem_map *src_first = &src[0];
+ struct user_mem_map *src_last = &src[src_len - 1];
+ struct user_mem_map *dst_first = &newmap[0];
+ /* we can get at most two new segments */
+ struct user_mem_map *dst_last = &newmap[1];
+ uint64_t first_off = vaddr - src_first->addr;
+ uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
+ int newmap_len = 0;
+
+ if (first_off != 0) {
+ dst_first->addr = src_first->addr;
+ dst_first->iova = src_first->iova;
+ dst_first->len = first_off;
+ dst_first->chunk = src_first->chunk;
+
+ newmap_len++;
+ }
+ if (last_off != 0) {
+ /* if we had start offset, we have two segments */
+ struct user_mem_map *last =
+ first_off == 0 ? dst_first : dst_last;
+ last->addr = (src_last->addr + src_last->len) - last_off;
+ last->iova = (src_last->iova + src_last->len) - last_off;
+ last->len = last_off;
+ last->chunk = src_last->chunk;
+
+ newmap_len++;
+ }
+ return newmap_len;
+}
+
+/* erase certain maps from the list */
static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
- uint64_t remove_va_start, uint64_t remove_len)
-{
- /* if va start is same as start address, we're simply moving start */
- if (remove_va_start == src->addr) {
- src->addr += remove_len;
- src->iova += remove_len;
- src->len -= remove_len;
- } else if (remove_va_start + remove_len == src->addr + src->len) {
- /* we're shrinking mapping from the end */
- src->len -= remove_len;
- } else {
- /* we're blowing a hole in the middle */
- struct user_mem_map tmp;
- uint64_t total_len = src->len;
+delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
+ size_t n_del)
+{
+ int i;
+ size_t j;
+
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &del_maps[j];
- /* adjust source segment length */
- src->len = remove_va_start - src->addr;
+ if (user_mem_map_cmp(left, right) == 0) {
+ memset(left, 0, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps--;
+ }
+ }
+}
+
+static void
+copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
+ size_t n_add)
+{
+ int i;
+ size_t j;
- /* create temporary segment in the middle */
- tmp.addr = src->addr + src->len;
- tmp.iova = src->iova + src->len;
- tmp.len = remove_len;
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &add_maps[j];
- /* populate end segment - this one we will be keeping */
- end->addr = tmp.addr + tmp.len;
- end->iova = tmp.iova + tmp.len;
- end->len = total_len - src->len - tmp.len;
+ /* insert into empty space */
+ if (is_null_map(left)) {
+ memcpy(left, right, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps++;
+ }
}
}
@@ -179,7 +230,8 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 0;
if (left->iova + left->len != right->iova)
return 0;
-
+ if (left->chunk != right->chunk)
+ return 0;
left->len += right->len;
out:
@@ -188,51 +240,94 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 1;
}
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
- uint64_t iova, uint64_t len)
+static bool
+addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
+ uint64_t vaddr, uint64_t iova)
+{
+ unsigned int i;
+
+ for (i = 0; i < n_maps; i++) {
+ struct user_mem_map *map = &maps[i];
+ uint64_t map_va_end = map->addr + map->len;
+ uint64_t map_iova_end = map->iova + map->len;
+ uint64_t map_va_off = vaddr - map->addr;
+ uint64_t map_iova_off = iova - map->iova;
+
+ /* we include end of the segment in comparison as well */
+ bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
+ bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
+ /* chunk may not be power of two, so use modulo */
+ bool addr_is_aligned = (map_va_off % map->chunk) == 0;
+ bool iova_is_aligned = (map_iova_off % map->chunk) == 0;
+
+ if (addr_in_map && iova_in_map &&
+ addr_is_aligned && iova_is_aligned)
+ return true;
+ }
+ return false;
+}
+
+static int
+find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
+ uint64_t iova, uint64_t len, struct user_mem_map *dst,
+ size_t dst_len)
{
uint64_t va_end = addr + len;
uint64_t iova_end = iova + len;
- int i;
+ bool found = false;
+ size_t j;
+ int i, ret;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
+ for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
struct user_mem_map *map = &user_mem_maps->maps[i];
uint64_t map_va_end = map->addr + map->len;
uint64_t map_iova_end = map->iova + map->len;
- /* check start VA */
- if (addr < map->addr || addr >= map_va_end)
- continue;
- /* check if VA end is within boundaries */
- if (va_end <= map->addr || va_end > map_va_end)
- continue;
-
- /* check start IOVA */
- if (iova < map->iova || iova >= map_iova_end)
- continue;
- /* check if IOVA end is within boundaries */
- if (iova_end <= map->iova || iova_end > map_iova_end)
- continue;
-
- /* we've found our map */
- return map;
+ bool start_addr_in_map = (addr >= map->addr) &&
+ (addr < map_va_end);
+ bool end_addr_in_map = (va_end > map->addr) &&
+ (va_end <= map_va_end);
+ bool start_iova_in_map = (iova >= map->iova) &&
+ (iova < map_iova_end);
+ bool end_iova_in_map = (iova_end > map->iova) &&
+ (iova_end <= map_iova_end);
+
+ /* do we have space in temporary map? */
+ if (j == dst_len) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ /* check if current map is start of our segment */
+ if (!found && start_addr_in_map && start_iova_in_map)
+ found = true;
+ /* if we have previously found a segment, add it to the map */
+ if (found) {
+ /* copy the segment into our temporary map */
+ memcpy(&dst[j++], map, sizeof(*map));
+
+ /* if we match end of segment, quit */
+ if (end_addr_in_map && end_iova_in_map)
+ return j;
+ }
}
- return NULL;
+ /* we didn't find anything */
+ ret = -ENOENT;
+err:
+ memset(dst, 0, sizeof(*dst) * dst_len);
+ return ret;
}
/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
- int i, n_merged, cur_idx;
+ int i;
- qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
/* we'll go over the list backwards when merging */
- n_merged = 0;
- for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+ for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
struct user_mem_map *l, *r;
l = &user_mem_maps->maps[i];
@@ -241,30 +336,16 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
if (is_null_map(l) || is_null_map(r))
continue;
+ /* try and merge the maps */
if (merge_map(l, r))
- n_merged++;
+ user_mem_maps->n_maps--;
}
/* the entries are still sorted, but now they have holes in them, so
- * walk through the list and remove the holes
+ * sort the list again.
*/
- if (n_merged > 0) {
- cur_idx = 0;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- if (!is_null_map(&user_mem_maps->maps[i])) {
- struct user_mem_map *src, *dst;
-
- src = &user_mem_maps->maps[i];
- dst = &user_mem_maps->maps[cur_idx++];
-
- if (src != dst) {
- memcpy(dst, src, sizeof(*src));
- memset(src, 0, sizeof(*src));
- }
- }
- }
- user_mem_maps->n_maps = cur_idx;
- }
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
+ sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
static int
@@ -1795,6 +1876,7 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
{
struct user_mem_map *new_map;
struct user_mem_maps *user_mem_maps;
+ bool has_partial_unmap;
int ret = 0;
user_mem_maps = &vfio_cfg->mem_maps;
@@ -1818,11 +1900,16 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
ret = -1;
goto out;
}
+ /* do we have partial unmap support? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
/* create new user mem map entry */
new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
new_map->addr = vaddr;
new_map->iova = iova;
new_map->len = len;
+ /* for IOMMU types supporting partial unmap, we don't need chunking */
+ new_map->chunk = has_partial_unmap ? 0 : len;
compact_user_maps(user_mem_maps);
out:
@@ -1834,38 +1921,81 @@ static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
- struct user_mem_map *map, *new_map = NULL;
+ struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
+ struct user_mem_map new_maps[2]; /* can be at most 2 */
struct user_mem_maps *user_mem_maps;
- int ret = 0;
+ int n_orig, n_new, newlen, ret = 0;
+ bool has_partial_unmap;
user_mem_maps = &vfio_cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
- /* find our mapping */
- map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
- if (!map) {
+ /*
+ * Previously, we had adjacent mappings entirely contained within one
+ * mapping entry. Since we now store original mapping length in some
+ * cases, this is no longer the case, so unmapping can potentially go
+ * over multiple segments and split them in any number of ways.
+ *
+ * To complicate things further, some IOMMU types support arbitrary
+ * partial unmapping, while others will only support unmapping along the
+ * chunk size, so there are a lot of cases we need to handle. To make
+ * things easier code wise, instead of trying to adjust existing
+ * mappings, let's just rebuild them using information we have.
+ */
+
+ /* do we have partial unmap capability? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
+ /*
+ * first thing to do is check if there exists a mapping that includes
+ * the start and the end of our requested unmap. We need to collect all
+ * maps that include our unmapped region.
+ */
+ n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
+ orig_maps, RTE_DIM(orig_maps));
+ /* did we find anything? */
+ if (n_orig < 0) {
RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
rte_errno = EINVAL;
ret = -1;
goto out;
}
- if (map->addr != vaddr || map->iova != iova || map->len != len) {
- /* we're partially unmapping a previously mapped region, so we
- * need to split entry into two.
- */
- if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
+
+ /*
+ * if we don't support partial unmap, we must check if start and end of
+ * current unmap region are chunk-aligned.
+ */
+ if (!has_partial_unmap) {
+ bool start_aligned, end_aligned;
+
+ start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr, iova);
+ end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr + len, iova + len);
+
+ if (!start_aligned || !end_aligned) {
RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
rte_errno = ENOTSUP;
ret = -1;
goto out;
}
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ }
+
+ /*
+ * now we know we can potentially unmap the region, but we still have to
+ * figure out if there is enough space in our list to store remaining
+ * maps. for this, we will figure out how many segments we are going to
+ * remove, and how many new segments we are going to create.
+ */
+ n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);
+
+ /* can we store the new maps in our list? */
+ newlen = (user_mem_maps->n_maps - n_orig) + n_new;
+ if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
/* unmap the entry */
@@ -1886,23 +2016,11 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
}
}
- /* remove map from the list of active mappings */
- if (new_map != NULL) {
- adjust_map(map, new_map, vaddr, len);
-
- /* if we've created a new map by splitting, sort everything */
- if (!is_null_map(new_map)) {
- compact_user_maps(user_mem_maps);
- } else {
- /* we've created a new mapping, but it was unused */
- user_mem_maps->n_maps--;
- }
- } else {
- memset(map, 0, sizeof(*map));
- compact_user_maps(user_mem_maps);
- user_mem_maps->n_maps--;
- }
+ /* we have unmapped the region, so now update the maps */
+ delete_maps(user_mem_maps, orig_maps, n_orig);
+ copy_maps(user_mem_maps, new_maps, n_new);
+ compact_user_maps(user_mem_maps);
out:
rte_spinlock_recursive_unlock(&user_mem_maps->lock);
return ret;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v5 2/2] vhost: enable IOMMU for async vhost
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 0/2] support IOMMU for DMA device Xuan Ding
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
@ 2021-09-27 7:48 ` Xuan Ding
2021-09-27 12:13 ` Burakov, Anatoly
1 sibling, 1 reply; 40+ messages in thread
From: Xuan Ding @ 2021-09-27 7:48 UTC (permalink / raw)
To: dev, maxime.coquelin, chenbo.xia, anatoly.burakov
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, YvonneX.Yang, Xuan Ding
The use of IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of DMA engine to use
IOMMU if the DMA engine is bound to vfio.
When set memory table, the guest memory will be mapped
into the default container of DPDK.
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/vhost/vhost.h | 4 ++
lib/vhost/vhost_user.c | 114 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 116 insertions(+), 2 deletions(-)
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index d98ca8adfa..8b8df3897b 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -370,6 +370,10 @@ struct virtio_net {
int16_t broadcast_rarp;
uint32_t nr_vring;
int async_copy;
+
+ /* Record the dma map status for each region. */
+ bool *async_map_status;
+
int extbuf;
int linearbuf;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 29a4c9af60..7ffb679304 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,8 @@
#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
+#include <rte_vfio.h>
+#include <rte_errno.h>
#include "iotlb.h"
#include "vhost.h"
@@ -141,6 +143,63 @@ get_blk_size(int fd)
return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool *dma_map_success, bool do_map)
+{
+ uint64_t host_iova;
+ int ret = 0;
+
+ host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+ if (do_map) {
+ /* Add mapped region into the default container of DPDK. */
+ ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ *dma_map_success = ret == 0;
+
+ if (ret) {
+ /*
+ * DMA device may bind with kernel driver, in this case,
+ * we don't need to program IOMMU manually. However, if no
+ * device is bound with vfio/uio in DPDK, and vfio kernel
+ * module is loaded, the API will still be called and return
+ * with ENODEV/ENOSUP.
+ *
+ * DPDK VFIO only returns ENODEV/ENOSUP in very similar
+ * situations(VFIO either unsupported, or supported
+ * but no devices found). Either way, no mappings could be
+ * performed. We treat it as normal case in async path.
+ */
+ if (rte_errno == ENODEV && rte_errno == ENOTSUP)
+ return 0;
+
+ VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+ return ret;
+
+ }
+
+ } else {
+ /* No need to do vfio unmap if the map failed. */
+ if (!*dma_map_success)
+ return 0;
+
+ /* Remove mapped region from the default container of DPDK. */
+ ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+ return ret;
+ }
+ /* Clear the flag once the unmap succeeds. */
+ *dma_map_success = 0;
+ }
+
+ return ret;
+}
+
static void
free_mem_region(struct virtio_net *dev)
{
@@ -153,6 +212,9 @@ free_mem_region(struct virtio_net *dev)
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
if (reg->host_user_addr) {
+ if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+ async_dma_map(reg, &dev->async_map_status[i], false);
+
munmap(reg->mmap_addr, reg->mmap_size);
close(reg->fd);
}
@@ -203,6 +265,11 @@ vhost_backend_cleanup(struct virtio_net *dev)
}
dev->postcopy_listening = 0;
+
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
}
static void
@@ -621,6 +688,17 @@ numa_realloc(struct virtio_net *dev, int index)
}
dev->mem = mem;
+ if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * dev->mem->nregions, 0, node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to realloc dma mapping status on node\n",
+ dev->vid);
+ return dev;
+ }
+ }
+
gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp),
RTE_CACHE_LINE_SIZE, node);
if (!gp) {
@@ -1151,12 +1229,14 @@ vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
static int
vhost_user_mmap_region(struct virtio_net *dev,
struct rte_vhost_mem_region *region,
+ uint32_t region_index,
uint64_t mmap_offset)
{
void *mmap_addr;
uint64_t mmap_size;
uint64_t alignment;
int populate;
+ int ret;
/* Check for memory_size + mmap_offset overflow */
if (mmap_offset >= -region->size) {
@@ -1210,13 +1290,23 @@ vhost_user_mmap_region(struct virtio_net *dev,
region->mmap_size = mmap_size;
region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
- if (dev->async_copy)
+ if (dev->async_copy) {
if (add_guest_pages(dev, region, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region failed.\n");
return -1;
}
+ if (rte_vfio_is_enabled("vfio")) {
+ ret = async_dma_map(region, &dev->async_map_status[region_index], true);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA "
+ "engine failed\n");
+ return -1;
+ }
+ }
+ }
+
VHOST_LOG_CONFIG(INFO,
"guest memory region size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
@@ -1289,6 +1379,11 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
+
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
}
/* Flush IOTLB cache as previous HVAs are now invalid */
@@ -1329,6 +1424,17 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
goto free_guest_pages;
}
+ if (dev->async_copy) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * memory->nregions, 0, numa_node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to allocate memory for dma mapping status\n",
+ dev->vid);
+ goto free_mem_table;
+ }
+ }
+
for (i = 0; i < memory->nregions; i++) {
reg = &dev->mem->regions[i];
@@ -1345,7 +1451,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
mmap_offset = memory->regions[i].mmap_offset;
- if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
+ if (vhost_user_mmap_region(dev, reg, i, mmap_offset) < 0) {
VHOST_LOG_CONFIG(ERR, "Failed to mmap region %u\n", i);
goto free_mem_table;
}
@@ -1393,6 +1499,10 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
free_guest_pages:
rte_free(dev->guest_pages);
dev->guest_pages = NULL;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v5 2/2] vhost: enable IOMMU for async vhost
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 2/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-09-27 12:13 ` Burakov, Anatoly
2021-09-28 9:03 ` Ding, Xuan
0 siblings, 1 reply; 40+ messages in thread
From: Burakov, Anatoly @ 2021-09-27 12:13 UTC (permalink / raw)
To: Xuan Ding, dev, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, YvonneX.Yang
On 27-Sep-21 8:48 AM, Xuan Ding wrote:
> The use of IOMMU has many advantages, such as isolation and address
> translation. This patch extends the capability of DMA engine to use
> IOMMU if the DMA engine is bound to vfio.
>
> When set memory table, the guest memory will be mapped
> into the default container of DPDK.
>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
> /* Flush IOTLB cache as previous HVAs are now invalid */
> @@ -1329,6 +1424,17 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
> goto free_guest_pages;
> }
>
> + if (dev->async_copy) {
> + dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
> + sizeof(bool) * memory->nregions, 0, numa_node);
Would it be useful to sanity check this to make sure we're not leaking
memory?
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v5 2/2] vhost: enable IOMMU for async vhost
2021-09-27 12:13 ` Burakov, Anatoly
@ 2021-09-28 9:03 ` Ding, Xuan
0 siblings, 0 replies; 40+ messages in thread
From: Ding, Xuan @ 2021-09-28 9:03 UTC (permalink / raw)
To: Burakov, Anatoly, dev, maxime.coquelin, Xia, Chenbo
Cc: Hu, Jiayu, Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang,
Yinan, Yang, YvonneX
Hi Anatoly,
> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> Sent: Monday, September 27, 2021 8:14 PM
> To: Ding, Xuan <xuan.ding@intel.com>; dev@dpdk.org;
> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang, YvonneX
> <yvonnex.yang@intel.com>
> Subject: Re: [PATCH v5 2/2] vhost: enable IOMMU for async vhost
>
> On 27-Sep-21 8:48 AM, Xuan Ding wrote:
> > The use of IOMMU has many advantages, such as isolation and address
> > translation. This patch extends the capability of DMA engine to use
> > IOMMU if the DMA engine is bound to vfio.
> >
> > When set memory table, the guest memory will be mapped
> > into the default container of DPDK.
> >
> > Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> > ---
>
>
> > /* Flush IOTLB cache as previous HVAs are now invalid */
> > @@ -1329,6 +1424,17 @@ vhost_user_set_mem_table(struct virtio_net
> **pdev, struct VhostUserMsg *msg,
> > goto free_guest_pages;
> > }
> >
> > + if (dev->async_copy) {
> > + dev->async_map_status = rte_zmalloc_socket("async-dma-
> map-status",
> > + sizeof(bool) * memory->nregions, 0,
> numa_node);
>
> Would it be useful to sanity check this to make sure we're not leaking
> memory?
Thanks for the catch, will add the check in next version.
Regards,
Xuan
>
> --
> Thanks,
> Anatoly
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v6 0/2] support IOMMU for DMA device
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
` (5 preceding siblings ...)
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-29 2:41 ` Xuan Ding
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Xuan Ding
7 siblings, 2 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-29 2:41 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
This series supports DMA device to use vfio in async vhost.
The first patch extends the capability of current vfio dma mapping
API to allow partial unmapping for adjacent memory if the platform
does not support partial unmapping. The second patch involves the
IOMMU programming for guest memory in async vhost.
v6:
* Fix a potential memory leak.
v5:
* Fix issue of a pointer being freed early.
v4:
* Fix a format issue.
v3:
* Move the async_map_status flag to virtio_net structure to avoid
ABI breaking.
v2:
* Add rte_errno filtering for some devices bound to the kernel driver.
* Add a flag to check the status of region mapping.
* Fix one typo.
Xuan Ding (2):
vfio: allow partially unmapping adjacent memory
vhost: enable IOMMU for async vhost
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
lib/vhost/vhost.h | 4 +
lib/vhost/vhost_user.c | 116 +++++++++++++-
3 files changed, 346 insertions(+), 112 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v6 1/2] vfio: allow partially unmapping adjacent memory
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-09-29 2:41 ` Xuan Ding
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost Xuan Ding
1 sibling, 0 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-29 2:41 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
Currently, if we map a memory area A, then map a separate memory area B
that by coincidence happens to be adjacent to A, current implementation
will merge these two segments into one, and if partial unmapping is not
supported, these segments will then be only allowed to be unmapped in
one go. In other words, given segments A and B that are adjacent, it
is currently not possible to map A, then map B, then unmap A.
Fix this by adding a notion of "chunk size", which will allow
subdividing segments into equally sized segments whenever we are dealing
with an IOMMU that does not support partial unmapping. With this change,
we will still be able to merge adjacent segments, but only if they are
of the same size. If we keep with our above example, adjacent segments A
and B will be stored as separate segments if they are of different
sizes.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
1 file changed, 228 insertions(+), 110 deletions(-)
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 25add2fa5d..657c89ca58 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -31,9 +31,10 @@
*/
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
- uint64_t addr;
- uint64_t iova;
- uint64_t len;
+ uint64_t addr; /**< start VA */
+ uint64_t iova; /**< start IOVA */
+ uint64_t len; /**< total length of the mapping */
+ uint64_t chunk; /**< this mapping can be split in chunks of this size */
};
struct user_mem_maps {
@@ -95,7 +96,8 @@ static const struct vfio_iommu_type iommu_types[] = {
static int
is_null_map(const struct user_mem_map *map)
{
- return map->addr == 0 && map->iova == 0 && map->len == 0;
+ return map->addr == 0 && map->iova == 0 &&
+ map->len == 0 && map->chunk == 0;
}
/* we may need to merge user mem maps together in case of user mapping/unmapping
@@ -129,41 +131,90 @@ user_mem_map_cmp(const void *a, const void *b)
if (umm_a->len > umm_b->len)
return 1;
+ if (umm_a->chunk < umm_b->chunk)
+ return -1;
+ if (umm_a->chunk > umm_b->chunk)
+ return 1;
+
return 0;
}
-/* adjust user map entry. this may result in shortening of existing map, or in
- * splitting existing map in two pieces.
+/*
+ * Take in an address range and list of current mappings, and produce a list of
+ * mappings that will be kept.
*/
+static int
+process_maps(struct user_mem_map *src, size_t src_len,
+ struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
+{
+ struct user_mem_map *src_first = &src[0];
+ struct user_mem_map *src_last = &src[src_len - 1];
+ struct user_mem_map *dst_first = &newmap[0];
+ /* we can get at most two new segments */
+ struct user_mem_map *dst_last = &newmap[1];
+ uint64_t first_off = vaddr - src_first->addr;
+ uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
+ int newmap_len = 0;
+
+ if (first_off != 0) {
+ dst_first->addr = src_first->addr;
+ dst_first->iova = src_first->iova;
+ dst_first->len = first_off;
+ dst_first->chunk = src_first->chunk;
+
+ newmap_len++;
+ }
+ if (last_off != 0) {
+ /* if we had start offset, we have two segments */
+ struct user_mem_map *last =
+ first_off == 0 ? dst_first : dst_last;
+ last->addr = (src_last->addr + src_last->len) - last_off;
+ last->iova = (src_last->iova + src_last->len) - last_off;
+ last->len = last_off;
+ last->chunk = src_last->chunk;
+
+ newmap_len++;
+ }
+ return newmap_len;
+}
+
+/* erase certain maps from the list */
static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
- uint64_t remove_va_start, uint64_t remove_len)
-{
- /* if va start is same as start address, we're simply moving start */
- if (remove_va_start == src->addr) {
- src->addr += remove_len;
- src->iova += remove_len;
- src->len -= remove_len;
- } else if (remove_va_start + remove_len == src->addr + src->len) {
- /* we're shrinking mapping from the end */
- src->len -= remove_len;
- } else {
- /* we're blowing a hole in the middle */
- struct user_mem_map tmp;
- uint64_t total_len = src->len;
+delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
+ size_t n_del)
+{
+ int i;
+ size_t j;
+
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &del_maps[j];
- /* adjust source segment length */
- src->len = remove_va_start - src->addr;
+ if (user_mem_map_cmp(left, right) == 0) {
+ memset(left, 0, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps--;
+ }
+ }
+}
+
+static void
+copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
+ size_t n_add)
+{
+ int i;
+ size_t j;
- /* create temporary segment in the middle */
- tmp.addr = src->addr + src->len;
- tmp.iova = src->iova + src->len;
- tmp.len = remove_len;
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &add_maps[j];
- /* populate end segment - this one we will be keeping */
- end->addr = tmp.addr + tmp.len;
- end->iova = tmp.iova + tmp.len;
- end->len = total_len - src->len - tmp.len;
+ /* insert into empty space */
+ if (is_null_map(left)) {
+ memcpy(left, right, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps++;
+ }
}
}
@@ -179,7 +230,8 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 0;
if (left->iova + left->len != right->iova)
return 0;
-
+ if (left->chunk != right->chunk)
+ return 0;
left->len += right->len;
out:
@@ -188,51 +240,94 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 1;
}
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
- uint64_t iova, uint64_t len)
+static bool
+addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
+ uint64_t vaddr, uint64_t iova)
+{
+ unsigned int i;
+
+ for (i = 0; i < n_maps; i++) {
+ struct user_mem_map *map = &maps[i];
+ uint64_t map_va_end = map->addr + map->len;
+ uint64_t map_iova_end = map->iova + map->len;
+ uint64_t map_va_off = vaddr - map->addr;
+ uint64_t map_iova_off = iova - map->iova;
+
+ /* we include end of the segment in comparison as well */
+ bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
+ bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
+ /* chunk may not be power of two, so use modulo */
+ bool addr_is_aligned = (map_va_off % map->chunk) == 0;
+ bool iova_is_aligned = (map_iova_off % map->chunk) == 0;
+
+ if (addr_in_map && iova_in_map &&
+ addr_is_aligned && iova_is_aligned)
+ return true;
+ }
+ return false;
+}
+
+static int
+find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
+ uint64_t iova, uint64_t len, struct user_mem_map *dst,
+ size_t dst_len)
{
uint64_t va_end = addr + len;
uint64_t iova_end = iova + len;
- int i;
+ bool found = false;
+ size_t j;
+ int i, ret;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
+ for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
struct user_mem_map *map = &user_mem_maps->maps[i];
uint64_t map_va_end = map->addr + map->len;
uint64_t map_iova_end = map->iova + map->len;
- /* check start VA */
- if (addr < map->addr || addr >= map_va_end)
- continue;
- /* check if VA end is within boundaries */
- if (va_end <= map->addr || va_end > map_va_end)
- continue;
-
- /* check start IOVA */
- if (iova < map->iova || iova >= map_iova_end)
- continue;
- /* check if IOVA end is within boundaries */
- if (iova_end <= map->iova || iova_end > map_iova_end)
- continue;
-
- /* we've found our map */
- return map;
+ bool start_addr_in_map = (addr >= map->addr) &&
+ (addr < map_va_end);
+ bool end_addr_in_map = (va_end > map->addr) &&
+ (va_end <= map_va_end);
+ bool start_iova_in_map = (iova >= map->iova) &&
+ (iova < map_iova_end);
+ bool end_iova_in_map = (iova_end > map->iova) &&
+ (iova_end <= map_iova_end);
+
+ /* do we have space in temporary map? */
+ if (j == dst_len) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ /* check if current map is start of our segment */
+ if (!found && start_addr_in_map && start_iova_in_map)
+ found = true;
+ /* if we have previously found a segment, add it to the map */
+ if (found) {
+ /* copy the segment into our temporary map */
+ memcpy(&dst[j++], map, sizeof(*map));
+
+ /* if we match end of segment, quit */
+ if (end_addr_in_map && end_iova_in_map)
+ return j;
+ }
}
- return NULL;
+ /* we didn't find anything */
+ ret = -ENOENT;
+err:
+ memset(dst, 0, sizeof(*dst) * dst_len);
+ return ret;
}
/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
- int i, n_merged, cur_idx;
+ int i;
- qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
/* we'll go over the list backwards when merging */
- n_merged = 0;
- for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+ for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
struct user_mem_map *l, *r;
l = &user_mem_maps->maps[i];
@@ -241,30 +336,16 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
if (is_null_map(l) || is_null_map(r))
continue;
+ /* try and merge the maps */
if (merge_map(l, r))
- n_merged++;
+ user_mem_maps->n_maps--;
}
/* the entries are still sorted, but now they have holes in them, so
- * walk through the list and remove the holes
+ * sort the list again.
*/
- if (n_merged > 0) {
- cur_idx = 0;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- if (!is_null_map(&user_mem_maps->maps[i])) {
- struct user_mem_map *src, *dst;
-
- src = &user_mem_maps->maps[i];
- dst = &user_mem_maps->maps[cur_idx++];
-
- if (src != dst) {
- memcpy(dst, src, sizeof(*src));
- memset(src, 0, sizeof(*src));
- }
- }
- }
- user_mem_maps->n_maps = cur_idx;
- }
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
+ sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
static int
@@ -1795,6 +1876,7 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
{
struct user_mem_map *new_map;
struct user_mem_maps *user_mem_maps;
+ bool has_partial_unmap;
int ret = 0;
user_mem_maps = &vfio_cfg->mem_maps;
@@ -1818,11 +1900,16 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
ret = -1;
goto out;
}
+ /* do we have partial unmap support? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
/* create new user mem map entry */
new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
new_map->addr = vaddr;
new_map->iova = iova;
new_map->len = len;
+ /* for IOMMU types supporting partial unmap, we don't need chunking */
+ new_map->chunk = has_partial_unmap ? 0 : len;
compact_user_maps(user_mem_maps);
out:
@@ -1834,38 +1921,81 @@ static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
- struct user_mem_map *map, *new_map = NULL;
+ struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
+ struct user_mem_map new_maps[2]; /* can be at most 2 */
struct user_mem_maps *user_mem_maps;
- int ret = 0;
+ int n_orig, n_new, newlen, ret = 0;
+ bool has_partial_unmap;
user_mem_maps = &vfio_cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
- /* find our mapping */
- map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
- if (!map) {
+ /*
+ * Previously, we had adjacent mappings entirely contained within one
+ * mapping entry. Since we now store original mapping length in some
+ * cases, this is no longer the case, so unmapping can potentially go
+ * over multiple segments and split them in any number of ways.
+ *
+ * To complicate things further, some IOMMU types support arbitrary
+ * partial unmapping, while others will only support unmapping along the
+ * chunk size, so there are a lot of cases we need to handle. To make
+ * things easier code wise, instead of trying to adjust existing
+ * mappings, let's just rebuild them using information we have.
+ */
+
+ /* do we have partial unmap capability? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
+ /*
+ * first thing to do is check if there exists a mapping that includes
+ * the start and the end of our requested unmap. We need to collect all
+ * maps that include our unmapped region.
+ */
+ n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
+ orig_maps, RTE_DIM(orig_maps));
+ /* did we find anything? */
+ if (n_orig < 0) {
RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
rte_errno = EINVAL;
ret = -1;
goto out;
}
- if (map->addr != vaddr || map->iova != iova || map->len != len) {
- /* we're partially unmapping a previously mapped region, so we
- * need to split entry into two.
- */
- if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
+
+ /*
+ * if we don't support partial unmap, we must check if start and end of
+ * current unmap region are chunk-aligned.
+ */
+ if (!has_partial_unmap) {
+ bool start_aligned, end_aligned;
+
+ start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr, iova);
+ end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr + len, iova + len);
+
+ if (!start_aligned || !end_aligned) {
RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
rte_errno = ENOTSUP;
ret = -1;
goto out;
}
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ }
+
+ /*
+ * now we know we can potentially unmap the region, but we still have to
+ * figure out if there is enough space in our list to store remaining
+ * maps. for this, we will figure out how many segments we are going to
+ * remove, and how many new segments we are going to create.
+ */
+ n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);
+
+ /* can we store the new maps in our list? */
+ newlen = (user_mem_maps->n_maps - n_orig) + n_new;
+ if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
/* unmap the entry */
@@ -1886,23 +2016,11 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
}
}
- /* remove map from the list of active mappings */
- if (new_map != NULL) {
- adjust_map(map, new_map, vaddr, len);
-
- /* if we've created a new map by splitting, sort everything */
- if (!is_null_map(new_map)) {
- compact_user_maps(user_mem_maps);
- } else {
- /* we've created a new mapping, but it was unused */
- user_mem_maps->n_maps--;
- }
- } else {
- memset(map, 0, sizeof(*map));
- compact_user_maps(user_mem_maps);
- user_mem_maps->n_maps--;
- }
+ /* we have unmapped the region, so now update the maps */
+ delete_maps(user_mem_maps, orig_maps, n_orig);
+ copy_maps(user_mem_maps, new_maps, n_new);
+ compact_user_maps(user_mem_maps);
out:
rte_spinlock_recursive_unlock(&user_mem_maps->lock);
return ret;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 0/2] support IOMMU for DMA device Xuan Ding
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
@ 2021-09-29 2:41 ` Xuan Ding
2021-09-29 6:12 ` Hu, Jiayu
2021-09-30 5:19 ` Hu, Jiayu
1 sibling, 2 replies; 40+ messages in thread
From: Xuan Ding @ 2021-09-29 2:41 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
The use of IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of DMA engine to use
IOMMU if the DMA engine is bound to vfio.
When the memory table is set, the guest memory will be mapped
into the default container of DPDK.
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/vhost/vhost.h | 4 ++
lib/vhost/vhost_user.c | 116 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 118 insertions(+), 2 deletions(-)
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index d98ca8adfa..8b8df3897b 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -370,6 +370,10 @@ struct virtio_net {
int16_t broadcast_rarp;
uint32_t nr_vring;
int async_copy;
+
+ /* Record the dma map status for each region. */
+ bool *async_map_status;
+
int extbuf;
int linearbuf;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 29a4c9af60..3d2872c85f 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,8 @@
#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
+#include <rte_vfio.h>
+#include <rte_errno.h>
#include "iotlb.h"
#include "vhost.h"
@@ -141,6 +143,63 @@ get_blk_size(int fd)
return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool *dma_map_success, bool do_map)
+{
+ uint64_t host_iova;
+ int ret = 0;
+
+ host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+ if (do_map) {
+ /* Add mapped region into the default container of DPDK. */
+ ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ *dma_map_success = ret == 0;
+
+ if (ret) {
+ /*
+ * DMA device may bind with kernel driver, in this case,
+ * we don't need to program IOMMU manually. However, if no
+ * device is bound with vfio/uio in DPDK, and vfio kernel
+ * module is loaded, the API will still be called and return
+ * with ENODEV/ENOSUP.
+ *
+ * DPDK vfio only returns ENODEV/ENOSUP in very similar
+ * situations (vfio either unsupported, or supported
+ * but no devices found). Either way, no mappings could be
+ * performed. We treat it as normal case in async path.
+ */
+ if (rte_errno == ENODEV || rte_errno == ENOTSUP)
+ return 0;
+
+ VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+ return ret;
+
+ }
+
+ } else {
+ /* No need to do vfio unmap if the map failed. */
+ if (!*dma_map_success)
+ return 0;
+
+ /* Remove mapped region from the default container of DPDK. */
+ ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+ return ret;
+ }
+ /* Clear the flag once the unmap succeeds. */
+ *dma_map_success = 0;
+ }
+
+ return ret;
+}
+
static void
free_mem_region(struct virtio_net *dev)
{
@@ -153,6 +212,9 @@ free_mem_region(struct virtio_net *dev)
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
if (reg->host_user_addr) {
+ if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+ async_dma_map(reg, &dev->async_map_status[i], false);
+
munmap(reg->mmap_addr, reg->mmap_size);
close(reg->fd);
}
@@ -203,6 +265,11 @@ vhost_backend_cleanup(struct virtio_net *dev)
}
dev->postcopy_listening = 0;
+
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
}
static void
@@ -621,6 +688,19 @@ numa_realloc(struct virtio_net *dev, int index)
}
dev->mem = mem;
+ if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
+ if (dev->async_map_status == NULL) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * dev->mem->nregions, 0, node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to realloc dma mapping status on node\n",
+ dev->vid);
+ return dev;
+ }
+ }
+ }
+
gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp),
RTE_CACHE_LINE_SIZE, node);
if (!gp) {
@@ -1151,12 +1231,14 @@ vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
static int
vhost_user_mmap_region(struct virtio_net *dev,
struct rte_vhost_mem_region *region,
+ uint32_t region_index,
uint64_t mmap_offset)
{
void *mmap_addr;
uint64_t mmap_size;
uint64_t alignment;
int populate;
+ int ret;
/* Check for memory_size + mmap_offset overflow */
if (mmap_offset >= -region->size) {
@@ -1210,13 +1292,23 @@ vhost_user_mmap_region(struct virtio_net *dev,
region->mmap_size = mmap_size;
region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
- if (dev->async_copy)
+ if (dev->async_copy) {
if (add_guest_pages(dev, region, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region failed.\n");
return -1;
}
+ if (rte_vfio_is_enabled("vfio")) {
+ ret = async_dma_map(region, &dev->async_map_status[region_index], true);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA "
+ "engine failed\n");
+ return -1;
+ }
+ }
+ }
+
VHOST_LOG_CONFIG(INFO,
"guest memory region size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
@@ -1289,6 +1381,11 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
+
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
}
/* Flush IOTLB cache as previous HVAs are now invalid */
@@ -1329,6 +1426,17 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
goto free_guest_pages;
}
+ if (dev->async_copy) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * memory->nregions, 0, numa_node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to allocate memory for dma mapping status\n",
+ dev->vid);
+ goto free_mem_table;
+ }
+ }
+
for (i = 0; i < memory->nregions; i++) {
reg = &dev->mem->regions[i];
@@ -1345,7 +1453,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
mmap_offset = memory->regions[i].mmap_offset;
- if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
+ if (vhost_user_mmap_region(dev, reg, i, mmap_offset) < 0) {
VHOST_LOG_CONFIG(ERR, "Failed to mmap region %u\n", i);
goto free_mem_table;
}
@@ -1393,6 +1501,10 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
free_guest_pages:
rte_free(dev->guest_pages);
dev->guest_pages = NULL;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-09-29 6:12 ` Hu, Jiayu
2021-09-29 9:39 ` Burakov, Anatoly
2021-09-30 5:19 ` Hu, Jiayu
1 sibling, 1 reply; 40+ messages in thread
From: Hu, Jiayu @ 2021-09-29 6:12 UTC (permalink / raw)
To: Ding, Xuan, dev, Burakov, Anatoly, maxime.coquelin, Xia, Chenbo
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
Hi Xuan,
> -----Original Message-----
> From: Ding, Xuan <xuan.ding@intel.com>
> Sent: Wednesday, September 29, 2021 10:41 AM
> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> Subject: [PATCH v6 2/2] vhost: enable IOMMU for async vhost
>
> The use of IOMMU has many advantages, such as isolation and address
> translation. This patch extends the capability of DMA engine to use IOMMU if
> the DMA engine is bound to vfio.
>
> When set memory table, the guest memory will be mapped into the default
> container of DPDK.
>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
> +async_dma_map(struct rte_vhost_mem_region *region, bool
> +*dma_map_success, bool do_map) {
> + uint64_t host_iova;
> + int ret = 0;
> +
> + host_iova = rte_mem_virt2iova((void *)(uintptr_t)region-
> >host_user_addr);
> + if (do_map) {
> + /* Add mapped region into the default container of DPDK. */
> + ret =
> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> + region->host_user_addr,
> + host_iova,
> + region->size);
> + *dma_map_success = ret == 0;
> +
> + if (ret) {
> + /*
> + * DMA device may bind with kernel driver, in this
> case,
> + * we don't need to program IOMMU manually.
> However, if no
> + * device is bound with vfio/uio in DPDK, and vfio
> kernel
> + * module is loaded, the API will still be called and
> return
> + * with ENODEV/ENOSUP.
> + *
> + * DPDK vfio only returns ENODEV/ENOSUP in very
> similar
> + * situations(vfio either unsupported, or supported
> + * but no devices found). Either way, no mappings
> could be
> + * performed. We treat it as normal case in async
> path.
> + */
What do you mean by saying "vfio either unsupported"? Does it mean platform
doesn't support iommu?
Thanks,
Jiayu
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost
2021-09-29 6:12 ` Hu, Jiayu
@ 2021-09-29 9:39 ` Burakov, Anatoly
2021-09-30 5:17 ` Hu, Jiayu
0 siblings, 1 reply; 40+ messages in thread
From: Burakov, Anatoly @ 2021-09-29 9:39 UTC (permalink / raw)
To: Hu, Jiayu, Ding, Xuan, dev, maxime.coquelin, Xia, Chenbo
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
On 29-Sep-21 7:12 AM, Hu, Jiayu wrote:
> Hi Xuan,
>
>> -----Original Message-----
>> From: Ding, Xuan <xuan.ding@intel.com>
>> Sent: Wednesday, September 29, 2021 10:41 AM
>> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
>> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
>> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
>> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
>> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
>> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
>> Subject: [PATCH v6 2/2] vhost: enable IOMMU for async vhost
>>
>> The use of IOMMU has many advantages, such as isolation and address
>> translation. This patch extends the capbility of DMA engine to use IOMMU if
>> the DMA engine is bound to vfio.
>>
>> When set memory table, the guest memory will be mapped into the default
>> container of DPDK.
>>
>> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
>> ---
>> +async_dma_map(struct rte_vhost_mem_region *region, bool
>> +*dma_map_success, bool do_map) {
>> +uint64_t host_iova;
>> +int ret = 0;
>> +
>> +host_iova = rte_mem_virt2iova((void *)(uintptr_t)region-
>>> host_user_addr);
>> +if (do_map) {
>> +/* Add mapped region into the default container of DPDK. */
>> +ret =
>> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
>> + region->host_user_addr,
>> + host_iova,
>> + region->size);
>> +*dma_map_success = ret == 0;
>> +
>> +if (ret) {
>> +/*
>> + * DMA device may bind with kernel driver, in this
>> case,
>> + * we don't need to program IOMMU manually.
>> However, if no
>> + * device is bound with vfio/uio in DPDK, and vfio
>> kernel
>> + * module is loaded, the API will still be called and
>> return
>> + * with ENODEV/ENOSUP.
>> + *
>> + * DPDK vfio only returns ENODEV/ENOSUP in very
>> similar
>> + * situations(vfio either unsupported, or supported
>> + * but no devices found). Either way, no mappings
>> could be
>> + * performed. We treat it as normal case in async
>> path.
>> + */
>
>
> What do you mean by saying "vfio either unsupported"? Does it mean platform
> doesn't support iommu?
Unsupported as in the VFIO driver is not loaded. We don't really care if
the *system* supports VFIO as much as whether it's *accessible to us*.
I'm sure you would agree that scenario "VFIO is not supported" is
exactly equivalent to "VFIO driver is not loaded", because from our
perspective, in both cases the VFIO driver is not loaded :)
>
> Thanks,
> Jiayu
>
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost
2021-09-29 9:39 ` Burakov, Anatoly
@ 2021-09-30 5:17 ` Hu, Jiayu
0 siblings, 0 replies; 40+ messages in thread
From: Hu, Jiayu @ 2021-09-30 5:17 UTC (permalink / raw)
To: Burakov, Anatoly, Ding, Xuan, dev, maxime.coquelin, Xia, Chenbo
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> Sent: Wednesday, September 29, 2021 5:39 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>;
> dev@dpdk.org; maxime.coquelin@redhat.com; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>; Wang,
> Yinan <yinan.wang@intel.com>; Yang, YvonneX <yvonnex.yang@intel.com>
> Subject: Re: [PATCH v6 2/2] vhost: enable IOMMU for async vhost
>
> On 29-Sep-21 7:12 AM, Hu, Jiayu wrote:
> > Hi Xuan,
> >
> >> -----Original Message-----
> >> From: Ding, Xuan <xuan.ding@intel.com>
> >> Sent: Wednesday, September 29, 2021 10:41 AM
> >> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> >> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> >> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1
> >> <cheng1.jiang@intel.com>; Richardson, Bruce
> >> <bruce.richardson@intel.com>; Pai G, Sunil <sunil.pai.g@intel.com>;
> >> Wang, Yinan <yinan.wang@intel.com>; Yang, YvonneX
> >> <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> >> Subject: [PATCH v6 2/2] vhost: enable IOMMU for async vhost
> >>
> >> The use of IOMMU has many advantages, such as isolation and address
> >> translation. This patch extends the capbility of DMA engine to use
> >> IOMMU if the DMA engine is bound to vfio.
> >>
> >> When set memory table, the guest memory will be mapped into the
> >> default container of DPDK.
> >>
> >> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> >> ---
> >> +async_dma_map(struct rte_vhost_mem_region *region, bool
> >> +*dma_map_success, bool do_map) { uint64_t host_iova; int ret = 0;
> >> +
> >> +host_iova = rte_mem_virt2iova((void *)(uintptr_t)region-
> >>> host_user_addr);
> >> +if (do_map) {
> >> +/* Add mapped region into the default container of DPDK. */ ret =
> >> rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
> >> + region->host_user_addr,
> >> + host_iova,
> >> + region->size);
> >> +*dma_map_success = ret == 0;
> >> +
> >> +if (ret) {
> >> +/*
> >> + * DMA device may bind with kernel driver, in this
> >> case,
> >> + * we don't need to program IOMMU manually.
> >> However, if no
> >> + * device is bound with vfio/uio in DPDK, and vfio
> >> kernel
> >> + * module is loaded, the API will still be called and
> >> return
> >> + * with ENODEV/ENOSUP.
> >> + *
> >> + * DPDK vfio only returns ENODEV/ENOSUP in very
> >> similar
> >> + * situations(vfio either unsupported, or supported
> >> + * but no devices found). Either way, no mappings
> >> could be
> >> + * performed. We treat it as normal case in async
> >> path.
> >> + */
> >
> >
> > What do you mean by saying "vfio either unsupported"? Does it mean
> > platform doesn't support iommu?
>
> Unsupported as in the VFIO driver is not loaded. We don't really care if the
> *system* supports VFIO as much as whether it's *accessible to us*.
> I'm sure you would agree that scenario "VFIO is not supported" is exactly
> equivalent to "VFIO driver is not loaded", because from our perspective, in
> both cases the VFIO driver is not loaded :)
OK, that's clear.
Thanks,
Jiayu
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-29 6:12 ` Hu, Jiayu
@ 2021-09-30 5:19 ` Hu, Jiayu
1 sibling, 0 replies; 40+ messages in thread
From: Hu, Jiayu @ 2021-09-30 5:19 UTC (permalink / raw)
To: Ding, Xuan, dev, Burakov, Anatoly, maxime.coquelin, Xia, Chenbo
Cc: Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan,
Yang, YvonneX
> -----Original Message-----
> From: Ding, Xuan <xuan.ding@intel.com>
> Sent: Wednesday, September 29, 2021 10:41 AM
> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> Subject: [PATCH v6 2/2] vhost: enable IOMMU for async vhost
>
> The use of IOMMU has many advantages, such as isolation and address
> translation. This patch extends the capbility of DMA engine to use IOMMU if
> the DMA engine is bound to vfio.
>
> When set memory table, the guest memory will be mapped into the default
> container of DPDK.
>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
> lib/vhost/vhost.h | 4 ++
> lib/vhost/vhost_user.c | 116
> ++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 118 insertions(+), 2 deletions(-)
Reviewed-by: Jiayu Hu <jiayu.hu@intel.com>
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
` (6 preceding siblings ...)
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 0/2] support IOMMU for DMA device Xuan Ding
@ 2021-10-11 7:59 ` Xuan Ding
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
` (2 more replies)
7 siblings, 3 replies; 40+ messages in thread
From: Xuan Ding @ 2021-10-11 7:59 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
This series supports DMA device to use vfio in async vhost.
The first patch extends the capability of current vfio dma mapping
API to allow partial unmapping for adjacent memory if the platform
does not support partial unmapping. The second patch involves the
IOMMU programming for guest memory in async vhost.
v7:
* Fix an operator error.
v6:
* Fix a potential memory leak.
v5:
* Fix issue of a pointer be freed early.
v4:
* Fix a format issue.
v3:
* Move the async_map_status flag to virtio_net structure to avoid
ABI breaking.
v2:
* Add rte_errno filtering for some devices bound in the kernel driver.
* Add a flag to check the status of region mapping.
* Fix one typo.
Xuan Ding (2):
vfio: allow partially unmapping adjacent memory
vhost: enable IOMMU for async vhost
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
lib/vhost/vhost.h | 4 +
lib/vhost/vhost_user.c | 116 +++++++++++++-
3 files changed, 346 insertions(+), 112 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Xuan Ding
@ 2021-10-11 7:59 ` Xuan Ding
2021-10-13 6:57 ` Yang, YvonneX
2021-10-21 9:50 ` Maxime Coquelin
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-10-21 12:33 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Maxime Coquelin
2 siblings, 2 replies; 40+ messages in thread
From: Xuan Ding @ 2021-10-11 7:59 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
Currently, if we map a memory area A, then map a separate memory area B
that by coincidence happens to be adjacent to A, current implementation
will merge these two segments into one, and if partial unmapping is not
supported, these segments will then be only allowed to be unmapped in
one go. In other words, given segments A and B that are adjacent, it
is currently not possible to map A, then map B, then unmap A.
Fix this by adding a notion of "chunk size", which will allow
subdividing segments into equally sized segments whenever we are dealing
with an IOMMU that does not support partial unmapping. With this change,
we will still be able to merge adjacent segments, but only if they are
of the same size. If we keep with our above example, adjacent segments A
and B will be stored as separate segments if they are of different
sizes.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
1 file changed, 228 insertions(+), 110 deletions(-)
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index 25add2fa5d..657c89ca58 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -31,9 +31,10 @@
*/
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
- uint64_t addr;
- uint64_t iova;
- uint64_t len;
+ uint64_t addr; /**< start VA */
+ uint64_t iova; /**< start IOVA */
+ uint64_t len; /**< total length of the mapping */
+ uint64_t chunk; /**< this mapping can be split in chunks of this size */
};
struct user_mem_maps {
@@ -95,7 +96,8 @@ static const struct vfio_iommu_type iommu_types[] = {
static int
is_null_map(const struct user_mem_map *map)
{
- return map->addr == 0 && map->iova == 0 && map->len == 0;
+ return map->addr == 0 && map->iova == 0 &&
+ map->len == 0 && map->chunk == 0;
}
/* we may need to merge user mem maps together in case of user mapping/unmapping
@@ -129,41 +131,90 @@ user_mem_map_cmp(const void *a, const void *b)
if (umm_a->len > umm_b->len)
return 1;
+ if (umm_a->chunk < umm_b->chunk)
+ return -1;
+ if (umm_a->chunk > umm_b->chunk)
+ return 1;
+
return 0;
}
-/* adjust user map entry. this may result in shortening of existing map, or in
- * splitting existing map in two pieces.
+/*
+ * Take in an address range and list of current mappings, and produce a list of
+ * mappings that will be kept.
*/
+static int
+process_maps(struct user_mem_map *src, size_t src_len,
+ struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
+{
+ struct user_mem_map *src_first = &src[0];
+ struct user_mem_map *src_last = &src[src_len - 1];
+ struct user_mem_map *dst_first = &newmap[0];
+ /* we can get at most two new segments */
+ struct user_mem_map *dst_last = &newmap[1];
+ uint64_t first_off = vaddr - src_first->addr;
+ uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
+ int newmap_len = 0;
+
+ if (first_off != 0) {
+ dst_first->addr = src_first->addr;
+ dst_first->iova = src_first->iova;
+ dst_first->len = first_off;
+ dst_first->chunk = src_first->chunk;
+
+ newmap_len++;
+ }
+ if (last_off != 0) {
+ /* if we had start offset, we have two segments */
+ struct user_mem_map *last =
+ first_off == 0 ? dst_first : dst_last;
+ last->addr = (src_last->addr + src_last->len) - last_off;
+ last->iova = (src_last->iova + src_last->len) - last_off;
+ last->len = last_off;
+ last->chunk = src_last->chunk;
+
+ newmap_len++;
+ }
+ return newmap_len;
+}
+
+/* erase certain maps from the list */
static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
- uint64_t remove_va_start, uint64_t remove_len)
-{
- /* if va start is same as start address, we're simply moving start */
- if (remove_va_start == src->addr) {
- src->addr += remove_len;
- src->iova += remove_len;
- src->len -= remove_len;
- } else if (remove_va_start + remove_len == src->addr + src->len) {
- /* we're shrinking mapping from the end */
- src->len -= remove_len;
- } else {
- /* we're blowing a hole in the middle */
- struct user_mem_map tmp;
- uint64_t total_len = src->len;
+delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
+ size_t n_del)
+{
+ int i;
+ size_t j;
+
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &del_maps[j];
- /* adjust source segment length */
- src->len = remove_va_start - src->addr;
+ if (user_mem_map_cmp(left, right) == 0) {
+ memset(left, 0, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps--;
+ }
+ }
+}
+
+static void
+copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
+ size_t n_add)
+{
+ int i;
+ size_t j;
- /* create temporary segment in the middle */
- tmp.addr = src->addr + src->len;
- tmp.iova = src->iova + src->len;
- tmp.len = remove_len;
+ for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
+ struct user_mem_map *left = &user_mem_maps->maps[i];
+ struct user_mem_map *right = &add_maps[j];
- /* populate end segment - this one we will be keeping */
- end->addr = tmp.addr + tmp.len;
- end->iova = tmp.iova + tmp.len;
- end->len = total_len - src->len - tmp.len;
+ /* insert into empty space */
+ if (is_null_map(left)) {
+ memcpy(left, right, sizeof(*left));
+ j++;
+ user_mem_maps->n_maps++;
+ }
}
}
@@ -179,7 +230,8 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 0;
if (left->iova + left->len != right->iova)
return 0;
-
+ if (left->chunk != right->chunk)
+ return 0;
left->len += right->len;
out:
@@ -188,51 +240,94 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
return 1;
}
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
- uint64_t iova, uint64_t len)
+static bool
+addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
+ uint64_t vaddr, uint64_t iova)
+{
+ unsigned int i;
+
+ for (i = 0; i < n_maps; i++) {
+ struct user_mem_map *map = &maps[i];
+ uint64_t map_va_end = map->addr + map->len;
+ uint64_t map_iova_end = map->iova + map->len;
+ uint64_t map_va_off = vaddr - map->addr;
+ uint64_t map_iova_off = iova - map->iova;
+
+ /* we include end of the segment in comparison as well */
+ bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
+ bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
+ /* chunk may not be power of two, so use modulo */
+ bool addr_is_aligned = (map_va_off % map->chunk) == 0;
+ bool iova_is_aligned = (map_iova_off % map->chunk) == 0;
+
+ if (addr_in_map && iova_in_map &&
+ addr_is_aligned && iova_is_aligned)
+ return true;
+ }
+ return false;
+}
+
+static int
+find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
+ uint64_t iova, uint64_t len, struct user_mem_map *dst,
+ size_t dst_len)
{
uint64_t va_end = addr + len;
uint64_t iova_end = iova + len;
- int i;
+ bool found = false;
+ size_t j;
+ int i, ret;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
+ for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
struct user_mem_map *map = &user_mem_maps->maps[i];
uint64_t map_va_end = map->addr + map->len;
uint64_t map_iova_end = map->iova + map->len;
- /* check start VA */
- if (addr < map->addr || addr >= map_va_end)
- continue;
- /* check if VA end is within boundaries */
- if (va_end <= map->addr || va_end > map_va_end)
- continue;
-
- /* check start IOVA */
- if (iova < map->iova || iova >= map_iova_end)
- continue;
- /* check if IOVA end is within boundaries */
- if (iova_end <= map->iova || iova_end > map_iova_end)
- continue;
-
- /* we've found our map */
- return map;
+ bool start_addr_in_map = (addr >= map->addr) &&
+ (addr < map_va_end);
+ bool end_addr_in_map = (va_end > map->addr) &&
+ (va_end <= map_va_end);
+ bool start_iova_in_map = (iova >= map->iova) &&
+ (iova < map_iova_end);
+ bool end_iova_in_map = (iova_end > map->iova) &&
+ (iova_end <= map_iova_end);
+
+ /* do we have space in temporary map? */
+ if (j == dst_len) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ /* check if current map is start of our segment */
+ if (!found && start_addr_in_map && start_iova_in_map)
+ found = true;
+ /* if we have previously found a segment, add it to the map */
+ if (found) {
+ /* copy the segment into our temporary map */
+ memcpy(&dst[j++], map, sizeof(*map));
+
+ /* if we match end of segment, quit */
+ if (end_addr_in_map && end_iova_in_map)
+ return j;
+ }
}
- return NULL;
+ /* we didn't find anything */
+ ret = -ENOENT;
+err:
+ memset(dst, 0, sizeof(*dst) * dst_len);
+ return ret;
}
/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
- int i, n_merged, cur_idx;
+ int i;
- qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
/* we'll go over the list backwards when merging */
- n_merged = 0;
- for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+ for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
struct user_mem_map *l, *r;
l = &user_mem_maps->maps[i];
@@ -241,30 +336,16 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
if (is_null_map(l) || is_null_map(r))
continue;
+ /* try and merge the maps */
if (merge_map(l, r))
- n_merged++;
+ user_mem_maps->n_maps--;
}
/* the entries are still sorted, but now they have holes in them, so
- * walk through the list and remove the holes
+ * sort the list again.
*/
- if (n_merged > 0) {
- cur_idx = 0;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- if (!is_null_map(&user_mem_maps->maps[i])) {
- struct user_mem_map *src, *dst;
-
- src = &user_mem_maps->maps[i];
- dst = &user_mem_maps->maps[cur_idx++];
-
- if (src != dst) {
- memcpy(dst, src, sizeof(*src));
- memset(src, 0, sizeof(*src));
- }
- }
- }
- user_mem_maps->n_maps = cur_idx;
- }
+ qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
+ sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
}
static int
@@ -1795,6 +1876,7 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
{
struct user_mem_map *new_map;
struct user_mem_maps *user_mem_maps;
+ bool has_partial_unmap;
int ret = 0;
user_mem_maps = &vfio_cfg->mem_maps;
@@ -1818,11 +1900,16 @@ container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
ret = -1;
goto out;
}
+ /* do we have partial unmap support? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
/* create new user mem map entry */
new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
new_map->addr = vaddr;
new_map->iova = iova;
new_map->len = len;
+ /* for IOMMU types supporting partial unmap, we don't need chunking */
+ new_map->chunk = has_partial_unmap ? 0 : len;
compact_user_maps(user_mem_maps);
out:
@@ -1834,38 +1921,81 @@ static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
- struct user_mem_map *map, *new_map = NULL;
+ struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
+ struct user_mem_map new_maps[2]; /* can be at most 2 */
struct user_mem_maps *user_mem_maps;
- int ret = 0;
+ int n_orig, n_new, newlen, ret = 0;
+ bool has_partial_unmap;
user_mem_maps = &vfio_cfg->mem_maps;
rte_spinlock_recursive_lock(&user_mem_maps->lock);
- /* find our mapping */
- map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
- if (!map) {
+ /*
+ * Previously, we had adjacent mappings entirely contained within one
+ * mapping entry. Since we now store original mapping length in some
+ * cases, this is no longer the case, so unmapping can potentially go
+ * over multiple segments and split them in any number of ways.
+ *
+ * To complicate things further, some IOMMU types support arbitrary
+ * partial unmapping, while others will only support unmapping along the
+ * chunk size, so there are a lot of cases we need to handle. To make
+ * things easier code wise, instead of trying to adjust existing
+ * mappings, let's just rebuild them using information we have.
+ */
+
+ /* do we have partial unmap capability? */
+ has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
+
+ /*
+ * first thing to do is check if there exists a mapping that includes
+ * the start and the end of our requested unmap. We need to collect all
+ * maps that include our unmapped region.
+ */
+ n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
+ orig_maps, RTE_DIM(orig_maps));
+ /* did we find anything? */
+ if (n_orig < 0) {
RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
rte_errno = EINVAL;
ret = -1;
goto out;
}
- if (map->addr != vaddr || map->iova != iova || map->len != len) {
- /* we're partially unmapping a previously mapped region, so we
- * need to split entry into two.
- */
- if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
+
+ /*
+ * if we don't support partial unmap, we must check if start and end of
+ * current unmap region are chunk-aligned.
+ */
+ if (!has_partial_unmap) {
+ bool start_aligned, end_aligned;
+
+ start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr, iova);
+ end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
+ vaddr + len, iova + len);
+
+ if (!start_aligned || !end_aligned) {
RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
rte_errno = ENOTSUP;
ret = -1;
goto out;
}
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ }
+
+ /*
+ * now we know we can potentially unmap the region, but we still have to
+ * figure out if there is enough space in our list to store remaining
+ * maps. for this, we will figure out how many segments we are going to
+ * remove, and how many new segments we are going to create.
+ */
+ n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);
+
+ /* can we store the new maps in our list? */
+ newlen = (user_mem_maps->n_maps - n_orig) + n_new;
+ if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
/* unmap the entry */
@@ -1886,23 +2016,11 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
}
}
- /* remove map from the list of active mappings */
- if (new_map != NULL) {
- adjust_map(map, new_map, vaddr, len);
-
- /* if we've created a new map by splitting, sort everything */
- if (!is_null_map(new_map)) {
- compact_user_maps(user_mem_maps);
- } else {
- /* we've created a new mapping, but it was unused */
- user_mem_maps->n_maps--;
- }
- } else {
- memset(map, 0, sizeof(*map));
- compact_user_maps(user_mem_maps);
- user_mem_maps->n_maps--;
- }
+ /* we have unmapped the region, so now update the maps */
+ delete_maps(user_mem_maps, orig_maps, n_orig);
+ copy_maps(user_mem_maps, new_maps, n_new);
+ compact_user_maps(user_mem_maps);
out:
rte_spinlock_recursive_unlock(&user_mem_maps->lock);
return ret;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* [dpdk-dev] [PATCH v7 2/2] vhost: enable IOMMU for async vhost
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Xuan Ding
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
@ 2021-10-11 7:59 ` Xuan Ding
2021-10-13 6:57 ` Yang, YvonneX
2021-10-21 10:00 ` Maxime Coquelin
2021-10-21 12:33 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Maxime Coquelin
2 siblings, 2 replies; 40+ messages in thread
From: Xuan Ding @ 2021-10-11 7:59 UTC (permalink / raw)
To: dev, anatoly.burakov, maxime.coquelin, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang, Xuan Ding
The use of IOMMU has many advantages, such as isolation and address
translation. This patch extends the capability of the DMA engine to use
IOMMU if the DMA engine is bound to vfio.
When the memory table is set, the guest memory will be mapped
into the default container of DPDK.
Signed-off-by: Xuan Ding <xuan.ding@intel.com>
---
lib/vhost/vhost.h | 4 ++
lib/vhost/vhost_user.c | 116 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 118 insertions(+), 2 deletions(-)
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index d98ca8adfa..8b8df3897b 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -370,6 +370,10 @@ struct virtio_net {
int16_t broadcast_rarp;
uint32_t nr_vring;
int async_copy;
+
+ /* Record the dma map status for each region. */
+ bool *async_map_status;
+
int extbuf;
int linearbuf;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 29a4c9af60..e48b945327 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -45,6 +45,8 @@
#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>
+#include <rte_vfio.h>
+#include <rte_errno.h>
#include "iotlb.h"
#include "vhost.h"
@@ -141,6 +143,63 @@ get_blk_size(int fd)
return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
+static int
+async_dma_map(struct rte_vhost_mem_region *region, bool *dma_map_success, bool do_map)
+{
+ uint64_t host_iova;
+ int ret = 0;
+
+ host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
+ if (do_map) {
+ /* Add mapped region into the default container of DPDK. */
+ ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ *dma_map_success = ret == 0;
+
+ if (ret) {
+ /*
+ * DMA device may bind with kernel driver, in this case,
+ * we don't need to program IOMMU manually. However, if no
+ * device is bound with vfio/uio in DPDK, and vfio kernel
+ * module is loaded, the API will still be called and return
+ * with ENODEV/ENOTSUP.
+ *
+ * DPDK vfio only returns ENODEV/ENOTSUP in very similar
+ * situations (vfio either unsupported, or supported
+ * but no devices found). Either way, no mappings could be
+ * performed. We treat it as a normal case in the async path.
+ */
+ if (rte_errno == ENODEV || rte_errno == ENOTSUP)
+ return 0;
+
+ VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
+ return ret;
+
+ }
+
+ } else {
+ /* No need to do vfio unmap if the map failed. */
+ if (!*dma_map_success)
+ return 0;
+
+ /* Remove mapped region from the default container of DPDK. */
+ ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
+ region->host_user_addr,
+ host_iova,
+ region->size);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
+ return ret;
+ }
+ /* Clear the flag once the unmap succeeds. */
+ *dma_map_success = 0;
+ }
+
+ return ret;
+}
+
static void
free_mem_region(struct virtio_net *dev)
{
@@ -153,6 +212,9 @@ free_mem_region(struct virtio_net *dev)
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
if (reg->host_user_addr) {
+ if (dev->async_copy && rte_vfio_is_enabled("vfio"))
+ async_dma_map(reg, &dev->async_map_status[i], false);
+
munmap(reg->mmap_addr, reg->mmap_size);
close(reg->fd);
}
@@ -166,6 +228,11 @@ vhost_backend_cleanup(struct virtio_net *dev)
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
+
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
}
rte_free(dev->guest_pages);
@@ -621,6 +688,19 @@ numa_realloc(struct virtio_net *dev, int index)
}
dev->mem = mem;
+ if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
+ if (dev->async_map_status == NULL) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * dev->mem->nregions, 0, node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to realloc dma mapping status on node\n",
+ dev->vid);
+ return dev;
+ }
+ }
+ }
+
gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp),
RTE_CACHE_LINE_SIZE, node);
if (!gp) {
@@ -1151,12 +1231,14 @@ vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
static int
vhost_user_mmap_region(struct virtio_net *dev,
struct rte_vhost_mem_region *region,
+ uint32_t region_index,
uint64_t mmap_offset)
{
void *mmap_addr;
uint64_t mmap_size;
uint64_t alignment;
int populate;
+ int ret;
/* Check for memory_size + mmap_offset overflow */
if (mmap_offset >= -region->size) {
@@ -1210,13 +1292,23 @@ vhost_user_mmap_region(struct virtio_net *dev,
region->mmap_size = mmap_size;
region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
- if (dev->async_copy)
+ if (dev->async_copy) {
if (add_guest_pages(dev, region, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region failed.\n");
return -1;
}
+ if (rte_vfio_is_enabled("vfio")) {
+ ret = async_dma_map(region, &dev->async_map_status[region_index], true);
+ if (ret) {
+ VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA "
+ "engine failed\n");
+ return -1;
+ }
+ }
+ }
+
VHOST_LOG_CONFIG(INFO,
"guest memory region size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
@@ -1289,6 +1381,11 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
+
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
}
/* Flush IOTLB cache as previous HVAs are now invalid */
@@ -1329,6 +1426,17 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
goto free_guest_pages;
}
+ if (dev->async_copy) {
+ dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
+ sizeof(bool) * memory->nregions, 0, numa_node);
+ if (!dev->async_map_status) {
+ VHOST_LOG_CONFIG(ERR,
+ "(%d) failed to allocate memory for dma mapping status\n",
+ dev->vid);
+ goto free_mem_table;
+ }
+ }
+
for (i = 0; i < memory->nregions; i++) {
reg = &dev->mem->regions[i];
@@ -1345,7 +1453,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
mmap_offset = memory->regions[i].mmap_offset;
- if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
+ if (vhost_user_mmap_region(dev, reg, i, mmap_offset) < 0) {
VHOST_LOG_CONFIG(ERR, "Failed to mmap region %u\n", i);
goto free_mem_table;
}
@@ -1393,6 +1501,10 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
+ if (dev->async_map_status) {
+ rte_free(dev->async_map_status);
+ dev->async_map_status = NULL;
+ }
free_guest_pages:
rte_free(dev->guest_pages);
dev->guest_pages = NULL;
--
2.17.1
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
@ 2021-10-13 6:57 ` Yang, YvonneX
2021-10-21 9:50 ` Maxime Coquelin
1 sibling, 0 replies; 40+ messages in thread
From: Yang, YvonneX @ 2021-10-13 6:57 UTC (permalink / raw)
To: Ding, Xuan, dev, Burakov, Anatoly, maxime.coquelin, Xia, Chenbo
Cc: Hu, Jiayu, Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan
> -----Original Message-----
> From: Ding, Xuan <xuan.ding@intel.com>
> Sent: Monday, October 11, 2021 4:00 PM
> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> Subject: [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory
>
> Currently, if we map a memory area A, then map a separate memory area B
> that by coincidence happens to be adjacent to A, current implementation will
> merge these two segments into one, and if partial unmapping is not
> supported, these segments will then be only allowed to be unmapped in one
> go. In other words, given segments A and B that are adjacent, it is currently
> not possible to map A, then map B, then unmap A.
>
> Fix this by adding a notion of "chunk size", which will allow subdividing
> segments into equally sized segments whenever we are dealing with an
> IOMMU that does not support partial unmapping. With this change, we will
> still be able to merge adjacent segments, but only if they are of the same size.
> If we keep with our above example, adjacent segments A and B will be
> stored as separate segments if they are of different sizes.
>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
Tested-by: Yvonne Yang <yvonnex.yang@intel.com>
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v7 2/2] vhost: enable IOMMU for async vhost
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 2/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-10-13 6:57 ` Yang, YvonneX
2021-10-21 10:00 ` Maxime Coquelin
1 sibling, 0 replies; 40+ messages in thread
From: Yang, YvonneX @ 2021-10-13 6:57 UTC (permalink / raw)
To: Ding, Xuan, dev, Burakov, Anatoly, maxime.coquelin, Xia, Chenbo
Cc: Hu, Jiayu, Jiang, Cheng1, Richardson, Bruce, Pai G, Sunil, Wang, Yinan
> -----Original Message-----
> From: Ding, Xuan <xuan.ding@intel.com>
> Sent: Monday, October 11, 2021 4:00 PM
> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: Hu, Jiayu <jiayu.hu@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Pai G, Sunil
> <sunil.pai.g@intel.com>; Wang, Yinan <yinan.wang@intel.com>; Yang,
> YvonneX <yvonnex.yang@intel.com>; Ding, Xuan <xuan.ding@intel.com>
> Subject: [PATCH v7 2/2] vhost: enable IOMMU for async vhost
>
> The use of IOMMU has many advantages, such as isolation and address
> translation. This patch extends the capability of DMA engine to use IOMMU if
> the DMA engine is bound to vfio.
>
> When set memory table, the guest memory will be mapped into the default
> container of DPDK.
>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
Tested-by: Yvonne Yang <yvonnex.yang@intel.com>
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-10-13 6:57 ` Yang, YvonneX
@ 2021-10-21 9:50 ` Maxime Coquelin
1 sibling, 0 replies; 40+ messages in thread
From: Maxime Coquelin @ 2021-10-21 9:50 UTC (permalink / raw)
To: Xuan Ding, dev, anatoly.burakov, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang
On 10/11/21 09:59, Xuan Ding wrote:
> Currently, if we map a memory area A, then map a separate memory area B
> that by coincidence happens to be adjacent to A, current implementation
> will merge these two segments into one, and if partial unmapping is not
> supported, these segments will then be only allowed to be unmapped in
> one go. In other words, given segments A and B that are adjacent, it
> is currently not possible to map A, then map B, then unmap A.
>
> Fix this by adding a notion of "chunk size", which will allow
> subdividing segments into equally sized segments whenever we are dealing
> with an IOMMU that does not support partial unmapping. With this change,
> we will still be able to merge adjacent segments, but only if they are
> of the same size. If we keep with our above example, adjacent segments A
> and B will be stored as separate segments if they are of different
> sizes.
>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
> lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
> 1 file changed, 228 insertions(+), 110 deletions(-)
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v7 2/2] vhost: enable IOMMU for async vhost
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-10-13 6:57 ` Yang, YvonneX
@ 2021-10-21 10:00 ` Maxime Coquelin
1 sibling, 0 replies; 40+ messages in thread
From: Maxime Coquelin @ 2021-10-21 10:00 UTC (permalink / raw)
To: Xuan Ding, dev, anatoly.burakov, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang
On 10/11/21 09:59, Xuan Ding wrote:
> The use of IOMMU has many advantages, such as isolation and address
> translation. This patch extends the capability of DMA engine to use
> IOMMU if the DMA engine is bound to vfio.
>
> When set memory table, the guest memory will be mapped
> into the default container of DPDK.
>
> Signed-off-by: Xuan Ding <xuan.ding@intel.com>
> ---
> lib/vhost/vhost.h | 4 ++
> lib/vhost/vhost_user.c | 116 ++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 118 insertions(+), 2 deletions(-)
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Xuan Ding
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 2/2] vhost: enable IOMMU for async vhost Xuan Ding
@ 2021-10-21 12:33 ` Maxime Coquelin
2 siblings, 0 replies; 40+ messages in thread
From: Maxime Coquelin @ 2021-10-21 12:33 UTC (permalink / raw)
To: Xuan Ding, dev, anatoly.burakov, chenbo.xia
Cc: jiayu.hu, cheng1.jiang, bruce.richardson, sunil.pai.g,
yinan.wang, yvonnex.yang
On 10/11/21 09:59, Xuan Ding wrote:
> This series supports DMA device to use vfio in async vhost.
>
> The first patch extends the capability of current vfio dma mapping
> API to allow partial unmapping for adjacent memory if the platform
> does not support partial unmapping. The second patch involves the
> IOMMU programming for guest memory in async vhost.
>
> v7:
> * Fix an operator error.
>
> v6:
> * Fix a potential memory leak.
>
> v5:
> * Fix issue of a pointer be freed early.
>
> v4:
> * Fix a format issue.
>
> v3:
> * Move the async_map_status flag to virtio_net structure to avoid
> ABI breaking.
>
> v2:
> * Add rte_errno filtering for some devices bound in the kernel driver.
> * Add a flag to check the status of region mapping.
> * Fix one typo.
>
> Xuan Ding (2):
> vfio: allow partially unmapping adjacent memory
> vhost: enable IOMMU for async vhost
>
> lib/eal/linux/eal_vfio.c | 338 ++++++++++++++++++++++++++-------------
> lib/vhost/vhost.h | 4 +
> lib/vhost/vhost_user.c | 116 +++++++++++++-
> 3 files changed, 346 insertions(+), 112 deletions(-)
>
Applied to dpdk-next-virtio/main.
Thanks,
Maxime
^ permalink raw reply [flat|nested] 40+ messages in thread
end of thread, other threads:[~2021-10-21 12:33 UTC | newest]
Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-01 5:30 [dpdk-dev] [PATCH 0/2] *** support IOMMU for DMA device *** Xuan Ding
2021-09-01 5:30 ` [dpdk-dev] [PATCH 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-01 5:30 ` [dpdk-dev] [PATCH 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 0/2] support IOMMU for DMA device Xuan Ding
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-17 5:25 ` [dpdk-dev] [PATCH v2 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-23 14:39 ` Hu, Jiayu
2021-09-23 14:56 ` Maxime Coquelin
2021-09-24 1:53 ` Xia, Chenbo
2021-09-24 7:13 ` Maxime Coquelin
2021-09-24 7:35 ` Xia, Chenbo
2021-09-24 8:18 ` Ding, Xuan
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 0/2] support IOMMU for DMA device Xuan Ding
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-25 10:03 ` [dpdk-dev] [PATCH v3 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-27 4:17 ` Hu, Jiayu
2021-09-27 4:55 ` Ding, Xuan
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 0/2] support IOMMU for DMA device Xuan Ding
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-25 10:33 ` [dpdk-dev] [PATCH v4 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 0/2] support IOMMU for DMA device Xuan Ding
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-27 7:48 ` [dpdk-dev] [PATCH v5 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-27 12:13 ` Burakov, Anatoly
2021-09-28 9:03 ` Ding, Xuan
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 0/2] support IOMMU for DMA device Xuan Ding
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-09-29 2:41 ` [dpdk-dev] [PATCH v6 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-09-29 6:12 ` Hu, Jiayu
2021-09-29 9:39 ` Burakov, Anatoly
2021-09-30 5:17 ` Hu, Jiayu
2021-09-30 5:19 ` Hu, Jiayu
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Xuan Ding
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 1/2] vfio: allow partially unmapping adjacent memory Xuan Ding
2021-10-13 6:57 ` Yang, YvonneX
2021-10-21 9:50 ` Maxime Coquelin
2021-10-11 7:59 ` [dpdk-dev] [PATCH v7 2/2] vhost: enable IOMMU for async vhost Xuan Ding
2021-10-13 6:57 ` Yang, YvonneX
2021-10-21 10:00 ` Maxime Coquelin
2021-10-21 12:33 ` [dpdk-dev] [PATCH v7 0/2] Support IOMMU for DMA device Maxime Coquelin
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).