DPDK patches and discussions
 help / color / mirror / Atom feed
From: Takeshi Yoshimura <tyos@jp.ibm.com>
To: dev@dpdk.org
Cc: Takeshi Yoshimura <tyos@jp.ibm.com>
Subject: [dpdk-dev] [PATCH] vfio: fix expanding DMA area in ppc64le
Date: Wed, 12 Jun 2019 15:33:12 +0900	[thread overview]
Message-ID: <20190612063313.2729-1-tyos@jp.ibm.com> (raw)

In ppc64le, expanding DMA areas always fail because we cannot remove
a DMA window. As a result, we cannot allocate more than one memseg in
ppc64le. This is because vfio_spapr_dma_mem_map() doesn't unmap all
the mapped DMA before removing the window. This patch fixes this
incorrect behavior.

I added a global variable to track current window size since we do
not have better ways to get exact size of it than doing so. sPAPR
IOMMU seems not to provide any ways to get window size with ioctl
interfaces. rte_memseg_walk*() is currently used to calculate window
size, but it walks memsegs that are marked as used, not mapped. So,
we need to determine if a given memseg is mapped or not, otherwise
the ioctl reports errors due to attempting to unregister memory
addresses that are not registered. The global variable is excluded
in non-ppc64le binaries.

Similar problems happen in user maps. We need to avoid attempting to
unmap the address that is given as the function's parameter. The
compaction of user maps prevents us from passing correct length for
unmapping DMA at the window recreation. So, I removed it in ppc64le.

I also fixed the order of ioctl for unregister and unmap. The ioctl
for unregister sometimes report device busy errors due to the
existence of mapped area.

Signed-off-by: Takeshi Yoshimura <tyos@jp.ibm.com>
---
 lib/librte_eal/linux/eal/eal_vfio.c | 154 +++++++++++++++++++---------
 1 file changed, 103 insertions(+), 51 deletions(-)

diff --git a/lib/librte_eal/linux/eal/eal_vfio.c b/lib/librte_eal/linux/eal/eal_vfio.c
index f16c5c3c0..6edbaaff5 100644
--- a/lib/librte_eal/linux/eal/eal_vfio.c
+++ b/lib/librte_eal/linux/eal/eal_vfio.c
@@ -93,6 +93,7 @@ is_null_map(const struct user_mem_map *map)
 	return map->addr == 0 && map->iova == 0 && map->len == 0;
 }
 
+#ifndef RTE_ARCH_PPC_64
 /* we may need to merge user mem maps together in case of user mapping/unmapping
  * chunks of memory, so we'll need a comparator function to sort segments.
  */
@@ -126,6 +127,7 @@ user_mem_map_cmp(const void *a, const void *b)
 
 	return 0;
 }
+#endif
 
 /* adjust user map entry. this may result in shortening of existing map, or in
  * splitting existing map in two pieces.
@@ -162,6 +164,7 @@ adjust_map(struct user_mem_map *src, struct user_mem_map *end,
 	}
 }
 
+#ifndef RTE_ARCH_PPC_64
 /* try merging two maps into one, return 1 if succeeded */
 static int
 merge_map(struct user_mem_map *left, struct user_mem_map *right)
@@ -177,6 +180,7 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
 
 	return 1;
 }
+#endif
 
 static struct user_mem_map *
 find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
@@ -211,6 +215,16 @@ find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
 	return NULL;
 }
 
+#ifdef RTE_ARCH_PPC_64
+/* Recreation of DMA window requires unregistering DMA memory.
+ * Compaction confuses the logic and causes false reports in the recreation.
+ * For now, we do not compact user maps in ppc64le.
+ */
+static void
+compact_user_maps(__rte_unused struct user_mem_maps *user_mem_maps)
+{
+}
+#else
 /* this will sort all user maps, and merge/compact any adjacent maps */
 static void
 compact_user_maps(struct user_mem_maps *user_mem_maps)
@@ -256,6 +270,7 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
 		user_mem_maps->n_maps = cur_idx;
 	}
 }
+#endif
 
 static int
 vfio_open_group_fd(int iommu_group_num)
@@ -1357,14 +1372,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 		}
 
 	} else {
-		ret = ioctl(vfio_container_fd,
-				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
-					errno, strerror(errno));
-			return -1;
-		}
-
 		memset(&dma_unmap, 0, sizeof(dma_unmap));
 		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
 		dma_unmap.size = len;
@@ -1377,24 +1384,50 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 					errno, strerror(errno));
 			return -1;
 		}
+
+		ret = ioctl(vfio_container_fd,
+				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
+					errno, strerror(errno));
+			return -1;
+		}
 	}
 
 	return 0;
 }
 
+struct spapr_remap_walk_param {
+	int vfio_container_fd;
+	uint64_t window_size;
+};
+
 static int
 vfio_spapr_map_walk(const struct rte_memseg_list *msl,
 		const struct rte_memseg *ms, void *arg)
 {
-	int *vfio_container_fd = arg;
+	struct spapr_remap_walk_param *p = arg;
 
-	if (msl->external)
+	if (msl->external || ms->iova >= p->window_size)
 		return 0;
 
-	return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
+	return vfio_spapr_dma_do_map(p->vfio_container_fd, ms->addr_64, ms->iova,
 			ms->len, 1);
 }
 
+static int
+vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
+		const struct rte_memseg *ms, void *arg)
+{
+	struct spapr_remap_walk_param *p = arg;
+
+	if (msl->external || ms->iova >= p->window_size)
+		return 0;
+
+	return vfio_spapr_dma_do_map(p->vfio_container_fd, ms->addr_64, ms->iova,
+			ms->len, 0);
+}
+
 struct spapr_walk_param {
 	uint64_t window_size;
 	uint64_t hugepage_sz;
@@ -1481,14 +1514,14 @@ vfio_spapr_create_new_dma_window(int vfio_container_fd,
 	return 0;
 }
 
+#ifdef RTE_ARCH_PPC_64
+static struct vfio_iommu_spapr_tce_create prev_create;
+
 static int
 vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 		uint64_t len, int do_map)
 {
-	struct spapr_walk_param param;
-	struct vfio_iommu_spapr_tce_create create = {
-		.argsz = sizeof(create),
-	};
+	struct vfio_iommu_spapr_tce_create create;
 	struct vfio_config *vfio_cfg;
 	struct user_mem_maps *user_mem_maps;
 	int i, ret = 0;
@@ -1502,43 +1535,59 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 	user_mem_maps = &vfio_cfg->mem_maps;
 	rte_spinlock_recursive_lock(&user_mem_maps->lock);
 
-	/* check if window size needs to be adjusted */
-	memset(&param, 0, sizeof(param));
-
-	/* we're inside a callback so use thread-unsafe version */
-	if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
-				&param) < 0) {
-		RTE_LOG(ERR, EAL, "Could not get window size\n");
-		ret = -1;
-		goto out;
-	}
+	memcpy(&create, &prev_create, sizeof(create));
 
 	/* also check user maps */
 	for (i = 0; i < user_mem_maps->n_maps; i++) {
-		uint64_t max = user_mem_maps->maps[i].iova +
-				user_mem_maps->maps[i].len;
-		create.window_size = RTE_MAX(create.window_size, max);
+		struct user_mem_map *map = &user_mem_maps->maps[i];
+
+		if (vaddr == map->addr && len == map->len)
+			continue;
+		create.window_size = RTE_MAX(create.window_size, map->iova + map->len);
 	}
 
 	/* sPAPR requires window size to be a power of 2 */
-	create.window_size = rte_align64pow2(param.window_size);
-	create.page_shift = __builtin_ctzll(param.hugepage_sz);
-	create.levels = 1;
+	create.window_size = rte_align64pow2(create.window_size);
 
 	if (do_map) {
-		void *addr;
 		/* re-create window and remap the entire memory */
-		if (iova > create.window_size) {
+		if (iova + len > create.window_size) {
+			struct spapr_remap_walk_param param = {
+				.vfio_container_fd = vfio_container_fd,
+			    .window_size = create.window_size,
+			};
+
+			/* we're inside a callback, so use thread-unsafe version
+			 */
+			rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk,
+					&param);
+			/* destruct all user maps */
+			for (i = 0; i < user_mem_maps->n_maps; i++) {
+				struct user_mem_map *map =
+						&user_mem_maps->maps[i];
+				if (vaddr == map->addr && len == map->len)
+					continue;
+				if (vfio_spapr_dma_do_map(vfio_container_fd,
+						map->addr, map->iova, map->len,
+						0)) {
+					RTE_LOG(ERR, EAL, "Could not destruct user DMA maps\n");
+					ret = -1;
+					goto out;
+				}
+			}
+
+			create.window_size = rte_align64pow2(iova + len);
 			if (vfio_spapr_create_new_dma_window(vfio_container_fd,
 					&create) < 0) {
 				RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
 				ret = -1;
 				goto out;
 			}
+			memcpy(&prev_create, &create, sizeof(create));
 			/* we're inside a callback, so use thread-unsafe version
 			 */
 			if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
-					&vfio_container_fd) < 0) {
+					&param) < 0) {
 				RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
 				ret = -1;
 				goto out;
@@ -1547,6 +1596,8 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 			for (i = 0; i < user_mem_maps->n_maps; i++) {
 				struct user_mem_map *map =
 						&user_mem_maps->maps[i];
+				if (vaddr == map->addr && len == map->len)
+					continue;
 				if (vfio_spapr_dma_do_map(vfio_container_fd,
 						map->addr, map->iova, map->len,
 						1)) {
@@ -1556,23 +1607,8 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 				}
 			}
 		}
-
-		/* now that we've remapped all of the memory that was present
-		 * before, map the segment that we were requested to map.
-		 *
-		 * however, if we were called by the callback, the memory we
-		 * were called with was already in the memseg list, so previous
-		 * mapping should've mapped that segment already.
-		 *
-		 * virt2memseg_list is a relatively cheap check, so use that. if
-		 * memory is within any memseg list, it's a memseg, so it's
-		 * already mapped.
-		 */
-		addr = (void *)(uintptr_t)vaddr;
-		if (rte_mem_virt2memseg_list(addr) == NULL &&
-				vfio_spapr_dma_do_map(vfio_container_fd,
-					vaddr, iova, len, 1) < 0) {
-			RTE_LOG(ERR, EAL, "Could not map segment\n");
+		if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) {
+			RTE_LOG(ERR, EAL, "Failed to map DMA\n");
 			ret = -1;
 			goto out;
 		}
@@ -1613,6 +1649,7 @@ vfio_spapr_dma_map(int vfio_container_fd)
 		RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
 		return -1;
 	}
+	memcpy(&prev_create, &create, sizeof(create));
 
 	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
 	if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
@@ -1620,6 +1657,21 @@ vfio_spapr_dma_map(int vfio_container_fd)
 
 	return 0;
 }
+#else
+static int
+vfio_spapr_dma_mem_map(int __rte_unused vfio_container_fd,
+			uint64_t __rte_unused vaddr,
+			uint64_t __rte_unused iova, uint64_t __rte_unused len,
+			int __rte_unused do_map)
+{
+	return 0;
+}
+static int
+vfio_spapr_dma_map(int __rte_unused vfio_container_fd)
+{
+	return 0;
+}
+#endif
 
 static int
 vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
-- 
2.17.1


             reply	other threads:[~2019-06-12  6:33 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-12  6:33 Takeshi Yoshimura [this message]
2019-06-12 14:06 ` Aaron Conole
2019-06-13  2:22 ` Takeshi Yoshimura
2019-06-13 17:37   ` David Christensen
2019-06-14  7:34   ` David Marchand
2019-06-14  7:49   ` [dpdk-dev] [PATCH v2] " Takeshi Yoshimura
2019-07-13  1:15     ` [dpdk-dev] [PATCH v3] " Takeshi Yoshimura
2019-07-16  0:20       ` David Christensen
2019-07-16 10:56         ` Thomas Monjalon
2019-06-18  2:37   ` [dpdk-dev] [PATCH] " Mo, YufengX
2019-06-18  2:39   ` Mo, YufengX
2019-06-26  9:43   ` Burakov, Anatoly
2019-06-28 11:38   ` Takeshi T Yoshimura
2019-06-28 13:47     ` Burakov, Anatoly
2019-06-28 14:04       ` Burakov, Anatoly
2019-06-13  2:30 ` Takeshi T Yoshimura

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190612063313.2729-1-tyos@jp.ibm.com \
    --to=tyos@jp.ibm.com \
    --cc=dev@dpdk.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).