DPDK patches and discussions
From: Takeshi Yoshimura <tyos@jp.ibm.com>
To: dev@dpdk.org
Cc: drc@ibm.com, Takeshi Yoshimura <tyos@jp.ibm.com>
Subject: [dpdk-dev] [PATCH] vfio: fix VFIO mapping failures in ppc64le
Date: Fri, 17 Jan 2020 13:25:55 +0900	[thread overview]
Message-ID: <20200117042555.22567-1-tyos@jp.ibm.com> (raw)

DPDK on ppc64le failed when large amounts of physical memory were in use. I
found problems introduced by two of my past commits.

In commit e072d16f8920 ("vfio: fix expanding DMA area in ppc64le"), I added
a sanity check using a mapped address to resolve an issue around expanding
the IOMMU window, but this was not enough, since memory allocation can
return memory anywhere depending on memory fragmentation. DPDK may still
skip DMA mapping and attempt to unmap non-mapped DMA while expanding the
IOMMU window. As a result, SPDK applications using large physical memory
frequently failed to communicate with NVMe devices and/or went into an
infinite loop.

The root cause of the bug is a gap between the memory segments managed by
DPDK and the firmware-level DMA mappings. DPDK's memory segments do not
record DMA-mapping state, so memseg_walk cannot determine whether an
iterated memory segment is mapped or not. This resulted in incorrect DMA
maps and unmaps.

This patch adds code to avoid iterating over non-mapped memory segments
during DMA mapping. memseg_walk only visits memory segments marked as
"used", so the code transiently marks the segments that are about to be
mapped or unmapped as "free".
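
For illustration, the transient toggling can be pictured as a small,
hypothetical helper (the name and stand-alone form are mine, not part of
the patch; it only restates the #ifdef RTE_ARCH_PPC_64 blocks added to
vfio_mem_event_callback, using the existing rte_fbarray API; depending on
the DPDK version, rte_eal_memconfig.h may also be needed for
struct rte_memseg_list):

  #include <stdbool.h>
  #include <stddef.h>
  #include <rte_memory.h>
  #include <rte_fbarray.h>

  /* Mark every memseg covering [addr, addr + len) as "free" so that
   * memseg_walk skips it while the sPAPR window is being rebuilt, or
   * back to "used" once the remapping is done. */
  static void
  spapr_toggle_segments(struct rte_memseg_list *msl, const void *addr,
  		size_t len, bool used)
  {
  	struct rte_memseg *ms = rte_mem_virt2memseg(addr, msl);
  	size_t cur_len = 0;

  	while (cur_len < len) {
  		int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);

  		if (used)
  			rte_fbarray_set_used(&msl->memseg_arr, idx);
  		else
  			rte_fbarray_set_free(&msl->memseg_arr, idx);
  		cur_len += ms->len;
  		++ms; /* memsegs are contiguous in memory */
  	}
  }

In the patch itself this is done inline, with one loop before and one loop
after the existing map/unmap loop in vfio_mem_event_callback.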

Commit db90b4969e2e ("vfio: retry creating sPAPR DMA window") allows
retrying different page levels and page sizes when creating the DMA window.
However, it can pick a page size that differs from the hugepage size, and
this inconsistency caused failures during DMA mapping after the window was
created. This patch restricts the retries to different page levels only.
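
After this change, the retry logic reads roughly as in the condensed,
hypothetical form below (the wrapper function is mine, for illustration
only; it assumes create->page_shift was already derived from the hugepage
size, create->levels starts at 1, and that the kernel headers provide the
sPAPR DDW info, which the real code guards with an #ifdef):

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/vfio.h>

  /* Try to create the DMA window; on failure keep the page size and
   * only retry with deeper TCE tree levels, up to the maximum the
   * kernel reports in info->ddw.levels. */
  static int
  spapr_retry_tce_create(int vfio_container_fd,
  		struct vfio_iommu_spapr_tce_create *create,
  		const struct vfio_iommu_spapr_tce_info *info)
  {
  	int ret = ioctl(vfio_container_fd,
  		VFIO_IOMMU_SPAPR_TCE_CREATE, create);
  	uint32_t levels;

  	for (levels = create->levels + 1;
  			ret && levels <= info->ddw.levels; levels++) {
  		create->levels = levels;
  		ret = ioctl(vfio_container_fd,
  			VFIO_IOMMU_SPAPR_TCE_CREATE, create);
  	}
  	return ret;
  }

Retrying only the levels keeps the window's page size equal to the hugepage
size, which is what the subsequent DMA mappings expect.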

Fixes: e072d16f8920 ("vfio: fix expanding DMA area in ppc64le")
Fixes: db90b4969e2e ("vfio: retry creating sPAPR DMA window")

Signed-off-by: Takeshi Yoshimura <tyos@jp.ibm.com>
---
 lib/librte_eal/linux/eal/eal_vfio.c | 76 +++++++++++++----------------
 1 file changed, 33 insertions(+), 43 deletions(-)

diff --git a/lib/librte_eal/linux/eal/eal_vfio.c b/lib/librte_eal/linux/eal/eal_vfio.c
index 95f615c2e..01b5ef3f4 100644
--- a/lib/librte_eal/linux/eal/eal_vfio.c
+++ b/lib/librte_eal/linux/eal/eal_vfio.c
@@ -532,6 +532,17 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
 		return;
 	}
 
+#ifdef RTE_ARCH_PPC_64
+	ms = rte_mem_virt2memseg(addr, msl);
+	while (cur_len < len) {
+		int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+
+		rte_fbarray_set_free(&msl->memseg_arr, idx);
+		cur_len += ms->len;
+		++ms;
+	}
+	cur_len = 0;
+#endif
 	/* memsegs are contiguous in memory */
 	ms = rte_mem_virt2memseg(addr, msl);
 	while (cur_len < len) {
@@ -551,6 +562,17 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
 		cur_len += ms->len;
 		++ms;
 	}
+#ifdef RTE_ARCH_PPC_64
+	cur_len = 0;
+	ms = rte_mem_virt2memseg(addr, msl);
+	while (cur_len < len) {
+		int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+
+		rte_fbarray_set_used(&msl->memseg_arr, idx);
+		cur_len += ms->len;
+		++ms;
+	}
+#endif
 }
 
 static int
@@ -1416,16 +1438,11 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 	return 0;
 }
 
-struct spapr_remap_walk_param {
-	int vfio_container_fd;
-	uint64_t addr_64;
-};
-
 static int
 vfio_spapr_map_walk(const struct rte_memseg_list *msl,
 		const struct rte_memseg *ms, void *arg)
 {
-	struct spapr_remap_walk_param *param = arg;
+	int *vfio_container_fd = arg;
 
 	/* skip external memory that isn't a heap */
 	if (msl->external && !msl->heap)
@@ -1435,10 +1452,7 @@ vfio_spapr_map_walk(const struct rte_memseg_list *msl,
 	if (ms->iova == RTE_BAD_IOVA)
 		return 0;
 
-	if (ms->addr_64 == param->addr_64)
-		return 0;
-
-	return vfio_spapr_dma_do_map(param->vfio_container_fd, ms->addr_64, ms->iova,
+	return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
 			ms->len, 1);
 }
 
@@ -1446,7 +1460,7 @@ static int
 vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
 		const struct rte_memseg *ms, void *arg)
 {
-	struct spapr_remap_walk_param *param = arg;
+	int *vfio_container_fd = arg;
 
 	/* skip external memory that isn't a heap */
 	if (msl->external && !msl->heap)
@@ -1456,17 +1470,13 @@ vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
 	if (ms->iova == RTE_BAD_IOVA)
 		return 0;
 
-	if (ms->addr_64 == param->addr_64)
-		return 0;
-
-	return vfio_spapr_dma_do_map(param->vfio_container_fd, ms->addr_64, ms->iova,
+	return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
 			ms->len, 0);
 }
 
 struct spapr_walk_param {
 	uint64_t window_size;
 	uint64_t hugepage_sz;
-	uint64_t addr_64;
 };
 
 static int
@@ -1484,10 +1494,6 @@ vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
 	if (ms->iova == RTE_BAD_IOVA)
 		return 0;
 
-	/* do not iterate ms we haven't mapped yet  */
-	if (param->addr_64 && ms->addr_64 == param->addr_64)
-		return 0;
-
 	if (max > param->window_size) {
 		param->hugepage_sz = ms->hugepage_sz;
 		param->window_size = max;
@@ -1531,20 +1537,11 @@ vfio_spapr_create_new_dma_window(int vfio_container_fd,
 		/* try possible page_shift and levels for workaround */
 		uint32_t levels;
 
-		for (levels = 1; levels <= info.ddw.levels; levels++) {
-			uint32_t pgsizes = info.ddw.pgsizes;
-
-			while (pgsizes != 0) {
-				create->page_shift = 31 - __builtin_clz(pgsizes);
-				create->levels = levels;
-				ret = ioctl(vfio_container_fd,
-					VFIO_IOMMU_SPAPR_TCE_CREATE, create);
-				if (!ret)
-					break;
-				pgsizes &= ~(1 << create->page_shift);
-			}
-			if (!ret)
-				break;
+		for (levels = create->levels + 1;
+			ret && levels <= info.ddw.levels; levels++) {
+			create->levels = levels;
+			ret = ioctl(vfio_container_fd,
+				VFIO_IOMMU_SPAPR_TCE_CREATE, create);
 		}
 #endif
 		if (ret) {
@@ -1585,7 +1582,6 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 
 	/* check if window size needs to be adjusted */
 	memset(&param, 0, sizeof(param));
-	param.addr_64 = vaddr;
 
 	/* we're inside a callback so use thread-unsafe version */
 	if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
@@ -1610,14 +1606,9 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 	if (do_map) {
 		/* re-create window and remap the entire memory */
 		if (iova + len > create.window_size) {
-			struct spapr_remap_walk_param remap_param = {
-				.vfio_container_fd = vfio_container_fd,
-				.addr_64 = vaddr,
-			};
-
 			/* release all maps before recreating the window */
 			if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk,
-					&remap_param) < 0) {
+					&vfio_container_fd) < 0) {
 				RTE_LOG(ERR, EAL, "Could not release DMA maps\n");
 				ret = -1;
 				goto out;
@@ -1644,7 +1635,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 			/* we're inside a callback, so use thread-unsafe version
 			 */
 			if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
-					&remap_param) < 0) {
+					&vfio_container_fd) < 0) {
 				RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
 				ret = -1;
 				goto out;
@@ -1691,7 +1682,6 @@ vfio_spapr_dma_map(int vfio_container_fd)
 	struct spapr_walk_param param;
 
 	memset(&param, 0, sizeof(param));
-	param.addr_64 = 0UL;
 
 	/* create DMA window from 0 to max(phys_addr + len) */
 	rte_memseg_walk(vfio_spapr_window_size_walk, &param);
-- 
2.17.1



Thread overview: 3+ messages
2020-01-17  4:25 Takeshi Yoshimura [this message]
2020-01-22 18:34 ` David Christensen
2020-02-05 21:04 ` David Marchand
