DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] vfio: Support for no-IOMMU mode
@ 2015-12-21 20:16 Anatoly Burakov
  2016-01-13 12:36 ` [dpdk-dev] [PATCH v2] " Anatoly Burakov
  0 siblings, 1 reply; 25+ messages in thread
From: Anatoly Burakov @ 2015-12-21 20:16 UTC (permalink / raw)
  To: dev

This commit adds a generic mechanism for supporting multiple IOMMU
types. For now, only type 1 (x86 IOMMU) and no-IOMMU (a special
VFIO mode that doesn't use an IOMMU at all) are supported, but the
mechanism is easily extended by adding the necessary definitions to
eal_pci_init.h and a DMA mapping function to eal_pci_vfio_dma.c.

Since the type 1 IOMMU module is no longer required for VFIO to work,
we change the module check to look for vfio-pci instead. This is not
ideal, as it triggers VFIO checks more often (and thus produces more
error output, which was the reason behind the module check in the
first place), so we compensate by providing more verbose logging
that indicates whether VFIO initialization has succeeded or
failed.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/linuxapp/eal/Makefile           |   1 +
 lib/librte_eal/linuxapp/eal/eal_pci_init.h     |  22 ++++
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c     | 142 ++++++++++++++++---------
 lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c |  84 +++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |   5 +
 5 files changed, 201 insertions(+), 53 deletions(-)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c

diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 26eced5..5c9e9d9 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -59,6 +59,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_log.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_uio.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio_dma.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio_mp_sync.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_debug.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_lcore.c
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
index a17c708..da1c431 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
@@ -106,6 +106,28 @@ struct vfio_config {
 	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
 };
 
+/* function pointer typedef for DMA mapping functions */
+typedef  int (*vfio_dma_func_t)(int);
+
+/* Structure to hold supported IOMMU types */
+struct vfio_iommu_type {
+	int type_id;
+	const char *name;
+	vfio_dma_func_t dma_map_func;
+};
+
+/* function prototypes for different IOMMU types */
+int vfio_iommu_type1_dma_map(int container_fd);
+int vfio_iommu_noiommu_dma_map(int container_fd);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+		/* x86 IOMMU, otherwise known as type 1 */
+		{ VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
+		/* IOMMU-less mode */
+		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
+};
+
 #endif
 
 #endif /* EAL_PCI_INIT_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 74f91ba..71eeea8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -72,6 +72,7 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
 #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
 
 /* per-process VFIO config */
@@ -208,42 +209,57 @@ pci_vfio_set_bus_master(int dev_fd)
 	return 0;
 }
 
-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
-
-	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-			VFIO_TYPE1_IOMMU);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+	for (unsigned idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
+
+		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+				t->type_id);
+		if (!ret) {
+			RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+					t->type_id, t->name);
+			return t;
+		}
+		/* not an error, there may be more supported IOMMU types */
+		RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
+				"error %i (%s)\n", t->type_id, t->name, errno,
+				strerror(errno));
 	}
+	/* if we didn't find a suitable IOMMU type, fail */
+	return NULL;
+}
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
-
-		if (ms[i].addr == NULL)
-			break;
-
-		memset(&dma_map, 0, sizeof(dma_map));
-		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		dma_map.iova = ms[i].phys_addr;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-
-		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+/* check if we have any supported extensions */
+static int
+pci_vfio_has_supported_extensions(int vfio_container_fd) {
+	int ret;
+	unsigned idx, n_extensions = 0;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
 
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					"error %i (%s)\n", errno, strerror(errno));
+		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+				t->type_id);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+			close(vfio_container_fd);
 			return -1;
+		} else if (ret == 1) {
+			/* we found a supported extension */
+			n_extensions++;
 		}
+		RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
+				t->type_id, t->name,
+				ret ? "supported" : "not supported");
+	}
+
+	/* if we didn't find any supported IOMMU types, fail */
+	if (!n_extensions) {
+		close(vfio_container_fd);
+		return -1;
 	}
 
 	return 0;
@@ -372,17 +388,10 @@ pci_vfio_get_container_fd(void)
 			return -1;
 		}
 
-		/* check if we support IOMMU type 1 */
-		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
-		if (ret != 1) {
-			if (ret < 0)
-				RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
-					"error %i (%s)\n", errno,
-					strerror(errno));
-			else
-				RTE_LOG(ERR, EAL, "  unsupported IOMMU type "
-					"detected in VFIO\n");
-			close(vfio_container_fd);
+		ret = pci_vfio_has_supported_extensions(vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  no supported IOMMU "
+					"extensions found!\n");
 			return -1;
 		}
 
@@ -432,6 +441,7 @@ pci_vfio_get_group_fd(int iommu_group_no)
 
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		/* try regular group format */
 		snprintf(filename, sizeof(filename),
 				 VFIO_GROUP_FMT, iommu_group_no);
 		vfio_group_fd = open(filename, O_RDWR);
@@ -442,7 +452,20 @@ pci_vfio_get_group_fd(int iommu_group_no)
 						strerror(errno));
 				return -1;
 			}
-			return 0;
+
+			/* special case: try no-IOMMU path as well */
+			snprintf(filename, sizeof(filename),
+					VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+			vfio_group_fd = open(filename, O_RDWR);
+			if (vfio_group_fd < 0) {
+				if (errno != ENOENT) {
+					RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+							strerror(errno));
+					return -1;
+				}
+				return 0;
+			}
+			/* noiommu group found */
 		}
 
 		/* if the fd is valid, create a new group for it */
@@ -660,14 +683,21 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	}
 
 	/*
-	 * set up DMA mappings for container
+	 * pick an IOMMU type and set up DMA mappings for container
 	 *
 	 * needs to be done only once, only when at least one group is assigned to
 	 * a container and only in primary process
 	 */
 	if (internal_config.process_type == RTE_PROC_PRIMARY &&
 			vfio_cfg.vfio_container_has_dma == 0) {
-		ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+		/* select an IOMMU type which we will be using */
+		const struct vfio_iommu_type *t =
+				pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+		if (!t) {
+			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", pci_addr);
+			return -1;
+		}
+		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
 					"error %i (%s)\n", pci_addr, errno, strerror(errno));
@@ -887,35 +917,41 @@ pci_vfio_enable(void)
 {
 	/* initialize group list */
 	int i;
-	int module_vfio_type1;
+	int vfio_available;
 
 	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
 		vfio_cfg.vfio_groups[i].fd = -1;
 		vfio_cfg.vfio_groups[i].group_no = -1;
 	}
 
-	module_vfio_type1 = rte_eal_check_module("vfio_iommu_type1");
+	/* inform the user that we are probing for VFIO */
+	RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+	/* check if vfio-pci module is loaded */
+	vfio_available = rte_eal_check_module("vfio_pci");
 
 	/* return error directly */
-	if (module_vfio_type1 == -1) {
+	if (vfio_available == -1) {
 		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
 		return -1;
 	}
 
 	/* return 0 if VFIO modules not loaded */
-	if (module_vfio_type1 == 0) {
-		RTE_LOG(INFO, EAL, "VFIO modules not all loaded, "
-			"skip VFIO support...\n");
+	if (vfio_available == 0) {
+		RTE_LOG(INFO, EAL, "VFIO modules not loaded, "
+			"skipping VFIO support...\n");
 		return 0;
 	}
 
 	vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1)
+	if (vfio_cfg.vfio_container_fd != -1) {
+		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
 		vfio_cfg.vfio_enabled = 1;
-	else
+	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+	}
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c
new file mode 100644
index 0000000..50d3563
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c
@@ -0,0 +1,84 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+
+#include "eal_pci_init.h"
+
+#ifdef VFIO_PRESENT
+
+int
+vfio_iommu_type1_dma_map(int vfio_container_fd)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	int i, ret;
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		struct vfio_iommu_type1_dma_map dma_map;
+
+		if (ms[i].addr == NULL)
+			break;
+
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = ms[i].addr_64;
+		dma_map.size = ms[i].len;
+		dma_map.iova = ms[i].phys_addr;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int
+vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 72ec3f6..638ee31 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -52,6 +52,11 @@
 #define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
+/* older kernels may not have no-IOMMU mode */
+#ifndef VFIO_NOIOMMU_IOMMU
+#define VFIO_NOIOMMU_IOMMU 8
+#endif
+
 #define VFIO_PRESENT
 #endif /* kernel version */
 #endif /* RTE_EAL_VFIO */
-- 
2.4.3

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
  2015-12-21 20:16 [dpdk-dev] [PATCH] vfio: Support for no-IOMMU mode Anatoly Burakov
@ 2016-01-13 12:36 ` Anatoly Burakov
  2016-01-13 16:45   ` Stephen Hemminger
                     ` (2 more replies)
  0 siblings, 3 replies; 25+ messages in thread
From: Anatoly Burakov @ 2016-01-13 12:36 UTC (permalink / raw)
  To: dev

This commit adds a generic mechanism for supporting multiple IOMMU
types. For now, only type 1 (x86 IOMMU) and no-IOMMU (a special
VFIO mode that doesn't use an IOMMU at all) are supported, but the
mechanism is easily extended by adding the necessary definitions to
eal_pci_init.h and a DMA mapping function to eal_pci_vfio_dma.c.

Since the type 1 IOMMU module is no longer required for VFIO to work,
we change the module check to look for vfio-pci instead. This is not
ideal, as it triggers VFIO checks more often (and thus produces more
error output, which was the reason behind the module check in the
first place), so we compensate by providing more verbose logging
that indicates whether VFIO initialization has succeeded or
failed.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Santosh Shukla <sshukla@mvista.com>
Tested-by: Santosh Shukla <sshukla@mvista.com>
---
v2 changes:
  Compile fix (hat-tip to Santosh Shukla)
  Tested-by is provisional, since only superficial testing was done
---
 lib/librte_eal/linuxapp/eal/Makefile           |   1 +
 lib/librte_eal/linuxapp/eal/eal_pci_init.h     |  22 ++++
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c     | 143 ++++++++++++++++---------
 lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c |  84 +++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |   5 +
 5 files changed, 202 insertions(+), 53 deletions(-)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c

diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 26eced5..5c9e9d9 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -59,6 +59,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_log.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_uio.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio_dma.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio_mp_sync.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_debug.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_lcore.c
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
index a17c708..da1c431 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
@@ -106,6 +106,28 @@ struct vfio_config {
 	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
 };
 
+/* function pointer typedef for DMA mapping functions */
+typedef  int (*vfio_dma_func_t)(int);
+
+/* Structure to hold supported IOMMU types */
+struct vfio_iommu_type {
+	int type_id;
+	const char *name;
+	vfio_dma_func_t dma_map_func;
+};
+
+/* function prototypes for different IOMMU types */
+int vfio_iommu_type1_dma_map(int container_fd);
+int vfio_iommu_noiommu_dma_map(int container_fd);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+		/* x86 IOMMU, otherwise known as type 1 */
+		{ VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
+		/* IOMMU-less mode */
+		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
+};
+
 #endif
 
 #endif /* EAL_PCI_INIT_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 74f91ba..5eb6cd0 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -72,6 +72,7 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
 #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
 
 /* per-process VFIO config */
@@ -208,42 +209,58 @@ pci_vfio_set_bus_master(int dev_fd)
 	return 0;
 }
 
-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
-
-	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-			VFIO_TYPE1_IOMMU);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+	unsigned idx;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
+
+		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+				t->type_id);
+		if (!ret) {
+			RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+					t->type_id, t->name);
+			return t;
+		}
+		/* not an error, there may be more supported IOMMU types */
+		RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
+				"error %i (%s)\n", t->type_id, t->name, errno,
+				strerror(errno));
 	}
+	/* if we didn't find a suitable IOMMU type, fail */
+	return NULL;
+}
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
-
-		if (ms[i].addr == NULL)
-			break;
-
-		memset(&dma_map, 0, sizeof(dma_map));
-		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		dma_map.iova = ms[i].phys_addr;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-
-		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+/* check if we have any supported extensions */
+static int
+pci_vfio_has_supported_extensions(int vfio_container_fd) {
+	int ret;
+	unsigned idx, n_extensions = 0;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
 
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					"error %i (%s)\n", errno, strerror(errno));
+		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+				t->type_id);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+			close(vfio_container_fd);
 			return -1;
+		} else if (ret == 1) {
+			/* we found a supported extension */
+			n_extensions++;
 		}
+		RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
+				t->type_id, t->name,
+				ret ? "supported" : "not supported");
+	}
+
+	/* if we didn't find any supported IOMMU types, fail */
+	if (!n_extensions) {
+		close(vfio_container_fd);
+		return -1;
 	}
 
 	return 0;
@@ -372,17 +389,10 @@ pci_vfio_get_container_fd(void)
 			return -1;
 		}
 
-		/* check if we support IOMMU type 1 */
-		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
-		if (ret != 1) {
-			if (ret < 0)
-				RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
-					"error %i (%s)\n", errno,
-					strerror(errno));
-			else
-				RTE_LOG(ERR, EAL, "  unsupported IOMMU type "
-					"detected in VFIO\n");
-			close(vfio_container_fd);
+		ret = pci_vfio_has_supported_extensions(vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  no supported IOMMU "
+					"extensions found!\n");
 			return -1;
 		}
 
@@ -432,6 +442,7 @@ pci_vfio_get_group_fd(int iommu_group_no)
 
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		/* try regular group format */
 		snprintf(filename, sizeof(filename),
 				 VFIO_GROUP_FMT, iommu_group_no);
 		vfio_group_fd = open(filename, O_RDWR);
@@ -442,7 +453,20 @@ pci_vfio_get_group_fd(int iommu_group_no)
 						strerror(errno));
 				return -1;
 			}
-			return 0;
+
+			/* special case: try no-IOMMU path as well */
+			snprintf(filename, sizeof(filename),
+					VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+			vfio_group_fd = open(filename, O_RDWR);
+			if (vfio_group_fd < 0) {
+				if (errno != ENOENT) {
+					RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+							strerror(errno));
+					return -1;
+				}
+				return 0;
+			}
+			/* noiommu group found */
 		}
 
 		/* if the fd is valid, create a new group for it */
@@ -660,14 +684,21 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	}
 
 	/*
-	 * set up DMA mappings for container
+	 * pick an IOMMU type and set up DMA mappings for container
 	 *
 	 * needs to be done only once, only when at least one group is assigned to
 	 * a container and only in primary process
 	 */
 	if (internal_config.process_type == RTE_PROC_PRIMARY &&
 			vfio_cfg.vfio_container_has_dma == 0) {
-		ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+		/* select an IOMMU type which we will be using */
+		const struct vfio_iommu_type *t =
+				pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+		if (!t) {
+			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", pci_addr);
+			return -1;
+		}
+		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
 					"error %i (%s)\n", pci_addr, errno, strerror(errno));
@@ -887,35 +918,41 @@ pci_vfio_enable(void)
 {
 	/* initialize group list */
 	int i;
-	int module_vfio_type1;
+	int vfio_available;
 
 	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
 		vfio_cfg.vfio_groups[i].fd = -1;
 		vfio_cfg.vfio_groups[i].group_no = -1;
 	}
 
-	module_vfio_type1 = rte_eal_check_module("vfio_iommu_type1");
+	/* inform the user that we are probing for VFIO */
+	RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+	/* check if vfio-pci module is loaded */
+	vfio_available = rte_eal_check_module("vfio_pci");
 
 	/* return error directly */
-	if (module_vfio_type1 == -1) {
+	if (vfio_available == -1) {
 		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
 		return -1;
 	}
 
 	/* return 0 if VFIO modules not loaded */
-	if (module_vfio_type1 == 0) {
-		RTE_LOG(INFO, EAL, "VFIO modules not all loaded, "
-			"skip VFIO support...\n");
+	if (vfio_available == 0) {
+		RTE_LOG(INFO, EAL, "VFIO modules not loaded, "
+			"skipping VFIO support...\n");
 		return 0;
 	}
 
 	vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1)
+	if (vfio_cfg.vfio_container_fd != -1) {
+		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
 		vfio_cfg.vfio_enabled = 1;
-	else
+	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+	}
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c
new file mode 100644
index 0000000..50d3563
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c
@@ -0,0 +1,84 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+
+#include "eal_pci_init.h"
+
+#ifdef VFIO_PRESENT
+
+int
+vfio_iommu_type1_dma_map(int vfio_container_fd)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	int i, ret;
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		struct vfio_iommu_type1_dma_map dma_map;
+
+		if (ms[i].addr == NULL)
+			break;
+
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = ms[i].addr_64;
+		dma_map.size = ms[i].len;
+		dma_map.iova = ms[i].phys_addr;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int
+vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 72ec3f6..638ee31 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -52,6 +52,11 @@
 #define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
+/* older kernels may not have no-IOMMU mode */
+#ifndef VFIO_NOIOMMU_IOMMU
+#define VFIO_NOIOMMU_IOMMU 8
+#endif
+
 #define VFIO_PRESENT
 #endif /* kernel version */
 #endif /* RTE_EAL_VFIO */
-- 
2.5.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
  2016-01-13 12:36 ` [dpdk-dev] [PATCH v2] " Anatoly Burakov
@ 2016-01-13 16:45   ` Stephen Hemminger
  2016-01-14  9:50     ` Burakov, Anatoly
  2016-01-27  9:05   ` Thomas Monjalon
  2016-01-27 14:04   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
  2 siblings, 1 reply; 25+ messages in thread
From: Stephen Hemminger @ 2016-01-13 16:45 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev

On Wed, 13 Jan 2016 12:36:09 +0000
Anatoly Burakov <anatoly.burakov@intel.com> wrote:

> +/* IOMMU types we support */
> +static const struct vfio_iommu_type iommu_types[] = {
> +		/* x86 IOMMU, otherwise known as type 1 */
> +		{ VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
> +		/* IOMMU-less mode */
> +		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
> +};
> +

Nit.. Why full-tab indent here?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
  2016-01-13 16:45   ` Stephen Hemminger
@ 2016-01-14  9:50     ` Burakov, Anatoly
  0 siblings, 0 replies; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-14  9:50 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev

Hi Stephen,

> > +/* IOMMU types we support */
> > +static const struct vfio_iommu_type iommu_types[] = {
> > +		/* x86 IOMMU, otherwise known as type 1 */
> > +		{ VFIO_TYPE1_IOMMU, "Type 1",
> &vfio_iommu_type1_dma_map},
> > +		/* IOMMU-less mode */
> > +		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU",
> &vfio_iommu_noiommu_dma_map},
> > +};
> > +
> 
> Nit.. Why full-tab indent here?

Readability mainly... at least it's more readable to me that way. I can change that if necessary.

Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
  2016-01-13 12:36 ` [dpdk-dev] [PATCH v2] " Anatoly Burakov
  2016-01-13 16:45   ` Stephen Hemminger
@ 2016-01-27  9:05   ` Thomas Monjalon
  2016-01-27 10:08     ` Burakov, Anatoly
  2016-01-27 14:04   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
  2 siblings, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-01-27  9:05 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev

Hi Anatoly,

A few small comments.

The comments "function pointer typedef" or "structure to hold" don't
bring new information. Please keep it short.

2016-01-13 12:36, Anatoly Burakov:
> +/* function pointer typedef for DMA mapping functions */

->	DMA mapping function type
It would be relevant to describe the return and the parameter.

> +typedef  int (*vfio_dma_func_t)(int);
> +
> +/* Structure to hold supported IOMMU types */

This comment seems useless.

> +struct vfio_iommu_type {

[...]
> +/* function prototypes for different IOMMU types */

idem

> +int vfio_iommu_type1_dma_map(int container_fd);
> +int vfio_iommu_noiommu_dma_map(int container_fd);
> +
> +/* IOMMU types we support */
> +static const struct vfio_iommu_type iommu_types[] = {
> +		/* x86 IOMMU, otherwise known as type 1 */
> +		{ VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
> +		/* IOMMU-less mode */
> +		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
> +};

[...]
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c

Why a new file for these functions?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
  2016-01-27  9:05   ` Thomas Monjalon
@ 2016-01-27 10:08     ` Burakov, Anatoly
  2016-01-27 10:12       ` Thomas Monjalon
  0 siblings, 1 reply; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-27 10:08 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

> The comments "function pointer typedef" or "structure to hold" don't
> bring new information. Please keep it short.

I'll fix that and submit a v3, thanks.

> Why a new file for these functions?

Well, my thought was to make future extensions easier by way of avoiding mixing irrelevant and/or general code with driver-specific code. I can change it back if that's not OK.

Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
  2016-01-27 10:08     ` Burakov, Anatoly
@ 2016-01-27 10:12       ` Thomas Monjalon
  2016-01-27 10:24         ` David Marchand
  0 siblings, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-01-27 10:12 UTC (permalink / raw)
  To: Burakov, Anatoly, david.marchand; +Cc: dev

2016-01-27 10:08, Burakov, Anatoly:
> > Why a new file for these functions?
> 
> Well, my thought was to make future extensions easier by way of avoiding mixing irrelevant and/or general code with driver-specific code. I can change it back if that's not OK.

No strong opinion here.
David?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
  2016-01-27 10:12       ` Thomas Monjalon
@ 2016-01-27 10:24         ` David Marchand
  2016-01-27 10:29           ` Burakov, Anatoly
  0 siblings, 1 reply; 25+ messages in thread
From: David Marchand @ 2016-01-27 10:24 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

On Wed, Jan 27, 2016 at 11:12 AM, Thomas Monjalon
<thomas.monjalon@6wind.com> wrote:
> 2016-01-27 10:08, Burakov, Anatoly:
>> > Why a new file for these functions?
>>
>> Well, my thought was to make future extensions easier by way of avoiding mixing irrelevant and/or general code with driver-specific code. I can change it back if that's not OK.
>
> No strong opinion here.
> David?

Hum, no strong opinion either, but I don't think we really need to
split this file for this much code.
Besides, if we keep all code in eal_pci_vfio.c, there is no need to
expose those structures through eal_pci_init.h.


-- 
David Marchand

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
  2016-01-27 10:24         ` David Marchand
@ 2016-01-27 10:29           ` Burakov, Anatoly
  0 siblings, 0 replies; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-27 10:29 UTC (permalink / raw)
  To: David Marchand, Thomas Monjalon; +Cc: dev

> >> > Why a new file for these functions?
> >>
> >> Well, my thought was to make future extensions easier by way of
> avoiding mixing irrelevant and/or general code with driver-specific code. I can
> change it back if that's not OK.
> >
> > No strong opinion here.
> > David?
> 
> Hum, no strong opinion either, but I don't think we really need to split this
> file for this much code.
> Besides, if we keep all code in eal_pci_vfio.c, there is no need to expose
> those structures through eal_pci_init.h.

OK then, I'll merge it back into the eal_pci_vfio.c

Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode
  2016-01-13 12:36 ` [dpdk-dev] [PATCH v2] " Anatoly Burakov
  2016-01-13 16:45   ` Stephen Hemminger
  2016-01-27  9:05   ` Thomas Monjalon
@ 2016-01-27 14:04   ` Anatoly Burakov
  2016-01-27 14:23     ` Burakov, Anatoly
  2016-01-27 14:32     ` [dpdk-dev] [PATCH v4] " Anatoly Burakov
  2 siblings, 2 replies; 25+ messages in thread
From: Anatoly Burakov @ 2016-01-27 14:04 UTC (permalink / raw)
  To: dev

This commit is adding a generic mechanism to support multiple IOMMU
types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special
VFIO mode that doesn't use IOMMU at all), but it's easily extended
by adding necessary definitions into eal_pci_init.h and a DMA
mapping function to eal_pci_vfio_dma.c.

Since type 1 IOMMU module is no longer necessary to have VFIO,
we fix the module check to check for vfio-pci instead. It's not
ideal and triggers VFIO checks more often (and thus produces more
error output, which was the reason behind the module check in the
first place), so we compensate for that by providing more verbose
logging, indicating whether VFIO initialization has succeeded or
failed.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Tested-by: Santosh Shukla <sshukla@mvista.com>
---
v3 changes:
  Merging DMA mapping functions back into eal_pci_vfio.c
  Fixing and adding comments

v2 changes:
  Compile fix (hat-tip to Santosh Shukla)
  Tested-by is provisional, since only superficial testing was done

 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +++++++++++++++++++++--------
 lib/librte_eal/linuxapp/eal/eal_vfio.h     |   5 +
 2 files changed, 157 insertions(+), 53 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 74f91ba..fdf334b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
 #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
 
 /* per-process VFIO config */
 static struct vfio_config vfio_cfg;
 
+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ * */
+typedef  int (*vfio_dma_func_t)(int);
+
+struct vfio_iommu_type {
+	int type_id;
+	const char *name;
+	vfio_dma_func_t dma_map_func;
+};
+
+int vfio_iommu_type1_dma_map(int);
+int vfio_iommu_noiommu_dma_map(int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+		/* x86 IOMMU, otherwise known as type 1 */
+		{ VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
+		/* IOMMU-less mode */
+		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
+};
+
+int
+vfio_iommu_type1_dma_map(int vfio_container_fd)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	int i, ret;
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		struct vfio_iommu_type1_dma_map dma_map;
+
+		if (ms[i].addr == NULL)
+			break;
+
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = ms[i].addr_64;
+		dma_map.size = ms[i].len;
+		dma_map.iova = ms[i].phys_addr;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int
+vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
 int
 pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
 		    void *buf, size_t len, off_t offs)
@@ -208,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd)
 	return 0;
 }
 
-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
-
-	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-			VFIO_TYPE1_IOMMU);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+	unsigned idx;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
+
+		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+				t->type_id);
+		if (!ret) {
+			RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+					t->type_id, t->name);
+			return t;
+		}
+		/* not an error, there may be more supported IOMMU types */
+		RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
+				"error %i (%s)\n", t->type_id, t->name, errno,
+				strerror(errno));
 	}
+	/* if we didn't find a suitable IOMMU type, fail */
+	return NULL;
+}
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
-
-		if (ms[i].addr == NULL)
-			break;
-
-		memset(&dma_map, 0, sizeof(dma_map));
-		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		dma_map.iova = ms[i].phys_addr;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-
-		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+/* check if we have any supported extensions */
+static int
+pci_vfio_has_supported_extensions(int vfio_container_fd) {
+	int ret;
+	unsigned idx, n_extensions = 0;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
 
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					"error %i (%s)\n", errno, strerror(errno));
+		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+				t->type_id);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+			close(vfio_container_fd);
 			return -1;
+		} else if (ret == 1) {
+			/* we found a supported extension */
+			n_extensions++;
 		}
+		RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
+				t->type_id, t->name,
+				ret ? "supported" : "not supported");
+	}
+
+	/* if we didn't find any supported IOMMU types, fail */
+	if (!n_extensions) {
+		close(vfio_container_fd);
+		return -1;
 	}
 
 	return 0;
@@ -372,17 +451,10 @@ pci_vfio_get_container_fd(void)
 			return -1;
 		}
 
-		/* check if we support IOMMU type 1 */
-		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
-		if (ret != 1) {
-			if (ret < 0)
-				RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
-					"error %i (%s)\n", errno,
-					strerror(errno));
-			else
-				RTE_LOG(ERR, EAL, "  unsupported IOMMU type "
-					"detected in VFIO\n");
-			close(vfio_container_fd);
+		ret = pci_vfio_has_supported_extensions(vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  no supported IOMMU "
+					"extensions found!\n");
 			return -1;
 		}
 
@@ -432,6 +504,7 @@ pci_vfio_get_group_fd(int iommu_group_no)
 
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		/* try regular group format */
 		snprintf(filename, sizeof(filename),
 				 VFIO_GROUP_FMT, iommu_group_no);
 		vfio_group_fd = open(filename, O_RDWR);
@@ -442,7 +515,20 @@ pci_vfio_get_group_fd(int iommu_group_no)
 						strerror(errno));
 				return -1;
 			}
-			return 0;
+
+			/* special case: try no-IOMMU path as well */
+			snprintf(filename, sizeof(filename),
+					VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+			vfio_group_fd = open(filename, O_RDWR);
+			if (vfio_group_fd < 0) {
+				if (errno != ENOENT) {
+					RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+							strerror(errno));
+					return -1;
+				}
+				return 0;
+			}
+			/* noiommu group found */
 		}
 
 		/* if the fd is valid, create a new group for it */
@@ -660,14 +746,21 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	}
 
 	/*
-	 * set up DMA mappings for container
+	 * pick an IOMMU type and set up DMA mappings for container
 	 *
 	 * needs to be done only once, only when at least one group is assigned to
 	 * a container and only in primary process
 	 */
 	if (internal_config.process_type == RTE_PROC_PRIMARY &&
 			vfio_cfg.vfio_container_has_dma == 0) {
-		ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+		/* select an IOMMU type which we will be using */
+		const struct vfio_iommu_type *t =
+				pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+		if (!t) {
+			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", pci_addr);
+			return -1;
+		}
+		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
 					"error %i (%s)\n", pci_addr, errno, strerror(errno));
@@ -887,35 +980,41 @@ pci_vfio_enable(void)
 {
 	/* initialize group list */
 	int i;
-	int module_vfio_type1;
+	int vfio_available;
 
 	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
 		vfio_cfg.vfio_groups[i].fd = -1;
 		vfio_cfg.vfio_groups[i].group_no = -1;
 	}
 
-	module_vfio_type1 = rte_eal_check_module("vfio_iommu_type1");
+	/* inform the user that we are probing for VFIO */
+	RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+	/* check if vfio-pci module is loaded */
+	vfio_available = rte_eal_check_module("vfio_pci");
 
 	/* return error directly */
-	if (module_vfio_type1 == -1) {
+	if (vfio_available == -1) {
 		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
 		return -1;
 	}
 
 	/* return 0 if VFIO modules not loaded */
-	if (module_vfio_type1 == 0) {
-		RTE_LOG(INFO, EAL, "VFIO modules not all loaded, "
-			"skip VFIO support...\n");
+	if (vfio_available == 0) {
+		RTE_LOG(INFO, EAL, "VFIO modules not loaded, "
+			"skipping VFIO support...\n");
 		return 0;
 	}
 
 	vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1)
+	if (vfio_cfg.vfio_container_fd != -1) {
+		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
 		vfio_cfg.vfio_enabled = 1;
-	else
+	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+	}
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 72ec3f6..638ee31 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -52,6 +52,11 @@
 #define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
+/* older kernels may not have no-IOMMU mode */
+#ifndef VFIO_NOIOMMU_IOMMU
+#define VFIO_NOIOMMU_IOMMU 8
+#endif
+
 #define VFIO_PRESENT
 #endif /* kernel version */
 #endif /* RTE_EAL_VFIO */
-- 
2.5.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode
  2016-01-27 14:04   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
@ 2016-01-27 14:23     ` Burakov, Anatoly
  2016-01-27 14:32     ` [dpdk-dev] [PATCH v4] " Anatoly Burakov
  1 sibling, 0 replies; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-27 14:23 UTC (permalink / raw)
  To: Burakov, Anatoly, dev

Apologies, I lost the sign-off from Santosh Shukla, and the commit message still mentions a file that no longer exists, so I'll submit a v4.

Thanks,
Anatoly


> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Anatoly Burakov
> Sent: Wednesday, January 27, 2016 2:05 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode
> 
> This commit is adding a generic mechanism to support multiple IOMMU
> types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special VFIO
> mode that doesn't use IOMMU at all), but it's easily extended by adding
> necessary definitions into eal_pci_init.h and a DMA mapping function to
> eal_pci_vfio_dma.c.
> 
> Since type 1 IOMMU module is no longer necessary to have VFIO, we fix the
> module check to check for vfio-pci instead. It's not ideal and triggers VFIO
> checks more often (and thus produces more error output, which was the
> reason behind the module check in the first place), so we compensate for
> that by providing more verbose logging, indicating whether VFIO initialization
> has succeeded or failed.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Tested-by: Santosh Shukla <sshukla@mvista.com>
> ---
> v3 changes:
>   Merging DMA mapping functions back into eal_pci_vfio.c
>   Fixing and adding comments
> 
> v2 changes:
>   Compile fix (hat-tip to Santosh Shukla)
>   Tested-by is provisional, since only superficial testing was done
> 
>  lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +++++++++++++++++++++--
> ------
>  lib/librte_eal/linuxapp/eal/eal_vfio.h     |   5 +
>  2 files changed, 157 insertions(+), 53 deletions(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> index 74f91ba..fdf334b 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> @@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
>  #define VFIO_DIR "/dev/vfio"
>  #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
>  #define VFIO_GROUP_FMT "/dev/vfio/%u"
> +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
>  #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
> 
>  /* per-process VFIO config */
>  static struct vfio_config vfio_cfg;
> 
> +/* DMA mapping function prototype.
> + * Takes VFIO container fd as a parameter.
> + * Returns 0 on success, -1 on error.
> + * */
> +typedef  int (*vfio_dma_func_t)(int);
> +
> +struct vfio_iommu_type {
> +	int type_id;
> +	const char *name;
> +	vfio_dma_func_t dma_map_func;
> +};
> +
> +int vfio_iommu_type1_dma_map(int);
> +int vfio_iommu_noiommu_dma_map(int);
> +
> +/* IOMMU types we support */
> +static const struct vfio_iommu_type iommu_types[] = {
> +		/* x86 IOMMU, otherwise known as type 1 */
> +		{ VFIO_TYPE1_IOMMU, "Type 1",
> &vfio_iommu_type1_dma_map},
> +		/* IOMMU-less mode */
> +		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU",
> &vfio_iommu_noiommu_dma_map}, };
> +
> +int
> +vfio_iommu_type1_dma_map(int vfio_container_fd) {
> +	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
> +	int i, ret;
> +
> +	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> +	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> +		struct vfio_iommu_type1_dma_map dma_map;
> +
> +		if (ms[i].addr == NULL)
> +			break;
> +
> +		memset(&dma_map, 0, sizeof(dma_map));
> +		dma_map.argsz = sizeof(struct
> vfio_iommu_type1_dma_map);
> +		dma_map.vaddr = ms[i].addr_64;
> +		dma_map.size = ms[i].len;
> +		dma_map.iova = ms[i].phys_addr;
> +		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
> VFIO_DMA_MAP_FLAG_WRITE;
> +
> +		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
> &dma_map);
> +
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping,
> "
> +					"error %i (%s)\n", errno,
> strerror(errno));
> +			return -1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +int
> +vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd) {
> +	/* No-IOMMU mode does not need DMA mapping */
> +	return 0;
> +}
> +
>  int
>  pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
>  		    void *buf, size_t len, off_t offs) @@ -208,42 +271,58 @@
> pci_vfio_set_bus_master(int dev_fd)
>  	return 0;
>  }
> 
> -/* set up DMA mappings */
> -static int
> -pci_vfio_setup_dma_maps(int vfio_container_fd) -{
> -	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
> -	int i, ret;
> -
> -	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
> -			VFIO_TYPE1_IOMMU);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
> -				"error %i (%s)\n", errno, strerror(errno));
> -		return -1;
> +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for
> +error */ static const struct vfio_iommu_type *
> +pci_vfio_set_iommu_type(int vfio_container_fd) {
> +	unsigned idx;
> +	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
> +		const struct vfio_iommu_type *t = &iommu_types[idx];
> +
> +		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
> +				t->type_id);
> +		if (!ret) {
> +			RTE_LOG(NOTICE, EAL, "  using IOMMU type %d
> (%s)\n",
> +					t->type_id, t->name);
> +			return t;
> +		}
> +		/* not an error, there may be more supported IOMMU types
> */
> +		RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
> +				"error %i (%s)\n", t->type_id, t->name,
> errno,
> +				strerror(errno));
>  	}
> +	/* if we didn't find a suitable IOMMU type, fail */
> +	return NULL;
> +}
> 
> -	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> -	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> -		struct vfio_iommu_type1_dma_map dma_map;
> -
> -		if (ms[i].addr == NULL)
> -			break;
> -
> -		memset(&dma_map, 0, sizeof(dma_map));
> -		dma_map.argsz = sizeof(struct
> vfio_iommu_type1_dma_map);
> -		dma_map.vaddr = ms[i].addr_64;
> -		dma_map.size = ms[i].len;
> -		dma_map.iova = ms[i].phys_addr;
> -		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
> VFIO_DMA_MAP_FLAG_WRITE;
> -
> -		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
> &dma_map);
> +/* check if we have any supported extensions */ static int
> +pci_vfio_has_supported_extensions(int vfio_container_fd) {
> +	int ret;
> +	unsigned idx, n_extensions = 0;
> +	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
> +		const struct vfio_iommu_type *t = &iommu_types[idx];
> 
> -		if (ret) {
> -			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping,
> "
> -					"error %i (%s)\n", errno,
> strerror(errno));
> +		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
> +				t->type_id);
> +		if (ret < 0) {
> +			RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
> +				"error %i (%s)\n", errno,
> +				strerror(errno));
> +			close(vfio_container_fd);
>  			return -1;
> +		} else if (ret == 1) {
> +			/* we found a supported extension */
> +			n_extensions++;
>  		}
> +		RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
> +				t->type_id, t->name,
> +				ret ? "supported" : "not supported");
> +	}
> +
> +	/* if we didn't find any supported IOMMU types, fail */
> +	if (!n_extensions) {
> +		close(vfio_container_fd);
> +		return -1;
>  	}
> 
>  	return 0;
> @@ -372,17 +451,10 @@ pci_vfio_get_container_fd(void)
>  			return -1;
>  		}
> 
> -		/* check if we support IOMMU type 1 */
> -		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
> VFIO_TYPE1_IOMMU);
> -		if (ret != 1) {
> -			if (ret < 0)
> -				RTE_LOG(ERR, EAL, "  could not get IOMMU
> type, "
> -					"error %i (%s)\n", errno,
> -					strerror(errno));
> -			else
> -				RTE_LOG(ERR, EAL, "  unsupported IOMMU
> type "
> -					"detected in VFIO\n");
> -			close(vfio_container_fd);
> +		ret =
> pci_vfio_has_supported_extensions(vfio_container_fd);
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "  no supported IOMMU "
> +					"extensions found!\n");
>  			return -1;
>  		}
> 
> @@ -432,6 +504,7 @@ pci_vfio_get_group_fd(int iommu_group_no)
> 
>  	/* if primary, try to open the group */
>  	if (internal_config.process_type == RTE_PROC_PRIMARY) {
> +		/* try regular group format */
>  		snprintf(filename, sizeof(filename),
>  				 VFIO_GROUP_FMT, iommu_group_no);
>  		vfio_group_fd = open(filename, O_RDWR); @@ -442,7
> +515,20 @@ pci_vfio_get_group_fd(int iommu_group_no)
>  						strerror(errno));
>  				return -1;
>  			}
> -			return 0;
> +
> +			/* special case: try no-IOMMU path as well */
> +			snprintf(filename, sizeof(filename),
> +					VFIO_NOIOMMU_GROUP_FMT,
> iommu_group_no);
> +			vfio_group_fd = open(filename, O_RDWR);
> +			if (vfio_group_fd < 0) {
> +				if (errno != ENOENT) {
> +					RTE_LOG(ERR, EAL, "Cannot open %s:
> %s\n", filename,
> +							strerror(errno));
> +					return -1;
> +				}
> +				return 0;
> +			}
> +			/* noiommu group found */
>  		}
> 
>  		/* if the fd is valid, create a new group for it */ @@ -660,14
> +746,21 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
>  	}
> 
>  	/*
> -	 * set up DMA mappings for container
> +	 * pick an IOMMU type and set up DMA mappings for container
>  	 *
>  	 * needs to be done only once, only when at least one group is
> assigned to
>  	 * a container and only in primary process
>  	 */
>  	if (internal_config.process_type == RTE_PROC_PRIMARY &&
>  			vfio_cfg.vfio_container_has_dma == 0) {
> -		ret =
> pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
> +		/* select an IOMMU type which we will be using */
> +		const struct vfio_iommu_type *t =
> +
> 	pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
> +		if (!t) {
> +			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU
> type\n", pci_addr);
> +			return -1;
> +		}
> +		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
>  		if (ret) {
>  			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
>  					"error %i (%s)\n", pci_addr, errno,
> strerror(errno)); @@ -887,35 +980,41 @@ pci_vfio_enable(void)  {
>  	/* initialize group list */
>  	int i;
> -	int module_vfio_type1;
> +	int vfio_available;
> 
>  	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
>  		vfio_cfg.vfio_groups[i].fd = -1;
>  		vfio_cfg.vfio_groups[i].group_no = -1;
>  	}
> 
> -	module_vfio_type1 = rte_eal_check_module("vfio_iommu_type1");
> +	/* inform the user that we are probing for VFIO */
> +	RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
> +
> +	/* check if vfio-pci module is loaded */
> +	vfio_available = rte_eal_check_module("vfio_pci");
> 
>  	/* return error directly */
> -	if (module_vfio_type1 == -1) {
> +	if (vfio_available == -1) {
>  		RTE_LOG(INFO, EAL, "Could not get loaded module
> details!\n");
>  		return -1;
>  	}
> 
>  	/* return 0 if VFIO modules not loaded */
> -	if (module_vfio_type1 == 0) {
> -		RTE_LOG(INFO, EAL, "VFIO modules not all loaded, "
> -			"skip VFIO support...\n");
> +	if (vfio_available == 0) {
> +		RTE_LOG(INFO, EAL, "VFIO modules not loaded, "
> +			"skipping VFIO support...\n");
>  		return 0;
>  	}
> 
>  	vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
> 
>  	/* check if we have VFIO driver enabled */
> -	if (vfio_cfg.vfio_container_fd != -1)
> +	if (vfio_cfg.vfio_container_fd != -1) {
> +		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
>  		vfio_cfg.vfio_enabled = 1;
> -	else
> +	} else {
>  		RTE_LOG(NOTICE, EAL, "VFIO support could not be
> initialized\n");
> +	}
> 
>  	return 0;
>  }
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> index 72ec3f6..638ee31 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> @@ -52,6 +52,11 @@
>  #define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE  #endif
> 
> +/* older kernels may not have no-IOMMU mode */ #ifndef
> +VFIO_NOIOMMU_IOMMU #define VFIO_NOIOMMU_IOMMU 8 #endif
> +
>  #define VFIO_PRESENT
>  #endif /* kernel version */
>  #endif /* RTE_EAL_VFIO */
> --
> 2.5.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode
  2016-01-27 14:04   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
  2016-01-27 14:23     ` Burakov, Anatoly
@ 2016-01-27 14:32     ` Anatoly Burakov
  2016-01-27 15:50       ` Thomas Monjalon
  2016-01-27 16:50       ` [dpdk-dev] [PATCH v5] " Anatoly Burakov
  1 sibling, 2 replies; 25+ messages in thread
From: Anatoly Burakov @ 2016-01-27 14:32 UTC (permalink / raw)
  To: dev

This commit is adding a generic mechanism to support multiple IOMMU
types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special
VFIO mode that doesn't use IOMMU at all), but it's easily extended
by adding the necessary definitions and a DMA mapping function to
eal_pci_vfio.c.

Since type 1 IOMMU module is no longer necessary to have VFIO,
we fix the module check to check for vfio-pci instead. It's not
ideal and triggers VFIO checks more often (and thus produces more
error output, which was the reason behind the module check in the
first place), so we compensate for that by providing more verbose
logging, indicating whether VFIO initialization has succeeded or
failed.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Santosh Shukla <sshukla@mvista.com>
Tested-by: Santosh Shukla <sshukla@mvista.com>
---
v4 changes:
  Fixed the commit message and added a missing sign-off

v3 changes:
  Merging DMA mapping functions back into eal_pci_vfio.c
  Fixing and adding comments

v2 changes:
  Compile fix (hat-tip to Santosh Shukla)
  Tested-by is provisional, since only superficial testing was done

 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +++++++++++++++++++++--------
 lib/librte_eal/linuxapp/eal/eal_vfio.h     |   5 +
 2 files changed, 157 insertions(+), 53 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 74f91ba..fdf334b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
 #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
 
 /* per-process VFIO config */
 static struct vfio_config vfio_cfg;
 
+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ * */
+typedef  int (*vfio_dma_func_t)(int);
+
+struct vfio_iommu_type {
+	int type_id;
+	const char *name;
+	vfio_dma_func_t dma_map_func;
+};
+
+int vfio_iommu_type1_dma_map(int);
+int vfio_iommu_noiommu_dma_map(int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+		/* x86 IOMMU, otherwise known as type 1 */
+		{ VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
+		/* IOMMU-less mode */
+		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
+};
+
+int
+vfio_iommu_type1_dma_map(int vfio_container_fd)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	int i, ret;
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		struct vfio_iommu_type1_dma_map dma_map;
+
+		if (ms[i].addr == NULL)
+			break;
+
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = ms[i].addr_64;
+		dma_map.size = ms[i].len;
+		dma_map.iova = ms[i].phys_addr;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int
+vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
 int
 pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
 		    void *buf, size_t len, off_t offs)
@@ -208,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd)
 	return 0;
 }
 
-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
-
-	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-			VFIO_TYPE1_IOMMU);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+	unsigned idx;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
+
+		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+				t->type_id);
+		if (!ret) {
+			RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+					t->type_id, t->name);
+			return t;
+		}
+		/* not an error, there may be more supported IOMMU types */
+		RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
+				"error %i (%s)\n", t->type_id, t->name, errno,
+				strerror(errno));
 	}
+	/* if we didn't find a suitable IOMMU type, fail */
+	return NULL;
+}
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
-
-		if (ms[i].addr == NULL)
-			break;
-
-		memset(&dma_map, 0, sizeof(dma_map));
-		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		dma_map.iova = ms[i].phys_addr;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-
-		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+/* check if we have any supported extensions */
+static int
+pci_vfio_has_supported_extensions(int vfio_container_fd) {
+	int ret;
+	unsigned idx, n_extensions = 0;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
 
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					"error %i (%s)\n", errno, strerror(errno));
+		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+				t->type_id);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+			close(vfio_container_fd);
 			return -1;
+		} else if (ret == 1) {
+			/* we found a supported extension */
+			n_extensions++;
 		}
+		RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
+				t->type_id, t->name,
+				ret ? "supported" : "not supported");
+	}
+
+	/* if we didn't find any supported IOMMU types, fail */
+	if (!n_extensions) {
+		close(vfio_container_fd);
+		return -1;
 	}
 
 	return 0;
@@ -372,17 +451,10 @@ pci_vfio_get_container_fd(void)
 			return -1;
 		}
 
-		/* check if we support IOMMU type 1 */
-		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
-		if (ret != 1) {
-			if (ret < 0)
-				RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
-					"error %i (%s)\n", errno,
-					strerror(errno));
-			else
-				RTE_LOG(ERR, EAL, "  unsupported IOMMU type "
-					"detected in VFIO\n");
-			close(vfio_container_fd);
+		ret = pci_vfio_has_supported_extensions(vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  no supported IOMMU "
+					"extensions found!\n");
 			return -1;
 		}
 
@@ -432,6 +504,7 @@ pci_vfio_get_group_fd(int iommu_group_no)
 
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		/* try regular group format */
 		snprintf(filename, sizeof(filename),
 				 VFIO_GROUP_FMT, iommu_group_no);
 		vfio_group_fd = open(filename, O_RDWR);
@@ -442,7 +515,20 @@ pci_vfio_get_group_fd(int iommu_group_no)
 						strerror(errno));
 				return -1;
 			}
-			return 0;
+
+			/* special case: try no-IOMMU path as well */
+			snprintf(filename, sizeof(filename),
+					VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+			vfio_group_fd = open(filename, O_RDWR);
+			if (vfio_group_fd < 0) {
+				if (errno != ENOENT) {
+					RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+							strerror(errno));
+					return -1;
+				}
+				return 0;
+			}
+			/* noiommu group found */
 		}
 
 		/* if the fd is valid, create a new group for it */
@@ -660,14 +746,21 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	}
 
 	/*
-	 * set up DMA mappings for container
+	 * pick an IOMMU type and set up DMA mappings for container
 	 *
 	 * needs to be done only once, only when at least one group is assigned to
 	 * a container and only in primary process
 	 */
 	if (internal_config.process_type == RTE_PROC_PRIMARY &&
 			vfio_cfg.vfio_container_has_dma == 0) {
-		ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+		/* select an IOMMU type which we will be using */
+		const struct vfio_iommu_type *t =
+				pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+		if (!t) {
+			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", pci_addr);
+			return -1;
+		}
+		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
 					"error %i (%s)\n", pci_addr, errno, strerror(errno));
@@ -887,35 +980,41 @@ pci_vfio_enable(void)
 {
 	/* initialize group list */
 	int i;
-	int module_vfio_type1;
+	int vfio_available;
 
 	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
 		vfio_cfg.vfio_groups[i].fd = -1;
 		vfio_cfg.vfio_groups[i].group_no = -1;
 	}
 
-	module_vfio_type1 = rte_eal_check_module("vfio_iommu_type1");
+	/* inform the user that we are probing for VFIO */
+	RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+	/* check if vfio-pci module is loaded */
+	vfio_available = rte_eal_check_module("vfio_pci");
 
 	/* return error directly */
-	if (module_vfio_type1 == -1) {
+	if (vfio_available == -1) {
 		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
 		return -1;
 	}
 
 	/* return 0 if VFIO modules not loaded */
-	if (module_vfio_type1 == 0) {
-		RTE_LOG(INFO, EAL, "VFIO modules not all loaded, "
-			"skip VFIO support...\n");
+	if (vfio_available == 0) {
+		RTE_LOG(INFO, EAL, "VFIO modules not loaded, "
+			"skipping VFIO support...\n");
 		return 0;
 	}
 
 	vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1)
+	if (vfio_cfg.vfio_container_fd != -1) {
+		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
 		vfio_cfg.vfio_enabled = 1;
-	else
+	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+	}
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 72ec3f6..638ee31 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -52,6 +52,11 @@
 #define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
+/* older kernels may not have no-IOMMU mode */
+#ifndef VFIO_NOIOMMU_IOMMU
+#define VFIO_NOIOMMU_IOMMU 8
+#endif
+
 #define VFIO_PRESENT
 #endif /* kernel version */
 #endif /* RTE_EAL_VFIO */
-- 
2.5.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode
  2016-01-27 14:32     ` [dpdk-dev] [PATCH v4] " Anatoly Burakov
@ 2016-01-27 15:50       ` Thomas Monjalon
  2016-01-27 16:01         ` Burakov, Anatoly
  2016-01-27 16:50       ` [dpdk-dev] [PATCH v5] " Anatoly Burakov
  1 sibling, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-01-27 15:50 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev

2016-01-27 14:32, Anatoly Burakov:
> +/* DMA mapping function prototype.
> + * Takes VFIO container fd as a parameter.
> + * Returns 0 on success, -1 on error.
> + * */
> +typedef  int (*vfio_dma_func_t)(int);
> +
> +struct vfio_iommu_type {
> +	int type_id;
> +	const char *name;
> +	vfio_dma_func_t dma_map_func;
> +};
> +
> +int vfio_iommu_type1_dma_map(int);
> +int vfio_iommu_noiommu_dma_map(int);

Is it possible (is it better) to declare these functions
with vfio_dma_func_t?

vfio_iommu_noiommu_dma_map is a weird name.
Why not vfio_noiommu_dma_map or vfio_iommu_none_dma_map?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode
  2016-01-27 15:50       ` Thomas Monjalon
@ 2016-01-27 16:01         ` Burakov, Anatoly
  2016-01-27 16:30           ` Burakov, Anatoly
  0 siblings, 1 reply; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-27 16:01 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

> > +/* DMA mapping function prototype.
> > + * Takes VFIO container fd as a parameter.
> > + * Returns 0 on success, -1 on error.
> > + * */
> > +typedef  int (*vfio_dma_func_t)(int);
> > +
> > +struct vfio_iommu_type {
> > +	int type_id;
> > +	const char *name;
> > +	vfio_dma_func_t dma_map_func;
> > +};
> > +
> > +int vfio_iommu_type1_dma_map(int);
> > +int vfio_iommu_noiommu_dma_map(int);
> 
> Is it possible (is it better) to declare these functions with vfio_dma_func_t?

Yeah, sure. Or maybe the other way around - maybe we could do away with the typedef. I'll go for the former though.

> vfio_iommu_noiommu_dma_map is a weird name.
> Why not vfio_noiommu_dma_map or vfio_iommu_none_dma_map?

Well, the NOIOMMU type is named VFIO_NOIOMMU_IOMMU in the VFIO headers. So it's consistent with the IOMMU type name. Although vfio_noiommu_dma_map seems reasonable.

Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode
  2016-01-27 16:01         ` Burakov, Anatoly
@ 2016-01-27 16:30           ` Burakov, Anatoly
  0 siblings, 0 replies; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-27 16:30 UTC (permalink / raw)
  To: Burakov, Anatoly, Thomas Monjalon; +Cc: dev

Hi Thomas,

> > Is it possible (is it better) to declare these functions with vfio_dma_func_t?
> 
> Yeah, sure. Or maybe the other way around - maybe we could do away with
> the typedef. I'll go for the former though.

No, we can't declare the functions with a function pointer. At least I don't see any obvious way to do that without incurring a multiple-declarations compile error. So I'll leave them as forward declarations. Of course, the other alternative is to put the array below the functions and make them static, to avoid forward declarations, but I think it's much clearer the way it is now.

Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v5] vfio: Support for no-IOMMU mode
  2016-01-27 14:32     ` [dpdk-dev] [PATCH v4] " Anatoly Burakov
  2016-01-27 15:50       ` Thomas Monjalon
@ 2016-01-27 16:50       ` Anatoly Burakov
  2016-01-27 17:07         ` Thomas Monjalon
  2016-01-28 11:57         ` [dpdk-dev] [PATCH v6] " Anatoly Burakov
  1 sibling, 2 replies; 25+ messages in thread
From: Anatoly Burakov @ 2016-01-27 16:50 UTC (permalink / raw)
  To: dev

This commit is adding a generic mechanism to support multiple IOMMU
types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special
VFIO mode that doesn't use IOMMU at all), but it's easily extended
by adding necessary DMA mapping functions to eal_pci_vfio.c.

Since type 1 IOMMU module is no longer necessary to have VFIO,
we fix the module check to check for vfio-pci instead. It's not
ideal and triggers VFIO checks more often (and thus produces more
error output, which was the reason behind the module check in the
first place), so we compensate for that by providing more verbose
logging, indicating whether VFIO initialization has succeeded or
failed.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Santosh Shukla <sshukla@mvista.com>
Tested-by: Santosh Shukla <sshukla@mvista.com>
---
v5 changes:
  Renamed functions

v4 changes:
  Fixed the commit message and added a missing sign-off

v3 changes:
  Merging DMA mapping functions back into eal_pci_vfio.c
  Fixing and adding comments

v2 changes:
  Compile fix (hat-tip to Santosh Shukla)
  Tested-by is provisional, since only superficial testing was done

 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +++++++++++++++++++++--------
 lib/librte_eal/linuxapp/eal/eal_vfio.h     |   5 +
 2 files changed, 157 insertions(+), 53 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 74f91ba..b87819d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
 #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
 
 /* per-process VFIO config */
 static struct vfio_config vfio_cfg;
 
+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ * */
+typedef int (*vfio_dma_func_t)(int);
+
+struct vfio_iommu_type {
+	int type_id;
+	const char *name;
+	vfio_dma_func_t dma_map_func;
+};
+
+int vfio_type1_dma_map(int);
+int vfio_noiommu_dma_map(int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+		/* x86 IOMMU, otherwise known as type 1 */
+		{ VFIO_TYPE1_IOMMU, "Type 1", &vfio_type1_dma_map},
+		/* IOMMU-less mode */
+		{ VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+};
+
+int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	int i, ret;
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		struct vfio_iommu_type1_dma_map dma_map;
+
+		if (ms[i].addr == NULL)
+			break;
+
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = ms[i].addr_64;
+		dma_map.size = ms[i].len;
+		dma_map.iova = ms[i].phys_addr;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int
+vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
 int
 pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
 		    void *buf, size_t len, off_t offs)
@@ -208,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd)
 	return 0;
 }
 
-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
-
-	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-			VFIO_TYPE1_IOMMU);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+	unsigned idx;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
+
+		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+				t->type_id);
+		if (!ret) {
+			RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+					t->type_id, t->name);
+			return t;
+		}
+		/* not an error, there may be more supported IOMMU types */
+		RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
+				"error %i (%s)\n", t->type_id, t->name, errno,
+				strerror(errno));
 	}
+	/* if we didn't find a suitable IOMMU type, fail */
+	return NULL;
+}
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
-
-		if (ms[i].addr == NULL)
-			break;
-
-		memset(&dma_map, 0, sizeof(dma_map));
-		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		dma_map.iova = ms[i].phys_addr;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-
-		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+/* check if we have any supported extensions */
+static int
+pci_vfio_has_supported_extensions(int vfio_container_fd) {
+	int ret;
+	unsigned idx, n_extensions = 0;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
 
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					"error %i (%s)\n", errno, strerror(errno));
+		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+				t->type_id);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+			close(vfio_container_fd);
 			return -1;
+		} else if (ret == 1) {
+			/* we found a supported extension */
+			n_extensions++;
 		}
+		RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
+				t->type_id, t->name,
+				ret ? "supported" : "not supported");
+	}
+
+	/* if we didn't find any supported IOMMU types, fail */
+	if (!n_extensions) {
+		close(vfio_container_fd);
+		return -1;
 	}
 
 	return 0;
@@ -372,17 +451,10 @@ pci_vfio_get_container_fd(void)
 			return -1;
 		}
 
-		/* check if we support IOMMU type 1 */
-		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
-		if (ret != 1) {
-			if (ret < 0)
-				RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
-					"error %i (%s)\n", errno,
-					strerror(errno));
-			else
-				RTE_LOG(ERR, EAL, "  unsupported IOMMU type "
-					"detected in VFIO\n");
-			close(vfio_container_fd);
+		ret = pci_vfio_has_supported_extensions(vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  no supported IOMMU "
+					"extensions found!\n");
 			return -1;
 		}
 
@@ -432,6 +504,7 @@ pci_vfio_get_group_fd(int iommu_group_no)
 
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		/* try regular group format */
 		snprintf(filename, sizeof(filename),
 				 VFIO_GROUP_FMT, iommu_group_no);
 		vfio_group_fd = open(filename, O_RDWR);
@@ -442,7 +515,20 @@ pci_vfio_get_group_fd(int iommu_group_no)
 						strerror(errno));
 				return -1;
 			}
-			return 0;
+
+			/* special case: try no-IOMMU path as well */
+			snprintf(filename, sizeof(filename),
+					VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+			vfio_group_fd = open(filename, O_RDWR);
+			if (vfio_group_fd < 0) {
+				if (errno != ENOENT) {
+					RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+							strerror(errno));
+					return -1;
+				}
+				return 0;
+			}
+			/* noiommu group found */
 		}
 
 		/* if the fd is valid, create a new group for it */
@@ -660,14 +746,21 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	}
 
 	/*
-	 * set up DMA mappings for container
+	 * pick an IOMMU type and set up DMA mappings for container
 	 *
 	 * needs to be done only once, only when at least one group is assigned to
 	 * a container and only in primary process
 	 */
 	if (internal_config.process_type == RTE_PROC_PRIMARY &&
 			vfio_cfg.vfio_container_has_dma == 0) {
-		ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+		/* select an IOMMU type which we will be using */
+		const struct vfio_iommu_type *t =
+				pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+		if (!t) {
+			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", pci_addr);
+			return -1;
+		}
+		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
 					"error %i (%s)\n", pci_addr, errno, strerror(errno));
@@ -887,35 +980,41 @@ pci_vfio_enable(void)
 {
 	/* initialize group list */
 	int i;
-	int module_vfio_type1;
+	int vfio_available;
 
 	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
 		vfio_cfg.vfio_groups[i].fd = -1;
 		vfio_cfg.vfio_groups[i].group_no = -1;
 	}
 
-	module_vfio_type1 = rte_eal_check_module("vfio_iommu_type1");
+	/* inform the user that we are probing for VFIO */
+	RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+	/* check if vfio-pci module is loaded */
+	vfio_available = rte_eal_check_module("vfio_pci");
 
 	/* return error directly */
-	if (module_vfio_type1 == -1) {
+	if (vfio_available == -1) {
 		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
 		return -1;
 	}
 
 	/* return 0 if VFIO modules not loaded */
-	if (module_vfio_type1 == 0) {
-		RTE_LOG(INFO, EAL, "VFIO modules not all loaded, "
-			"skip VFIO support...\n");
+	if (vfio_available == 0) {
+		RTE_LOG(INFO, EAL, "VFIO modules not loaded, "
+			"skipping VFIO support...\n");
 		return 0;
 	}
 
 	vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1)
+	if (vfio_cfg.vfio_container_fd != -1) {
+		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
 		vfio_cfg.vfio_enabled = 1;
-	else
+	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+	}
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 72ec3f6..638ee31 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -52,6 +52,11 @@
 #define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
+/* older kernels may not have no-IOMMU mode */
+#ifndef VFIO_NOIOMMU_IOMMU
+#define VFIO_NOIOMMU_IOMMU 8
+#endif
+
 #define VFIO_PRESENT
 #endif /* kernel version */
 #endif /* RTE_EAL_VFIO */
-- 
2.5.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vfio: Support for no-IOMMU mode
  2016-01-27 16:50       ` [dpdk-dev] [PATCH v5] " Anatoly Burakov
@ 2016-01-27 17:07         ` Thomas Monjalon
  2016-01-28 10:03           ` Burakov, Anatoly
  2016-01-28 11:57         ` [dpdk-dev] [PATCH v6] " Anatoly Burakov
  1 sibling, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-01-27 17:07 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev

2016-01-27 16:50, Anatoly Burakov:
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +int vfio_type1_dma_map(int);
> +int vfio_noiommu_dma_map(int);

WARNING:AVOID_EXTERNS: externs should be avoided in .c files
I agree with checkpatch, they should be static ;)

> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +/* older kernels may not have no-IOMMU mode */
> +#ifndef VFIO_NOIOMMU_IOMMU
> +#define VFIO_NOIOMMU_IOMMU 8
> +#endif

Shouldn't it be defined privately in .c file?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vfio: Support for no-IOMMU mode
  2016-01-27 17:07         ` Thomas Monjalon
@ 2016-01-28 10:03           ` Burakov, Anatoly
  2016-01-28 13:27             ` Thomas Monjalon
  0 siblings, 1 reply; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-28 10:03 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

> 2016-01-27 16:50, Anatoly Burakov:
> > --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> > +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> > +int vfio_type1_dma_map(int);
> > +int vfio_noiommu_dma_map(int);
> 
> WARNING:AVOID_EXTERNS: externs should be avoided in .c files
> I agree with checkpatch, they should be static ;)
> 

Will fix that.

> > --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> > +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> > +/* older kernels may not have no-IOMMU mode */
> > +#ifndef VFIO_NOIOMMU_IOMMU
> > +#define VFIO_NOIOMMU_IOMMU 8
> > +#endif
> 
> Shouldn't it be defined privately in .c file?

We already have other VFIO-related definitions in that file, specifically the PCI defines that aren't present in earlier kernels. This definition is similar in nature - it will be present in kernels starting from 4.5 (when NOIOMMU was introduced), but earlier kernels will need this defined. I didn't want to go down a similar route of redefining everything VFIO-related, but maybe it makes sense in this case for consistency's sake? E.g.

#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU [we're already in an ifdef linux >= 3.6, so define type1 unconditionally]
#if linux < 4.5
#define RTE_VFIO_NOIOMMU 8
#else
#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
#endif

Or something like that?

Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v6] vfio: Support for no-IOMMU mode
  2016-01-27 16:50       ` [dpdk-dev] [PATCH v5] " Anatoly Burakov
  2016-01-27 17:07         ` Thomas Monjalon
@ 2016-01-28 11:57         ` Anatoly Burakov
  2016-01-28 13:58           ` Thomas Monjalon
  2016-01-28 16:55           ` Thomas Monjalon
  1 sibling, 2 replies; 25+ messages in thread
From: Anatoly Burakov @ 2016-01-28 11:57 UTC (permalink / raw)
  To: dev

This commit is adding a generic mechanism to support multiple IOMMU
types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special
VFIO mode that doesn't use IOMMU at all), but it's easily extended
by adding necessary definitions to eal_vfio.h, and DMA mapping
functions to eal_pci_vfio.c.

Since type 1 IOMMU module is no longer necessary to have VFIO,
we fix the module check to check for vfio-pci instead. It's not
ideal and triggers VFIO checks more often (and thus produces more
error output, which was the reason behind the module check in the
first place), so we compensate for that by providing more verbose
logging, indicating whether VFIO initialization has succeeded or
failed.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Santosh Shukla <sshukla@mvista.com>
Tested-by: Santosh Shukla <sshukla@mvista.com>
---
v6 changes:
  Fixed functions not declared as static
  Fixed definitions to be more consistent with others

v5 changes:
  Renamed functions

v4 changes:
  Fixed the commit message and added a missing sign-off

v3 changes:
  Merging DMA mapping functions back into eal_pci_vfio.c
  Fixing and adding comments

v2 changes:
  Compile fix (hat-tip to Santosh Shukla)
  Tested-by is provisional, since only superficial testing was done

 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +++++++++++++++++++++--------
 lib/librte_eal/linuxapp/eal/eal_vfio.h     |   8 ++
 2 files changed, 160 insertions(+), 53 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 74f91ba..a6c7e16 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
 #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
 
 /* per-process VFIO config */
 static struct vfio_config vfio_cfg;
 
+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ * */
+typedef int (*vfio_dma_func_t)(int);
+
+struct vfio_iommu_type {
+	int type_id;
+	const char *name;
+	vfio_dma_func_t dma_map_func;
+};
+
+static int vfio_type1_dma_map(int);
+static int vfio_noiommu_dma_map(int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+	/* x86 IOMMU, otherwise known as type 1 */
+	{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+	/* IOMMU-less mode */
+	{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+};
+
+int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	int i, ret;
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		struct vfio_iommu_type1_dma_map dma_map;
+
+		if (ms[i].addr == NULL)
+			break;
+
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = ms[i].addr_64;
+		dma_map.size = ms[i].len;
+		dma_map.iova = ms[i].phys_addr;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int
+vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
 int
 pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
 		    void *buf, size_t len, off_t offs)
@@ -208,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd)
 	return 0;
 }
 
-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
-
-	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-			VFIO_TYPE1_IOMMU);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+	unsigned idx;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
+
+		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+				t->type_id);
+		if (!ret) {
+			RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+					t->type_id, t->name);
+			return t;
+		}
+		/* not an error, there may be more supported IOMMU types */
+		RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
+				"error %i (%s)\n", t->type_id, t->name, errno,
+				strerror(errno));
 	}
+	/* if we didn't find a suitable IOMMU type, fail */
+	return NULL;
+}
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
-
-		if (ms[i].addr == NULL)
-			break;
-
-		memset(&dma_map, 0, sizeof(dma_map));
-		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		dma_map.iova = ms[i].phys_addr;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-
-		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+/* check if we have any supported extensions */
+static int
+pci_vfio_has_supported_extensions(int vfio_container_fd) {
+	int ret;
+	unsigned idx, n_extensions = 0;
+	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+		const struct vfio_iommu_type *t = &iommu_types[idx];
 
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					"error %i (%s)\n", errno, strerror(errno));
+		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+				t->type_id);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+			close(vfio_container_fd);
 			return -1;
+		} else if (ret == 1) {
+			/* we found a supported extension */
+			n_extensions++;
 		}
+		RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
+				t->type_id, t->name,
+				ret ? "supported" : "not supported");
+	}
+
+	/* if we didn't find any supported IOMMU types, fail */
+	if (!n_extensions) {
+		close(vfio_container_fd);
+		return -1;
 	}
 
 	return 0;
@@ -372,17 +451,10 @@ pci_vfio_get_container_fd(void)
 			return -1;
 		}
 
-		/* check if we support IOMMU type 1 */
-		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
-		if (ret != 1) {
-			if (ret < 0)
-				RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
-					"error %i (%s)\n", errno,
-					strerror(errno));
-			else
-				RTE_LOG(ERR, EAL, "  unsupported IOMMU type "
-					"detected in VFIO\n");
-			close(vfio_container_fd);
+		ret = pci_vfio_has_supported_extensions(vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  no supported IOMMU "
+					"extensions found!\n");
 			return -1;
 		}
 
@@ -432,6 +504,7 @@ pci_vfio_get_group_fd(int iommu_group_no)
 
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		/* try regular group format */
 		snprintf(filename, sizeof(filename),
 				 VFIO_GROUP_FMT, iommu_group_no);
 		vfio_group_fd = open(filename, O_RDWR);
@@ -442,7 +515,20 @@ pci_vfio_get_group_fd(int iommu_group_no)
 						strerror(errno));
 				return -1;
 			}
-			return 0;
+
+			/* special case: try no-IOMMU path as well */
+			snprintf(filename, sizeof(filename),
+					VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+			vfio_group_fd = open(filename, O_RDWR);
+			if (vfio_group_fd < 0) {
+				if (errno != ENOENT) {
+					RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+							strerror(errno));
+					return -1;
+				}
+				return 0;
+			}
+			/* noiommu group found */
 		}
 
 		/* if the fd is valid, create a new group for it */
@@ -660,14 +746,21 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	}
 
 	/*
-	 * set up DMA mappings for container
+	 * pick an IOMMU type and set up DMA mappings for container
 	 *
 	 * needs to be done only once, only when at least one group is assigned to
 	 * a container and only in primary process
 	 */
 	if (internal_config.process_type == RTE_PROC_PRIMARY &&
 			vfio_cfg.vfio_container_has_dma == 0) {
-		ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+		/* select an IOMMU type which we will be using */
+		const struct vfio_iommu_type *t =
+				pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+		if (!t) {
+			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", pci_addr);
+			return -1;
+		}
+		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
 					"error %i (%s)\n", pci_addr, errno, strerror(errno));
@@ -887,35 +980,41 @@ pci_vfio_enable(void)
 {
 	/* initialize group list */
 	int i;
-	int module_vfio_type1;
+	int vfio_available;
 
 	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
 		vfio_cfg.vfio_groups[i].fd = -1;
 		vfio_cfg.vfio_groups[i].group_no = -1;
 	}
 
-	module_vfio_type1 = rte_eal_check_module("vfio_iommu_type1");
+	/* inform the user that we are probing for VFIO */
+	RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+	/* check if vfio-pci module is loaded */
+	vfio_available = rte_eal_check_module("vfio_pci");
 
 	/* return error directly */
-	if (module_vfio_type1 == -1) {
+	if (vfio_available == -1) {
 		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
 		return -1;
 	}
 
 	/* return 0 if VFIO modules not loaded */
-	if (module_vfio_type1 == 0) {
-		RTE_LOG(INFO, EAL, "VFIO modules not all loaded, "
-			"skip VFIO support...\n");
+	if (vfio_available == 0) {
+		RTE_LOG(INFO, EAL, "VFIO modules not loaded, "
+			"skipping VFIO support...\n");
 		return 0;
 	}
 
 	vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1)
+	if (vfio_cfg.vfio_container_fd != -1) {
+		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
 		vfio_cfg.vfio_enabled = 1;
-	else
+	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+	}
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 72ec3f6..f483bf4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -52,6 +52,14 @@
 #define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
+#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
+#define RTE_VFIO_NOIOMMU 8
+#else
+#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
+#endif
+
 #define VFIO_PRESENT
 #endif /* kernel version */
 #endif /* RTE_EAL_VFIO */
-- 
2.5.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] vfio: Support for no-IOMMU mode
  2016-01-28 10:03           ` Burakov, Anatoly
@ 2016-01-28 13:27             ` Thomas Monjalon
  0 siblings, 0 replies; 25+ messages in thread
From: Thomas Monjalon @ 2016-01-28 13:27 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: dev

2016-01-28 10:03, Burakov, Anatoly:
> > 2016-01-27 16:50, Anatoly Burakov:
> > > --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> > > +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> > > +/* older kernels may not have no-IOMMU mode */
> > > +#ifndef VFIO_NOIOMMU_IOMMU
> > > +#define VFIO_NOIOMMU_IOMMU 8
> > > +#endif
> > 
> > Shouldn't it be defined privately in .c file?
> 
> We already have other VFIO-related definitions in that file, specifically the PCI defines that aren't present in earlier kernels. This definition is similar in nature - it will be present in kernels starting from 4.5 (when NOIOMMU was introduced), but earlier kernels will need this defined. I didn't want to go a similar route with redefining everything VFIO-related, but maybe it makes sense in this case for consistency's sake? E.g.
> 
> #define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU [we're already in an ifdef linux >= 3.6, so define type1 unconditionally]
> #if linux < 4.5
> #define RTE_VFIO_NOIOMMU 8
> #else
> #define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
> #endif
> 
> Or something like that?

OK you can keep it as is or define a RTE constant. Up to you.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v6] vfio: Support for no-IOMMU mode
  2016-01-28 11:57         ` [dpdk-dev] [PATCH v6] " Anatoly Burakov
@ 2016-01-28 13:58           ` Thomas Monjalon
  2016-01-28 14:16             ` Burakov, Anatoly
  2016-01-28 16:55           ` Thomas Monjalon
  1 sibling, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-01-28 13:58 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev

2016-01-28 11:57, Anatoly Burakov:
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)

Why not #ifndef VFIO_NOIOMMU_IOMMU?
It would avoid some backport issue.

> +#define RTE_VFIO_NOIOMMU 8
> +#else
> +#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
> +#endif

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v6] vfio: Support for no-IOMMU mode
  2016-01-28 13:58           ` Thomas Monjalon
@ 2016-01-28 14:16             ` Burakov, Anatoly
  2016-01-28 14:40               ` Thomas Monjalon
  0 siblings, 1 reply; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-28 14:16 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

> 2016-01-28 11:57, Anatoly Burakov:
> > +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
> 
> Why not #ifndef VFIO_NOIOMMU_IOMMU?
> It would avoid some backport issue.

I don't see how it could. Versions post-4.5 will have VFIO_NOIOMMU_IOMMU, so no issue there. Pre-4.5 versions, whether they do or do not have VFIO_NOIOMMU_IOMMU defined, will have RTE_VFIO_NOIOMMU defined as 8 regardless.

Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v6] vfio: Support for no-IOMMU mode
  2016-01-28 14:16             ` Burakov, Anatoly
@ 2016-01-28 14:40               ` Thomas Monjalon
  2016-01-28 15:00                 ` Burakov, Anatoly
  0 siblings, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-01-28 14:40 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: dev

2016-01-28 14:16, Burakov, Anatoly:
> Hi Thomas,
> 
> > 2016-01-28 11:57, Anatoly Burakov:
> > > +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
> > 
> > Why not #ifndef VFIO_NOIOMMU_IOMMU?
> > It would avoid some backport issue.
> 
> I don't see how it could. Versions post-4.5 will have VFIO_NOIOMMU_IOMMU, so no issue there. Pre-4.5 versions, whether they do or do not have VFIO_NOIOMMU_IOMMU defined, will have RTE_VFIO_NOIOMMU defined as 8 regardless.

Are we sure it will ever be backported as 8?
Anyway I think it's better to avoid version number checks.
What happens if the feature is reverted from 4.5 as it was from 4.4?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v6] vfio: Support for no-IOMMU mode
  2016-01-28 14:40               ` Thomas Monjalon
@ 2016-01-28 15:00                 ` Burakov, Anatoly
  0 siblings, 0 replies; 25+ messages in thread
From: Burakov, Anatoly @ 2016-01-28 15:00 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

> > Hi Thomas,
> >
> > > 2016-01-28 11:57, Anatoly Burakov:
> > > > +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
> > >
> > > Why not #ifndef VFIO_NOIOMMU_IOMMU?
> > > It would avoid some backport issue.
> >
> > I don't see how it could. Versions post-4.5 will have
> > VFIO_NOIOMMU_IOMMU, so no issue there. Pre-4.5 versions, whether
> > they do or do not have VFIO_NOIOMMU_IOMMU defined, will have
> > RTE_VFIO_NOIOMMU defined as 8 regardless.
> 
> Are we sure it will ever be backported as 8?
> Anyway I think it's better to avoid version number checks.

Is there a precedent of kernel API definitions ever changing in backports? Presumably whoever backports the changes is interested in making them as compatible as possible, so I believe it's a safe bet to make. I have no strong opinion for or against this way of doing things, but if we're taking issue with kernel version checks, we probably should also adapt all the other stuff in the eal_vfio.h that does things in the exact same manner.

> What happens if the feature is reverted from 4.5 as it was from 4.4?

Well then we have to wait until NOIOMMU makes it into official kernel before applying this patch. There's nothing we can do about that. If the patch gets reverted, then defining NOIOMMU as 8 will be wrong regardless of whether there's a kernel version check.

Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v6] vfio: Support for no-IOMMU mode
  2016-01-28 11:57         ` [dpdk-dev] [PATCH v6] " Anatoly Burakov
  2016-01-28 13:58           ` Thomas Monjalon
@ 2016-01-28 16:55           ` Thomas Monjalon
  1 sibling, 0 replies; 25+ messages in thread
From: Thomas Monjalon @ 2016-01-28 16:55 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev, alex.williamson

2016-01-28 11:57, Anatoly Burakov:
> This commit is adding a generic mechanism to support multiple IOMMU
> types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special
> VFIO mode that doesn't use IOMMU at all), but it's easily extended
> by adding necessary definitions to eal_vfio.h, and DMA mapping
> functions to eal_pci_vfio.c.
> 
> Since type 1 IOMMU module is no longer necessary to have VFIO,
> we fix the module check to check for vfio-pci instead. It's not
> ideal and triggers VFIO checks more often (and thus produces more
> error output, which was the reason behind the module check in the
> first place), so we compensate for that by providing more verbose
> logging, indicating whether VFIO initialization has succeeded or
> failed.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Signed-off-by: Santosh Shukla <sshukla@mvista.com>
> Tested-by: Santosh Shukla <sshukla@mvista.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2016-01-28 16:57 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-21 20:16 [dpdk-dev] [PATCH] vfio: Support for no-IOMMU mode Anatoly Burakov
2016-01-13 12:36 ` [dpdk-dev] [PATCH v2] " Anatoly Burakov
2016-01-13 16:45   ` Stephen Hemminger
2016-01-14  9:50     ` Burakov, Anatoly
2016-01-27  9:05   ` Thomas Monjalon
2016-01-27 10:08     ` Burakov, Anatoly
2016-01-27 10:12       ` Thomas Monjalon
2016-01-27 10:24         ` David Marchand
2016-01-27 10:29           ` Burakov, Anatoly
2016-01-27 14:04   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
2016-01-27 14:23     ` Burakov, Anatoly
2016-01-27 14:32     ` [dpdk-dev] [PATCH v4] " Anatoly Burakov
2016-01-27 15:50       ` Thomas Monjalon
2016-01-27 16:01         ` Burakov, Anatoly
2016-01-27 16:30           ` Burakov, Anatoly
2016-01-27 16:50       ` [dpdk-dev] [PATCH v5] " Anatoly Burakov
2016-01-27 17:07         ` Thomas Monjalon
2016-01-28 10:03           ` Burakov, Anatoly
2016-01-28 13:27             ` Thomas Monjalon
2016-01-28 11:57         ` [dpdk-dev] [PATCH v6] " Anatoly Burakov
2016-01-28 13:58           ` Thomas Monjalon
2016-01-28 14:16             ` Burakov, Anatoly
2016-01-28 14:40               ` Thomas Monjalon
2016-01-28 15:00                 ` Burakov, Anatoly
2016-01-28 16:55           ` Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).