DPDK patches and discussions
 help / color / mirror / Atom feed
From: beilei.xing@intel.com
To: anatoly.burakov@intel.com
Cc: dev@dpdk.org, thomas@monjalon.net, ferruh.yigit@amd.com,
	bruce.richardson@intel.com, chenbox@nvidia.com,
	yahui.cao@intel.com, Beilei Xing <beilei.xing@intel.com>
Subject: [PATCH 2/4] vfio: add VFIO IOMMUFD support
Date: Fri, 22 Dec 2023 19:44:51 +0000	[thread overview]
Message-ID: <20231222194453.3049693-3-beilei.xing@intel.com> (raw)
In-Reply-To: <20231222194453.3049693-1-beilei.xing@intel.com>

From: Beilei Xing <beilei.xing@intel.com>

VFIO IOMMUFD is a new component added to work together with IOMMUFD.
IOMMUFD has no impact on the existing VFIO Container/Group
interface, but the latest IOMMU features (e.g. PASID/SSID) may
only be available through the VFIO IOMMUFD/CDEV interface.

This patch exposes functions to set up and release a VFIO device
through the VFIO IOMMUFD/CDEV interface.

Signed-off-by: Beilei Xing <beilei.xing@intel.com>
Signed-off-by: Yahui Cao <yahui.cao@intel.com>
---
 lib/eal/include/rte_vfio.h       |  55 +++++
 lib/eal/linux/eal_vfio.h         |   3 +
 lib/eal/linux/eal_vfio_iommufd.c | 385 +++++++++++++++++++++++++++++++
 lib/eal/linux/meson.build        |   1 +
 lib/eal/version.map              |   2 +
 5 files changed, 446 insertions(+)
 create mode 100644 lib/eal/linux/eal_vfio_iommufd.c

diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index 22832afd0f..7a9b26b0f7 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -17,6 +17,8 @@ extern "C" {
 #include <stdbool.h>
 #include <stdint.h>
 
+#include <rte_compat.h>
+
 /*
  * determine if VFIO is present on the system
  */
@@ -28,6 +30,9 @@ extern "C" {
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
 #define HAVE_VFIO_DEV_REQ_INTERFACE
 #endif /* kernel version >= 4.0.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define VFIO_IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
 #endif /* RTE_EAL_VFIO */
 
 #ifdef VFIO_PRESENT
@@ -42,6 +47,10 @@ extern "C" {
 #define VFIO_NOIOMMU_MODE      \
 	"/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"
 
+#ifdef VFIO_IOMMUFD_PRESENT
+#define VFIO_CDEV_CLASS_DIR "/sys/class/vfio-dev"
+#endif
+
 /* NOIOMMU is defined from kernel version 4.5 onwards */
 #ifdef VFIO_NOIOMMU_IOMMU
 #define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
@@ -137,6 +146,33 @@ struct vfio_device_info;
 int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info);
 
+/**
+ * Set up a device through the VFIO IOMMUFD/CDEV interface.
+ *
+ * The device's VFIO character device is opened, bound to the default
+ * iommufd and attached to its IOAS. In the primary process, the first
+ * successful call also maps the DPDK memory segments for DMA and installs
+ * a memory event callback. Finally, the device information is queried
+ * with VFIO_DEVICE_GET_INFO.
+ *
+ * This function is only relevant to linux and will return
+ * an error on BSD.
+ *
+ * @param sysfs_base
+ *   sysfs path prefix.
+ *
+ * @param dev_addr
+ *   device location.
+ *
+ * @param vfio_dev_fd
+ *   Output: VFIO device fd opened for the device.
+ *
+ * @param device_info
+ *   Output: device information filled in by VFIO_DEVICE_GET_INFO.
+ *
+ * @return
+ *   0 on success.
+ *   <0 on failure.
+ *   >0 is reserved for "the device cannot be managed this way"; the
+ *   implementation added together with this declaration only returns
+ *   0 or -1.
+ */
+__rte_experimental
+int rte_vfio_iommufd_setup_device(const char *sysfs_base, const char *dev_addr,
+				  int *vfio_dev_fd, struct vfio_device_info *device_info);
+
 /**
  * Release a device mapped to a VFIO-managed I/O MMU group.
  *
@@ -158,6 +194,25 @@ int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
  */
 int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
 
+/**
+ * Release a device that was set up through the VFIO IOMMUFD/CDEV interface.
+ *
+ * The device is detached from its iommufd page table; on success the
+ * device fd is closed and the VFIO iommufd memory event callback is
+ * unregistered. If the detach fails, the fd is left open.
+ *
+ * This function is only relevant to linux and will return
+ * an error on BSD.
+ *
+ * @param dev_addr
+ *   device location (only used in log messages).
+ *
+ * @param fd
+ *   VFIO device fd, as returned by rte_vfio_iommufd_setup_device().
+ *
+ * @return
+ *   0 on success.
+ *   <0 on failure.
+ */
+__rte_experimental
+int rte_vfio_iommufd_release_device(const char *dev_addr, int fd);
+
 /**
  * Enable a VFIO-related kmod.
  *
diff --git a/lib/eal/linux/eal_vfio.h b/lib/eal/linux/eal_vfio.h
index 23a787ad20..c94409e828 100644
--- a/lib/eal/linux/eal_vfio.h
+++ b/lib/eal/linux/eal_vfio.h
@@ -17,6 +17,9 @@
 #else
 #pragma message("VFIO configured but not supported by this kernel, disabling.")
 #endif /* kernel version >= 3.6.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define VFIO_IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
 #endif /* RTE_EAL_VFIO */
 
 #ifdef VFIO_PRESENT
diff --git a/lib/eal/linux/eal_vfio_iommufd.c b/lib/eal/linux/eal_vfio_iommufd.c
new file mode 100644
index 0000000000..02996a588a
--- /dev/null
+++ b/lib/eal/linux/eal_vfio_iommufd.c
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+
+#include <rte_errno.h>
+#include <rte_vfio.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+#ifdef VFIO_IOMMUFD_PRESENT
+#include <linux/iommufd.h>
+#include "eal_iommufd.h"
+
+#define VFIO_IOMMUFD_MEM_EVENT_CLB_NAME "vfio_iommufd_mem_event_clb"
+
+struct ioas_info { /* argument handed to the memseg walk callbacks below */
+	int iommufd; /* iommufd descriptor passed on to iommufd_dma_mem_map() */
+	uint32_t ioas_id; /* IOAS id passed on to iommufd_dma_mem_map() */
+};
+
+/*
+ * Bind an already opened VFIO device fd to the default iommufd and attach
+ * it to the default IOAS.
+ *
+ * dev_addr is only used for log messages.
+ *
+ * Returns 0 on success, otherwise the non-zero ioctl() result of the step
+ * that failed.
+ */
+static int
+vfio_iommufd_add_device(const char *dev_addr, int vfio_dev_fd)
+{
+	struct iommufd_config *cfg = default_iommufd_cfg;
+	const int cfg_iommufd = cfg->iommufd;
+	const uint32_t cfg_ioas_id = cfg->ioas_id;
+	struct vfio_device_bind_iommufd bind_req = {};
+	struct vfio_device_attach_iommufd_pt attach_req = {};
+	int rc;
+
+	/* step 1: bind the device to the iommufd */
+	bind_req.argsz = sizeof(bind_req);
+	bind_req.flags = 0;
+	bind_req.iommufd = cfg_iommufd;
+
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind_req);
+	if (rc != 0) {
+		RTE_LOG(ERR, EAL, "Device %s cannot bind to iommufd\n", dev_addr);
+		return rc;
+	}
+
+	/* step 2: attach the bound device to the IOAS */
+	attach_req.argsz = sizeof(attach_req);
+	attach_req.flags = 0;
+	attach_req.pt_id = cfg_ioas_id;
+
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_req);
+	if (rc != 0) {
+		RTE_LOG(ERR, EAL, "Device %s cannot attach to ioas\n", dev_addr);
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * rte_memseg_contig_walk() callback: map one contiguous chunk of len bytes
+ * starting at ms into the IOAS described by arg (a struct ioas_info).
+ * Chunks that belong to external memseg lists are skipped.
+ */
+static int
+vfio_iommufd_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+			size_t len, void *arg)
+{
+	struct ioas_info *target = arg;
+
+	if (!msl->external)
+		return iommufd_dma_mem_map(target->iommufd, target->ioas_id,
+					   ms->addr_64, ms->iova, len, 1);
+
+	return 0;
+}
+
+/*
+ * rte_memseg_walk() callback: map a single memseg into the IOAS described
+ * by arg (a struct ioas_info).
+ *
+ * Nothing is mapped (and 0 is returned) for:
+ *  - external memory that isn't a heap,
+ *  - segments with an invalid IOVA,
+ *  - internal segments when the IOVA mode is VA, because those have
+ *    already been mapped by the contiguous walk.
+ */
+static int
+vfio_iommufd_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+		 void *arg)
+{
+	struct ioas_info *target = arg;
+
+	if ((msl->external && !msl->heap) ||
+	    ms->iova == RTE_BAD_IOVA ||
+	    (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA))
+		return 0;
+
+	return iommufd_dma_mem_map(target->iommufd, target->ioas_id,
+				   ms->addr_64, ms->iova, ms->len, 1);
+}
+
+/*
+ * Map the memory known to EAL into the given IOAS.
+ *
+ * Returns 0 on success or the first non-zero value reported by one of the
+ * memseg walks.
+ */
+static int
+vfio_iommufd_dma_map(int iommufd, uint32_t ioasid)
+{
+	struct ioas_info target;
+	int rc;
+
+	target.iommufd = iommufd;
+	target.ioas_id = ioasid;
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		/* IOVA as VA lets us map whole contiguous chunks instead of
+		 * going page-by-page.
+		 */
+		rc = rte_memseg_contig_walk(vfio_iommufd_map_contig, &target);
+		if (rc != 0)
+			return rc;
+	}
+
+	/* The per-segment walk is always needed: the contiguous walk above
+	 * skips external segments.
+	 */
+	return rte_memseg_walk(vfio_iommufd_map, &target);
+}
+
+/*
+ * Memory event callback: mirror a memory alloc/free event of len bytes at
+ * addr into the default IOAS.
+ *
+ * The last argument given to iommufd_dma_mem_map() is 1 for
+ * RTE_MEM_EVENT_ALLOC and 0 for any other event (presumably map vs. unmap;
+ * confirm against eal_iommufd.h). Its return value is ignored.
+ */
+static void
+vfio_iommufd_mem_event_callback(enum rte_mem_event type, const void *addr,
+				size_t len, void *arg __rte_unused)
+{
+	const int do_map = (type == RTE_MEM_EVENT_ALLOC) ? 1 : 0;
+	struct rte_memseg_list *msl = rte_mem_virt2memseg_list(addr);
+	struct rte_memseg *ms;
+	size_t done;
+
+	/* IOVA as VA on internal memory: IOVA equals VA, so walk the range
+	 * in page-sized steps without looking at the memsegs.
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
+		const uint64_t step = msl->page_sz;
+		uint64_t va = (uint64_t)(uintptr_t)addr;
+
+		for (done = 0; done < len; done += step, va += step)
+			iommufd_dma_mem_map(default_iommufd_cfg->iommufd,
+					    default_iommufd_cfg->ioas_id,
+					    va, va, step, do_map);
+
+		return;
+	}
+
+	/* memsegs are contiguous in memory, so step through them in order */
+	ms = rte_mem_virt2memseg(addr, msl);
+	for (done = 0; done < len; done += ms->len, ++ms) {
+		/* some memory segments may have invalid IOVA */
+		if (ms->iova == RTE_BAD_IOVA) {
+			RTE_LOG(DEBUG, EAL,
+				"Memory segment at %p has bad IOVA, skipping\n",
+				ms->addr);
+			continue;
+		}
+
+		iommufd_dma_mem_map(default_iommufd_cfg->iommufd,
+				    default_iommufd_cfg->ioas_id,
+				    ms->addr_64, ms->iova, ms->len, do_map);
+	}
+}
+
+/*
+ * Open the VFIO character device of the device found at
+ * <sysfs_base>/<dev_addr>.
+ *
+ * The first entry named "vfio*" below <sysfs_base>/<dev_addr>/vfio-dev
+ * gives the cdev name. Its "dev" attribute is parsed as major:minor,
+ * /dev/vfio/devices/<name> is opened, and the opened node is checked to be
+ * a character device with that device number.
+ *
+ * Returns the open device fd on success, -1 on failure. No fd is left open
+ * on failure.
+ */
+static int
+vfio_iommufd_get_fd(const char *sysfs_base, const char *dev_addr)
+{
+	char vfio_cdev_path[PATH_MAX];
+	char vfio_path[PATH_MAX];
+	char dirname[PATH_MAX];
+	int vfio_dev_fd;
+	struct dirent *dent;
+	unsigned int major, minor;
+	struct stat st;
+	dev_t cdev;
+	DIR *dir;
+	FILE *f;
+	int ret = -1;
+
+	memset(vfio_cdev_path, 0, sizeof(vfio_cdev_path));
+	memset(vfio_path, 0, sizeof(vfio_path));
+	memset(dirname, 0, sizeof(dirname));
+
+	snprintf(dirname, sizeof(dirname), "%s/%s/vfio-dev",
+		 sysfs_base, dev_addr);
+
+	dir = opendir(dirname);
+	if (dir == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	while ((dent = readdir(dir)) != NULL) {
+		if (!strncmp(dent->d_name, "vfio", 4)) {
+			snprintf(vfio_cdev_path, sizeof(vfio_cdev_path),
+				 "%s/%s/vfio-dev/%s/dev", sysfs_base,
+				 dev_addr, dent->d_name);
+			break;
+		}
+	}
+
+	/* fail explicitly instead of relying on fopen("") to fail */
+	if (dent == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): no vfio cdev entry found in %s\n",
+			__func__, dirname);
+		goto err_fopen;
+	}
+
+	f = fopen(vfio_cdev_path, "r");
+	if (f == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n",
+			__func__);
+		goto err_fopen;
+	}
+
+	if (fscanf(f, "%u:%u", &major, &minor) != 2) {
+		RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n",
+			__func__);
+		goto err_fscanf;
+	}
+
+	cdev = makedev(major, minor);
+
+	/* dent stays valid here: no further readdir()/closedir() was issued */
+	snprintf(vfio_path, sizeof(vfio_path), "/dev/vfio/devices/%s", dent->d_name);
+	vfio_dev_fd = open(vfio_path, O_RDWR);
+	if (vfio_dev_fd == -1) {
+		RTE_LOG(ERR, EAL, "%s(): can't open %s: %s\n",
+			__func__, vfio_path, strerror(errno));
+		goto err_fscanf;
+	}
+
+	if (fstat(vfio_dev_fd, &st) || !S_ISCHR(st.st_mode) ||
+	    (cdev != 0 && st.st_rdev != cdev)) {
+		RTE_LOG(ERR, EAL, "%s(): vfio char device is not matched\n",
+			__func__);
+		/* a mismatching node must not be handed back as success */
+		close(vfio_dev_fd);
+		goto err_fscanf;
+	}
+
+	ret = vfio_dev_fd;
+
+err_fscanf:
+	fclose(f);
+err_fopen:
+	closedir(dir);
+	return ret;
+}
+
+/*
+ * Set up a device through the VFIO IOMMUFD/CDEV interface.
+ *
+ * Steps:
+ *  1. open the device's VFIO cdev and store the fd in *vfio_dev_fd;
+ *  2. bind it to the default iommufd and attach it to the default IOAS;
+ *  3. once, in the primary process: map EAL memory for DMA and register
+ *     the memory event callback;
+ *  4. fill *device_info with VFIO_DEVICE_GET_INFO.
+ *
+ * Returns 0 on success and -1 on failure. On failure the device fd has been
+ * closed exactly once and *vfio_dev_fd is set to a negative value.
+ */
+int
+rte_vfio_iommufd_setup_device(const char *sysfs_base, const char *dev_addr,
+			      int *vfio_dev_fd, struct vfio_device_info *device_info)
+{
+	struct iommufd_config *iommufd_cfg;
+	int iommufd;
+	uint32_t ioas_id;
+	int ret = 0;
+	const struct internal_config *internal_conf =
+		eal_get_internal_configuration();
+
+	iommufd_cfg = default_iommufd_cfg;
+	iommufd = iommufd_cfg->iommufd;
+	ioas_id = iommufd_cfg->ioas_id;
+
+	*vfio_dev_fd = vfio_iommufd_get_fd(sysfs_base, dev_addr);
+	if (*vfio_dev_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to get device fd for device %s\n", dev_addr);
+		return -1;
+	}
+
+	if (vfio_iommufd_add_device(dev_addr, *vfio_dev_fd)) {
+		RTE_LOG(ERR, EAL, "Failed to add device %s to iommufd\n", dev_addr);
+		ret = -1;
+		goto err_add_dev;
+	}
+
+	if (!iommufd_cfg->dma_init &&
+	    internal_conf->process_type == RTE_PROC_PRIMARY &&
+	    iommufd != -1) {
+		/* lock memory hotplug before mapping and release it
+		 * after registering callback, to prevent races
+		 */
+		rte_mcfg_mem_read_lock();
+		ret = vfio_iommufd_dma_map(iommufd, ioas_id);
+		if (ret) {
+			RTE_LOG(ERR, EAL,
+				"%s DMA remapping failed, error "
+				"%i (%s)\n",
+				dev_addr, errno, strerror(errno));
+			rte_mcfg_mem_read_unlock();
+			ret = -1;
+			goto err_dma_map;
+		}
+
+		/* register callback for mem events */
+		ret = rte_mem_event_callback_register(
+			VFIO_IOMMUFD_MEM_EVENT_CLB_NAME,
+			vfio_iommufd_mem_event_callback, NULL);
+
+		/* unlock memory hotplug */
+		rte_mcfg_mem_read_unlock();
+
+		/* ENOTSUP is tolerated: setup continues without the callback */
+		if (ret && rte_errno != ENOTSUP) {
+			RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
+			ret = -1;
+			goto err_dma_map;
+		}
+		if (ret)
+			RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
+		else
+			RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
+
+		iommufd_cfg->dma_init = true;
+	}
+
+	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "%s cannot get device info, "
+			"error %i (%s)\n", dev_addr, errno,
+			strerror(errno));
+		ret = -1;
+		goto err_dma_map;
+	}
+
+	return 0;
+
+err_dma_map:
+	/* rte_vfio_iommufd_release_device() closes the fd itself when the
+	 * detach succeeds. Closing it a second time could hit a descriptor
+	 * number that has already been reused, so only fall through to
+	 * close() when the release failed and left the fd open.
+	 */
+	if (rte_vfio_iommufd_release_device(dev_addr, *vfio_dev_fd) == 0) {
+		*vfio_dev_fd = -1;
+		return ret;
+	}
+err_add_dev:
+	close(*vfio_dev_fd);
+	/* do not leave a stale descriptor number with the caller */
+	*vfio_dev_fd = -1;
+	return ret;
+}
+
+/*
+ * Detach a device from its iommufd page table and, if that worked, close
+ * the device fd and unregister the VFIO iommufd memory event callback.
+ *
+ * The ioctl, close and unregister all run under the memory hotplug read
+ * lock. When the detach fails, the fd is left open and the callback stays
+ * registered.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the detach ioctl().
+ */
+int
+rte_vfio_iommufd_release_device(const char *dev_addr, int vfio_dev_fd)
+{
+	struct vfio_device_detach_iommufd_pt detach_req = {};
+	int rc;
+
+	detach_req.argsz = sizeof(detach_req);
+	detach_req.flags = 0;
+
+	rte_mcfg_mem_read_lock();
+
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_req);
+	if (rc == 0) {
+		close(vfio_dev_fd);
+
+		rte_mem_event_callback_unregister(VFIO_IOMMUFD_MEM_EVENT_CLB_NAME,
+						  NULL);
+	} else {
+		RTE_LOG(ERR, EAL, "Device %s cannot detach from iommufd\n", dev_addr);
+	}
+
+	rte_mcfg_mem_read_unlock();
+	return rc;
+}
+
+#else
+int
+rte_vfio_iommufd_setup_device(__rte_unused const char *sysfs_base,
+			      __rte_unused const char *dev_addr,
+			      __rte_unused int *vfio_dev_fd,
+			      __rte_unused struct vfio_device_info *device_info)
+{
+	return -1; /* stub used when VFIO_IOMMUFD_PRESENT is not defined: always fails */
+}
+
+int
+rte_vfio_iommufd_release_device(__rte_unused const char *dev_addr,
+				__rte_unused int vfio_dev_fd)
+{
+	return -1; /* stub used when VFIO_IOMMUFD_PRESENT is not defined: always fails */
+}
+
+#endif /* VFIO_IOMMUFD_PRESENT */
diff --git a/lib/eal/linux/meson.build b/lib/eal/linux/meson.build
index 8081087584..bf246e64c9 100644
--- a/lib/eal/linux/meson.build
+++ b/lib/eal/linux/meson.build
@@ -16,6 +16,7 @@ sources += files(
         'eal_thread.c',
         'eal_timer.c',
         'eal_vfio.c',
+	'eal_vfio_iommufd.c',
         'eal_iommufd.c',
         'eal_vfio_mp_sync.c',
 )
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 30e66a7267..9c1e70feca 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -396,6 +396,8 @@ EXPERIMENTAL {
 
 	rte_iommufd_enable; # WINDOWS_NO_EXPORT
 	rte_iommufd_is_enabled; # WINDOWS_NO_EXPORT
+	rte_vfio_iommufd_release_device; # WINDOWS_NO_EXPORT
+	rte_vfio_iommufd_setup_device; # WINDOWS_NO_EXPORT
 };
 
 INTERNAL {
-- 
2.34.1


  parent reply	other threads:[~2023-12-22 11:22 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-22 19:44 [PATCH 0/4] add VFIO IOMMUFD/CDEV support beilei.xing
2023-12-22 19:44 ` [PATCH 1/4] iommufd: add IOMMUFD support beilei.xing
2023-12-22 19:44 ` beilei.xing [this message]
2023-12-22 17:17   ` [PATCH 2/4] vfio: add VFIO " Stephen Hemminger
2023-12-25  6:30     ` Xing, Beilei
2023-12-22 19:44 ` [PATCH 3/4] bus/pci: add VFIO CDEV support beilei.xing
2023-12-22 19:44 ` [PATCH 4/4] eal: add new args to choose VFIO mode beilei.xing
2023-12-22 17:17   ` Stephen Hemminger
2023-12-25  6:06     ` Xing, Beilei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231222194453.3049693-3-beilei.xing@intel.com \
    --to=beilei.xing@intel.com \
    --cc=anatoly.burakov@intel.com \
    --cc=bruce.richardson@intel.com \
    --cc=chenbox@nvidia.com \
    --cc=dev@dpdk.org \
    --cc=ferruh.yigit@amd.com \
    --cc=thomas@monjalon.net \
    --cc=yahui.cao@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).