* [PATCH 1/4] iommufd: add IOMMUFD support
2023-12-22 19:44 [PATCH 0/4] add VFIO IOMMUFD/CDEV support beilei.xing
@ 2023-12-22 19:44 ` beilei.xing
2023-12-22 19:44 ` [PATCH 2/4] vfio: add VFIO " beilei.xing
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: beilei.xing @ 2023-12-22 19:44 UTC (permalink / raw)
To: anatoly.burakov
Cc: dev, thomas, ferruh.yigit, bruce.richardson, chenbox, yahui.cao,
Beilei Xing
From: Yahui Cao <yahui.cao@intel.com>
IOMMUFD is a new standalone IOMMU subsystem introduced in Linux.
Linux now includes multiple device-passthrough frameworks (e.g. VFIO and
vDPA), and each of those frameworks implements its own logic for managing
I/O page tables, which is hard to scale to support modern IOMMU features
like PASID, I/O page fault and IOMMU dirty page tracking. The goal of
IOMMUFD is to let Linux subsystems like VFIO and vDPA consume a unified
IOMMU framework.
This patch exports a basic enable function, a default isolation domain and
a per-IOMMUFD DMA mapping function. An IOMMUFD consumer should use the
default isolation domain and the DMA mapping function when user-initiated
DMA is required.
Signed-off-by: Yahui Cao <yahui.cao@intel.com>
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
---
config/meson.build | 3 +
config/rte_config.h | 1 +
lib/eal/include/rte_iommufd.h | 73 ++++++++++++++
lib/eal/linux/eal.c | 22 ++++
lib/eal/linux/eal_iommufd.c | 183 ++++++++++++++++++++++++++++++++++
lib/eal/linux/eal_iommufd.h | 43 ++++++++
lib/eal/linux/meson.build | 1 +
lib/eal/version.map | 3 +
8 files changed, 329 insertions(+)
create mode 100644 lib/eal/include/rte_iommufd.h
create mode 100644 lib/eal/linux/eal_iommufd.c
create mode 100644 lib/eal/linux/eal_iommufd.h
diff --git a/config/meson.build b/config/meson.build
index a9ccd56deb..93c63984c8 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -442,6 +442,9 @@ install_headers(['rte_config.h'],
# enable VFIO only if it is linux OS
dpdk_conf.set('RTE_EAL_VFIO', is_linux)
+# enable IOMMUFD only if it is linux OS
+dpdk_conf.set('RTE_EAL_IOMMUFD', is_linux)
+
# specify -D_GNU_SOURCE unconditionally
add_project_arguments('-D_GNU_SOURCE', language: 'c')
diff --git a/config/rte_config.h b/config/rte_config.h
index da265d7dd2..25a6dccd8f 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -38,6 +38,7 @@
#define RTE_MAX_TAILQ 32
#define RTE_LOG_DP_LEVEL RTE_LOG_INFO
#define RTE_MAX_VFIO_CONTAINERS 64
+#define RTE_MAX_IOMMUFD_FD 1
/* bsd module defines */
#define RTE_CONTIGMEM_MAX_NUM_BUFS 64
diff --git a/lib/eal/include/rte_iommufd.h b/lib/eal/include/rte_iommufd.h
new file mode 100644
index 0000000000..ac42713018
--- /dev/null
+++ b/lib/eal/include/rte_iommufd.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _RTE_IOMMUFD_H_
+#define _RTE_IOMMUFD_H_
+
+/**
+ * @file
+ * RTE IOMMUFD. This library provides various IOMMUFD related utility functions.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <rte_compat.h>
+/*
+ * determine if IOMMUFD is present on the system
+ */
+#if !defined(IOMMUFD_PRESENT) && defined(RTE_EAL_IOMMUFD)
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
+#endif /* RTE_EAL_IOMMUFD */
+
+#ifdef IOMMUFD_PRESENT
+
+#define IOMMUFD_PATH "/dev/iommu"
+
+#else /* not IOMMUFD_PRESENT */
+#endif /* IOMMUFD_PRESENT */
+
+/**
+ * Enable an IOMMUFD-related kmod.
+ *
+ * This function is only relevant to Linux and will return
+ * an error on BSD.
+ *
+ * @param modname
+ * kernel module name.
+ *
+ * @return
+ * 0 on success.
+ * <0 on failure.
+ */
+__rte_experimental
+int rte_iommufd_enable(const char *modname);
+
+/**
+ * Check whether an IOMMUFD-related kmod is enabled.
+ *
+ * This function is only relevant to Linux.
+ *
+ * @param modname
+ * kernel module name.
+ *
+ * @return
+ * 1 if true.
+ * 0 otherwise.
+ */
+__rte_experimental
+int rte_iommufd_is_enabled(const char *modname);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_IOMMUFD_H_ */
diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index 57da058cec..4c8e0a7b6e 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -41,6 +41,7 @@
#include <rte_version.h>
#include <malloc_heap.h>
#include <rte_vfio.h>
+#include <rte_iommufd.h>
#include <telemetry_internal.h>
#include "eal_private.h"
@@ -52,6 +53,7 @@
#include "eal_trace.h"
#include "eal_options.h"
#include "eal_vfio.h"
+#include "eal_iommufd.h"
#include "hotplug_mp.h"
#include "log_internal.h"
@@ -877,6 +879,16 @@ static int rte_eal_vfio_setup(void)
}
#endif
+#ifdef IOMMUFD_PRESENT
+/*
+ * Probe and enable IOMMUFD support through the "iommufd" kernel module.
+ * Any non-zero result from rte_iommufd_enable() is reported as -1.
+ */
+static int rte_eal_iommufd_setup(void)
+{
+	return rte_iommufd_enable("iommufd") != 0 ? -1 : 0;
+}
+#endif
+
static void rte_eal_init_alert(const char *msg)
{
fprintf(stderr, "EAL: FATAL: %s\n", msg);
@@ -1162,6 +1174,16 @@ rte_eal_init(int argc, char **argv)
return -1;
}
#endif
+
+#ifdef IOMMUFD_PRESENT
+ if (rte_eal_iommufd_setup() < 0) {
+ rte_eal_init_alert("Cannot init IOMMUFD");
+ rte_errno = EAGAIN;
+ rte_atomic_store_explicit(&run_once, 0, rte_memory_order_relaxed);
+ return -1;
+ }
+#endif
+
/* in secondary processes, memory init may allocate additional fbarrays
* not present in primary processes, so to avoid any potential issues,
* initialize memzones first.
diff --git a/lib/eal/linux/eal_iommufd.c b/lib/eal/linux/eal_iommufd.c
new file mode 100644
index 0000000000..8866aa60c1
--- /dev/null
+++ b/lib/eal/linux/eal_iommufd.c
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include <rte_iommufd.h>
+#include <rte_spinlock.h>
+#include <rte_errno.h>
+
+#include "eal_iommufd.h"
+#include "eal_private.h"
+
+#ifdef IOMMUFD_PRESENT
+#include <linux/iommufd.h>
+
+/* per-process IOMMUFD config */
+static struct iommufd_config iommufd_cfgs[IOMMUFD_MAX_FD];
+struct iommufd_config *default_iommufd_cfg = &iommufd_cfgs[0];
+
+/*
+ * Open the IOMMUFD character device and allocate an I/O address space (IOAS)
+ * in it.
+ *
+ * On success *iommufd receives the open file descriptor and *ioas_id the id
+ * reported by IOMMU_IOAS_ALLOC. On any failure *iommufd is set to -1,
+ * *ioas_id is left untouched and no descriptor is left open.
+ */
+static void
+iommufd_get_ioas(int *iommufd, uint32_t *ioas_id)
+{
+	struct iommu_ioas_alloc alloc_data = {};
+	int iommu_fd;
+
+	*iommufd = -1;
+
+	iommu_fd = open(IOMMUFD_PATH, O_RDWR);
+	if (iommu_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open iommufd!\n");
+		return;
+	}
+
+	alloc_data.size = sizeof(alloc_data);
+	if (ioctl(iommu_fd, IOMMU_IOAS_ALLOC, &alloc_data)) {
+		RTE_LOG(ERR, EAL, "Failed to alloc ioas!\n");
+		/* the caller only ever sees -1, so the fd must be closed here */
+		close(iommu_fd);
+		return;
+	}
+
+	*iommufd = iommu_fd;
+	*ioas_id = alloc_data.out_ioas_id;
+}
+
+/*
+ * Reset the per-process IOMMUFD state, check whether the given kernel module
+ * is loaded and, in the primary process, open the default IOMMUFD and
+ * allocate its IOAS.
+ *
+ * Returns -1 only when rte_eal_check_module() reports -1. A module that is
+ * not loaded, or an IOMMUFD that could not be opened, still returns 0 with
+ * IOMMUFD left disabled.
+ */
+int
+rte_iommufd_enable(const char *modname)
+{
+	const struct internal_config *conf = eal_get_internal_configuration();
+	struct iommufd_config *cfg;
+	int mod_state;
+
+	/* start from a clean slate: every slot disabled and without an fd */
+	for (cfg = iommufd_cfgs; cfg < &iommufd_cfgs[IOMMUFD_MAX_FD]; cfg++) {
+		cfg->iommufd_enabled = 0;
+		cfg->iommufd = -1;
+		cfg->ioas_id = 0;
+		cfg->dma_init = false;
+	}
+
+	RTE_LOG(DEBUG, EAL, "Probing IOMMUFD support...\n");
+
+	mod_state = rte_eal_check_module(modname);
+	switch (mod_state) {
+	case -1:
+		/* the lookup itself failed: this is the only hard error */
+		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
+		return -1;
+	case 0:
+		/* module absent: not an error, IOMMUFD simply stays off */
+		RTE_LOG(DEBUG, EAL,
+			"IOMMUFD modules not loaded, skipping IOMMUFD support...\n");
+		return 0;
+	default:
+		break;
+	}
+
+	/* only the primary process opens the IOMMUFD and allocates the IOAS */
+	if (conf->process_type == RTE_PROC_PRIMARY)
+		iommufd_get_ioas(&default_iommufd_cfg->iommufd,
+				&default_iommufd_cfg->ioas_id);
+
+	if (default_iommufd_cfg->iommufd == -1) {
+		RTE_LOG(NOTICE, EAL, "IOMMUFD support could not be initialized\n");
+		return 0;
+	}
+
+	RTE_LOG(INFO, EAL, "IOMMUFD support initialized\n");
+	default_iommufd_cfg->iommufd_enabled = 1;
+
+	return 0;
+}
+
+/*
+ * Report whether IOMMUFD is usable: the given module must currently be
+ * loaded and the default IOMMUFD config must have been marked enabled.
+ * Returns 1 when both hold, 0 otherwise.
+ */
+int
+rte_iommufd_is_enabled(const char *modname)
+{
+	/* the module lookup is performed on every call */
+	if (rte_eal_check_module(modname) <= 0)
+		return 0;
+
+	return default_iommufd_cfg->iommufd_enabled != 0;
+}
+
+/*
+ * Map or unmap one range in an IOAS of the given IOMMUFD.
+ *
+ * iommufd: descriptor the IOMMU_IOAS_MAP / IOMMU_IOAS_UNMAP ioctl is issued on.
+ * ioasid:  id of the IOAS to operate on.
+ * vaddr:   user virtual address backing the range (used for mapping only).
+ * iova:    I/O virtual address of the range.
+ * len:     length of the range in bytes.
+ * do_map:  non-zero to map, zero to unmap.
+ *
+ * Returns 0 on success and -1 on failure. A map request that the kernel
+ * rejects with EEXIST is treated as success. rte_errno is set to EIO when an
+ * unmap succeeds but reports a different length than requested.
+ */
+int
+iommufd_dma_mem_map(int iommufd, uint32_t ioasid, uint64_t vaddr,
+		uint64_t iova, uint64_t len, int do_map)
+{
+	struct iommu_ioas_map dma_map;
+	struct iommu_ioas_unmap dma_unmap;
+	int ret;
+
+	if (do_map != 0) {
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.ioas_id = ioasid;
+		dma_map.size = sizeof(struct iommu_ioas_map);
+		dma_map.user_va = vaddr;
+		dma_map.length = len;
+		dma_map.iova = iova;
+		dma_map.flags = IOMMU_IOAS_MAP_READABLE |
+				IOMMU_IOAS_MAP_WRITEABLE |
+				IOMMU_IOAS_MAP_FIXED_IOVA;
+
+		ret = ioctl(iommufd, IOMMU_IOAS_MAP, &dma_map);
+		if (ret) {
+			/**
+			 * In case the mapping was already done EEXIST will be
+			 * returned from kernel.
+			 */
+			if (errno == EEXIST) {
+				RTE_LOG(DEBUG, EAL,
+					"Memory segment is already mapped, skipping\n");
+			} else {
+				RTE_LOG(ERR, EAL,
+					"Cannot set up DMA remapping, error "
+					"%i (%s)\n", errno, strerror(errno));
+				return -1;
+			}
+		}
+	} else {
+		memset(&dma_unmap, 0, sizeof(dma_unmap));
+		dma_unmap.ioas_id = ioasid;
+		dma_unmap.size = sizeof(struct iommu_ioas_unmap);
+		dma_unmap.length = len;
+		dma_unmap.iova = iova;
+
+		ret = ioctl(iommufd, IOMMU_IOAS_UNMAP, &dma_unmap);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "Cannot clear DMA remapping, error "
+				"%i (%s)\n", errno, strerror(errno));
+			return -1;
+		} else if (dma_unmap.length != len) {
+			/* report the length that was compared, not the struct size */
+			RTE_LOG(ERR, EAL, "Unexpected size %"PRIu64
+				" of DMA remapping cleared instead of %"PRIu64"\n",
+				(uint64_t)dma_unmap.length, len);
+			rte_errno = EIO;
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+#else /* not IOMMUFD_PRESENT */
+
+/* Stub built when IOMMUFD_PRESENT is not defined: always returns -1. */
+int
+rte_iommufd_enable(__rte_unused const char *modname)
+{
+ return -1;
+}
+
+/*
+ * Stub built when IOMMUFD_PRESENT is not defined: always returns -1.
+ * NOTE(review): -1 is non-zero, so a caller testing the result as a plain
+ * boolean would read it as "enabled" - confirm callers compare explicitly.
+ */
+int
+rte_iommufd_is_enabled(__rte_unused const char *modname)
+{
+ return -1;
+}
+
+/* Stub built when IOMMUFD_PRESENT is not defined: always returns -1. */
+int
+iommufd_dma_mem_map(__rte_unused int iommufd, __rte_unused uint32_t ioasid,
+ __rte_unused uint64_t vaddr, __rte_unused uint64_t iova,
+ __rte_unused uint64_t len, __rte_unused int do_map)
+{
+ return -1;
+}
+
+#endif /* IOMMUFD_PRESENT */
diff --git a/lib/eal/linux/eal_iommufd.h b/lib/eal/linux/eal_iommufd.h
new file mode 100644
index 0000000000..d9b67a7fd9
--- /dev/null
+++ b/lib/eal/linux/eal_iommufd.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef EAL_IOMMUFD_H_
+#define EAL_IOMMUFD_H_
+
+#include <rte_common.h>
+#include <stdbool.h>
+
+/*
+ * determine if IOMMUFD is present on the system
+ */
+#if !defined(IOMMUFD_PRESENT) && defined(RTE_EAL_IOMMUFD)
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define IOMMUFD_PRESENT
+#else
+#pragma message("IOMMUFD configured but not supported by this kernel, disabling.")
+#endif /* kernel version >= 6.6.0 */
+#endif /* RTE_EAL_IOMMUFD */
+
+#ifdef IOMMUFD_PRESENT
+
+#define IOMMUFD_MAX_FD RTE_MAX_IOMMUFD_FD
+
+/* State kept for one IOMMUFD instance opened by this process. */
+struct iommufd_config {
+ int iommufd_enabled; /* set to 1 once the IOMMUFD was opened successfully */
+ int iommufd; /* file descriptor of the opened IOMMUFD, -1 when not open */
+ uint32_t ioas_id; /* id returned by IOMMU_IOAS_ALLOC for this fd */
+ bool dma_init; /* true once the initial DMA mapping pass has completed */
+};
+
+/* per-process IOMMUFD config */
+extern struct iommufd_config *default_iommufd_cfg;
+
+#endif /* IOMMUFD_PRESENT */
+
+/*
+ * Map (do_map != 0) or unmap (do_map == 0) the range [iova, iova + len) in
+ * IOAS 'ioasid' of the IOMMUFD 'iommufd'; 'vaddr' is the user virtual address
+ * used for mapping. Returns 0 on success, -1 on failure.
+ */
+int
+iommufd_dma_mem_map(int iommufd, uint32_t ioasid, uint64_t vaddr,
+ uint64_t iova, uint64_t len, int do_map);
+
+#endif /* EAL_IOMMUFD_H_ */
diff --git a/lib/eal/linux/meson.build b/lib/eal/linux/meson.build
index e99ebed256..8081087584 100644
--- a/lib/eal/linux/meson.build
+++ b/lib/eal/linux/meson.build
@@ -16,6 +16,7 @@ sources += files(
'eal_thread.c',
'eal_timer.c',
'eal_vfio.c',
+ 'eal_iommufd.c',
'eal_vfio_mp_sync.c',
)
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 5e0cd47c82..30e66a7267 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -393,6 +393,9 @@ EXPERIMENTAL {
# added in 23.07
rte_memzone_max_get;
rte_memzone_max_set;
+
+ rte_iommufd_enable; # WINDOWS_NO_EXPORT
+ rte_iommufd_is_enabled; # WINDOWS_NO_EXPORT
};
INTERNAL {
--
2.34.1
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 2/4] vfio: add VFIO IOMMUFD support
2023-12-22 19:44 [PATCH 0/4] add VFIO IOMMUFD/CDEV support beilei.xing
2023-12-22 19:44 ` [PATCH 1/4] iommufd: add IOMMUFD support beilei.xing
@ 2023-12-22 19:44 ` beilei.xing
2023-12-22 17:17 ` Stephen Hemminger
2023-12-22 19:44 ` [PATCH 3/4] bus/pci: add VFIO CDEV support beilei.xing
2023-12-22 19:44 ` [PATCH 4/4] eal: add new args to choose VFIO mode beilei.xing
3 siblings, 1 reply; 9+ messages in thread
From: beilei.xing @ 2023-12-22 19:44 UTC (permalink / raw)
To: anatoly.burakov
Cc: dev, thomas, ferruh.yigit, bruce.richardson, chenbox, yahui.cao,
Beilei Xing
From: Beilei Xing <beilei.xing@intel.com>
VFIO IOMMUFD is a new component added to co-work with IOMMUFD.
IOMMUFD has no impact on the existing VFIO Container/Group
interface, while the latest IOMMU feature(e.g. PASID/SSID) may
be only available through VFIO IOMMUFD/CDEV interface.
This patch exposes functions to set up and release a VFIO device through
the VFIO IOMMUFD/CDEV interface.
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
Signed-off-by: Yahui Cao <yahui.cao@intel.com>
---
lib/eal/include/rte_vfio.h | 55 +++++
lib/eal/linux/eal_vfio.h | 3 +
lib/eal/linux/eal_vfio_iommufd.c | 385 +++++++++++++++++++++++++++++++
lib/eal/linux/meson.build | 1 +
lib/eal/version.map | 2 +
5 files changed, 446 insertions(+)
create mode 100644 lib/eal/linux/eal_vfio_iommufd.c
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index 22832afd0f..7a9b26b0f7 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -17,6 +17,8 @@ extern "C" {
#include <stdbool.h>
#include <stdint.h>
+#include <rte_compat.h>
+
/*
* determine if VFIO is present on the system
*/
@@ -28,6 +30,9 @@ extern "C" {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
#define HAVE_VFIO_DEV_REQ_INTERFACE
#endif /* kernel version >= 4.0.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define VFIO_IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
#endif /* RTE_EAL_VFIO */
#ifdef VFIO_PRESENT
@@ -42,6 +47,10 @@ extern "C" {
#define VFIO_NOIOMMU_MODE \
"/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"
+#ifdef VFIO_IOMMUFD_PRESENT
+#define VFIO_CDEV_CLASS_DIR "/sys/class/vfio-dev"
+#endif
+
/* NOIOMMU is defined from kernel version 4.5 onwards */
#ifdef VFIO_NOIOMMU_IOMMU
#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
@@ -137,6 +146,33 @@ struct vfio_device_info;
int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info);
+/**
+ * Setup iommufd_cfg for the device identified by its address.
+ *
+ * This function is only relevant to linux and will return
+ * an error on BSD.
+ *
+ * @param sysfs_base
+ * sysfs path prefix.
+ *
+ * @param dev_addr
+ * device location.
+ *
+ * @param vfio_dev_fd
+ * VFIO fd.
+ *
+ * @param device_info
+ * Device information.
+ *
+ * @return
+ * 0 on success.
+ * <0 on failure.
+ * >1 if the device cannot be managed this way.
+ */
+__rte_experimental
+int rte_vfio_iommufd_setup_device(const char *sysfs_base, const char *dev_addr,
+ int *vfio_dev_fd, struct vfio_device_info *device_info);
+
/**
* Release a device mapped to a VFIO-managed I/O MMU group.
*
@@ -158,6 +194,25 @@ int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
*/
int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
+/**
+ * Release a device mapped to a VFIO-iommufd-managed I/O MMU group.
+ *
+ * This function is only relevant to linux and will return
+ * an error on BSD.
+ *
+ * @param dev_addr
+ * device location.
+ *
+ * @param fd
+ * VFIO fd.
+ *
+ * @return
+ * 0 on success.
+ * <0 on failure.
+ */
+__rte_experimental
+int rte_vfio_iommufd_release_device(const char *dev_addr, int fd);
+
/**
* Enable a VFIO-related kmod.
*
diff --git a/lib/eal/linux/eal_vfio.h b/lib/eal/linux/eal_vfio.h
index 23a787ad20..c94409e828 100644
--- a/lib/eal/linux/eal_vfio.h
+++ b/lib/eal/linux/eal_vfio.h
@@ -17,6 +17,9 @@
#else
#pragma message("VFIO configured but not supported by this kernel, disabling.")
#endif /* kernel version >= 3.6.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define VFIO_IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
#endif /* RTE_EAL_VFIO */
#ifdef VFIO_PRESENT
diff --git a/lib/eal/linux/eal_vfio_iommufd.c b/lib/eal/linux/eal_vfio_iommufd.c
new file mode 100644
index 0000000000..02996a588a
--- /dev/null
+++ b/lib/eal/linux/eal_vfio_iommufd.c
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+
+#include <rte_errno.h>
+#include <rte_vfio.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+#ifdef VFIO_IOMMUFD_PRESENT
+#include <linux/iommufd.h>
+#include "eal_iommufd.h"
+
+#define VFIO_IOMMUFD_MEM_EVENT_CLB_NAME "vfio_iommufd_mem_event_clb"
+
+/* Context handed to the memseg walk callbacks: where to install mappings. */
+struct ioas_info {
+ int iommufd; /* IOMMUFD file descriptor passed to iommufd_dma_mem_map() */
+ uint32_t ioas_id; /* IOAS id passed to iommufd_dma_mem_map() */
+};
+
+/*
+ * Bind an opened VFIO device fd to the default IOMMUFD and attach it to the
+ * default IOAS. Returns 0 on success, otherwise the non-zero result of the
+ * ioctl that failed.
+ */
+static int
+vfio_iommufd_add_device(const char *dev_addr, int vfio_dev_fd)
+{
+	const struct iommufd_config *cfg = default_iommufd_cfg;
+	struct vfio_device_bind_iommufd bind = {
+		.argsz = sizeof(bind),
+		.flags = 0,
+		.iommufd = cfg->iommufd,
+	};
+	struct vfio_device_attach_iommufd_pt attach = {
+		.argsz = sizeof(attach),
+		.flags = 0,
+		.pt_id = cfg->ioas_id,
+	};
+	int rc;
+
+	/* step 1: bind the device to the IOMMUFD */
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
+	if (rc != 0) {
+		RTE_LOG(ERR, EAL, "Device %s cannot bind to iommufd\n", dev_addr);
+		return rc;
+	}
+
+	/* step 2: attach it to the IOAS */
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach);
+	if (rc != 0) {
+		RTE_LOG(ERR, EAL, "Device %s cannot attach to ioas\n", dev_addr);
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Contiguous-walk callback: map one contiguous chunk of 'len' bytes starting
+ * at memseg 'ms' into the IOAS described by 'arg' (a struct ioas_info).
+ * Chunks belonging to an external memseg list are skipped.
+ */
+static int
+vfio_iommufd_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+		size_t len, void *arg)
+{
+	const struct ioas_info *ioas = arg;
+
+	return msl->external ? 0 :
+		iommufd_dma_mem_map(ioas->iommufd, ioas->ioas_id,
+				ms->addr_64, ms->iova, len, 1);
+}
+
+/*
+ * Per-segment walk callback: map a single memseg into the IOAS described by
+ * 'arg' (a struct ioas_info), unless the segment is one that must be skipped.
+ */
+static int
+vfio_iommufd_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+		void *arg)
+{
+	const struct ioas_info *ioas = arg;
+
+	/*
+	 * Nothing to do for:
+	 *  - external memory that isn't a heap,
+	 *  - segments with an invalid IOVA,
+	 *  - internal segments in IOVA-as-VA mode (already mapped).
+	 */
+	if ((msl->external && !msl->heap) ||
+	    ms->iova == RTE_BAD_IOVA ||
+	    (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA))
+		return 0;
+
+	return iommufd_dma_mem_map(ioas->iommufd, ioas->ioas_id, ms->addr_64,
+			ms->iova, ms->len, 1);
+}
+
+/*
+ * Install DMA mappings for the current memory layout into the given IOAS.
+ * Returns 0 on success or the first non-zero result reported by a walk.
+ */
+static int
+vfio_iommufd_dma_map(int iommufd, uint32_t ioasid)
+{
+	struct ioas_info ctx;
+
+	ctx.iommufd = iommufd;
+	ctx.ioas_id = ioasid;
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		/* in IOVA-as-VA mode whole contiguous chunks can be mapped
+		 * at once instead of going page-by-page.
+		 */
+		int rc = rte_memseg_contig_walk(vfio_iommufd_map_contig, &ctx);
+
+		if (rc != 0)
+			return rc;
+		/* the contig walk skipped the external segments, so the
+		 * per-segment walk below is still required.
+		 */
+	}
+
+	return rte_memseg_walk(vfio_iommufd_map, &ctx);
+}
+
+/*
+ * Memory event callback: mirror an allocation (map) or any other event
+ * (unmap) of the range [addr, addr + len) into the default IOAS.
+ * Results of the individual map/unmap calls are not checked.
+ */
+static void
+vfio_iommufd_mem_event_callback(enum rte_mem_event type, const void *addr,
+		size_t len, void *arg __rte_unused)
+{
+	const int do_map = (type == RTE_MEM_EVENT_ALLOC) ? 1 : 0;
+	struct rte_memseg_list *msl;
+	struct rte_memseg *ms;
+	size_t done;
+
+	msl = rte_mem_virt2memseg_list(addr);
+
+	/* IOVA as VA on internal memory: the IOVA is the virtual address */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
+		const uint64_t page_sz = msl->page_sz;
+		uint64_t va = (uint64_t)(uintptr_t)addr;
+
+		/* keep the map/unmap granularity at one page of the list */
+		for (done = 0; done < len; done += page_sz, va += page_sz)
+			iommufd_dma_mem_map(default_iommufd_cfg->iommufd,
+					default_iommufd_cfg->ioas_id,
+					va, va, page_sz, do_map);
+
+		return;
+	}
+
+	/* otherwise go memseg by memseg; memsegs are contiguous in memory */
+	ms = rte_mem_virt2memseg(addr, msl);
+	for (done = 0; done < len; done += ms->len, ++ms) {
+		/* some memory segments may have invalid IOVA */
+		if (ms->iova == RTE_BAD_IOVA) {
+			RTE_LOG(DEBUG, EAL,
+				"Memory segment at %p has bad IOVA, skipping\n",
+				ms->addr);
+			continue;
+		}
+
+		iommufd_dma_mem_map(default_iommufd_cfg->iommufd,
+				default_iommufd_cfg->ioas_id,
+				ms->addr_64, ms->iova, ms->len, do_map);
+	}
+}
+
+/*
+ * Locate and open the VFIO character device of a device.
+ *
+ * The first entry starting with "vfio" in <sysfs_base>/<dev_addr>/vfio-dev
+ * names the cdev; its "dev" attribute supplies the expected major:minor and
+ * /dev/vfio/devices/<name> is the node that gets opened. The opened node
+ * must be a character device whose device number matches the sysfs one.
+ *
+ * Returns the open file descriptor on success and -1 on any failure; no
+ * descriptor is left open on failure.
+ */
+static int
+vfio_iommufd_get_fd(const char *sysfs_base, const char *dev_addr)
+{
+	char vfio_cdev_path[PATH_MAX];
+	char vfio_path[PATH_MAX];
+	char dirname[PATH_MAX];
+	int vfio_dev_fd;
+	struct dirent *dent;
+	unsigned int major, minor;
+	struct stat st;
+	dev_t cdev;
+	DIR *dir;
+	FILE *f;
+	int ret = 0;
+
+	memset(vfio_cdev_path, 0, sizeof(vfio_cdev_path));
+	memset(vfio_path, 0, sizeof(vfio_path));
+	memset(dirname, 0, sizeof(dirname));
+
+	snprintf(dirname, sizeof(dirname), "%s/%s/vfio-dev",
+			sysfs_base, dev_addr);
+
+	dir = opendir(dirname);
+	if (dir == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	while ((dent = readdir(dir)) != NULL) {
+		if (!strncmp(dent->d_name, "vfio", 4)) {
+			snprintf(vfio_cdev_path, sizeof(vfio_cdev_path),
+				"%s/%s/vfio-dev/%s/dev", sysfs_base,
+				dev_addr, dent->d_name);
+			break;
+		}
+	}
+
+	/* readdir() ran off the end: there is no vfio entry to open */
+	if (dent == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): no vfio entry found in %s\n",
+			__func__, dirname);
+		ret = -1;
+		goto err_fopen;
+	}
+
+	f = fopen(vfio_cdev_path, "r");
+	if (f == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n",
+			__func__);
+		ret = -1;
+		goto err_fopen;
+	}
+
+	ret = fscanf(f, "%u:%u", &major, &minor);
+	if (ret != 2) {
+		RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n",
+			__func__);
+		ret = -1;
+		goto err_fscanf;
+	}
+
+	cdev = makedev(major, minor);
+
+	/* dent stays valid here because the directory is still open */
+	snprintf(vfio_path, sizeof(vfio_path), "/dev/vfio/devices/%s", dent->d_name);
+	vfio_dev_fd = open(vfio_path, O_RDWR);
+	if (vfio_dev_fd == -1) {
+		RTE_LOG(ERR, EAL, "%s(): can't open %s: %s\n",
+			__func__, vfio_path, strerror(errno));
+		ret = -1;
+		goto err_fscanf;
+	}
+
+	if (fstat(vfio_dev_fd, &st) || !S_ISCHR(st.st_mode) ||
+	    (cdev != 0 && st.st_rdev != cdev)) {
+		RTE_LOG(ERR, EAL, "%s(): vfio char device is not matched\n",
+			__func__);
+		/* reject the node instead of handing back an unverified fd */
+		close(vfio_dev_fd);
+		ret = -1;
+		goto err_fscanf;
+	}
+
+	ret = vfio_dev_fd;
+
+err_fscanf:
+	fclose(f);
+err_fopen:
+	closedir(dir);
+	return ret;
+}
+
+/*
+ * Open the VFIO cdev of a device, bind it to the default IOMMUFD/IOAS and
+ * fetch its device info.
+ *
+ * The first time this succeeds in a primary process with a valid IOMMUFD,
+ * the current memory layout is DMA-mapped into the IOAS and a memory event
+ * callback is registered; dma_init is then set so this is not repeated.
+ *
+ * sysfs_base:  sysfs path prefix of the device.
+ * dev_addr:    device location string.
+ * vfio_dev_fd: receives the opened device fd.
+ * device_info: filled by VFIO_DEVICE_GET_INFO.
+ *
+ * Returns 0 on success and -1 on failure. On failure the device fd has been
+ * closed, although *vfio_dev_fd still holds its old value.
+ */
+int
+rte_vfio_iommufd_setup_device(const char *sysfs_base, const char *dev_addr,
+		int *vfio_dev_fd, struct vfio_device_info *device_info)
+{
+	struct iommufd_config *iommufd_cfg;
+	int iommufd;
+	uint32_t ioas_id;
+	int ret = 0;
+	const struct internal_config *internal_conf =
+		eal_get_internal_configuration();
+
+	iommufd_cfg = default_iommufd_cfg;
+	iommufd = iommufd_cfg->iommufd;
+	ioas_id = iommufd_cfg->ioas_id;
+
+	*vfio_dev_fd = vfio_iommufd_get_fd(sysfs_base, dev_addr);
+	if (*vfio_dev_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to get device fd for device %s\n", dev_addr);
+		return -1;
+	}
+
+	if (vfio_iommufd_add_device(dev_addr, *vfio_dev_fd)) {
+		RTE_LOG(ERR, EAL, "Failed to add device %s to iommufd\n", dev_addr);
+		ret = -1;
+		goto err_add_dev;
+	}
+
+	if (!iommufd_cfg->dma_init &&
+	    internal_conf->process_type == RTE_PROC_PRIMARY &&
+	    iommufd != -1) {
+		/* lock memory hotplug before mapping and release it
+		 * after registering callback, to prevent races
+		 */
+		rte_mcfg_mem_read_lock();
+		ret = vfio_iommufd_dma_map(iommufd, ioas_id);
+		if (ret) {
+			RTE_LOG(ERR, EAL,
+				"%s DMA remapping failed, error "
+				"%i (%s)\n",
+				dev_addr, errno, strerror(errno));
+			rte_mcfg_mem_read_unlock();
+			ret = -1;
+			goto err_dma_map;
+		}
+
+		/* register callback for mem events */
+		ret = rte_mem_event_callback_register(
+			VFIO_IOMMUFD_MEM_EVENT_CLB_NAME,
+			vfio_iommufd_mem_event_callback, NULL);
+
+		/* unlock memory hotplug */
+		rte_mcfg_mem_read_unlock();
+
+		if (ret && rte_errno != ENOTSUP) {
+			RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
+			ret = -1;
+			goto err_dma_map;
+		}
+		if (ret)
+			RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
+		else
+			RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
+
+		iommufd_cfg->dma_init = true;
+	}
+
+	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "%s cannot get device info, "
+			"error %i (%s)\n", dev_addr, errno,
+			strerror(errno));
+		ret = -1;
+		goto err_dma_map;
+	}
+
+	return 0;
+
+err_dma_map:
+	/*
+	 * rte_vfio_iommufd_release_device() closes the fd itself when the
+	 * detach succeeds. Only fall through to close() when it failed,
+	 * otherwise the same descriptor number would be closed twice.
+	 */
+	if (rte_vfio_iommufd_release_device(dev_addr, *vfio_dev_fd) == 0)
+		return ret;
+err_add_dev:
+	close(*vfio_dev_fd);
+	return ret;
+}
+
+/*
+ * Detach a device from its IOMMUFD page table and, if that works, close the
+ * device fd and unregister the IOMMUFD memory event callback. Runs under the
+ * memory hotplug read lock.
+ *
+ * Returns 0 on success or the non-zero ioctl result when the detach fails;
+ * in the failure case the fd is left open.
+ *
+ * NOTE(review): the callback is unregistered on every successful release
+ * while dma_init stays set - confirm this is intended when several devices
+ * share the default IOMMUFD.
+ */
+int
+rte_vfio_iommufd_release_device(const char *dev_addr, int vfio_dev_fd)
+{
+	struct vfio_device_detach_iommufd_pt detach = {};
+	int rc;
+
+	rte_mcfg_mem_read_lock();
+
+	detach.argsz = sizeof(detach);
+	detach.flags = 0;
+
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach);
+	if (rc != 0) {
+		RTE_LOG(ERR, EAL, "Device %s cannot detach from iommufd\n", dev_addr);
+	} else {
+		close(vfio_dev_fd);
+
+		rte_mem_event_callback_unregister(VFIO_IOMMUFD_MEM_EVENT_CLB_NAME,
+				NULL);
+	}
+
+	rte_mcfg_mem_read_unlock();
+	return rc;
+}
+
+#else
+/* Stub built when VFIO_IOMMUFD_PRESENT is not defined: always returns -1. */
+int
+rte_vfio_iommufd_setup_device(__rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr,
+ __rte_unused int *vfio_dev_fd,
+ __rte_unused struct vfio_device_info *device_info)
+{
+ return -1;
+}
+
+/* Stub built when VFIO_IOMMUFD_PRESENT is not defined: always returns -1. */
+int
+rte_vfio_iommufd_release_device(__rte_unused const char *dev_addr,
+ __rte_unused int vfio_dev_fd)
+{
+ return -1;
+}
+
+#endif /* VFIO_IOMMUFD_PRESENT */
diff --git a/lib/eal/linux/meson.build b/lib/eal/linux/meson.build
index 8081087584..bf246e64c9 100644
--- a/lib/eal/linux/meson.build
+++ b/lib/eal/linux/meson.build
@@ -16,6 +16,7 @@ sources += files(
'eal_thread.c',
'eal_timer.c',
'eal_vfio.c',
+ 'eal_vfio_iommufd.c',
'eal_iommufd.c',
'eal_vfio_mp_sync.c',
)
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 30e66a7267..9c1e70feca 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -396,6 +396,8 @@ EXPERIMENTAL {
rte_iommufd_enable; # WINDOWS_NO_EXPORT
rte_iommufd_is_enabled; # WINDOWS_NO_EXPORT
+ rte_vfio_iommufd_release_device; # WINDOWS_NO_EXPORT
+ rte_vfio_iommufd_setup_device; # WINDOWS_NO_EXPORT
};
INTERNAL {
--
2.34.1
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/4] vfio: add VFIO IOMMUFD support
2023-12-22 19:44 ` [PATCH 2/4] vfio: add VFIO " beilei.xing
@ 2023-12-22 17:17 ` Stephen Hemminger
2023-12-25 6:30 ` Xing, Beilei
0 siblings, 1 reply; 9+ messages in thread
From: Stephen Hemminger @ 2023-12-22 17:17 UTC (permalink / raw)
To: beilei.xing
Cc: anatoly.burakov, dev, thomas, ferruh.yigit, bruce.richardson,
chenbox, yahui.cao
On Fri, 22 Dec 2023 19:44:51 +0000
beilei.xing@intel.com wrote:
> diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
> index 22832afd0f..7a9b26b0f7 100644
> --- a/lib/eal/include/rte_vfio.h
> +++ b/lib/eal/include/rte_vfio.h
> @@ -17,6 +17,8 @@ extern "C" {
> #include <stdbool.h>
> #include <stdint.h>
>
> +#include <rte_compat.h>
> +
> /*
> * determine if VFIO is present on the system
> */
> @@ -28,6 +30,9 @@ extern "C" {
> #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
> #define HAVE_VFIO_DEV_REQ_INTERFACE
> #endif /* kernel version >= 4.0.0 */
> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
> +#define VFIO_IOMMUFD_PRESENT
> +#endif /* kernel version >= 6.6.0 */
> #endif /* RTE_EAL_VFIO */
Depending on kernel version macro is a mistake because many enterprise
distro's backport features and do not change kernel version.
Also, it means the build and target machine have to be same kernel version.
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [PATCH 2/4] vfio: add VFIO IOMMUFD support
2023-12-22 17:17 ` Stephen Hemminger
@ 2023-12-25 6:30 ` Xing, Beilei
0 siblings, 0 replies; 9+ messages in thread
From: Xing, Beilei @ 2023-12-25 6:30 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Burakov, Anatoly, dev, thomas, ferruh.yigit, Richardson, Bruce,
chenbox, Cao, Yahui
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Saturday, December 23, 2023 1:17 AM
> To: Xing, Beilei <beilei.xing@intel.com>
> Cc: Burakov, Anatoly <anatoly.burakov@intel.com>; dev@dpdk.org;
> thomas@monjalon.net; ferruh.yigit@amd.com; Richardson, Bruce
> <bruce.richardson@intel.com>; chenbox@nvidia.com; Cao, Yahui
> <yahui.cao@intel.com>
> Subject: Re: [PATCH 2/4] vfio: add VFIO IOMMUFD support
>
> On Fri, 22 Dec 2023 19:44:51 +0000
> beilei.xing@intel.com wrote:
>
> > diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
> > index 22832afd0f..7a9b26b0f7 100644
> > --- a/lib/eal/include/rte_vfio.h
> > +++ b/lib/eal/include/rte_vfio.h
> > @@ -17,6 +17,8 @@ extern "C" {
> > #include <stdbool.h>
> > #include <stdint.h>
> >
> > +#include <rte_compat.h>
> > +
> > /*
> > * determine if VFIO is present on the system
> > */
> > @@ -28,6 +30,9 @@ extern "C" {
> > #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) #define
> > HAVE_VFIO_DEV_REQ_INTERFACE #endif /* kernel version >= 4.0.0 */
> > +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) #define
> > +VFIO_IOMMUFD_PRESENT #endif /* kernel version >= 6.6.0 */
> > #endif /* RTE_EAL_VFIO */
>
> Depending on kernel version macro is a mistake because many enterprise
> distro's backport features and do not change kernel version.
Make sense. We defined VFIO_IOMMUFD_PRESENT with reference to
VFIO_PRESENT. Do you have suggestion for this point? Thanks a lot.
> Also, it means the build and target machine have to be same kernel version.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 3/4] bus/pci: add VFIO CDEV support
2023-12-22 19:44 [PATCH 0/4] add VFIO IOMMUFD/CDEV support beilei.xing
2023-12-22 19:44 ` [PATCH 1/4] iommufd: add IOMMUFD support beilei.xing
2023-12-22 19:44 ` [PATCH 2/4] vfio: add VFIO " beilei.xing
@ 2023-12-22 19:44 ` beilei.xing
2023-12-22 19:44 ` [PATCH 4/4] eal: add new args to choose VFIO mode beilei.xing
3 siblings, 0 replies; 9+ messages in thread
From: beilei.xing @ 2023-12-22 19:44 UTC (permalink / raw)
To: anatoly.burakov
Cc: dev, thomas, ferruh.yigit, bruce.richardson, chenbox, yahui.cao,
Beilei Xing
From: Beilei Xing <beilei.xing@intel.com>
This patch adds VFIO CDEV support to probe PCI devices.
For the VFIO subsystem, mainline Linux supports both the VFIO Container/Group
interface and the VFIO IOMMUFD/CDEV interface. Compared with the VFIO
Container interface, the VFIO IOMMUFD interface leaves the vfio device uAPI
unchanged, while I/O page table management is moved from the VFIO Container
into the IOMMUFD interface.
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
Signed-off-by: Yahui Cao <yahui.cao@intel.com>
---
drivers/bus/pci/bus_pci_driver.h | 1 +
drivers/bus/pci/linux/pci.c | 14 +++++++++
drivers/bus/pci/linux/pci_init.h | 4 +++
drivers/bus/pci/linux/pci_vfio.c | 52 ++++++++++++++++++++++++++------
4 files changed, 62 insertions(+), 9 deletions(-)
diff --git a/drivers/bus/pci/bus_pci_driver.h b/drivers/bus/pci/bus_pci_driver.h
index be32263a82..6ac25546cf 100644
--- a/drivers/bus/pci/bus_pci_driver.h
+++ b/drivers/bus/pci/bus_pci_driver.h
@@ -26,6 +26,7 @@ enum rte_pci_kernel_driver {
RTE_PCI_KDRV_NIC_UIO, /* nic_uio for FreeBSD */
RTE_PCI_KDRV_NONE, /* no attached driver */
RTE_PCI_KDRV_NET_UIO, /* NetUIO for Windows */
+ RTE_PCI_KDRV_VFIO_IOMMUFD, /* VFIO IOMMUFD for Linux */
};
/**
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 3d237398d9..1a37f5de22 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -65,6 +65,12 @@ rte_pci_map_device(struct rte_pci_device *dev)
#ifdef VFIO_PRESENT
if (pci_vfio_is_enabled())
ret = pci_vfio_map_resource(dev);
+#endif
+ break;
+ case RTE_PCI_KDRV_VFIO_IOMMUFD:
+#ifdef VFIO_IOMMUFD_PRESENT
+ if (pci_iommufd_is_enabled())
+ ret = pci_vfio_map_resource(dev);
#endif
break;
case RTE_PCI_KDRV_IGB_UIO:
@@ -94,6 +100,12 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
#ifdef VFIO_PRESENT
if (pci_vfio_is_enabled())
pci_vfio_unmap_resource(dev);
+#endif
+ break;
+ case RTE_PCI_KDRV_VFIO_IOMMUFD:
+#ifdef VFIO_IOMMUFD_PRESENT
+ if (pci_iommufd_is_enabled())
+ pci_vfio_unmap_resource(dev);
#endif
break;
case RTE_PCI_KDRV_IGB_UIO:
@@ -645,6 +657,7 @@ int rte_pci_read_config(const struct rte_pci_device *device,
return pci_uio_read_config(intr_handle, buf, len, offset);
#ifdef VFIO_PRESENT
case RTE_PCI_KDRV_VFIO:
+ case RTE_PCI_KDRV_VFIO_IOMMUFD:
return pci_vfio_read_config(device, buf, len, offset);
#endif
default:
@@ -669,6 +682,7 @@ int rte_pci_write_config(const struct rte_pci_device *device,
return pci_uio_write_config(intr_handle, buf, len, offset);
#ifdef VFIO_PRESENT
case RTE_PCI_KDRV_VFIO:
+ case RTE_PCI_KDRV_VFIO_IOMMUFD:
return pci_vfio_write_config(device, buf, len, offset);
#endif
default:
diff --git a/drivers/bus/pci/linux/pci_init.h b/drivers/bus/pci/linux/pci_init.h
index a4d37c0d0a..a096bc245b 100644
--- a/drivers/bus/pci/linux/pci_init.h
+++ b/drivers/bus/pci/linux/pci_init.h
@@ -79,4 +79,8 @@ int pci_vfio_is_enabled(void);
#endif
+#ifdef VFIO_IOMMUFD_PRESENT
+int pci_iommufd_is_enabled(void);
+#endif
+
#endif /* EAL_PCI_INIT_H_ */
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 3f3201daf2..97032231d7 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -21,6 +21,9 @@
#include <bus_driver.h>
#include <rte_spinlock.h>
#include <rte_tailq.h>
+#ifdef VFIO_IOMMUFD_PRESENT
+#include <rte_iommufd.h>
+#endif
#include "eal_filesystem.h"
@@ -783,10 +786,21 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
loc->domain, loc->bus, loc->devid, loc->function);
- ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
- &vfio_dev_fd, &device_info);
- if (ret)
- return ret;
+#ifdef VFIO_IOMMUFD_PRESENT
+ if (dev->kdrv == RTE_PCI_KDRV_VFIO_IOMMUFD) {
+ ret = rte_vfio_iommufd_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ &vfio_dev_fd, &device_info);
+ if (ret)
+ return ret;
+ } else {
+#endif
+ ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ &vfio_dev_fd, &device_info);
+ if (ret)
+ return ret;
+#ifdef VFIO_IOMMUFD_PRESENT
+ }
+#endif
if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd))
goto err_vfio_dev_fd;
@@ -1148,12 +1162,24 @@ pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
return -1;
}
- ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
- vfio_dev_fd);
- if (ret < 0) {
- RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
- return ret;
+#ifdef VFIO_IOMMUFD_PRESENT
+ if (dev->kdrv == RTE_PCI_KDRV_VFIO_IOMMUFD) {
+ ret = rte_vfio_iommufd_release_device(pci_addr, vfio_dev_fd);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
+ return ret;
+ }
+ } else {
+#endif
+ ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
+ vfio_dev_fd);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
+ return ret;
+ }
+#ifdef VFIO_IOMMUFD_PRESENT
}
+#endif
vfio_res_list =
RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
@@ -1327,3 +1353,11 @@ pci_vfio_is_enabled(void)
return rte_vfio_is_enabled("vfio_pci");
}
#endif
+
+#ifdef VFIO_IOMMUFD_PRESENT
+int
+pci_iommufd_is_enabled(void)
+{
+ return rte_iommufd_is_enabled("iommufd");
+}
+#endif
--
2.34.1
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 4/4] eal: add new args to choose VFIO mode
2023-12-22 19:44 [PATCH 0/4] add VFIO IOMMUFD/CDEV support beilei.xing
` (2 preceding siblings ...)
2023-12-22 19:44 ` [PATCH 3/4] bus/pci: add VFIO CDEV support beilei.xing
@ 2023-12-22 19:44 ` beilei.xing
2023-12-22 17:17 ` Stephen Hemminger
3 siblings, 1 reply; 9+ messages in thread
From: beilei.xing @ 2023-12-22 19:44 UTC (permalink / raw)
To: anatoly.burakov
Cc: dev, thomas, ferruh.yigit, bruce.richardson, chenbox, yahui.cao,
Beilei Xing
From: Beilei Xing <beilei.xing@intel.com>
Since Linux now supports both VFIO Container/GROUP and VFIO IOMMUFD/CDEV,
the user can choose how PCI devices are probed by using the new
"--vfio-mode" argument.
Use "--vfio-mode=container" to choose VFIO Container/GROUP, and use
"--vfio-mode=iommufd" to choose VFIO IOMMUFD/CDEV.
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
Signed-off-by: Yahui Cao <yahui.cao@intel.com>
---
drivers/bus/pci/linux/pci.c | 7 ++++-
lib/eal/common/eal_common_config.c | 6 ++++
lib/eal/common/eal_common_options.c | 48 ++++++++++++++++++++++++++++-
lib/eal/common/eal_internal_cfg.h | 1 +
lib/eal/common/eal_options.h | 2 ++
lib/eal/include/rte_eal.h | 18 +++++++++++
lib/eal/version.map | 1 +
7 files changed, 81 insertions(+), 2 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 1a37f5de22..24c7395f98 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -226,6 +226,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
struct rte_pci_device_internal *pdev;
struct rte_pci_device *dev;
char driver[PATH_MAX];
+ enum rte_vfio_mode vfio_mode;
int ret;
pdev = malloc(sizeof(*pdev));
@@ -317,6 +318,8 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
return -1;
}
+ vfio_mode = rte_eal_vfio_mode();
+
/* parse driver */
snprintf(filename, sizeof(filename), "%s/driver", dirname);
ret = pci_get_kernel_driver_by_path(filename, driver, sizeof(driver));
@@ -327,8 +330,10 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
}
if (!ret) {
- if (!strcmp(driver, "vfio-pci"))
+ if (!strcmp(driver, "vfio-pci") && vfio_mode == RTE_VFIO_CONTAINER)
dev->kdrv = RTE_PCI_KDRV_VFIO;
+ else if (!strcmp(driver, "vfio-pci") && vfio_mode == RTE_VFIO_IOMMUFD)
+ dev->kdrv = RTE_PCI_KDRV_VFIO_IOMMUFD;
else if (!strcmp(driver, "igb_uio"))
dev->kdrv = RTE_PCI_KDRV_IGB_UIO;
else if (!strcmp(driver, "uio_pci_generic"))
diff --git a/lib/eal/common/eal_common_config.c b/lib/eal/common/eal_common_config.c
index 0daf0f3188..cb3368095d 100644
--- a/lib/eal/common/eal_common_config.c
+++ b/lib/eal/common/eal_common_config.c
@@ -58,6 +58,12 @@ rte_eal_iova_mode(void)
return rte_eal_get_configuration()->iova_mode;
}
+enum rte_vfio_mode
+rte_eal_vfio_mode(void)
+{
+ return internal_config.vfio_mode;
+}
+
/* Get the EAL base address */
uint64_t
rte_eal_get_baseaddr(void)
diff --git a/lib/eal/common/eal_common_options.c b/lib/eal/common/eal_common_options.c
index a6d21f1cba..7df1fa0821 100644
--- a/lib/eal/common/eal_common_options.c
+++ b/lib/eal/common/eal_common_options.c
@@ -35,6 +35,7 @@
#include <rte_telemetry.h>
#endif
#include <rte_vect.h>
+#include <rte_vfio.h>
#include "eal_internal_cfg.h"
#include "eal_options.h"
@@ -96,6 +97,7 @@ eal_long_options[] = {
{OPT_SYSLOG, 1, NULL, OPT_SYSLOG_NUM },
{OPT_VDEV, 1, NULL, OPT_VDEV_NUM },
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
+ {OPT_VFIO_MODE, 1, NULL, OPT_VFIO_MODE_NUM },
{OPT_VFIO_VF_TOKEN, 1, NULL, OPT_VFIO_VF_TOKEN_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
{OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM },
@@ -1598,6 +1600,42 @@ available_cores(void)
return str;
}
+static int
+eal_parse_vfio_mode(const char *name)
+{
+ int mode;
+ struct internal_config *internal_conf =
+ eal_get_internal_configuration();
+#ifdef VFIO_IOMMUFD_PRESENT
+ char dirname[PATH_MAX] = VFIO_CDEV_CLASS_DIR;
+#endif
+
+ if (name == NULL)
+ return -1;
+
+ if (!strcmp("container", name)) {
+ mode = RTE_VFIO_CONTAINER;
+ } else if (!strcmp("iommufd", name)) {
+#ifdef VFIO_IOMMUFD_PRESENT
+ if (opendir(dirname) == NULL) {
+ RTE_LOG(WARNING, EAL, "vfio cdev isn't supported, change to vfio container mode\n");
+ mode = RTE_VFIO_CONTAINER;
+ } else {
+ mode = RTE_VFIO_IOMMUFD;
+ }
+#else
+ RTE_LOG(WARNING, EAL, "vfio cdev isn't supported, change to vfio container mode\n");
+ mode = RTE_VFIO_CONTAINER;
+#endif
+ } else {
+ RTE_LOG(ERR, EAL, "unsupported vfio mode\n");
+ return -1;
+ }
+
+ internal_conf->vfio_mode = mode;
+ return 0;
+}
+
#define HUGE_UNLINK_NEVER "never"
static int
@@ -1922,7 +1960,13 @@ eal_parse_common_option(int opt, const char *optarg,
return -1;
}
break;
-
+ case OPT_VFIO_MODE_NUM:
+ if (eal_parse_vfio_mode(optarg) < 0) {
+ RTE_LOG(ERR, EAL, "invalid parameters for --"
+ OPT_VFIO_MODE "\n");
+ return -1;
+ }
+ break;
/* don't know what to do, leave this to caller */
default:
return 1;
@@ -2189,6 +2233,8 @@ eal_common_usage(void)
" (ex: --vdev=net_pcap0,iface=eth2).\n"
" --"OPT_IOVA_MODE" Set IOVA mode. 'pa' for IOVA_PA\n"
" 'va' for IOVA_VA\n"
+ " --"OPT_VFIO_MODE" Set VFIO mode. 'container' for VFIO_CONTAINER\n"
+ " 'cdev' for VFIO_IOMMUFD\n"
" -d LIB.so|DIR Add a driver or driver directory\n"
" (can be used multiple times)\n"
" --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n"
diff --git a/lib/eal/common/eal_internal_cfg.h b/lib/eal/common/eal_internal_cfg.h
index 167ec501fa..525c7c88a3 100644
--- a/lib/eal/common/eal_internal_cfg.h
+++ b/lib/eal/common/eal_internal_cfg.h
@@ -103,6 +103,7 @@ struct internal_config {
struct simd_bitwidth max_simd_bitwidth;
/**< max simd bitwidth path to use */
size_t huge_worker_stack_size; /**< worker thread stack size */
+ enum rte_vfio_mode vfio_mode; /**< Set VFIO mode */
};
void eal_reset_internal_config(struct internal_config *internal_cfg);
diff --git a/lib/eal/common/eal_options.h b/lib/eal/common/eal_options.h
index 3cc9cb6412..ff8666d378 100644
--- a/lib/eal/common/eal_options.h
+++ b/lib/eal/common/eal_options.h
@@ -89,6 +89,8 @@ enum {
OPT_FORCE_MAX_SIMD_BITWIDTH_NUM,
#define OPT_HUGE_WORKER_STACK "huge-worker-stack"
OPT_HUGE_WORKER_STACK_NUM,
+#define OPT_VFIO_MODE "vfio-mode"
+ OPT_VFIO_MODE_NUM,
OPT_LONG_MAX_NUM
};
diff --git a/lib/eal/include/rte_eal.h b/lib/eal/include/rte_eal.h
index c2256f832e..1117b578e8 100644
--- a/lib/eal/include/rte_eal.h
+++ b/lib/eal/include/rte_eal.h
@@ -472,6 +472,24 @@ enum rte_iova_mode {
*/
enum rte_iova_mode rte_eal_iova_mode(void);
+/**
+ * VFIO mode.
+ */
+enum rte_vfio_mode {
+ RTE_VFIO_CONTAINER = 0, /* vfio container mode */
+ RTE_VFIO_IOMMUFD = 1 /* vfio iommufd mode */
+};
+
+/**
+ * Get the vfio mode
+ *
+ * @return
+ * enum rte_vfio_mode value.
+ */
+
+__rte_experimental
+enum rte_vfio_mode rte_eal_vfio_mode(void);
+
/**
* Get user provided pool ops name for mbuf
*
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 9c1e70feca..42451f12de 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -394,6 +394,7 @@ EXPERIMENTAL {
rte_memzone_max_get;
rte_memzone_max_set;
+ rte_eal_vfio_mode; # WINDOWS_NO_EXPORT
rte_iommufd_enable; # WINDOWS_NO_EXPORT
rte_iommufd_is_enabled; # WINDOWS_NO_EXPORT
rte_vfio_iommufd_release_device; # WINDOWS_NO_EXPORT
--
2.34.1
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 4/4] eal: add new args to choose VFIO mode
2023-12-22 19:44 ` [PATCH 4/4] eal: add new args to choose VFIO mode beilei.xing
@ 2023-12-22 17:17 ` Stephen Hemminger
2023-12-25 6:06 ` Xing, Beilei
0 siblings, 1 reply; 9+ messages in thread
From: Stephen Hemminger @ 2023-12-22 17:17 UTC (permalink / raw)
To: beilei.xing
Cc: anatoly.burakov, dev, thomas, ferruh.yigit, bruce.richardson,
chenbox, yahui.cao
On Fri, 22 Dec 2023 19:44:53 +0000
beilei.xing@intel.com wrote:
> From: Beilei Xing <beilei.xing@intel.com>
>
> Since now Linux has both of VFIO Container/GROUP & VFIO IOMMUFD/CDEV
> support, user can determine how to probe the PCI device by the new
> args "--vfio-mode".
>
> Use "--vfio-mode=container" to choose VFIO Container/GROUP, and use
> "--vfio-mode=iommufd" to choose VFIO IOMMUFD/CDEV.
>
> Signed-off-by: Beilei Xing <beilei.xing@intel.com>
> Signed-off-by: Yahui Cao <yahui.cao@intel.com>
Can't this be automatic? Users don't need more EAL options.
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [PATCH 4/4] eal: add new args to choose VFIO mode
2023-12-22 17:17 ` Stephen Hemminger
@ 2023-12-25 6:06 ` Xing, Beilei
0 siblings, 0 replies; 9+ messages in thread
From: Xing, Beilei @ 2023-12-25 6:06 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Burakov, Anatoly, dev, thomas, ferruh.yigit, Richardson, Bruce,
chenbox, Cao, Yahui
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Saturday, December 23, 2023 1:18 AM
> To: Xing, Beilei <beilei.xing@intel.com>
> Cc: Burakov, Anatoly <anatoly.burakov@intel.com>; dev@dpdk.org;
> thomas@monjalon.net; ferruh.yigit@amd.com; Richardson, Bruce
> <bruce.richardson@intel.com>; chenbox@nvidia.com; Cao, Yahui
> <yahui.cao@intel.com>
> Subject: Re: [PATCH 4/4] eal: add new args to choose VFIO mode
>
> On Fri, 22 Dec 2023 19:44:53 +0000
> beilei.xing@intel.com wrote:
>
> > From: Beilei Xing <beilei.xing@intel.com>
> >
> > Since now Linux has both of VFIO Container/GROUP & VFIO IOMMUFD/CDEV
> > support, user can determine how to probe the PCI device by the new
> > args "--vfio-mode".
> >
> > Use "--vfio-mode=container" to choose VFIO Container/GROUP, and use
> > "--vfio-mode=iommufd" to choose VFIO IOMMUFD/CDEV.
> >
> > Signed-off-by: Beilei Xing <beilei.xing@intel.com>
> > Signed-off-by: Yahui Cao <yahui.cao@intel.com>
>
> Can't this be automatic, users don't need more EAL options.
Thanks for your review. Since Linux currently supports both VFIO Container/GROUP and VFIO
IOMMUFD/CDEV, I think users can choose which mode they want. New
IOMMU features (e.g. PASID/SSID) may only be available through the VFIO IOMMUFD/CDEV
interface. VFIO Container/GROUP may be deprecated in the future, and then DPDK will
use iommufd mode automatically.
.
^ permalink raw reply [flat|nested] 9+ messages in thread