DPDK patches and discussions
 help / color / mirror / Atom feed
From: Chenbo Xia <chenbo.xia@intel.com>
To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com,
	jingjing.wu@intel.com
Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu,
	nhorman@tuxdriver.com, bruce.richardson@intel.com,
	david.marchand@redhat.com, stephen@networkplumber.org,
	konstantin.ananyev@intel.com, Tiwei Bie <tiwei.bie@intel.com>
Subject: [dpdk-dev] [RFC v3 5/6] bus/pci: add mdev support
Date: Tue,  1 Jun 2021 11:06:43 +0800	[thread overview]
Message-ID: <20210601030644.3318-6-chenbo.xia@intel.com> (raw)
In-Reply-To: <20210601030644.3318-1-chenbo.xia@intel.com>

From: Tiwei Bie <tiwei.bie@intel.com>

This patch adds mdev (mediated device) support to the PCI bus
driver. With this patch, the PCI bus driver is able to scan and
probe the mediated PCI devices (i.e. the mediated devices whose
device API is "vfio-pci") present in the system.

Mediated PCI devices differ from physical PCI devices in several
ways:

- Mediated PCI devices have to be accessed through the VFIO API;
- The regions of a mediated PCI device may not be mmap-able, in
  which case drivers need to call read/write functions to access
  them;
- Mediated PCI devices use a UUID as the device address.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
Signed-off-by: Chenbo Xia <chenbo.xia@intel.com>
---
 drivers/bus/pci/linux/pci.c           |  30 ++-
 drivers/bus/pci/linux/pci_init.h      |  15 +-
 drivers/bus/pci/linux/pci_vfio.c      | 147 ++++++++++++--
 drivers/bus/pci/linux/pci_vfio_mdev.c | 277 ++++++++++++++++++++++++++
 drivers/bus/pci/meson.build           |   1 +
 drivers/bus/pci/pci_common.c          |  84 +++++---
 drivers/bus/pci/pci_params.c          |  36 +++-
 drivers/bus/pci/private.h             |  17 ++
 drivers/bus/pci/rte_bus_pci.h         |  17 +-
 lib/eal/linux/eal.c                   |  17 +-
 10 files changed, 571 insertions(+), 70 deletions(-)
 create mode 100644 drivers/bus/pci/linux/pci_vfio_mdev.c

diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 4805f277c5..29dd9ba26f 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -30,7 +30,7 @@
 
 extern struct rte_pci_bus rte_pci_bus;
 
-static int
+int
 pci_get_kernel_driver_by_path(const char *filename, char *dri_name,
 			      size_t len)
 {
@@ -70,7 +70,7 @@ rte_pci_map_device(struct rte_pci_device *dev)
 	switch (dev->kdrv) {
 	case RTE_PCI_KDRV_VFIO:
 #ifdef VFIO_PRESENT
-		if (pci_vfio_is_enabled())
+		if (pci_vfio_is_enabled(dev))
 			ret = pci_vfio_map_resource(dev);
 #endif
 		break;
@@ -99,7 +99,7 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
 	switch (dev->kdrv) {
 	case RTE_PCI_KDRV_VFIO:
 #ifdef VFIO_PRESENT
-		if (pci_vfio_is_enabled())
+		if (pci_vfio_is_enabled(dev))
 			pci_vfio_unmap_resource(dev);
 #endif
 		break;
@@ -347,6 +347,15 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
 		int ret;
 
 		TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
+			/*
+			 * Insert physical PCI devices before all mediated
+			 * PCI devices.
+			 */
+			if (dev2->is_mdev) {
+				rte_pci_insert_device(dev2, dev);
+				return 0;
+			}
+
 			ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
 			if (ret > 0)
 				continue;
@@ -465,8 +474,14 @@ rte_pci_scan(void)
 		return 0;
 
 #ifdef VFIO_PRESENT
-	if (!pci_vfio_is_enabled())
-		RTE_LOG(DEBUG, EAL, "VFIO PCI modules not loaded\n");
+	if (!rte_vfio_is_enabled("vfio_pci"))
+		RTE_LOG(DEBUG, EAL, "VFIO PCI module not loaded\n");
+
+	if (!rte_vfio_is_enabled("vfio_mdev"))
+		RTE_LOG(DEBUG, EAL, "VFIO MDEV module not loaded\n");
+
+	if (pci_scan_mdev() != 0)
+		return -1;
 #endif
 
 	dir = opendir(rte_pci_get_sysfs_path());
@@ -737,7 +752,7 @@ rte_pci_ioport_map(struct rte_pci_device *dev, int bar,
 	switch (dev->kdrv) {
 #ifdef VFIO_PRESENT
 	case RTE_PCI_KDRV_VFIO:
-		if (pci_vfio_is_enabled())
+		if (pci_vfio_is_enabled(dev))
 			ret = pci_vfio_ioport_map(dev, bar, p);
 		break;
 #endif
@@ -801,8 +816,7 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p)
 	switch (p->dev->kdrv) {
 #ifdef VFIO_PRESENT
 	case RTE_PCI_KDRV_VFIO:
-		if (pci_vfio_is_enabled())
-			ret = pci_vfio_ioport_unmap(p);
+		ret = -1;
 		break;
 #endif
 	case RTE_PCI_KDRV_IGB_UIO:
diff --git a/drivers/bus/pci/linux/pci_init.h b/drivers/bus/pci/linux/pci_init.h
index 6853fa88a3..0c0191b6d5 100644
--- a/drivers/bus/pci/linux/pci_init.h
+++ b/drivers/bus/pci/linux/pci_init.h
@@ -19,6 +19,9 @@
 extern void *pci_map_addr;
 void *pci_find_max_end_va(void);
 
+int pci_get_kernel_driver_by_path(const char *filename, char *dri_name,
+				  size_t len);
+
 /* parse one line of the "resource" sysfs file (note that the 'line'
  * string is modified)
  */
@@ -93,7 +96,17 @@ int pci_vfio_ioport_unmap(struct rte_pci_ioport *p);
 int pci_vfio_map_resource(struct rte_pci_device *dev);
 int pci_vfio_unmap_resource(struct rte_pci_device *dev);
 
-int pci_vfio_is_enabled(void);
+int pci_vfio_is_enabled(struct rte_pci_device *dev);
+
+int pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd,
+			  struct vfio_device_info *device_info);
+
+int pci_vfio_get_pci_id(struct rte_pci_device *dev, int vfio_dev_fd,
+			struct rte_pci_id *pci_id);
+
+const char *pci_mdev_get_sysfs_path(void);
+
+int pci_scan_mdev(void);
 
 #endif
 
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 3ecd984215..00ba5db03a 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -21,6 +21,7 @@
 #include <rte_bus.h>
 #include <rte_spinlock.h>
 #include <rte_tailq.h>
+#include <rte_uuid.h>
 
 #include "eal_filesystem.h"
 
@@ -741,7 +742,7 @@ pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
 	return ret;
 }
 
-static int
+int
 pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd,
 		      struct vfio_device_info *device_info)
 {
@@ -776,6 +777,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
 	struct vfio_region_info *reg = NULL;
 	char pci_addr[PATH_MAX] = {0};
+	const char *sysfs_base;
 	int vfio_dev_fd;
 	struct rte_pci_addr *loc = &dev->addr;
 	int i, ret;
@@ -791,11 +793,17 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 #endif
 
 	/* store PCI address string */
-	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+	if (dev->is_mdev) {
+		sysfs_base = pci_mdev_get_sysfs_path();
+		rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr));
+	} else {
+		sysfs_base = rte_pci_get_sysfs_path();
+		snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
 			loc->domain, loc->bus, loc->devid, loc->function);
+	}
 
-	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
-					&vfio_dev_fd, &device_info);
+	ret = rte_vfio_setup_device(sysfs_base, pci_addr, &vfio_dev_fd,
+		&device_info);
 	if (ret)
 		return ret;
 
@@ -806,7 +814,13 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 			"Cannot store VFIO mmap details\n");
 		goto err_vfio_dev_fd;
 	}
-	memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
+
+	vfio_res->is_mdev = dev->is_mdev;
+	if (dev->is_mdev)
+		memcpy(&vfio_res->uuid, &dev->uuid, sizeof(vfio_res->uuid));
+	else
+		memcpy(&vfio_res->pci_addr, &dev->addr,
+			sizeof(vfio_res->pci_addr));
 
 	/* get number of registers (up to BAR5) */
 	vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
@@ -938,6 +952,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 {
 	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
 	char pci_addr[PATH_MAX] = {0};
+	const char *sysfs_base;
 	int vfio_dev_fd;
 	struct rte_pci_addr *loc = &dev->addr;
 	int i, ret;
@@ -953,15 +968,29 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 #endif
 
 	/* store PCI address string */
-	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+	if (dev->is_mdev) {
+		sysfs_base = pci_mdev_get_sysfs_path();
+		rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr));
+	} else {
+		sysfs_base = rte_pci_get_sysfs_path();
+		snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
 			loc->domain, loc->bus, loc->devid, loc->function);
+	}
 
 	/* if we're in a secondary process, just find our tailq entry */
 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
-		if (rte_pci_addr_cmp(&vfio_res->pci_addr,
-						 &dev->addr))
+		if (dev->is_mdev != vfio_res->is_mdev)
 			continue;
-		break;
+
+		if (!dev->is_mdev && !rte_pci_addr_cmp(&vfio_res->pci_addr,
+			&dev->addr))
+			break;
+
+		if (dev->is_mdev && !rte_uuid_compare(vfio_res->uuid,
+			dev->uuid))
+			break;
+
+		continue;
 	}
 	/* if we haven't found our tailq entry, something's wrong */
 	if (vfio_res == NULL) {
@@ -970,8 +999,8 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 		return -1;
 	}
 
-	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
-					&vfio_dev_fd, &device_info);
+	ret = rte_vfio_setup_device(sysfs_base, pci_addr, &vfio_dev_fd,
+		&device_info);
 	if (ret)
 		return ret;
 
@@ -1030,9 +1059,18 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 
 	/* Get vfio_res */
 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
-		if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
+		if (dev->is_mdev != vfio_res->is_mdev)
 			continue;
-		break;
+
+		if (!dev->is_mdev && !rte_pci_addr_cmp(&vfio_res->pci_addr,
+			&dev->addr))
+			break;
+
+		if (dev->is_mdev && !rte_uuid_compare(vfio_res->uuid,
+			dev->uuid))
+			break;
+
+		continue;
 	}
 
 	if  (vfio_res == NULL)
@@ -1061,6 +1099,7 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 static int
 pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
 {
+	const char *sysfs_base;
 	char pci_addr[PATH_MAX] = {0};
 	struct rte_pci_addr *loc = &dev->addr;
 	struct mapped_pci_resource *vfio_res = NULL;
@@ -1068,8 +1107,14 @@ pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
 	int ret;
 
 	/* store PCI address string */
-	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+	if (dev->is_mdev) {
+		sysfs_base = pci_mdev_get_sysfs_path();
+		rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr));
+	} else {
+		sysfs_base = rte_pci_get_sysfs_path();
+		snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
 			loc->domain, loc->bus, loc->devid, loc->function);
+	}
 
 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 	ret = pci_vfio_disable_notifier(dev);
@@ -1091,8 +1136,8 @@ pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
 		return -1;
 	}
 
-	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
-				  dev->intr_handle.vfio_dev_fd);
+	ret = rte_vfio_release_device(sysfs_base, pci_addr,
+		dev->intr_handle.vfio_dev_fd);
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
 		return ret;
@@ -1117,6 +1162,7 @@ pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
 static int
 pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
 {
+	const char *sysfs_base;
 	char pci_addr[PATH_MAX] = {0};
 	struct rte_pci_addr *loc = &dev->addr;
 	struct mapped_pci_resource *vfio_res = NULL;
@@ -1124,11 +1170,17 @@ pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
 	int ret;
 
 	/* store PCI address string */
-	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+	if (dev->is_mdev) {
+		sysfs_base = pci_mdev_get_sysfs_path();
+		rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr));
+	} else {
+		sysfs_base = rte_pci_get_sysfs_path();
+		snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
 			loc->domain, loc->bus, loc->devid, loc->function);
+	}
 
-	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
-				  dev->intr_handle.vfio_dev_fd);
+	ret = rte_vfio_release_device(sysfs_base, pci_addr,
+		dev->intr_handle.vfio_dev_fd);
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
 		return ret;
@@ -1249,8 +1301,61 @@ pci_vfio_mmio_write(const struct rte_pci_device *dev, int bar,
 }
 
 int
-pci_vfio_is_enabled(void)
+pci_vfio_is_enabled(struct rte_pci_device *dev)
 {
-	return rte_vfio_is_enabled("vfio_pci");
+	return rte_vfio_is_enabled(dev->is_mdev ? "vfio_mdev" : "vfio_pci");
 }
+
+int
+pci_vfio_get_pci_id(struct rte_pci_device *dev, int vfio_dev_fd,
+		    struct rte_pci_id *pci_id)
+{
+	uint64_t size, offset;
+	int class;
+
+	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
+				&size, &offset) != 0) {
+		RTE_LOG(DEBUG, EAL, "Cannot get offset of CONFIG region.\n");
+		return -1;
+	}
+
+	/* vendor_id */
+	if (pread64(vfio_dev_fd, &pci_id->vendor_id, sizeof(uint16_t),
+		    offset + PCI_VENDOR_ID) != sizeof(uint16_t)) {
+		RTE_LOG(DEBUG, EAL, "Cannot read VendorID from PCI config space\n");
+		return -1;
+	}
+
+	/* device_id */
+	if (pread64(vfio_dev_fd, &pci_id->device_id, sizeof(uint16_t),
+		    offset + PCI_DEVICE_ID) != sizeof(uint16_t)) {
+		RTE_LOG(DEBUG, EAL, "Cannot read DeviceID from PCI config space\n");
+		return -1;
+	}
+
+	/* subsystem_vendor_id */
+	if (pread64(vfio_dev_fd, &pci_id->subsystem_vendor_id, sizeof(uint16_t),
+		    offset + PCI_SUBSYSTEM_VENDOR_ID) != sizeof(uint16_t)) {
+		RTE_LOG(DEBUG, EAL, "Cannot read SubVendorID from PCI config space\n");
+		return -1;
+	}
+
+	/* subsystem_device_id */
+	if (pread64(vfio_dev_fd, &pci_id->subsystem_device_id, sizeof(uint16_t),
+		    offset + PCI_SUBSYSTEM_ID) != sizeof(uint16_t)) {
+		RTE_LOG(DEBUG, EAL, "Cannot read SubDeviceID from PCI config space\n");
+		return -1;
+	}
+
+	/* class_id */
+	if (pread64(vfio_dev_fd, &class, sizeof(uint32_t),
+		    offset + PCI_CLASS_REVISION) != sizeof(uint32_t)) {
+		RTE_LOG(DEBUG, EAL, "Cannot read ClassID from PCI config space\n");
+		return -1;
+	}
+	pci_id->class_id = class >> 8;
+
+	return 0;
+}
+
 #endif
diff --git a/drivers/bus/pci/linux/pci_vfio_mdev.c b/drivers/bus/pci/linux/pci_vfio_mdev.c
new file mode 100644
index 0000000000..ef25749a0d
--- /dev/null
+++ b/drivers/bus/pci/linux/pci_vfio_mdev.c
@@ -0,0 +1,277 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <string.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/pci_regs.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+#include <rte_devargs.h>
+#include <rte_memcpy.h>
+#include <rte_vfio.h>
+#include <rte_uuid.h>
+
+#include "eal_private.h"
+#include "eal_filesystem.h"
+
+#include "private.h"
+#include "pci_init.h"
+
+#ifdef VFIO_PRESENT
+
+extern struct rte_pci_bus rte_pci_bus;
+
+#define SYSFS_MDEV_DEVICES "/sys/bus/mdev/devices"
+
+const char *pci_mdev_get_sysfs_path(void)
+{
+	const char *path = NULL;
+
+	path = getenv("SYSFS_MDEV_DEVICES");
+	if (path == NULL)
+		return SYSFS_MDEV_DEVICES;
+
+	return path;
+}
+
+static int
+is_pci_device(const char *dirname)
+{
+	char device_api[PATH_MAX];
+	char filename[PATH_MAX];
+	char *ptr;
+
+	/* get device_api */
+	snprintf(filename, sizeof(filename), "%s/mdev_type/device_api",
+		 dirname);
+
+	if (rte_eal_parse_sysfs_str(filename, device_api,
+				    sizeof(device_api)) < 0) {
+		return -1;
+	}
+
+	ptr = strchr(device_api, '\n');
+	if (ptr != NULL)
+		*ptr = '\0';
+
+	return strcmp(device_api, "vfio-pci") == 0;
+}
+
+static int
+pci_scan_one_mdev(const char *dirname, const rte_uuid_t addr)
+{
+	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+	char name[RTE_UUID_STRLEN];
+	char filename[PATH_MAX];
+	char path[PATH_MAX];
+	char driver[PATH_MAX];
+	char *ptr;
+	struct rte_pci_device_internal *pdev;
+	struct rte_pci_device *dev;
+	bool need_release = false;
+	const char *sysfs_base;
+	unsigned long tmp;
+	int vfio_dev_fd;
+	int ret;
+
+	sysfs_base = pci_mdev_get_sysfs_path();
+
+	pdev = malloc(sizeof(*pdev));
+	if (pdev == NULL)
+		return -1;
+
+	memset(pdev, 0, sizeof(*pdev));
+
+	dev = &pdev->device;
+	dev->device.bus = &rte_pci_bus.bus;
+	rte_uuid_unparse(addr, name, sizeof(name));
+
+	/* parse driver */
+	snprintf(filename, sizeof(filename), "%s/driver", dirname);
+	ret = pci_get_kernel_driver_by_path(filename, driver, sizeof(driver));
+	if (ret < 0) {
+		RTE_LOG(DEBUG, EAL, "%s: failed to get kernel driver\n", name);
+		goto err;
+	}
+
+	if (ret != 0 || strcmp(driver, "vfio_mdev") != 0) {
+		RTE_LOG(DEBUG, EAL, "%s: unsupported mdev driver\n", name);
+		goto err;
+	}
+
+	dev->kdrv = RTE_PCI_KDRV_VFIO;
+
+	dev->is_mdev = 1;
+	rte_uuid_copy(dev->uuid, addr);
+
+	snprintf(filename, sizeof(filename), "%s/%s", sysfs_base, name);
+
+	/* Get the path of the parent device. */
+	if (realpath(filename, path) == NULL) {
+		RTE_LOG(DEBUG, EAL, "%s: failed to get parent device\n", name);
+		goto err;
+	}
+
+	ptr = strrchr(path, '/');
+	if (ptr == NULL) {
+		RTE_LOG(DEBUG, EAL, "%s: failed to parse parent device\n",
+			name);
+		goto err;
+	}
+	*ptr = '\0';
+
+	/* get numa node, default to 0 if not present */
+	snprintf(filename, sizeof(filename), "%s/numa_node", path);
+
+	if (access(filename, F_OK) != -1) {
+		if (eal_parse_sysfs_value(filename, &tmp) == 0)
+			dev->device.numa_node = tmp;
+		else
+			dev->device.numa_node = -1;
+	} else {
+		dev->device.numa_node = 0;
+	}
+
+	pci_name_set(dev);
+
+	if (rte_vfio_setup_device(sysfs_base, name, &vfio_dev_fd,
+				  &device_info) != 0) {
+		RTE_LOG(DEBUG, EAL, "%s: failed to setup device\n", name);
+		goto err;
+	}
+
+	need_release = true;
+
+	if (pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info) != 0) {
+		RTE_LOG(DEBUG, EAL, "%s: failed to get regions\n", name);
+		goto err;
+	}
+
+	if (pci_vfio_get_pci_id(dev, vfio_dev_fd, &dev->id) != 0) {
+		RTE_LOG(DEBUG, EAL, "%s: failed to access the device\n", name);
+		goto err;
+	}
+
+	/* device is valid, add to the list (sorted) */
+	if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
+		rte_pci_add_device(dev);
+	} else {
+		struct rte_pci_device *dev2;
+		int ret;
+
+		TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
+			/*
+			 * Insert mediated PCI devices after all physical
+			 * PCI devices.
+			 */
+			if (!dev2->is_mdev)
+				continue;
+			ret = rte_uuid_compare(dev->uuid, dev2->uuid);
+			if (ret > 0)
+				continue;
+			if (ret < 0)
+				rte_pci_insert_device(dev2, dev);
+			else {/* already registered */
+				if (!rte_dev_is_probed(&dev2->device)) {
+					dev2->kdrv = dev->kdrv;
+					dev2->max_vfs = dev->max_vfs;
+					pci_name_set(dev2);
+					memmove(dev2->mem_resource,
+						dev->mem_resource,
+						sizeof(dev->mem_resource));
+				} else {
+					/**
+					 * If device is plugged and driver is
+					 * probed already, (This happens when
+					 * we call rte_dev_probe which will
+					 * scan all device on the bus) we don't
+					 * need to do anything here unless...
+					 **/
+					if (dev2->kdrv != dev->kdrv ||
+						dev2->max_vfs != dev->max_vfs ||
+						memcmp(&dev2->id, &dev->id,
+							sizeof(dev2->id)))
+						/*
+						 * This should not happen.
+						 * But it is still possible if
+						 * we unbind a device from
+						 * vfio or uio before hotplug
+						 * remove and rebind it with
+						 * a different configure.
+						 * So we just print out the
+						 * error as an alarm.
+						 */
+						RTE_LOG(ERR, EAL, "Unexpected device scan at %s!\n",
+							filename);
+					else if (dev2->device.devargs !=
+						 dev->device.devargs) {
+						rte_devargs_remove(dev2->device.devargs);
+						pci_name_set(dev2);
+					}
+				}
+				free(pdev);
+			}
+			return 0;
+		}
+
+		rte_pci_add_device(dev);
+	}
+
+	return 0;
+
+err:
+	if (need_release)
+		rte_vfio_release_device(sysfs_base, name, vfio_dev_fd);
+	free(pdev);
+	return 1;
+}
+
+int
+pci_scan_mdev(void)
+{
+	struct dirent *e;
+	DIR *dir;
+	char dirname[PATH_MAX];
+	rte_uuid_t addr;
+
+	dir = opendir(pci_mdev_get_sysfs_path());
+	if (dir == NULL) {
+		RTE_LOG(DEBUG, EAL, "%s(): opendir failed: %s\n",
+			__func__, strerror(errno));
+		return 0;
+	}
+
+	while ((e = readdir(dir)) != NULL) {
+		if (e->d_name[0] == '.')
+			continue;
+
+		if (rte_uuid_parse(e->d_name, addr) != 0)
+			continue;
+
+		if (rte_mdev_ignore_device(addr))
+			continue;
+
+		snprintf(dirname, sizeof(dirname), "%s/%s",
+			 pci_mdev_get_sysfs_path(), e->d_name);
+
+		if (!is_pci_device(dirname))
+			continue;
+
+		if (pci_scan_one_mdev(dirname, addr) < 0)
+			goto error;
+	}
+	closedir(dir);
+	return 0;
+
+error:
+	closedir(dir);
+	return -1;
+}
+
+#endif /* VFIO_PRESENT */
diff --git a/drivers/bus/pci/meson.build b/drivers/bus/pci/meson.build
index 81c7e94c00..fb7a9a1fa8 100644
--- a/drivers/bus/pci/meson.build
+++ b/drivers/bus/pci/meson.build
@@ -11,6 +11,7 @@ if is_linux
             'linux/pci.c',
             'linux/pci_uio.c',
             'linux/pci_vfio.c',
+            'linux/pci_vfio_mdev.c',
     )
     includes += include_directories('linux')
 endif
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index 1c368c254c..1984dbdba0 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -24,6 +24,7 @@
 #include <rte_common.h>
 #include <rte_devargs.h>
 #include <rte_vfio.h>
+#include <rte_uuid.h>
 
 #include "private.h"
 
@@ -57,15 +58,34 @@ pci_devargs_lookup(const struct rte_pci_addr *pci_addr)
 	return NULL;
 }
 
+static struct rte_devargs *
+mdev_devargs_lookup(const rte_uuid_t mdev_addr)
+{
+	struct rte_devargs *devargs;
+	rte_uuid_t id;
+
+	RTE_EAL_DEVARGS_FOREACH("pci", devargs) {
+		devargs->bus->parse(devargs->name, &id);
+		if (!rte_uuid_compare(mdev_addr, id))
+			return devargs;
+	}
+	return NULL;
+}
+
 void
 pci_name_set(struct rte_pci_device *dev)
 {
 	struct rte_devargs *devargs;
 
 	/* Each device has its internal, canonical name set. */
-	rte_pci_device_name(&dev->addr,
-			dev->name, sizeof(dev->name));
-	devargs = pci_devargs_lookup(&dev->addr);
+	if (dev->is_mdev) {
+		rte_uuid_unparse(dev->uuid, dev->name, sizeof(dev->name));
+		devargs = mdev_devargs_lookup(dev->uuid);
+	} else {
+		rte_pci_device_name(&dev->addr, dev->name, sizeof(dev->name));
+		devargs = pci_devargs_lookup(&dev->addr);
+	}
+
 	dev->device.devargs = devargs;
 
 	/* When using a blocklist, only blocked devices will have
@@ -166,21 +186,17 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
 {
 	int ret;
 	bool already_probed;
-	struct rte_pci_addr *loc;
 
 	if ((dr == NULL) || (dev == NULL))
 		return -EINVAL;
 
-	loc = &dev->addr;
-
 	/* The device is not blocked; Check if driver supports it */
 	if (!rte_pci_match(dr, dev))
 		/* Match of device and driver failed */
 		return 1;
 
-	RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n",
-			loc->domain, loc->bus, loc->devid, loc->function,
-			dev->device.numa_node);
+	RTE_LOG(DEBUG, EAL, "PCI device %s on NUMA socket %i\n",
+		dev->name, dev->device.numa_node);
 
 	/* no initialization when marked as blocked, return without error */
 	if (dev->device.devargs != NULL &&
@@ -235,10 +251,9 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
 		}
 	}
 
-	RTE_LOG(INFO, EAL, "Probe PCI driver: %s (%x:%x) device: "PCI_PRI_FMT" (socket %i)\n",
+	RTE_LOG(INFO, EAL, "Probe PCI driver: %s (%x:%x) device: %s (socket %i)\n",
 			dr->driver.name, dev->id.vendor_id, dev->id.device_id,
-			loc->domain, loc->bus, loc->devid, loc->function,
-			dev->device.numa_node);
+			dev->name, dev->device.numa_node);
 	/* call the driver probe() function */
 	ret = dr->probe(dr, dev);
 	if (already_probed)
@@ -266,7 +281,6 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
 static int
 rte_pci_detach_dev(struct rte_pci_device *dev)
 {
-	struct rte_pci_addr *loc;
 	struct rte_pci_driver *dr;
 	int ret = 0;
 
@@ -274,11 +288,9 @@ rte_pci_detach_dev(struct rte_pci_device *dev)
 		return -EINVAL;
 
 	dr = dev->driver;
-	loc = &dev->addr;
 
-	RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n",
-			loc->domain, loc->bus, loc->devid,
-			loc->function, dev->device.numa_node);
+	RTE_LOG(DEBUG, EAL, "PCI device %s on NUMA socket %i\n",
+		dev->name, dev->device.numa_node);
 
 	RTE_LOG(DEBUG, EAL, "  remove driver: %x:%x %s\n", dev->id.vendor_id,
 			dev->id.device_id, dr->driver.name);
@@ -345,10 +357,9 @@ pci_probe(void)
 		ret = pci_probe_all_drivers(dev);
 		if (ret < 0) {
 			if (ret != -EEXIST) {
-				RTE_LOG(ERR, EAL, "Requested device "
-					PCI_PRI_FMT " cannot be used\n",
-					dev->addr.domain, dev->addr.bus,
-					dev->addr.devid, dev->addr.function);
+				RTE_LOG(ERR, EAL,
+					"Requested device %s cannot be used\n",
+					dev->name);
 				rte_errno = errno;
 				failed++;
 			}
@@ -395,11 +406,20 @@ pci_parse(const char *name, void *addr)
 {
 	struct rte_pci_addr *out = addr;
 	struct rte_pci_addr pci_addr;
+	rte_uuid_t mdev_addr;
 	bool parse;
 
 	parse = (rte_pci_addr_parse(name, &pci_addr) == 0);
 	if (parse && addr != NULL)
 		*out = pci_addr;
+
+	if (parse)
+		return 0;
+
+	parse = (rte_uuid_parse(name, mdev_addr) == 0);
+	if (parse && addr != NULL)
+		memcpy(addr, &mdev_addr, sizeof(mdev_addr));
+
 	return parse == false;
 }
 
@@ -622,11 +642,9 @@ pci_dma_unmap(struct rte_device *dev, void *addr, uint64_t iova, size_t len)
 	return -1;
 }
 
-bool
-rte_pci_ignore_device(const struct rte_pci_addr *pci_addr)
+static bool
+devargs_ignore_device(struct rte_devargs *devargs)
 {
-	struct rte_devargs *devargs = pci_devargs_lookup(pci_addr);
-
 	switch (rte_pci_bus.bus.conf.scan_mode) {
 	case RTE_BUS_SCAN_ALLOWLIST:
 		if (devargs && devargs->policy == RTE_DEV_ALLOWED)
@@ -641,6 +659,22 @@ rte_pci_ignore_device(const struct rte_pci_addr *pci_addr)
 	return true;
 }
 
+bool
+rte_pci_ignore_device(const struct rte_pci_addr *pci_addr)
+{
+	struct rte_devargs *devargs = pci_devargs_lookup(pci_addr);
+
+	return devargs_ignore_device(devargs);
+}
+
+bool
+rte_mdev_ignore_device(const rte_uuid_t mdev_addr)
+{
+	struct rte_devargs *devargs = mdev_devargs_lookup(mdev_addr);
+
+	return devargs_ignore_device(devargs);
+}
+
 enum rte_iova_mode
 rte_pci_get_iommu_class(void)
 {
diff --git a/drivers/bus/pci/pci_params.c b/drivers/bus/pci/pci_params.c
index 3192e9c967..231e57213e 100644
--- a/drivers/bus/pci/pci_params.c
+++ b/drivers/bus/pci/pci_params.c
@@ -2,12 +2,15 @@
  * Copyright 2018 Gaëtan Rivet
  */
 
+#include <string.h>
+
 #include <rte_bus.h>
 #include <rte_bus_pci.h>
 #include <rte_dev.h>
 #include <rte_errno.h>
 #include <rte_kvargs.h>
 #include <rte_pci.h>
+#include <rte_uuid.h>
 
 #include "private.h"
 
@@ -35,6 +38,19 @@ pci_addr_kv_cmp(const char *key __rte_unused,
 	return -abs(rte_pci_addr_cmp(addr1, addr2));
 }
 
+static int
+mdev_addr_kv_cmp(const char *key __rte_unused,
+		const char *value,
+		void *_addr2)
+{
+	rte_uuid_t addr1;
+	unsigned char *addr2 = _addr2;
+
+	if (rte_uuid_parse(value, addr1))
+		return -1;
+	return -abs(rte_uuid_compare(addr1, addr2));
+}
+
 static int
 pci_dev_match(const struct rte_device *dev,
 	      const void *_kvlist)
@@ -47,11 +63,21 @@ pci_dev_match(const struct rte_device *dev,
 		return 0;
 	pdev = RTE_DEV_TO_PCI_CONST(dev);
 	/* if any field does not match. */
-	if (rte_kvargs_process(kvlist, pci_params_keys[RTE_PCI_PARAM_ADDR],
-			       &pci_addr_kv_cmp,
-			       (void *)(intptr_t)&pdev->addr))
-		return 1;
-	return 0;
+	if (!pdev->is_mdev) {
+		if (rte_kvargs_process(kvlist,
+			pci_params_keys[RTE_PCI_PARAM_ADDR], &pci_addr_kv_cmp,
+			(void *)(intptr_t)&pdev->addr))
+			return 1;
+		else
+			return 0;
+	} else {
+		if (rte_kvargs_process(kvlist,
+			pci_params_keys[RTE_PCI_PARAM_ADDR], &mdev_addr_kv_cmp,
+			(void *)(intptr_t)&pdev->uuid))
+			return 1;
+		else
+			return 0;
+	}
 }
 
 void *
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 8b5fa70641..3515c086aa 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -64,6 +64,18 @@ pci_name_set(struct rte_pci_device *dev);
  */
 bool rte_pci_ignore_device(const struct rte_pci_addr *pci_addr);
 
+/**
+ * Validate whether a mediated PCI device with given uuid should be
+ * ignored or not.
+ *
+ * @param mdev_addr
+ *	MDEV address of device to be validated
+ * @return
+ *	true: if device is to be ignored,
+ *	false: if device is to be scanned,
+ */
+bool rte_mdev_ignore_device(const rte_uuid_t mdev_addr);
+
 /**
  * Add a PCI device to the PCI Bus (append to PCI Device list). This function
  * also updates the bus references of the PCI Device (and the generic device
@@ -114,6 +126,11 @@ struct pci_msix_table {
 struct mapped_pci_resource {
 	TAILQ_ENTRY(mapped_pci_resource) next;
 
+	union {
+		struct rte_pci_addr addr;
+		rte_uuid_t uuid;
+	};
+	uint8_t is_mdev;
 	struct rte_pci_addr pci_addr;
 	char path[PATH_MAX];
 	int nb_maps;
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index dc26811b0a..fb7d934bd0 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -51,6 +51,15 @@ TAILQ_HEAD(rte_pci_driver_list, rte_pci_driver);
 
 struct rte_devargs;
 
+/*
+ * NOTE: we can't include rte_uuid.h directly due to the conflicts
+ *      introduced by stdbool.h
+ */
+typedef unsigned char rte_uuid_t[16];
+
+/* It's RTE_UUID_STRLEN, which is bigger than PCI_PRI_STR_SIZE. */
+#define RTE_PCI_NAME_LEN		(36 + 1)
+
 enum rte_pci_kernel_driver {
 	RTE_PCI_KDRV_UNKNOWN = 0,  /* may be misc UIO or bifurcated driver */
 	RTE_PCI_KDRV_IGB_UIO,      /* igb_uio for Linux */
@@ -67,7 +76,11 @@ enum rte_pci_kernel_driver {
 struct rte_pci_device {
 	TAILQ_ENTRY(rte_pci_device) next;   /**< Next probed PCI device. */
 	struct rte_device device;           /**< Inherit core device */
-	struct rte_pci_addr addr;           /**< PCI location. */
+	union {
+		struct rte_pci_addr addr;   /**< PCI location. */
+		rte_uuid_t uuid;            /**< Mdev location. */
+	};
+	uint8_t is_mdev;                    /**< True for mediated PCI device */
 	struct rte_pci_id id;               /**< PCI ID. */
 	struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
 					    /**< PCI Memory Resource */
@@ -75,7 +88,7 @@ struct rte_pci_device {
 	struct rte_pci_driver *driver;      /**< PCI driver used in probing */
 	uint16_t max_vfs;                   /**< sriov enable if not zero */
 	enum rte_pci_kernel_driver kdrv;    /**< Kernel driver passthrough */
-	char name[PCI_PRI_STR_SIZE+1];      /**< PCI location (ASCII) */
+	char name[RTE_PCI_NAME_LEN];        /**< PCI/Mdev location (ASCII) */
 	struct rte_intr_handle vfio_req_intr_handle;
 				/**< Handler of VFIO request interrupt */
 };
diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index d5917a48ca..323f13107e 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -1089,6 +1089,15 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+#ifdef VFIO_PRESENT
+	if (rte_eal_vfio_setup() < 0) {
+		rte_eal_init_alert("Cannot init VFIO");
+		rte_errno = EAGAIN;
+		__atomic_store_n(&run_once, 0, __ATOMIC_RELAXED);
+		return -1;
+	}
+#endif
+
 	if (rte_bus_scan()) {
 		rte_eal_init_alert("Cannot scan the buses for devices");
 		rte_errno = ENODEV;
@@ -1194,14 +1203,6 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
-#ifdef VFIO_PRESENT
-	if (rte_eal_vfio_setup() < 0) {
-		rte_eal_init_alert("Cannot init VFIO");
-		rte_errno = EAGAIN;
-		__atomic_store_n(&run_once, 0, __ATOMIC_RELAXED);
-		return -1;
-	}
-#endif
 	/* in secondary processes, memory init may allocate additional fbarrays
 	 * not present in primary processes, so to avoid any potential issues,
 	 * initialize memzones first.
-- 
2.17.1


  parent reply	other threads:[~2021-06-01  3:18 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-03  7:18 [dpdk-dev] [RFC 0/3] Add mdev (Mediated device) support in DPDK Tiwei Bie
2019-04-03  7:18 ` Tiwei Bie
2019-04-03  7:18 ` [dpdk-dev] [RFC 1/3] eal: add a helper for reading string from sysfs Tiwei Bie
2019-04-03  7:18   ` Tiwei Bie
2019-04-03  7:18 ` [dpdk-dev] [RFC 2/3] bus/mdev: add mdev bus support Tiwei Bie
2019-04-03  7:18   ` Tiwei Bie
2019-04-03  7:18 ` [dpdk-dev] [RFC 3/3] bus/pci: add mdev support Tiwei Bie
2019-04-03  7:18   ` Tiwei Bie
2019-04-03 14:13   ` Wiles, Keith
2019-04-03 14:13     ` Wiles, Keith
2019-04-04  4:19     ` Tiwei Bie
2019-04-04  4:19       ` Tiwei Bie
2019-04-08  8:44 ` [dpdk-dev] [RFC 0/3] Add mdev (Mediated device) support in DPDK Alejandro Lucero
2019-04-08  8:44   ` Alejandro Lucero
2019-04-08  9:36   ` Tiwei Bie
2019-04-08  9:36     ` Tiwei Bie
2019-04-10 10:02     ` Francois Ozog
2019-04-10 10:02       ` Francois Ozog
2023-07-03 23:54       ` Stephen Hemminger
2019-07-15  7:52 ` [dpdk-dev] [RFC v2 0/5] " Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 1/5] bus/pci: introduce an internal representation of PCI device Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 2/5] bus/pci: avoid depending on private value in kernel source Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 3/5] bus/pci: introduce helper for MMIO read and write Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 4/5] eal: add a helper for reading string from sysfs Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 5/5] bus/pci: add mdev support Tiwei Bie
2021-06-01  3:06     ` [dpdk-dev] [RFC v3 0/6] Add mdev (Mediated device) support in DPDK Chenbo Xia
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 1/6] bus/pci: introduce an internal representation of PCI device Chenbo Xia
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 2/6] bus/pci: avoid depending on private value in kernel source Chenbo Xia
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 3/6] bus/pci: introduce helper for MMIO read and write Chenbo Xia
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 4/6] eal: add a helper for reading string from sysfs Chenbo Xia
2021-06-01  5:37         ` Stephen Hemminger
2021-06-08  5:47           ` Xia, Chenbo
2021-06-01  5:39         ` Stephen Hemminger
2021-06-08  5:48           ` Xia, Chenbo
2021-06-11  7:19         ` Thomas Monjalon
2021-06-01  3:06       ` Chenbo Xia [this message]
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 6/6] bus/pci: add sparse mmap support for mediated PCI devices Chenbo Xia
2021-06-11  7:15       ` [dpdk-dev] [RFC v3 0/6] Add mdev (Mediated device) support in DPDK Thomas Monjalon
2021-06-15  2:49         ` Xia, Chenbo
2021-06-15  7:48           ` Thomas Monjalon
2021-06-15 10:44             ` Xia, Chenbo
2021-06-15 11:57             ` Jason Gunthorpe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210601030644.3318-6-chenbo.xia@intel.com \
    --to=chenbo.xia@intel.com \
    --cc=anatoly.burakov@intel.com \
    --cc=bruce.richardson@intel.com \
    --cc=cunming.liang@intel.com \
    --cc=david.marchand@redhat.com \
    --cc=dev@dpdk.org \
    --cc=ferruh.yigit@intel.com \
    --cc=jingjing.wu@intel.com \
    --cc=konstantin.ananyev@intel.com \
    --cc=mdr@ashroe.eu \
    --cc=nhorman@tuxdriver.com \
    --cc=stephen@networkplumber.org \
    --cc=thomas@monjalon.net \
    --cc=tiwei.bie@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).