From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by dpdk.space (Postfix) with ESMTP id F0F36A0679 for ; Wed, 3 Apr 2019 09:19:56 +0200 (CEST) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id 5A2AA5F29; Wed, 3 Apr 2019 09:19:36 +0200 (CEST) Received: from mga02.intel.com (mga02.intel.com [134.134.136.20]) by dpdk.org (Postfix) with ESMTP id 6B6CD5B2A for ; Wed, 3 Apr 2019 09:19:28 +0200 (CEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga007.jf.intel.com ([10.7.209.58]) by orsmga101.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 03 Apr 2019 00:19:28 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.60,303,1549958400"; d="scan'208";a="128206262" Received: from dpdk-tbie.sh.intel.com ([10.67.104.173]) by orsmga007.jf.intel.com with ESMTP; 03 Apr 2019 00:19:26 -0700 From: Tiwei Bie To: dev@dpdk.org Cc: cunming.liang@intel.com, bruce.richardson@intel.com, alejandro.lucero@netronome.com Date: Wed, 3 Apr 2019 15:18:44 +0800 Message-Id: <20190403071844.21126-4-tiwei.bie@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20190403071844.21126-1-tiwei.bie@intel.com> References: <20190403071844.21126-1-tiwei.bie@intel.com> Subject: [dpdk-dev] [RFC 3/3] bus/pci: add mdev support X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Content-Type: text/plain; charset="UTF-8" Message-ID: <20190403071844.4Kb8PwWaGAs41unVeWNnyQkdl0oqZ-upiwG5ICo7838@z> This patch adds mdev support to the PCI bus driver. An mdev driver is introduced to probe the mdev devices whose device API is "vfio-pci" on the mdev bus. PS. There are some hacks in this patch for now. 
Signed-off-by: Cunming Liang Signed-off-by: Tiwei Bie --- drivers/bus/pci/Makefile | 3 + drivers/bus/pci/linux/Makefile | 4 + drivers/bus/pci/linux/pci_vfio.c | 35 ++- drivers/bus/pci/linux/pci_vfio_mdev.c | 305 ++++++++++++++++++++++++++ drivers/bus/pci/meson.build | 4 +- drivers/bus/pci/pci_common.c | 17 +- drivers/bus/pci/private.h | 9 + drivers/bus/pci/rte_bus_pci.h | 11 +- 8 files changed, 370 insertions(+), 18 deletions(-) create mode 100644 drivers/bus/pci/linux/pci_vfio_mdev.c diff --git a/drivers/bus/pci/Makefile b/drivers/bus/pci/Makefile index de53ce1bf..085ec9066 100644 --- a/drivers/bus/pci/Makefile +++ b/drivers/bus/pci/Makefile @@ -27,6 +27,9 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring LDLIBS += -lrte_ethdev -lrte_pci -lrte_kvargs +ifeq ($(CONFIG_RTE_LIBRTE_MDEV_BUS),y) +LDLIBS += -lrte_bus_mdev +endif include $(RTE_SDK)/drivers/bus/pci/$(SYSTEM)/Makefile SRCS-$(CONFIG_RTE_LIBRTE_PCI_BUS) := $(addprefix $(SYSTEM)/,$(SRCS)) diff --git a/drivers/bus/pci/linux/Makefile b/drivers/bus/pci/linux/Makefile index 90404468b..88bbc2390 100644 --- a/drivers/bus/pci/linux/Makefile +++ b/drivers/bus/pci/linux/Makefile @@ -4,3 +4,7 @@ SRCS += pci.c SRCS += pci_uio.c SRCS += pci_vfio.c + +ifeq ($(CONFIG_RTE_LIBRTE_MDEV_BUS),y) + SRCS += pci_vfio_mdev.c +endif diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c index ebf6ccd3c..c2c4c6a50 100644 --- a/drivers/bus/pci/linux/pci_vfio.c +++ b/drivers/bus/pci/linux/pci_vfio.c @@ -13,6 +13,9 @@ #include #include +#ifdef RTE_LIBRTE_MDEV_BUS +#include +#endif #include #include #include @@ -20,6 +23,7 @@ #include #include #include +#include #include "eal_filesystem.h" @@ -648,6 +652,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) { struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; char pci_addr[PATH_MAX] = {0}; + const char *sysfs_path; int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; int i, ret; @@ 
-663,10 +668,20 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) #endif /* store PCI address string */ - snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + if (dev->use_uuid) { +#ifdef RTE_LIBRTE_MDEV_BUS + sysfs_path = rte_mdev_get_sysfs_path(); + rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr)); +#else + return -1; +#endif + } else { + sysfs_path = rte_pci_get_sysfs_path(); + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); + } - ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, + ret = rte_vfio_setup_device(sysfs_path, pci_addr, &vfio_dev_fd, &device_info); if (ret) return ret; @@ -793,6 +808,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) { struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; char pci_addr[PATH_MAX] = {0}; + const char *sysfs_path; int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; int i, ret; @@ -808,8 +824,19 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) #endif /* store PCI address string */ - snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + if (dev->use_uuid) { +#ifdef RTE_LIBRTE_MDEV_BUS + sysfs_path = rte_mdev_get_sysfs_path(); + rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr)); +#else + return -1; +#endif + } else { + sysfs_path = rte_pci_get_sysfs_path(); + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); + } + /* if we're in a secondary process, just find our tailq entry */ TAILQ_FOREACH(vfio_res, vfio_res_list, next) { @@ -825,7 +852,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) return -1; } - ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, + ret = rte_vfio_setup_device(sysfs_path, pci_addr, &vfio_dev_fd, &device_info); if (ret) return ret; diff --git a/drivers/bus/pci/linux/pci_vfio_mdev.c b/drivers/bus/pci/linux/pci_vfio_mdev.c new file mode 100644 index 000000000..92498c2fe --- /dev/null +++ 
b/drivers/bus/pci/linux/pci_vfio_mdev.c @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_filesystem.h" + +#include "private.h" + +extern struct rte_pci_bus rte_pci_bus; + +static int +get_pci_id(const char *sysfs_base, const char *dev_addr, + struct rte_pci_id *pci_id) +{ + int ret = 0; + int iommu_group_num; + int vfio_group_fd; + int vfio_dev_fd; + int container; + int class; + char name[PATH_MAX]; + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) }; + + container = open("/dev/vfio/vfio", O_RDWR); + if (container < 0) { + RTE_LOG(WARNING, EAL, "Failed to open VFIO container\n"); + ret = -1; + goto out; + } + + if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { + /* Unknown API version */ + RTE_LOG(WARNING, EAL, "Unknown VFIO API version\n"); + ret = -1; + goto close_container; + } + + if (rte_vfio_get_group_num(sysfs_base, dev_addr, + &iommu_group_num) <= 0) { + RTE_LOG(WARNING, EAL, "%s not managed by VFIO driver\n", + dev_addr); + ret = -1; + goto close_container; + } + + snprintf(name, sizeof(name), "/dev/vfio/%d", iommu_group_num); + + vfio_group_fd = open(name, O_RDWR); + if (vfio_group_fd < 0) { + ret = -1; + goto close_container; + } + + /* if group_fd == 0, that means the device isn't managed by VFIO */ + if (vfio_group_fd == 0) { + RTE_LOG(WARNING, EAL, "%s not managed by VFIO driver\n", + dev_addr); + ret = -1; + goto close_group; + } + + if (ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status)) { + RTE_LOG(ERR, EAL, "%s cannot get group status, error %i (%s)\n", + dev_addr, errno, strerror(errno)); + ret = -1; + goto close_group; + } + + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + RTE_LOG(ERR, EAL, "%s VFIO group is not viable!\n", dev_addr); + ret = -1; + goto 
close_group; + } + + if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + if (ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, + &container)) { + RTE_LOG(ERR, EAL, "%s cannot add VFIO group to container, error %i (%s)\n", + dev_addr, errno, strerror(errno)); + ret = -1; + goto close_group; + } + } + + if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { + RTE_LOG(ERR, EAL, "%s cannot set iommu, error %i (%s)\n", + dev_addr, errno, strerror(errno)); + ret = -1; + goto close_group; + } + + vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); + if (vfio_dev_fd < 0) { + /* if we cannot get a device fd, this implies a problem with + * the VFIO group or the container not having IOMMU configured. + */ + RTE_LOG(ERR, EAL, "Getting a vfio_dev_fd for %s failed errno %d\n", + dev_addr, errno); + ret = -1; + goto close_group; + } + + /* vendor_id */ + if (pread64(vfio_dev_fd, &pci_id->vendor_id, sizeof(uint16_t), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_VENDOR_ID) != sizeof(uint16_t)) { + RTE_LOG(ERR, EAL, "Cannot read VendorID from PCI config space\n"); + ret = -1; + goto close_device; + } + + /* device_id */ + if (pread64(vfio_dev_fd, &pci_id->device_id, sizeof(uint16_t), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_DEVICE_ID) != sizeof(uint16_t)) { + RTE_LOG(ERR, EAL, "Cannot read DeviceID from PCI config space\n"); + ret = -1; + goto close_device; + } + + /* subsystem_vendor_id */ + if (pread64(vfio_dev_fd, &pci_id->subsystem_vendor_id, sizeof(uint16_t), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_SUBSYSTEM_VENDOR_ID) != sizeof(uint16_t)) { + RTE_LOG(ERR, EAL, "Cannot read SubVendorID from PCI config space\n"); + ret = -1; + goto close_device; + } + + /* subsystem_device_id */ + if (pread64(vfio_dev_fd, &pci_id->subsystem_device_id, sizeof(uint16_t), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_SUBSYSTEM_ID) != sizeof(uint16_t)) { + RTE_LOG(ERR, EAL, "Cannot read 
SubDeviceID from PCI config space\n"); + ret = -1; + goto close_device; + } + + /* class_id */ + if (pread64(vfio_dev_fd, &class, sizeof(uint32_t), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_CLASS_REVISION) != sizeof(uint32_t)) { + RTE_LOG(ERR, EAL, "Cannot read ClassID from PCI config space\n"); + ret = -1; + goto close_device; + } + pci_id->class_id = class >> 8; + +close_device: + if (close(vfio_dev_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing VFIO device for %s\n", + dev_addr); + ret = -1; + } + +close_group: + if (close(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing VFIO group for %s\n", + dev_addr); + ret = -1; + } + +close_container: + if (close(container) < 0) { + RTE_LOG(INFO, EAL, "Error when closing VFIO container\n"); + ret = -1; + } + +out: + return ret; +} + +static int vfio_pci_probe(struct rte_mdev_driver *mdev_drv __rte_unused, + struct rte_mdev_device *mdev_dev) +{ + char name[RTE_UUID_STRLEN]; + struct rte_pci_device *dev; + struct rte_bus *bus; + int ret; + + bus = rte_bus_find_by_name("pci"); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus pci\n"); + return -ENOENT; + } + + if (bus->plug == NULL) { + RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", + bus->name); + return -ENOTSUP; + } + + dev = malloc(sizeof(*dev)); + if (dev == NULL) + return -ENOMEM; + + memset(dev, 0, sizeof(*dev)); + dev->device.bus = &rte_pci_bus.bus; + rte_uuid_unparse(mdev_dev->addr, name, sizeof(name)); + + if (get_pci_id(rte_mdev_get_sysfs_path(), name, &dev->id)) { + free(dev); + return -1; + } + + snprintf(dev->name, sizeof(dev->name), "%s", name); + dev->device.name = dev->name; + dev->kdrv = RTE_KDRV_VFIO; + dev->use_uuid = 1; + rte_uuid_copy(dev->uuid, mdev_dev->addr); + + // TODO: dev->device.devargs, etc + + memset(&dev->addr, -1, sizeof(dev->addr)); // XXX: TODO + + /* device is valid, add to the list (sorted) */ + if (TAILQ_EMPTY(&rte_pci_bus.device_list)) { + rte_pci_add_device(dev); + } else { 
+ struct rte_pci_device *dev2; + int ret; + + TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) { + // XXX + ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr); + if (ret == 0) + ret = strncmp(dev->name, dev2->name, + sizeof(dev->name)); + if (ret > 0) + continue; + if (ret < 0) { + rte_pci_insert_device(dev2, dev); + goto plug; + } + /* already registered */ + free(dev); + return 0; + } + + rte_pci_add_device(dev); + } + +plug: + ret = bus->plug(&dev->device); + if (ret != 0) { + rte_pci_remove_device(dev); + free(dev); + } else { + mdev_dev->private = dev; + } + return ret; +} + +static int vfio_pci_remove(struct rte_mdev_device *mdev_dev) +{ + struct rte_pci_device *dev = mdev_dev->private; + struct rte_bus *bus; + int ret; + + if (dev == NULL) + return 0; + + bus = rte_bus_find_by_name("pci"); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus pci\n"); + return -ENOENT; + } + + if (bus->unplug == NULL) { + RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", + bus->name); + return -ENOTSUP; + } + + ret = bus->unplug(&dev->device); + if (ret == 0) + mdev_dev->private = NULL; + + return ret; +} + +static struct rte_mdev_driver vfio_pci_drv = { + .dev_api = RTE_MDEV_DEV_API_VFIO_PCI, + .probe = vfio_pci_probe, + .remove = vfio_pci_remove +}; + +RTE_MDEV_REGISTER_DRIVER(mdev_vfio_pci, vfio_pci_drv); diff --git a/drivers/bus/pci/meson.build b/drivers/bus/pci/meson.build index a3140ff97..c3e884657 100644 --- a/drivers/bus/pci/meson.build +++ b/drivers/bus/pci/meson.build @@ -11,8 +11,10 @@ sources = files('pci_common.c', if host_machine.system() == 'linux' sources += files('linux/pci.c', 'linux/pci_uio.c', - 'linux/pci_vfio.c') + 'linux/pci_vfio.c', + 'linux/pci_vfio_mdev.c') includes += include_directories('linux') + deps += ['bus_mdev'] else sources += files('bsd/pci.c') includes += include_directories('bsd') diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c index 704b9d71a..6b47333e6 100644 --- 
a/drivers/bus/pci/pci_common.c +++ b/drivers/bus/pci/pci_common.c @@ -124,21 +124,17 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr, { int ret; bool already_probed; - struct rte_pci_addr *loc; if ((dr == NULL) || (dev == NULL)) return -EINVAL; - loc = &dev->addr; - /* The device is not blacklisted; Check if driver supports it */ if (!rte_pci_match(dr, dev)) /* Match of device and driver failed */ return 1; - RTE_LOG(INFO, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", - loc->domain, loc->bus, loc->devid, loc->function, - dev->device.numa_node); + RTE_LOG(INFO, EAL, "PCI device %s on NUMA socket %i\n", + dev->name, dev->device.numa_node); /* no initialization when blacklisted, return without error */ if (dev->device.devargs != NULL && @@ -208,7 +204,6 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr, static int rte_pci_detach_dev(struct rte_pci_device *dev) { - struct rte_pci_addr *loc; struct rte_pci_driver *dr; int ret = 0; @@ -216,11 +211,9 @@ rte_pci_detach_dev(struct rte_pci_device *dev) return -EINVAL; dr = dev->driver; - loc = &dev->addr; - RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", - loc->domain, loc->bus, loc->devid, - loc->function, dev->device.numa_node); + RTE_LOG(DEBUG, EAL, "PCI device %s on NUMA socket %i\n", + dev->name, dev->device.numa_node); RTE_LOG(DEBUG, EAL, " remove driver: %x:%x %s\n", dev->id.vendor_id, dev->id.device_id, dr->driver.name); @@ -387,7 +380,7 @@ rte_pci_insert_device(struct rte_pci_device *exist_pci_dev, } /* Remove a device from PCI bus */ -static void +void rte_pci_remove_device(struct rte_pci_device *pci_dev) { TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next); diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h index 13c3324bb..d5815ee44 100644 --- a/drivers/bus/pci/private.h +++ b/drivers/bus/pci/private.h @@ -67,6 +67,15 @@ void rte_pci_add_device(struct rte_pci_device *pci_dev); void rte_pci_insert_device(struct rte_pci_device *exist_pci_dev, struct 
rte_pci_device *new_pci_dev); +/** + * Remove a PCI device from the PCI Bus. + * + * @param pci_dev + * PCI device to remove + * @return void + */ +void rte_pci_remove_device(struct rte_pci_device *pci_dev); + /** * Update a pci device object by asking the kernel for the latest information. * diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h index 06e004cd3..465a44935 100644 --- a/drivers/bus/pci/rte_bus_pci.h +++ b/drivers/bus/pci/rte_bus_pci.h @@ -51,6 +51,13 @@ TAILQ_HEAD(rte_pci_driver_list, rte_pci_driver); struct rte_devargs; +/* It's RTE_UUID_STRLEN, which is bigger than PCI_PRI_STR_SIZE. */ +#define RTE_PCI_NAME_LEN (36 + 1) + +// XXX: we can't include rte_uuid.h directly due to the conflicts +// introduced by stdbool.h +typedef unsigned char rte_uuid_t[16]; + /** * A structure describing a PCI device. */ @@ -58,6 +65,8 @@ struct rte_pci_device { TAILQ_ENTRY(rte_pci_device) next; /**< Next probed PCI device. */ struct rte_device device; /**< Inherit core device */ struct rte_pci_addr addr; /**< PCI location. */ + rte_uuid_t uuid; /**< Mdev location. */ + uint8_t use_uuid; /**< True if uuid field valid. */ struct rte_pci_id id; /**< PCI ID. */ struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE]; /**< PCI Memory Resource */ @@ -65,7 +74,7 @@ struct rte_pci_device { struct rte_pci_driver *driver; /**< PCI driver used in probing */ uint16_t max_vfs; /**< sriov enable if not zero */ enum rte_kernel_driver kdrv; /**< Kernel driver passthrough */ - char name[PCI_PRI_STR_SIZE+1]; /**< PCI location (ASCII) */ + char name[RTE_PCI_NAME_LEN]; /**< PCI/Mdev location (ASCII) */ struct rte_intr_handle vfio_req_intr_handle; /**< Handler of VFIO request interrupt */ }; -- 2.17.1