From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga07.intel.com (mga07.intel.com [134.134.136.100]) by dpdk.org (Postfix) with ESMTP id 470CB5699 for ; Mon, 30 Jul 2018 13:17:08 +0200 (CEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga006.jf.intel.com ([10.7.209.51]) by orsmga105.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 30 Jul 2018 04:17:05 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.51,422,1526367600"; d="scan'208";a="61842260" Received: from irvmail001.ir.intel.com ([163.33.26.43]) by orsmga006.jf.intel.com with ESMTP; 30 Jul 2018 04:17:04 -0700 Received: from sivswdev01.ir.intel.com (sivswdev01.ir.intel.com [10.237.217.45]) by irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id w6UBH3Ca017397; Mon, 30 Jul 2018 12:17:03 +0100 Received: from sivswdev01.ir.intel.com (localhost [127.0.0.1]) by sivswdev01.ir.intel.com with ESMTP id w6UBH3DD024949; Mon, 30 Jul 2018 12:17:03 +0100 Received: (from aburakov@localhost) by sivswdev01.ir.intel.com with LOCAL id w6UBH3MI024943; Mon, 30 Jul 2018 12:17:03 +0100 From: Anatoly Burakov To: dev@dpdk.org Cc: jerin.jacob@caviumnetworks.com, thomas@monjalon.net, t.yoshimura8869@gmail.com Date: Mon, 30 Jul 2018 12:17:03 +0100 Message-Id: X-Mailer: git-send-email 1.7.0.7 Subject: [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 30 Jul 2018 11:17:09 -0000 Currently, DPDK will skip mapping some areas (or even an entire BAR) if MSI-X happens to be in it but is smaller than page address. Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this as a capability flag. Capability flags themselves are also only supported since kernel 4.6 [2]. This commit will introduce support for checking VFIO capabilities, and will use it to check if we are allowed to map BARs with MSI-X tables in them, along with backwards compatibility for older kernels, including a workaround for a variable rename in VFIO region info structure [3]. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6 [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279 [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4 Signed-off-by: Anatoly Burakov --- drivers/bus/pci/linux/pci_vfio.c | 127 ++++++++++++++++++++--- lib/librte_eal/common/include/rte_vfio.h | 26 +++++ 2 files changed, 140 insertions(+), 13 deletions(-) diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c index 686386d6a..e7765ee11 100644 --- a/drivers/bus/pci/linux/pci_vfio.c +++ b/drivers/bus/pci/linux/pci_vfio.c @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, return 0; } +/* + * region info may contain capability headers, so we need to keep reallocating + * the memory until we match allocated memory size with argsz. + */ +static int +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info, + int region) +{ + struct vfio_region_info *ri; + size_t argsz = sizeof(*ri); + int ret; + + ri = malloc(sizeof(*ri)); + if (ri == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n"); + return -1; + } +again: + memset(ri, 0, argsz); + ri->argsz = argsz; + ri->index = region; + + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info); + if (ret) { + free(ri); + return ret; + } + if (ri->argsz != argsz) { + argsz = ri->argsz; + ri = realloc(ri, argsz); + + if (ri == NULL) { + RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n"); + return -1; + } + goto again; + } + *info = ri; + + return 0; +} + +static struct vfio_info_cap_header * +pci_vfio_info_cap(struct vfio_region_info *info, int cap) +{ + struct vfio_info_cap_header *h; + size_t offset; + + if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) { + /* VFIO info does not advertise capabilities */ + return NULL; + } + + offset = VFIO_CAP_OFFSET(info); + while (offset != 0) { + h = RTE_PTR_ADD(info, offset); + if (h->id == cap) + return h; + offset = h->next; + } + return NULL; +} + +static int +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) +{ + struct vfio_region_info *info; + int ret; + + ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region); + if (ret < 0) + return -1; + + ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL; + + /* cleanup */ + free(info); + + return ret; +} + + static int pci_vfio_map_resource_primary(struct rte_pci_device *dev) { @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) if (ret < 0) { RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", pci_addr); - goto err_vfio_dev_fd; + goto err_vfio_res; + } + /* if we found our MSI-X BAR region, check if we can mmap it */ + if (vfio_res->msix_table.bar_index != -1) { + int ret = pci_vfio_msix_is_mappable(vfio_dev_fd, + vfio_res->msix_table.bar_index); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n"); + goto err_vfio_res; + } else if (ret != 0) { + /* we can map it, so we don't care where it is */ + RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n"); + vfio_res->msix_table.bar_index = -1; + } } for (i = 0; i < (int) vfio_res->nb_maps; i++) { - struct vfio_region_info reg = { .argsz = sizeof(reg) }; + struct vfio_region_info *reg; void *bar_addr; - reg.index = i; - - ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); - if (ret) { + ret = pci_vfio_get_region_info(vfio_dev_fd, ®, i); + if (ret < 0) { RTE_LOG(ERR, EAL, " %s cannot get device region info " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); + "error %i (%s)\n", pci_addr, errno, + strerror(errno)); goto err_vfio_res; } /* chk for io port region */ ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i); - if (ret < 0) + if (ret < 0) { + free(reg); goto err_vfio_res; - else if (ret) { + } else if (ret) { RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n", i); + free(reg); continue; } /* skip non-mmapable BARs */ - if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) + if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) { + free(reg); continue; + } /* try mapping somewhere close to the end of hugepages */ if (pci_map_addr == NULL) pci_map_addr = pci_find_max_end_va(); bar_addr = pci_map_addr; - pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); + pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size); maps[i].addr = bar_addr; - maps[i].offset = reg.offset; - maps[i].size = reg.size; + maps[i].offset = reg->offset; + maps[i].size = reg->size; maps[i].path = NULL; /* vfio doesn't have per-resource paths */ ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); if (ret < 0) { RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", pci_addr, i, strerror(errno)); + free(reg); goto err_vfio_res; } dev->mem_resource[i].addr = maps[i].addr; + + free(reg); } if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) { diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h index 5ca13fcce..f6617e004 100644 --- a/lib/librte_eal/common/include/rte_vfio.h +++ b/lib/librte_eal/common/include/rte_vfio.h @@ -14,6 +14,8 @@ extern "C" { #endif +#include + /* * determine if VFIO is present on the system */ @@ -44,6 +46,30 @@ extern "C" { #define RTE_VFIO_NOIOMMU 8 #endif +/* + * capabilities are only supported on kernel 4.6+. there were also some API + * changes as well, so add a macro to get cap offset. + */ +#ifdef VFIO_REGION_INFO_FLAG_CAPS +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS +#define VFIO_CAP_OFFSET(x) (x->cap_offset) +#else +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3) +#define VFIO_CAP_OFFSET(x) (x->resv) +struct vfio_info_cap_header { + uint16_t id; + uint16_t version; + uint32_t next; +}; +#endif + +/* kernels 4.16+ can map BAR containing MSI-X table */ +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#else +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3 +#endif + #else /* not VFIO_PRESENT */ /* we don't need an actual definition, only pointer is used */ -- 2.17.1