From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga06.intel.com (mga06.intel.com [134.134.136.31]) by dpdk.org (Postfix) with ESMTP id 65D4998 for ; Tue, 31 Jul 2018 13:24:24 +0200 (CEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga008.fm.intel.com ([10.253.24.58]) by orsmga104.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 31 Jul 2018 04:24:22 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.51,427,1526367600"; d="scan'208";a="59287872" Received: from aburakov-mobl1.ger.corp.intel.com (HELO [10.237.220.148]) ([10.237.220.148]) by fmsmga008.fm.intel.com with ESMTP; 31 Jul 2018 04:24:03 -0700 To: Takeshi Yoshimura Cc: dev@dpdk.org, Jerin Jacob , thomas@monjalon.net References: From: "Burakov, Anatoly" Message-ID: <664ca8bf-d5e2-50b4-8e28-e5b15d82cb80@intel.com> Date: Tue, 31 Jul 2018 12:24:02 +0100 User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Thunderbird/52.9.1 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset=utf-8; format=flowed Content-Language: en-US Content-Transfer-Encoding: 7bit Subject: Re: [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 31 Jul 2018 11:24:25 -0000 On 31-Jul-18 10:38 AM, Takeshi Yoshimura wrote: > 2018-07-30 20:17 GMT+09:00 Anatoly Burakov : >> Currently, DPDK will skip mapping some areas (or even an entire BAR) >> if MSI-X happens to be in it but is smaller than page address. >> >> Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this >> as a capability flag. Capability flags themselves are also only >> supported since kernel 4.6 [2]. 
>> >> This commit will introduce support for checking VFIO capabilities, >> and will use it to check if we are allowed to map BARs with MSI-X >> tables in them, along with backwards compatibility for older >> kernels, including a workaround for a variable rename in VFIO >> region info structure [3]. >> >> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ >> linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6 >> >> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ >> linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279 >> >> [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ >> linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4 >> >> Signed-off-by: Anatoly Burakov >> --- >> drivers/bus/pci/linux/pci_vfio.c | 127 ++++++++++++++++++++--- >> lib/librte_eal/common/include/rte_vfio.h | 26 +++++ >> 2 files changed, 140 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c >> index 686386d6a..e7765ee11 100644 >> --- a/drivers/bus/pci/linux/pci_vfio.c >> +++ b/drivers/bus/pci/linux/pci_vfio.c >> @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, >> return 0; >> } >> >> +/* >> + * region info may contain capability headers, so we need to keep reallocating >> + * the memory until we match allocated memory size with argsz. 
>> + */ >> +static int >> +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info, >> + int region) >> +{ >> + struct vfio_region_info *ri; >> + size_t argsz = sizeof(*ri); >> + int ret; >> + >> + ri = malloc(sizeof(*ri)); >> + if (ri == NULL) { >> + RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n"); >> + return -1; >> + } >> +again: >> + memset(ri, 0, argsz); >> + ri->argsz = argsz; >> + ri->index = region; >> + >> + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info); >> + if (ret) { >> + free(ri); >> + return ret; >> + } >> + if (ri->argsz != argsz) { >> + argsz = ri->argsz; >> + ri = realloc(ri, argsz); >> + >> + if (ri == NULL) { >> + RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n"); >> + return -1; >> + } >> + goto again; >> + } >> + *info = ri; >> + >> + return 0; >> +} >> + >> +static struct vfio_info_cap_header * >> +pci_vfio_info_cap(struct vfio_region_info *info, int cap) >> +{ >> + struct vfio_info_cap_header *h; >> + size_t offset; >> + >> + if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) { >> + /* VFIO info does not advertise capabilities */ >> + return NULL; >> + } >> + >> + offset = VFIO_CAP_OFFSET(info); >> + while (offset != 0) { >> + h = RTE_PTR_ADD(info, offset); >> + if (h->id == cap) >> + return h; >> + offset = h->next; >> + } >> + return NULL; >> +} >> + >> +static int >> +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) >> +{ >> + struct vfio_region_info *info; >> + int ret; >> + >> + ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region); >> + if (ret < 0) >> + return -1; >> + >> + ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL; >> + >> + /* cleanup */ >> + free(info); >> + >> + return ret; >> +} >> + >> + >> static int >> pci_vfio_map_resource_primary(struct rte_pci_device *dev) >> { >> @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) >> if (ret < 0) { >> RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", >> 
pci_addr); >> - goto err_vfio_dev_fd; >> + goto err_vfio_res; >> + } >> + /* if we found our MSI-X BAR region, check if we can mmap it */ >> + if (vfio_res->msix_table.bar_index != -1) { >> + int ret = pci_vfio_msix_is_mappable(vfio_dev_fd, >> + vfio_res->msix_table.bar_index); >> + if (ret < 0) { >> + RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n"); >> + goto err_vfio_res; >> + } else if (ret != 0) { >> + /* we can map it, so we don't care where it is */ >> + RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n"); >> + vfio_res->msix_table.bar_index = -1; >> + } >> } >> >> for (i = 0; i < (int) vfio_res->nb_maps; i++) { >> - struct vfio_region_info reg = { .argsz = sizeof(reg) }; >> + struct vfio_region_info *reg; >> void *bar_addr; >> >> - reg.index = i; >> - >> - ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); >> - if (ret) { >> + ret = pci_vfio_get_region_info(vfio_dev_fd, ®, i); >> + if (ret < 0) { >> RTE_LOG(ERR, EAL, " %s cannot get device region info " >> - "error %i (%s)\n", pci_addr, errno, strerror(errno)); >> + "error %i (%s)\n", pci_addr, errno, >> + strerror(errno)); >> goto err_vfio_res; >> } >> >> /* chk for io port region */ >> ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i); >> - if (ret < 0) >> + if (ret < 0) { >> + free(reg); >> goto err_vfio_res; >> - else if (ret) { >> + } else if (ret) { >> RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n", >> i); >> + free(reg); >> continue; >> } >> >> /* skip non-mmapable BARs */ >> - if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) >> + if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) { >> + free(reg); >> continue; >> + } >> >> /* try mapping somewhere close to the end of hugepages */ >> if (pci_map_addr == NULL) >> pci_map_addr = pci_find_max_end_va(); >> >> bar_addr = pci_map_addr; >> - pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); >> + pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size); >> >> maps[i].addr = bar_addr; >> - maps[i].offset = reg.offset; 
>> - maps[i].size = reg.size; >> + maps[i].offset = reg->offset; >> + maps[i].size = reg->size; >> maps[i].path = NULL; /* vfio doesn't have per-resource paths */ >> >> ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); >> if (ret < 0) { >> RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", >> pci_addr, i, strerror(errno)); >> + free(reg); >> goto err_vfio_res; >> } >> >> dev->mem_resource[i].addr = maps[i].addr; >> + >> + free(reg); >> } >> >> if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) { >> diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h >> index 5ca13fcce..f6617e004 100644 >> --- a/lib/librte_eal/common/include/rte_vfio.h >> +++ b/lib/librte_eal/common/include/rte_vfio.h >> @@ -14,6 +14,8 @@ >> extern "C" { >> #endif >> >> +#include >> + >> /* >> * determine if VFIO is present on the system >> */ >> @@ -44,6 +46,30 @@ extern "C" { >> #define RTE_VFIO_NOIOMMU 8 >> #endif >> >> +/* >> + * capabilities are only supported on kernel 4.6+. there were also some API >> + * changes as well, so add a macro to get cap offset. >> + */ >> +#ifdef VFIO_REGION_INFO_FLAG_CAPS >> +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS >> +#define VFIO_CAP_OFFSET(x) (x->cap_offset) >> +#else >> +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3) >> +#define VFIO_CAP_OFFSET(x) (x->resv) >> +struct vfio_info_cap_header { >> + uint16_t id; >> + uint16_t version; >> + uint32_t next; >> +}; >> +#endif >> + >> +/* kernels 4.16+ can map BAR containing MSI-X table */ >> +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE >> +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE >> +#else >> +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3 >> +#endif >> + >> #else /* not VFIO_PRESENT */ >> >> /* we don't need an actual definition, only pointer is used */ >> -- >> 2.17.1 > > Hi Anatoly, > I have tested the patch on our ppc64le machine, but the > ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info) in > pci_vfio_get_region_info() failed. 
> This may be an issue with the ppc64le VFIO implementation. Let me investigate more... > > Thanks, > Takeshi > Hi Takeshi, I think there's a bug in my patch. I'll submit a v2. -- Thanks, Anatoly