DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it
@ 2018-07-30 11:17 Anatoly Burakov
  2018-07-31  9:38 ` Takeshi Yoshimura
  2018-07-31 11:28 ` [dpdk-dev] [PATCH 18.11 v2] " Anatoly Burakov
  0 siblings, 2 replies; 8+ messages in thread
From: Anatoly Burakov @ 2018-07-30 11:17 UTC (permalink / raw)
  To: dev; +Cc: jerin.jacob, thomas, t.yoshimura8869

Currently, DPDK will skip mapping some areas (or even an entire BAR)
if the MSI-X table happens to be in them but is smaller than page size.

Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
as a capability flag. Capability flags themselves are also only
supported since kernel 4.6 [2].

This commit will introduce support for checking VFIO capabilities,
and will use it to check if we are allowed to map BARs with MSI-X
tables in them, along with backwards compatibility for older
kernels, including a workaround for a variable rename in VFIO
region info structure [3].

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279

[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
 lib/librte_eal/common/include/rte_vfio.h |  26 +++++
 2 files changed, 140 insertions(+), 13 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 686386d6a..e7765ee11 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 	return 0;
 }
 
+/*
+ * region info may contain capability headers, so we need to keep reallocating
+ * the memory until we match allocated memory size with argsz.
+ */
+static int
+pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
+		int region)
+{
+	struct vfio_region_info *ri;
+	size_t argsz = sizeof(*ri);
+	int ret;
+
+	ri = malloc(sizeof(*ri));
+	if (ri == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
+		return -1;
+	}
+again:
+	memset(ri, 0, argsz);
+	ri->argsz = argsz;
+	ri->index = region;
+
+	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info);
+	if (ret) {
+		free(ri);
+		return ret;
+	}
+	if (ri->argsz != argsz) {
+		argsz = ri->argsz;
+		ri = realloc(ri, argsz);
+
+		if (ri == NULL) {
+			RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
+			return -1;
+		}
+		goto again;
+	}
+	*info = ri;
+
+	return 0;
+}
+
+static struct vfio_info_cap_header *
+pci_vfio_info_cap(struct vfio_region_info *info, int cap)
+{
+	struct vfio_info_cap_header *h;
+	size_t offset;
+
+	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
+		/* VFIO info does not advertise capabilities */
+		return NULL;
+	}
+
+	offset = VFIO_CAP_OFFSET(info);
+	while (offset != 0) {
+		h = RTE_PTR_ADD(info, offset);
+		if (h->id == cap)
+			return h;
+		offset = h->next;
+	}
+	return NULL;
+}
+
+static int
+pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
+{
+	struct vfio_region_info *info;
+	int ret;
+
+	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
+	if (ret < 0)
+		return -1;
+
+	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
+
+	/* cleanup */
+	free(info);
+
+	return ret;
+}
+
+
 static int
 pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
@@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
 				pci_addr);
-		goto err_vfio_dev_fd;
+		goto err_vfio_res;
+	}
+	/* if we found our MSI-X BAR region, check if we can mmap it */
+	if (vfio_res->msix_table.bar_index != -1) {
+		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
+				vfio_res->msix_table.bar_index);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
+			goto err_vfio_res;
+		} else if (ret != 0) {
+			/* we can map it, so we don't care where it is */
+			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
+			vfio_res->msix_table.bar_index = -1;
+		}
 	}
 
 	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
-		struct vfio_region_info reg = { .argsz = sizeof(reg) };
+		struct vfio_region_info *reg;
 		void *bar_addr;
 
-		reg.index = i;
-
-		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
-		if (ret) {
+		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
+		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  %s cannot get device region info "
-					"error %i (%s)\n", pci_addr, errno, strerror(errno));
+				"error %i (%s)\n", pci_addr, errno,
+				strerror(errno));
 			goto err_vfio_res;
 		}
 
 		/* chk for io port region */
 		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
-		if (ret < 0)
+		if (ret < 0) {
+			free(reg);
 			goto err_vfio_res;
-		else if (ret) {
+		} else if (ret) {
 			RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
 					i);
+			free(reg);
 			continue;
 		}
 
 		/* skip non-mmapable BARs */
-		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+			free(reg);
 			continue;
+		}
 
 		/* try mapping somewhere close to the end of hugepages */
 		if (pci_map_addr == NULL)
 			pci_map_addr = pci_find_max_end_va();
 
 		bar_addr = pci_map_addr;
-		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
+		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
 
 		maps[i].addr = bar_addr;
-		maps[i].offset = reg.offset;
-		maps[i].size = reg.size;
+		maps[i].offset = reg->offset;
+		maps[i].size = reg->size;
 		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
 
 		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
 		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
 					pci_addr, i, strerror(errno));
+			free(reg);
 			goto err_vfio_res;
 		}
 
 		dev->mem_resource[i].addr = maps[i].addr;
+
+		free(reg);
 	}
 
 	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 5ca13fcce..f6617e004 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -14,6 +14,8 @@
 extern "C" {
 #endif
 
+#include <stdint.h>
+
 /*
  * determine if VFIO is present on the system
  */
@@ -44,6 +46,30 @@ extern "C" {
 #define RTE_VFIO_NOIOMMU 8
 #endif
 
+/*
+ * capabilities are only supported on kernel 4.6+. there were also some API
+ * changes as well, so add a macro to get cap offset.
+ */
+#ifdef VFIO_REGION_INFO_FLAG_CAPS
+#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
+#define VFIO_CAP_OFFSET(x) (x->cap_offset)
+#else
+#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
+#define VFIO_CAP_OFFSET(x) (x->resv)
+struct vfio_info_cap_header {
+	uint16_t id;
+	uint16_t version;
+	uint32_t next;
+};
+#endif
+
+/* kernels 4.16+ can map BAR containing MSI-X table */
+#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#else
+#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
+#endif
+
 #else /* not VFIO_PRESENT */
 
 /* we don't need an actual definition, only pointer is used */
-- 
2.17.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it
  2018-07-30 11:17 [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it Anatoly Burakov
@ 2018-07-31  9:38 ` Takeshi Yoshimura
  2018-07-31 11:24   ` Burakov, Anatoly
  2018-07-31 11:28 ` [dpdk-dev] [PATCH 18.11 v2] " Anatoly Burakov
  1 sibling, 1 reply; 8+ messages in thread
From: Takeshi Yoshimura @ 2018-07-31  9:38 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev, Jerin Jacob, thomas

2018-07-30 20:17 GMT+09:00 Anatoly Burakov <anatoly.burakov@intel.com>:
> Currently, DPDK will skip mapping some areas (or even an entire BAR)
> if the MSI-X table happens to be in them but is smaller than page size.
>
> Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
> as a capability flag. Capability flags themselves are also only
> supported since kernel 4.6 [2].
>
> This commit will introduce support for checking VFIO capabilities,
> and will use it to check if we are allowed to map BARs with MSI-X
> tables in them, along with backwards compatibility for older
> kernels, including a workaround for a variable rename in VFIO
> region info structure [3].
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6
>
> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279
>
> [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4
>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>  drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
>  lib/librte_eal/common/include/rte_vfio.h |  26 +++++
>  2 files changed, 140 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
> index 686386d6a..e7765ee11 100644
> --- a/drivers/bus/pci/linux/pci_vfio.c
> +++ b/drivers/bus/pci/linux/pci_vfio.c
> @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
>         return 0;
>  }
>
> +/*
> + * region info may contain capability headers, so we need to keep reallocating
> + * the memory until we match allocated memory size with argsz.
> + */
> +static int
> +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
> +               int region)
> +{
> +       struct vfio_region_info *ri;
> +       size_t argsz = sizeof(*ri);
> +       int ret;
> +
> +       ri = malloc(sizeof(*ri));
> +       if (ri == NULL) {
> +               RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
> +               return -1;
> +       }
> +again:
> +       memset(ri, 0, argsz);
> +       ri->argsz = argsz;
> +       ri->index = region;
> +
> +       ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info);
> +       if (ret) {
> +               free(ri);
> +               return ret;
> +       }
> +       if (ri->argsz != argsz) {
> +               argsz = ri->argsz;
> +               ri = realloc(ri, argsz);
> +
> +               if (ri == NULL) {
> +                       RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
> +                       return -1;
> +               }
> +               goto again;
> +       }
> +       *info = ri;
> +
> +       return 0;
> +}
> +
> +static struct vfio_info_cap_header *
> +pci_vfio_info_cap(struct vfio_region_info *info, int cap)
> +{
> +       struct vfio_info_cap_header *h;
> +       size_t offset;
> +
> +       if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
> +               /* VFIO info does not advertise capabilities */
> +               return NULL;
> +       }
> +
> +       offset = VFIO_CAP_OFFSET(info);
> +       while (offset != 0) {
> +               h = RTE_PTR_ADD(info, offset);
> +               if (h->id == cap)
> +                       return h;
> +               offset = h->next;
> +       }
> +       return NULL;
> +}
> +
> +static int
> +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
> +{
> +       struct vfio_region_info *info;
> +       int ret;
> +
> +       ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
> +       if (ret < 0)
> +               return -1;
> +
> +       ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
> +
> +       /* cleanup */
> +       free(info);
> +
> +       return ret;
> +}
> +
> +
>  static int
>  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>  {
> @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>         if (ret < 0) {
>                 RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
>                                 pci_addr);
> -               goto err_vfio_dev_fd;
> +               goto err_vfio_res;
> +       }
> +       /* if we found our MSI-X BAR region, check if we can mmap it */
> +       if (vfio_res->msix_table.bar_index != -1) {
> +               int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
> +                               vfio_res->msix_table.bar_index);
> +               if (ret < 0) {
> +                       RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
> +                       goto err_vfio_res;
> +               } else if (ret != 0) {
> +                       /* we can map it, so we don't care where it is */
> +                       RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
> +                       vfio_res->msix_table.bar_index = -1;
> +               }
>         }
>
>         for (i = 0; i < (int) vfio_res->nb_maps; i++) {
> -               struct vfio_region_info reg = { .argsz = sizeof(reg) };
> +               struct vfio_region_info *reg;
>                 void *bar_addr;
>
> -               reg.index = i;
> -
> -               ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
> -               if (ret) {
> +               ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
> +               if (ret < 0) {
>                         RTE_LOG(ERR, EAL, "  %s cannot get device region info "
> -                                       "error %i (%s)\n", pci_addr, errno, strerror(errno));
> +                               "error %i (%s)\n", pci_addr, errno,
> +                               strerror(errno));
>                         goto err_vfio_res;
>                 }
>
>                 /* chk for io port region */
>                 ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
> -               if (ret < 0)
> +               if (ret < 0) {
> +                       free(reg);
>                         goto err_vfio_res;
> -               else if (ret) {
> +               } else if (ret) {
>                         RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
>                                         i);
> +                       free(reg);
>                         continue;
>                 }
>
>                 /* skip non-mmapable BARs */
> -               if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
> +               if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
> +                       free(reg);
>                         continue;
> +               }
>
>                 /* try mapping somewhere close to the end of hugepages */
>                 if (pci_map_addr == NULL)
>                         pci_map_addr = pci_find_max_end_va();
>
>                 bar_addr = pci_map_addr;
> -               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
> +               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
>
>                 maps[i].addr = bar_addr;
> -               maps[i].offset = reg.offset;
> -               maps[i].size = reg.size;
> +               maps[i].offset = reg->offset;
> +               maps[i].size = reg->size;
>                 maps[i].path = NULL; /* vfio doesn't have per-resource paths */
>
>                 ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
>                 if (ret < 0) {
>                         RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
>                                         pci_addr, i, strerror(errno));
> +                       free(reg);
>                         goto err_vfio_res;
>                 }
>
>                 dev->mem_resource[i].addr = maps[i].addr;
> +
> +               free(reg);
>         }
>
>         if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
> diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
> index 5ca13fcce..f6617e004 100644
> --- a/lib/librte_eal/common/include/rte_vfio.h
> +++ b/lib/librte_eal/common/include/rte_vfio.h
> @@ -14,6 +14,8 @@
>  extern "C" {
>  #endif
>
> +#include <stdint.h>
> +
>  /*
>   * determine if VFIO is present on the system
>   */
> @@ -44,6 +46,30 @@ extern "C" {
>  #define RTE_VFIO_NOIOMMU 8
>  #endif
>
> +/*
> + * capabilities are only supported on kernel 4.6+. there were also some API
> + * changes as well, so add a macro to get cap offset.
> + */
> +#ifdef VFIO_REGION_INFO_FLAG_CAPS
> +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
> +#define VFIO_CAP_OFFSET(x) (x->cap_offset)
> +#else
> +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
> +#define VFIO_CAP_OFFSET(x) (x->resv)
> +struct vfio_info_cap_header {
> +       uint16_t id;
> +       uint16_t version;
> +       uint32_t next;
> +};
> +#endif
> +
> +/* kernels 4.16+ can map BAR containing MSI-X table */
> +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
> +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
> +#else
> +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
> +#endif
> +
>  #else /* not VFIO_PRESENT */
>
>  /* we don't need an actual definition, only pointer is used */
> --
> 2.17.1

Hi Anatoly,
I have tested the patch on our ppc64le machine, but the
ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info) in
pci_vfio_get_region_info() failed.
This may be an issue with the ppc64le VFIO implementation. Let me investigate further...

Thanks,
Takeshi

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it
  2018-07-31  9:38 ` Takeshi Yoshimura
@ 2018-07-31 11:24   ` Burakov, Anatoly
  0 siblings, 0 replies; 8+ messages in thread
From: Burakov, Anatoly @ 2018-07-31 11:24 UTC (permalink / raw)
  To: Takeshi Yoshimura; +Cc: dev, Jerin Jacob, thomas

On 31-Jul-18 10:38 AM, Takeshi Yoshimura wrote:
> 2018-07-30 20:17 GMT+09:00 Anatoly Burakov <anatoly.burakov@intel.com>:
>> Currently, DPDK will skip mapping some areas (or even an entire BAR)
>> if the MSI-X table happens to be in them but is smaller than page size.
>>
>> Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
>> as a capability flag. Capability flags themselves are also only
>> supported since kernel 4.6 [2].
>>
>> This commit will introduce support for checking VFIO capabilities,
>> and will use it to check if we are allowed to map BARs with MSI-X
>> tables in them, along with backwards compatibility for older
>> kernels, including a workaround for a variable rename in VFIO
>> region info structure [3].
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6
>>
>> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279
>>
>> [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4
>>
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>> ---
>>   drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
>>   lib/librte_eal/common/include/rte_vfio.h |  26 +++++
>>   2 files changed, 140 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
>> index 686386d6a..e7765ee11 100644
>> --- a/drivers/bus/pci/linux/pci_vfio.c
>> +++ b/drivers/bus/pci/linux/pci_vfio.c
>> @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
>>          return 0;
>>   }
>>
>> +/*
>> + * region info may contain capability headers, so we need to keep reallocating
>> + * the memory until we match allocated memory size with argsz.
>> + */
>> +static int
>> +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
>> +               int region)
>> +{
>> +       struct vfio_region_info *ri;
>> +       size_t argsz = sizeof(*ri);
>> +       int ret;
>> +
>> +       ri = malloc(sizeof(*ri));
>> +       if (ri == NULL) {
>> +               RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
>> +               return -1;
>> +       }
>> +again:
>> +       memset(ri, 0, argsz);
>> +       ri->argsz = argsz;
>> +       ri->index = region;
>> +
>> +       ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info);
>> +       if (ret) {
>> +               free(ri);
>> +               return ret;
>> +       }
>> +       if (ri->argsz != argsz) {
>> +               argsz = ri->argsz;
>> +               ri = realloc(ri, argsz);
>> +
>> +               if (ri == NULL) {
>> +                       RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
>> +                       return -1;
>> +               }
>> +               goto again;
>> +       }
>> +       *info = ri;
>> +
>> +       return 0;
>> +}
>> +
>> +static struct vfio_info_cap_header *
>> +pci_vfio_info_cap(struct vfio_region_info *info, int cap)
>> +{
>> +       struct vfio_info_cap_header *h;
>> +       size_t offset;
>> +
>> +       if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
>> +               /* VFIO info does not advertise capabilities */
>> +               return NULL;
>> +       }
>> +
>> +       offset = VFIO_CAP_OFFSET(info);
>> +       while (offset != 0) {
>> +               h = RTE_PTR_ADD(info, offset);
>> +               if (h->id == cap)
>> +                       return h;
>> +               offset = h->next;
>> +       }
>> +       return NULL;
>> +}
>> +
>> +static int
>> +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
>> +{
>> +       struct vfio_region_info *info;
>> +       int ret;
>> +
>> +       ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
>> +       if (ret < 0)
>> +               return -1;
>> +
>> +       ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
>> +
>> +       /* cleanup */
>> +       free(info);
>> +
>> +       return ret;
>> +}
>> +
>> +
>>   static int
>>   pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>>   {
>> @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>>          if (ret < 0) {
>>                  RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
>>                                  pci_addr);
>> -               goto err_vfio_dev_fd;
>> +               goto err_vfio_res;
>> +       }
>> +       /* if we found our MSI-X BAR region, check if we can mmap it */
>> +       if (vfio_res->msix_table.bar_index != -1) {
>> +               int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
>> +                               vfio_res->msix_table.bar_index);
>> +               if (ret < 0) {
>> +                       RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
>> +                       goto err_vfio_res;
>> +               } else if (ret != 0) {
>> +                       /* we can map it, so we don't care where it is */
>> +                       RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
>> +                       vfio_res->msix_table.bar_index = -1;
>> +               }
>>          }
>>
>>          for (i = 0; i < (int) vfio_res->nb_maps; i++) {
>> -               struct vfio_region_info reg = { .argsz = sizeof(reg) };
>> +               struct vfio_region_info *reg;
>>                  void *bar_addr;
>>
>> -               reg.index = i;
>> -
>> -               ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
>> -               if (ret) {
>> +               ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
>> +               if (ret < 0) {
>>                          RTE_LOG(ERR, EAL, "  %s cannot get device region info "
>> -                                       "error %i (%s)\n", pci_addr, errno, strerror(errno));
>> +                               "error %i (%s)\n", pci_addr, errno,
>> +                               strerror(errno));
>>                          goto err_vfio_res;
>>                  }
>>
>>                  /* chk for io port region */
>>                  ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
>> -               if (ret < 0)
>> +               if (ret < 0) {
>> +                       free(reg);
>>                          goto err_vfio_res;
>> -               else if (ret) {
>> +               } else if (ret) {
>>                          RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
>>                                          i);
>> +                       free(reg);
>>                          continue;
>>                  }
>>
>>                  /* skip non-mmapable BARs */
>> -               if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
>> +               if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
>> +                       free(reg);
>>                          continue;
>> +               }
>>
>>                  /* try mapping somewhere close to the end of hugepages */
>>                  if (pci_map_addr == NULL)
>>                          pci_map_addr = pci_find_max_end_va();
>>
>>                  bar_addr = pci_map_addr;
>> -               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
>> +               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
>>
>>                  maps[i].addr = bar_addr;
>> -               maps[i].offset = reg.offset;
>> -               maps[i].size = reg.size;
>> +               maps[i].offset = reg->offset;
>> +               maps[i].size = reg->size;
>>                  maps[i].path = NULL; /* vfio doesn't have per-resource paths */
>>
>>                  ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
>>                  if (ret < 0) {
>>                          RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
>>                                          pci_addr, i, strerror(errno));
>> +                       free(reg);
>>                          goto err_vfio_res;
>>                  }
>>
>>                  dev->mem_resource[i].addr = maps[i].addr;
>> +
>> +               free(reg);
>>          }
>>
>>          if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
>> diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
>> index 5ca13fcce..f6617e004 100644
>> --- a/lib/librte_eal/common/include/rte_vfio.h
>> +++ b/lib/librte_eal/common/include/rte_vfio.h
>> @@ -14,6 +14,8 @@
>>   extern "C" {
>>   #endif
>>
>> +#include <stdint.h>
>> +
>>   /*
>>    * determine if VFIO is present on the system
>>    */
>> @@ -44,6 +46,30 @@ extern "C" {
>>   #define RTE_VFIO_NOIOMMU 8
>>   #endif
>>
>> +/*
>> + * capabilities are only supported on kernel 4.6+. there were also some API
>> + * changes as well, so add a macro to get cap offset.
>> + */
>> +#ifdef VFIO_REGION_INFO_FLAG_CAPS
>> +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
>> +#define VFIO_CAP_OFFSET(x) (x->cap_offset)
>> +#else
>> +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
>> +#define VFIO_CAP_OFFSET(x) (x->resv)
>> +struct vfio_info_cap_header {
>> +       uint16_t id;
>> +       uint16_t version;
>> +       uint32_t next;
>> +};
>> +#endif
>> +
>> +/* kernels 4.16+ can map BAR containing MSI-X table */
>> +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
>> +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
>> +#else
>> +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
>> +#endif
>> +
>>   #else /* not VFIO_PRESENT */
>>
>>   /* we don't need an actual definition, only pointer is used */
>> --
>> 2.17.1
> 
> Hi Anatoly,
> I have tested the patch on our ppc64le machine, but the
> ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info) in
> pci_vfio_get_region_info() failed.
> This may be an issue of ppc64le VFIO implementation. Let me investigate more...
> 
> Thanks,
> Takeshi
> 

Hi Takeshi, I think there's a bug in my patch. I'll submit a v2.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH 18.11 v2] pci/vfio: allow mapping MSI-X BARs if kernel allows it
  2018-07-30 11:17 [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it Anatoly Burakov
  2018-07-31  9:38 ` Takeshi Yoshimura
@ 2018-07-31 11:28 ` Anatoly Burakov
  2018-08-02  6:47   ` Takeshi Yoshimura
  2018-09-20 13:11   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
  1 sibling, 2 replies; 8+ messages in thread
From: Anatoly Burakov @ 2018-07-31 11:28 UTC (permalink / raw)
  To: dev; +Cc: jerin.jacob, thomas, t.yoshimura8869

Currently, DPDK will skip mapping some areas (or even an entire BAR)
if MSI-X table happens to be in them but is smaller than page size.

Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
as a capability flag. Capability flags themselves are also only
supported since kernel 4.6 [2].

This commit will introduce support for checking VFIO capabilities,
and will use it to check if we are allowed to map BARs with MSI-X
tables in them, along with backwards compatibility for older
kernels, including a workaround for a variable rename in VFIO
region info structure [3].

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279

[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v1->v2:
    - Fix pointer in pci_vfio_get_region_info
    - Fix commit message

 drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
 lib/librte_eal/common/include/rte_vfio.h |  26 +++++
 2 files changed, 140 insertions(+), 13 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 686386d6a..24f665c20 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 	return 0;
 }
 
+/*
+ * region info may contain capability headers, so we need to keep reallocating
+ * the memory until we match allocated memory size with argsz.
+ */
+static int
+pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
+		int region)
+{
+	struct vfio_region_info *ri;
+	size_t argsz = sizeof(*ri);
+	int ret;
+
+	ri = malloc(sizeof(*ri));
+	if (ri == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
+		return -1;
+	}
+again:
+	memset(ri, 0, argsz);
+	ri->argsz = argsz;
+	ri->index = region;
+
+	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
+	if (ret) {
+		free(ri);
+		return ret;
+	}
+	if (ri->argsz != argsz) {
+		argsz = ri->argsz;
+		ri = realloc(ri, argsz);
+
+		if (ri == NULL) {
+			RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
+			return -1;
+		}
+		goto again;
+	}
+	*info = ri;
+
+	return 0;
+}
+
+static struct vfio_info_cap_header *
+pci_vfio_info_cap(struct vfio_region_info *info, int cap)
+{
+	struct vfio_info_cap_header *h;
+	size_t offset;
+
+	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
+		/* VFIO info does not advertise capabilities */
+		return NULL;
+	}
+
+	offset = VFIO_CAP_OFFSET(info);
+	while (offset != 0) {
+		h = RTE_PTR_ADD(info, offset);
+		if (h->id == cap)
+			return h;
+		offset = h->next;
+	}
+	return NULL;
+}
+
+static int
+pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
+{
+	struct vfio_region_info *info;
+	int ret;
+
+	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
+	if (ret < 0)
+		return -1;
+
+	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
+
+	/* cleanup */
+	free(info);
+
+	return ret;
+}
+
+
 static int
 pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
@@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
 				pci_addr);
-		goto err_vfio_dev_fd;
+		goto err_vfio_res;
+	}
+	/* if we found our MSI-X BAR region, check if we can mmap it */
+	if (vfio_res->msix_table.bar_index != -1) {
+		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
+				vfio_res->msix_table.bar_index);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
+			goto err_vfio_res;
+		} else if (ret != 0) {
+			/* we can map it, so we don't care where it is */
+			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
+			vfio_res->msix_table.bar_index = -1;
+		}
 	}
 
 	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
-		struct vfio_region_info reg = { .argsz = sizeof(reg) };
+		struct vfio_region_info *reg;
 		void *bar_addr;
 
-		reg.index = i;
-
-		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
-		if (ret) {
+		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
+		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  %s cannot get device region info "
-					"error %i (%s)\n", pci_addr, errno, strerror(errno));
+				"error %i (%s)\n", pci_addr, errno,
+				strerror(errno));
 			goto err_vfio_res;
 		}
 
 		/* chk for io port region */
 		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
-		if (ret < 0)
+		if (ret < 0) {
+			free(reg);
 			goto err_vfio_res;
-		else if (ret) {
+		} else if (ret) {
 			RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
 					i);
+			free(reg);
 			continue;
 		}
 
 		/* skip non-mmapable BARs */
-		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+			free(reg);
 			continue;
+		}
 
 		/* try mapping somewhere close to the end of hugepages */
 		if (pci_map_addr == NULL)
 			pci_map_addr = pci_find_max_end_va();
 
 		bar_addr = pci_map_addr;
-		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
+		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
 
 		maps[i].addr = bar_addr;
-		maps[i].offset = reg.offset;
-		maps[i].size = reg.size;
+		maps[i].offset = reg->offset;
+		maps[i].size = reg->size;
 		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
 
 		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
 		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
 					pci_addr, i, strerror(errno));
+			free(reg);
 			goto err_vfio_res;
 		}
 
 		dev->mem_resource[i].addr = maps[i].addr;
+
+		free(reg);
 	}
 
 	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 5ca13fcce..f6617e004 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -14,6 +14,8 @@
 extern "C" {
 #endif
 
+#include <stdint.h>
+
 /*
  * determine if VFIO is present on the system
  */
@@ -44,6 +46,30 @@ extern "C" {
 #define RTE_VFIO_NOIOMMU 8
 #endif
 
+/*
+ * capabilities are only supported on kernel 4.6+. there were also some API
+ * changes as well, so add a macro to get cap offset.
+ */
+#ifdef VFIO_REGION_INFO_FLAG_CAPS
+#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
+#define VFIO_CAP_OFFSET(x) (x->cap_offset)
+#else
+#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
+#define VFIO_CAP_OFFSET(x) (x->resv)
+struct vfio_info_cap_header {
+	uint16_t id;
+	uint16_t version;
+	uint32_t next;
+};
+#endif
+
+/* kernels 4.16+ can map BAR containing MSI-X table */
+#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#else
+#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
+#endif
+
 #else /* not VFIO_PRESENT */
 
 /* we don't need an actual definition, only pointer is used */
-- 
2.17.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 18.11 v2] pci/vfio: allow mapping MSI-X BARs if kernel allows it
  2018-07-31 11:28 ` [dpdk-dev] [PATCH 18.11 v2] " Anatoly Burakov
@ 2018-08-02  6:47   ` Takeshi Yoshimura
  2018-08-02  8:17     ` Burakov, Anatoly
  2018-09-20 13:11   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
  1 sibling, 1 reply; 8+ messages in thread
From: Takeshi Yoshimura @ 2018-08-02  6:47 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev, Jerin Jacob, thomas

2018-07-31 20:28 GMT+09:00 Anatoly Burakov <anatoly.burakov@intel.com>:
> Currently, DPDK will skip mapping some areas (or even an entire BAR)
> if MSI-X table happens to be in them but is smaller than page size.
>
> Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
> as a capability flag. Capability flags themselves are also only
> supported since kernel 4.6 [2].
>
> This commit will introduce support for checking VFIO capabilities,
> and will use it to check if we are allowed to map BARs with MSI-X
> tables in them, along with backwards compatibility for older
> kernels, including a workaround for a variable rename in VFIO
> region info structure [3].
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6
>
> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279
>
> [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4
>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>
> Notes:
>     v2->v1:
>     - Fix pointer in pci_vfio_get_region_info
>     - Fix commit message
>
>  drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
>  lib/librte_eal/common/include/rte_vfio.h |  26 +++++
>  2 files changed, 140 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
> index 686386d6a..24f665c20 100644
> --- a/drivers/bus/pci/linux/pci_vfio.c
> +++ b/drivers/bus/pci/linux/pci_vfio.c
> @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
>         return 0;
>  }
>
> +/*
> + * region info may contain capability headers, so we need to keep reallocating
> + * the memory until we match allocated memory size with argsz.
> + */
> +static int
> +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
> +               int region)
> +{
> +       struct vfio_region_info *ri;
> +       size_t argsz = sizeof(*ri);
> +       int ret;
> +
> +       ri = malloc(sizeof(*ri));
> +       if (ri == NULL) {
> +               RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
> +               return -1;
> +       }
> +again:
> +       memset(ri, 0, argsz);
> +       ri->argsz = argsz;
> +       ri->index = region;
> +
> +       ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
> +       if (ret) {
> +               free(ri);
> +               return ret;
> +       }
> +       if (ri->argsz != argsz) {
> +               argsz = ri->argsz;
> +               ri = realloc(ri, argsz);
> +
> +               if (ri == NULL) {
> +                       RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
> +                       return -1;
> +               }
> +               goto again;
> +       }
> +       *info = ri;
> +
> +       return 0;
> +}
> +
> +static struct vfio_info_cap_header *
> +pci_vfio_info_cap(struct vfio_region_info *info, int cap)
> +{
> +       struct vfio_info_cap_header *h;
> +       size_t offset;
> +
> +       if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
> +               /* VFIO info does not advertise capabilities */
> +               return NULL;
> +       }
> +
> +       offset = VFIO_CAP_OFFSET(info);
> +       while (offset != 0) {
> +               h = RTE_PTR_ADD(info, offset);
> +               if (h->id == cap)
> +                       return h;
> +               offset = h->next;
> +       }
> +       return NULL;
> +}
> +
> +static int
> +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
> +{
> +       struct vfio_region_info *info;
> +       int ret;
> +
> +       ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
> +       if (ret < 0)
> +               return -1;
> +
> +       ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
> +
> +       /* cleanup */
> +       free(info);
> +
> +       return ret;
> +}
> +
> +
>  static int
>  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>  {
> @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>         if (ret < 0) {
>                 RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
>                                 pci_addr);
> -               goto err_vfio_dev_fd;
> +               goto err_vfio_res;
> +       }
> +       /* if we found our MSI-X BAR region, check if we can mmap it */
> +       if (vfio_res->msix_table.bar_index != -1) {
> +               int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
> +                               vfio_res->msix_table.bar_index);
> +               if (ret < 0) {
> +                       RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
> +                       goto err_vfio_res;
> +               } else if (ret != 0) {
> +                       /* we can map it, so we don't care where it is */
> +                       RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
> +                       vfio_res->msix_table.bar_index = -1;
> +               }
>         }
>
>         for (i = 0; i < (int) vfio_res->nb_maps; i++) {
> -               struct vfio_region_info reg = { .argsz = sizeof(reg) };
> +               struct vfio_region_info *reg;
>                 void *bar_addr;
>
> -               reg.index = i;
> -
> -               ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
> -               if (ret) {
> +               ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
> +               if (ret < 0) {
>                         RTE_LOG(ERR, EAL, "  %s cannot get device region info "
> -                                       "error %i (%s)\n", pci_addr, errno, strerror(errno));
> +                               "error %i (%s)\n", pci_addr, errno,
> +                               strerror(errno));
>                         goto err_vfio_res;
>                 }
>
>                 /* chk for io port region */
>                 ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
> -               if (ret < 0)
> +               if (ret < 0) {
> +                       free(reg);
>                         goto err_vfio_res;
> -               else if (ret) {
> +               } else if (ret) {
>                         RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
>                                         i);
> +                       free(reg);
>                         continue;
>                 }
>
>                 /* skip non-mmapable BARs */
> -               if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
> +               if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
> +                       free(reg);
>                         continue;
> +               }
>
>                 /* try mapping somewhere close to the end of hugepages */
>                 if (pci_map_addr == NULL)
>                         pci_map_addr = pci_find_max_end_va();
>
>                 bar_addr = pci_map_addr;
> -               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
> +               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
>
>                 maps[i].addr = bar_addr;
> -               maps[i].offset = reg.offset;
> -               maps[i].size = reg.size;
> +               maps[i].offset = reg->offset;
> +               maps[i].size = reg->size;
>                 maps[i].path = NULL; /* vfio doesn't have per-resource paths */
>
>                 ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
>                 if (ret < 0) {
>                         RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
>                                         pci_addr, i, strerror(errno));
> +                       free(reg);
>                         goto err_vfio_res;
>                 }
>
>                 dev->mem_resource[i].addr = maps[i].addr;
> +
> +               free(reg);
>         }
>
>         if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
> diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
> index 5ca13fcce..f6617e004 100644
> --- a/lib/librte_eal/common/include/rte_vfio.h
> +++ b/lib/librte_eal/common/include/rte_vfio.h
> @@ -14,6 +14,8 @@
>  extern "C" {
>  #endif
>
> +#include <stdint.h>
> +
>  /*
>   * determine if VFIO is present on the system
>   */
> @@ -44,6 +46,30 @@ extern "C" {
>  #define RTE_VFIO_NOIOMMU 8
>  #endif
>
> +/*
> + * capabilities are only supported on kernel 4.6+. there were also some API
> + * changes as well, so add a macro to get cap offset.
> + */
> +#ifdef VFIO_REGION_INFO_FLAG_CAPS
> +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
> +#define VFIO_CAP_OFFSET(x) (x->cap_offset)
> +#else
> +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
> +#define VFIO_CAP_OFFSET(x) (x->resv)
> +struct vfio_info_cap_header {
> +       uint16_t id;
> +       uint16_t version;
> +       uint32_t next;
> +};
> +#endif
> +
> +/* kernels 4.16+ can map BAR containing MSI-X table */
> +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
> +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
> +#else
> +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
> +#endif
> +
>  #else /* not VFIO_PRESENT */
>
>  /* we don't need an actual definition, only pointer is used */
> --
> 2.17.1

Hi Anatoly,
Please fix the error check for ioctl in pci_vfio_get_region_info() from
"if (ret)" to "if (ret < 0)".
My environment reported compiler errors (with -Werror=maybe-uninitialized).

dpdk/drivers/bus/pci/linux/pci_vfio.c: In function
‘pci_vfio_map_resource_primary’:
dpdk/drivers/bus/pci/linux/pci_vfio.c:612:4: error: ‘reg’ may be used
uninitialized in this function [-Werror=maybe-uninitialized]
    free(reg);
    ^~~~~~~~~
dpdk/drivers/bus/pci/linux/pci_vfio.c:495:2: error: ‘info’ may be used
uninitialized in this function [-Werror=maybe-uninitialized]
  free(info);
  ^~~~~~~~~~
dpdk/drivers/bus/pci/linux/pci_vfio.c:485:27: note: ‘info’ was declared here
  struct vfio_region_info *info;


Other code looks good to me.
I tested the updated patch with the above change and confirmed it
could mmap BAR on my ppc64le machine.

Thanks,
Takeshi

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 18.11 v2] pci/vfio: allow mapping MSI-X BARs if kernel allows it
  2018-08-02  6:47   ` Takeshi Yoshimura
@ 2018-08-02  8:17     ` Burakov, Anatoly
  0 siblings, 0 replies; 8+ messages in thread
From: Burakov, Anatoly @ 2018-08-02  8:17 UTC (permalink / raw)
  To: Takeshi Yoshimura; +Cc: dev, Jerin Jacob, thomas

On 02-Aug-18 7:47 AM, Takeshi Yoshimura wrote:
> 2018-07-31 20:28 GMT+09:00 Anatoly Burakov <anatoly.burakov@intel.com>:
>> Currently, DPDK will skip mapping some areas (or even an entire BAR)
>> if MSI-X table happens to be in them but is smaller than page size.
>>
>> Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
>> as a capability flag. Capability flags themselves are also only
>> supported since kernel 4.6 [2].
>>
>> This commit will introduce support for checking VFIO capabilities,
>> and will use it to check if we are allowed to map BARs with MSI-X
>> tables in them, along with backwards compatibility for older
>> kernels, including a workaround for a variable rename in VFIO
>> region info structure [3].
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6
>>
>> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279
>>
>> [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4
>>
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>> ---
>>
>> Notes:
>>      v2->v1:
>>      - Fix pointer in pci_vfio_get_region_info
>>      - Fix commit message
>>
>>   drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
>>   lib/librte_eal/common/include/rte_vfio.h |  26 +++++
>>   2 files changed, 140 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
>> index 686386d6a..24f665c20 100644
>> --- a/drivers/bus/pci/linux/pci_vfio.c
>> +++ b/drivers/bus/pci/linux/pci_vfio.c
>> @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
>>          return 0;
>>   }
>>
>> +/*
>> + * region info may contain capability headers, so we need to keep reallocating
>> + * the memory until we match allocated memory size with argsz.
>> + */
>> +static int
>> +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
>> +               int region)
>> +{
>> +       struct vfio_region_info *ri;
>> +       size_t argsz = sizeof(*ri);
>> +       int ret;
>> +
>> +       ri = malloc(sizeof(*ri));
>> +       if (ri == NULL) {
>> +               RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
>> +               return -1;
>> +       }
>> +again:
>> +       memset(ri, 0, argsz);
>> +       ri->argsz = argsz;
>> +       ri->index = region;
>> +
>> +       ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
>> +       if (ret) {
>> +               free(ri);
>> +               return ret;
>> +       }
>> +       if (ri->argsz != argsz) {
>> +               argsz = ri->argsz;
>> +               ri = realloc(ri, argsz);
>> +
>> +               if (ri == NULL) {
>> +                       RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
>> +                       return -1;
>> +               }
>> +               goto again;
>> +       }
>> +       *info = ri;
>> +
>> +       return 0;
>> +}
>> +
>> +static struct vfio_info_cap_header *
>> +pci_vfio_info_cap(struct vfio_region_info *info, int cap)
>> +{
>> +       struct vfio_info_cap_header *h;
>> +       size_t offset;
>> +
>> +       if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
>> +               /* VFIO info does not advertise capabilities */
>> +               return NULL;
>> +       }
>> +
>> +       offset = VFIO_CAP_OFFSET(info);
>> +       while (offset != 0) {
>> +               h = RTE_PTR_ADD(info, offset);
>> +               if (h->id == cap)
>> +                       return h;
>> +               offset = h->next;
>> +       }
>> +       return NULL;
>> +}
>> +
>> +static int
>> +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
>> +{
>> +       struct vfio_region_info *info;
>> +       int ret;
>> +
>> +       ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
>> +       if (ret < 0)
>> +               return -1;
>> +
>> +       ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
>> +
>> +       /* cleanup */
>> +       free(info);
>> +
>> +       return ret;
>> +}
>> +
>> +
>>   static int
>>   pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>>   {
>> @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>>          if (ret < 0) {
>>                  RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
>>                                  pci_addr);
>> -               goto err_vfio_dev_fd;
>> +               goto err_vfio_res;
>> +       }
>> +       /* if we found our MSI-X BAR region, check if we can mmap it */
>> +       if (vfio_res->msix_table.bar_index != -1) {
>> +               int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
>> +                               vfio_res->msix_table.bar_index);
>> +               if (ret < 0) {
>> +                       RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
>> +                       goto err_vfio_res;
>> +               } else if (ret != 0) {
>> +                       /* we can map it, so we don't care where it is */
>> +                       RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
>> +                       vfio_res->msix_table.bar_index = -1;
>> +               }
>>          }
>>
>>          for (i = 0; i < (int) vfio_res->nb_maps; i++) {
>> -               struct vfio_region_info reg = { .argsz = sizeof(reg) };
>> +               struct vfio_region_info *reg;
>>                  void *bar_addr;
>>
>> -               reg.index = i;
>> -
>> -               ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
>> -               if (ret) {
>> +               ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
>> +               if (ret < 0) {
>>                          RTE_LOG(ERR, EAL, "  %s cannot get device region info "
>> -                                       "error %i (%s)\n", pci_addr, errno, strerror(errno));
>> +                               "error %i (%s)\n", pci_addr, errno,
>> +                               strerror(errno));
>>                          goto err_vfio_res;
>>                  }
>>
>>                  /* chk for io port region */
>>                  ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
>> -               if (ret < 0)
>> +               if (ret < 0) {
>> +                       free(reg);
>>                          goto err_vfio_res;
>> -               else if (ret) {
>> +               } else if (ret) {
>>                          RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
>>                                          i);
>> +                       free(reg);
>>                          continue;
>>                  }
>>
>>                  /* skip non-mmapable BARs */
>> -               if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
>> +               if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
>> +                       free(reg);
>>                          continue;
>> +               }
>>
>>                  /* try mapping somewhere close to the end of hugepages */
>>                  if (pci_map_addr == NULL)
>>                          pci_map_addr = pci_find_max_end_va();
>>
>>                  bar_addr = pci_map_addr;
>> -               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
>> +               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
>>
>>                  maps[i].addr = bar_addr;
>> -               maps[i].offset = reg.offset;
>> -               maps[i].size = reg.size;
>> +               maps[i].offset = reg->offset;
>> +               maps[i].size = reg->size;
>>                  maps[i].path = NULL; /* vfio doesn't have per-resource paths */
>>
>>                  ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
>>                  if (ret < 0) {
>>                          RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
>>                                          pci_addr, i, strerror(errno));
>> +                       free(reg);
>>                          goto err_vfio_res;
>>                  }
>>
>>                  dev->mem_resource[i].addr = maps[i].addr;
>> +
>> +               free(reg);
>>          }
>>
>>          if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
>> diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
>> index 5ca13fcce..f6617e004 100644
>> --- a/lib/librte_eal/common/include/rte_vfio.h
>> +++ b/lib/librte_eal/common/include/rte_vfio.h
>> @@ -14,6 +14,8 @@
>>   extern "C" {
>>   #endif
>>
>> +#include <stdint.h>
>> +
>>   /*
>>    * determine if VFIO is present on the system
>>    */
>> @@ -44,6 +46,30 @@ extern "C" {
>>   #define RTE_VFIO_NOIOMMU 8
>>   #endif
>>
>> +/*
>> + * capabilities are only supported on kernel 4.6+. there were also some API
>> + * changes as well, so add a macro to get cap offset.
>> + */
>> +#ifdef VFIO_REGION_INFO_FLAG_CAPS
>> +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
>> +#define VFIO_CAP_OFFSET(x) (x->cap_offset)
>> +#else
>> +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
>> +#define VFIO_CAP_OFFSET(x) (x->resv)
>> +struct vfio_info_cap_header {
>> +       uint16_t id;
>> +       uint16_t version;
>> +       uint32_t next;
>> +};
>> +#endif
>> +
>> +/* kernels 4.16+ can map BAR containing MSI-X table */
>> +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
>> +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
>> +#else
>> +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
>> +#endif
>> +
>>   #else /* not VFIO_PRESENT */
>>
>>   /* we don't need an actual definition, only pointer is used */
>> --
>> 2.17.1
> 
> Hi Anatoly,
> Please fix the error check for ioctl in pci_vfio_get_region_info() from
> "if (ret)" to "if (ret < 0)".
> My environment reported compiler errors (with -Werror=maybe-uninitialized).
> 
> dpdk/drivers/bus/pci/linux/pci_vfio.c: In function
> ‘pci_vfio_map_resource_primary’:
> dpdk/drivers/bus/pci/linux/pci_vfio.c:612:4: error: ‘reg’ may be used
> uninitialized in this function [-Werror=maybe-uninitialized]
>      free(reg);
>      ^~~~~~~~~
> dpdk/drivers/bus/pci/linux/pci_vfio.c:495:2: error: ‘info’ may be used
> uninitialized in this function [-Werror=maybe-uninitialized]
>    free(info);
>    ^~~~~~~~~~
> dpdk/drivers/bus/pci/linux/pci_vfio.c:485:27: note: ‘info’ was declared here
>    struct vfio_region_info *info;
> 
> 
> Other code looks good to me.
> I tested the updated patch with the above change and confirmed it
> could mmap BAR on my ppc64le machine.

Thanks!

I'll fix it for v3.

> 
> Thanks,
> Takeshi
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH v3] pci/vfio: allow mapping MSI-X BARs if kernel allows it
  2018-07-31 11:28 ` [dpdk-dev] [PATCH 18.11 v2] " Anatoly Burakov
  2018-08-02  6:47   ` Takeshi Yoshimura
@ 2018-09-20 13:11   ` Anatoly Burakov
  2018-10-03 22:40     ` Thomas Monjalon
  1 sibling, 1 reply; 8+ messages in thread
From: Anatoly Burakov @ 2018-09-20 13:11 UTC (permalink / raw)
  To: dev; +Cc: t.yoshimura8869, thomas, jerin.jacob

Currently, DPDK will skip mapping some areas (or even an entire BAR)
if MSI-X table happens to be in them but is smaller than page size.

Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
as a capability flag. Capability flags themselves are also only
supported since kernel 4.6 [2].

This commit will introduce support for checking VFIO capabilities,
and will use it to check if we are allowed to map BARs with MSI-X
tables in them, along with backwards compatibility for older
kernels, including a workaround for a variable rename in VFIO
region info structure [3].

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279

[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v3->v2:
    - Fix potential uninitialized value access as per Takeshi's
      comments
    - Fix potential memory leak on failed memory reallocation
    
    v2->v1:
    - Fix pointer in pci_vfio_get_region_info
    - Fix commit message

 drivers/bus/pci/linux/pci_vfio.c         | 132 ++++++++++++++++++++---
 lib/librte_eal/common/include/rte_vfio.h |  26 +++++
 2 files changed, 145 insertions(+), 13 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 686386d6a..d112b4b54 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -415,6 +415,93 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 	return 0;
 }
 
+/*
+ * region info may contain capability headers, so we need to keep reallocating
+ * the memory until we match allocated memory size with argsz.
+ */
+static int
+pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
+		int region)
+{
+	struct vfio_region_info *ri;
+	size_t argsz = sizeof(*ri);
+	int ret;
+
+	ri = malloc(sizeof(*ri));
+	if (ri == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
+		return -1;
+	}
+again:
+	memset(ri, 0, argsz);
+	ri->argsz = argsz;
+	ri->index = region;
+
+	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
+	if (ret < 0) {
+		free(ri);
+		return ret;
+	}
+	if (ri->argsz != argsz) {
+		struct vfio_region_info *tmp;
+
+		argsz = ri->argsz;
+		tmp = realloc(ri, argsz);
+
+		if (tmp == NULL) {
+			/* realloc failed but the ri is still there */
+			free(ri);
+			RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
+			return -1;
+		}
+		ri = tmp;
+		goto again;
+	}
+	*info = ri;
+
+	return 0;
+}
+
+static struct vfio_info_cap_header *
+pci_vfio_info_cap(struct vfio_region_info *info, int cap)
+{
+	struct vfio_info_cap_header *h;
+	size_t offset;
+
+	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
+		/* VFIO info does not advertise capabilities */
+		return NULL;
+	}
+
+	offset = VFIO_CAP_OFFSET(info);
+	while (offset != 0) {
+		h = RTE_PTR_ADD(info, offset);
+		if (h->id == cap)
+			return h;
+		offset = h->next;
+	}
+	return NULL;
+}
+
+static int
+pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
+{
+	struct vfio_region_info *info;
+	int ret;
+
+	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
+	if (ret < 0)
+		return -1;
+
+	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
+
+	/* cleanup */
+	free(info);
+
+	return ret;
+}
+
+
 static int
 pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
@@ -464,56 +551,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
 				pci_addr);
-		goto err_vfio_dev_fd;
+		goto err_vfio_res;
+	}
+	/* if we found our MSI-X BAR region, check if we can mmap it */
+	if (vfio_res->msix_table.bar_index != -1) {
+		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
+				vfio_res->msix_table.bar_index);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
+			goto err_vfio_res;
+		} else if (ret != 0) {
+			/* we can map it, so we don't care where it is */
+			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
+			vfio_res->msix_table.bar_index = -1;
+		}
 	}
 
 	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
-		struct vfio_region_info reg = { .argsz = sizeof(reg) };
+		struct vfio_region_info *reg = NULL;
 		void *bar_addr;
 
-		reg.index = i;
-
-		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
-		if (ret) {
+		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
+		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  %s cannot get device region info "
-					"error %i (%s)\n", pci_addr, errno, strerror(errno));
+				"error %i (%s)\n", pci_addr, errno,
+				strerror(errno));
 			goto err_vfio_res;
 		}
 
 		/* chk for io port region */
 		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
-		if (ret < 0)
+		if (ret < 0) {
+			free(reg);
 			goto err_vfio_res;
-		else if (ret) {
+		} else if (ret) {
 			RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
 					i);
+			free(reg);
 			continue;
 		}
 
 		/* skip non-mmapable BARs */
-		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+			free(reg);
 			continue;
+		}
 
 		/* try mapping somewhere close to the end of hugepages */
 		if (pci_map_addr == NULL)
 			pci_map_addr = pci_find_max_end_va();
 
 		bar_addr = pci_map_addr;
-		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
+		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
 
 		maps[i].addr = bar_addr;
-		maps[i].offset = reg.offset;
-		maps[i].size = reg.size;
+		maps[i].offset = reg->offset;
+		maps[i].size = reg->size;
 		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
 
 		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
 		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
 					pci_addr, i, strerror(errno));
+			free(reg);
 			goto err_vfio_res;
 		}
 
 		dev->mem_resource[i].addr = maps[i].addr;
+
+		free(reg);
 	}
 
 	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 5ca13fcce..f6617e004 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -14,6 +14,8 @@
 extern "C" {
 #endif
 
+#include <stdint.h>
+
 /*
  * determine if VFIO is present on the system
  */
@@ -44,6 +46,30 @@ extern "C" {
 #define RTE_VFIO_NOIOMMU 8
 #endif
 
+/*
+ * region info capabilities need kernel 4.6+ headers; older headers call the
+ * cap_offset field resv. VFIO_CAP_OFFSET(x) accepts any pointer expression.
+ */
+#ifdef VFIO_REGION_INFO_FLAG_CAPS
+#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
+#define VFIO_CAP_OFFSET(x) ((x)->cap_offset)
+#else
+#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
+#define VFIO_CAP_OFFSET(x) ((x)->resv)
+struct vfio_info_cap_header {
+	uint16_t id;
+	uint16_t version;
+	uint32_t next;
+};
+#endif
+
+/* kernels 4.16+ report this capability when a BAR with MSI-X is mappable */
+#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#else
+#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
+#endif
+
 #else /* not VFIO_PRESENT */
 
 /* we don't need an actual definition, only pointer is used */
-- 
2.17.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH v3] pci/vfio: allow mapping MSI-X BARs if kernel allows it
  2018-09-20 13:11   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
@ 2018-10-03 22:40     ` Thomas Monjalon
  0 siblings, 0 replies; 8+ messages in thread
From: Thomas Monjalon @ 2018-10-03 22:40 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev, t.yoshimura8869, jerin.jacob

20/09/2018 15:11, Anatoly Burakov:
> Currently, DPDK will skip mapping some areas (or even an entire BAR)
> if MSI-X table happens to be in them but is smaller than page size.
> 
> Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
> as a capability flag. Capability flags themselves are also only
> supported since kernel 4.6 [2].
> 
> This commit will introduce support for checking VFIO capabilities,
> and will use it to check if we are allowed to map BARs with MSI-X
> tables in them, along with backwards compatibility for older
> kernels, including a workaround for a variable rename in VFIO
> region info structure [3].
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6
> 
> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279
> 
> [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
> linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-10-03 22:40 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-30 11:17 [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it Anatoly Burakov
2018-07-31  9:38 ` Takeshi Yoshimura
2018-07-31 11:24   ` Burakov, Anatoly
2018-07-31 11:28 ` [dpdk-dev] [PATCH 18.11 v2] " Anatoly Burakov
2018-08-02  6:47   ` Takeshi Yoshimura
2018-08-02  8:17     ` Burakov, Anatoly
2018-09-20 13:11   ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
2018-10-03 22:40     ` Thomas Monjalon

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).