* [dpdk-dev] [PATCH] eal/linux: allow to map BARs with MSI-X tables, around them
@ 2015-01-22  8:36 Dan Aloni
  2015-01-28 14:06 ` Dan Aloni
  2015-01-28 15:01 ` Burakov, Anatoly
  0 siblings, 2 replies; 8+ messages in thread
From: Dan Aloni @ 2015-01-22  8:36 UTC (permalink / raw)
  To: dev
While VFIO doesn't allow us to map a complete BAR that contains an
MSI-X table, it does allow us to map the regions around the table at
PAGE_SIZE granularity. Some adapters expose their registers in the
same BAR as the MSI-X table, but on a different page. For example,
Intel's NVMe adapter, though not a network adapter, provides only one
MMIO BAR, and that BAR also contains the MSI-X table.
Signed-off-by: Dan Aloni <dan@kernelim.com>
CC: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal_pci.c      |  5 +-
 lib/librte_eal/linuxapp/eal/eal_pci_init.h |  2 +-
 lib/librte_eal/linuxapp/eal/eal_pci_uio.c  |  4 +-
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 99 +++++++++++++++++++++++++++---
 lib/librte_eal/linuxapp/eal/eal_vfio.h     |  8 ++-
 5 files changed, 101 insertions(+), 17 deletions(-)
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index b5f54101e8aa..4a74a9372a15 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -118,13 +118,14 @@ pci_find_max_end_va(void)
 
 /* map a particular resource from a file */
 void *
-pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size)
+pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
+		 int additional_flags)
 {
 	void *mapaddr;
 
 	/* Map the PCI memory resource of device */
 	mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
-			MAP_SHARED, fd, offset);
+			MAP_SHARED | additional_flags, fd, offset);
 	if (mapaddr == MAP_FAILED) {
 		RTE_LOG(ERR, EAL, "%s(): cannot mmap(%d, %p, 0x%lx, 0x%lx): %s (%p)\n",
 			__func__, fd, requested_addr,
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
index 1070eb88fe0a..0a0853d4c4df 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
@@ -66,7 +66,7 @@ extern void *pci_map_addr;
 void *pci_find_max_end_va(void);
 
 void *pci_map_resource(void *requested_addr, int fd, off_t offset,
-		size_t size);
+	       size_t size, int additional_flags);
 
 /* map IGB_UIO resource prototype */
 int pci_uio_map_resource(struct rte_pci_device *dev);
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
index e53f06b82430..eaa2e36f643e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
@@ -139,7 +139,7 @@ pci_uio_map_secondary(struct rte_pci_device *dev)
 
 			if (pci_map_resource(uio_res->maps[i].addr, fd,
 					     (off_t)uio_res->maps[i].offset,
-					     (size_t)uio_res->maps[i].size)
+					     (size_t)uio_res->maps[i].size, 0)
 			    != uio_res->maps[i].addr) {
 				RTE_LOG(ERR, EAL,
 					"Cannot mmap device resource\n");
@@ -379,7 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 					pci_map_addr = pci_find_max_end_va();
 
 				mapaddr = pci_map_resource(pci_map_addr, fd, (off_t)offset,
-						(size_t)maps[j].size);
+						(size_t)maps[j].size, 0);
 				if (mapaddr == MAP_FAILED)
 					fail = 1;
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 20e097727f80..f6542a1f1464 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -62,6 +62,9 @@
 
 #ifdef VFIO_PRESENT
 
+#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
+#define PAGE_MASK   (~(PAGE_SIZE - 1))
+
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
@@ -72,10 +75,12 @@ static struct vfio_config vfio_cfg;
 
 /* get PCI BAR number where MSI-X interrupts are */
 static int
-pci_vfio_get_msix_bar(int fd, int *msix_bar)
+pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
+		      uint32_t *msix_table_size)
 {
 	int ret;
 	uint32_t reg;
+	uint16_t flags;
 	uint8_t cap_id, cap_offset;
 
 	/* read PCI capability pointer from config space */
@@ -134,7 +139,18 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar)
 				return -1;
 			}
 
+			ret = pread64(fd, &flags, sizeof(flags),
+					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+					cap_offset + 2);
+			if (ret != sizeof(flags)) {
+				RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
+						"space!\n");
+				return -1;
+			}
+
 			*msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
+			*msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
+			*msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
 
 			return 0;
 		}
@@ -532,6 +548,8 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	int i, ret, msix_bar;
 	struct mapped_pci_resource *vfio_res = NULL;
 	struct pci_map *maps;
+	uint32_t msix_table_offset = 0;
+	uint32_t msix_table_size = 0;
 
 	dev->intr_handle.fd = -1;
 	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
@@ -657,9 +675,10 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	}
 
 	/* get MSI-X BAR, if any (we have to know where it is because we can't
-	 * mmap it when using VFIO) */
+	 * easily mmap it when using VFIO) */
 	msix_bar = -1;
-	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar);
+	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar,
+				    &msix_table_offset, &msix_table_size);
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n", pci_addr);
 		close(vfio_dev_fd);
@@ -702,6 +721,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
 		struct vfio_region_info reg = { .argsz = sizeof(reg) };
 		void *bar_addr;
+		struct memreg {
+			uint32_t offset, size;
+		} memreg[2] = {};
 
 		reg.index = i;
 
@@ -720,21 +742,78 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
 			continue;
 
-		/* skip MSI-X BAR */
-		if (i == msix_bar)
-			continue;
+		if (i == msix_bar) {
+			/*
+			 * VFIO will not let us map the MSI-X table,
+			 * but we can map around it.
+			 */
+			uint32_t table_start = msix_table_offset;
+			uint32_t table_end = table_start + msix_table_size;
+			table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
+			table_start &= PAGE_MASK;
+
+			if (table_start == 0  &&  table_end >= reg.size) {
+				/* Cannot map this BAR */
+				RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i);
+				continue;
+			} else {
+				memreg[0].offset = reg.offset;
+				memreg[0].size = table_start;
+				memreg[1].offset = table_end;
+				memreg[1].size = reg.size - table_end;
+
+				RTE_LOG(DEBUG, EAL,
+					"Trying to map BAR %d that contains the MSI-X "
+					"table. Trying offsets: "
+					"%04x:%04x, %04x:%04x\n", i,
+					memreg[0].offset, memreg[0].size,
+					memreg[1].offset, memreg[1].size);
+			}
+		} else {
+			memreg[0].offset = reg.offset;
+			memreg[0].size = reg.size;
+		}
 
+		/* try to figure out an address */
 		if (internal_config.process_type == RTE_PROC_PRIMARY) {
 			/* try mapping somewhere close to the end of hugepages */
 			if (pci_map_addr == NULL)
 				pci_map_addr = pci_find_max_end_va();
 
-			bar_addr = pci_map_resource(pci_map_addr, vfio_dev_fd, reg.offset,
-					reg.size);
+			bar_addr = pci_map_addr;
 			pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
 		} else {
-			bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset,
-					reg.size);
+			bar_addr = maps[i].addr;
+		}
+
+		/* reserve the address using an inaccessible mapping */
+		bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE |
+				MAP_ANONYMOUS, -1, 0);
+		if (bar_addr != MAP_FAILED) {
+			void *map_addr = NULL;
+			if (memreg[0].size) {
+				/* actual map of first part */
+				map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
+							    memreg[0].offset,
+							    memreg[0].size,
+							    MAP_FIXED);
+			}
+
+			/* if there's a second part, try to map it */
+			if (map_addr != MAP_FAILED
+			    &&  memreg[1].offset  &&  memreg[1].size) {
+				uint8_t *second_addr =
+					((uint8_t *)bar_addr + memreg[1].offset);
+				map_addr = pci_map_resource((void *)second_addr,
+							    vfio_dev_fd, memreg[1].offset,
+							    memreg[1].size,
+							    MAP_FIXED);
+			}
+
+			if (map_addr == MAP_FAILED  ||  !map_addr) {
+				munmap(bar_addr, reg.size);
+				bar_addr = MAP_FAILED;
+			}
 		}
 
 		if (bar_addr == MAP_FAILED ||
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 03e693e01bf0..72ec3f62a3d8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -43,9 +43,13 @@
 #include <linux/vfio.h>
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
-#define RTE_PCI_MSIX_TABLE_BIR 0x7
+#define RTE_PCI_MSIX_TABLE_BIR    0x7
+#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8
+#define RTE_PCI_MSIX_FLAGS_QSIZE  0x07ff
 #else
-#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_BIR    PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET
+#define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
 #define VFIO_PRESENT
-- 
1.9.3
^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [dpdk-dev] [PATCH] eal/linux: allow to map BARs with MSI-X tables, around them
  2015-01-22  8:36 [dpdk-dev] [PATCH] eal/linux: allow to map BARs with MSI-X tables, around them Dan Aloni
@ 2015-01-28 14:06 ` Dan Aloni
  2015-01-28 15:01 ` Burakov, Anatoly
  1 sibling, 0 replies; 8+ messages in thread
From: Dan Aloni @ 2015-01-28 14:06 UTC (permalink / raw)
  To: dev
On Thu, Jan 22, 2015 at 10:36:11AM +0200, Dan Aloni wrote:
> While VFIO doesn't allow us to map complete BARs with MSI-X tables,
> it does allow us to map around them in PAGE_SIZE granularity. There
> might be adapters that provide their registers in the same BAR
> but on a different page. For example, Intel's NVME adapter, though
> not a network adapter, provides only one MMIO BAR that contains
> the MSI-X table.
> 
> Signed-off-by: Dan Aloni <dan@kernelim.com>
> CC: Anatoly Burakov <anatoly.burakov@intel.com>
Has anyone reviewed this yet?
I am asking because I would like to know whether anyone is aiming
to integrate storage controller support into DPDK; this patch
could be instrumental for that.
--
Dan Aloni
^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [dpdk-dev] [PATCH] eal/linux: allow to map BARs with MSI-X tables, around them
  2015-01-22  8:36 [dpdk-dev] [PATCH] eal/linux: allow to map BARs with MSI-X tables, around them Dan Aloni
  2015-01-28 14:06 ` Dan Aloni
@ 2015-01-28 15:01 ` Burakov, Anatoly
  2015-01-28 22:04   ` Dan Aloni
  2015-01-28 22:04   ` [dpdk-dev] [PATCH v2] " Dan Aloni
  1 sibling, 2 replies; 8+ messages in thread
From: Burakov, Anatoly @ 2015-01-28 15:01 UTC (permalink / raw)
  To: Dan Aloni, dev
Hi Dan
Apologies for not looking at it earlier.
> While VFIO doesn't allow us to map complete BARs with MSI-X tables,
> it does allow us to map around them in PAGE_SIZE granularity. There
> might be adapters that provide their registers in the same BAR
> but on a different page. For example, Intel's NVME adapter, though
> not a network adapter, provides only one MMIO BAR that contains
> the MSI-X table.
> 
> Signed-off-by: Dan Aloni <dan@kernelim.com>
> CC: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>  lib/librte_eal/linuxapp/eal/eal_pci.c      |  5 +-
>  lib/librte_eal/linuxapp/eal/eal_pci_init.h |  2 +-
>  lib/librte_eal/linuxapp/eal/eal_pci_uio.c  |  4 +-
>  lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 99
> +++++++++++++++++++++++++++---
>  lib/librte_eal/linuxapp/eal/eal_vfio.h     |  8 ++-
>  5 files changed, 101 insertions(+), 17 deletions(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c
> b/lib/librte_eal/linuxapp/eal/eal_pci.c
> index b5f54101e8aa..4a74a9372a15 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
> @@ -118,13 +118,14 @@ pci_find_max_end_va(void)
> 
>  /* map a particular resource from a file */
>  void *
> -pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size)
> +pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
> +		 int additional_flags)
>  {
>  	void *mapaddr;
> 
>  	/* Map the PCI memory resource of device */
>  	mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
> -			MAP_SHARED, fd, offset);
> +			MAP_SHARED | additional_flags, fd, offset);
>  	if (mapaddr == MAP_FAILED) {
>  		RTE_LOG(ERR, EAL, "%s(): cannot mmap(%d, %p, 0x%lx,
> 0x%lx): %s (%p)\n",
>  			__func__, fd, requested_addr,
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
> b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
> index 1070eb88fe0a..0a0853d4c4df 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
> @@ -66,7 +66,7 @@ extern void *pci_map_addr;
>  void *pci_find_max_end_va(void);
> 
>  void *pci_map_resource(void *requested_addr, int fd, off_t offset,
> -		size_t size);
> +	       size_t size, int additional_flags);
> 
>  /* map IGB_UIO resource prototype */
>  int pci_uio_map_resource(struct rte_pci_device *dev);
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> index e53f06b82430..eaa2e36f643e 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> @@ -139,7 +139,7 @@ pci_uio_map_secondary(struct rte_pci_device *dev)
> 
>  			if (pci_map_resource(uio_res->maps[i].addr, fd,
>  					     (off_t)uio_res->maps[i].offset,
> -					     (size_t)uio_res->maps[i].size)
> +					     (size_t)uio_res->maps[i].size, 0)
>  			    != uio_res->maps[i].addr) {
>  				RTE_LOG(ERR, EAL,
>  					"Cannot mmap device resource\n");
> @@ -379,7 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>  					pci_map_addr =
> pci_find_max_end_va();
> 
>  				mapaddr =
> pci_map_resource(pci_map_addr, fd, (off_t)offset,
> -						(size_t)maps[j].size);
> +						(size_t)maps[j].size, 0);
>  				if (mapaddr == MAP_FAILED)
>  					fail = 1;
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> index 20e097727f80..f6542a1f1464 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> @@ -62,6 +62,9 @@
> 
>  #ifdef VFIO_PRESENT
> 
> +#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
> +#define PAGE_MASK   (~(PAGE_SIZE - 1))
> +
>  #define VFIO_DIR "/dev/vfio"
>  #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
>  #define VFIO_GROUP_FMT "/dev/vfio/%u"
> @@ -72,10 +75,12 @@ static struct vfio_config vfio_cfg;
> 
>  /* get PCI BAR number where MSI-X interrupts are */
>  static int
> -pci_vfio_get_msix_bar(int fd, int *msix_bar)
> +pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
> +		      uint32_t *msix_table_size)
>  {
>  	int ret;
>  	uint32_t reg;
> +	uint16_t flags;
>  	uint8_t cap_id, cap_offset;
> 
>  	/* read PCI capability pointer from config space */
> @@ -134,7 +139,18 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar)
>  				return -1;
>  			}
> 
> +			ret = pread64(fd, &flags, sizeof(flags),
> +
> 	VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
> +					cap_offset + 2);
> +			if (ret != sizeof(flags)) {
> +				RTE_LOG(ERR, EAL, "Cannot read table flags
> from PCI config "
> +						"space!\n");
> +				return -1;
> +			}
> +
>  			*msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
> +			*msix_table_offset = reg &
> RTE_PCI_MSIX_TABLE_OFFSET;
> +			*msix_table_size = 16 * (1 + (flags &
> RTE_PCI_MSIX_FLAGS_QSIZE));
> 
>  			return 0;
>  		}
> @@ -532,6 +548,8 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
>  	int i, ret, msix_bar;
>  	struct mapped_pci_resource *vfio_res = NULL;
>  	struct pci_map *maps;
> +	uint32_t msix_table_offset = 0;
> +	uint32_t msix_table_size = 0;
> 
>  	dev->intr_handle.fd = -1;
>  	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
> @@ -657,9 +675,10 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
>  	}
> 
>  	/* get MSI-X BAR, if any (we have to know where it is because we
> can't
> -	 * mmap it when using VFIO) */
> +	 * easily mmap it when using VFIO) */
>  	msix_bar = -1;
> -	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar);
> +	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar,
> +				    &msix_table_offset, &msix_table_size);
>  	if (ret < 0) {
>  		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
> pci_addr);
>  		close(vfio_dev_fd);
> @@ -702,6 +721,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
>  	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
>  		struct vfio_region_info reg = { .argsz = sizeof(reg) };
>  		void *bar_addr;
> +		struct memreg {
> +			uint32_t offset, size;
> +		} memreg[2] = {};
> 
>  		reg.index = i;
> 
> @@ -720,21 +742,78 @@ pci_vfio_map_resource(struct rte_pci_device
> *dev)
>  		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
>  			continue;
> 
> -		/* skip MSI-X BAR */
> -		if (i == msix_bar)
> -			continue;
> +		if (i == msix_bar) {
> +			/*
> +			 * VFIO will not let us map the MSI-X table,
> +			 * but we can map around it.
> +			 */
> +			uint32_t table_start = msix_table_offset;
> +			uint32_t table_end = table_start + msix_table_size;
> +			table_end = (table_end + ~PAGE_MASK) &
> PAGE_MASK;
> +			table_start &= PAGE_MASK;
> +
> +			if (table_start == 0  &&  table_end >= reg.size) {
> +				/* Cannot map this BAR */
> +				RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n",
> i);
> +				continue;
> +			} else {
> +				memreg[0].offset = reg.offset;
> +				memreg[0].size = table_start;
> +				memreg[1].offset = table_end;
> +				memreg[1].size = reg.size - table_end;
> +
> +				RTE_LOG(DEBUG, EAL,
> +					"Trying to map BAR %d that contains
> the MSI-X "
> +					"table. Trying offsets: "
> +					"%04x:%04x, %04x:%04x\n", i,
> +					memreg[0].offset, memreg[0].size,
> +					memreg[1].offset, memreg[1].size);
> +			}
> +		} else {
> +			memreg[0].offset = reg.offset;
> +			memreg[0].size = reg.size;
> +		}
> 
> +		/* try to figure out an address */
>  		if (internal_config.process_type == RTE_PROC_PRIMARY) {
>  			/* try mapping somewhere close to the end of
> hugepages */
>  			if (pci_map_addr == NULL)
>  				pci_map_addr = pci_find_max_end_va();
> 
> -			bar_addr = pci_map_resource(pci_map_addr,
> vfio_dev_fd, reg.offset,
> -					reg.size);
> +			bar_addr = pci_map_addr;
>  			pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t)
> reg.size);
>  		} else {
> -			bar_addr = pci_map_resource(maps[i].addr,
> vfio_dev_fd, reg.offset,
> -					reg.size);
> +			bar_addr = maps[i].addr;
> +		}
> +
> +		/* reserve the address using an inaccessible mapping */
> +		bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE |
> +				MAP_ANONYMOUS, -1, 0);
> +		if (bar_addr != MAP_FAILED) {
> +			void *map_addr = NULL;
> +			if (memreg[0].size) {
> +				/* actual map of first part */
> +				map_addr = pci_map_resource(bar_addr,
> vfio_dev_fd,
> +							    memreg[0].offset,
> +							    memreg[0].size,
> +							    MAP_FIXED);
> +			}
> +
> +			/* if there's a second part, try to map it */
> +			if (map_addr != MAP_FAILED
> +			    &&  memreg[1].offset  &&  memreg[1].size) {
> +				uint8_t *second_addr =
> +					((uint8_t *)bar_addr +
> memreg[1].offset);
Nitpicking, but probably better to use void* and RTE_PTR_ADD here.
> +				map_addr = pci_map_resource((void
> *)second_addr,
> +							    vfio_dev_fd,
> memreg[1].offset,
> +							    memreg[1].size,
> +							    MAP_FIXED);
> +			}
> +
> +			if (map_addr == MAP_FAILED  ||  !map_addr) {
> +				munmap(bar_addr, reg.size);
> +				bar_addr = MAP_FAILED;
> +			}
>  		}
> 
>  		if (bar_addr == MAP_FAILED ||
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> index 03e693e01bf0..72ec3f62a3d8 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> @@ -43,9 +43,13 @@
>  #include <linux/vfio.h>
> 
>  #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
> -#define RTE_PCI_MSIX_TABLE_BIR 0x7
> +#define RTE_PCI_MSIX_TABLE_BIR    0x7
> +#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8
> +#define RTE_PCI_MSIX_FLAGS_QSIZE  0x07ff
>  #else
> -#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
> +#define RTE_PCI_MSIX_TABLE_BIR    PCI_MSIX_TABLE_BIR
> +#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET
> +#define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
>  #endif
> 
>  #define VFIO_PRESENT
> --
> 1.9.3
Otherwise, no issues from me.
Thanks,
Anatoly
^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [dpdk-dev] [PATCH] eal/linux: allow to map BARs with MSI-X tables, around them
  2015-01-28 15:01 ` Burakov, Anatoly
@ 2015-01-28 22:04   ` Dan Aloni
  2015-01-28 22:04   ` [dpdk-dev] [PATCH v2] " Dan Aloni
  1 sibling, 0 replies; 8+ messages in thread
From: Dan Aloni @ 2015-01-28 22:04 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: dev
On Wed, Jan 28, 2015 at 03:01:38PM +0000, Burakov, Anatoly wrote:
> Hi Dan
> 
> Apologies for not looking at it earlier.
No problem, we are all quite busy :)
> > +			if (map_addr != MAP_FAILED
> > +			    &&  memreg[1].offset  &&  memreg[1].size) {
> > +				uint8_t *second_addr =
> > +					((uint8_t *)bar_addr +
> > memreg[1].offset);
> 
> Nitpicking, but probably better to use void* and RTE_PTR_ADD here.
The nitpick is well justified. A new patch is coming your way.
-- 
Dan Aloni
^ permalink raw reply	[flat|nested] 8+ messages in thread
* [dpdk-dev] [PATCH v2] eal/linux: allow to map BARs with MSI-X tables, around them
  2015-01-28 15:01 ` Burakov, Anatoly
  2015-01-28 22:04   ` Dan Aloni
@ 2015-01-28 22:04   ` Dan Aloni
  2015-01-29 10:22     ` Burakov, Anatoly
  1 sibling, 1 reply; 8+ messages in thread
From: Dan Aloni @ 2015-01-28 22:04 UTC (permalink / raw)
  To: dev
While VFIO doesn't allow us to map a complete BAR that contains an
MSI-X table, it does allow us to map the regions around the table at
PAGE_SIZE granularity. Some adapters expose their registers in the
same BAR as the MSI-X table, but on a different page. For example,
Intel's NVMe adapter, though not a network adapter, provides only one
MMIO BAR, and that BAR also contains the MSI-X table.
Signed-off-by: Dan Aloni <dan@kernelim.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal_pci.c      |  5 +-
 lib/librte_eal/linuxapp/eal/eal_pci_init.h |  2 +-
 lib/librte_eal/linuxapp/eal/eal_pci_uio.c  |  4 +-
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 98 +++++++++++++++++++++++++++---
 lib/librte_eal/linuxapp/eal/eal_vfio.h     |  8 ++-
 5 files changed, 100 insertions(+), 17 deletions(-)
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index b5f54101e8aa..4a74a9372a15 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -118,13 +118,14 @@ pci_find_max_end_va(void)
 
 /* map a particular resource from a file */
 void *
-pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size)
+pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
+		 int additional_flags)
 {
 	void *mapaddr;
 
 	/* Map the PCI memory resource of device */
 	mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
-			MAP_SHARED, fd, offset);
+			MAP_SHARED | additional_flags, fd, offset);
 	if (mapaddr == MAP_FAILED) {
 		RTE_LOG(ERR, EAL, "%s(): cannot mmap(%d, %p, 0x%lx, 0x%lx): %s (%p)\n",
 			__func__, fd, requested_addr,
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
index 1070eb88fe0a..0a0853d4c4df 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
@@ -66,7 +66,7 @@ extern void *pci_map_addr;
 void *pci_find_max_end_va(void);
 
 void *pci_map_resource(void *requested_addr, int fd, off_t offset,
-		size_t size);
+	       size_t size, int additional_flags);
 
 /* map IGB_UIO resource prototype */
 int pci_uio_map_resource(struct rte_pci_device *dev);
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
index e53f06b82430..eaa2e36f643e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
@@ -139,7 +139,7 @@ pci_uio_map_secondary(struct rte_pci_device *dev)
 
 			if (pci_map_resource(uio_res->maps[i].addr, fd,
 					     (off_t)uio_res->maps[i].offset,
-					     (size_t)uio_res->maps[i].size)
+					     (size_t)uio_res->maps[i].size, 0)
 			    != uio_res->maps[i].addr) {
 				RTE_LOG(ERR, EAL,
 					"Cannot mmap device resource\n");
@@ -379,7 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 					pci_map_addr = pci_find_max_end_va();
 
 				mapaddr = pci_map_resource(pci_map_addr, fd, (off_t)offset,
-						(size_t)maps[j].size);
+						(size_t)maps[j].size, 0);
 				if (mapaddr == MAP_FAILED)
 					fail = 1;
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 20e097727f80..c8df91c0f800 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -62,6 +62,9 @@
 
 #ifdef VFIO_PRESENT
 
+#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
+#define PAGE_MASK   (~(PAGE_SIZE - 1))
+
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
@@ -72,10 +75,12 @@ static struct vfio_config vfio_cfg;
 
 /* get PCI BAR number where MSI-X interrupts are */
 static int
-pci_vfio_get_msix_bar(int fd, int *msix_bar)
+pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
+		      uint32_t *msix_table_size)
 {
 	int ret;
 	uint32_t reg;
+	uint16_t flags;
 	uint8_t cap_id, cap_offset;
 
 	/* read PCI capability pointer from config space */
@@ -134,7 +139,18 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar)
 				return -1;
 			}
 
+			ret = pread64(fd, &flags, sizeof(flags),
+					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+					cap_offset + 2);
+			if (ret != sizeof(flags)) {
+				RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
+						"space!\n");
+				return -1;
+			}
+
 			*msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
+			*msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
+			*msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
 
 			return 0;
 		}
@@ -532,6 +548,8 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	int i, ret, msix_bar;
 	struct mapped_pci_resource *vfio_res = NULL;
 	struct pci_map *maps;
+	uint32_t msix_table_offset = 0;
+	uint32_t msix_table_size = 0;
 
 	dev->intr_handle.fd = -1;
 	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
@@ -657,9 +675,10 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	}
 
 	/* get MSI-X BAR, if any (we have to know where it is because we can't
-	 * mmap it when using VFIO) */
+	 * easily mmap it when using VFIO) */
 	msix_bar = -1;
-	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar);
+	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar,
+				    &msix_table_offset, &msix_table_size);
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n", pci_addr);
 		close(vfio_dev_fd);
@@ -702,6 +721,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
 		struct vfio_region_info reg = { .argsz = sizeof(reg) };
 		void *bar_addr;
+		struct memreg {
+			uint32_t offset, size;
+		} memreg[2] = {};
 
 		reg.index = i;
 
@@ -720,21 +742,77 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
 		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
 			continue;
 
-		/* skip MSI-X BAR */
-		if (i == msix_bar)
-			continue;
+		if (i == msix_bar) {
+			/*
+			 * VFIO will not let us map the MSI-X table,
+			 * but we can map around it.
+			 */
+			uint32_t table_start = msix_table_offset;
+			uint32_t table_end = table_start + msix_table_size;
+			table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
+			table_start &= PAGE_MASK;
+
+			if (table_start == 0  &&  table_end >= reg.size) {
+				/* Cannot map this BAR */
+				RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i);
+				continue;
+			} else {
+				memreg[0].offset = reg.offset;
+				memreg[0].size = table_start;
+				memreg[1].offset = table_end;
+				memreg[1].size = reg.size - table_end;
+
+				RTE_LOG(DEBUG, EAL,
+					"Trying to map BAR %d that contains the MSI-X "
+					"table. Trying offsets: "
+					"%04x:%04x, %04x:%04x\n", i,
+					memreg[0].offset, memreg[0].size,
+					memreg[1].offset, memreg[1].size);
+			}
+		} else {
+			memreg[0].offset = reg.offset;
+			memreg[0].size = reg.size;
+		}
 
+		/* try to figure out an address */
 		if (internal_config.process_type == RTE_PROC_PRIMARY) {
 			/* try mapping somewhere close to the end of hugepages */
 			if (pci_map_addr == NULL)
 				pci_map_addr = pci_find_max_end_va();
 
-			bar_addr = pci_map_resource(pci_map_addr, vfio_dev_fd, reg.offset,
-					reg.size);
+			bar_addr = pci_map_addr;
 			pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
 		} else {
-			bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset,
-					reg.size);
+			bar_addr = maps[i].addr;
+		}
+
+		/* reserve the address using an inaccessible mapping */
+		bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE |
+				MAP_ANONYMOUS, -1, 0);
+		if (bar_addr != MAP_FAILED) {
+			void *map_addr = NULL;
+			if (memreg[0].size) {
+				/* actual map of first part */
+				map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
+							    memreg[0].offset,
+							    memreg[0].size,
+							    MAP_FIXED);
+			}
+
+			/* if there's a second part, try to map it */
+			if (map_addr != MAP_FAILED
+			    &&  memreg[1].offset  &&  memreg[1].size) {
+				void *second_addr = RTE_PTR_ADD(bar_addr, memreg[1].offset);
+				map_addr = pci_map_resource(second_addr,
+							    vfio_dev_fd, memreg[1].offset,
+							    memreg[1].size,
+							    MAP_FIXED);
+			}
+
+			if (map_addr == MAP_FAILED  ||  !map_addr) {
+				munmap(bar_addr, reg.size);
+				bar_addr = MAP_FAILED;
+			}
 		}
 
 		if (bar_addr == MAP_FAILED ||
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 03e693e01bf0..72ec3f62a3d8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -43,9 +43,13 @@
 #include <linux/vfio.h>
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
-#define RTE_PCI_MSIX_TABLE_BIR 0x7
+#define RTE_PCI_MSIX_TABLE_BIR    0x7
+#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8
+#define RTE_PCI_MSIX_FLAGS_QSIZE  0x07ff
 #else
-#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_BIR    PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET
+#define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
 #define VFIO_PRESENT
-- 
1.9.3
^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [dpdk-dev] [PATCH v2] eal/linux: allow to map BARs with MSI-X tables, around them
  2015-01-28 22:04   ` [dpdk-dev] [PATCH v2] " Dan Aloni
@ 2015-01-29 10:22     ` Burakov, Anatoly
  2015-01-29 10:25       ` Dan Aloni
  2015-02-23 20:58       ` Thomas Monjalon
  0 siblings, 2 replies; 8+ messages in thread
From: Burakov, Anatoly @ 2015-01-29 10:22 UTC (permalink / raw)
  To: Dan Aloni, dev
Hi Dan,
> 
> While VFIO doesn't allow us to map complete BARs with MSI-X tables,
> it does allow us to map around them in PAGE_SIZE granularity. There
> might be adapters that provide their registers in the same BAR
> but on a different page. For example, Intel's NVME adapter, though
> not a network adapter, provides only one MMIO BAR that contains
> the MSI-X table.
> 
> Signed-off-by: Dan Aloni <dan@kernelim.com>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Why is it signed-off by me? :-)
Otherwise,
Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>
^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [dpdk-dev] [PATCH v2] eal/linux: allow to map BARs with MSI-X tables, around them
  2015-01-29 10:22     ` Burakov, Anatoly
@ 2015-01-29 10:25       ` Dan Aloni
  2015-02-23 20:58       ` Thomas Monjalon
  1 sibling, 0 replies; 8+ messages in thread
From: Dan Aloni @ 2015-01-29 10:25 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: dev
On Thu, Jan 29, 2015 at 10:22:58AM +0000, Burakov, Anatoly wrote:
> Hi Dan,
> 
> > 
> > While VFIO doesn't allow us to map complete BARs with MSI-X tables,
> > it does allow us to map around them in PAGE_SIZE granularity. There
> > might be adapters that provide their registers in the same BAR
> > but on a different page. For example, Intel's NVME adapter, though
> > not a network adapter, provides only one MMIO BAR that contains
> > the MSI-X table.
> > 
> > Signed-off-by: Dan Aloni <dan@kernelim.com>
> > Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> 
> Why is it signed-off by me? :-)
The change you requested went in, which makes you an author too :)
-- 
Dan Aloni
^ permalink raw reply	[flat|nested] 8+ messages in thread
* Re: [dpdk-dev] [PATCH v2] eal/linux: allow to map BARs with MSI-X tables, around them
  2015-01-29 10:22     ` Burakov, Anatoly
  2015-01-29 10:25       ` Dan Aloni
@ 2015-02-23 20:58       ` Thomas Monjalon
  1 sibling, 0 replies; 8+ messages in thread
From: Thomas Monjalon @ 2015-02-23 20:58 UTC (permalink / raw)
  To: Burakov, Anatoly, Dan Aloni; +Cc: dev
> > While VFIO doesn't allow us to map complete BARs with MSI-X tables,
> > it does allow us to map around them in PAGE_SIZE granularity. There
> > might be adapters that provide their registers in the same BAR
> > but on a different page. For example, Intel's NVME adapter, though
> > not a network adapter, provides only one MMIO BAR that contains
> > the MSI-X table.
> > 
> > Signed-off-by: Dan Aloni <dan@kernelim.com>
> > Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
[...]
> Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>
Applied, thanks
Note: EAL Linux VFIO has no official maintainer.
^ permalink raw reply	[flat|nested] 8+ messages in thread
end of thread, other threads:[~2015-02-23 20:58 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-22  8:36 [dpdk-dev] [PATCH] eal/linux: allow to map BARs with MSI-X tables, around them Dan Aloni
2015-01-28 14:06 ` Dan Aloni
2015-01-28 15:01 ` Burakov, Anatoly
2015-01-28 22:04   ` Dan Aloni
2015-01-28 22:04   ` [dpdk-dev] [PATCH v2] " Dan Aloni
2015-01-29 10:22     ` Burakov, Anatoly
2015-01-29 10:25       ` Dan Aloni
2015-02-23 20:58       ` Thomas Monjalon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).