From: "Burakov, Anatoly" <anatoly.burakov@intel.com>
To: Dan Aloni <dan@kernelim.com>, "dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [dpdk-dev] [PATCH] eal/linux: allow to map BARs with MSI-X tables, around them
Date: Wed, 28 Jan 2015 15:01:38 +0000 [thread overview]
Message-ID: <C6ECDF3AB251BE4894318F4E4512369780C3ECFA@IRSMSX109.ger.corp.intel.com> (raw)
In-Reply-To: <1421915771-10376-1-git-send-email-dan@kernelim.com>
Hi Dan
Apologies for not looking at it earlier.
> While VFIO doesn't allow us to map complete BARs with MSI-X tables,
> it does allow us to map around them in PAGE_SIZE granularity. There
> might be adapters that provide their registers in the same BAR
> but on a different page. For example, Intel's NVME adapter, though
> not a network adapter, provides only one MMIO BAR that contains
> the MSI-X table.
>
> Signed-off-by: Dan Aloni <dan@kernelim.com>
> CC: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
> lib/librte_eal/linuxapp/eal/eal_pci.c | 5 +-
> lib/librte_eal/linuxapp/eal/eal_pci_init.h | 2 +-
> lib/librte_eal/linuxapp/eal/eal_pci_uio.c | 4 +-
> lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 99
> +++++++++++++++++++++++++++---
> lib/librte_eal/linuxapp/eal/eal_vfio.h | 8 ++-
> 5 files changed, 101 insertions(+), 17 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c
> b/lib/librte_eal/linuxapp/eal/eal_pci.c
> index b5f54101e8aa..4a74a9372a15 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
> @@ -118,13 +118,14 @@ pci_find_max_end_va(void)
>
> /* map a particular resource from a file */
> void *
> -pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size)
> +pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
> + int additional_flags)
> {
> void *mapaddr;
>
> /* Map the PCI memory resource of device */
> mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
> - MAP_SHARED, fd, offset);
> + MAP_SHARED | additional_flags, fd, offset);
> if (mapaddr == MAP_FAILED) {
> RTE_LOG(ERR, EAL, "%s(): cannot mmap(%d, %p, 0x%lx,
> 0x%lx): %s (%p)\n",
> __func__, fd, requested_addr,
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
> b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
> index 1070eb88fe0a..0a0853d4c4df 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
> @@ -66,7 +66,7 @@ extern void *pci_map_addr;
> void *pci_find_max_end_va(void);
>
> void *pci_map_resource(void *requested_addr, int fd, off_t offset,
> - size_t size);
> + size_t size, int additional_flags);
>
> /* map IGB_UIO resource prototype */
> int pci_uio_map_resource(struct rte_pci_device *dev);
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> index e53f06b82430..eaa2e36f643e 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> @@ -139,7 +139,7 @@ pci_uio_map_secondary(struct rte_pci_device *dev)
>
> if (pci_map_resource(uio_res->maps[i].addr, fd,
> (off_t)uio_res->maps[i].offset,
> - (size_t)uio_res->maps[i].size)
> + (size_t)uio_res->maps[i].size, 0)
> != uio_res->maps[i].addr) {
> RTE_LOG(ERR, EAL,
> "Cannot mmap device resource\n");
> @@ -379,7 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
> pci_map_addr =
> pci_find_max_end_va();
>
> mapaddr =
> pci_map_resource(pci_map_addr, fd, (off_t)offset,
> - (size_t)maps[j].size);
> + (size_t)maps[j].size, 0);
> if (mapaddr == MAP_FAILED)
> fail = 1;
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> index 20e097727f80..f6542a1f1464 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> @@ -62,6 +62,9 @@
>
> #ifdef VFIO_PRESENT
>
> +#define PAGE_SIZE (sysconf(_SC_PAGESIZE))
> +#define PAGE_MASK (~(PAGE_SIZE - 1))
> +
> #define VFIO_DIR "/dev/vfio"
> #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
> #define VFIO_GROUP_FMT "/dev/vfio/%u"
> @@ -72,10 +75,12 @@ static struct vfio_config vfio_cfg;
>
> /* get PCI BAR number where MSI-X interrupts are */
> static int
> -pci_vfio_get_msix_bar(int fd, int *msix_bar)
> +pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
> + uint32_t *msix_table_size)
> {
> int ret;
> uint32_t reg;
> + uint16_t flags;
> uint8_t cap_id, cap_offset;
>
> /* read PCI capability pointer from config space */
> @@ -134,7 +139,18 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar)
> return -1;
> }
>
> + ret = pread64(fd, &flags, sizeof(flags),
> +
> VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
> + cap_offset + 2);
> + if (ret != sizeof(flags)) {
> + RTE_LOG(ERR, EAL, "Cannot read table flags
> from PCI config "
> + "space!\n");
> + return -1;
> + }
> +
> *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
> + *msix_table_offset = reg &
> RTE_PCI_MSIX_TABLE_OFFSET;
> + *msix_table_size = 16 * (1 + (flags &
> RTE_PCI_MSIX_FLAGS_QSIZE));
>
> return 0;
> }
> @@ -532,6 +548,8 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
> int i, ret, msix_bar;
> struct mapped_pci_resource *vfio_res = NULL;
> struct pci_map *maps;
> + uint32_t msix_table_offset = 0;
> + uint32_t msix_table_size = 0;
>
> dev->intr_handle.fd = -1;
> dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
> @@ -657,9 +675,10 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
> }
>
> /* get MSI-X BAR, if any (we have to know where it is because we
> can't
> - * mmap it when using VFIO) */
> + * easily mmap it when using VFIO) */
> msix_bar = -1;
> - ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar);
> + ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar,
> + &msix_table_offset, &msix_table_size);
> if (ret < 0) {
> RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n",
> pci_addr);
> close(vfio_dev_fd);
> @@ -702,6 +721,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
> for (i = 0; i < (int) vfio_res->nb_maps; i++) {
> struct vfio_region_info reg = { .argsz = sizeof(reg) };
> void *bar_addr;
> + struct memreg {
> + uint32_t offset, size;
> + } memreg[2] = {};
>
> reg.index = i;
>
> @@ -720,21 +742,78 @@ pci_vfio_map_resource(struct rte_pci_device
> *dev)
> if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
> continue;
>
> - /* skip MSI-X BAR */
> - if (i == msix_bar)
> - continue;
> + if (i == msix_bar) {
> + /*
> + * VFIO will not let us map the MSI-X table,
> + * but we can map around it.
> + */
> + uint32_t table_start = msix_table_offset;
> + uint32_t table_end = table_start + msix_table_size;
> + table_end = (table_end + ~PAGE_MASK) &
> PAGE_MASK;
> + table_start &= PAGE_MASK;
> +
> + if (table_start == 0 && table_end >= reg.size) {
> + /* Cannot map this BAR */
> + RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n",
> i);
> + continue;
> + } else {
> + memreg[0].offset = reg.offset;
> + memreg[0].size = table_start;
> + memreg[1].offset = table_end;
> + memreg[1].size = reg.size - table_end;
> +
> + RTE_LOG(DEBUG, EAL,
> + "Trying to map BAR %d that contains
> the MSI-X "
> + "table. Trying offsets: "
> + "%04x:%04x, %04x:%04x\n", i,
> + memreg[0].offset, memreg[0].size,
> + memreg[1].offset, memreg[1].size);
> + }
> + } else {
> + memreg[0].offset = reg.offset;
> + memreg[0].size = reg.size;
> + }
>
> + /* try to figure out an address */
> if (internal_config.process_type == RTE_PROC_PRIMARY) {
> /* try mapping somewhere close to the end of
> hugepages */
> if (pci_map_addr == NULL)
> pci_map_addr = pci_find_max_end_va();
>
> - bar_addr = pci_map_resource(pci_map_addr,
> vfio_dev_fd, reg.offset,
> - reg.size);
> + bar_addr = pci_map_addr;
> pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t)
> reg.size);
> } else {
> - bar_addr = pci_map_resource(maps[i].addr,
> vfio_dev_fd, reg.offset,
> - reg.size);
> + bar_addr = maps[i].addr;
> + }
> +
> + /* reserve the address using an inaccessible mapping */
> + bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE |
> + MAP_ANONYMOUS, -1, 0);
> + if (bar_addr != MAP_FAILED) {
> + void *map_addr = NULL;
> + if (memreg[0].size) {
> + /* actual map of first part */
> + map_addr = pci_map_resource(bar_addr,
> vfio_dev_fd,
> + memreg[0].offset,
> + memreg[0].size,
> + MAP_FIXED);
> + }
> +
> + /* if there's a second part, try to map it */
> + if (map_addr != MAP_FAILED
> + && memreg[1].offset && memreg[1].size) {
> + uint8_t *second_addr =
> + ((uint8_t *)bar_addr +
> memreg[1].offset);
Nitpick: it would probably be better to declare second_addr as void * and compute it with RTE_PTR_ADD() here, rather than casting through uint8_t *.
> + map_addr = pci_map_resource((void
> *)second_addr,
> + vfio_dev_fd,
> memreg[1].offset,
> + memreg[1].size,
> + MAP_FIXED);
> + }
> +
> + if (map_addr == MAP_FAILED || !map_addr) {
> + munmap(bar_addr, reg.size);
> + bar_addr = MAP_FAILED;
> + }
> }
>
> if (bar_addr == MAP_FAILED ||
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> index 03e693e01bf0..72ec3f62a3d8 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> @@ -43,9 +43,13 @@
> #include <linux/vfio.h>
>
> #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
> -#define RTE_PCI_MSIX_TABLE_BIR 0x7
> +#define RTE_PCI_MSIX_TABLE_BIR 0x7
> +#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8
> +#define RTE_PCI_MSIX_FLAGS_QSIZE 0x07ff
> #else
> -#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
> +#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
> +#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET
> +#define RTE_PCI_MSIX_FLAGS_QSIZE PCI_MSIX_FLAGS_QSIZE
> #endif
>
> #define VFIO_PRESENT
> --
> 1.9.3
Otherwise, no issues from me.
Thanks,
Anatoly
next prev parent reply other threads:[~2015-01-28 15:01 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-01-22 8:36 Dan Aloni
2015-01-28 14:06 ` Dan Aloni
2015-01-28 15:01 ` Burakov, Anatoly [this message]
2015-01-28 22:04 ` Dan Aloni
2015-01-28 22:04 ` [dpdk-dev] [PATCH v2] " Dan Aloni
2015-01-29 10:22 ` Burakov, Anatoly
2015-01-29 10:25 ` Dan Aloni
2015-02-23 20:58 ` Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=C6ECDF3AB251BE4894318F4E4512369780C3ECFA@IRSMSX109.ger.corp.intel.com \
--to=anatoly.burakov@intel.com \
--cc=dan@kernelim.com \
--cc=dev@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).