From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-wi0-f182.google.com (mail-wi0-f182.google.com [209.85.212.182]) by dpdk.org (Postfix) with ESMTP id 3289B5A77 for ; Wed, 28 Jan 2015 23:05:00 +0100 (CET) Received: by mail-wi0-f182.google.com with SMTP id n3so16160275wiv.3 for ; Wed, 28 Jan 2015 14:05:00 -0800 (PST) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20130820; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references; bh=1/60LPEdwDsunfcHE4KlnQ6SEvkB/lbV5BYr7LAITuI=; b=dBpWQGU+UPdwAMfNeiZMp7OdP1EQGVIJZGUfxYci4EmgZPBm8WqXW/QXfe/8xDw80d jWnmusyxOxU/zSFaewdiPRL60p7oFpuR2Oqn7/yWZVXmRIygYMaA5VmnPtUkqSVuSNPj 48rNaueMVCizBshm3RDt5f+f0btUF53/9FqXm5L/qt4lcUVtKd4jVLYVDLRMTTaOKUeh MyOkDG32Veaddn71Ia8hpH25uQNDrI6XArwhjUXGXEWKlmcAbFAV8Z39aVHEcMhbUzD7 viqnYBpAsZS8yYM6UOsKsgAfJTPMicfZNk885PY/PvScLeXKx2m0NbBf33G7WGmLsNST sIag== X-Gm-Message-State: ALoCoQnscyegUBmhNm5L/E57CbRP2Qe9PbD+vhicfeyZK9KFyC7bgv4Rzt1MRUHpKWeVz4S8JUu6 X-Received: by 10.194.71.164 with SMTP id w4mr11400115wju.19.1422482698768; Wed, 28 Jan 2015 14:04:58 -0800 (PST) Received: from carbon.home.aloni.org ([188.120.132.209]) by mx.google.com with ESMTPSA id k3sm7826315wje.30.2015.01.28.14.04.57 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Wed, 28 Jan 2015 14:04:58 -0800 (PST) From: Dan Aloni To: dev@dpdk.org Date: Thu, 29 Jan 2015 00:04:53 +0200 Message-Id: <1422482693-14158-1-git-send-email-dan@kernelim.com> X-Mailer: git-send-email 1.9.3 In-Reply-To: References: Subject: [dpdk-dev] [PATCH v2] eal/linux: allow to map BARs with MSI-X tables, around them X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 28 Jan 2015 22:05:00 -0000 While VFIO doesn't allow us to map complete BARs with MSI-X tables, it does allow us to map around them in PAGE_SIZE granularity. There might be adapters that provide their registers in the same BAR but on a different page. For example, Intel's NVME adapter, though not a network adapter, provides only one MMIO BAR that contains the MSI-X table. Signed-off-by: Dan Aloni Signed-off-by: Anatoly Burakov --- lib/librte_eal/linuxapp/eal/eal_pci.c | 5 +- lib/librte_eal/linuxapp/eal/eal_pci_init.h | 2 +- lib/librte_eal/linuxapp/eal/eal_pci_uio.c | 4 +- lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 98 +++++++++++++++++++++++++++--- lib/librte_eal/linuxapp/eal/eal_vfio.h | 8 ++- 5 files changed, 100 insertions(+), 17 deletions(-) diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c index b5f54101e8aa..4a74a9372a15 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c @@ -118,13 +118,14 @@ pci_find_max_end_va(void) /* map a particular resource from a file */ void * -pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size) +pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size, + int additional_flags) { void *mapaddr; /* Map the PCI memory resource of device */ mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, offset); + MAP_SHARED | additional_flags, fd, offset); if (mapaddr == MAP_FAILED) { RTE_LOG(ERR, EAL, "%s(): cannot mmap(%d, %p, 0x%lx, 0x%lx): %s (%p)\n", __func__, fd, requested_addr, diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h index 1070eb88fe0a..0a0853d4c4df 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h +++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h @@ -66,7 +66,7 @@ extern void *pci_map_addr; void *pci_find_max_end_va(void); void *pci_map_resource(void *requested_addr, int fd, off_t offset, - size_t size); + size_t size, int additional_flags); /* map IGB_UIO resource prototype */ int pci_uio_map_resource(struct rte_pci_device *dev); diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c index e53f06b82430..eaa2e36f643e 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c @@ -139,7 +139,7 @@ pci_uio_map_secondary(struct rte_pci_device *dev) if (pci_map_resource(uio_res->maps[i].addr, fd, (off_t)uio_res->maps[i].offset, - (size_t)uio_res->maps[i].size) + (size_t)uio_res->maps[i].size, 0) != uio_res->maps[i].addr) { RTE_LOG(ERR, EAL, "Cannot mmap device resource\n"); @@ -379,7 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev) pci_map_addr = pci_find_max_end_va(); mapaddr = pci_map_resource(pci_map_addr, fd, (off_t)offset, - (size_t)maps[j].size); + (size_t)maps[j].size, 0); if (mapaddr == MAP_FAILED) fail = 1; diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c index 20e097727f80..c8df91c0f800 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -62,6 +62,9 @@ #ifdef VFIO_PRESENT +#define PAGE_SIZE (sysconf(_SC_PAGESIZE)) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + #define VFIO_DIR "/dev/vfio" #define VFIO_CONTAINER_PATH "/dev/vfio/vfio" #define VFIO_GROUP_FMT "/dev/vfio/%u" @@ -72,10 +75,12 @@ static struct vfio_config vfio_cfg; /* get PCI BAR number where MSI-X interrupts are */ static int -pci_vfio_get_msix_bar(int fd, int *msix_bar) +pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, + uint32_t *msix_table_size) { int ret; uint32_t reg; + uint16_t flags; uint8_t cap_id, cap_offset; /* read PCI capability pointer from config space */ @@ -134,7 +139,18 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar) return -1; } + ret = pread64(fd, &flags, sizeof(flags), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset + 2); + if (ret != sizeof(flags)) { + RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config " + "space!\n"); + return -1; + } + *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR; + *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; + *msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); return 0; } @@ -532,6 +548,8 @@ pci_vfio_map_resource(struct rte_pci_device *dev) int i, ret, msix_bar; struct mapped_pci_resource *vfio_res = NULL; struct pci_map *maps; + uint32_t msix_table_offset = 0; + uint32_t msix_table_size = 0; dev->intr_handle.fd = -1; dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; @@ -657,9 +675,10 @@ pci_vfio_map_resource(struct rte_pci_device *dev) } /* get MSI-X BAR, if any (we have to know where it is because we can't - * mmap it when using VFIO) */ + * easily mmap it when using VFIO) */ msix_bar = -1; - ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar); + ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar, + &msix_table_offset, &msix_table_size); if (ret < 0) { RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", pci_addr); close(vfio_dev_fd); @@ -702,6 +721,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev) for (i = 0; i < (int) vfio_res->nb_maps; i++) { struct vfio_region_info reg = { .argsz = sizeof(reg) }; void *bar_addr; + struct memreg { + uint32_t offset, size; + } memreg[2] = {}; reg.index = i; @@ -720,21 +742,77 @@ pci_vfio_map_resource(struct rte_pci_device *dev) if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) continue; - /* skip MSI-X BAR */ - if (i == msix_bar) - continue; + if (i == msix_bar) { + /* + * VFIO will not let us map the MSI-X table, + * but we can map around it. + */ + uint32_t table_start = msix_table_offset; + uint32_t table_end = table_start + msix_table_size; + table_end = (table_end + ~PAGE_MASK) & PAGE_MASK; + table_start &= PAGE_MASK; + + if (table_start == 0 && table_end >= reg.size) { + /* Cannot map this BAR */ + RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i); + continue; + } else { + memreg[0].offset = reg.offset; + memreg[0].size = table_start; + memreg[1].offset = table_end; + memreg[1].size = reg.size - table_end; + + RTE_LOG(DEBUG, EAL, + "Trying to map BAR %d that contains the MSI-X " + "table. Trying offsets: " + "%04x:%04x, %04x:%04x\n", i, + memreg[0].offset, memreg[0].size, + memreg[1].offset, memreg[1].size); + } + } else { + memreg[0].offset = reg.offset; + memreg[0].size = reg.size; + } + /* try to figure out an address */ if (internal_config.process_type == RTE_PROC_PRIMARY) { /* try mapping somewhere close to the end of hugepages */ if (pci_map_addr == NULL) pci_map_addr = pci_find_max_end_va(); - bar_addr = pci_map_resource(pci_map_addr, vfio_dev_fd, reg.offset, - reg.size); + bar_addr = pci_map_addr; pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); } else { - bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset, - reg.size); + bar_addr = maps[i].addr; + } + + /* reserve the address using an inaccessible mapping */ + bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (bar_addr != MAP_FAILED) { + void *map_addr = NULL; + if (memreg[0].size) { + /* actual map of first part */ + map_addr = pci_map_resource(bar_addr, vfio_dev_fd, + memreg[0].offset, + memreg[0].size, + MAP_FIXED); + } + + /* if there's a second part, try to map it */ + if (map_addr != MAP_FAILED + && memreg[1].offset && memreg[1].size) { + void *second_addr = RTE_PTR_ADD(bar_addr, memreg[1].offset); + map_addr = pci_map_resource(second_addr, + vfio_dev_fd, memreg[1].offset, + memreg[1].size, + MAP_FIXED); + } + + if (map_addr == MAP_FAILED || !map_addr) { + munmap(bar_addr, reg.size); + bar_addr = MAP_FAILED; + } } if (bar_addr == MAP_FAILED || diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h index 03e693e01bf0..72ec3f62a3d8 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -43,9 +43,13 @@ #include #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0) -#define RTE_PCI_MSIX_TABLE_BIR 0x7 +#define RTE_PCI_MSIX_TABLE_BIR 0x7 +#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8 +#define RTE_PCI_MSIX_FLAGS_QSIZE 0x07ff #else -#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR +#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR +#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET +#define RTE_PCI_MSIX_FLAGS_QSIZE PCI_MSIX_FLAGS_QSIZE #endif #define VFIO_PRESENT -- 1.9.3