DPDK patches and discussions
* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
@ 2014-05-27  3:19 Xu, HuilongX
  0 siblings, 0 replies; 10+ messages in thread
From: Xu, HuilongX @ 2014-05-27  3:19 UTC (permalink / raw)
  To: dev, Burakov, Anatoly

VFIO requires kernel 3.6 or newer, so the VFIO code is compiled only when
the DPDK config option CONFIG_RTE_EAL_VFIO is enabled and kernel 3.6+ is
detected. This prevents compile failures on older kernels even when VFIO
is enabled in the config (and it is, by default).

Since VFIO cannot be used to map the same device twice, secondary
processes receive the necessary fds from the primary process over a
local socket. Only the group and container fds need to be sent, as device
fds can be obtained via ioctl() calls on the group fd.
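
(For context, this exchange is ordinary SCM_RIGHTS fd passing over an AF_UNIX
socket. Below is a minimal, self-contained sketch of the sending side; the
helper name is illustrative and not the patch's actual function:)

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Send one fd over a connected AF_UNIX socket using SCM_RIGHTS.
     * Returns 0 on success, -1 on error. Illustrative sketch only. */
    static int send_fd(int sock, int fd)
    {
        int ok = 0;                               /* small payload, e.g. an "OK" code */
        struct iovec iov = { .iov_base = &ok, .iov_len = sizeof(ok) };
        union {                                   /* ensures cmsghdr alignment */
            char buf[CMSG_SPACE(sizeof(int))];
            struct cmsghdr align;
        } u;
        struct msghdr msg;
        struct cmsghdr *cmsg;

        memset(&msg, 0, sizeof(msg));
        memset(&u, 0, sizeof(u));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = u.buf;
        msg.msg_controllen = sizeof(u.buf);

        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

        return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
    }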

For multi-process operation, VFIO distinguishes between existing but
unused groups (e.g. groups that are not bound to the VFIO driver) and
non-existent groups, so that it can tell whether a secondary process is
requesting a valid group or one that does not exist.
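
(Roughly, the check comes down to sysfs and /dev/vfio: a missing iommu_group
link means the group does not exist at all, while an existing link whose
/dev/vfio/<group> node fails to open with ENOENT means the group exists but is
not bound to the VFIO driver. The helper below is an illustrative sketch only;
the patch reports these two cases to the secondary process as SOCKET_ERR and
SOCKET_NO_FD respectively:)

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Return: -1 group does not exist, 0 group exists but is unbound,
     * >0 an open group fd. Illustrative sketch only. */
    static int probe_group(const char *pci_addr, int group_no)
    {
        char path[256];
        char link[256];
        int fd;

        snprintf(path, sizeof(path),
                 "/sys/bus/pci/devices/%s/iommu_group", pci_addr);
        if (readlink(path, link, sizeof(link) - 1) < 0)
            return -1;                           /* no IOMMU group at all */

        snprintf(path, sizeof(path), "/dev/vfio/%d", group_no);
        fd = open(path, O_RDWR);
        if (fd < 0)
            return (errno == ENOENT) ? 0 : -1;   /* exists but not bound to VFIO */
        return fd;
    }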

Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>

Tested-by: HuilongX Xu <huilongx.xu@intel.com>



Compile pass

Compile OS: FC20 x86_64
Kernel version: 3.13.6-200
GCC version: 4.8.2
Server: Crownpass

---
lib/librte_eal/linuxapp/eal/Makefile               |    5 +-
lib/librte_eal/linuxapp/eal/eal.c                  |    1 +
lib/librte_eal/linuxapp/eal/eal_pci_vfio.c         |  719 ++++++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c  |  367 ++++++++++
.../linuxapp/eal/include/eal_internal_cfg.h        |    3 +
lib/librte_eal/linuxapp/eal/include/eal_pci_init.h |   55 ++
lib/librte_eal/linuxapp/eal/include/eal_vfio.h     |    6 +
7 files changed, 1155 insertions(+), 1 deletions(-)
create mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
create mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c

diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 527fa2a..3a39cca 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -58,6 +58,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_thread.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_log.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_uio.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio_socket.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_debug.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_lcore.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_timer.c
@@ -88,12 +90,13 @@ CFLAGS_eal_common_log.o := -D_GNU_SOURCE
CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE
CFLAGS_eal_pci.o := -D_GNU_SOURCE
CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE
+CFLAGS_eal_pci_vfio.o := -D_GNU_SOURCE
 # workaround for a gcc bug with noreturn attribute
# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
CFLAGS_eal_thread.o += -Wno-return-type
-CFLAGS_eal_hpet.o += -Wno-return-type
+CFLAGS_eal_pci_vfio_socket.o += -Wno-return-type
endif
 INC := rte_per_lcore.h rte_lcore.h rte_interrupts.h rte_kni_common.h rte_dom0_common.h
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index de182e1..01bfd6c 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -650,6 +650,7 @@ eal_parse_args(int argc, char **argv)
               internal_config.force_sockets = 0;
               internal_config.syslog_facility = LOG_DAEMON;
               internal_config.xen_dom0_support = 0;
+             internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
#ifdef RTE_LIBEAL_USE_HPET
               internal_config.no_hpet = 0;
#else
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
new file mode 100644
index 0000000..0a6f95c
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -0,0 +1,719 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <linux/pci_regs.h>
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_tailq.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+#include "eal_vfio.h"
+
+/**
+ * @file
+ * PCI probing under linux (VFIO version)
+ *
+ * This code tries to determine if the PCI device is bound to VFIO driver,
+ * and initialize it (map BARs, set up interrupts) if that's the case.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+/* get PCI BAR number where MSI-X interrupts are */
+static int
+pci_vfio_get_msix_bar(int fd, int * msix_bar)
+{
+             int ret;
+             uint32_t reg;
+             uint8_t cap_id, cap_offset;
+
+             /* read PCI capability pointer from config space */
+             ret = pread64(fd, &reg, sizeof(reg),
+                                             VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                                             PCI_CAPABILITY_LIST);
+             if (ret != sizeof(reg)) {
+                             RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+                                                             "config space!\n");
+                             return -1;
+             }
+
+             /* we need first byte */
+             cap_offset = reg & 0xFF;
+
+             while (cap_offset){
+
+                             /* read PCI capability ID */
+                             ret = pread64(fd, &reg, sizeof(reg),
+                                                             VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                                                             cap_offset);
+                             if (ret != sizeof(reg)) {
+                                             RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
+                                                                             "config space!\n");
+                                             return -1;
+                             }
+
+                             /* we need first byte */
+                             cap_id = reg & 0xFF;
+
+                             /* if we haven't reached MSI-X, check next capability */
+                             if (cap_id != PCI_CAP_ID_MSIX) {
+                                             ret = pread64(fd, &reg, sizeof(reg),
+                                                                             VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                                                                             cap_offset);
+                                             if (ret != sizeof(reg)) {
+                                                             RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+                                                                                             "config space!\n");
+                                                             return -1;
+                                             }
+
+                                             /* we need second byte */
+                                             cap_offset = (reg & 0xFF00) >> 8;
+
+                                             continue;
+                             }
+                             /* else, read table offset */
+                             else {
+                                             /* table offset resides in the next 4 bytes */
+                                             ret = pread64(fd, &reg, sizeof(reg),
+                                                                             VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                                                                             cap_offset + 4);
+                                             if (ret != sizeof(reg)) {
+                                                             RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
+                                                                                             "space!\n");
+                                                             return -1;
+                                             }
+
+                                             *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
+
+                                             return 0;
+                             }
+             }
+             return 0;
+}
+
+/* set PCI bus mastering */
+static int
+pci_vfio_set_bus_master(int dev_fd)
+{
+             uint16_t reg;
+             int ret;
+
+             ret = pread64(dev_fd, &reg, sizeof(reg),
+                                             VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                                             PCI_COMMAND);
+             if (ret != sizeof(reg)) {
+                             RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
+                             return -1;
+             }
+
+             /* set the master bit */
+             reg |= PCI_COMMAND_MASTER;
+
+             ret = pwrite64(dev_fd, &reg, sizeof(reg),
+                                             VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                                             PCI_COMMAND);
+
+             if (ret != sizeof(reg)) {
+                             RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
+                             return -1;
+             }
+
+             return 0;
+}
+
+/* set up DMA mappings */
+static int
+pci_vfio_setup_dma_maps(int vfio_container_fd)
+{
+             const struct rte_memseg * ms = rte_eal_get_physmem_layout();
+             int i, ret;
+
+             ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+                                             VFIO_TYPE1_IOMMU);
+             if (ret) {
+                             RTE_LOG(ERR, EAL, "  cannot set IOMMU type!\n");
+                             return -1;
+             }
+
+             /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+             for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+                             struct vfio_iommu_type1_dma_map dma_map;
+
+                             if (ms[i].addr == NULL)
+                                             break;
+
+                             memset(&dma_map, 0, sizeof(dma_map));
+                             dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+                             dma_map.vaddr = ms[i].addr_64;
+                             dma_map.size = ms[i].len;
+                             dma_map.iova = ms[i].phys_addr;
+                             dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+                             ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+                             if (ret) {
+                                             RTE_LOG(ERR, EAL, "  cannot set up DMA remapping!\n");
+                                             return -1;
+                             }
+             }
+
+             return 0;
+}
+
+/* set up interrupt support (but not enable interrupts) */
+static int
+pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd,
+                             int num_irqs)
+{
+             int i, ret, intr_idx;
+             enum rte_intr_handle_type handle_type;
+
+             /* get interrupt type from internal config (MSI-X by default, can be
+             * overriden from the command line
+             */
+             switch (internal_config.vfio_intr_mode) {
+             case RTE_INTR_MODE_MSIX:
+                             intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
+                             handle_type = RTE_INTR_HANDLE_VFIO_MSIX;
+                             break;
+             case RTE_INTR_MODE_LEGACY:
+                             intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
+                             handle_type = RTE_INTR_HANDLE_VFIO_LEGACY;
+                             break;
+             default:
+                             RTE_LOG(ERR, EAL, "  unknown default interrupt type!\n");
+                             return -1;
+             }
+
+             for (i = 0; i < num_irqs; i++) {
+                             struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+                             int fd = -1;
+
+                             /* skip interrupt modes we don't want */
+                             if (i != intr_idx)
+                                             continue;
+
+                             irq.index = i;
+
+                             ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
+                             if (ret < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot get IRQ info!\n");
+                                             return -1;
+                             }
+
+                             /* fail if this vector cannot be used with eventfd */
+                             if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
+                                             RTE_LOG(ERR, EAL, "  interrupt vector does not support eventfd!\n");
+                                             return -1;
+                             }
+
+                             /* set up an eventfd for interrupts */
+                             fd = eventfd(0, 0);
+                             if (fd < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot set up eventfd!\n");
+                                             return -1;
+                             }
+
+                             dev->intr_handle.type = handle_type;
+                             dev->intr_handle.fd = fd;
+                             dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+
+                             return 0;
+             }
+
+             /* if we're here, we haven't found a suitable interrupt vector */
+             return -1;
+}
+
+/* open container fd or get an existing one */
+static int
+pci_vfio_get_container_fd(void)
+{
+             int ret, vfio_container_fd;
+
+             /* if we're in a primary process, try to open the container */
+             if (internal_config.process_type == RTE_PROC_PRIMARY) {
+                             vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
+                             if (vfio_container_fd < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot open VFIO container!\n");
+                                             return -1;
+                             }
+
+                             /* check VFIO API version */
+                             ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
+                             if (ret != VFIO_API_VERSION) {
+                                             RTE_LOG(ERR, EAL, "  unknown VFIO API version!\n");
+                                             close(vfio_container_fd);
+                                             return -1;
+                             }
+
+                             /* check if we support IOMMU type 1 */
+                             ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+                             if (!ret) {
+                                             RTE_LOG(ERR, EAL, "  unknown IOMMU driver!\n");
+                                             close(vfio_container_fd);
+                                             return -1;
+                             }
+
+                             return vfio_container_fd;
+             }
+             /* if we're in a secondary process, request container fd from the primary
+             * process via our socket
+             */
+             else {
+                             int socket_fd;
+                             if ((socket_fd = vfio_socket_connect_to_primary()) < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+                                             return -1;
+                             }
+                             if (vfio_socket_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+                                             close(socket_fd);
+                                             return -1;
+                             }
+                             vfio_container_fd = vfio_socket_receive_fd(socket_fd);
+                             if (vfio_container_fd < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
+                                             close(socket_fd);
+                                             return -1;
+                             }
+                             close(socket_fd);
+                             return vfio_container_fd;
+             }
+
+             return -1;
+}
+
+/* open group fd or get an existing one */
+static int
+pci_vfio_get_group_fd(int iommu_group_no)
+{
+             int i;
+             int vfio_group_fd;
+             char filename[PATH_MAX];
+
+             /* check if we already have the group descriptor open */
+             for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
+                             if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
+                                             return vfio_cfg.vfio_groups[i].fd;
+
+             /* if primary, try to open the group */
+             if (internal_config.process_type == RTE_PROC_PRIMARY) {
+                             rte_snprintf(filename, sizeof(filename),
+                                                             VFIO_GROUP_FMT, iommu_group_no);
+                             vfio_group_fd = open(filename, O_RDWR);
+                             if (vfio_group_fd < 0) {
+                                             /* if file not found, it's not an error */
+                                             if (errno != ENOENT) {
+                                                             RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+                                                                                             strerror(errno));
+                                                             return -1;
+                                             }
+                                             return 0;
+                             }
+
+                             /* if the fd is valid, create a new group for it */
+                             if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
+                                             RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+                                             return -1;
+                             }
+                             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
+                             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+                             return vfio_group_fd;
+             }
+             /* if we're in a secondary process, request group fd from the primary
+             * process via our socket
+             */
+             else {
+                             int socket_fd, ret;
+                             if ((socket_fd = vfio_socket_connect_to_primary()) < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+                                             return -1;
+                             }
+                             if (vfio_socket_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+                                             close(socket_fd);
+                                             return -1;
+                             }
+                             if (vfio_socket_send_request(socket_fd, iommu_group_no) < 0) {
+                                             RTE_LOG(ERR, EAL, "  cannot send group number!\n");
+                                             close(socket_fd);
+                                             return -1;
+                             }
+                             ret = vfio_socket_receive_request(socket_fd);
+                             switch(ret) {
+                             case SOCKET_NO_FD:
+                                             close(socket_fd);
+                                             return 0;
+                             case SOCKET_OK:
+                                             vfio_group_fd = vfio_socket_receive_fd(socket_fd);
+                                             /* if we got the fd, return it */
+                                             if (vfio_group_fd > 0) {
+                                                             close(socket_fd);
+                                                             return vfio_group_fd;
+                                             }
+                                             /* fall-through on error */
+                             default:
+                                             RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
+                                             close(socket_fd);
+                                             return -1;
+                             }
+             }
+             return -1;
+}
+
+/* parse IOMMU group number for a PCI device
+ * returns -1 for errors, 0 for non-existent group */
+static int
+pci_vfio_get_group_no(const char * pci_addr)
+{
+             char linkname[PATH_MAX];
+             char filename[PATH_MAX];
+             char * tok[16], *group_tok, *end;
+             int ret, iommu_group_no;
+
+             memset(linkname, 0, sizeof(linkname));
+             memset(filename, 0, sizeof(filename));
+
+             /* try to find out IOMMU group for this device */
+             rte_snprintf(linkname, sizeof(linkname),
+                                             SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr);
+
+             ret = readlink(linkname, filename, sizeof(filename));
+
+             /* if the link doesn't exist, no VFIO for us */
+             if (ret < 0)
+                             return 0;
+
+             ret = rte_strsplit(filename, sizeof(filename),
+                                             tok, RTE_DIM(tok), '/');
+
+             if (ret <= 0) {
+                             RTE_LOG(ERR, EAL, "  %s cannot get IOMMU group\n", pci_addr);
+                             return -1;
+             }
+
+             /* IOMMU group is always the last token */
+             errno = 0;
+             group_tok = tok[ret - 1];
+             end = group_tok;
+             iommu_group_no = strtol(group_tok, &end, 10);
+             if ((end != group_tok && *end != '\0') || errno != 0) {
+                             RTE_LOG(ERR, EAL, "  %s error parsing IOMMU number!\n", pci_addr);
+                             return -1;
+             }
+
+             return iommu_group_no;
+}
+
+static void
+clear_current_group(void)
+{
+             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
+             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
+}
+
+
+/*
+ * map the PCI resources of a PCI device in virtual memory (VFIO version).
+ * primary and secondary processes follow almost exactly the same path
+ */
+int
+pci_vfio_map_resource(struct rte_pci_device *dev)
+{
+             struct vfio_group_status group_status =
+                                                                             { .argsz = sizeof(group_status) };
+             struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+             int vfio_group_fd, vfio_dev_fd;
+             int iommu_group_no;
+             char pci_addr[PATH_MAX] = {0};
+             struct rte_pci_addr *loc = &dev->addr;
+             int i, ret, msix_bar;
+             struct mapped_pci_resource *vfio_res = NULL;
+             struct pci_map *maps;
+
+             dev->intr_handle.fd = -1;
+             dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+
+             /* store PCI address string */
+             rte_snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+                                             loc->domain, loc->bus, loc->devid, loc->function);
+
+             /* get container fd (needs to be done only once per initialization) */
+             if (vfio_cfg.vfio_container_fd == -1) {
+                             int vfio_container_fd = pci_vfio_get_container_fd();
+                             if (vfio_container_fd < 0) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot open VFIO container!\n", pci_addr);
+                                             return -1;
+                             }
+
+                             vfio_cfg.vfio_container_fd = vfio_container_fd;
+             }
+
+             /* get group number */
+             iommu_group_no = pci_vfio_get_group_no(pci_addr);
+
+             /* if 0, group doesn't exist */
+             if (iommu_group_no == 0) {
+                             RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+                                                             pci_addr);
+                             return 1;
+             }
+             /* if negative, something failed */
+             else if (iommu_group_no < 0)
+                             return -1;
+
+             /* get the actual group fd */
+             vfio_group_fd = pci_vfio_get_group_fd(iommu_group_no);
+             if (vfio_group_fd < 0) {
+                             return -1;
+             }
+
+             /* store group fd */
+             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
+             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+
+             /* if group_fd == 0, that means the device isn't managed by VFIO */
+             if (vfio_group_fd == 0) {
+                             RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+                                                             pci_addr);
+                             /* we store 0 as group fd to distinguish between existing but
+                             * unbound VFIO groups, and groups that don't exist at all.
+                             */
+                             vfio_cfg.vfio_group_idx++;
+                             return 1;
+             }
+
+             /*
+             * at this point, we know at least one port on this device is bound to VFIO,
+             * so we can proceed to try and set this particular port up
+             */
+
+             /* check if the group is viable */
+             ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+             if (ret) {
+                             RTE_LOG(ERR, EAL, "  %s cannot get group status!\n", pci_addr);
+                             close(vfio_group_fd);
+                             clear_current_group();
+                             return -1;
+             }
+             else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+                             RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", pci_addr);
+                             close(vfio_group_fd);
+                             clear_current_group();
+                             return -1;
+             }
+
+             /*
+             * at this point, we know that this group is viable (meaning, all devices
+             * are either bound to VFIO or not bound to anything)
+             */
+
+             /* check if group does not have a container yet */
+             if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+
+                             /* add group to a container */
+                             ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+                                                             &vfio_cfg.vfio_container_fd);
+                             if (ret) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container!\n",
+                                                                             pci_addr);
+                                             close(vfio_group_fd);
+                                             clear_current_group();
+                                             return -1;
+                             }
+                             /*
+                             * at this point we know that this group has been successfully
+                             * initialized, so we increment vfio_group_idx to indicate that we can
+                             * add new groups.
+                             */
+                             vfio_cfg.vfio_group_idx++;
+             }
+
+             /*
+             * set up DMA mappings for container (needs to be done only once, only when
+             * at least one group is assigned to a container and only in primary process)
+             */
+             if (internal_config.process_type == RTE_PROC_PRIMARY &&
+                                             vfio_cfg.vfio_container_has_dma == 0) {
+                             ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+                             if (ret) {
+                                             RTE_LOG(ERR, EAL, "  %s DMA remapping failed!\n", pci_addr);
+                                             return -1;
+                             }
+                             vfio_cfg.vfio_container_has_dma = 1;
+             }
+
+             /* get a file descriptor for the device */
+             vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr);
+             if (vfio_dev_fd < 0) {
+                             /* if we cannot get a device fd, this simply means that this
+                             * particular port is not bound to VFIO
+                             */
+                             RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+                                                             pci_addr);
+                             return 1;
+             }
+
+             /* test and setup the device */
+             ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &device_info);
+             if (ret) {
+                             RTE_LOG(ERR, EAL, "  %s cannot get device info!\n", pci_addr);
+                             close(vfio_dev_fd);
+                             return -1;
+             }
+
+             /* get MSI-X BAR, if any (we have to know where it is because we can't
+             * mmap it when using VFIO) */
+             msix_bar = -1;
+             ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar);
+             if (ret < 0) {
+                             RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n", pci_addr);
+                             close(vfio_dev_fd);
+                             return -1;
+             }
+
+             /* if we're in a primary process, allocate vfio_res and get region info */
+             if (internal_config.process_type == RTE_PROC_PRIMARY) {
+                             if ((vfio_res = rte_zmalloc("VFIO_RES", sizeof (*vfio_res), 0)) == NULL) {
+                                             RTE_LOG(ERR, EAL,
+                                                             "%s(): cannot store uio mmap details\n", __func__);
+                                             close(vfio_dev_fd);
+                                             return -1;
+                             }
+                             memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
+
+                             /* get number of registers (up to BAR5) */
+                             vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
+                                                             VFIO_PCI_BAR5_REGION_INDEX + 1);
+             }
+             /* if we're in a secondary process, just find our tailq entry and use that */
+             else {
+                             TAILQ_FOREACH(vfio_res, pci_res_list, next) {
+                                             if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+                                                             continue;
+                                             break;
+                             }
+                             /* if we haven't found our tailq entry, something's wrong */
+                             if (vfio_res == NULL) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
+                                                                             pci_addr);
+                                             close(vfio_dev_fd);
+                                             return -1;
+                             }
+             }
+
+             /* map BARs */
+             maps = vfio_res->maps;
+
+             for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+                             struct vfio_region_info reg = { .argsz = sizeof(reg) };
+                             void * bar_addr;
+
+                             reg.index = i;
+
+                             ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+
+                             if (ret) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot get device region info!\n",
+                                                                             pci_addr);
+                                             close(vfio_dev_fd);
+                                             if (internal_config.process_type == RTE_PROC_PRIMARY)
+                                                             rte_free(vfio_res);
+                                             return -1;
+                             }
+
+                             /* skip non-mmapable BARs */
+                             if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+                                             continue;
+
+                             /* skip MSI-X BAR */
+                             if (i == msix_bar)
+                                             continue;
+
+                             bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset,
+                                                             reg.size);
+
+                             if (bar_addr == NULL) {
+                                             RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n", pci_addr, i,
+                                                                             strerror(errno));
+                                             close(vfio_dev_fd);
+                                             if (internal_config.process_type == RTE_PROC_PRIMARY)
+                                                             rte_free(vfio_res);
+                                             return -1;
+                             }
+
+                             maps[i].addr = bar_addr;
+                             maps[i].offset = reg.offset;
+                             maps[i].size = reg.size;
+                             dev->mem_resource[i].addr = bar_addr;
+             }
+
+             /* if secondary process, do not set up interrupts */
+             if (internal_config.process_type == RTE_PROC_PRIMARY) {
+                             if (pci_vfio_setup_interrupts(dev, vfio_dev_fd,
+                                                             (int) device_info.num_irqs) != 0) {
+                                             RTE_LOG(ERR, EAL, "  %s error setting up interrupts!\n", pci_addr);
+                                             close(vfio_dev_fd);
+                                             rte_free(vfio_res);
+                                             return -1;
+                             }
+
+                             /* set bus mastering for the device */
+                             if (pci_vfio_set_bus_master(vfio_dev_fd)) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot set up bus mastering!\n", pci_addr);
+                                             close(vfio_dev_fd);
+                                             rte_free(vfio_res);
+                                             return -1;
+                             }
+
+                             /* Reset the device */
+                             ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
+             }
+
+             if (internal_config.process_type == RTE_PROC_PRIMARY)
+                             TAILQ_INSERT_TAIL(pci_res_list, vfio_res, next);
+
+             return (0);
+}
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c
new file mode 100644
index 0000000..1605fce
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+
+/* sys/un.h with __USE_MISC uses strlen, which is unsafe and should not be used. */
+#ifdef __USE_MISC
+#define REMOVED_USE_MISC
+#undef __USE_MISC
+#endif
+#include <sys/un.h>
+/* make sure we redefine __USE_MISC only if it was previously undefined */
+#ifdef REMOVED_USE_MISC
+#define __USE_MISC
+#undef REMOVED_USE_MISC
+#endif
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_tailq.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+
+/**
+ * @file
+ * VFIO socket for communication between primary and secondary processes.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
+#define CMSGLEN (CMSG_LEN(sizeof(int)))
+#define FD_TO_CMSGHDR(fd,chdr) \
+                             do {\
+                                             (chdr).cmsg_len = CMSGLEN;\
+                                             (chdr).cmsg_level = SOL_SOCKET;\
+                                             (chdr).cmsg_type = SCM_RIGHTS;\
+                                             memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
+                             } while(0)
+#define CMSGHDR_TO_FD(chdr,fd) \
+                             do {\
+                                             memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd));\
+                             } while (0)
+
+
+/* get socket path (/var/run if root, $HOME otherwise) */
+static void
+get_socket_path(char * buffer, int bufsz)
+{
+             const char *dir = "/var/run";
+             const char *home_dir = getenv("HOME");
+
+             if (getuid() != 0 && home_dir != NULL)
+                             dir = home_dir;
+
+             /* use current prefix as file path */
+             rte_snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
+                                             internal_config.hugefile_prefix);
+}
+
+
+
+/*
+ * data flow for socket comm protocol:
+ * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
+ * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
+ * 2. server receives message
+ * 2a. in case of invalid group, SOCKET_ERR is sent back to client
+ * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
+ * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
+ *
+ * in case of any error, socket is closed.
+ */
+
+/* send a request, return -1 on error */
+int
+vfio_socket_send_request(int socket, int req)
+{
+             struct msghdr hdr;
+             struct iovec iov;
+             int buf;
+             int ret;
+
+             memset(&hdr, 0, sizeof(hdr));
+
+             buf = req;
+
+             hdr.msg_iov = &iov;
+             hdr.msg_iovlen = 1;
+             iov.iov_base = (char*) &buf;
+             iov.iov_len = sizeof(buf);
+
+             ret = sendmsg(socket, &hdr, 0);
+             if (ret < 0)
+                             return -1;
+             return 0;
+}
+
+/* receive a request and return it */
+int
+vfio_socket_receive_request(int socket)
+{
+             int buf;
+             struct msghdr hdr;
+             struct iovec iov;
+             int ret, req;
+
+             memset(&hdr, 0, sizeof(hdr));
+
+             buf = SOCKET_ERR;
+
+             hdr.msg_iov = &iov;
+             hdr.msg_iovlen = 1;
+             iov.iov_base = (char*) &buf;
+             iov.iov_len = sizeof(buf);
+
+             ret = recvmsg(socket, &hdr, 0);
+             if (ret < 0)
+                             return -1;
+
+             req = buf;
+
+             return req;
+}
+
+/* send OK in message, fd in control message */
+int
+vfio_socket_send_fd(int socket, int fd)
+{
+             int buf;
+             struct msghdr hdr;
+             struct cmsghdr * chdr;
+             char chdr_buf[CMSGLEN];
+             struct iovec iov;
+             int ret;
+
+             chdr = (struct cmsghdr *) chdr_buf;
+             memset(chdr, 0, sizeof(chdr_buf));
+             memset(&hdr, 0, sizeof(hdr));
+
+             hdr.msg_iov = &iov;
+             hdr.msg_iovlen = 1;
+             iov.iov_base = (char*) &buf;
+             iov.iov_len = sizeof(buf);
+             hdr.msg_control = chdr;
+             hdr.msg_controllen = CMSGLEN;
+
+             buf = SOCKET_OK;
+             FD_TO_CMSGHDR(fd, *chdr);
+
+             ret = sendmsg(socket, &hdr, 0);
+             if (ret < 0)
+                             return -1;
+             return 0;
+}
+
+/* receive OK in message, fd in control message */
+int
+vfio_socket_receive_fd(int socket)
+{
+             int buf;
+             struct msghdr hdr;
+             struct cmsghdr * chdr;
+             char chdr_buf[CMSGLEN];
+             struct iovec iov;
+             int ret, req, fd;
+
+             buf = SOCKET_ERR;
+
+             chdr = (struct cmsghdr *) chdr_buf;
+             memset(chdr, 0, sizeof(chdr_buf));
+             memset(&hdr, 0, sizeof(hdr));
+
+             hdr.msg_iov = &iov;
+             hdr.msg_iovlen = 1;
+             iov.iov_base = (char*) &buf;
+             iov.iov_len = sizeof(buf);
+             hdr.msg_control = chdr;
+             hdr.msg_controllen = CMSGLEN;
+
+             ret = recvmsg(socket, &hdr, 0);
+             if (ret < 0)
+                             return -1;
+
+             req = buf;
+
+             if (req != SOCKET_OK)
+                             return -1;
+
+             CMSGHDR_TO_FD(*chdr, fd);
+
+             return fd;
+}
+
+/* connect socket_fd in secondary process to the primary process's socket */
+int
+vfio_socket_connect_to_primary(void)
+{
+             struct sockaddr_un addr;
+             socklen_t sockaddr_len;
+             int socket_fd;
+
+             /* set up a socket */
+             socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+             if (socket_fd < 0) {
+                             RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+                             return -1;
+             }
+
+             get_socket_path(addr.sun_path, sizeof(addr.sun_path));
+             addr.sun_family = AF_UNIX;
+
+             sockaddr_len = sizeof(struct sockaddr_un);
+
+             if (connect(socket_fd, (struct sockaddr*) &addr, sockaddr_len) == 0)
+                             return socket_fd;
+
+             /* if connect failed */
+             close(socket_fd);
+             return -1;
+}
+
+
+
+/*
+ * socket listening thread for primary process
+ */
+__attribute__((noreturn)) void *
+pci_vfio_socket_thread(void *arg)
+{
+             int ret, i, vfio_group_no;
+             int socket_fd = *(int*) arg;
+
+             /* wait for requests on the socket */
+             for (;;) {
+                             int conn_sock;
+                             struct sockaddr_un addr;
+                             socklen_t sockaddr_len = sizeof(addr);
+
+                             /* this is a blocking call */
+                             conn_sock = accept(socket_fd, (struct sockaddr*) &addr, &sockaddr_len);
+
+                             /* just restart on error */
+                             if (conn_sock == -1)
+                                             continue;
+
+                             /* set socket to linger after close */
+                             struct linger l;
+                             l.l_onoff = 1;
+                             l.l_linger = 60;
+                             setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
+
+                             ret = vfio_socket_receive_request(conn_sock);
+
+                             switch (ret) {
+                             case SOCKET_REQ_CONTAINER:
+                                             vfio_socket_send_fd(conn_sock, vfio_cfg.vfio_container_fd);
+                                             break;
+                             case SOCKET_REQ_GROUP:
+                                             /* wait for group number */
+                                             vfio_group_no = vfio_socket_receive_request(conn_sock);
+                                             if (vfio_group_no < 0) {
+                                                             close(conn_sock);
+                                                             continue;
+                                             }
+                                             for (i = 0; i < vfio_cfg.vfio_group_idx; i++) {
+                                                             if (vfio_cfg.vfio_groups[i].group_no == vfio_group_no)
+                                                                             break;
+                                             }
+                                             /* if we reached end of the list, the group doesn't exist */
+                                             if (i == vfio_cfg.vfio_group_idx)
+                                                             vfio_socket_send_request(conn_sock, SOCKET_ERR);
+                                             /* if VFIO group exists but isn't bound to VFIO driver */
+                                             else if (vfio_cfg.vfio_groups[i].fd == 0)
+                                                             vfio_socket_send_request(conn_sock, SOCKET_NO_FD);
+                                             /* if group exists and is bound to VFIO driver */
+                                             else {
+                                                             vfio_socket_send_request(conn_sock, SOCKET_OK);
+                                                             vfio_socket_send_fd(conn_sock, vfio_cfg.vfio_groups[i].fd);
+                                             }
+                                             break;
+                             default:
+                                             vfio_socket_send_request(conn_sock, SOCKET_ERR);
+                                             break;
+                             }
+                             close(conn_sock);
+             }
+}
+
+/*
+ * set up a local socket and tell it to listen for incoming connections
+ */
+int
+pci_vfio_socket_setup(void)
+{
+             int ret, socket_fd;
+             struct sockaddr_un addr;
+             socklen_t sockaddr_len;
+
+             /* set up a socket */
+             socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+             if (socket_fd < 0) {
+                             RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+                             return -1;
+             }
+
+             get_socket_path(addr.sun_path, sizeof(addr.sun_path));
+             addr.sun_family = AF_UNIX;
+
+             sockaddr_len = sizeof(struct sockaddr_un);
+
+             unlink(addr.sun_path);
+
+             ret = bind(socket_fd, (struct sockaddr*) &addr, sockaddr_len);
+             if (ret) {
+                             RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
+                             close(socket_fd);
+                             return -1;
+             }
+
+             ret = listen(socket_fd, 50);
+             if (ret) {
+                             RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
+                             close(socket_fd);
+                             return -1;
+             }
+
+             return socket_fd;
+}
+
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h b/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
index 92e3065..5468b0a 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
@@ -40,6 +40,7 @@
#define _EAL_LINUXAPP_INTERNAL_CFG
 #include <rte_eal.h>
+#include <rte_pci_dev_feature_defs.h>
 #define MAX_HUGEPAGE_SIZES 3  /**< support up to 3 page sizes */
@@ -76,6 +77,8 @@ struct internal_config {
               volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
               uintptr_t base_virtaddr;          /**< base address to try and reserve memory from */
               volatile int syslog_facility;               /**< facility passed to openlog() */
+             /** default interrupt mode for VFIO */
+             volatile enum rte_intr_mode vfio_intr_mode;
               const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
               const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
index 699e80d..b163ab5 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
@@ -34,6 +34,8 @@
#ifndef EAL_PCI_INIT_H_
#define EAL_PCI_INIT_H_
+#include "eal_vfio.h"
+
struct pci_map {
               void *addr;
               uint64_t offset;
@@ -62,4 +64,57 @@ void * pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size)
/* map IGB_UIO resource prototype */
int pci_uio_map_resource(struct rte_pci_device *dev);
+#ifdef VFIO_PRESENT
+
+#define VFIO_MAX_GROUPS 64
+#define VFIO_DIR "/dev/vfio"
+#define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
+#define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
+
+/* map VFIO resource prototype */
+int pci_vfio_map_resource(struct rte_pci_device *dev);
+
+/*
+ * Function prototypes for VFIO socket functions
+ */
+int vfio_socket_send_request(int socket, int req);
+int vfio_socket_receive_request(int socket);
+int vfio_socket_send_fd(int socket, int fd);
+int vfio_socket_receive_fd(int socket);
+int vfio_socket_connect_to_primary(void);
+int pci_vfio_socket_setup(void);
+void * pci_vfio_socket_thread(void *arg);
+
+/* socket comm protocol definitions */
+#define SOCKET_REQ_CONTAINER 0x100
+#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_OK 0x0
+#define SOCKET_NO_FD 0x1
+#define SOCKET_ERR 0xFF
+
+/*
+ * we don't need to store device fd's anywhere since they can be obtained from
+ * the group fd via an ioctl() call.
+ */
+struct vfio_group {
+             int group_no;
+             int fd;
+};
+
+struct vfio_config {
+             int vfio_enabled;
+             int vfio_container_fd;
+             int vfio_container_has_dma;
+             int vfio_group_idx;
+             struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
+};
+
+/* per-process VFIO config */
+struct vfio_config vfio_cfg;
+
+pthread_t socket_thread;
+
+#endif
+
#endif /* EAL_PCI_INIT_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_vfio.h b/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
index ca4982b..32953c0 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
@@ -42,6 +42,12 @@
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0)
#include <linux/vfio.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#define RTE_PCI_MSIX_TABLE_BIR 0x7
+#else
+#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
+#endif
+
#define VFIO_PRESENT
#endif /* kernel version */
#endif /* RTE_EAL_VFIO */
--
1.7.0.7


* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-27 16:21     ` Burakov, Anatoly
@ 2014-05-27 16:36       ` Thomas Monjalon
  0 siblings, 0 replies; 10+ messages in thread
From: Thomas Monjalon @ 2014-05-27 16:36 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: dev

2014-05-27 16:21, Burakov, Anatoly:

> > You are defining some variables in a .h file. I think it is a problem.
> 
> I have managed to move everything to .c files, except for "struct
> mapped_pci_res_list *pci_res_list;" - which I need in both uio and vfio .c
> files. I don't think I'll be able to move it out of the eal_pci_init header
> file. Should declaring it as extern be fine as a compromise?

I think it's acceptable.
Like this one:
	extern struct pci_device_list pci_device_list;
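
i.e. the header keeps only the extern declaration, and exactly one .c file
provides the definition. A rough sketch (the file chosen for the definition is
only an example):

	/* eal_pci_init.h: declaration visible to both uio and vfio code */
	extern struct mapped_pci_res_list *pci_res_list;

	/* eal_pci.c: the single definition */
	struct mapped_pci_res_list *pci_res_list = NULL;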

-- 
Thomas

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-22 11:53   ` Thomas Monjalon
  2014-05-22 12:06     ` Burakov, Anatoly
@ 2014-05-27 16:21     ` Burakov, Anatoly
  2014-05-27 16:36       ` Thomas Monjalon
  1 sibling, 1 reply; 10+ messages in thread
From: Burakov, Anatoly @ 2014-05-27 16:21 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

> You are defining some variables in a .h file. I think it is a problem.

I have managed to move everything to .c files, except for "struct mapped_pci_res_list *pci_res_list;" - which I need in both uio and vfio .c files. I don't think I'll be able to move it out of the eal_pci_init header file. Should declaring it as extern be fine as a compromise?

Best regards,
Anatoly Burakov
DPDK SW Engineer

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-22 12:46           ` Thomas Monjalon
@ 2014-05-22 12:54             ` Burakov, Anatoly
  0 siblings, 0 replies; 10+ messages in thread
From: Burakov, Anatoly @ 2014-05-22 12:54 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

> Yes I agree. But I stopped on the name for another thing: it's not really
> specific to vfio. Actually, vfio uses it for synchronization. But wouldn't it be
> more generic?

OK, _mp_sync it is then.

Best regards,
Anatoly Burakov
DPDK SW Engineer

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-22 12:37         ` Burakov, Anatoly
@ 2014-05-22 12:46           ` Thomas Monjalon
  2014-05-22 12:54             ` Burakov, Anatoly
  0 siblings, 1 reply; 10+ messages in thread
From: Thomas Monjalon @ 2014-05-22 12:46 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: dev

2014-05-22 12:37, Burakov, Anatoly:
> > Yes, in some environments, it could be easier to be able to configure
> > devices directly on application command line instead of having to call a
> > python script. I think having a clear and extendable syntax to configure
> > devices in command line could greatly improve usability. But it can be
> > another step.
> 
> That's probably out of scope for this patch. We can discuss this later
> without stalling VFIO :)

Yes, I agree to discuss it later.

> > What do you think of _mp_sync or _mp_conf?
> > Usage of the socket is to synchronize VFIO config between processes,
> > right?
> 
> More or less, yes. However, the code inside that file is the communication
> mechanism. I.e. it's not actually synchronizing or configuring anything,
> it's simply providing means to do so for primary and secondary processes,
> so I don't think _mp_sync or _mp_conf is a good name for that. IMO
> something like _mp_socket or similar (_mp_comm?) would be more appropriate.

Yes I agree. But I stopped on the name for another thing: it's not really 
specific to vfio. Actually, vfio uses it for synchronization. But wouldn't it 
be more generic?

-- 
Thomas

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-22 12:28       ` Thomas Monjalon
@ 2014-05-22 12:37         ` Burakov, Anatoly
  2014-05-22 12:46           ` Thomas Monjalon
  0 siblings, 1 reply; 10+ messages in thread
From: Burakov, Anatoly @ 2014-05-22 12:37 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

> Yes, in some environments, it could be easier to be able to configure devices
> directly on application command line instead of having to call a python script.
> I think having a clear and extendable syntax to configure devices in command
> line could greatly improve usability. But it can be another step.

That's probably out of scope for this patch. We can discuss this later without stalling VFIO :)

> What do you think of _mp_sync or _mp_conf?
> Usage of the socket is to synchronize VFIO config between processes, right?

More or less, yes. However, the code inside that file is the communication mechanism. I.e. it's not actually synchronizing or configuring anything, it's simply providing means to do so for primary and secondary processes, so I don't think _mp_sync or _mp_conf is a good name for that. IMO something like _mp_socket or similar (_mp_comm?) would be more appropriate. 

> Oh yes. Do you think you could merge the thread spawning in the patch
> adding it?

Good point, I'll do that.

> So you should use an "extern" trick in order to have only one instance of the
> variables. But I think it's not a good practice.
> You probably need to group functions using these variables in one .c file.
> Or am I missing something?

I'll look into this.

Best regards,
Anatoly Burakov
DPDK SW Engineer

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-22 12:06     ` Burakov, Anatoly
@ 2014-05-22 12:28       ` Thomas Monjalon
  2014-05-22 12:37         ` Burakov, Anatoly
  0 siblings, 1 reply; 10+ messages in thread
From: Thomas Monjalon @ 2014-05-22 12:28 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: dev

2014-05-22 12:06, Burakov, Anatoly:
> > We should discuss a way to request igb_uio or VFIO binding of a device.
> 
> Why? The device can either be bound to VFIO or igb_uio. So unless you want
> binding code in DPDK EAL (to avoid which the
> pci_unbind/igb_uio_bind/dpdk_bind script was created in the first place), I
> see no point in that. The dpdk_bind script already does that (you bind
> either to igb_uio or to vfio-pci).

Yes, in some environments, it could be easier to be able to configure devices 
directly on application command line instead of having to call a python 
script.
I think having a clear and extendable syntax to configure devices in command 
line could greatly improve usability. But it can be another step.

> > This whole socket communication deserves a separate patch with a protocol
> > description.
> 
> Agreed, I'll break it up and provide a more detailed explanation.

Thanks.

> > By the way, I'm not a big fan of the suffix "_socket" which can be
> > misleading. But I have no other good naming idea.
> 
> Would _mp_socket do?

What do you think of _mp_sync or _mp_conf?
Usage of the socket is to synchronize VFIO config between processes, right?

> > So we have another thread to manage.
> > I don't see where it is spawned?
> 
> In rte_eal_pci_init().

Oh yes. Do you think you could merge the thread spawning in the patch adding 
it?

> > You are defining some variables in a .h file. I think it is a problem.
> 
> Well, they need to be shared between several .c files.

So you should use an "extern" trick in order to have only one instance of the 
variables. But I think it's not a good practice.
You probably need to group functions using these variables in one .c file.
Or am I missing something?

> > Here are some other relevant errors from checkpatch.pl:
> Thanks, I'll fix those.

Thank you
-- 
Thomas

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-22 11:53   ` Thomas Monjalon
@ 2014-05-22 12:06     ` Burakov, Anatoly
  2014-05-22 12:28       ` Thomas Monjalon
  2014-05-27 16:21     ` Burakov, Anatoly
  1 sibling, 1 reply; 10+ messages in thread
From: Burakov, Anatoly @ 2014-05-22 12:06 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

> How did you test this feature?
> Did you see some performance differences with igb_uio?

The same way everything else is tested - bind a NIC to the driver and see if it works :-)

As for performance differences, potentially it can be degraded a bit because of mandatory IOMMU involvement, but I did not see any performance impact during my tests.

> For the sake of history, it's better to explain in another patch that eal_hpet
> has been renamed eal_timer and that there is no longer any such need in this file.

Agreed.

> 
> We should discuss a way to request igb_uio or VFIO binding of a device.

Why? The device can either be bound to VFIO or igb_uio. So unless you want binding code in DPDK EAL (to avoid which the pci_unbind/igb_uio_bind/dpdk_bind script was created in the first place), I see no point in that. The dpdk_bind script already does that (you bind either to igb_uio or to vfio-pci).

> This whole socket communication deserves a separate patch with a protocol
> description.

Agreed, I'll break it up and provide a more detailed explanation.

> By the way, I'm not a big fan of the suffix "_socket" which can be misleading.
> But I have no other good naming idea.

Would _mp_socket do?
 
> So we have another thread to manage.
> I don't see where it is spawned?

In rte_eal_pci_init().
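
Roughly like this (a sketch only; vfio_mp_socket_fd is a made-up name here, and
the exact placement and error handling inside rte_eal_pci_init() may differ):

	/* primary process: create the listening socket, then spawn the thread */
	static int vfio_mp_socket_fd;

	if (internal_config.process_type == RTE_PROC_PRIMARY) {
		vfio_mp_socket_fd = pci_vfio_socket_setup();
		if (vfio_mp_socket_fd < 0)
			return -1;
		if (pthread_create(&socket_thread, NULL,
				pci_vfio_socket_thread,
				&vfio_mp_socket_fd) != 0)
			return -1;
	}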

> You are defining some variables in a .h file. I think it is a problem.

Well, they need to be shared between several .c files.
 
> Here are some other relevant errors from checkpatch.pl:

Thanks, I'll fix those.

Best regards,
Anatoly Burakov
DPDK SW Engineer

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-19 15:51 ` [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO Anatoly Burakov
@ 2014-05-22 11:53   ` Thomas Monjalon
  2014-05-22 12:06     ` Burakov, Anatoly
  2014-05-27 16:21     ` Burakov, Anatoly
  0 siblings, 2 replies; 10+ messages in thread
From: Thomas Monjalon @ 2014-05-22 11:53 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev

Hi Anatoly,

It seems to be the main patch, so I have many comments.

2014-05-19 16:51, Anatoly Burakov:
> VFIO is kernel 3.6+ only, and so is only compiled when DPDK config
> option CONFIG_RTE_EAL_VFIO is enabled, and kernel 3.6 or higher is
> detected, thus preventing compile failures on older kernels if VFIO is
> enabled in config (and it is, by default).
> 
> Since VFIO cannot be used to map the same device twice, secondary
> processes receive the device/group fd's by means of communicating over a
> local socket. Only group and container fd's should be sent, as device
> fd's can be obtained via ioctl() calls on the group fd.
> 
> For multiprocess, VFIO distinguishes between existing but unused groups
> (i.e. groups that aren't bound to the VFIO driver) and non-existent groups in
> order to know if the secondary process requests a valid group, or if
> the secondary process requests something that doesn't exist.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>

How did you test this feature?
Did you see some performance differences with igb_uio?

>  # workaround for a gcc bug with noreturn attribute
>  # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
>  ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
>  CFLAGS_eal_thread.o += -Wno-return-type
> -CFLAGS_eal_hpet.o += -Wno-return-type

For the sake of history, it's better to explain in another patch that eal_hpet
has been renamed eal_timer and that there is no longer any such need in this file.

> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
[...]
> + * This code tries to determine if the PCI device is bound to VFIO driver,

We should discuss a way to request igb_uio or VFIO binding of a device.

> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c

This whole socket communication deserves a separate patch with a protocol
description.
By the way, I'm not a big fan of the suffix "_socket" which can be misleading. 
But I have no other good naming idea.

> +/*
> + * socket listening thread for primary process
> + */
> +__attribute__((noreturn)) void *
> +pci_vfio_socket_thread(void *arg)

So we have another thread to manage.
I don't see where it is spawned?

> --- a/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
> +++ b/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
[...]
> +struct vfio_config vfio_cfg;
> +
> +pthread_t socket_thread;

You are defining some variables in a .h file. I think it is a problem.


Here are some other relevant errors from checkpatch.pl:

ERROR: "foo * bar" should be "foo *bar"
#197: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:64:
+pci_vfio_get_msix_bar(int fd, int * msix_bar)

ERROR: space required before the open brace '{'
#216: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:83:
+	while (cap_offset){

ERROR: "foo * bar" should be "foo *bar"
#301: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:168:
+	const struct rte_memseg * ms = rte_eal_get_physmem_layout();

ERROR: space required before the open parenthesis '('
#517: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:384:
+		switch(ret) {

ERROR: "foo * bar" should be "foo *bar"
#541: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:408:
+pci_vfio_get_group_no(const char * pci_addr)

ERROR: "foo * bar" should be "foo *bar"
#545: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:412:
+	char * tok[16], *group_tok, *end;

ERROR: else should follow close brace '}'
#673: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:540:
+	}
+	else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {

WARNING: space prohibited between function name and open parenthesis '('
#751: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:618:
+		if ((vfio_res = rte_zmalloc("VFIO_RES", sizeof (*vfio_res), 0)) == 
NULL) {

ERROR: "foo * bar" should be "foo *bar"
#784: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:651:
+		void * bar_addr;

ERROR: return is not a function, parentheses are not required
#850: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio.c:717:
+	return (0);

ERROR: space required before the open parenthesis '('
#933: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c:75:
+		} while(0)

WARNING: Single statement macros should not use a do {} while (0) loop
#934: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c:76:
+#define CMSGHDR_TO_FD(chdr,fd) \
+		do {\
+			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd));\
+		} while (0)

ERROR: "foo * bar" should be "foo *bar"
#942: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c:84:
+get_socket_path(char * buffer, int bufsz)

ERROR: "foo * bar" should be "foo *bar"
#1026: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c:168:
+	struct cmsghdr * chdr;

ERROR: "foo * bar" should be "foo *bar"
#1057: FILE: lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c:199:
+	struct cmsghdr * chdr;

ERROR: "foo * bar" should be "foo *bar"
#1284: FILE: lib/librte_eal/linuxapp/eal/include/eal_pci_init.h:87:
+void * pci_vfio_socket_thread(void *arg);


Thanks
-- 
Thomas

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO.
  2014-05-01 11:05 [dpdk-dev] [PATCH 00/16] [RFC] [VFIO] Add VFIO support to DPDK Burakov, Anatoly
@ 2014-05-19 15:51 ` Anatoly Burakov
  2014-05-22 11:53   ` Thomas Monjalon
  0 siblings, 1 reply; 10+ messages in thread
From: Anatoly Burakov @ 2014-05-19 15:51 UTC (permalink / raw)
  To: dev

VFIO is kernel 3.6+ only, and so is only compiled when DPDK config
option CONFIG_RTE_EAL_VFIO is enabled, and kernel 3.6 or higher is
detected, thus preventing compile failures on older kernels if VFIO is
enabled in config (and it is, by default).

Since VFIO cannot be used to map the same device twice, secondary
processes receive the device/group fd's by means of communicating over a
local socket. Only group and container fd's should be sent, as device
fd's can be obtained via ioctl() calls on the group fd.
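
(For background: handing an open descriptor to another process relies on
SCM_RIGHTS ancillary data sent over an AF_UNIX socket. The snippet below is a
minimal sketch of the sending side of that mechanism only; send_one_fd() is a
hypothetical helper used for illustration and is not part of this patch.)

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* sketch: hand one open fd to a peer as SCM_RIGHTS ancillary data */
static int
send_one_fd(int sock, int fd)
{
	struct msghdr hdr;
	struct iovec iov;
	struct cmsghdr *chdr;
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;	/* ensures correct alignment of buf */
	} control;
	int payload = 0;	/* at least one byte of ordinary data is required */

	memset(&hdr, 0, sizeof(hdr));
	memset(&control, 0, sizeof(control));
	iov.iov_base = &payload;
	iov.iov_len = sizeof(payload);
	hdr.msg_iov = &iov;
	hdr.msg_iovlen = 1;
	hdr.msg_control = control.buf;
	hdr.msg_controllen = sizeof(control.buf);

	chdr = CMSG_FIRSTHDR(&hdr);
	chdr->cmsg_level = SOL_SOCKET;
	chdr->cmsg_type = SCM_RIGHTS;
	chdr->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(chdr), &fd, sizeof(int));

	return sendmsg(sock, &hdr, 0) < 0 ? -1 : 0;
}

The receiving side mirrors this with recvmsg() and reads the descriptor back
out of CMSG_DATA(); the kernel installs a duplicate of the sender's descriptor
in the receiving process.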

For multiprocess, VFIO distinguishes between existing but unused groups
(i.e. groups that aren't bound to the VFIO driver) and non-existent groups in
order to know if the secondary process requests a valid group, or if
the secondary process requests something that doesn't exist.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/linuxapp/eal/Makefile               |    5 +-
 lib/librte_eal/linuxapp/eal/eal.c                  |    1 +
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c         |  719 ++++++++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c  |  367 ++++++++++
 .../linuxapp/eal/include/eal_internal_cfg.h        |    3 +
 lib/librte_eal/linuxapp/eal/include/eal_pci_init.h |   55 ++
 lib/librte_eal/linuxapp/eal/include/eal_vfio.h     |    6 +
 7 files changed, 1155 insertions(+), 1 deletions(-)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c

diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 527fa2a..3a39cca 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -58,6 +58,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_thread.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_log.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_uio.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio_socket.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_debug.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_lcore.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_timer.c
@@ -88,12 +90,13 @@ CFLAGS_eal_common_log.o := -D_GNU_SOURCE
 CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE
 CFLAGS_eal_pci.o := -D_GNU_SOURCE
 CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE
+CFLAGS_eal_pci_vfio.o := -D_GNU_SOURCE
 
 # workaround for a gcc bug with noreturn attribute
 # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
 ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
 CFLAGS_eal_thread.o += -Wno-return-type
-CFLAGS_eal_hpet.o += -Wno-return-type
+CFLAGS_eal_pci_vfio_socket.o += -Wno-return-type
 endif
 
 INC := rte_per_lcore.h rte_lcore.h rte_interrupts.h rte_kni_common.h rte_dom0_common.h
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index de182e1..01bfd6c 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -650,6 +650,7 @@ eal_parse_args(int argc, char **argv)
 	internal_config.force_sockets = 0;
 	internal_config.syslog_facility = LOG_DAEMON;
 	internal_config.xen_dom0_support = 0;
+	internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
 #ifdef RTE_LIBEAL_USE_HPET
 	internal_config.no_hpet = 0;
 #else
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
new file mode 100644
index 0000000..0a6f95c
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -0,0 +1,719 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <linux/pci_regs.h>
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_tailq.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+#include "eal_vfio.h"
+
+/**
+ * @file
+ * PCI probing under linux (VFIO version)
+ *
+ * This code tries to determine if the PCI device is bound to VFIO driver,
+ * and initialize it (map BARs, set up interrupts) if that's the case.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+/* get PCI BAR number where MSI-X interrupts are */
+static int
+pci_vfio_get_msix_bar(int fd, int * msix_bar)
+{
+	int ret;
+	uint32_t reg;
+	uint8_t cap_id, cap_offset;
+
+	/* read PCI capability pointer from config space */
+	ret = pread64(fd, &reg, sizeof(reg),
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+			PCI_CAPABILITY_LIST);
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+				"config space!\n");
+		return -1;
+	}
+
+	/* we need first byte */
+	cap_offset = reg & 0xFF;
+
+	while (cap_offset){
+
+		/* read PCI capability ID */
+		ret = pread64(fd, &reg, sizeof(reg),
+				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+				cap_offset);
+		if (ret != sizeof(reg)) {
+			RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
+					"config space!\n");
+			return -1;
+		}
+
+		/* we need first byte */
+		cap_id = reg & 0xFF;
+
+		/* if we haven't reached MSI-X, check next capability */
+		if (cap_id != PCI_CAP_ID_MSIX) {
+			ret = pread64(fd, &reg, sizeof(reg),
+					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+					cap_offset);
+			if (ret != sizeof(reg)) {
+				RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+						"config space!\n");
+				return -1;
+			}
+
+			/* we need second byte */
+			cap_offset = (reg & 0xFF00) >> 8;
+
+			continue;
+		}
+		/* else, read table offset */
+		else {
+			/* table offset resides in the next 4 bytes */
+			ret = pread64(fd, &reg, sizeof(reg),
+					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+					cap_offset + 4);
+			if (ret != sizeof(reg)) {
+				RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
+						"space!\n");
+				return -1;
+			}
+
+			*msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
+
+			return 0;
+		}
+	}
+	return 0;
+}
+
+/* set PCI bus mastering */
+static int
+pci_vfio_set_bus_master(int dev_fd)
+{
+	uint16_t reg;
+	int ret;
+
+	ret = pread64(dev_fd, &reg, sizeof(reg),
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+			PCI_COMMAND);
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
+		return -1;
+	}
+
+	/* set the master bit */
+	reg |= PCI_COMMAND_MASTER;
+
+	ret = pwrite64(dev_fd, &reg, sizeof(reg),
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+			PCI_COMMAND);
+
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/* set up DMA mappings */
+static int
+pci_vfio_setup_dma_maps(int vfio_container_fd)
+{
+	const struct rte_memseg * ms = rte_eal_get_physmem_layout();
+	int i, ret;
+
+	ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+			VFIO_TYPE1_IOMMU);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  cannot set IOMMU type!\n");
+		return -1;
+	}
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		struct vfio_iommu_type1_dma_map dma_map;
+
+		if (ms[i].addr == NULL)
+			break;
+
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = ms[i].addr_64;
+		dma_map.size = ms[i].len;
+		dma_map.iova = ms[i].phys_addr;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping!\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* set up interrupt support (but not enable interrupts) */
+static int
+pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd,
+		int num_irqs)
+{
+	int i, ret, intr_idx;
+	enum rte_intr_handle_type handle_type;
+
+	/* get interrupt type from internal config (MSI-X by default, can be
+	 * overridden from the command line)
+	 */
+	switch (internal_config.vfio_intr_mode) {
+	case RTE_INTR_MODE_MSIX:
+		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
+		handle_type = RTE_INTR_HANDLE_VFIO_MSIX;
+		break;
+	case RTE_INTR_MODE_LEGACY:
+		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
+		handle_type = RTE_INTR_HANDLE_VFIO_LEGACY;
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "  unknown default interrupt type!\n");
+		return -1;
+	}
+
+	for (i = 0; i < num_irqs; i++) {
+		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+		int fd = -1;
+
+		/* skip interrupt modes we don't want */
+		if (i != intr_idx)
+			continue;
+
+		irq.index = i;
+
+		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "  cannot get IRQ info!\n");
+			return -1;
+		}
+
+		/* fail if this vector cannot be used with eventfd */
+		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
+			RTE_LOG(ERR, EAL, "  interrupt vector does not support eventfd!\n");
+			return -1;
+		}
+
+		/* set up an eventfd for interrupts */
+		fd = eventfd(0, 0);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "  cannot set up eventfd!\n");
+			return -1;
+		}
+
+		dev->intr_handle.type = handle_type;
+		dev->intr_handle.fd = fd;
+		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+
+		return 0;
+	}
+
+	/* if we're here, we haven't found a suitable interrupt vector */
+	return -1;
+}
+
+/* open container fd or get an existing one */
+static int
+pci_vfio_get_container_fd(void)
+{
+	int ret, vfio_container_fd;
+
+	/* if we're in a primary process, try to open the container */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
+		if (vfio_container_fd < 0) {
+			RTE_LOG(ERR, EAL, "  cannot open VFIO container!\n");
+			return -1;
+		}
+
+		/* check VFIO API version */
+		ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
+		if (ret != VFIO_API_VERSION) {
+			RTE_LOG(ERR, EAL, "  unknown VFIO API version!\n");
+			close(vfio_container_fd);
+			return -1;
+		}
+
+		/* check if we support IOMMU type 1 */
+		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+		if (!ret) {
+			RTE_LOG(ERR, EAL, "  unknown IOMMU driver!\n");
+			close(vfio_container_fd);
+			return -1;
+		}
+
+		return vfio_container_fd;
+	}
+	/* if we're in a secondary process, request container fd from the primary
+	 * process via our socket
+	 */
+	else {
+		int socket_fd;
+		if ((socket_fd = vfio_socket_connect_to_primary()) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+			return -1;
+		}
+		if (vfio_socket_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+		vfio_container_fd = vfio_socket_receive_fd(socket_fd);
+		if (vfio_container_fd < 0) {
+			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+		close(socket_fd);
+		return vfio_container_fd;
+	}
+
+	return -1;
+}
+
+/* open group fd or get an existing one */
+static int
+pci_vfio_get_group_fd(int iommu_group_no)
+{
+	int i;
+	int vfio_group_fd;
+	char filename[PATH_MAX];
+
+	/* check if we already have the group descriptor open */
+	for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
+		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
+			return vfio_cfg.vfio_groups[i].fd;
+
+	/* if primary, try to open the group */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		rte_snprintf(filename, sizeof(filename),
+				 VFIO_GROUP_FMT, iommu_group_no);
+		vfio_group_fd = open(filename, O_RDWR);
+		if (vfio_group_fd < 0) {
+			/* if file not found, it's not an error */
+			if (errno != ENOENT) {
+				RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+						strerror(errno));
+				return -1;
+			}
+			return 0;
+		}
+
+		/* if the fd is valid, create a new group for it */
+		if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
+			RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+			return -1;
+		}
+		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
+		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+		return vfio_group_fd;
+	}
+	/* if we're in a secondary process, request group fd from the primary
+	 * process via our socket
+	 */
+	else {
+		int socket_fd, ret;
+		if ((socket_fd = vfio_socket_connect_to_primary()) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+			return -1;
+		}
+		if (vfio_socket_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+		if (vfio_socket_send_request(socket_fd, iommu_group_no) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
+			close(socket_fd);
+			return -1;
+		}
+		ret = vfio_socket_receive_request(socket_fd);
+		switch(ret) {
+		case SOCKET_NO_FD:
+			close(socket_fd);
+			return 0;
+		case SOCKET_OK:
+			vfio_group_fd = vfio_socket_receive_fd(socket_fd);
+			/* if we got the fd, return it */
+			if (vfio_group_fd > 0) {
+				close(socket_fd);
+				return vfio_group_fd;
+			}
+			/* fall-through on error */
+		default:
+			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+	}
+	return -1;
+}
+
+/* parse IOMMU group number for a PCI device
+ * returns -1 for errors, 0 for non-existent group */
+static int
+pci_vfio_get_group_no(const char * pci_addr)
+{
+	char linkname[PATH_MAX];
+	char filename[PATH_MAX];
+	char * tok[16], *group_tok, *end;
+	int ret, iommu_group_no;
+
+	memset(linkname, 0, sizeof(linkname));
+	memset(filename, 0, sizeof(filename));
+
+	/* try to find out IOMMU group for this device */
+	rte_snprintf(linkname, sizeof(linkname),
+			 SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr);
+
+	ret = readlink(linkname, filename, sizeof(filename));
+
+	/* if the link doesn't exist, no VFIO for us */
+	if (ret < 0)
+		return 0;
+
+	ret = rte_strsplit(filename, sizeof(filename),
+			tok, RTE_DIM(tok), '/');
+
+	if (ret <= 0) {
+		RTE_LOG(ERR, EAL, "  %s cannot get IOMMU group\n", pci_addr);
+		return -1;
+	}
+
+	/* IOMMU group is always the last token */
+	errno = 0;
+	group_tok = tok[ret - 1];
+	end = group_tok;
+	iommu_group_no = strtol(group_tok, &end, 10);
+	if ((end != group_tok && *end != '\0') || errno != 0) {
+		RTE_LOG(ERR, EAL, "  %s error parsing IOMMU number!\n", pci_addr);
+		return -1;
+	}
+
+	return iommu_group_no;
+}
+
+static void
+clear_current_group(void)
+{
+	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
+	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
+}
+
+
+/*
+ * map the PCI resources of a PCI device in virtual memory (VFIO version).
+ * primary and secondary processes follow almost exactly the same path
+ */
+int
+pci_vfio_map_resource(struct rte_pci_device *dev)
+{
+	struct vfio_group_status group_status =
+					{ .argsz = sizeof(group_status) };
+	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+	int vfio_group_fd, vfio_dev_fd;
+	int iommu_group_no;
+	char pci_addr[PATH_MAX] = {0};
+	struct rte_pci_addr *loc = &dev->addr;
+	int i, ret, msix_bar;
+	struct mapped_pci_resource *vfio_res = NULL;
+	struct pci_map *maps;
+
+	dev->intr_handle.fd = -1;
+	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+
+	/* store PCI address string */
+	rte_snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+			loc->domain, loc->bus, loc->devid, loc->function);
+
+	/* get container fd (needs to be done only once per initialization) */
+	if (vfio_cfg.vfio_container_fd == -1) {
+		int vfio_container_fd = pci_vfio_get_container_fd();
+		if (vfio_container_fd < 0) {
+			RTE_LOG(ERR, EAL, "  %s cannot open VFIO container!\n", pci_addr);
+			return -1;
+		}
+
+		vfio_cfg.vfio_container_fd = vfio_container_fd;
+	}
+
+	/* get group number */
+	iommu_group_no = pci_vfio_get_group_no(pci_addr);
+
+	/* if 0, group doesn't exist */
+	if (iommu_group_no == 0) {
+		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+				pci_addr);
+		return 1;
+	}
+	/* if negative, something failed */
+	else if (iommu_group_no < 0)
+		return -1;
+
+	/* get the actual group fd */
+	vfio_group_fd = pci_vfio_get_group_fd(iommu_group_no);
+	if (vfio_group_fd < 0) {
+		return -1;
+	}
+
+	/* store group fd */
+	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
+	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+
+	/* if group_fd == 0, that means the device isn't managed by VFIO */
+	if (vfio_group_fd == 0) {
+		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+				pci_addr);
+		/* we store 0 as group fd to distinguish between existing but
+		 * unbound VFIO groups, and groups that don't exist at all.
+		 */
+		vfio_cfg.vfio_group_idx++;
+		return 1;
+	}
+
+	/*
+	 * at this point, we know at least one port on this device is bound to VFIO,
+	 * so we can proceed to try and set this particular port up
+	 */
+
+	/* check if the group is viable */
+	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  %s cannot get group status!\n", pci_addr);
+		close(vfio_group_fd);
+		clear_current_group();
+		return -1;
+	}
+	else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", pci_addr);
+		close(vfio_group_fd);
+		clear_current_group();
+		return -1;
+	}
+
+	/*
+	 * at this point, we know that this group is viable (meaning, all devices
+	 * are either bound to VFIO or not bound to anything)
+	 */
+
+	/* check if group does not have a container yet */
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+
+		/* add group to a container */
+		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+				&vfio_cfg.vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container!\n",
+					pci_addr);
+			close(vfio_group_fd);
+			clear_current_group();
+			return -1;
+		}
+		/*
+		 * at this point we know that this group has been successfully
+		 * initialized, so we increment vfio_group_idx to indicate that we can
+		 * add new groups.
+		 */
+		vfio_cfg.vfio_group_idx++;
+	}
+
+	/*
+	 * set up DMA mappings for container (needs to be done only once, only when
+	 * at least one group is assigned to a container and only in primary process)
+	 */
+	if (internal_config.process_type == RTE_PROC_PRIMARY &&
+			vfio_cfg.vfio_container_has_dma == 0) {
+		ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  %s DMA remapping failed!\n", pci_addr);
+			return -1;
+		}
+		vfio_cfg.vfio_container_has_dma = 1;
+	}
+
+	/* get a file descriptor for the device */
+	vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr);
+	if (vfio_dev_fd < 0) {
+		/* if we cannot get a device fd, this simply means that this
+		 * particular port is not bound to VFIO
+		 */
+		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+				pci_addr);
+		return 1;
+	}
+
+	/* test and setup the device */
+	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &device_info);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  %s cannot get device info!\n", pci_addr);
+		close(vfio_dev_fd);
+		return -1;
+	}
+
+	/* get MSI-X BAR, if any (we have to know where it is because we can't
+	 * mmap it when using VFIO) */
+	msix_bar = -1;
+	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n", pci_addr);
+		close(vfio_dev_fd);
+		return -1;
+	}
+
+	/* if we're in a primary process, allocate vfio_res and get region info */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		if ((vfio_res = rte_zmalloc("VFIO_RES", sizeof (*vfio_res), 0)) == NULL) {
+			RTE_LOG(ERR, EAL,
+				"%s(): cannot store uio mmap details\n", __func__);
+			close(vfio_dev_fd);
+			return -1;
+		}
+		memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
+
+		/* get number of registers (up to BAR5) */
+		vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
+				VFIO_PCI_BAR5_REGION_INDEX + 1);
+	}
+	/* if we're in a secondary process, just find our tailq entry and use that */
+	else {
+		TAILQ_FOREACH(vfio_res, pci_res_list, next) {
+			if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+				continue;
+			break;
+		}
+		/* if we haven't found our tailq entry, something's wrong */
+		if (vfio_res == NULL) {
+			RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
+					pci_addr);
+			close(vfio_dev_fd);
+			return -1;
+		}
+	}
+
+	/* map BARs */
+	maps = vfio_res->maps;
+
+	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+		struct vfio_region_info reg = { .argsz = sizeof(reg) };
+		void * bar_addr;
+
+		reg.index = i;
+
+		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  %s cannot get device region info!\n",
+					pci_addr);
+			close(vfio_dev_fd);
+			if (internal_config.process_type == RTE_PROC_PRIMARY)
+				rte_free(vfio_res);
+			return -1;
+		}
+
+		/* skip non-mmapable BARs */
+		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+			continue;
+
+		/* skip MSI-X BAR */
+		if (i == msix_bar)
+			continue;
+
+		bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset,
+				reg.size);
+
+		if (bar_addr == NULL) {
+			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n", pci_addr, i,
+					strerror(errno));
+			close(vfio_dev_fd);
+			if (internal_config.process_type == RTE_PROC_PRIMARY)
+				rte_free(vfio_res);
+			return -1;
+		}
+
+		maps[i].addr = bar_addr;
+		maps[i].offset = reg.offset;
+		maps[i].size = reg.size;
+		dev->mem_resource[i].addr = bar_addr;
+	}
+
+	/* if secondary process, do not set up interrupts */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		if (pci_vfio_setup_interrupts(dev, vfio_dev_fd,
+				(int) device_info.num_irqs) != 0) {
+			RTE_LOG(ERR, EAL, "  %s error setting up interrupts!\n", pci_addr);
+			close(vfio_dev_fd);
+			rte_free(vfio_res);
+			return -1;
+		}
+
+		/* set bus mastering for the device */
+		if (pci_vfio_set_bus_master(vfio_dev_fd)) {
+			RTE_LOG(ERR, EAL, "  %s cannot set up bus mastering!\n", pci_addr);
+			close(vfio_dev_fd);
+			rte_free(vfio_res);
+			return -1;
+		}
+
+		/* Reset the device */
+		ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
+	}
+
+	if (internal_config.process_type == RTE_PROC_PRIMARY)
+		TAILQ_INSERT_TAIL(pci_res_list, vfio_res, next);
+
+	return (0);
+}
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c
new file mode 100644
index 0000000..1605fce
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+
+/* sys/un.h with __USE_MISC uses strlen, which is unsafe and should not be used. */
+#ifdef __USE_MISC
+#define REMOVED_USE_MISC
+#undef __USE_MISC
+#endif
+#include <sys/un.h>
+/* make sure we redefine __USE_MISC only if it was previously undefined */
+#ifdef REMOVED_USE_MISC
+#define __USE_MISC
+#undef REMOVED_USE_MISC
+#endif
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_tailq.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+
+/**
+ * @file
+ * VFIO socket for communication between primary and secondary processes.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
+#define CMSGLEN (CMSG_LEN(sizeof(int)))
+#define FD_TO_CMSGHDR(fd,chdr) \
+		do {\
+			(chdr).cmsg_len = CMSGLEN;\
+			(chdr).cmsg_level = SOL_SOCKET;\
+			(chdr).cmsg_type = SCM_RIGHTS;\
+			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
+		} while(0)
+#define CMSGHDR_TO_FD(chdr,fd) \
+		do {\
+			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd));\
+		} while (0)
+
+
+/* get socket path (/var/run if root, $HOME otherwise) */
+static void
+get_socket_path(char * buffer, int bufsz)
+{
+	const char *dir = "/var/run";
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		dir = home_dir;
+
+	/* use current prefix as file path */
+	rte_snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
+			internal_config.hugefile_prefix);
+}
+
+
+
+/*
+ * data flow for socket comm protocol:
+ * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
+ * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
+ * 2. server receives message
+ * 2a. in case of invalid group, SOCKET_ERR is sent back to client
+ * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
+ * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
+ *
+ * in case of any error, socket is closed.
+ */
+
+/* send a request, return -1 on error */
+int
+vfio_socket_send_request(int socket, int req)
+{
+	struct msghdr hdr;
+	struct iovec iov;
+	int buf;
+	int ret;
+
+	memset(&hdr, 0, sizeof(hdr));
+
+	buf = req;
+
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+	iov.iov_base = (char*) &buf;
+	iov.iov_len = sizeof(buf);
+
+	ret = sendmsg(socket, &hdr, 0);
+	if (ret < 0)
+		return -1;
+	return 0;
+}
+
+/* receive a request and return it */
+int
+vfio_socket_receive_request(int socket)
+{
+	int buf;
+	struct msghdr hdr;
+	struct iovec iov;
+	int ret, req;
+
+	memset(&hdr, 0, sizeof(hdr));
+
+	buf = SOCKET_ERR;
+
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+	iov.iov_base = (char*) &buf;
+	iov.iov_len = sizeof(buf);
+
+	ret = recvmsg(socket, &hdr, 0);
+	if (ret < 0)
+		return -1;
+
+	req = buf;
+
+	return req;
+}
+
+/* send OK in message, fd in control message */
+int
+vfio_socket_send_fd(int socket, int fd)
+{
+	int buf;
+	struct msghdr hdr;
+	struct cmsghdr * chdr;
+	char chdr_buf[CMSGLEN];
+	struct iovec iov;
+	int ret;
+
+	chdr = (struct cmsghdr *) chdr_buf;
+	memset(chdr, 0, sizeof(chdr_buf));
+	memset(&hdr, 0, sizeof(hdr));
+
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+	iov.iov_base = (char*) &buf;
+	iov.iov_len = sizeof(buf);
+	hdr.msg_control = chdr;
+	hdr.msg_controllen = CMSGLEN;
+
+	buf = SOCKET_OK;
+	FD_TO_CMSGHDR(fd, *chdr);
+
+	ret = sendmsg(socket, &hdr, 0);
+	if (ret < 0)
+		return -1;
+	return 0;
+}
+
+/* receive OK in message, fd in control message */
+int
+vfio_socket_receive_fd(int socket)
+{
+	int buf;
+	struct msghdr hdr;
+	struct cmsghdr * chdr;
+	char chdr_buf[CMSGLEN];
+	struct iovec iov;
+	int ret, req, fd;
+
+	buf = SOCKET_ERR;
+
+	chdr = (struct cmsghdr *) chdr_buf;
+	memset(chdr, 0, sizeof(chdr_buf));
+	memset(&hdr, 0, sizeof(hdr));
+
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+	iov.iov_base = (char*) &buf;
+	iov.iov_len = sizeof(buf);
+	hdr.msg_control = chdr;
+	hdr.msg_controllen = CMSGLEN;
+
+	ret = recvmsg(socket, &hdr, 0);
+	if (ret < 0)
+		return -1;
+
+	req = buf;
+
+	if (req != SOCKET_OK)
+		return -1;
+
+	CMSGHDR_TO_FD(*chdr, fd);
+
+	return fd;
+}
+
+/* connect socket_fd in secondary process to the primary process's socket */
+int
+vfio_socket_connect_to_primary(void)
+{
+	struct sockaddr_un addr;
+	socklen_t sockaddr_len;
+	int socket_fd;
+
+	/* set up a socket */
+	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+	if (socket_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+		return -1;
+	}
+
+	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
+	addr.sun_family = AF_UNIX;
+
+	sockaddr_len = sizeof(struct sockaddr_un);
+
+	if (connect(socket_fd, (struct sockaddr*) &addr, sockaddr_len) == 0)
+		return socket_fd;
+
+	/* if connect failed */
+	close(socket_fd);
+	return -1;
+}
+
+
+
+/*
+ * socket listening thread for primary process
+ */
+__attribute__((noreturn)) void *
+pci_vfio_socket_thread(void *arg)
+{
+	int ret, i, vfio_group_no;
+	int socket_fd = *(int*) arg;
+
+	/* wait for requests on the socket */
+	for (;;) {
+		int conn_sock;
+		struct sockaddr_un addr;
+		socklen_t sockaddr_len = sizeof(addr);
+
+		/* this is a blocking call */
+		conn_sock = accept(socket_fd, (struct sockaddr*) &addr, &sockaddr_len);
+
+		/* just restart on error */
+		if (conn_sock == -1)
+			continue;
+
+		/* set socket to linger after close */
+		struct linger l;
+		l.l_onoff = 1;
+		l.l_linger = 60;
+		setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
+
+		ret = vfio_socket_receive_request(conn_sock);
+
+		switch (ret) {
+		case SOCKET_REQ_CONTAINER:
+			vfio_socket_send_fd(conn_sock, vfio_cfg.vfio_container_fd);
+			break;
+		case SOCKET_REQ_GROUP:
+			/* wait for group number */
+			vfio_group_no = vfio_socket_receive_request(conn_sock);
+			if (vfio_group_no < 0) {
+				close(conn_sock);
+				continue;
+			}
+			for (i = 0; i < vfio_cfg.vfio_group_idx; i++) {
+				if (vfio_cfg.vfio_groups[i].group_no == vfio_group_no)
+					break;
+			}
+			/* if we reached end of the list, the group doesn't exist */
+			if (i == vfio_cfg.vfio_group_idx)
+				vfio_socket_send_request(conn_sock, SOCKET_ERR);
+			/* if VFIO group exists but isn't bound to VFIO driver */
+			else if (vfio_cfg.vfio_groups[i].fd == 0)
+				vfio_socket_send_request(conn_sock, SOCKET_NO_FD);
+			/* if group exists and is bound to VFIO driver */
+			else {
+				vfio_socket_send_request(conn_sock, SOCKET_OK);
+				vfio_socket_send_fd(conn_sock, vfio_cfg.vfio_groups[i].fd);
+			}
+			break;
+		default:
+			vfio_socket_send_request(conn_sock, SOCKET_ERR);
+			break;
+		}
+		close(conn_sock);
+	}
+}
+
+/*
+ * set up a local socket and tell it to listen for incoming connections
+ */
+int
+pci_vfio_socket_setup(void)
+{
+	int ret, socket_fd;
+	struct sockaddr_un addr;
+	socklen_t sockaddr_len;
+
+	/* set up a socket */
+	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+	if (socket_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+		return -1;
+	}
+
+	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
+	addr.sun_family = AF_UNIX;
+
+	sockaddr_len = sizeof(struct sockaddr_un);
+
+	unlink(addr.sun_path);
+
+	ret = bind(socket_fd, (struct sockaddr*) &addr, sockaddr_len);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
+		close(socket_fd);
+		return -1;
+	}
+
+	ret = listen(socket_fd, 50);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
+		close(socket_fd);
+		return -1;
+	}
+
+	return socket_fd;
+}
+
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h b/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
index 92e3065..5468b0a 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
@@ -40,6 +40,7 @@
 #define _EAL_LINUXAPP_INTERNAL_CFG
 
 #include <rte_eal.h>
+#include <rte_pci_dev_feature_defs.h>
 
 #define MAX_HUGEPAGE_SIZES 3  /**< support up to 3 page sizes */
 
@@ -76,6 +77,8 @@ struct internal_config {
 	volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
 	uintptr_t base_virtaddr;          /**< base address to try and reserve memory from */
 	volatile int syslog_facility;	  /**< facility passed to openlog() */
+	/** default interrupt mode for VFIO */
+	volatile enum rte_intr_mode vfio_intr_mode;
 	const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
 	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
 
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
index 699e80d..b163ab5 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
@@ -34,6 +34,8 @@
 #ifndef EAL_PCI_INIT_H_
 #define EAL_PCI_INIT_H_
 
+#include "eal_vfio.h"
+
 struct pci_map {
 	void *addr;
 	uint64_t offset;
@@ -62,4 +64,57 @@ void * pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size)
 /* map IGB_UIO resource prototype */
 int pci_uio_map_resource(struct rte_pci_device *dev);
 
+#ifdef VFIO_PRESENT
+
+#define VFIO_MAX_GROUPS 64
+#define VFIO_DIR "/dev/vfio"
+#define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
+#define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
+
+/* map VFIO resource prototype */
+int pci_vfio_map_resource(struct rte_pci_device *dev);
+
+/*
+ * Function prototypes for VFIO socket functions
+ */
+int vfio_socket_send_request(int socket, int req);
+int vfio_socket_receive_request(int socket);
+int vfio_socket_send_fd(int socket, int fd);
+int vfio_socket_receive_fd(int socket);
+int vfio_socket_connect_to_primary(void);
+int pci_vfio_socket_setup(void);
+void * pci_vfio_socket_thread(void *arg);
+
+/* socket comm protocol definitions */
+#define SOCKET_REQ_CONTAINER 0x100
+#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_OK 0x0
+#define SOCKET_NO_FD 0x1
+#define SOCKET_ERR 0xFF
+
+/*
+ * we don't need to store device fd's anywhere since they can be obtained from
+ * the group fd via an ioctl() call.
+ */
+struct vfio_group {
+	int group_no;
+	int fd;
+};
+
+struct vfio_config {
+	int vfio_enabled;
+	int vfio_container_fd;
+	int vfio_container_has_dma;
+	int vfio_group_idx;
+	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
+};
+
+/* per-process VFIO config */
+struct vfio_config vfio_cfg;
+
+pthread_t socket_thread;
+
+#endif
+
 #endif /* EAL_PCI_INIT_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_vfio.h b/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
index ca4982b..32953c0 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
@@ -42,6 +42,12 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0)
 #include <linux/vfio.h>
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#define RTE_PCI_MSIX_TABLE_BIR 0x7
+#else
+#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
+#endif
+
 #define VFIO_PRESENT
 #endif /* kernel version */
 #endif /* RTE_EAL_VFIO */
-- 
1.7.0.7

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2014-05-27 16:36 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-27  3:19 [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO Xu, HuilongX
  -- strict thread matches above, loose matches on Subject: below --
2014-05-01 11:05 [dpdk-dev] [PATCH 00/16] [RFC] [VFIO] Add VFIO support to DPDK Burakov, Anatoly
2014-05-19 15:51 ` [dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through VFIO Anatoly Burakov
2014-05-22 11:53   ` Thomas Monjalon
2014-05-22 12:06     ` Burakov, Anatoly
2014-05-22 12:28       ` Thomas Monjalon
2014-05-22 12:37         ` Burakov, Anatoly
2014-05-22 12:46           ` Thomas Monjalon
2014-05-22 12:54             ` Burakov, Anatoly
2014-05-27 16:21     ` Burakov, Anatoly
2014-05-27 16:36       ` Thomas Monjalon
