From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga01.intel.com (mga01.intel.com [192.55.52.88]) by dpdk.org (Postfix) with ESMTP id 3C055968 for ; Tue, 19 Sep 2017 13:41:00 +0200 (CEST) Received: from fmsmga003.fm.intel.com ([10.253.24.29]) by fmsmga101.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 19 Sep 2017 04:40:53 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.42,418,1500966000"; d="scan'208";a="901788376" Received: from aburakov-mobl2.ger.corp.intel.com (HELO [10.237.210.143]) ([10.237.210.143]) by FMSMGA003.fm.intel.com with ESMTP; 19 Sep 2017 04:40:52 -0700 To: Jonas Pfefferle , dev@dpdk.org References: <1502969759-11036-1-git-send-email-jpf@zurich.ibm.com> From: "Burakov, Anatoly" Message-ID: <8c14c677-6341-6b3c-aa20-36a9297b1f94@intel.com> Date: Tue, 19 Sep 2017 12:40:51 +0100 User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64; rv:52.0) Gecko/20100101 Thunderbird/52.3.0 MIME-Version: 1.0 In-Reply-To: <1502969759-11036-1-git-send-email-jpf@zurich.ibm.com> Content-Type: text/plain; charset=utf-8; format=flowed Content-Language: en-US Content-Transfer-Encoding: 7bit Subject: Re: [dpdk-dev] [PATCH] vfio: refactor PCI BAR mapping X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 19 Sep 2017 11:41:01 -0000 Hi Jonas, On 17-Aug-17 12:35 PM, Jonas Pfefferle wrote: > Split pci_vfio_map_resource for primary and secondary processes. > Save all relevant mapping data in primary process to allow > the secondary process to perform mappings. > > Signed-off-by: Jonas Pfefferle > --- > lib/librte_eal/common/include/rte_pci.h | 7 + > lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 447 +++++++++++++++++------------ > 2 files changed, 271 insertions(+), 183 deletions(-) > > diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h > index 8b12339..0821af9 100644 > --- a/lib/librte_eal/common/include/rte_pci.h > +++ b/lib/librte_eal/common/include/rte_pci.h > @@ -214,6 +214,12 @@ struct pci_map { > uint64_t phaddr; > }; > > +struct pci_msix_table { > + int bar_index; > + uint32_t offset; > + uint32_t size; > +}; > + > /** > * A structure describing a mapped PCI resource. > * For multi-process we need to reproduce all PCI mappings in secondary > @@ -226,6 +232,7 @@ struct mapped_pci_resource { > char path[PATH_MAX]; > int nb_maps; > struct pci_map maps[PCI_MAX_RESOURCE]; > + struct pci_msix_table msix_table; > }; > > /** mapped pci device list */ > diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c > index aa9d96e..f37552a 100644 > --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c > +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c > @@ -88,8 +88,7 @@ pci_vfio_write_config(const struct rte_intr_handle *intr_handle, > > /* get PCI BAR number where MSI-X interrupts are */ > static int > -pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, > - uint32_t *msix_table_size) > +pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) > { > int ret; > uint32_t reg; > @@ -161,9 +160,10 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, > return -1; > } > > - *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR; > - *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; > - *msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); > + msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR; > + msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; > + msix_table->size = > + 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); > > return 0; > } > @@ -300,25 +300,152 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) > return -1; > } > > -/* > - * map the PCI resources of a PCI device in virtual memory (VFIO version). > - * primary and secondary processes follow almost exactly the same path > - */ > -int > -pci_vfio_map_resource(struct rte_pci_device *dev) > +static int > +pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index) > +{ > + uint32_t ioport_bar; > + int ret; > + > + ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), > + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) > + + PCI_BASE_ADDRESS_0 + bar_index*4); > + if (ret != sizeof(ioport_bar)) { > + RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n", > + PCI_BASE_ADDRESS_0 + bar_index*4); > + return -1; > + } > + > + if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) { > + RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d) addr: %x\n", > + bar_index, ioport_bar); This log message should probably go to the "continue" portion of the calling code, it looks out of place here. > + return 1; > + } > + return 0; > +} > + > +static int > +pci_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd) > +{ > + if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) { > + RTE_LOG(ERR, EAL, "Error setting up interrupts!\n"); > + return -1; > + } > + > + /* set bus mastering for the device */ > + if (pci_vfio_set_bus_master(vfio_dev_fd, true)) { > + RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); > + return -1; > + } > + > + /* Reset the device */ > + ioctl(vfio_dev_fd, VFIO_DEVICE_RESET); > + > + return 0; > +} > + > +static int > +pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, > + int bar_index, int additional_flags) > +{ > + struct memreg { > + unsigned long offset, size; > + } memreg[2] = {}; > + void *bar_addr; > + struct pci_msix_table *msix_table = &vfio_res->msix_table; > + struct pci_map *bar = &vfio_res->maps[bar_index]; > + > + if (bar->size == 0) > + /* Skip this BAR */ > + return 0; > + > + if (msix_table->bar_index == bar_index) { > + /* > + * VFIO will not let us map the MSI-X table, > + * but we can map around it. > + */ > + uint32_t table_start = msix_table->offset; > + uint32_t table_end = table_start + msix_table->size; > + table_end = (table_end + ~PAGE_MASK) & PAGE_MASK; > + table_start &= PAGE_MASK; > + > + if (table_start == 0 && table_end >= bar->size) { > + /* Cannot map this BAR */ > + RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index); > + bar->size = 0; > + bar->addr = 0; > + return 0; > + } > + > + memreg[0].offset = bar->offset; > + memreg[0].size = table_start; > + memreg[1].offset = bar->offset + table_end; > + memreg[1].size = bar->size - table_end; > + > + RTE_LOG(DEBUG, EAL, > + "Trying to map BAR%d that contains the MSI-X " > + "table. Trying offsets: " > + "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index, > + memreg[0].offset, memreg[0].size, > + memreg[1].offset, memreg[1].size); > + } I believe you forgot the "else" part. memreg is, by default, initialized to zeroes, and if bar_index is not equal to MSI-X bar index, memreg does not get filled with any values, and therefore all of the following checks for memreg.size etc. will return false and you'll end up with failed BAR mappings. Confirmed with testing: EAL: PCI device 0000:08:00.0 on NUMA socket 0 EAL: probe driver: 8086:10fb net_ixgbe EAL: using IOMMU type 1 (Type 1) EAL: Failed to map pci BAR0 EAL: 0000:08:00.0 mapping BAR0 failed: Success EAL: Requested device 0000:08:00.0 cannot be used -- Thanks, Anatoly