From: Chenbo Xia <chenbo.xia@intel.com>
To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com,
	jingjing.wu@intel.com
Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu,
	nhorman@tuxdriver.com, bruce.richardson@intel.com,
	david.marchand@redhat.com, stephen@networkplumber.org,
	konstantin.ananyev@intel.com
Date: Tue, 1 Jun 2021 11:06:44 +0800
Message-Id: <20210601030644.3318-7-chenbo.xia@intel.com>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20210601030644.3318-1-chenbo.xia@intel.com>
References: <20190715075214.16616-6-tiwei.bie@intel.com>
	<20210601030644.3318-1-chenbo.xia@intel.com>
Subject: [dpdk-dev] [RFC v3 6/6] bus/pci: add sparse mmap support for
	mediated PCI devices

This patch adds sparse mmap support to the PCI bus. Sparse mmap is a
capability defined in VFIO that allows multiple mmap areas within a
single VFIO region. Mediated PCI devices can use this capability to let
the mdev parent driver keep control over access to the non-mmappable
parts of their regions.
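For readers new to the interface: the sparse areas are advertised
through the capability chain that the kernel appends to
VFIO_DEVICE_GET_REGION_INFO results, which is what
pci_vfio_get_region_info() and pci_vfio_info_cap() walk in the code
below. As a rough standalone sketch of that lookup against the raw VFIO
uAPI (the function name, output format and error handling here are
illustrative only, not part of this patch):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Print the sparse mmap areas of one VFIO region, if any. Assumes
 * 'device_fd' is an already-open VFIO device fd and 'index' is a
 * valid region index for it.
 */
static int
dump_sparse_areas(int device_fd, uint32_t index)
{
	struct vfio_region_info *info;
	struct vfio_info_cap_header *hdr;
	uint32_t argsz = sizeof(*info), i;

	info = calloc(1, argsz);
	if (info == NULL)
		return -1;
	info->argsz = argsz;
	info->index = index;
	/* The first call only reports the real argsz when caps exist. */
	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
		goto err;
	if (info->argsz > argsz) {
		struct vfio_region_info *bigger;

		argsz = info->argsz;
		bigger = realloc(info, argsz);
		if (bigger == NULL)
			goto err;
		info = bigger;
		memset(info, 0, argsz);
		info->argsz = argsz;
		info->index = index;
		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
			goto err;
	}

	if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS) ||
			info->cap_offset == 0)
		goto out; /* no capability chain, so no sparse mmap */

	/* cap_offset and hdr->next are offsets from the start of 'info' */
	hdr = (struct vfio_info_cap_header *)((char *)info + info->cap_offset);
	for (;;) {
		if (hdr->id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) {
			struct vfio_region_info_cap_sparse_mmap *sparse =
				(struct vfio_region_info_cap_sparse_mmap *)hdr;

			for (i = 0; i < sparse->nr_areas; i++)
				printf("area %u: offset 0x%llx, size 0x%llx\n",
					i,
					(unsigned long long)sparse->areas[i].offset,
					(unsigned long long)sparse->areas[i].size);
			break;
		}
		if (hdr->next == 0)
			break;
		hdr = (struct vfio_info_cap_header *)((char *)info + hdr->next);
	}
out:
	free(info);
	return 0;
err:
	free(info);
	return -1;
}

The same grow-and-retry argsz handling is why pci_vfio_get_region_info()
below keeps reallocating until the allocated size matches the argsz the
kernel reports.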
Signed-off-by: Chenbo Xia <chenbo.xia@intel.com>
---
 drivers/bus/pci/linux/pci_vfio.c | 229 +++++++++++++++++++++++++++----
 drivers/bus/pci/private.h        |   2 +
 drivers/bus/pci/rte_bus_pci.h    |  18 ++-
 3 files changed, 218 insertions(+), 31 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 00ba5db03a..e68eccb63f 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -654,6 +654,82 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 	return 0;
 }
 
+static int
+pci_vfio_sparse_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
+		struct vfio_region_sparse_mmap_area *vfio_areas,
+		uint32_t nr_areas, int bar_index, int additional_flags,
+		int numa_node)
+{
+	struct pci_map *map = &vfio_res->maps[bar_index];
+	struct rte_mem_map_area *area;
+	struct vfio_region_sparse_mmap_area *sparse;
+	void *bar_addr;
+	uint32_t i, j;
+
+	map->nr_areas = nr_areas;
+
+	if (map->size == 0) {
+		RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index);
+		return 0;
+	}
+
+	if (!map->nr_areas) {
+		RTE_LOG(DEBUG, EAL, "Skip bar %d with no sparse mmap areas\n",
+			bar_index);
+		map->areas = NULL;
+		return 0;
+	}
+
+	if (map->areas == NULL) {
+		map->areas = rte_zmalloc_socket(NULL,
+				sizeof(*map->areas) * nr_areas,
+				RTE_CACHE_LINE_SIZE, numa_node);
+		if (map->areas == NULL) {
+			RTE_LOG(ERR, EAL,
+				"Cannot alloc memory for sparse map areas\n");
+			return -1;
+		}
+	}
+
+	for (i = 0; i < map->nr_areas; i++) {
+		area = &map->areas[i];
+		sparse = &vfio_areas[i];
+
+		bar_addr = mmap(map->addr, sparse->size, 0, MAP_PRIVATE |
+				MAP_ANONYMOUS | additional_flags, -1, 0);
+		if (bar_addr != MAP_FAILED) {
+			area->addr = pci_map_resource(bar_addr, vfio_dev_fd,
+				map->offset + sparse->offset, sparse->size,
+				RTE_MAP_FORCE_ADDRESS);
+			if (area->addr == NULL) {
+				munmap(bar_addr, sparse->size);
+				RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
+					bar_index);
+				goto err_map;
+			}
+
+			area->offset = sparse->offset;
+			area->size = sparse->size;
+		} else {
+			RTE_LOG(ERR, EAL, "Failed to create inaccessible mapping for BAR%d\n",
+				bar_index);
+			goto err_map;
+		}
+	}
+
+	return 0;
+
+err_map:
+	for (j = 0; j < i; j++) {
+		pci_unmap_resource(map->areas[j].addr, map->areas[j].size);
+		map->areas[j].offset = 0;
+		map->areas[j].size = 0;
+	}
+	rte_free(map->areas);
+	map->nr_areas = 0;
+	return -1;
+}
+
 /*
  * region info may contain capability headers, so we need to keep reallocating
  * the memory until we match allocated memory size with argsz.
@@ -770,6 +846,31 @@ pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd,
 	return 0;
 }
 
+static void
+clean_up_pci_resource(struct mapped_pci_resource *vfio_res)
+{
+	struct pci_map *map;
+	uint32_t i, j;
+
+	for (i = 0; i < PCI_MAX_RESOURCE; i++) {
+		map = &vfio_res->maps[i];
+		if (map->nr_areas > 1) {
+			for (j = 0; j < map->nr_areas; j++)
+				pci_unmap_resource(map->areas[j].addr,
+						map->areas[j].size);
+		} else {
+			/*
+			 * No need to handle the MSI-X table BAR separately
+			 * here; the current maps array is enough.
+			 */
+			if (map->addr)
+				pci_unmap_resource(map->addr, map->size);
+		}
+		rte_free(map->areas);
+		map->areas = NULL;
+	}
+}
+
 static int
 pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
@@ -866,6 +967,8 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 
 	for (i = 0; i < vfio_res->nb_maps; i++) {
 		void *bar_addr;
+		struct vfio_info_cap_header *hdr;
+		struct vfio_region_info_cap_sparse_mmap *sparse;
 
 		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
 		if (ret < 0) {
@@ -911,15 +1014,59 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 		maps[i].size = reg->size;
 		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
 
-		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
-		if (ret < 0) {
-			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
-				pci_addr, i, strerror(errno));
-			free(reg);
-			goto err_vfio_res;
-		}
+		hdr = pci_vfio_info_cap(reg, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
+
+		if (dev->is_mdev && hdr != NULL) {
+			sparse = container_of(hdr,
+				struct vfio_region_info_cap_sparse_mmap,
+				header);
+
+			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res,
+				sparse->areas, sparse->nr_areas, i, 0,
+				dev->device.numa_node);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n",
+					pci_addr, i, strerror(errno));
+				free(reg);
+				goto err_vfio_res;
+			}
 
-		dev->mem_resource[i].addr = maps[i].addr;
+			dev->sparse_mem[i].size = reg->size;
+			dev->sparse_mem[i].nr_maps = vfio_res->maps[i].nr_areas;
+			dev->sparse_mem[i].areas = vfio_res->maps[i].areas;
+		} else {
+			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
+					pci_addr, i, strerror(errno));
+				free(reg);
+				goto err_vfio_res;
+			}
+
+			if (dev->is_mdev) {
+				struct pci_map *mdev_map = &maps[i];
+				mdev_map->nr_areas = 1;
+				mdev_map->areas = rte_zmalloc_socket(NULL,
+						sizeof(*mdev_map->areas),
+						RTE_CACHE_LINE_SIZE,
+						dev->device.numa_node);
+				if (mdev_map->areas == NULL) {
+					RTE_LOG(ERR, EAL,
+						"Cannot allocate memory for sparse map areas\n");
+					free(reg);
+					goto err_vfio_res;
+				}
+				mdev_map->areas[0].addr = maps[i].addr;
+				mdev_map->areas[0].offset = 0;
+				mdev_map->areas[0].size = reg->size;
+				dev->sparse_mem[i].size = reg->size;
+				dev->sparse_mem[i].nr_maps = 1;
+				dev->sparse_mem[i].areas = mdev_map->areas;
+			} else {
+				maps[i].nr_areas = 0;
+				maps[i].areas = NULL;
+				dev->mem_resource[i].addr = maps[i].addr;
+			}
+		}
 
 		free(reg);
 	}
@@ -940,6 +1087,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 	return 0;
 err_vfio_res:
+	clean_up_pci_resource(vfio_res);
 	rte_free(vfio_res);
 err_vfio_dev_fd:
 	rte_vfio_release_device(rte_pci_get_sysfs_path(),
@@ -960,7 +1108,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 	struct mapped_pci_res_list *vfio_res_list =
 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
-	struct pci_map *maps;
+	struct pci_map *maps, *cur;
 
 	dev->intr_handle.fd = -1;
 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
@@ -1012,14 +1160,49 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 
 	maps = vfio_res->maps;
 	for (i = 0; i < vfio_res->nb_maps; i++) {
-		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
-		if (ret < 0) {
-			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
-				pci_addr, i, strerror(errno));
-			goto err_vfio_dev_fd;
+		cur = &maps[i];
+		if (cur->nr_areas > 1) {
+			struct vfio_region_sparse_mmap_area *areas;
+			uint32_t j;
+
+			areas = malloc(sizeof(*areas) * cur->nr_areas);
+			if (areas == NULL) {
+				RTE_LOG(ERR, EAL, "Failed to alloc vfio areas for %s\n",
+					pci_addr);
+				goto err_vfio_dev_fd;
+			}
+
+			for (j = 0; j < cur->nr_areas; j++) {
+				areas[j].offset = cur->areas[j].offset;
+				areas[j].size = cur->areas[j].size;
+			}
+
+			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res,
+					areas, cur->nr_areas, i, MAP_FIXED,
+					dev->device.numa_node);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n",
+					pci_addr, i, strerror(errno));
+				free(areas);
+				goto err_vfio_dev_fd;
+			}
+
+			free(areas);
+		} else {
+			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res,
+					i, MAP_FIXED);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
+					pci_addr, i, strerror(errno));
+				goto err_vfio_dev_fd;
+			}
+
+			if (dev->is_mdev)
+				cur->areas[0].addr = cur->addr;
+			else
+				dev->mem_resource[i].addr = cur->addr;
 		}
-
-		dev->mem_resource[i].addr = maps[i].addr;
 	}
 
 	/* we need save vfio_dev_fd, so it can be used during release */
@@ -1054,8 +1237,6 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 		const char *pci_addr)
 {
 	struct mapped_pci_resource *vfio_res = NULL;
-	struct pci_map *maps;
-	int i;
 
 	/* Get vfio_res */
 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
@@ -1079,19 +1260,7 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 	RTE_LOG(INFO, EAL, "Releasing PCI mapped resource for %s\n",
 		pci_addr);
 
-	maps = vfio_res->maps;
-	for (i = 0; i < vfio_res->nb_maps; i++) {
-
-		/*
-		 * We do not need to be aware of MSI-X table BAR mappings as
-		 * when mapping. Just using current maps array is enough
-		 */
-		if (maps[i].addr) {
-			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
-				pci_addr, maps[i].addr);
-			pci_unmap_resource(maps[i].addr, maps[i].size);
-		}
-	}
+	clean_up_pci_resource(vfio_res);
 
 	return vfio_res;
 }
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 3515c086aa..8d94d8acf8 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -110,6 +110,8 @@ struct pci_map {
 	uint64_t offset;
 	uint64_t size;
 	uint64_t phaddr;
+	uint32_t nr_areas;
+	struct rte_mem_map_area *areas;
 };
 
 struct pci_msix_table {
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index fb7d934bd0..ddc913f121 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -70,6 +70,18 @@ enum rte_pci_kernel_driver {
 	RTE_PCI_KDRV_NET_UIO,  /* NetUIO for Windows */
 };
 
+struct rte_mem_map_area {
+	void *addr;
+	uint64_t offset;
+	uint64_t size;
+};
+
+struct rte_sparse_mem_map {
+	uint64_t size;
+	uint32_t nr_maps;
+	struct rte_mem_map_area *areas;
+};
+
 /**
  * A structure describing a PCI device.
  */
@@ -82,8 +94,12 @@ struct rte_pci_device {
 	};
 	uint8_t is_mdev;                    /**< True for mediated PCI device */
 	struct rte_pci_id id;               /**< PCI ID. */
-	struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
+	union {
+		struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
+		/**< PCI Memory Resource */
+		struct rte_sparse_mem_map sparse_mem[PCI_MAX_RESOURCE];
+		/**< Sparse Memory Map for Mdev */
+	};
 	struct rte_intr_handle intr_handle; /**< Interrupt handle */
 	struct rte_pci_driver *driver;      /**< PCI driver used in probing */
 	uint16_t max_vfs;                   /**< sriov enable if not zero */
-- 
2.17.1