From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 0148A42BEB; Wed, 31 May 2023 07:38:24 +0200 (CEST) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id C44FC42D39; Wed, 31 May 2023 07:38:09 +0200 (CEST) Received: from mga17.intel.com (mga17.intel.com [192.55.52.151]) by mails.dpdk.org (Postfix) with ESMTP id 5644142C76 for ; Wed, 31 May 2023 07:38:07 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1685511487; x=1717047487; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=75qQgyEffNjGvP82AdA75FyhNPOdFo6o/PO2aB4YVxk=; b=jFsFS5HXH2cGk4uwSZjheDvzYEWmmHQ45S9no2ysfosIg0YXpF50dAYW nbDAYjiESE+iHCV/tEgsiJnXEDAlEDh35jOaLNdvmElUCkt45Y5wYQC1y ET5jzQKcQN724+n1AlluW9o7O3dVQJCpJ/yCiT6nHNU9eLvALbhglchZP wxRQmE9YMDQ1ZCxXOa6XOB1DtpdVd/l0y/hvfF7KHvF7ULX+P/6yiOS7v vzCAJ9xARhO1zjHL7KjxetjrM3Vt5w25tVyUhhm3VRW9wSpzmtd/3Frjo +5BfkeR+oTG872JKNCBg3pdnVacFhdspvgw77/YoPRgfEyS/79XGCW4ym g==; X-IronPort-AV: E=McAfee;i="6600,9927,10726"; a="335489303" X-IronPort-AV: E=Sophos;i="6.00,205,1681196400"; d="scan'208";a="335489303" Received: from orsmga006.jf.intel.com ([10.7.209.51]) by fmsmga107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 30 May 2023 22:38:06 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=McAfee;i="6600,9927,10726"; a="684273536" X-IronPort-AV: E=Sophos;i="6.00,205,1681196400"; d="scan'208";a="684273536" Received: from dpdk-limiao-icelake.sh.intel.com ([10.67.111.26]) by orsmga006.jf.intel.com with ESMTP; 30 May 2023 22:38:04 -0700 From: Miao Li To: dev@dpdk.org Cc: skori@marvell.com, thomas@monjalon.net, david.marchand@redhat.com, ferruh.yigit@amd.com, chenbo.xia@intel.com, yahui.cao@intel.com, Anatoly Burakov Subject: [PATCH v4 4/4] bus/pci: add VFIO sparse mmap support Date: Wed, 31 May 2023 05:37:42 +0000 
Message-Id: <20230531053743.129442-5-miao.li@intel.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20230531053743.129442-1-miao.li@intel.com> References: <20230525163116.682000-1-miao.li@intel.com> <20230531053743.129442-1-miao.li@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org This patch adds sparse mmap support to the PCI bus. Sparse mmap is a capability defined in VFIO which allows multiple mmap areas in one VFIO region. In this patch, the sparse mmap regions are mapped to one contiguous virtual address region that follows the device-specific BAR layout. So, a driver can still access all mapped sparse mmap regions by using 'bar_base_address + bar_offset'. Signed-off-by: Miao Li Signed-off-by: Chenbo Xia Acked-by: Sunil Kumar Kori Acked-by: Yahui Cao --- drivers/bus/pci/linux/pci_vfio.c | 138 +++++++++++++++++++++++++++---- drivers/bus/pci/private.h | 2 + 2 files changed, 122 insertions(+), 18 deletions(-) diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c index 24b0795fbd..e6db30d36a 100644 --- a/drivers/bus/pci/linux/pci_vfio.c +++ b/drivers/bus/pci/linux/pci_vfio.c @@ -673,6 +673,54 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, return 0; } +static int +pci_vfio_sparse_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, + int bar_index, int additional_flags) +{ + struct pci_map *bar = &vfio_res->maps[bar_index]; + struct vfio_region_sparse_mmap_area *sparse; + void *bar_addr; + uint32_t i; + + if (bar->size == 0) { + RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index); + return 0; + } + + /* reserve the address using an inaccessible mapping */ + bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE | + MAP_ANONYMOUS | additional_flags, -1, 0); + if (bar_addr != 
MAP_FAILED) { + void *map_addr = NULL; + for (i = 0; i < bar->nr_areas; i++) { + sparse = &bar->areas[i]; + if (sparse->size) { + void *addr = RTE_PTR_ADD(bar_addr, (uintptr_t)sparse->offset); + map_addr = pci_map_resource(addr, vfio_dev_fd, + bar->offset + sparse->offset, sparse->size, + RTE_MAP_FORCE_ADDRESS); + if (map_addr == NULL) { + munmap(bar_addr, bar->size); + RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n", + bar_index); + goto err_map; + } + } + } + } else { + RTE_LOG(ERR, EAL, "Failed to create inaccessible mapping for BAR%d\n", + bar_index); + goto err_map; + } + + bar->addr = bar_addr; + return 0; + +err_map: + bar->nr_areas = 0; + return -1; +} + /* * region info may contain capability headers, so we need to keep reallocating * the memory until we match allocated memory size with argsz. @@ -798,7 +846,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) char pci_addr[PATH_MAX] = {0}; int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; - int i, ret; + int i, j, ret; struct mapped_pci_resource *vfio_res = NULL; struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); @@ -875,13 +923,15 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) for (i = 0; i < vfio_res->nb_maps; i++) { void *bar_addr; + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_sparse_mmap *sparse; ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i); if (ret < 0) { RTE_LOG(ERR, EAL, "%s cannot get device region info error " "%i (%s)\n", pci_addr, errno, strerror(errno)); - goto err_vfio_res; + goto err_map; } pdev->region[i].size = reg->size; @@ -891,7 +941,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) ret = pci_vfio_is_ioport_bar(dev, vfio_dev_fd, i); if (ret < 0) { free(reg); - goto err_vfio_res; + goto err_map; } else if (ret) { RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n", i); @@ -920,12 +970,41 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) maps[i].size = 
reg->size; maps[i].path = NULL; /* vfio doesn't have per-resource paths */ - ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); - if (ret < 0) { - RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", - pci_addr, i, strerror(errno)); - free(reg); - goto err_vfio_res; + hdr = pci_vfio_info_cap(reg, VFIO_REGION_INFO_CAP_SPARSE_MMAP); + + if (hdr != NULL) { + sparse = container_of(hdr, + struct vfio_region_info_cap_sparse_mmap, header); + if (sparse->nr_areas > 0) { + maps[i].nr_areas = sparse->nr_areas; + maps[i].areas = rte_zmalloc(NULL, + sizeof(*maps[i].areas) * maps[i].nr_areas, 0); + if (maps[i].areas == NULL) { + RTE_LOG(ERR, EAL, + "Cannot alloc memory for sparse map areas\n"); + goto err_map; + } + memcpy(maps[i].areas, sparse->areas, + sizeof(*maps[i].areas) * maps[i].nr_areas); + } + } + + if (maps[i].nr_areas > 0) { + ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res, i, 0); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + free(reg); + goto err_map; + } + } else { + ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + free(reg); + goto err_map; + } } dev->mem_resource[i].addr = maps[i].addr; @@ -935,19 +1014,26 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) { RTE_LOG(ERR, EAL, "%s setup device failed\n", pci_addr); - goto err_vfio_res; + goto err_map; } #ifdef HAVE_VFIO_DEV_REQ_INTERFACE if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) { RTE_LOG(ERR, EAL, "Error setting up notifier!\n"); - goto err_vfio_res; + goto err_map; } #endif TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next); return 0; +err_map: + for (j = 0; j < i; j++) { + if (maps[j].addr) + pci_unmap_resource(maps[j].addr, maps[j].size); + if (maps[j].nr_areas > 0) + rte_free(maps[j].areas); + } err_vfio_res: rte_free(vfio_res); err_vfio_dev_fd: @@ -963,7 
+1049,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) char pci_addr[PATH_MAX] = {0}; int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; - int i, ret; + int i, j, ret; struct mapped_pci_resource *vfio_res = NULL; struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); @@ -1008,11 +1094,20 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) maps = vfio_res->maps; for (i = 0; i < vfio_res->nb_maps; i++) { - ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED); - if (ret < 0) { - RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", - pci_addr, i, strerror(errno)); - goto err_vfio_dev_fd; + if (maps[i].nr_areas > 0) { + ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + goto err_vfio_dev_fd; + } + } else { + ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + goto err_vfio_dev_fd; + } } dev->mem_resource[i].addr = maps[i].addr; @@ -1028,6 +1123,10 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) return 0; err_vfio_dev_fd: + for (j = 0; j < i; j++) { + if (maps[j].addr) + pci_unmap_resource(maps[j].addr, maps[j].size); + } rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr, vfio_dev_fd); return -1; @@ -1062,7 +1161,7 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, break; } - if (vfio_res == NULL) + if (vfio_res == NULL) return vfio_res; RTE_LOG(INFO, EAL, "Releasing PCI mapped resource for %s\n", @@ -1080,6 +1179,9 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, pci_addr, maps[i].addr); pci_unmap_resource(maps[i].addr, maps[i].size); } + + if (maps[i].nr_areas > 0) + rte_free(maps[i].areas); } return vfio_res; diff --git a/drivers/bus/pci/private.h 
b/drivers/bus/pci/private.h index 2d6991ccb7..8b0ce73533 100644 --- a/drivers/bus/pci/private.h +++ b/drivers/bus/pci/private.h @@ -121,6 +121,8 @@ struct pci_map { uint64_t offset; uint64_t size; uint64_t phaddr; + uint32_t nr_areas; + struct vfio_region_sparse_mmap_area *areas; }; struct pci_msix_table { -- 2.25.1