From: Nikos Dragazis
To: dev@dpdk.org
Cc: Maxime Coquelin, Tiwei Bie, Zhihong Wang, Stefan Hajnoczi, Wei Wang,
 Stojaczyk Dariusz, Vangelis Koukis
Date: Wed, 19 Jun 2019 18:14:38 +0300
Message-Id: <1560957293-17294-14-git-send-email-ndragazis@arrikto.com>
In-Reply-To: <1560957293-17294-1-git-send-email-ndragazis@arrikto.com>
References: <1560957293-17294-1-git-send-email-ndragazis@arrikto.com>
Subject: [dpdk-dev] [PATCH 13/28] vhost: move mmap/munmap

Mapping the vhost memory regions is a transport-specific operation, so
move the relevant code into trans_af_unix.c. The new
.map_mem_regions()/.unmap_mem_regions() callbacks let each transport
perform the mapping and unmapping itself.

In addition, vhost_user_set_mem_table(), which performs the mmap()
calls, contains some code for postcopy live migration. Postcopy live
migration is an AF_UNIX-only feature, because it relies on the
userfaultfd mechanism; the virtio-vhost-user transport, which will be
added in later patches, cannot support it. Therefore, move this code
into trans_af_unix.c as well.

The vhost_user_set_mem_table() debug logs move into .map_mem_regions()
too. Every .map_mem_regions() implementation has to emit these logs
itself; this preserves the ordering of the log messages in the
postcopy case.

Last but not least, after this refactoring read_vhost_message() is no
longer referenced outside trans_af_unix.c, so mark it static there.
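For illustration only (not part of this patch), a transport would wire
up the new callbacks along the following lines. The example_* names are
hypothetical; hugepage alignment, zero-copy and postcopy handling, and
most error reporting are trimmed. See the AF_UNIX implementation below
for the real thing:

    /* Minimal sketch; needs <sys/mman.h> and <unistd.h>. */
    static int
    example_map_mem_regions(struct virtio_net *dev,
                            struct VhostUserMsg *msg __rte_unused)
    {
        uint32_t i;

        for (i = 0; i < dev->mem->nregions; i++) {
            struct rte_vhost_mem_region *reg = &dev->mem->regions[i];
            uint64_t mmap_offset = reg->mmap_size - reg->size;

            /* vhost_user.c has already filled in reg->fd and
             * reg->mmap_size = size + mmap_offset.
             */
            reg->mmap_addr = mmap(NULL, reg->mmap_size,
                                  PROT_READ | PROT_WRITE, MAP_SHARED,
                                  reg->fd, 0);
            if (reg->mmap_addr == MAP_FAILED)
                return -1;
            reg->host_user_addr =
                (uint64_t)(uintptr_t)reg->mmap_addr + mmap_offset;
        }
        return 0;
    }

    static void
    example_unmap_mem_regions(struct virtio_net *dev)
    {
        uint32_t i;

        for (i = 0; i < dev->mem->nregions; i++) {
            struct rte_vhost_mem_region *reg = &dev->mem->regions[i];

            if (reg->host_user_addr) {
                munmap(reg->mmap_addr, reg->mmap_size);
                close(reg->fd);
            }
        }
    }

    static const struct vhost_transport_ops example_trans_ops = {
        /* ... the other mandatory callbacks ... */
        .map_mem_regions = example_map_mem_regions,
        .unmap_mem_regions = example_unmap_mem_regions,
    };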
Signed-off-by: Nikos Dragazis
Signed-off-by: Stefan Hajnoczi
---
 lib/librte_vhost/trans_af_unix.c | 185 ++++++++++++++++++++++++++++++++++++++-
 lib/librte_vhost/vhost.h         |  22 +++++
 lib/librte_vhost/vhost_user.c    | 171 ++++--------------------------------
 lib/librte_vhost/vhost_user.h    |   3 +
 4 files changed, 225 insertions(+), 156 deletions(-)

diff --git a/lib/librte_vhost/trans_af_unix.c b/lib/librte_vhost/trans_af_unix.c
index 5f9ef5a..522823f 100644
--- a/lib/librte_vhost/trans_af_unix.c
+++ b/lib/librte_vhost/trans_af_unix.c
@@ -5,7 +5,14 @@
  */
 
 #include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+#include <linux/userfaultfd.h>
+#endif
 #include <sys/un.h>
 
 #include <rte_log.h>
@@ -43,7 +50,7 @@ struct af_unix_socket {
 	struct sockaddr_un un;
 };
 
-int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
+static int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
 static int create_unix_socket(struct vhost_user_socket *vsocket);
 static int vhost_user_start_server(struct vhost_user_socket *vsocket);
 static int vhost_user_start_client(struct vhost_user_socket *vsocket);
@@ -317,7 +324,7 @@ vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
 }
 
 /* return bytes# of read on success or negative val on failure. */
-int
+static int
 read_vhost_message(int sockfd, struct VhostUserMsg *msg)
 {
 	int ret;
@@ -771,6 +778,178 @@ af_unix_vring_call(struct virtio_net *dev __rte_unused,
 	return 0;
 }
 
+static uint64_t
+get_blk_size(int fd)
+{
+	struct stat stat;
+	int ret;
+
+	ret = fstat(fd, &stat);
+	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+static int
+af_unix_map_mem_regions(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+	uint32_t i;
+	struct VhostUserMemory *memory = &msg->payload.memory;
+	struct vhost_user_connection *conn =
+		container_of(dev, struct vhost_user_connection, device);
+
+	for (i = 0; i < dev->mem->nregions; i++) {
+		struct rte_vhost_mem_region *reg = &dev->mem->regions[i];
+		uint64_t mmap_size = reg->mmap_size;
+		uint64_t mmap_offset = mmap_size - reg->size;
+		uint64_t alignment;
+		void *mmap_addr;
+		int populate;
+
+		/* mmap() without flag of MAP_ANONYMOUS, should be called
+		 * with length argument aligned with hugepagesz at older
+		 * longterm version Linux, like 2.6.32 and 3.2.72, or
+		 * mmap() will fail with EINVAL.
+		 *
+		 * to avoid failure, make sure in caller to keep length
+		 * aligned.
+		 */
+		alignment = get_blk_size(reg->fd);
+		if (alignment == (uint64_t)-1) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"couldn't get hugepage size through fstat\n");
+			return -1;
+		}
+		mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+
+		populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
+		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+				 MAP_SHARED | populate, reg->fd, 0);
+
+		if (mmap_addr == MAP_FAILED) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"mmap region %u failed.\n", i);
+			return -1;
+		}
+
+		reg->mmap_addr = mmap_addr;
+		reg->mmap_size = mmap_size;
+		reg->host_user_addr = (uint64_t)(uintptr_t)reg->mmap_addr +
+				      mmap_offset;
+
+		if (dev->dequeue_zero_copy)
+			if (add_guest_pages(dev, reg, alignment) < 0) {
+				RTE_LOG(ERR, VHOST_CONFIG,
+					"adding guest pages to region %u failed.\n",
+					i);
+				return -1;
+			}
+
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"guest memory region %u, size: 0x%" PRIx64 "\n"
+			"\t guest physical addr: 0x%" PRIx64 "\n"
+			"\t guest virtual addr: 0x%" PRIx64 "\n"
+			"\t host virtual addr: 0x%" PRIx64 "\n"
+			"\t mmap addr : 0x%" PRIx64 "\n"
+			"\t mmap size : 0x%" PRIx64 "\n"
+			"\t mmap align: 0x%" PRIx64 "\n"
+			"\t mmap off : 0x%" PRIx64 "\n",
+			i, reg->size,
+			reg->guest_phys_addr,
+			reg->guest_user_addr,
+			reg->host_user_addr,
+			(uint64_t)(uintptr_t)reg->mmap_addr,
+			reg->mmap_size,
+			alignment,
+			mmap_offset);
+
+		if (dev->postcopy_listening) {
+			/*
+			 * We haven't a better way right now than sharing
+			 * DPDK's virtual address with Qemu, so that Qemu can
+			 * retrieve the region offset when handling userfaults.
+			 */
+			memory->regions[i].userspace_addr =
+				reg->host_user_addr;
+		}
+	}
+
+	if (dev->postcopy_listening) {
+		/* Send the addresses back to qemu */
+		msg->fd_num = 0;
+		/* Send reply */
+		msg->flags &= ~VHOST_USER_VERSION_MASK;
+		msg->flags &= ~VHOST_USER_NEED_REPLY;
+		msg->flags |= VHOST_USER_VERSION;
+		msg->flags |= VHOST_USER_REPLY_MASK;
+		af_unix_send_reply(dev, msg);
+
+		/* Wait for qemu to acknowledge it's got the addresses
+		 * we've got to wait before we're allowed to generate faults.
+		 */
+		VhostUserMsg ack_msg;
+		if (read_vhost_message(conn->connfd, &ack_msg) <= 0) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"Failed to read qemu ack on postcopy set-mem-table\n");
+			return -1;
+		}
+		if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"Bad qemu ack on postcopy set-mem-table (%d)\n",
+				ack_msg.request.master);
+			return -1;
+		}
+
+		/* Now userfault register and we can use the memory */
+		for (i = 0; i < memory->nregions; i++) {
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+			struct rte_vhost_mem_region *reg = &dev->mem->regions[i];
+			struct uffdio_register reg_struct;
+
+			/*
+			 * Let's register all the mmap'ed area to ensure
+			 * alignment on page boundary.
+			 */
+			reg_struct.range.start =
+				(uint64_t)(uintptr_t)reg->mmap_addr;
+			reg_struct.range.len = reg->mmap_size;
+			reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+			if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
+				  &reg_struct)) {
+				RTE_LOG(ERR, VHOST_CONFIG,
+					"Failed to register ufd for region %d: (ufd = %d) %s\n",
+					i, dev->postcopy_ufd,
+					strerror(errno));
+				return -1;
+			}
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"\t userfaultfd registered for range : %llx - %llx\n",
+				reg_struct.range.start,
+				reg_struct.range.start +
+				reg_struct.range.len - 1);
+#else
+			return -1;
+#endif
+		}
+	}
+
+	return 0;
+}
+
+static void
+af_unix_unmap_mem_regions(struct virtio_net *dev)
+{
+	uint32_t i;
+	struct rte_vhost_mem_region *reg;
+
+	for (i = 0; i < dev->mem->nregions; i++) {
+		reg = &dev->mem->regions[i];
+		if (reg->host_user_addr) {
+			munmap(reg->mmap_addr, reg->mmap_size);
+			close(reg->fd);
+		}
+	}
+}
+
 const struct vhost_transport_ops af_unix_trans_ops = {
 	.socket_size = sizeof(struct af_unix_socket),
 	.device_size = sizeof(struct vhost_user_connection),
@@ -783,4 +962,6 @@ const struct vhost_transport_ops af_unix_trans_ops = {
 	.send_slave_req = af_unix_send_slave_req,
 	.process_slave_message_reply = af_unix_process_slave_message_reply,
 	.set_slave_req_fd = af_unix_set_slave_req_fd,
+	.map_mem_regions = af_unix_map_mem_regions,
+	.unmap_mem_regions = af_unix_unmap_mem_regions,
 };
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 2213fbe..28038c6 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -415,6 +415,28 @@ struct vhost_transport_ops {
 	 */
 	int (*set_slave_req_fd)(struct virtio_net *dev,
 				struct VhostUserMsg *msg);
+
+	/**
+	 * Map memory table regions in dev->mem->regions[].
+	 *
+	 * @param dev
+	 *  vhost device
+	 * @param msg
+	 *  message
+	 * @return
+	 *  0 on success, -1 on failure
+	 */
+	int (*map_mem_regions)(struct virtio_net *dev,
+			       struct VhostUserMsg *msg);
+
+	/**
+	 * Unmap memory table regions in dev->mem->regions[] and free any
+	 * resources, such as file descriptors.
+	 *
+	 * @param dev
+	 *  vhost device
+	 */
+	void (*unmap_mem_regions)(struct virtio_net *dev);
 };
 
 /** The traditional AF_UNIX vhost-user protocol transport. */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index a4dcba1..ed8dbd8 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -81,17 +81,6 @@ static const char *vhost_message_str[VHOST_USER_MAX] = {
 };
 
 static int send_vhost_reply(struct virtio_net *dev, struct VhostUserMsg *msg);
-int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
-
-static uint64_t
-get_blk_size(int fd)
-{
-	struct stat stat;
-	int ret;
-
-	ret = fstat(fd, &stat);
-	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
-}
 
 /*
  * Reclaim all the outstanding zmbufs for a virtqueue.
@@ -120,7 +109,6 @@ static void
 free_mem_region(struct virtio_net *dev)
 {
 	uint32_t i;
-	struct rte_vhost_mem_region *reg;
 	struct vhost_virtqueue *vq;
 
 	if (!dev || !dev->mem)
@@ -134,13 +122,7 @@ free_mem_region(struct virtio_net *dev)
 		}
 	}
 
-	for (i = 0; i < dev->mem->nregions; i++) {
-		reg = &dev->mem->regions[i];
-		if (reg->host_user_addr) {
-			munmap(reg->mmap_addr, reg->mmap_size);
-			close(reg->fd);
-		}
-	}
+	dev->trans_ops->unmap_mem_regions(dev);
 }
 
 void
@@ -792,7 +774,7 @@ add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
 	return 0;
 }
 
-static int
+int
 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
 		uint64_t page_size)
 {
@@ -881,18 +863,13 @@ vhost_memory_changed(struct VhostUserMemory *new,
 
 static int
 vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
-			int main_fd)
+			int main_fd __rte_unused)
 {
 	struct virtio_net *dev = *pdev;
 	struct VhostUserMemory *memory = &msg->payload.memory;
 	struct rte_vhost_mem_region *reg;
-	void *mmap_addr;
-	uint64_t mmap_size;
 	uint64_t mmap_offset;
-	uint64_t alignment;
 	uint32_t i;
-	int populate;
-	int fd;
 
 	if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
 		RTE_LOG(ERR, VHOST_CONFIG,
@@ -904,8 +881,11 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 		RTE_LOG(INFO, VHOST_CONFIG,
 			"(%d) memory regions not changed\n", dev->vid);
 
-		for (i = 0; i < memory->nregions; i++)
-			close(msg->fds[i]);
+		for (i = 0; i < memory->nregions; i++) {
+			if (msg->fds[i] >= 0) {
+				close(msg->fds[i]);
+			}
+		}
 
 		return RTE_VHOST_MSG_RESULT_OK;
 	}
@@ -946,13 +926,15 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 	dev->mem->nregions = memory->nregions;
 
 	for (i = 0; i < memory->nregions; i++) {
-		fd = msg->fds[i];
 		reg = &dev->mem->regions[i];
 
 		reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
 		reg->guest_user_addr = memory->regions[i].userspace_addr;
 		reg->size = memory->regions[i].memory_size;
-		reg->fd = fd;
+		reg->mmap_size = reg->size + memory->regions[i].mmap_offset;
+		reg->mmap_addr = NULL;
+		reg->host_user_addr = 0;
+		reg->fd = msg->fds[i];
 
 		mmap_offset = memory->regions[i].mmap_offset;
 
@@ -962,132 +944,13 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 				"mmap_offset (%#"PRIx64") and memory_size "
 				"(%#"PRIx64") overflow\n",
 				mmap_offset, reg->size);
-			goto err_mmap;
-		}
-
-		mmap_size = reg->size + mmap_offset;
-
-		/* mmap() without flag of MAP_ANONYMOUS, should be called
-		 * with length argument aligned with hugepagesz at older
-		 * longterm version Linux, like 2.6.32 and 3.2.72, or
-		 * mmap() will fail with EINVAL.
-		 *
-		 * to avoid failure, make sure in caller to keep length
-		 * aligned.
-		 */
-		alignment = get_blk_size(fd);
-		if (alignment == (uint64_t)-1) {
-			RTE_LOG(ERR, VHOST_CONFIG,
-				"couldn't get hugepage size through fstat\n");
-			goto err_mmap;
-		}
-		mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
-
-		populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
-		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-				 MAP_SHARED | populate, fd, 0);
-
-		if (mmap_addr == MAP_FAILED) {
-			RTE_LOG(ERR, VHOST_CONFIG,
-				"mmap region %u failed.\n", i);
-			goto err_mmap;
+			goto err;
 		}
 
-		reg->mmap_addr = mmap_addr;
-		reg->mmap_size = mmap_size;
-		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
-				      mmap_offset;
-
-		if (dev->dequeue_zero_copy)
-			if (add_guest_pages(dev, reg, alignment) < 0) {
-				RTE_LOG(ERR, VHOST_CONFIG,
-					"adding guest pages to region %u failed.\n",
-					i);
-				goto err_mmap;
-			}
-
-		RTE_LOG(INFO, VHOST_CONFIG,
-			"guest memory region %u, size: 0x%" PRIx64 "\n"
-			"\t guest physical addr: 0x%" PRIx64 "\n"
-			"\t guest virtual addr: 0x%" PRIx64 "\n"
-			"\t host virtual addr: 0x%" PRIx64 "\n"
-			"\t mmap addr : 0x%" PRIx64 "\n"
-			"\t mmap size : 0x%" PRIx64 "\n"
-			"\t mmap align: 0x%" PRIx64 "\n"
-			"\t mmap off : 0x%" PRIx64 "\n",
-			i, reg->size,
-			reg->guest_phys_addr,
-			reg->guest_user_addr,
-			reg->host_user_addr,
-			(uint64_t)(uintptr_t)mmap_addr,
-			mmap_size,
-			alignment,
-			mmap_offset);
-
-		if (dev->postcopy_listening) {
-			/*
-			 * We haven't a better way right now than sharing
-			 * DPDK's virtual address with Qemu, so that Qemu can
-			 * retrieve the region offset when handling userfaults.
-			 */
-			memory->regions[i].userspace_addr =
-				reg->host_user_addr;
-		}
 	}
 
-	if (dev->postcopy_listening) {
-		/* Send the addresses back to qemu */
-		msg->fd_num = 0;
-		send_vhost_reply(dev, msg);
-
-		/* Wait for qemu to acknolwedge it's got the addresses
-		 * we've got to wait before we're allowed to generate faults.
-		 */
-		VhostUserMsg ack_msg;
-		if (read_vhost_message(main_fd, &ack_msg) <= 0) {
-			RTE_LOG(ERR, VHOST_CONFIG,
-				"Failed to read qemu ack on postcopy set-mem-table\n");
-			goto err_mmap;
-		}
-		if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
-			RTE_LOG(ERR, VHOST_CONFIG,
-				"Bad qemu ack on postcopy set-mem-table (%d)\n",
-				ack_msg.request.master);
-			goto err_mmap;
-		}
-
-		/* Now userfault register and we can use the memory */
-		for (i = 0; i < memory->nregions; i++) {
-#ifdef RTE_LIBRTE_VHOST_POSTCOPY
-			reg = &dev->mem->regions[i];
-			struct uffdio_register reg_struct;
-			/*
-			 * Let's register all the mmap'ed area to ensure
-			 * alignment on page boundary.
-			 */
-			reg_struct.range.start =
-				(uint64_t)(uintptr_t)reg->mmap_addr;
-			reg_struct.range.len = reg->mmap_size;
-			reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
-
-			if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
-				  &reg_struct)) {
-				RTE_LOG(ERR, VHOST_CONFIG,
-					"Failed to register ufd for region %d: (ufd = %d) %s\n",
-					i, dev->postcopy_ufd,
-					strerror(errno));
-				goto err_mmap;
-			}
-			RTE_LOG(INFO, VHOST_CONFIG,
-				"\t userfaultfd registered for range : %llx - %llx\n",
-				reg_struct.range.start,
-				reg_struct.range.start +
-				reg_struct.range.len - 1);
-#else
-			goto err_mmap;
-#endif
-		}
-	}
+	if (dev->trans_ops->map_mem_regions(dev, msg) < 0)
+		goto err;
 
 	for (i = 0; i < dev->nr_vring; i++) {
 		struct vhost_virtqueue *vq = dev->virtqueue[i];
@@ -1103,7 +966,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 			dev = translate_ring_addresses(dev, i);
 			if (!dev) {
 				dev = *pdev;
-				goto err_mmap;
+				goto err;
 			}
 
 			*pdev = dev;
@@ -1114,7 +977,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 
 	return RTE_VHOST_MSG_RESULT_OK;
 
-err_mmap:
+err:
 	free_mem_region(dev);
 	rte_free(dev->mem);
 	dev->mem = NULL;
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 0169bd2..200e47b 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -147,6 +147,9 @@ typedef struct VhostUserMsg {
 
 /* vhost_user.c */
 int vhost_user_msg_handler(int vid, int fd, const struct VhostUserMsg *msg);
+int add_guest_pages(struct virtio_net *dev,
+		    struct rte_vhost_mem_region *reg,
+		    uint64_t page_size);
 int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
 
 #endif
-- 
2.7.4