* [PATCH v2] vhost: exclude VM hugepages from coredumps @ 2022-12-06 15:05 Mike Pattrick 2022-12-07 16:54 ` [PATCH v3] " Mike Pattrick 2023-02-03 17:45 ` [PATCH v2] vhost: exclude VM hugepages from coredumps Stephen Hemminger 0 siblings, 2 replies; 19+ messages in thread From: Mike Pattrick @ 2022-12-06 15:05 UTC (permalink / raw) To: Maxime Coquelin, Chenbo Xia; +Cc: dev, Mike Pattrick Currently if an application wants to include shared hugepages in coredumps in conjunction with the vhost library, the coredump will be larger than expected and include unneeded virtual machine memory. This patch will mark all vhost huge pages as DONTDUMP, except for some select pages used by DPDK. Signed-off-by: Mike Pattrick <mkp@redhat.com> --- v2: * Removed warning on unsupported platforms --- lib/vhost/iotlb.c | 5 +++++ lib/vhost/vhost.h | 12 ++++++++++++ lib/vhost/vhost_user.c | 10 ++++++++++ 3 files changed, 27 insertions(+) diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c index 6a729e8804..2f89f88817 100644 --- a/lib/vhost/iotlb.c +++ b/lib/vhost/iotlb.c @@ -149,6 +149,7 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) rte_rwlock_write_lock(&vq->iotlb_lock); RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { + mem_set_dump((void *)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); } @@ -170,6 +171,7 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { if (!entry_idx) { + mem_set_dump((void *)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; @@ -222,12 +224,14 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq vhost_user_iotlb_pool_put(vq, new_node); goto unlock; } else if (node->iova > new_node->iova) { + mem_set_dump((void *)node->uaddr, node->size, true); TAILQ_INSERT_BEFORE(node, new_node, next); vq->iotlb_cache_nr++; goto unlock; } } + mem_set_dump((void *)node->uaddr, node->size, true); TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); vq->iotlb_cache_nr++; @@ -255,6 +259,7 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, break; if (iova < node->iova + node->size) { + mem_set_dump((void *)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index ef211ed519..1f913803f6 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -13,6 +13,7 @@ #include <linux/virtio_net.h> #include <sys/socket.h> #include <linux/if.h> +#include <sys/mman.h> #include <rte_log.h> #include <rte_ether.h> @@ -987,4 +988,15 @@ mbuf_is_consumed(struct rte_mbuf *m) return true; } + +static __rte_always_inline void +mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) +{ +#ifdef MADV_DONTDUMP + if (madvise(ptr, size, enable ? 
MADV_DODUMP : MADV_DONTDUMP) == -1) { + rte_log(RTE_LOG_INFO, vhost_config_log_level, + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); + } +#endif +} #endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index 9902ae9944..8f33d5f4d9 100644 --- a/lib/vhost/vhost_user.c +++ b/lib/vhost/vhost_user.c @@ -793,6 +793,9 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->desc_packed, len, true); + mem_set_dump(vq->driver_event, len, true); + mem_set_dump(vq->device_event, len, true); vq->access_ok = true; return; } @@ -846,6 +849,9 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) "some packets maybe resent for Tx and dropped for Rx\n"); } + mem_set_dump(vq->desc, len, true); + mem_set_dump(vq->avail, len, true); + mem_set_dump(vq->used, len, true); vq->access_ok = true; VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc); @@ -1224,6 +1230,7 @@ vhost_user_mmap_region(struct virtio_net *dev, region->mmap_addr = mmap_addr; region->mmap_size = mmap_size; region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; + mem_set_dump(mmap_addr, mmap_size, false); if (dev->async_copy) { if (add_guest_pages(dev, region, alignment) < 0) { @@ -1528,6 +1535,7 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *f return NULL; } + mem_set_dump(ptr, size, false); *fd = mfd; return ptr; } @@ -1736,6 +1744,7 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, dev->inflight_info->fd = -1; } + mem_set_dump(addr, mmap_size, false); dev->inflight_info->fd = fd; dev->inflight_info->addr = addr; dev->inflight_info->size = mmap_size; @@ -2283,6 +2292,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, dev->log_addr = (uint64_t)(uintptr_t)addr; dev->log_base = dev->log_addr + off; dev->log_size = size; + mem_set_dump(addr, size, false); for (i = 0; i < dev->nr_vring; i++) { struct vhost_virtqueue *vq = dev->virtqueue[i]; -- 2.31.1 ^ permalink raw reply [flat|nested] 19+ messages in thread
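For reference, the mechanism mem_set_dump() relies on can be exercised outside of DPDK. Below is a minimal standalone sketch (not part of the patch; the anonymous 2 MiB mapping and sizes are illustrative only) of marking a whole mapping DONTDUMP and then re-including one page, which is the pattern the patch applies to guest memory and the vrings:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (base == MAP_FAILED)
		return 1;

	/* Exclude the whole mapping from coredumps, as done for guest memory. */
	if (madvise(base, len, MADV_DONTDUMP) == -1)
		fprintf(stderr, "MADV_DONTDUMP: %s\n", strerror(errno));

	/* Re-include a single 4 KiB page, as done for the pages DPDK needs. */
	if (madvise(base, 4096, MADV_DODUMP) == -1)
		fprintf(stderr, "MADV_DODUMP: %s\n", strerror(errno));

	munmap(base, len);
	return 0;
}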
* [PATCH v3] vhost: exclude VM hugepages from coredumps 2022-12-06 15:05 [PATCH v2] vhost: exclude VM hugepages from coredumps Mike Pattrick @ 2022-12-07 16:54 ` Mike Pattrick 2022-12-20 12:43 ` Maxime Coquelin ` (2 more replies) 2023-02-03 17:45 ` [PATCH v2] vhost: exclude VM hugepages from coredumps Stephen Hemminger 1 sibling, 3 replies; 19+ messages in thread From: Mike Pattrick @ 2022-12-07 16:54 UTC (permalink / raw) To: Maxime Coquelin, Chenbo Xia; +Cc: dev, Mike Pattrick Currently if an application wants to include shared hugepages in coredumps in conjunction with the vhost library, the coredump will be larger than expected and include unneeded virtual machine memory. This patch will mark all vhost huge pages as DONTDUMP, except for some select pages used by DPDK. Signed-off-by: Mike Pattrick <mkp@redhat.com> --- v2: * Removed warning on unsupported platforms v3: * Removed pointer warning on 32bit platforms --- lib/vhost/iotlb.c | 5 +++++ lib/vhost/vhost.h | 12 ++++++++++++ lib/vhost/vhost_user.c | 10 ++++++++++ 3 files changed, 27 insertions(+) diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c index 6a729e8804..a0b8fd7302 100644 --- a/lib/vhost/iotlb.c +++ b/lib/vhost/iotlb.c @@ -149,6 +149,7 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) rte_rwlock_write_lock(&vq->iotlb_lock); RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { + mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); } @@ -170,6 +171,7 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { if (!entry_idx) { + mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; @@ -222,12 +224,14 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq vhost_user_iotlb_pool_put(vq, new_node); goto unlock; } else if (node->iova > new_node->iova) { + mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_INSERT_BEFORE(node, new_node, next); vq->iotlb_cache_nr++; goto unlock; } } + mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); vq->iotlb_cache_nr++; @@ -255,6 +259,7 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, break; if (iova < node->iova + node->size) { + mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index ef211ed519..1f913803f6 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -13,6 +13,7 @@ #include <linux/virtio_net.h> #include <sys/socket.h> #include <linux/if.h> +#include <sys/mman.h> #include <rte_log.h> #include <rte_ether.h> @@ -987,4 +988,15 @@ mbuf_is_consumed(struct rte_mbuf *m) return true; } + +static __rte_always_inline void +mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) +{ +#ifdef MADV_DONTDUMP + if (madvise(ptr, size, enable ? 
MADV_DODUMP : MADV_DONTDUMP) == -1) { + rte_log(RTE_LOG_INFO, vhost_config_log_level, + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); + } +#endif +} #endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index 9902ae9944..8f33d5f4d9 100644 --- a/lib/vhost/vhost_user.c +++ b/lib/vhost/vhost_user.c @@ -793,6 +793,9 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->desc_packed, len, true); + mem_set_dump(vq->driver_event, len, true); + mem_set_dump(vq->device_event, len, true); vq->access_ok = true; return; } @@ -846,6 +849,9 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) "some packets maybe resent for Tx and dropped for Rx\n"); } + mem_set_dump(vq->desc, len, true); + mem_set_dump(vq->avail, len, true); + mem_set_dump(vq->used, len, true); vq->access_ok = true; VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc); @@ -1224,6 +1230,7 @@ vhost_user_mmap_region(struct virtio_net *dev, region->mmap_addr = mmap_addr; region->mmap_size = mmap_size; region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; + mem_set_dump(mmap_addr, mmap_size, false); if (dev->async_copy) { if (add_guest_pages(dev, region, alignment) < 0) { @@ -1528,6 +1535,7 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *f return NULL; } + mem_set_dump(ptr, size, false); *fd = mfd; return ptr; } @@ -1736,6 +1744,7 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, dev->inflight_info->fd = -1; } + mem_set_dump(addr, mmap_size, false); dev->inflight_info->fd = fd; dev->inflight_info->addr = addr; dev->inflight_info->size = mmap_size; @@ -2283,6 +2292,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, dev->log_addr = (uint64_t)(uintptr_t)addr; dev->log_base = dev->log_addr + off; dev->log_size = size; + mem_set_dump(addr, size, false); for (i = 0; i < dev->nr_vring; i++) { struct vhost_virtqueue *vq = dev->virtqueue[i]; -- 2.31.1 ^ permalink raw reply [flat|nested] 19+ messages in thread
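The only functional change from v2 is the intermediate (uintptr_t) cast on node->uaddr. A short sketch of why it is needed (the helper name here is hypothetical): uaddr is a uint64_t, and on 32-bit targets casting a 64-bit integer straight to a pointer triggers GCC's -Wint-to-pointer-cast ("cast to pointer from integer of different size"); narrowing through uintptr_t first makes the conversion explicit and warning-free.

#include <stdint.h>

/* Hypothetical helper illustrating the v3 change. */
static inline void *
uaddr_to_ptr(uint64_t uaddr)
{
	/* (void *)uaddr alone would warn on 32-bit builds. */
	return (void *)(uintptr_t)uaddr;
}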
* Re: [PATCH v3] vhost: exclude VM hugepages from coredumps 2022-12-07 16:54 ` [PATCH v3] " Mike Pattrick @ 2022-12-20 12:43 ` Maxime Coquelin 2023-02-03 14:50 ` Maxime Coquelin 2023-02-10 15:53 ` David Marchand 2 siblings, 0 replies; 19+ messages in thread From: Maxime Coquelin @ 2022-12-20 12:43 UTC (permalink / raw) To: Mike Pattrick, Chenbo Xia; +Cc: dev Hi Mike, On 12/7/22 17:54, Mike Pattrick wrote: > Currently if an application wants to include shared hugepages in > coredumps in conjunction with the vhost library, the coredump will be > larger than expected and include unneeded virtual machine memory. > > This patch will mark all vhost huge pages as DONTDUMP, except for some > select pages used by DPDK. > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > > --- > v2: > * Removed warning on unsupported platforms > > v3: > * Removed pointer warning on 32bit platforms > > --- > lib/vhost/iotlb.c | 5 +++++ > lib/vhost/vhost.h | 12 ++++++++++++ > lib/vhost/vhost_user.c | 10 ++++++++++ > 3 files changed, 27 insertions(+) > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Thanks, Maxime ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3] vhost: exclude VM hugepages from coredumps 2022-12-07 16:54 ` [PATCH v3] " Mike Pattrick 2022-12-20 12:43 ` Maxime Coquelin @ 2023-02-03 14:50 ` Maxime Coquelin 2023-02-10 15:53 ` David Marchand 2 siblings, 0 replies; 19+ messages in thread From: Maxime Coquelin @ 2023-02-03 14:50 UTC (permalink / raw) To: Mike Pattrick, Chenbo Xia; +Cc: dev On 12/7/22 17:54, Mike Pattrick wrote: > Currently if an application wants to include shared hugepages in > coredumps in conjunction with the vhost library, the coredump will be > larger than expected and include unneeded virtual machine memory. > > This patch will mark all vhost huge pages as DONTDUMP, except for some > select pages used by DPDK. > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > > --- > v2: > * Removed warning on unsupported platforms > > v3: > * Removed pointer warning on 32bit platforms > > --- > lib/vhost/iotlb.c | 5 +++++ > lib/vhost/vhost.h | 12 ++++++++++++ > lib/vhost/vhost_user.c | 10 ++++++++++ > 3 files changed, 27 insertions(+) > Applied to dpdk-next-virtio/main. Thanks, Maxime ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3] vhost: exclude VM hugepages from coredumps 2022-12-07 16:54 ` [PATCH v3] " Mike Pattrick 2022-12-20 12:43 ` Maxime Coquelin 2023-02-03 14:50 ` Maxime Coquelin @ 2023-02-10 15:53 ` David Marchand 2023-02-10 15:59 ` Maxime Coquelin 2023-02-10 21:12 ` Mike Pattrick 2 siblings, 2 replies; 19+ messages in thread From: David Marchand @ 2023-02-10 15:53 UTC (permalink / raw) To: Mike Pattrick; +Cc: Maxime Coquelin, Chenbo Xia, dev Hello Mike, On Wed, Dec 7, 2022 at 5:54 PM Mike Pattrick <mkp@redhat.com> wrote: > > Currently if an application wants to include shared hugepages in > coredumps in conjunction with the vhost library, the coredump will be > larger than expected and include unneeded virtual machine memory. > > This patch will mark all vhost huge pages as DONTDUMP, except for some > select pages used by DPDK. > > Signed-off-by: Mike Pattrick <mkp@redhat.com> I noticed the following warnings today on my f37 kernel, while running a vhost-user/virtio-user testpmd setup on next-virtio branch. Linux dmarchan 6.1.9-200.fc37.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Feb 2 00:21:48 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux My system has 2M hugepages, only. $ rm vhost-net; strace -e trace=madvise -f ./build-clang/app/dpdk-testpmd --in-memory --no-pci --vdev=net_vhost0,iface=./vhost-net,client=1 -- -i $ ./build-clang/app/dpdk-testpmd --in-memory --single-file-segment --no-pci --vdev 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1' -- -i Then, on the "vhost side" testpmd: ... VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE VHOST_CONFIG: (./vhost-net) vring base idx:0 last_used_idx:0 last_avail_idx:0. VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_ADDR VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_KICK VHOST_CONFIG: (./vhost-net) vring kick idx:0 file:391 [pid 59565] madvise(0x7fa6d8da4000, 2052, MADV_DODUMP) = -1 EINVAL (Invalid argument) VHOST_CONFIG: could not set coredump preference (Invalid argument). [pid 59565] madvise(0x7fa6d8da5000, 2052, MADV_DODUMP) = -1 EINVAL (Invalid argument) VHOST_CONFIG: could not set coredump preference (Invalid argument). [pid 59565] madvise(0x7fa6d8da6000, 2052, MADV_DODUMP) = -1 EINVAL (Invalid argument) VHOST_CONFIG: could not set coredump preference (Invalid argument). VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE Looking at the whole trace, only madvise calls with MADV_DODUMP (with all of them for a 2052 size) fail. I did not investigate further. Could you have a look please? -- David Marchand ^ permalink raw reply [flat|nested] 19+ messages in thread
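The EINVAL is consistent with madvise() being asked to change dump flags for a sub-range of a hugetlb mapping: updating flags on part of a VMA requires the kernel to split it, and a hugetlb VMA can only be split on huge-page boundaries, so a range that is merely 4 KiB-aligned is rejected. The sketch below illustrates that hypothesis; it is not code from the thread, and it assumes a free 2 MiB hugepage on the system:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t huge = 2 * 1024 * 1024;
	void *base = mmap(NULL, huge, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (base == MAP_FAILED)
		return 1;

	/* Step 1: the vhost library marks the whole region DONTDUMP. */
	madvise(base, huge, MADV_DONTDUMP);

	/* Step 2: re-enabling dumps for a 4 KiB-aligned subrange forces a
	 * VMA split at base + 4096, which hugetlb refuses: EINVAL,
	 * matching the trace above. */
	if (madvise((char *)base + 4096, 2052, MADV_DODUMP) == -1)
		printf("subrange: %s\n", strerror(errno));

	/* Widened to hugepage boundaries, the same request succeeds. */
	if (madvise(base, huge, MADV_DODUMP) == 0)
		printf("hugepage-aligned range: OK\n");

	munmap(base, huge);
	return 0;
}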
* Re: [PATCH v3] vhost: exclude VM hugepages from coredumps 2023-02-10 15:53 ` David Marchand @ 2023-02-10 15:59 ` Maxime Coquelin 2023-02-10 21:12 ` Mike Pattrick 1 sibling, 0 replies; 19+ messages in thread From: Maxime Coquelin @ 2023-02-10 15:59 UTC (permalink / raw) To: David Marchand, Mike Pattrick; +Cc: Chenbo Xia, dev Thanks David for the report. On 2/10/23 16:53, David Marchand wrote: > Hello Mike, > > On Wed, Dec 7, 2022 at 5:54 PM Mike Pattrick <mkp@redhat.com> wrote: >> >> Currently if an application wants to include shared hugepages in >> coredumps in conjunction with the vhost library, the coredump will be >> larger than expected and include unneeded virtual machine memory. >> >> This patch will mark all vhost huge pages as DONTDUMP, except for some >> select pages used by DPDK. >> >> Signed-off-by: Mike Pattrick <mkp@redhat.com> > > I noticed the following warnings today on my f37 kernel, while running > a vhost-user/virtio-user testpmd setup on next-virtio branch. > Linux dmarchan 6.1.9-200.fc37.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Feb 2 > 00:21:48 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux > My system has 2M hugepages, only. FYI, I don't see it on my system which is using 2MB hugepages too: Linux max-t490s 6.1.9-100.fc36.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Feb 2 03:27:13 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux Almost the same Kernel BTW. > > $ rm vhost-net; strace -e trace=madvise -f > ./build-clang/app/dpdk-testpmd --in-memory --no-pci > --vdev=net_vhost0,iface=./vhost-net,client=1 -- -i > > $ ./build-clang/app/dpdk-testpmd --in-memory --single-file-segment > --no-pci --vdev > 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1' -- > -i > > Then, on the "vhost side" testpmd: > ... > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE > VHOST_CONFIG: (./vhost-net) vring base idx:0 last_used_idx:0 last_avail_idx:0. > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_ADDR > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_KICK > VHOST_CONFIG: (./vhost-net) vring kick idx:0 file:391 > [pid 59565] madvise(0x7fa6d8da4000, 2052, MADV_DODUMP) = -1 EINVAL > (Invalid argument) > VHOST_CONFIG: could not set coredump preference (Invalid argument). > [pid 59565] madvise(0x7fa6d8da5000, 2052, MADV_DODUMP) = -1 EINVAL > (Invalid argument) > VHOST_CONFIG: could not set coredump preference (Invalid argument). > [pid 59565] madvise(0x7fa6d8da6000, 2052, MADV_DODUMP) = -1 EINVAL > (Invalid argument) > VHOST_CONFIG: could not set coredump preference (Invalid argument). > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE > > Looking at the whole trace, only madvise calls with MADV_DODUMP (with > all of them for a 2052 size) fail. > > I did not investigate further. > Could you have a look please? > > Maxime ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3] vhost: exclude VM hugepages from coredumps 2023-02-10 15:53 ` David Marchand 2023-02-10 15:59 ` Maxime Coquelin @ 2023-02-10 21:12 ` Mike Pattrick 2023-02-21 16:26 ` Maxime Coquelin 1 sibling, 1 reply; 19+ messages in thread From: Mike Pattrick @ 2023-02-10 21:12 UTC (permalink / raw) To: David Marchand; +Cc: Maxime Coquelin, Chenbo Xia, dev On Fri, Feb 10, 2023 at 10:53 AM David Marchand <david.marchand@redhat.com> wrote: > > Hello Mike, > > On Wed, Dec 7, 2022 at 5:54 PM Mike Pattrick <mkp@redhat.com> wrote: > > > > Currently if an application wants to include shared hugepages in > > coredumps in conjunction with the vhost library, the coredump will be > > larger than expected and include unneeded virtual machine memory. > > > > This patch will mark all vhost huge pages as DONTDUMP, except for some > > select pages used by DPDK. > > > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > > I noticed the following warnings today on my f37 kernel, while running > a vhost-user/virtio-user testpmd setup on next-virtio branch. > Linux dmarchan 6.1.9-200.fc37.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Feb 2 > 00:21:48 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux > My system has 2M hugepages, only. > > > $ rm vhost-net; strace -e trace=madvise -f > ./build-clang/app/dpdk-testpmd --in-memory --no-pci > --vdev=net_vhost0,iface=./vhost-net,client=1 -- -i > > $ ./build-clang/app/dpdk-testpmd --in-memory --single-file-segment > --no-pci --vdev > 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1' -- > -i > > Then, on the "vhost side" testpmd: > ... > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE > VHOST_CONFIG: (./vhost-net) vring base idx:0 last_used_idx:0 last_avail_idx:0. > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_ADDR > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_KICK > VHOST_CONFIG: (./vhost-net) vring kick idx:0 file:391 > [pid 59565] madvise(0x7fa6d8da4000, 2052, MADV_DODUMP) = -1 EINVAL > (Invalid argument) > VHOST_CONFIG: could not set coredump preference (Invalid argument). > [pid 59565] madvise(0x7fa6d8da5000, 2052, MADV_DODUMP) = -1 EINVAL > (Invalid argument) > VHOST_CONFIG: could not set coredump preference (Invalid argument). > [pid 59565] madvise(0x7fa6d8da6000, 2052, MADV_DODUMP) = -1 EINVAL > (Invalid argument) > VHOST_CONFIG: could not set coredump preference (Invalid argument). > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM > VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE > > Looking at the whole trace, only madvise calls with MADV_DODUMP (with > all of them for a 2052 size) fail. > > I did not investigate further. > Could you have a look please? > I tried it on that exact kernel and also ran into this issue. I'll check it out in more depth. -M > > -- > David Marchand > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3] vhost: exclude VM hugepages from coredumps 2023-02-10 21:12 ` Mike Pattrick @ 2023-02-21 16:26 ` Maxime Coquelin 2023-02-22 13:59 ` Mike Pattrick 2023-02-22 21:43 ` [PATCH] vhost: fix madvise arguments alignment Mike Pattrick 0 siblings, 2 replies; 19+ messages in thread From: Maxime Coquelin @ 2023-02-21 16:26 UTC (permalink / raw) To: Mike Pattrick, David Marchand; +Cc: Chenbo Xia, dev Hi Mike, On 2/10/23 22:12, Mike Pattrick wrote: > On Fri, Feb 10, 2023 at 10:53 AM David Marchand > <david.marchand@redhat.com> wrote: >> >> Hello Mike, >> >> On Wed, Dec 7, 2022 at 5:54 PM Mike Pattrick <mkp@redhat.com> wrote: >>> >>> Currently if an application wants to include shared hugepages in >>> coredumps in conjunction with the vhost library, the coredump will be >>> larger than expected and include unneeded virtual machine memory. >>> >>> This patch will mark all vhost huge pages as DONTDUMP, except for some >>> select pages used by DPDK. >>> >>> Signed-off-by: Mike Pattrick <mkp@redhat.com> >> >> I noticed the following warnings today on my f37 kernel, while running >> a vhost-user/virtio-user testpmd setup on next-virtio branch. >> Linux dmarchan 6.1.9-200.fc37.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Feb 2 >> 00:21:48 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux >> My system has 2M hugepages, only. >> >> >> $ rm vhost-net; strace -e trace=madvise -f >> ./build-clang/app/dpdk-testpmd --in-memory --no-pci >> --vdev=net_vhost0,iface=./vhost-net,client=1 -- -i >> >> $ ./build-clang/app/dpdk-testpmd --in-memory --single-file-segment >> --no-pci --vdev >> 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1' -- >> -i >> >> Then, on the "vhost side" testpmd: >> ... >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE >> VHOST_CONFIG: (./vhost-net) vring base idx:0 last_used_idx:0 last_avail_idx:0. >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_ADDR >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_KICK >> VHOST_CONFIG: (./vhost-net) vring kick idx:0 file:391 >> [pid 59565] madvise(0x7fa6d8da4000, 2052, MADV_DODUMP) = -1 EINVAL >> (Invalid argument) >> VHOST_CONFIG: could not set coredump preference (Invalid argument). >> [pid 59565] madvise(0x7fa6d8da5000, 2052, MADV_DODUMP) = -1 EINVAL >> (Invalid argument) >> VHOST_CONFIG: could not set coredump preference (Invalid argument). >> [pid 59565] madvise(0x7fa6d8da6000, 2052, MADV_DODUMP) = -1 EINVAL >> (Invalid argument) >> VHOST_CONFIG: could not set coredump preference (Invalid argument). >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE >> >> Looking at the whole trace, only madvise calls with MADV_DODUMP (with >> all of them for a 2052 size) fail. >> >> I did not investigate further. >> Could you have a look please? >> > > I tried it on that exact kernel and also ran into this issue. I'll > check it out in more depth. Gentle reminder, have you found the root cause for this issue? Thanks, Maxime > -M > >> >> -- >> David Marchand >> > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3] vhost: exclude VM hugepages from coredumps 2023-02-21 16:26 ` Maxime Coquelin @ 2023-02-22 13:59 ` Mike Pattrick 2023-02-22 21:43 ` [PATCH] vhost: fix madvise arguments alignment Mike Pattrick 1 sibling, 0 replies; 19+ messages in thread From: Mike Pattrick @ 2023-02-22 13:59 UTC (permalink / raw) To: Maxime Coquelin; +Cc: David Marchand, Chenbo Xia, dev On Tue, Feb 21, 2023 at 11:26 AM Maxime Coquelin <maxime.coquelin@redhat.com> wrote: > > Hi Mike, > > On 2/10/23 22:12, Mike Pattrick wrote: > > On Fri, Feb 10, 2023 at 10:53 AM David Marchand > > <david.marchand@redhat.com> wrote: > >> > >> Hello Mike, > >> > >> On Wed, Dec 7, 2022 at 5:54 PM Mike Pattrick <mkp@redhat.com> wrote: > >>> > >>> Currently if an application wants to include shared hugepages in > >>> coredumps in conjunction with the vhost library, the coredump will be > >>> larger than expected and include unneeded virtual machine memory. > >>> > >>> This patch will mark all vhost huge pages as DONTDUMP, except for some > >>> select pages used by DPDK. > >>> > >>> Signed-off-by: Mike Pattrick <mkp@redhat.com> > >> > >> I noticed the following warnings today on my f37 kernel, while running > >> a vhost-user/virtio-user testpmd setup on next-virtio branch. > >> Linux dmarchan 6.1.9-200.fc37.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Feb 2 > >> 00:21:48 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux > >> My system has 2M hugepages, only. > >> > >> > >> $ rm vhost-net; strace -e trace=madvise -f > >> ./build-clang/app/dpdk-testpmd --in-memory --no-pci > >> --vdev=net_vhost0,iface=./vhost-net,client=1 -- -i > >> > >> $ ./build-clang/app/dpdk-testpmd --in-memory --single-file-segment > >> --no-pci --vdev > >> 'net_virtio_user0,mac=00:01:02:03:04:05,path=./vhost-net,server=1' -- > >> -i > >> > >> Then, on the "vhost side" testpmd: > >> ... > >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM > >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE > >> VHOST_CONFIG: (./vhost-net) vring base idx:0 last_used_idx:0 last_avail_idx:0. > >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_ADDR > >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_KICK > >> VHOST_CONFIG: (./vhost-net) vring kick idx:0 file:391 > >> [pid 59565] madvise(0x7fa6d8da4000, 2052, MADV_DODUMP) = -1 EINVAL > >> (Invalid argument) > >> VHOST_CONFIG: could not set coredump preference (Invalid argument). > >> [pid 59565] madvise(0x7fa6d8da5000, 2052, MADV_DODUMP) = -1 EINVAL > >> (Invalid argument) > >> VHOST_CONFIG: could not set coredump preference (Invalid argument). > >> [pid 59565] madvise(0x7fa6d8da6000, 2052, MADV_DODUMP) = -1 EINVAL > >> (Invalid argument) > >> VHOST_CONFIG: could not set coredump preference (Invalid argument). > >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_NUM > >> VHOST_CONFIG: (./vhost-net) read message VHOST_USER_SET_VRING_BASE > >> > >> Looking at the whole trace, only madvise calls with MADV_DODUMP (with > >> all of them for a 2052 size) fail. > >> > >> I did not investigate further. > >> Could you have a look please? > >> > > > > I tried it on that exact kernel and also ran into this issue. I'll > > check it out in more depth. > > Gentle reminder, have you found the root cause for this issue? Yes, thanks for the ping. I should have a patch soon. Cheers, M > > Thanks, > Maxime > > > -M > > > >> > >> -- > >> David Marchand > >> > > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH] vhost: fix madvise arguments alignment 2023-02-21 16:26 ` Maxime Coquelin 2023-02-22 13:59 ` Mike Pattrick @ 2023-02-22 21:43 ` Mike Pattrick 2023-02-23 4:35 ` [PATCH v2] " Mike Pattrick 1 sibling, 1 reply; 19+ messages in thread From: Mike Pattrick @ 2023-02-22 21:43 UTC (permalink / raw) To: dev; +Cc: david.marchand, maxime.coquelin, chenbo.xia, Mike Pattrick The arguments passed to madvise should be aligned to the alignment of the backing memory. Now we keep track of each region's alignment and use it when setting coredump preferences. To facilitate this, a new member was added to rte_vhost_mem_region. A new function was added to easily translate a memory address back to its region's alignment. Unneeded calls to madvise were reduced, as the cache removal case should already be covered by the cache insertion case. The previously inline function mem_set_dump was removed from a header file and made non-inline. Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps") Signed-off-by: Mike Pattrick <mkp@redhat.com> --- lib/vhost/iotlb.c | 9 +++--- lib/vhost/rte_vhost.h | 1 + lib/vhost/vhost.h | 12 ++------ lib/vhost/vhost_user.c | 63 +++++++++++++++++++++++++++++++++++------- 4 files changed, 60 insertions(+), 25 deletions(-) diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c index a0b8fd7302..aa5ba3ba6d 100644 --- a/lib/vhost/iotlb.c +++ b/lib/vhost/iotlb.c @@ -149,7 +149,6 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) rte_rwlock_write_lock(&vq->iotlb_lock); RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); } @@ -171,7 +170,6 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { if (!entry_idx) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; @@ -224,14 +222,16 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq vhost_user_iotlb_pool_put(vq, new_node); goto unlock; } else if (node->iova > new_node->iova) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, + hua_to_alignment(dev->mem, (void *)node->uaddr)); TAILQ_INSERT_BEFORE(node, new_node, next); vq->iotlb_cache_nr++; goto unlock; } } - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, + hua_to_alignment(dev->mem, (void *)(uintptr_t)new_node->uaddr)); TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); vq->iotlb_cache_nr++; @@ -259,7 +259,6 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, break; if (iova < node->iova + node->size) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index a395843fe9..c5c97ea67e 100644 --- a/lib/vhost/rte_vhost.h +++ b/lib/vhost/rte_vhost.h @@ -136,6 +136,7 @@ struct rte_vhost_mem_region { void *mmap_addr; uint64_t mmap_size; int fd; + uint64_t alignment; }; /** diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index 5750f0c005..a2467ba509 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -1009,14 +1009,6 @@
mbuf_is_consumed(struct rte_mbuf *m) return true; } -static __rte_always_inline void -mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) -{ -#ifdef MADV_DONTDUMP - if (madvise(ptr, size, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { - rte_log(RTE_LOG_INFO, vhost_config_log_level, - "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); - } -#endif -} +uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr); +void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment); #endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index d702d082dd..6d09597fbe 100644 --- a/lib/vhost/vhost_user.c +++ b/lib/vhost/vhost_user.c @@ -737,6 +737,40 @@ log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) return log_gpa; } +uint64_t +hua_to_alignment(struct rte_vhost_memory *mem, void *ptr) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + uintptr_t hua = (uintptr_t)ptr; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (hua >= r->host_user_addr && + hua < r->host_user_addr + r->size) { + return r->alignment; + } + } + + /* If region isn't found, don't align at all */ + return 1; +} + +void +mem_set_dump(void *ptr, size_t size, bool enable, uint64_t pagesz) +{ +#ifdef MADV_DONTDUMP + void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz); + uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz); + size_t len = end - (uintptr_t)start; + + if (madvise(start, len, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { + rte_log(RTE_LOG_INFO, vhost_config_log_level, + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); + } +#endif +} + static void translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) { @@ -767,6 +801,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->desc_packed, len, true, + hua_to_alignment(dev->mem, vq->desc_packed)); numa_realloc(&dev, &vq); *pdev = dev; *pvq = vq; @@ -782,6 +818,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->driver_event, len, true, + hua_to_alignment(dev->mem, vq->driver_event)); len = sizeof(struct vring_packed_desc_event); vq->device_event = (struct vring_packed_desc_event *) (uintptr_t)ring_addr_to_vva(dev, @@ -793,9 +831,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } - mem_set_dump(vq->desc_packed, len, true); - mem_set_dump(vq->driver_event, len, true); - mem_set_dump(vq->device_event, len, true); + mem_set_dump(vq->device_event, len, true, + hua_to_alignment(dev->mem, vq->device_event)); vq->access_ok = true; return; } @@ -812,6 +849,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc)); numa_realloc(&dev, &vq); *pdev = dev; *pvq = vq; @@ -827,6 +865,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail)); len = sizeof(struct vring_used) + sizeof(struct vring_used_elem) * vq->size; if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) @@ -839,6 +878,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->used, len, true, hua_to_alignment(dev->mem, vq->used)); + if (vq->last_used_idx != vq->used->idx) 
{ VHOST_LOG_CONFIG(dev->ifname, WARNING, "last_used_idx (%u) and vq->used->idx (%u) mismatches;\n", @@ -849,9 +890,6 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) "some packets maybe resent for Tx and dropped for Rx\n"); } - mem_set_dump(vq->desc, len, true); - mem_set_dump(vq->avail, len, true); - mem_set_dump(vq->used, len, true); vq->access_ok = true; VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc); @@ -1230,7 +1268,8 @@ vhost_user_mmap_region(struct virtio_net *dev, region->mmap_addr = mmap_addr; region->mmap_size = mmap_size; region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; - mem_set_dump(mmap_addr, mmap_size, false); + region->alignment = alignment; + mem_set_dump(mmap_addr, mmap_size, false, alignment); if (dev->async_copy) { if (add_guest_pages(dev, region, alignment) < 0) { @@ -1535,7 +1574,6 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *f return NULL; } - mem_set_dump(ptr, size, false); *fd = mfd; return ptr; } @@ -1566,6 +1604,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, uint64_t pervq_inflight_size, mmap_size; uint16_t num_queues, queue_size; struct virtio_net *dev = *pdev; + uint64_t alignment; int fd, i, j; int numa_node = SOCKET_ID_ANY; void *addr; @@ -1628,6 +1667,8 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, dev->inflight_info->fd = -1; } + alignment = get_blk_size(fd); + mem_set_dump(addr, mmap_size, false, alignment); dev->inflight_info->addr = addr; dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size; dev->inflight_info->fd = ctx->fds[0] = fd; @@ -1744,10 +1785,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, dev->inflight_info->fd = -1; } - mem_set_dump(addr, mmap_size, false); dev->inflight_info->fd = fd; dev->inflight_info->addr = addr; dev->inflight_info->size = mmap_size; + mem_set_dump(addr, mmap_size, false, get_blk_size(fd)); for (i = 0; i < num_queues; i++) { vq = dev->virtqueue[i]; @@ -2242,6 +2283,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct virtio_net *dev = *pdev; int fd = ctx->fds[0]; uint64_t size, off; + uint64_t alignment; void *addr; uint32_t i; @@ -2280,6 +2322,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, * fail when offset is not page size aligned. */ addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + alignment = get_blk_size(fd); close(fd); if (addr == MAP_FAILED) { VHOST_LOG_CONFIG(dev->ifname, ERR, "mmap log base failed!\n"); @@ -2296,7 +2339,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, dev->log_addr = (uint64_t)(uintptr_t)addr; dev->log_base = dev->log_addr + off; dev->log_size = size; - mem_set_dump(addr, size, false); + mem_set_dump(addr, size + off, false, alignment); for (i = 0; i < dev->nr_vring; i++) { struct vhost_virtqueue *vq = dev->virtqueue[i]; -- 2.31.1 ^ permalink raw reply [flat|nested] 19+ messages in thread
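Plugging the numbers from David's trace into the new floor/ceil logic shows the effect of the fix, assuming the vring lives in a 2 MiB hugepage (standalone arithmetic sketch; the address is taken from the failing trace and is illustrative):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t ptr = 0x7fa6d8da4000ULL;   /* 4 KiB-aligned vring address */
	uint64_t size = 2052;               /* length of the failing call */
	uint64_t pagesz = 2 * 1024 * 1024;  /* backing hugepage size */

	uint64_t start = ptr & ~(pagesz - 1);                     /* RTE_PTR_ALIGN_FLOOR */
	uint64_t end = (ptr + size + pagesz - 1) & ~(pagesz - 1); /* RTE_ALIGN_CEIL */

	/* Prints: madvise(0x7fa6d8c00000, 2097152): both hugepage-aligned. */
	printf("madvise(0x%" PRIx64 ", %" PRIu64 ")\n", start, end - start);
	return 0;
}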
* [PATCH v2] vhost: fix madvise arguments alignment 2023-02-22 21:43 ` [PATCH] vhost: fix madvise arguments alignment Mike Pattrick @ 2023-02-23 4:35 ` Mike Pattrick 2023-02-23 16:12 ` Maxime Coquelin 2023-03-01 20:02 ` [PATCH v3] " Mike Pattrick 0 siblings, 2 replies; 19+ messages in thread From: Mike Pattrick @ 2023-02-23 4:35 UTC (permalink / raw) To: dev; +Cc: david.marchand, maxime.coquelin, chenbo.xia, Mike Pattrick The arguments passed to madvise should be aligned to the alignment of the backing memory. Now we keep track of each region's alignment and use it when setting coredump preferences. To facilitate this, a new member was added to rte_vhost_mem_region. A new function was added to easily translate a memory address back to its region's alignment. Unneeded calls to madvise were reduced, as the cache removal case should already be covered by the cache insertion case. The previously inline function mem_set_dump was removed from a header file and made non-inline. Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps") Signed-off-by: Mike Pattrick <mkp@redhat.com> --- Since v1: - Corrected a cast for 32bit compiles --- lib/vhost/iotlb.c | 9 +++--- lib/vhost/rte_vhost.h | 1 + lib/vhost/vhost.h | 12 ++------ lib/vhost/vhost_user.c | 63 +++++++++++++++++++++++++++++++++++------- 4 files changed, 60 insertions(+), 25 deletions(-) diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c index a0b8fd7302..5293507b63 100644 --- a/lib/vhost/iotlb.c +++ b/lib/vhost/iotlb.c @@ -149,7 +149,6 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) rte_rwlock_write_lock(&vq->iotlb_lock); RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); } @@ -171,7 +170,6 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { if (!entry_idx) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; @@ -224,14 +222,16 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq vhost_user_iotlb_pool_put(vq, new_node); goto unlock; } else if (node->iova > new_node->iova) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, + hua_to_alignment(dev->mem, (void *)(uintptr_t)node->uaddr)); TAILQ_INSERT_BEFORE(node, new_node, next); vq->iotlb_cache_nr++; goto unlock; } } - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, + hua_to_alignment(dev->mem, (void *)(uintptr_t)new_node->uaddr)); TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); vq->iotlb_cache_nr++; @@ -259,7 +259,6 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, break; if (iova < node->iova + node->size) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h index a395843fe9..c5c97ea67e 100644 --- a/lib/vhost/rte_vhost.h +++ b/lib/vhost/rte_vhost.h @@ -136,6 +136,7 @@ struct rte_vhost_mem_region { void *mmap_addr; uint64_t mmap_size; int fd; + uint64_t alignment; }; /** diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index
5750f0c005..a2467ba509 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -1009,14 +1009,6 @@ mbuf_is_consumed(struct rte_mbuf *m) return true; } -static __rte_always_inline void -mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) -{ -#ifdef MADV_DONTDUMP - if (madvise(ptr, size, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { - rte_log(RTE_LOG_INFO, vhost_config_log_level, - "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); - } -#endif -} +uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr); +void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment); #endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index d702d082dd..6d09597fbe 100644 --- a/lib/vhost/vhost_user.c +++ b/lib/vhost/vhost_user.c @@ -737,6 +737,40 @@ log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) return log_gpa; } +uint64_t +hua_to_alignment(struct rte_vhost_memory *mem, void *ptr) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + uintptr_t hua = (uintptr_t)ptr; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (hua >= r->host_user_addr && + hua < r->host_user_addr + r->size) { + return r->alignment; + } + } + + /* If region isn't found, don't align at all */ + return 1; +} + +void +mem_set_dump(void *ptr, size_t size, bool enable, uint64_t pagesz) +{ +#ifdef MADV_DONTDUMP + void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz); + uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz); + size_t len = end - (uintptr_t)start; + + if (madvise(start, len, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { + rte_log(RTE_LOG_INFO, vhost_config_log_level, + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); + } +#endif +} + static void translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) { @@ -767,6 +801,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->desc_packed, len, true, + hua_to_alignment(dev->mem, vq->desc_packed)); numa_realloc(&dev, &vq); *pdev = dev; *pvq = vq; @@ -782,6 +818,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->driver_event, len, true, + hua_to_alignment(dev->mem, vq->driver_event)); len = sizeof(struct vring_packed_desc_event); vq->device_event = (struct vring_packed_desc_event *) (uintptr_t)ring_addr_to_vva(dev, @@ -793,9 +831,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } - mem_set_dump(vq->desc_packed, len, true); - mem_set_dump(vq->driver_event, len, true); - mem_set_dump(vq->device_event, len, true); + mem_set_dump(vq->device_event, len, true, + hua_to_alignment(dev->mem, vq->device_event)); vq->access_ok = true; return; } @@ -812,6 +849,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc)); numa_realloc(&dev, &vq); *pdev = dev; *pvq = vq; @@ -827,6 +865,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail)); len = sizeof(struct vring_used) + sizeof(struct vring_used_elem) * vq->size; if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) @@ -839,6 +878,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + 
mem_set_dump(vq->used, len, true, hua_to_alignment(dev->mem, vq->used)); + if (vq->last_used_idx != vq->used->idx) { VHOST_LOG_CONFIG(dev->ifname, WARNING, "last_used_idx (%u) and vq->used->idx (%u) mismatches;\n", @@ -849,9 +890,6 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) "some packets maybe resent for Tx and dropped for Rx\n"); } - mem_set_dump(vq->desc, len, true); - mem_set_dump(vq->avail, len, true); - mem_set_dump(vq->used, len, true); vq->access_ok = true; VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc); @@ -1230,7 +1268,8 @@ vhost_user_mmap_region(struct virtio_net *dev, region->mmap_addr = mmap_addr; region->mmap_size = mmap_size; region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; - mem_set_dump(mmap_addr, mmap_size, false); + region->alignment = alignment; + mem_set_dump(mmap_addr, mmap_size, false, alignment); if (dev->async_copy) { if (add_guest_pages(dev, region, alignment) < 0) { @@ -1535,7 +1574,6 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *f return NULL; } - mem_set_dump(ptr, size, false); *fd = mfd; return ptr; } @@ -1566,6 +1604,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, uint64_t pervq_inflight_size, mmap_size; uint16_t num_queues, queue_size; struct virtio_net *dev = *pdev; + uint64_t alignment; int fd, i, j; int numa_node = SOCKET_ID_ANY; void *addr; @@ -1628,6 +1667,8 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, dev->inflight_info->fd = -1; } + alignment = get_blk_size(fd); + mem_set_dump(addr, mmap_size, false, alignment); dev->inflight_info->addr = addr; dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size; dev->inflight_info->fd = ctx->fds[0] = fd; @@ -1744,10 +1785,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, dev->inflight_info->fd = -1; } - mem_set_dump(addr, mmap_size, false); dev->inflight_info->fd = fd; dev->inflight_info->addr = addr; dev->inflight_info->size = mmap_size; + mem_set_dump(addr, mmap_size, false, get_blk_size(fd)); for (i = 0; i < num_queues; i++) { vq = dev->virtqueue[i]; @@ -2242,6 +2283,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct virtio_net *dev = *pdev; int fd = ctx->fds[0]; uint64_t size, off; + uint64_t alignment; void *addr; uint32_t i; @@ -2280,6 +2322,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, * fail when offset is not page size aligned. */ addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + alignment = get_blk_size(fd); close(fd); if (addr == MAP_FAILED) { VHOST_LOG_CONFIG(dev->ifname, ERR, "mmap log base failed!\n"); @@ -2296,7 +2339,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, dev->log_addr = (uint64_t)(uintptr_t)addr; dev->log_base = dev->log_addr + off; dev->log_size = size; - mem_set_dump(addr, size, false); + mem_set_dump(addr, size + off, false, alignment); for (i = 0; i < dev->nr_vring; i++) { struct vhost_virtqueue *vq = dev->virtqueue[i]; -- 2.31.1 ^ permalink raw reply [flat|nested] 19+ messages in thread
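The alignment this version feeds to mem_set_dump() comes from get_blk_size(), a helper that already exists in vhost_user.c. The idea, sketched below under a hypothetical name: fstat() on a hugetlbfs file or hugepage-backed memfd reports the huge page size in st_blksize, so the still-open fd is enough to recover the alignment the madvise() range must honor.

#include <stdint.h>
#include <sys/stat.h>

/* Hypothetical sketch of the get_blk_size() idea. */
static uint64_t
fd_page_alignment(int fd)
{
	struct stat st;

	if (fstat(fd, &st) == -1)
		return (uint64_t)-1; /* caller must check */

	return (uint64_t)st.st_blksize; /* e.g. 0x200000 for 2M hugepages */
}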
* Re: [PATCH v2] vhost: fix madvise arguments alignment 2023-02-23 4:35 ` [PATCH v2] " Mike Pattrick @ 2023-02-23 16:12 ` Maxime Coquelin 2023-02-23 16:57 ` Mike Pattrick 2023-03-01 20:02 ` [PATCH v3] " Mike Pattrick 1 sibling, 1 reply; 19+ messages in thread From: Maxime Coquelin @ 2023-02-23 16:12 UTC (permalink / raw) To: Mike Pattrick, dev; +Cc: david.marchand, chenbo.xia Hi Mike, Thanks for looking into this issue. On 2/23/23 05:35, Mike Pattrick wrote: > The arguments passed to madvise should be aligned to the alignment of > the backing memory. Now we keep track of each region's alignment and use > it when setting coredump preferences. To facilitate this, a new member > was added to rte_vhost_mem_region. A new function was added to easily > translate a memory address back to its region's alignment. Unneeded calls to > madvise were reduced, as the cache removal case should already be > covered by the cache insertion case. The previously inline function > mem_set_dump was removed from a header file and made non-inline. > > Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps") > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > --- > Since v1: > - Corrected a cast for 32bit compiles > --- > lib/vhost/iotlb.c | 9 +++--- > lib/vhost/rte_vhost.h | 1 + > lib/vhost/vhost.h | 12 ++------ > lib/vhost/vhost_user.c | 63 +++++++++++++++++++++++++++++++++++------- > 4 files changed, 60 insertions(+), 25 deletions(-) > > diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c > index a0b8fd7302..5293507b63 100644 > --- a/lib/vhost/iotlb.c > +++ b/lib/vhost/iotlb.c > @@ -149,7 +149,6 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) > rte_rwlock_write_lock(&vq->iotlb_lock); > > RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); Hmm, it should have been called with enable=false here since we are removing the entry from the IOTLB cache. It should be kept in order to "DONTDUMP" pages evicted from the cache. > TAILQ_REMOVE(&vq->iotlb_list, node, next); > vhost_user_iotlb_pool_put(vq, node); > } > @@ -171,7 +170,6 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) > > RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { > if (!entry_idx) { > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); Same here.
> TAILQ_REMOVE(&vq->iotlb_list, node, next); > vhost_user_iotlb_pool_put(vq, node); > vq->iotlb_cache_nr--; > @@ -224,14 +222,16 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq > vhost_user_iotlb_pool_put(vq, new_node); > goto unlock; > } else if (node->iova > new_node->iova) { > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, > + hua_to_alignment(dev->mem, (void *)(uintptr_t)node->uaddr)); > TAILQ_INSERT_BEFORE(node, new_node, next); > vq->iotlb_cache_nr++; > goto unlock; > } > } > > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, > + hua_to_alignment(dev->mem, (void *)(uintptr_t)new_node->uaddr)); > TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); > vq->iotlb_cache_nr++; > > @@ -259,7 +259,6 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, > break; > > if (iova < node->iova + node->size) { > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > TAILQ_REMOVE(&vq->iotlb_list, node, next); > vhost_user_iotlb_pool_put(vq, node); > vq->iotlb_cache_nr--; > diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h > index a395843fe9..c5c97ea67e 100644 > --- a/lib/vhost/rte_vhost.h > +++ b/lib/vhost/rte_vhost.h > @@ -136,6 +136,7 @@ struct rte_vhost_mem_region { > void *mmap_addr; > uint64_t mmap_size; > int fd; > + uint64_t alignment; It is not possible to do this, as it breaks the ABI. You have to store the information somewhere else, or simply call get_blk_size() in hua_to_alignment() since the fd is not closed. > }; > > /** > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h > index 5750f0c005..a2467ba509 100644 > --- a/lib/vhost/vhost.h > +++ b/lib/vhost/vhost.h > @@ -1009,14 +1009,6 @@ mbuf_is_consumed(struct rte_mbuf *m) > return true; > } > > -static __rte_always_inline void > -mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) > -{ > -#ifdef MADV_DONTDUMP > - if (madvise(ptr, size, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { > - rte_log(RTE_LOG_INFO, vhost_config_log_level, > - "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); > - } > -#endif > -} > +uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr); > +void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment); > #endif /* _VHOST_NET_CDEV_H_ */ > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c > index d702d082dd..6d09597fbe 100644 > --- a/lib/vhost/vhost_user.c > +++ b/lib/vhost/vhost_user.c > @@ -737,6 +737,40 @@ log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) > return log_gpa; > } > > +uint64_t > +hua_to_alignment(struct rte_vhost_memory *mem, void *ptr) > +{ > + struct rte_vhost_mem_region *r; > + uint32_t i; > + uintptr_t hua = (uintptr_t)ptr; > + > + for (i = 0; i < mem->nregions; i++) { > + r = &mem->regions[i]; > + if (hua >= r->host_user_addr && > + hua < r->host_user_addr + r->size) { > + return r->alignment; > + } > + } > + > + /* If region isn't found, don't align at all */ > + return 1; > +} > + > +void > +mem_set_dump(void *ptr, size_t size, bool enable, uint64_t pagesz) > +{ > +#ifdef MADV_DONTDUMP > + void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz); > + uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz); > + size_t len = end - (uintptr_t)start; > + > + if (madvise(start, len, enable ?
MADV_DODUMP : MADV_DONTDUMP) == -1) { > + rte_log(RTE_LOG_INFO, vhost_config_log_level, > + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); > + } > +#endif > +} > + > static void > translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > { > @@ -767,6 +801,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > return; > } > > + mem_set_dump(vq->desc_packed, len, true, > + hua_to_alignment(dev->mem, vq->desc_packed)); > numa_realloc(&dev, &vq); > *pdev = dev; > *pvq = vq; > @@ -782,6 +818,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > return; > } > > + mem_set_dump(vq->driver_event, len, true, > + hua_to_alignment(dev->mem, vq->driver_event)); > len = sizeof(struct vring_packed_desc_event); > vq->device_event = (struct vring_packed_desc_event *) > (uintptr_t)ring_addr_to_vva(dev, > @@ -793,9 +831,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > return; > } > > - mem_set_dump(vq->desc_packed, len, true); > - mem_set_dump(vq->driver_event, len, true); > - mem_set_dump(vq->device_event, len, true); > + mem_set_dump(vq->device_event, len, true, > + hua_to_alignment(dev->mem, vq->device_event)); > vq->access_ok = true; > return; > } > @@ -812,6 +849,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > return; > } > > + mem_set_dump(vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc)); > numa_realloc(&dev, &vq); > *pdev = dev; > *pvq = vq; > @@ -827,6 +865,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > return; > } > > + mem_set_dump(vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail)); > len = sizeof(struct vring_used) + > sizeof(struct vring_used_elem) * vq->size; > if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) > @@ -839,6 +878,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > return; > } > > + mem_set_dump(vq->used, len, true, hua_to_alignment(dev->mem, vq->used)); > + > if (vq->last_used_idx != vq->used->idx) { > VHOST_LOG_CONFIG(dev->ifname, WARNING, > "last_used_idx (%u) and vq->used->idx (%u) mismatches;\n", > @@ -849,9 +890,6 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > "some packets maybe resent for Tx and dropped for Rx\n"); > } > > - mem_set_dump(vq->desc, len, true); > - mem_set_dump(vq->avail, len, true); > - mem_set_dump(vq->used, len, true); > vq->access_ok = true; > > VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc); > @@ -1230,7 +1268,8 @@ vhost_user_mmap_region(struct virtio_net *dev, > region->mmap_addr = mmap_addr; > region->mmap_size = mmap_size; > region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; > - mem_set_dump(mmap_addr, mmap_size, false); > + region->alignment = alignment; > + mem_set_dump(mmap_addr, mmap_size, false, alignment); > > if (dev->async_copy) { > if (add_guest_pages(dev, region, alignment) < 0) { > @@ -1535,7 +1574,6 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *f > return NULL; > } > > - mem_set_dump(ptr, size, false); > *fd = mfd; > return ptr; > } > @@ -1566,6 +1604,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, > uint64_t pervq_inflight_size, mmap_size; > uint16_t num_queues, queue_size; > struct virtio_net *dev = *pdev; > + uint64_t alignment; > int fd, i, j; > int numa_node = SOCKET_ID_ANY; > void *addr; > @@ -1628,6 
+1667,8 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, > dev->inflight_info->fd = -1; > } > > + alignment = get_blk_size(fd); > + mem_set_dump(addr, mmap_size, false, alignment); > dev->inflight_info->addr = addr; > dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size; > dev->inflight_info->fd = ctx->fds[0] = fd; > @@ -1744,10 +1785,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, > dev->inflight_info->fd = -1; > } > > - mem_set_dump(addr, mmap_size, false); > dev->inflight_info->fd = fd; > dev->inflight_info->addr = addr; > dev->inflight_info->size = mmap_size; > + mem_set_dump(addr, mmap_size, false, get_blk_size(fd)); > > for (i = 0; i < num_queues; i++) { > vq = dev->virtqueue[i]; > @@ -2242,6 +2283,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > struct virtio_net *dev = *pdev; > int fd = ctx->fds[0]; > uint64_t size, off; > + uint64_t alignment; > void *addr; > uint32_t i; > > @@ -2280,6 +2322,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > * fail when offset is not page size aligned. > */ > addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); > + alignment = get_blk_size(fd); > close(fd); > if (addr == MAP_FAILED) { > VHOST_LOG_CONFIG(dev->ifname, ERR, "mmap log base failed!\n"); > @@ -2296,7 +2339,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > dev->log_addr = (uint64_t)(uintptr_t)addr; > dev->log_base = dev->log_addr + off; > dev->log_size = size; > - mem_set_dump(addr, size, false); > + mem_set_dump(addr, size + off, false, alignment); > > for (i = 0; i < dev->nr_vring; i++) { > struct vhost_virtqueue *vq = dev->virtqueue[i]; ^ permalink raw reply [flat|nested] 19+ messages in thread
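For reference, the constraint this fix is built around, restating the premise of the thread: the kernel can reject madvise() on a range whose start is not aligned to the page size of the backing mapping, and for hugetlbfs-backed guest memory that page size is the hugepage size, not 4 KiB. Below is a minimal standalone sketch of the rounding the patch performs, in plain C rather than DPDK's RTE_PTR_ALIGN_FLOOR/RTE_ALIGN_CEIL macros; the function name is illustrative and pagesz is assumed to be a power of two.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

/* Round the range down/up to the page size of the backing memory before
 * asking the kernel to include it in (or exclude it from) coredumps. */
static void
set_dump_sketch(void *ptr, size_t size, int enable, uint64_t pagesz)
{
#ifdef MADV_DONTDUMP
	uintptr_t start = (uintptr_t)ptr & ~(pagesz - 1);
	uintptr_t end = ((uintptr_t)ptr + size + pagesz - 1) & ~(pagesz - 1);

	if (madvise((void *)start, end - start,
			enable ? MADV_DODUMP : MADV_DONTDUMP) == -1)
		fprintf(stderr, "could not set coredump preference (%s)\n",
				strerror(errno));
#endif
}

Rounding the start down is the part the v2 code was missing; extending the length to the rounded-up end keeps the tail of the range covered once the start moves down.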
* Re: [PATCH v2] vhost: fix madvise arguments alignment 2023-02-23 16:12 ` Maxime Coquelin @ 2023-02-23 16:57 ` Mike Pattrick 2023-02-24 15:05 ` Patrick Robb 0 siblings, 1 reply; 19+ messages in thread From: Mike Pattrick @ 2023-02-23 16:57 UTC (permalink / raw) To: Maxime Coquelin; +Cc: dev, david.marchand, chenbo.xia On Thu, Feb 23, 2023 at 11:12 AM Maxime Coquelin <maxime.coquelin@redhat.com> wrote: > > Hi Mike, > > Thanks for looking into this issue. > > On 2/23/23 05:35, Mike Pattrick wrote: > > The arguments passed to madvise should be aligned to the alignment of > > the backing memory. Now we keep track of each region's alignment and use > > it when setting coredump preferences. To facilitate this, a new member > > was added to rte_vhost_mem_region. A new function was added to easily > > translate a memory address back to its region's alignment. Unneeded calls to > > madvise were reduced, as the cache removal case should already be > > covered by the cache insertion case. The previously inline function > > mem_set_dump was removed from a header file and made non-inline. > > > > Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps") > > > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > > --- > > Since v1: > > - Corrected a cast for 32bit compiles > > --- > > lib/vhost/iotlb.c | 9 +++--- > > lib/vhost/rte_vhost.h | 1 + > > lib/vhost/vhost.h | 12 ++------ > > lib/vhost/vhost_user.c | 63 +++++++++++++++++++++++++++++++++++------- > > 4 files changed, 60 insertions(+), 25 deletions(-) > > > > diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c > > index a0b8fd7302..5293507b63 100644 > > --- a/lib/vhost/iotlb.c > > +++ b/lib/vhost/iotlb.c > > @@ -149,7 +149,6 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) > > rte_rwlock_write_lock(&vq->iotlb_lock); > > > > RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { > > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > > Hmm, it should have been called with enable=false here since we are > removing the entry from the IOTLB cache. It should be kept in order to > "DONTDUMP" pages evicted from the cache. Here I was thinking that if we add an entry and then remove a different entry, they could be in the same page. But I should have kept an enable=false in remove_all(). And now that I think about it again, I could just check if there are any active cache entries in the page on every evict/remove; they're sorted, so that should be an easy check. Unless there are any objections, I'll go forward with that. > > > TAILQ_REMOVE(&vq->iotlb_list, node, next); > > vhost_user_iotlb_pool_put(vq, node); > > } > > @@ -171,7 +170,6 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) > > > > RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { > > if (!entry_idx) { > > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > > Same here.
> > > TAILQ_REMOVE(&vq->iotlb_list, node, next); > > vhost_user_iotlb_pool_put(vq, node); > > vq->iotlb_cache_nr--; > > @@ -224,14 +222,16 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq > > vhost_user_iotlb_pool_put(vq, new_node); > > goto unlock; > > } else if (node->iova > new_node->iova) { > > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > > + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, > > + hua_to_alignment(dev->mem, (void *)(uintptr_t)node->uaddr)); > > TAILQ_INSERT_BEFORE(node, new_node, next); > > vq->iotlb_cache_nr++; > > goto unlock; > > } > > } > > > > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > > + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, > > + hua_to_alignment(dev->mem, (void *)(uintptr_t)new_node->uaddr)); > > TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); > > vq->iotlb_cache_nr++; > > > > @@ -259,7 +259,6 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, > > break; > > > > if (iova < node->iova + node->size) { > > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > > TAILQ_REMOVE(&vq->iotlb_list, node, next); > > vhost_user_iotlb_pool_put(vq, node); > > vq->iotlb_cache_nr--; > > diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h > > index a395843fe9..c5c97ea67e 100644 > > --- a/lib/vhost/rte_vhost.h > > +++ b/lib/vhost/rte_vhost.h > > @@ -136,6 +136,7 @@ struct rte_vhost_mem_region { > > void *mmap_addr; > > uint64_t mmap_size; > > int fd; > > + uint64_t alignment; > > This is not possible to do this as it breaks the ABI. > You have to store the information somewhere else, or simply call > get_blk_size() in hua_to_alignment() since the fd is not closed. > Sorry about that! You're right, checking the fd per operation should be easy enough. Thanks for the review, M > > }; > > > > /** > > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h > > index 5750f0c005..a2467ba509 100644 > > --- a/lib/vhost/vhost.h > > +++ b/lib/vhost/vhost.h > > @@ -1009,14 +1009,6 @@ mbuf_is_consumed(struct rte_mbuf *m) > > return true; > > } > > > > -static __rte_always_inline void > > -mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) > > -{ > > -#ifdef MADV_DONTDUMP > > - if (madvise(ptr, size, enable ? 
MADV_DODUMP : MADV_DONTDUMP) == -1) { > > - rte_log(RTE_LOG_INFO, vhost_config_log_level, > > - "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); > > - } > > -#endif > > -} > > +uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr); > > +void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment); > > #endif /* _VHOST_NET_CDEV_H_ */ > > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c > > index d702d082dd..6d09597fbe 100644 > > --- a/lib/vhost/vhost_user.c > > +++ b/lib/vhost/vhost_user.c > > @@ -737,6 +737,40 @@ log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) > > return log_gpa; > > } > > > > +uint64_t > > +hua_to_alignment(struct rte_vhost_memory *mem, void *ptr) > > +{ > > + struct rte_vhost_mem_region *r; > > + uint32_t i; > > + uintptr_t hua = (uintptr_t)ptr; > > + > > + for (i = 0; i < mem->nregions; i++) { > > + r = &mem->regions[i]; > > + if (hua >= r->host_user_addr && > > + hua < r->host_user_addr + r->size) { > > + return r->alignment; > > + } > > + } > > + > > + /* If region isn't found, don't align at all */ > > + return 1; > > +} > > + > > +void > > +mem_set_dump(void *ptr, size_t size, bool enable, uint64_t pagesz) > > +{ > > +#ifdef MADV_DONTDUMP > > + void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz); > > + uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz); > > + size_t len = end - (uintptr_t)start; > > + > > + if (madvise(start, len, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { > > + rte_log(RTE_LOG_INFO, vhost_config_log_level, > > + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); > > + } > > +#endif > > +} > > + > > static void > > translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > > { > > @@ -767,6 +801,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > > return; > > } > > > > + mem_set_dump(vq->desc_packed, len, true, > > + hua_to_alignment(dev->mem, vq->desc_packed)); > > numa_realloc(&dev, &vq); > > *pdev = dev; > > *pvq = vq; > > @@ -782,6 +818,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > > return; > > } > > > > + mem_set_dump(vq->driver_event, len, true, > > + hua_to_alignment(dev->mem, vq->driver_event)); > > len = sizeof(struct vring_packed_desc_event); > > vq->device_event = (struct vring_packed_desc_event *) > > (uintptr_t)ring_addr_to_vva(dev, > > @@ -793,9 +831,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > > return; > > } > > > > - mem_set_dump(vq->desc_packed, len, true); > > - mem_set_dump(vq->driver_event, len, true); > > - mem_set_dump(vq->device_event, len, true); > > + mem_set_dump(vq->device_event, len, true, > > + hua_to_alignment(dev->mem, vq->device_event)); > > vq->access_ok = true; > > return; > > } > > @@ -812,6 +849,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > > return; > > } > > > > + mem_set_dump(vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc)); > > numa_realloc(&dev, &vq); > > *pdev = dev; > > *pvq = vq; > > @@ -827,6 +865,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > > return; > > } > > > > + mem_set_dump(vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail)); > > len = sizeof(struct vring_used) + > > sizeof(struct vring_used_elem) * vq->size; > > if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) > > @@ -839,6 +878,8 @@ translate_ring_addresses(struct 
virtio_net **pdev, struct vhost_virtqueue **pvq) > > return; > > } > > > > + mem_set_dump(vq->used, len, true, hua_to_alignment(dev->mem, vq->used)); > > + > > if (vq->last_used_idx != vq->used->idx) { > > VHOST_LOG_CONFIG(dev->ifname, WARNING, > > "last_used_idx (%u) and vq->used->idx (%u) mismatches;\n", > > @@ -849,9 +890,6 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) > > "some packets maybe resent for Tx and dropped for Rx\n"); > > } > > > > - mem_set_dump(vq->desc, len, true); > > - mem_set_dump(vq->avail, len, true); > > - mem_set_dump(vq->used, len, true); > > vq->access_ok = true; > > > > VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc); > > @@ -1230,7 +1268,8 @@ vhost_user_mmap_region(struct virtio_net *dev, > > region->mmap_addr = mmap_addr; > > region->mmap_size = mmap_size; > > region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; > > - mem_set_dump(mmap_addr, mmap_size, false); > > + region->alignment = alignment; > > + mem_set_dump(mmap_addr, mmap_size, false, alignment); > > > > if (dev->async_copy) { > > if (add_guest_pages(dev, region, alignment) < 0) { > > @@ -1535,7 +1574,6 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *f > > return NULL; > > } > > > > - mem_set_dump(ptr, size, false); > > *fd = mfd; > > return ptr; > > } > > @@ -1566,6 +1604,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, > > uint64_t pervq_inflight_size, mmap_size; > > uint16_t num_queues, queue_size; > > struct virtio_net *dev = *pdev; > > + uint64_t alignment; > > int fd, i, j; > > int numa_node = SOCKET_ID_ANY; > > void *addr; > > @@ -1628,6 +1667,8 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, > > dev->inflight_info->fd = -1; > > } > > > > + alignment = get_blk_size(fd); > > + mem_set_dump(addr, mmap_size, false, alignment); > > dev->inflight_info->addr = addr; > > dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size; > > dev->inflight_info->fd = ctx->fds[0] = fd; > > @@ -1744,10 +1785,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, > > dev->inflight_info->fd = -1; > > } > > > > - mem_set_dump(addr, mmap_size, false); > > dev->inflight_info->fd = fd; > > dev->inflight_info->addr = addr; > > dev->inflight_info->size = mmap_size; > > + mem_set_dump(addr, mmap_size, false, get_blk_size(fd)); > > > > for (i = 0; i < num_queues; i++) { > > vq = dev->virtqueue[i]; > > @@ -2242,6 +2283,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > > struct virtio_net *dev = *pdev; > > int fd = ctx->fds[0]; > > uint64_t size, off; > > + uint64_t alignment; > > void *addr; > > uint32_t i; > > > > @@ -2280,6 +2322,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > > * fail when offset is not page size aligned. > > */ > > addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); > > + alignment = get_blk_size(fd); > > close(fd); > > if (addr == MAP_FAILED) { > > VHOST_LOG_CONFIG(dev->ifname, ERR, "mmap log base failed!\n"); > > @@ -2296,7 +2339,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > > dev->log_addr = (uint64_t)(uintptr_t)addr; > > dev->log_base = dev->log_addr + off; > > dev->log_size = size; > > - mem_set_dump(addr, size, false); > > + mem_set_dump(addr, size + off, false, alignment); > > > > for (i = 0; i < dev->nr_vring; i++) { > > struct vhost_virtqueue *vq = dev->virtqueue[i]; > ^ permalink raw reply [flat|nested] 19+ messages in thread
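Maxime's suggested alternative, deriving the alignment from the region's still-open fd instead of caching it in the ABI-frozen struct, comes down to a single fstat() call. A hedged sketch of what a get_blk_size()-style helper does (the real helper in vhost_user.c may differ in detail): for a hugetlbfs- or memfd-backed file, st_blksize reports the page size of the backing file, which is exactly the alignment madvise() needs.

#include <stdint.h>
#include <sys/stat.h>

/* Ask the backing file for its block size; for hugetlbfs this is the
 * hugepage size. Returns (uint64_t)-1 on failure, which callers must
 * treat as an error rather than as an alignment. */
static uint64_t
blk_size_sketch(int fd)
{
	struct stat st;

	if (fstat(fd, &st) == -1)
		return (uint64_t)-1;

	return (uint64_t)st.st_blksize;
}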
* Re: [PATCH v2] vhost: fix madvise arguments alignment 2023-02-23 16:57 ` Mike Pattrick @ 2023-02-24 15:05 ` Patrick Robb 0 siblings, 0 replies; 19+ messages in thread From: Patrick Robb @ 2023-02-24 15:05 UTC (permalink / raw) To: Mike Pattrick; +Cc: Maxime Coquelin, dev, david.marchand, chenbo.xia [-- Attachment #1: Type: text/plain, Size: 16088 bytes --] UNH CI detected an ABI failure for this patch which was not reported due to a bug on our end, so I'm manually reporting it now. I see you already predicted the issue though, Maxime!

07:58:32  1 function with some indirect sub-type change:
07:58:32
07:58:32    [C] 'function int rte_vhost_get_mem_table(int, rte_vhost_memory**)' at vhost.c:922:1 has some indirect sub-type changes:
07:58:32      parameter 2 of type 'rte_vhost_memory**' has sub-type changes:
07:58:32        in pointed to type 'rte_vhost_memory*':
07:58:32          in pointed to type 'struct rte_vhost_memory' at rte_vhost.h:145:1:
07:58:32            type size hasn't changed
07:58:32            1 data member change:
07:58:32              type of 'rte_vhost_mem_region regions[]' changed:
07:58:32                array element type 'struct rte_vhost_mem_region' changed:
07:58:32                  type size changed from 448 to 512 (in bits)
07:58:32                  1 data member insertion:
07:58:32                    'uint64_t alignment', at offset 448 (in bits) at rte_vhost.h:139:1
07:58:32                type size hasn't changed
07:58:32
07:58:32  Error: ABI issue reported for abidiff --suppr dpdk/devtools/libabigail.abignore --no-added-syms --headers-dir1 reference/include --headers-dir2 build_install/include reference/dump/librte_vhost.dump build_install/dump/librte_vhost.dump
07:58:32  ABIDIFF_ABI_CHANGE, this change requires a review (abidiff flagged this as a potential issue).

On Thu, Feb 23, 2023 at 11:57 AM Mike Pattrick <mkp@redhat.com> wrote: > On Thu, Feb 23, 2023 at 11:12 AM Maxime Coquelin > <maxime.coquelin@redhat.com> wrote: > > > > Hi Mike, > > > > Thanks for looking into this issue. > > > > On 2/23/23 05:35, Mike Pattrick wrote: > > > The arguments passed to madvise should be aligned to the alignment of > > > the backing memory. Now we keep track of each region's alignment and use > > > it when setting coredump preferences. To facilitate this, a new > member > > > was added to rte_vhost_mem_region. A new function was added to easily > > > translate a memory address back to its region's alignment. Unneeded calls to > > > madvise were reduced, as the cache removal case should already be > > > covered by the cache insertion case. The previously inline function > > > mem_set_dump was removed from a header file and made non-inline.
> > > > > > Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps") > > > > > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > > > --- > > > Since v1: > > > - Corrected a cast for 32bit compiles > > > --- > > > lib/vhost/iotlb.c | 9 +++--- > > > lib/vhost/rte_vhost.h | 1 + > > > lib/vhost/vhost.h | 12 ++------ > > > lib/vhost/vhost_user.c | 63 > +++++++++++++++++++++++++++++++++++------- > > > 4 files changed, 60 insertions(+), 25 deletions(-) > > > > > > diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c > > > index a0b8fd7302..5293507b63 100644 > > > --- a/lib/vhost/iotlb.c > > > +++ b/lib/vhost/iotlb.c > > > @@ -149,7 +149,6 @@ vhost_user_iotlb_cache_remove_all(struct > vhost_virtqueue *vq) > > > rte_rwlock_write_lock(&vq->iotlb_lock); > > > > > > RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { > > > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, > true); > > > > Hmm, it should have been called with enable=false here since we are > > removing the entry from the IOTLB cache. It should be kept in order to > > "DONTDUMP" pages evicted from the cache. > > Here I was thinking that if we add an entry and then remove a > different entry, they could be in the same page. But on I should have > kept an enable=false in remove_all(). > > And now that I think about it again, I could just check if there are > any active cache entries in the page on every evict/remove, they're > sorted so that should be an easy check. Unless there are any > objections I'll go forward with that. > > > > > > TAILQ_REMOVE(&vq->iotlb_list, node, next); > > > vhost_user_iotlb_pool_put(vq, node); > > > } > > > @@ -171,7 +170,6 @@ vhost_user_iotlb_cache_random_evict(struct > vhost_virtqueue *vq) > > > > > > RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { > > > if (!entry_idx) { > > > - mem_set_dump((void *)(uintptr_t)node->uaddr, > node->size, true); > > > > Same here. 
> > > > > TAILQ_REMOVE(&vq->iotlb_list, node, next); > > > vhost_user_iotlb_pool_put(vq, node); > > > vq->iotlb_cache_nr--; > > > @@ -224,14 +222,16 @@ vhost_user_iotlb_cache_insert(struct virtio_net > *dev, struct vhost_virtqueue *vq > > > vhost_user_iotlb_pool_put(vq, new_node); > > > goto unlock; > > > } else if (node->iova > new_node->iova) { > > > - mem_set_dump((void *)(uintptr_t)node->uaddr, > node->size, true); > > > + mem_set_dump((void *)(uintptr_t)new_node->uaddr, > new_node->size, true, > > > + hua_to_alignment(dev->mem, (void > *)(uintptr_t)node->uaddr)); > > > TAILQ_INSERT_BEFORE(node, new_node, next); > > > vq->iotlb_cache_nr++; > > > goto unlock; > > > } > > > } > > > > > > - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); > > > + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, > true, > > > + hua_to_alignment(dev->mem, (void > *)(uintptr_t)new_node->uaddr)); > > > TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); > > > vq->iotlb_cache_nr++; > > > > > > @@ -259,7 +259,6 @@ vhost_user_iotlb_cache_remove(struct > vhost_virtqueue *vq, > > > break; > > > > > > if (iova < node->iova + node->size) { > > > - mem_set_dump((void *)(uintptr_t)node->uaddr, > node->size, true); > > > TAILQ_REMOVE(&vq->iotlb_list, node, next); > > > vhost_user_iotlb_pool_put(vq, node); > > > vq->iotlb_cache_nr--; > > > diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h > > > index a395843fe9..c5c97ea67e 100644 > > > --- a/lib/vhost/rte_vhost.h > > > +++ b/lib/vhost/rte_vhost.h > > > @@ -136,6 +136,7 @@ struct rte_vhost_mem_region { > > > void *mmap_addr; > > > uint64_t mmap_size; > > > int fd; > > > + uint64_t alignment; > > > > This is not possible to do this as it breaks the ABI. > > You have to store the information somewhere else, or simply call > > get_blk_size() in hua_to_alignment() since the fd is not closed. > > > > Sorry about that! You're right, checking the fd per operation should > be easy enough. > > Thanks for the review, > > M > > > > }; > > > > > > /** > > > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h > > > index 5750f0c005..a2467ba509 100644 > > > --- a/lib/vhost/vhost.h > > > +++ b/lib/vhost/vhost.h > > > @@ -1009,14 +1009,6 @@ mbuf_is_consumed(struct rte_mbuf *m) > > > return true; > > > } > > > > > > -static __rte_always_inline void > > > -mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, > __rte_unused bool enable) > > > -{ > > > -#ifdef MADV_DONTDUMP > > > - if (madvise(ptr, size, enable ? 
MADV_DODUMP : MADV_DONTDUMP) == > -1) { > > > - rte_log(RTE_LOG_INFO, vhost_config_log_level, > > > - "VHOST_CONFIG: could not set coredump preference > (%s).\n", strerror(errno)); > > > - } > > > -#endif > > > -} > > > +uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr); > > > +void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t > alignment); > > > #endif /* _VHOST_NET_CDEV_H_ */ > > > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c > > > index d702d082dd..6d09597fbe 100644 > > > --- a/lib/vhost/vhost_user.c > > > +++ b/lib/vhost/vhost_user.c > > > @@ -737,6 +737,40 @@ log_addr_to_gpa(struct virtio_net *dev, struct > vhost_virtqueue *vq) > > > return log_gpa; > > > } > > > > > > +uint64_t > > > +hua_to_alignment(struct rte_vhost_memory *mem, void *ptr) > > > +{ > > > + struct rte_vhost_mem_region *r; > > > + uint32_t i; > > > + uintptr_t hua = (uintptr_t)ptr; > > > + > > > + for (i = 0; i < mem->nregions; i++) { > > > + r = &mem->regions[i]; > > > + if (hua >= r->host_user_addr && > > > + hua < r->host_user_addr + r->size) { > > > + return r->alignment; > > > + } > > > + } > > > + > > > + /* If region isn't found, don't align at all */ > > > + return 1; > > > +} > > > + > > > +void > > > +mem_set_dump(void *ptr, size_t size, bool enable, uint64_t pagesz) > > > +{ > > > +#ifdef MADV_DONTDUMP > > > + void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz); > > > + uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz); > > > + size_t len = end - (uintptr_t)start; > > > + > > > + if (madvise(start, len, enable ? MADV_DODUMP : MADV_DONTDUMP) == > -1) { > > > + rte_log(RTE_LOG_INFO, vhost_config_log_level, > > > + "VHOST_CONFIG: could not set coredump preference > (%s).\n", strerror(errno)); > > > + } > > > +#endif > > > +} > > > + > > > static void > > > translate_ring_addresses(struct virtio_net **pdev, struct > vhost_virtqueue **pvq) > > > { > > > @@ -767,6 +801,8 @@ translate_ring_addresses(struct virtio_net **pdev, > struct vhost_virtqueue **pvq) > > > return; > > > } > > > > > > + mem_set_dump(vq->desc_packed, len, true, > > > + hua_to_alignment(dev->mem, vq->desc_packed)); > > > numa_realloc(&dev, &vq); > > > *pdev = dev; > > > *pvq = vq; > > > @@ -782,6 +818,8 @@ translate_ring_addresses(struct virtio_net **pdev, > struct vhost_virtqueue **pvq) > > > return; > > > } > > > > > > + mem_set_dump(vq->driver_event, len, true, > > > + hua_to_alignment(dev->mem, vq->driver_event)); > > > len = sizeof(struct vring_packed_desc_event); > > > vq->device_event = (struct vring_packed_desc_event *) > > > (uintptr_t)ring_addr_to_vva(dev, > > > @@ -793,9 +831,8 @@ translate_ring_addresses(struct virtio_net **pdev, > struct vhost_virtqueue **pvq) > > > return; > > > } > > > > > > - mem_set_dump(vq->desc_packed, len, true); > > > - mem_set_dump(vq->driver_event, len, true); > > > - mem_set_dump(vq->device_event, len, true); > > > + mem_set_dump(vq->device_event, len, true, > > > + hua_to_alignment(dev->mem, vq->device_event)); > > > vq->access_ok = true; > > > return; > > > } > > > @@ -812,6 +849,7 @@ translate_ring_addresses(struct virtio_net **pdev, > struct vhost_virtqueue **pvq) > > > return; > > > } > > > > > > + mem_set_dump(vq->desc, len, true, hua_to_alignment(dev->mem, > vq->desc)); > > > numa_realloc(&dev, &vq); > > > *pdev = dev; > > > *pvq = vq; > > > @@ -827,6 +865,7 @@ translate_ring_addresses(struct virtio_net **pdev, > struct vhost_virtqueue **pvq) > > > return; > > > } > > > > > > + mem_set_dump(vq->avail, len, true, hua_to_alignment(dev->mem, > 
vq->avail)); > > > len = sizeof(struct vring_used) + > > > sizeof(struct vring_used_elem) * vq->size; > > > if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) > > > @@ -839,6 +878,8 @@ translate_ring_addresses(struct virtio_net **pdev, > struct vhost_virtqueue **pvq) > > > return; > > > } > > > > > > + mem_set_dump(vq->used, len, true, hua_to_alignment(dev->mem, > vq->used)); > > > + > > > if (vq->last_used_idx != vq->used->idx) { > > > VHOST_LOG_CONFIG(dev->ifname, WARNING, > > > "last_used_idx (%u) and vq->used->idx (%u) > mismatches;\n", > > > @@ -849,9 +890,6 @@ translate_ring_addresses(struct virtio_net **pdev, > struct vhost_virtqueue **pvq) > > > "some packets maybe resent for Tx and dropped > for Rx\n"); > > > } > > > > > > - mem_set_dump(vq->desc, len, true); > > > - mem_set_dump(vq->avail, len, true); > > > - mem_set_dump(vq->used, len, true); > > > vq->access_ok = true; > > > > > > VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: > %p\n", vq->desc); > > > @@ -1230,7 +1268,8 @@ vhost_user_mmap_region(struct virtio_net *dev, > > > region->mmap_addr = mmap_addr; > > > region->mmap_size = mmap_size; > > > region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + > mmap_offset; > > > - mem_set_dump(mmap_addr, mmap_size, false); > > > + region->alignment = alignment; > > > + mem_set_dump(mmap_addr, mmap_size, false, alignment); > > > > > > if (dev->async_copy) { > > > if (add_guest_pages(dev, region, alignment) < 0) { > > > @@ -1535,7 +1574,6 @@ inflight_mem_alloc(struct virtio_net *dev, const > char *name, size_t size, int *f > > > return NULL; > > > } > > > > > > - mem_set_dump(ptr, size, false); > > > *fd = mfd; > > > return ptr; > > > } > > > @@ -1566,6 +1604,7 @@ vhost_user_get_inflight_fd(struct virtio_net > **pdev, > > > uint64_t pervq_inflight_size, mmap_size; > > > uint16_t num_queues, queue_size; > > > struct virtio_net *dev = *pdev; > > > + uint64_t alignment; > > > int fd, i, j; > > > int numa_node = SOCKET_ID_ANY; > > > void *addr; > > > @@ -1628,6 +1667,8 @@ vhost_user_get_inflight_fd(struct virtio_net > **pdev, > > > dev->inflight_info->fd = -1; > > > } > > > > > > + alignment = get_blk_size(fd); > > > + mem_set_dump(addr, mmap_size, false, alignment); > > > dev->inflight_info->addr = addr; > > > dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = > mmap_size; > > > dev->inflight_info->fd = ctx->fds[0] = fd; > > > @@ -1744,10 +1785,10 @@ vhost_user_set_inflight_fd(struct virtio_net > **pdev, > > > dev->inflight_info->fd = -1; > > > } > > > > > > - mem_set_dump(addr, mmap_size, false); > > > dev->inflight_info->fd = fd; > > > dev->inflight_info->addr = addr; > > > dev->inflight_info->size = mmap_size; > > > + mem_set_dump(addr, mmap_size, false, get_blk_size(fd)); > > > > > > for (i = 0; i < num_queues; i++) { > > > vq = dev->virtqueue[i]; > > > @@ -2242,6 +2283,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > > > struct virtio_net *dev = *pdev; > > > int fd = ctx->fds[0]; > > > uint64_t size, off; > > > + uint64_t alignment; > > > void *addr; > > > uint32_t i; > > > > > > @@ -2280,6 +2322,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > > > * fail when offset is not page size aligned. 
> > > */ > > > addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, > fd, 0); > > > + alignment = get_blk_size(fd); > > > close(fd); > > > if (addr == MAP_FAILED) { > > > VHOST_LOG_CONFIG(dev->ifname, ERR, "mmap log base > failed!\n"); > > > @@ -2296,7 +2339,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, > > > dev->log_addr = (uint64_t)(uintptr_t)addr; > > > dev->log_base = dev->log_addr + off; > > > dev->log_size = size; > > > - mem_set_dump(addr, size, false); > > > + mem_set_dump(addr, size + off, false, alignment); > > > > > > for (i = 0; i < dev->nr_vring; i++) { > > > struct vhost_virtqueue *vq = dev->virtqueue[i]; > > > > -- Patrick Robb Technical Service Manager UNH InterOperability Laboratory 21 Madbury Rd, Suite 100, Durham, NH 03824 www.iol.unh.edu [-- Attachment #2: Type: text/html, Size: 24631 bytes --] ^ permalink raw reply [flat|nested] 19+ messages in thread
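The abidiff numbers in the CI report are easy to reproduce by hand. The sketch below mimics the two layouts (field names follow rte_vhost_mem_region; the byte sizes assume a typical LP64 ABI) and shows why appending a member to a struct exposed as an array element breaks existing binaries:

#include <stdint.h>
#include <stdio.h>

/* Old layout: four uint64_t fields, a pointer, a uint64_t and an int,
 * padded to 56 bytes (abidiff's 448 bits) on LP64. */
struct region_old {
	uint64_t guest_phys_addr;
	uint64_t guest_user_addr;
	uint64_t host_user_addr;
	uint64_t size;
	void *mmap_addr;
	uint64_t mmap_size;
	int fd;
};

/* Patched layout: the appended field grows the element to 64 bytes
 * (512 bits); abidiff reported the insertion at bit offset 448. */
struct region_new {
	uint64_t guest_phys_addr;
	uint64_t guest_user_addr;
	uint64_t host_user_addr;
	uint64_t size;
	void *mmap_addr;
	uint64_t mmap_size;
	int fd;
	uint64_t alignment;
};

int
main(void)
{
	/* rte_vhost_memory ends in a flexible array of these regions, so a
	 * binary built against the old 56-byte element walks regions[] with
	 * the wrong stride when linked against the patched library. */
	printf("old: %zu bytes, patched: %zu bytes\n",
			sizeof(struct region_old), sizeof(struct region_new));
	return 0;
}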
* [PATCH v3] vhost: fix madvise arguments alignment 2023-02-23 4:35 ` [PATCH v2] " Mike Pattrick 2023-02-23 16:12 ` Maxime Coquelin @ 2023-03-01 20:02 ` Mike Pattrick 2023-03-02 10:19 ` Maxime Coquelin 2023-03-06 14:22 ` Maxime Coquelin 1 sibling, 2 replies; 19+ messages in thread From: Mike Pattrick @ 2023-03-01 20:02 UTC (permalink / raw) To: dev; +Cc: david.marchand, maxime.coquelin, chenbo.xia, Mike Pattrick The arguments passed to madvise should be aligned to the alignment of the backing memory. Now we keep track of each region's alignment and use it when setting coredump preferences. To facilitate this, a new member was added to rte_vhost_mem_region. A new function was added to easily translate a memory address back to its region's alignment. Unneeded calls to madvise were reduced, as the cache removal case should already be covered by the cache insertion case. The previously inline function mem_set_dump was removed from a header file and made non-inline. Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps") Signed-off-by: Mike Pattrick <mkp@redhat.com> --- Since v1: - Corrected a cast for 32bit compiles Since v2: - Removed changes from rte_vhost.h - Restored set_mem_dump in ioctl remove/evict functions --- lib/vhost/iotlb.c | 68 +++++++++++++++++++++++++++++++----------- lib/vhost/iotlb.h | 5 ++-- lib/vhost/vhost.h | 12 ++------ lib/vhost/vhost_user.c | 68 ++++++++++++++++++++++++++++++++++-------- 4 files changed, 110 insertions(+), 43 deletions(-) diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c index a0b8fd7302..11785392ac 100644 --- a/lib/vhost/iotlb.c +++ b/lib/vhost/iotlb.c @@ -46,7 +46,7 @@ vhost_user_iotlb_pool_put(struct vhost_virtqueue *vq, } static void -vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq); +vhost_user_iotlb_cache_random_evict(struct virtio_net *dev, struct vhost_virtqueue *vq); static void vhost_user_iotlb_pending_remove_all(struct vhost_virtqueue *vq) @@ -98,7 +98,7 @@ vhost_user_iotlb_pending_insert(struct virtio_net *dev, struct vhost_virtqueue * if (!TAILQ_EMPTY(&vq->iotlb_pending_list)) vhost_user_iotlb_pending_remove_all(vq); else - vhost_user_iotlb_cache_random_evict(vq); + vhost_user_iotlb_cache_random_evict(dev, vq); node = vhost_user_iotlb_pool_get(vq); if (node == NULL) { VHOST_LOG_CONFIG(dev->ifname, ERR, @@ -142,14 +142,15 @@ vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq, } static void -vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) +vhost_user_iotlb_cache_remove_all(struct virtio_net *dev, struct vhost_virtqueue *vq) { struct vhost_iotlb_entry *node, *temp_node; rte_rwlock_write_lock(&vq->iotlb_lock); RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, false, + hua_to_alignment(dev->mem, (void *)(uintptr_t)node->uaddr)); TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); } @@ -160,9 +161,10 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) } static void -vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) +vhost_user_iotlb_cache_random_evict(struct virtio_net *dev, struct vhost_virtqueue *vq) { - struct vhost_iotlb_entry *node, *temp_node; + struct vhost_iotlb_entry *node, *temp_node, *prev_node = NULL; + uint64_t alignment, mask; int entry_idx; rte_rwlock_write_lock(&vq->iotlb_lock); @@ -171,12 +173,26 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) RTE_TAILQ_FOREACH_SAFE(node,
&vq->iotlb_list, next, temp_node) { if (!entry_idx) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + struct vhost_iotlb_entry *next_node; + alignment = hua_to_alignment(dev->mem, (void *)(uintptr_t)node->uaddr); + mask = ~(alignment - 1); + + /* Don't disable coredump if the previous node is in the same page */ + if (prev_node == NULL || + (node->uaddr & mask) != (prev_node->uaddr & mask)) { + next_node = RTE_TAILQ_NEXT(node, next); + /* Don't disable coredump if the next node is in the same page */ + if (next_node == NULL || + (node->uaddr & mask) != (next_node->uaddr & mask)) + mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, + false, alignment); + } TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; break; } + prev_node = node; entry_idx--; } @@ -196,7 +212,7 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq "IOTLB pool vq %"PRIu32" empty, clear entries for cache insertion\n", vq->index); if (!TAILQ_EMPTY(&vq->iotlb_list)) - vhost_user_iotlb_cache_random_evict(vq); + vhost_user_iotlb_cache_random_evict(dev, vq); else vhost_user_iotlb_pending_remove_all(vq); new_node = vhost_user_iotlb_pool_get(vq); @@ -224,14 +240,16 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq vhost_user_iotlb_pool_put(vq, new_node); goto unlock; } else if (node->iova > new_node->iova) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, + hua_to_alignment(dev->mem, (void *)(uintptr_t)new_node->uaddr)); TAILQ_INSERT_BEFORE(node, new_node, next); vq->iotlb_cache_nr++; goto unlock; } } - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true, + hua_to_alignment(dev->mem, (void *)(uintptr_t)new_node->uaddr)); TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); vq->iotlb_cache_nr++; @@ -243,10 +261,11 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq } void -vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, +vhost_user_iotlb_cache_remove(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t iova, uint64_t size) { - struct vhost_iotlb_entry *node, *temp_node; + struct vhost_iotlb_entry *node, *temp_node, *prev_node = NULL; + uint64_t alignment, mask; if (unlikely(!size)) return; @@ -259,11 +278,26 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, break; if (iova < node->iova + node->size) { - mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true); + struct vhost_iotlb_entry *next_node; + alignment = hua_to_alignment(dev->mem, (void *)(uintptr_t)node->uaddr); + mask = ~(alignment-1); + + /* Don't disable coredump if the previous node is in the same page */ + if (prev_node == NULL || + (node->uaddr & mask) != (prev_node->uaddr & mask)) { + next_node = RTE_TAILQ_NEXT(node, next); + /* Don't disable coredump if the next node is in the same page */ + if (next_node == NULL || + (node->uaddr & mask) != (next_node->uaddr & mask)) + mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, + false, alignment); + } + TAILQ_REMOVE(&vq->iotlb_list, node, next); vhost_user_iotlb_pool_put(vq, node); vq->iotlb_cache_nr--; - } + } else + prev_node = node; } rte_rwlock_write_unlock(&vq->iotlb_lock); @@ -312,9 +346,9 @@ vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova, } void -vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq) 
+vhost_user_iotlb_flush_all(struct virtio_net *dev, struct vhost_virtqueue *vq) { - vhost_user_iotlb_cache_remove_all(vq); + vhost_user_iotlb_cache_remove_all(dev, vq); vhost_user_iotlb_pending_remove_all(vq); } @@ -329,7 +363,7 @@ vhost_user_iotlb_init(struct virtio_net *dev, struct vhost_virtqueue *vq) * The cache has already been initialized, * just drop all cached and pending entries. */ - vhost_user_iotlb_flush_all(vq); + vhost_user_iotlb_flush_all(dev, vq); rte_free(vq->iotlb_pool); } diff --git a/lib/vhost/iotlb.h b/lib/vhost/iotlb.h index be079a8aa7..73b5465b41 100644 --- a/lib/vhost/iotlb.h +++ b/lib/vhost/iotlb.h @@ -40,7 +40,7 @@ vhost_user_iotlb_wr_unlock(struct vhost_virtqueue *vq) void vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t iova, uint64_t uaddr, uint64_t size, uint8_t perm); -void vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, +void vhost_user_iotlb_cache_remove(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t iova, uint64_t size); uint64_t vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova, uint64_t *size, uint8_t perm); @@ -50,8 +50,7 @@ void vhost_user_iotlb_pending_insert(struct virtio_net *dev, struct vhost_virtqu uint64_t iova, uint8_t perm); void vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq, uint64_t iova, uint64_t size, uint8_t perm); -void vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq); +void vhost_user_iotlb_flush_all(struct virtio_net *dev, struct vhost_virtqueue *vq); int vhost_user_iotlb_init(struct virtio_net *dev, struct vhost_virtqueue *vq); void vhost_user_iotlb_destroy(struct vhost_virtqueue *vq); - #endif /* _VHOST_IOTLB_H_ */ diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index 5750f0c005..a2467ba509 100644 --- a/lib/vhost/vhost.h +++ b/lib/vhost/vhost.h @@ -1009,14 +1009,6 @@ mbuf_is_consumed(struct rte_mbuf *m) return true; } -static __rte_always_inline void -mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) -{ -#ifdef MADV_DONTDUMP - if (madvise(ptr, size, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { - rte_log(RTE_LOG_INFO, vhost_config_log_level, - "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); - } -#endif -} +uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr); +void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment); #endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c index d702d082dd..909d19351e 100644 --- a/lib/vhost/vhost_user.c +++ b/lib/vhost/vhost_user.c @@ -737,6 +737,40 @@ log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) return log_gpa; } +uint64_t +hua_to_alignment(struct rte_vhost_memory *mem, void *ptr) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + uintptr_t hua = (uintptr_t)ptr; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (hua >= r->host_user_addr && + hua < r->host_user_addr + r->size) { + return get_blk_size(r->fd); + } + } + + /* If region isn't found, don't align at all */ + return 1; +} + +void +mem_set_dump(void *ptr, size_t size, bool enable, uint64_t pagesz) +{ +#ifdef MADV_DONTDUMP + void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz); + uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz); + size_t len = end - (uintptr_t)start; + + if (madvise(start, len, enable ? 
MADV_DODUMP : MADV_DONTDUMP) == -1) { + rte_log(RTE_LOG_INFO, vhost_config_log_level, + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); + } +#endif +} + static void translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) { @@ -767,6 +801,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->desc_packed, len, true, + hua_to_alignment(dev->mem, vq->desc_packed)); numa_realloc(&dev, &vq); *pdev = dev; *pvq = vq; @@ -782,6 +818,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->driver_event, len, true, + hua_to_alignment(dev->mem, vq->driver_event)); len = sizeof(struct vring_packed_desc_event); vq->device_event = (struct vring_packed_desc_event *) (uintptr_t)ring_addr_to_vva(dev, @@ -793,9 +831,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } - mem_set_dump(vq->desc_packed, len, true); - mem_set_dump(vq->driver_event, len, true); - mem_set_dump(vq->device_event, len, true); + mem_set_dump(vq->device_event, len, true, + hua_to_alignment(dev->mem, vq->device_event)); vq->access_ok = true; return; } @@ -812,6 +849,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc)); numa_realloc(&dev, &vq); *pdev = dev; *pvq = vq; @@ -827,6 +865,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail)); len = sizeof(struct vring_used) + sizeof(struct vring_used_elem) * vq->size; if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) @@ -839,6 +878,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) return; } + mem_set_dump(vq->used, len, true, hua_to_alignment(dev->mem, vq->used)); + if (vq->last_used_idx != vq->used->idx) { VHOST_LOG_CONFIG(dev->ifname, WARNING, "last_used_idx (%u) and vq->used->idx (%u) mismatches;\n", @@ -849,9 +890,6 @@ translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq) "some packets maybe resent for Tx and dropped for Rx\n"); } - mem_set_dump(vq->desc, len, true); - mem_set_dump(vq->avail, len, true); - mem_set_dump(vq->used, len, true); vq->access_ok = true; VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc); @@ -1230,7 +1268,7 @@ vhost_user_mmap_region(struct virtio_net *dev, region->mmap_addr = mmap_addr; region->mmap_size = mmap_size; region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; - mem_set_dump(mmap_addr, mmap_size, false); + mem_set_dump(mmap_addr, mmap_size, false, alignment); if (dev->async_copy) { if (add_guest_pages(dev, region, alignment) < 0) { @@ -1325,7 +1363,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, /* Flush IOTLB cache as previous HVAs are now invalid */ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) for (i = 0; i < dev->nr_vring; i++) - vhost_user_iotlb_flush_all(dev->virtqueue[i]); + vhost_user_iotlb_flush_all(dev, dev->virtqueue[i]); /* * If VQ 0 has already been allocated, try to allocate on the same @@ -1504,6 +1542,7 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *f { void *ptr; int mfd = -1; + uint64_t alignment; char fname[20] = "/tmp/memfd-XXXXXX"; *fd = -1; @@ -1535,7 +1574,8 @@ inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t 
size, int *f return NULL; } - mem_set_dump(ptr, size, false); + alignment = get_blk_size(mfd); + mem_set_dump(ptr, size, false, alignment); *fd = mfd; return ptr; } @@ -1744,7 +1784,7 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, dev->inflight_info->fd = -1; } - mem_set_dump(addr, mmap_size, false); + mem_set_dump(addr, mmap_size, false, get_blk_size(fd)); dev->inflight_info->fd = fd; dev->inflight_info->addr = addr; dev->inflight_info->size = mmap_size; @@ -2151,7 +2191,7 @@ vhost_user_get_vring_base(struct virtio_net **pdev, ctx->msg.size = sizeof(ctx->msg.payload.state); ctx->fd_num = 0; - vhost_user_iotlb_flush_all(vq); + vhost_user_iotlb_flush_all(dev, vq); vring_invalidate(dev, vq); @@ -2242,6 +2282,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct virtio_net *dev = *pdev; int fd = ctx->fds[0]; uint64_t size, off; + uint64_t alignment; void *addr; uint32_t i; @@ -2280,6 +2321,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, * fail when offset is not page size aligned. */ addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + alignment = get_blk_size(fd); close(fd); if (addr == MAP_FAILED) { VHOST_LOG_CONFIG(dev->ifname, ERR, "mmap log base failed!\n"); @@ -2296,7 +2338,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, dev->log_addr = (uint64_t)(uintptr_t)addr; dev->log_base = dev->log_addr + off; dev->log_size = size; - mem_set_dump(addr, size, false); + mem_set_dump(addr, size + off, false, alignment); for (i = 0; i < dev->nr_vring; i++) { struct vhost_virtqueue *vq = dev->virtqueue[i]; @@ -2618,7 +2660,7 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, if (!vq) continue; - vhost_user_iotlb_cache_remove(vq, imsg->iova, + vhost_user_iotlb_cache_remove(dev, vq, imsg->iova, imsg->size); if (is_vring_iotlb(dev, vq, imsg)) { -- 2.31.1 ^ permalink raw reply [flat|nested] 19+ messages in thread
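The subtle part of v3 is the neighbour test in the evict and remove paths: a page is only marked DONTDUMP again when neither the previous nor the next entry in the sorted list shares it, so pages still referenced by another cache entry keep their DODUMP setting. The test itself is plain mask arithmetic; a minimal sketch, assuming as the patch does that the page size is a power of two:

#include <stdbool.h>
#include <stdint.h>

/* Two user addresses fall in the same page iff they agree once the
 * offset bits below the page size are masked off. */
static bool
same_page(uint64_t uaddr_a, uint64_t uaddr_b, uint64_t pagesz)
{
	uint64_t mask = ~(pagesz - 1);

	return (uaddr_a & mask) == (uaddr_b & mask);
}

With 2 MB hugepages, for example, same_page(0x200000, 0x3fffff, 1ULL << 21) is true while same_page(0x200000, 0x400000, 1ULL << 21) is false.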
* Re: [PATCH v3] vhost: fix madvise arguments alignment 2023-03-01 20:02 ` [PATCH v3] " Mike Pattrick @ 2023-03-02 10:19 ` Maxime Coquelin 2023-03-06 14:22 ` Maxime Coquelin 0 siblings, 0 replies; 19+ messages in thread From: Maxime Coquelin @ 2023-03-02 10:19 UTC (permalink / raw) To: Mike Pattrick, dev; +Cc: david.marchand, chenbo.xia Hi Mike, On 3/1/23 21:02, Mike Pattrick wrote: > The arguments passed to madvise should be aligned to the alignment of > the backing memory. Now we keep track of each region's alignment and use > it when setting coredump preferences. To facilitate this, a new member > was added to rte_vhost_mem_region. A new function was added to easily The above sentence is no longer valid; I can rework it while applying. > translate a memory address back to its region's alignment. Unneeded calls to > madvise were reduced, as the cache removal case should already be > covered by the cache insertion case. The previously inline function > mem_set_dump was removed from a header file and made non-inline. > > Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps") > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > --- > Since v1: > - Corrected a cast for 32bit compiles > Since v2: > - Removed changes from rte_vhost.h > - Restored set_mem_dump in ioctl remove/evict functions > --- > lib/vhost/iotlb.c | 68 +++++++++++++++++++++++++++++++----------- > lib/vhost/iotlb.h | 5 ++-- > lib/vhost/vhost.h | 12 ++------ > lib/vhost/vhost_user.c | 68 ++++++++++++++++++++++++++++++++++-------- > 4 files changed, 110 insertions(+), 43 deletions(-) > Tested-by: Maxime Coquelin <maxime.coquelin@redhat.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Thanks, Maxime ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3] vhost: fix madvise arguments alignment 2023-03-01 20:02 ` [PATCH v3] " Mike Pattrick 2023-03-02 10:19 ` Maxime Coquelin @ 2023-03-06 14:22 ` Maxime Coquelin 1 sibling, 0 replies; 19+ messages in thread From: Maxime Coquelin @ 2023-03-06 14:22 UTC (permalink / raw) To: Mike Pattrick, dev; +Cc: david.marchand, chenbo.xia On 3/1/23 21:02, Mike Pattrick wrote: > The arguments passed to madvise should be aligned to the alignment of > the backing memory. Now we keep track of each region's alignment and use > it when setting coredump preferences. To facilitate this, a new member > was added to rte_vhost_mem_region. A new function was added to easily > translate a memory address back to its region's alignment. Unneeded calls to > madvise were reduced, as the cache removal case should already be > covered by the cache insertion case. The previously inline function > mem_set_dump was removed from a header file and made non-inline. > > Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps") > > Signed-off-by: Mike Pattrick <mkp@redhat.com> > --- > Since v1: > - Corrected a cast for 32bit compiles > Since v2: > - Removed changes from rte_vhost.h > - Restored set_mem_dump in ioctl remove/evict functions > --- > lib/vhost/iotlb.c | 68 +++++++++++++++++++++++++++++++----------- > lib/vhost/iotlb.h | 5 ++-- > lib/vhost/vhost.h | 12 ++------ > lib/vhost/vhost_user.c | 68 ++++++++++++++++++++++++++++++++++-------- > 4 files changed, 110 insertions(+), 43 deletions(-) > Applied to dpdk-next-virtio/main. Thanks, Maxime ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2] vhost: exclude VM hugepages from coredumps 2022-12-06 15:05 [PATCH v2] vhost: exclude VM hugepages from coredumps Mike Pattrick 2022-12-07 16:54 ` [PATCH v3] " Mike Pattrick @ 2023-02-03 17:45 ` Stephen Hemminger 2023-02-06 1:13 ` Mike Pattrick 1 sibling, 1 reply; 19+ messages in thread From: Stephen Hemminger @ 2023-02-03 17:45 UTC (permalink / raw) To: Mike Pattrick; +Cc: Maxime Coquelin, Chenbo Xia, dev On Tue, 6 Dec 2022 10:05:09 -0500 Mike Pattrick <mkp@redhat.com> wrote: > + > +static __rte_always_inline void > +mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) > +{ > +#ifdef MADV_DONTDUMP > + if (madvise(ptr, size, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { > + rte_log(RTE_LOG_INFO, vhost_config_log_level, > + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); > + } > +#endif > +} Why is this inlined in vhost.h? Hardly a critical path function. ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2] vhost: exclude VM hugepages from coredumps 2023-02-03 17:45 ` [PATCH v2] vhost: exclude VM hugepages from coredumps Stephen Hemminger @ 2023-02-06 1:13 ` Mike Pattrick 0 siblings, 0 replies; 19+ messages in thread From: Mike Pattrick @ 2023-02-06 1:13 UTC (permalink / raw) To: Stephen Hemminger; +Cc: Maxime Coquelin, Chenbo Xia, dev On Fri, Feb 3, 2023 at 12:45 PM Stephen Hemminger <stephen@networkplumber.org> wrote: > > On Tue, 6 Dec 2022 10:05:09 -0500 > Mike Pattrick <mkp@redhat.com> wrote: > > > + > > +static __rte_always_inline void > > +mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused bool enable) > > +{ > > +#ifdef MADV_DONTDUMP > > + if (madvise(ptr, size, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) { > > + rte_log(RTE_LOG_INFO, vhost_config_log_level, > > + "VHOST_CONFIG: could not set coredump preference (%s).\n", strerror(errno)); > > + } > > +#endif > > +} > > Why is this inlined in vhost.h? Hardly a critical path function. > Hello Stephen, I thought it was appropriate here due to the size and contents of that function. Thanks, M ^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads: [~2023-03-06 14:22 UTC | newest]

Thread overview: 19+ messages
2022-12-06 15:05 [PATCH v2] vhost: exclude VM hugepages from coredumps Mike Pattrick
2022-12-07 16:54 ` [PATCH v3] " Mike Pattrick
2022-12-20 12:43 ` Maxime Coquelin
2023-02-03 14:50 ` Maxime Coquelin
2023-02-10 15:53 ` David Marchand
2023-02-10 15:59 ` Maxime Coquelin
2023-02-10 21:12 ` Mike Pattrick
2023-02-21 16:26 ` Maxime Coquelin
2023-02-22 13:59 ` Mike Pattrick
2023-02-22 21:43 ` [PATCH] vhost: fix madvise arguments alignment Mike Pattrick
2023-02-23 4:35 ` [PATCH v2] " Mike Pattrick
2023-02-23 16:12 ` Maxime Coquelin
2023-02-23 16:57 ` Mike Pattrick
2023-02-24 15:05 ` Patrick Robb
2023-03-01 20:02 ` [PATCH v3] " Mike Pattrick
2023-03-02 10:19 ` Maxime Coquelin
2023-03-06 14:22 ` Maxime Coquelin
2023-02-03 17:45 ` [PATCH v2] vhost: exclude VM hugepages from coredumps Stephen Hemminger
2023-02-06 1:13 ` Mike Pattrick