From: Eelco Chaudron <echaudro@redhat.com>
To: David Marchand <david.marchand@redhat.com>
Cc: maxime.coquelin@redhat.com, chenbo.xia@intel.com, dev@dpdk.org
Subject: Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
Date: Tue, 16 May 2023 10:53:00 +0200 [thread overview]
Message-ID: <683D0767-DCE8-45F4-B904-CD2DD574DE53@redhat.com> (raw)
In-Reply-To: <CAJFAV8x-myEoVhYaxrfvRcfQhERo3cRnUFpwx7z2=-=gNVCOPw@mail.gmail.com>
On 10 May 2023, at 13:44, David Marchand wrote:
> On Wed, Apr 5, 2023 at 2:42 PM Eelco Chaudron <echaudro@redhat.com> wrote:
>>
>> This patch adds an operation callback which gets called every time the
>> library wants to call eventfd_write(). This eventfd_write() call could
>> result in a system call, which could potentially block the PMD thread.
>>
>> The callback function can decide whether it's ok to handle the
>> eventfd_write() now or have the newly introduced function,
>> rte_vhost_notify_guest(), called at a later time.
>>
>> This can be used by 3rd party applications, like OVS, to avoid system
>> calls being called as part of the PMD threads.
>>
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>> lib/vhost/meson.build | 2 +
>> lib/vhost/rte_vhost.h | 23 +++++++++++++++-
>> lib/vhost/socket.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++---
>> lib/vhost/version.map | 9 ++++++
>> lib/vhost/vhost.c | 38 ++++++++++++++++++++++++++
>> lib/vhost/vhost.h | 65 +++++++++++++++++++++++++++++++-------------
>> 6 files changed, 184 insertions(+), 25 deletions(-)
>>
>> diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
>> index 197a51d936..e93ba6b078 100644
>> --- a/lib/vhost/meson.build
>> +++ b/lib/vhost/meson.build
>> @@ -39,3 +39,5 @@ driver_sdk_headers = files(
>> 'vdpa_driver.h',
>> )
>> deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
>> +
>> +use_function_versioning = true
>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>> index 58a5d4be92..7a10bc36cf 100644
>> --- a/lib/vhost/rte_vhost.h
>> +++ b/lib/vhost/rte_vhost.h
>> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>> */
>> void (*guest_notified)(int vid);
>>
>> - void *reserved[1]; /**< Reserved for future extension */
>> + /**
>> + * If this callback is registered, notification to the guest can
>> + * be handled by the front-end calling rte_vhost_notify_guest().
>> + * If it's not handled, 'false' should be returned. This can be used
>> + * to remove the "slow" eventfd_write() syscall from the datapath.
>> + */
>> + bool (*guest_notify)(int vid, uint16_t queue_id);
>> };
>>
>> /**
>> @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
>>
>> int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
>>
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
>> + *
>> + * Inject the offloaded interrupt into the vhost device's queue. For more
>> + * details see the 'guest_notify' vhost device operation.
>> + *
>> + * @param vid
>> + * vhost device ID
>> + * @param queue_id
>> + * virtio queue index
>> + */
>> +__rte_experimental
>> +void rte_vhost_notify_guest(int vid, uint16_t queue_id);
>> +
>> /**
>> * Register vhost driver. path could be different for multiple
>> * instance support.
>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>> index 669c322e12..787d6bacf8 100644
>> --- a/lib/vhost/socket.c
>> +++ b/lib/vhost/socket.c
>> @@ -15,6 +15,7 @@
>> #include <fcntl.h>
>> #include <pthread.h>
>>
>> +#include <rte_function_versioning.h>
>> #include <rte_log.h>
>>
>> #include "fd_man.h"
>> @@ -43,6 +44,7 @@ struct vhost_user_socket {
>> bool async_copy;
>> bool net_compliant_ol_flags;
>> bool stats_enabled;
>> + bool alloc_notify_ops;
>>
>> /*
>> * The "supported_features" indicates the feature bits the
>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>> vsocket->path = NULL;
>> }
>>
>> + if (vsocket && vsocket->alloc_notify_ops) {
>> +#pragma GCC diagnostic push
>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>> + free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>> +#pragma GCC diagnostic pop
>> + vsocket->notify_ops = NULL;
>> + }
>
> Rather than select the behavior based on a boolean (and here force the
> compiler to close its eyes), I would instead add a non const pointer
> to ops (let's say alloc_notify_ops) in vhost_user_socket.
> The code can then unconditionnally call free(vsocket->alloc_notify_ops);
Good idea, I will make the change in v3.
>> +
>> if (vsocket) {
>> free(vsocket);
>> vsocket = NULL;
>> @@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path)
>> /*
>> * Register ops so that we can add/remove device to data core.
>> */
>> -int
>> -rte_vhost_driver_callback_register(const char *path,
>> - struct rte_vhost_device_ops const * const ops)
>> +static int
>> +rte_vhost_driver_callback_register__(const char *path,
>
> No need for a rte_ prefix on static symbol.
> And we can simply call this internal helper vhost_driver_callback_register().
ACK.
>> + struct rte_vhost_device_ops const * const ops, bool ops_allocated)
>> {
>> struct vhost_user_socket *vsocket;
>>
>> pthread_mutex_lock(&vhost_user.mutex);
>> vsocket = find_vhost_user_socket(path);
>> - if (vsocket)
>> + if (vsocket) {
>> + if (vsocket->alloc_notify_ops) {
>> + vsocket->alloc_notify_ops = false;
>> +#pragma GCC diagnostic push
>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>> + free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>> +#pragma GCC diagnostic pop
>> + }
>> vsocket->notify_ops = ops;
>> + if (ops_allocated)
>> + vsocket->alloc_notify_ops = true;
>> + }
>> pthread_mutex_unlock(&vhost_user.mutex);
>>
>> return vsocket ? 0 : -1;
>> }
>>
>> +int __vsym
>> +rte_vhost_driver_callback_register_v24(const char *path,
>> + struct rte_vhost_device_ops const * const ops)
>> +{
>> + return rte_vhost_driver_callback_register__(path, ops, false);
>> +}
>> +
>> +int __vsym
>> +rte_vhost_driver_callback_register_v23(const char *path,
>> + struct rte_vhost_device_ops const * const ops)
>> +{
>> + int ret;
>> +
>> + /*
>> + * Although the ops structure is a const structure, we do need to
>> + * override the guest_notify operation. This is because with the
>> + * previous APIs it was "reserved" and if any garbage value was passed,
>> + * it could crash the application.
>> + */
>> + if (ops && !ops->guest_notify) {
>
> Hum, as described in the comment above, I don't think we should look
> at ops->guest_notify value at all.
> Checking ops != NULL should be enough.
Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
I guess your comment would be true if we would introduce a new field in the data structure, not replacing a reserved one.
>> + struct rte_vhost_device_ops *new_ops;
>> +
>> + new_ops = malloc(sizeof(*new_ops));
>
> Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
> I am unclear of the impact though.
Don’t think there is a portable API that we can use to determine the NUMA for the ops memory and then allocate this on the same numa?
Any thoughts or ideas on how to solve this? I hope most people will memset() the ops structure and the reserved[1] part is zero, but it might be a problem in the future if more extensions get added.
>> + if (new_ops == NULL)
>> + return -1;
>> +
>> + memcpy(new_ops, ops, sizeof(*new_ops));
>> + new_ops->guest_notify = NULL;
>> +
>> + ret = rte_vhost_driver_callback_register__(path, ops, true);
>> + } else {
>> + ret = rte_vhost_driver_callback_register__(path, ops, false);
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +/* Mark the v23 function as the old version, and v24 as the default version. */
>> +VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
>> +BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
>> +MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
>> + struct rte_vhost_device_ops const * const ops),
>> + rte_vhost_driver_callback_register_v24);
>> +
>> struct rte_vhost_device_ops const *
>> vhost_driver_callback_get(const char *path)
>> {
>> diff --git a/lib/vhost/version.map b/lib/vhost/version.map
>> index d322a4a888..7bcbfd12cf 100644
>> --- a/lib/vhost/version.map
>> +++ b/lib/vhost/version.map
>> @@ -64,6 +64,12 @@ DPDK_23 {
>> local: *;
>> };
>>
>> +DPDK_24 {
>> + global:
>> +
>> + rte_vhost_driver_callback_register;
>> +} DPDK_23;
>> +
>> EXPERIMENTAL {
>> global:
>>
>> @@ -98,6 +104,9 @@ EXPERIMENTAL {
>> # added in 22.11
>> rte_vhost_async_dma_unconfigure;
>> rte_vhost_vring_call_nonblock;
>> +
>> + # added in 23.07
>> + rte_vhost_notify_guest;
>> };
>>
>> INTERNAL {
>> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
>> index 8ff6434c93..79e88f986e 100644
>> --- a/lib/vhost/vhost.c
>> +++ b/lib/vhost/vhost.c
>> @@ -44,6 +44,10 @@ static const struct vhost_vq_stats_name_off vhost_vq_stat_strings[] = {
>> {"size_1024_1518_packets", offsetof(struct vhost_virtqueue, stats.size_bins[6])},
>> {"size_1519_max_packets", offsetof(struct vhost_virtqueue, stats.size_bins[7])},
>> {"guest_notifications", offsetof(struct vhost_virtqueue, stats.guest_notifications)},
>> + {"guest_notifications_offloaded", offsetof(struct vhost_virtqueue,
>> + stats.guest_notifications_offloaded)},
>> + {"guest_notifications_error", offsetof(struct vhost_virtqueue,
>> + stats.guest_notifications_error)},
>> {"iotlb_hits", offsetof(struct vhost_virtqueue, stats.iotlb_hits)},
>> {"iotlb_misses", offsetof(struct vhost_virtqueue, stats.iotlb_misses)},
>> {"inflight_submitted", offsetof(struct vhost_virtqueue, stats.inflight_submitted)},
>> @@ -1467,6 +1471,40 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
>> return ret;
>> }
>>
>> +void
>> +rte_vhost_notify_guest(int vid, uint16_t queue_id)
>> +{
>> + struct virtio_net *dev = get_device(vid);
>> + struct vhost_virtqueue *vq;
>> +
>> + if (!dev || queue_id >= VHOST_MAX_VRING)
>> + return;
>> +
>> + vq = dev->virtqueue[queue_id];
>> + if (!vq)
>> + return;
>> +
>> + rte_rwlock_read_lock(&vq->access_lock);
>> +
>> + if (vq->callfd >= 0) {
>> + int ret = eventfd_write(vq->callfd, (eventfd_t)1);
>> +
>> + if (ret) {
>> + if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> + __atomic_fetch_add(&vq->stats.guest_notifications_error,
>> + 1, __ATOMIC_RELAXED);
>> + } else {
>> + if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> + __atomic_fetch_add(&vq->stats.guest_notifications,
>> + 1, __ATOMIC_RELAXED);
>> + if (dev->notify_ops->guest_notified)
>> + dev->notify_ops->guest_notified(dev->vid);
>> + }
>> + }
>> +
>> + rte_rwlock_read_unlock(&vq->access_lock);
>> +}
>> +
>> void
>> rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
>> {
>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>> index 37609c7c8d..0ab75b78b5 100644
>> --- a/lib/vhost/vhost.h
>> +++ b/lib/vhost/vhost.h
>> @@ -141,6 +141,8 @@ struct virtqueue_stats {
>> uint64_t inflight_completed;
>> /* Counters below are atomic, and should be incremented as such. */
>> uint64_t guest_notifications;
>> + uint64_t guest_notifications_offloaded;
>> + uint64_t guest_notifications_error;
>> };
>>
>> /**
>> @@ -884,6 +886,35 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
>> return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
>> }
>>
>> +static __rte_always_inline void
>> +vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> + int ret;
>> +
>> + if (dev->notify_ops->guest_notify &&
>> + dev->notify_ops->guest_notify(dev->vid, vq->index)) {
>> + if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> + __atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
>> + 1, __ATOMIC_RELAXED);
>> + return;
>> + }
>> +
>> + ret = eventfd_write(vq->callfd, (eventfd_t) 1);
>> + if (ret) {
>> + if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> + __atomic_fetch_add(&vq->stats.guest_notifications_error,
>> + 1, __ATOMIC_RELAXED);
>> + return;
>> + }
>> +
>> + if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> + __atomic_fetch_add(&vq->stats.guest_notifications,
>> + 1, __ATOMIC_RELAXED);
>> + if (dev->notify_ops->guest_notified)
>> + dev->notify_ops->guest_notified(dev->vid);
>> +}
>> +
>> +
>
> One empty line is enough.
Will remove.
>> static __rte_always_inline void
>> vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> {
>> @@ -903,26 +934,16 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> "%s: used_event_idx=%d, old=%d, new=%d\n",
>> __func__, vhost_used_event(vq), old, new);
>>
>> - if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>> - (vq->callfd >= 0)) ||
>> - unlikely(!signalled_used_valid)) {
>> - eventfd_write(vq->callfd, (eventfd_t) 1);
>> - if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> - __atomic_fetch_add(&vq->stats.guest_notifications,
>> - 1, __ATOMIC_RELAXED);
>> - if (dev->notify_ops->guest_notified)
>> - dev->notify_ops->guest_notified(dev->vid);
>> + if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>> + unlikely(!signalled_used_valid)) &&
>> + vq->callfd >= 0) {
>
> A bit hard to read (even before your change).
>
> After the change, indentation does not comply with dpdk coding style.
> http://doc.dpdk.org/guides/contributing/coding_style.html#c-indentation
Yes, I always mess up the DPDK indentation style :(
>
> But putting indentation aside, is this change equivalent?
> - if ((vhost_need_event(vhost_used_event(vq), new, old) &&
> - (vq->callfd >= 0)) ||
> - unlikely(!signalled_used_valid)) {
> + if ((vhost_need_event(vhost_used_event(vq), new, old) ||
> + unlikely(!signalled_used_valid)) &&
> + vq->callfd >= 0) {
They are not equal, but in the past eventfd_write() should also not have been called with callfd < 0, guess this was an existing bug ;)
>> + vhost_vring_inject_irq(dev, vq);
>> }
>> } else {
>> /* Kick the guest if necessary. */
>> if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>> && (vq->callfd >= 0)) {
>> - eventfd_write(vq->callfd, (eventfd_t)1);
>> - if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> - __atomic_fetch_add(&vq->stats.guest_notifications,
>> - 1, __ATOMIC_RELAXED);
>> - if (dev->notify_ops->guest_notified)
>> - dev->notify_ops->guest_notified(dev->vid);
>> + vhost_vring_inject_irq(dev, vq);
>> }
>> }
>> }
>> @@ -974,11 +995,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> if (vhost_need_event(off, new, old))
>> kick = true;
>> kick:
>> - if (kick) {
>> - eventfd_write(vq->callfd, (eventfd_t)1);
>> - if (dev->notify_ops->guest_notified)
>> - dev->notify_ops->guest_notified(dev->vid);
>> - }
>> + if (kick && vq->callfd >= 0)
>> + vhost_vring_inject_irq(dev, vq);
>> }
>>
>> static __rte_always_inline void
>> @@ -1017,4 +1035,11 @@ mbuf_is_consumed(struct rte_mbuf *m)
>>
>> uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
>> void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
>> +
>> +/* Versioned functions */
>> +int rte_vhost_driver_callback_register_v23(const char *path,
>> + struct rte_vhost_device_ops const * const ops);
>> +int rte_vhost_driver_callback_register_v24(const char *path,
>> + struct rte_vhost_device_ops const * const ops);
>> +
>> #endif /* _VHOST_NET_CDEV_H_ */
>>
>
>
> --
> David Marchand
next prev parent reply other threads:[~2023-05-16 8:53 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-04-05 12:40 [PATCH v2 0/3] " Eelco Chaudron
2023-04-05 12:40 ` [PATCH v2 1/3] vhost: Change vhost_virtqueue access lock to a read/write one Eelco Chaudron
2023-04-05 12:41 ` [PATCH v2 2/3] vhost: make the guest_notifications statistic counter atomic Eelco Chaudron
2023-04-05 12:41 ` [PATCH v2 3/3] vhost: add device op to offload the interrupt kick Eelco Chaudron
2023-05-10 11:44 ` David Marchand
2023-05-16 8:53 ` Eelco Chaudron [this message]
2023-05-16 10:12 ` David Marchand
2023-05-16 11:36 ` Eelco Chaudron
2023-05-16 11:45 ` Maxime Coquelin
2023-05-16 12:07 ` Eelco Chaudron
2023-05-17 9:18 ` Eelco Chaudron
2023-05-08 13:58 ` [PATCH v2 0/3] " Eelco Chaudron
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=683D0767-DCE8-45F4-B904-CD2DD574DE53@redhat.com \
--to=echaudro@redhat.com \
--cc=chenbo.xia@intel.com \
--cc=david.marchand@redhat.com \
--cc=dev@dpdk.org \
--cc=maxime.coquelin@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).