DPDK patches and discussions
* [PATCH v2 0/3] vhost: add device op to offload the interrupt kick
@ 2023-04-05 12:40 Eelco Chaudron
  2023-04-05 12:40 ` [PATCH v2 1/3] vhost: Change vhost_virtqueue access lock to a read/write one Eelco Chaudron
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Eelco Chaudron @ 2023-04-05 12:40 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia; +Cc: dev

This series adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by third-party applications, like OVS, to avoid system
calls being made from the PMD threads.
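
For illustration, a minimal sketch of the application side follows. Only
the guest_notify callback, rte_vhost_notify_guest() and
rte_vhost_driver_callback_register() come from this series; the deferral
helper is a hypothetical placeholder:

#include <stdbool.h>
#include <stdint.h>
#include <rte_vhost.h>

/* Hypothetical: queue the kick somewhere a non-PMD thread will see it. */
extern bool app_defer_kick(int vid, uint16_t queue_id);

static bool
app_guest_notify(int vid, uint16_t queue_id)
{
	/*
	 * Return true to take ownership of the notification; the
	 * application must then call rte_vhost_notify_guest() later.
	 * Return false to let the library do the eventfd_write() itself.
	 */
	return app_defer_kick(vid, queue_id);
}

static const struct rte_vhost_device_ops app_ops = {
	/* new_device/destroy_device and friends omitted for brevity. */
	.guest_notify = app_guest_notify,
};

/* At setup time: rte_vhost_driver_callback_register(path, &app_ops); */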

v2: - Used vhost_virtqueue->index to find the index for the operation.
    - Aligned the function name to the VDUSE RFC patchset.
    - Added error and offload statistics counters.
    - Marked the new API as experimental.
    - Changed the virtqueue spin lock to a read/write lock.
    - Made shared counters atomic.
    - Added a versioned rte_vhost_driver_callback_register() for
      ABI compliance.

Eelco Chaudron (3):
      vhost: Change vhost_virtqueue access lock to a read/write one.
      vhost: make the guest_notifications statistic counter atomic.
      vhost: add device op to offload the interrupt kick


 lib/eal/include/generic/rte_rwlock.h | 17 +++++
 lib/vhost/meson.build                |  2 +
 lib/vhost/rte_vhost.h                | 23 ++++++-
 lib/vhost/socket.c                   | 72 ++++++++++++++++++++--
 lib/vhost/version.map                |  9 +++
 lib/vhost/vhost.c                    | 92 +++++++++++++++++++++-------
 lib/vhost/vhost.h                    | 70 ++++++++++++++-------
 lib/vhost/vhost_user.c               | 14 ++---
 lib/vhost/virtio_net.c               | 90 +++++++++++++--------------
 9 files changed, 288 insertions(+), 101 deletions(-)



* [PATCH v2 1/3] vhost: Change vhost_virtqueue access lock to a read/write one.
  2023-04-05 12:40 [PATCH v2 0/3] vhost: add device op to offload the interrupt kick Eelco Chaudron
@ 2023-04-05 12:40 ` Eelco Chaudron
  2023-04-05 12:41 ` [PATCH v2 2/3] vhost: make the guest_notifications statistic counter atomic Eelco Chaudron
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 12+ messages in thread
From: Eelco Chaudron @ 2023-04-05 12:40 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia; +Cc: dev

This change will allow the vhost interrupt datapath handling to be split
between two processes without one of them holding an explicit lock.
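
A minimal sketch of the resulting pattern (access_lock here is an
illustrative stand-in for vq->access_lock): datapath entry points take the
lock shared, while control-path operations keep it exclusive. Note that
rte_rwlock_read_trylock() returns 0 on success and a negative value on
failure, the opposite convention of rte_spinlock_trylock(), hence the
inverted trylock tests in this patch:

#include <rte_rwlock.h>

static rte_rwlock_t access_lock = RTE_RWLOCK_INITIALIZER;

static void
datapath_op(void)
{
	/* Several datapath threads may hold the lock concurrently. */
	if (rte_rwlock_read_trylock(&access_lock) != 0)
		return; /* busy, try again later */
	/* ... enqueue/dequeue/notify ... */
	rte_rwlock_read_unlock(&access_lock);
}

static void
control_op(void)
{
	/* The control path still excludes all other users of the queue. */
	rte_rwlock_write_lock(&access_lock);
	/* ... reconfigure the virtqueue ... */
	rte_rwlock_write_unlock(&access_lock);
}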

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/eal/include/generic/rte_rwlock.h |   17 ++++++
 lib/vhost/vhost.c                    |   46 +++++++++--------
 lib/vhost/vhost.h                    |    4 +-
 lib/vhost/vhost_user.c               |   14 +++--
 lib/vhost/virtio_net.c               |   90 +++++++++++++++++-----------------
 5 files changed, 94 insertions(+), 77 deletions(-)

diff --git a/lib/eal/include/generic/rte_rwlock.h b/lib/eal/include/generic/rte_rwlock.h
index d45c22c189..3a5f0e568f 100644
--- a/lib/eal/include/generic/rte_rwlock.h
+++ b/lib/eal/include/generic/rte_rwlock.h
@@ -236,6 +236,23 @@ rte_rwlock_write_unlock(rte_rwlock_t *rwl)
 	__atomic_fetch_sub(&rwl->cnt, RTE_RWLOCK_WRITE, __ATOMIC_RELEASE);
 }
 
+/**
+ * Test if the write lock is taken.
+ *
+ * @param rwl
+ *   A pointer to a rwlock structure.
+ * @return
+ *   1 if the write lock is currently taken; 0 otherwise.
+ */
+static inline int
+rte_rwlock_write_is_locked(rte_rwlock_t *rwl)
+{
+	if (__atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED) & RTE_RWLOCK_WRITE)
+		return 1;
+
+	return 0;
+}
+
 /**
  * Try to execute critical section in a hardware memory transaction, if it
  * fails or not available take a read lock
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index ef37943817..74bdbfd810 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -393,9 +393,9 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	else
 		rte_free(vq->shadow_used_split);
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_write_lock(&vq->access_lock);
 	vhost_free_async_mem(vq);
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 	rte_free(vq->batch_copy_elems);
 	vhost_user_iotlb_destroy(vq);
 	rte_free(vq->log_cache);
@@ -630,7 +630,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
 
 		dev->virtqueue[i] = vq;
 		init_vring_queue(dev, vq, i);
-		rte_spinlock_init(&vq->access_lock);
+		rte_rwlock_init(&vq->access_lock);
 		vq->avail_wrap_counter = 1;
 		vq->used_wrap_counter = 1;
 		vq->signalled_used_valid = false;
@@ -1305,14 +1305,14 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
 	if (!vq)
 		return -1;
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_read_lock(&vq->access_lock);
 
 	if (vq_is_packed(dev))
 		vhost_vring_call_packed(dev, vq);
 	else
 		vhost_vring_call_split(dev, vq);
 
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_read_unlock(&vq->access_lock);
 
 	return 0;
 }
@@ -1334,7 +1334,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx)
 	if (!vq)
 		return -1;
 
-	if (!rte_spinlock_trylock(&vq->access_lock))
+	if (rte_rwlock_read_trylock(&vq->access_lock))
 		return -EAGAIN;
 
 	if (vq_is_packed(dev))
@@ -1342,7 +1342,7 @@ rte_vhost_vring_call_nonblock(int vid, uint16_t vring_idx)
 	else
 		vhost_vring_call_split(dev, vq);
 
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_read_unlock(&vq->access_lock);
 
 	return 0;
 }
@@ -1365,7 +1365,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id)
 	if (!vq)
 		return 0;
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_write_lock(&vq->access_lock);
 
 	if (unlikely(!vq->enabled || vq->avail == NULL))
 		goto out;
@@ -1373,7 +1373,7 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id)
 	ret = *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
 
 out:
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 	return ret;
 }
 
@@ -1457,12 +1457,12 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
 	if (!vq)
 		return -1;
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_write_lock(&vq->access_lock);
 
 	vq->notif_enable = enable;
 	ret = vhost_enable_guest_notification(dev, vq, enable);
 
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 
 	return ret;
 }
@@ -1520,7 +1520,7 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid)
 	if (vq == NULL)
 		return 0;
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_write_lock(&vq->access_lock);
 
 	if (unlikely(!vq->enabled || vq->avail == NULL))
 		goto out;
@@ -1528,7 +1528,7 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid)
 	ret = *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx;
 
 out:
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 	return ret;
 }
 
@@ -1757,9 +1757,9 @@ rte_vhost_async_channel_register(int vid, uint16_t queue_id)
 	if (unlikely(vq == NULL || !dev->async_copy || dev->vdpa_dev != NULL))
 		return -1;
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_write_lock(&vq->access_lock);
 	ret = async_channel_register(dev, vq);
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 
 	return ret;
 }
@@ -1804,7 +1804,7 @@ rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 	if (vq == NULL)
 		return ret;
 
-	if (!rte_spinlock_trylock(&vq->access_lock)) {
+	if (rte_rwlock_write_trylock(&vq->access_lock)) {
 		VHOST_LOG_CONFIG(dev->ifname, ERR,
 			"failed to unregister async channel, virtqueue busy.\n");
 		return ret;
@@ -1821,7 +1821,7 @@ rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 		ret = 0;
 	}
 
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 
 	return ret;
 }
@@ -1954,7 +1954,7 @@ rte_vhost_async_get_inflight(int vid, uint16_t queue_id)
 	if (vq == NULL)
 		return ret;
 
-	if (!rte_spinlock_trylock(&vq->access_lock)) {
+	if (rte_rwlock_write_trylock(&vq->access_lock)) {
 		VHOST_LOG_CONFIG(dev->ifname, DEBUG,
 			"failed to check in-flight packets. virtqueue busy.\n");
 		return ret;
@@ -1963,7 +1963,7 @@ rte_vhost_async_get_inflight(int vid, uint16_t queue_id)
 	if (vq->async)
 		ret = vq->async->pkts_inflight_n;
 
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 
 	return ret;
 }
@@ -2084,13 +2084,13 @@ rte_vhost_vring_stats_get(int vid, uint16_t queue_id,
 
 	vq = dev->virtqueue[queue_id];
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_write_lock(&vq->access_lock);
 	for (i = 0; i < VHOST_NB_VQ_STATS; i++) {
 		stats[i].value =
 			*(uint64_t *)(((char *)vq) + vhost_vq_stat_strings[i].offset);
 		stats[i].id = i;
 	}
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 
 	return VHOST_NB_VQ_STATS;
 }
@@ -2111,9 +2111,9 @@ int rte_vhost_vring_stats_reset(int vid, uint16_t queue_id)
 
 	vq = dev->virtqueue[queue_id];
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_write_lock(&vq->access_lock);
 	memset(&vq->stats, 0, sizeof(vq->stats));
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 
 	return 0;
 }
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 8fdab13c70..5c939ef06f 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -277,7 +277,7 @@ struct vhost_virtqueue {
 	bool			access_ok;
 	bool			ready;
 
-	rte_spinlock_t		access_lock;
+	rte_rwlock_t		access_lock;
 
 
 	union {
@@ -517,7 +517,7 @@ static inline void
 vq_assert_lock__(struct virtio_net *dev, struct vhost_virtqueue *vq, const char *func)
 	__rte_assert_exclusive_lock(&vq->access_lock)
 {
-	if (unlikely(!rte_spinlock_is_locked(&vq->access_lock)))
+	if (unlikely(!rte_rwlock_write_is_locked(&vq->access_lock)))
 		rte_panic("VHOST_CONFIG: (%s) %s() called without access lock taken.\n",
 			dev->ifname, func);
 }
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index d60e39b6bc..c9454ce3d9 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -399,7 +399,7 @@ vhost_user_set_features(struct virtio_net **pdev,
 			cleanup_vq_inflight(dev, vq);
 			/* vhost_user_lock_all_queue_pairs locked all qps */
 			vq_assert_lock(dev, vq);
-			rte_spinlock_unlock(&vq->access_lock);
+			rte_rwlock_write_unlock(&vq->access_lock);
 			free_vq(dev, vq);
 		}
 	}
@@ -2649,10 +2649,10 @@ vhost_user_iotlb_msg(struct virtio_net **pdev,
 					len, imsg->perm);
 
 			if (is_vring_iotlb(dev, vq, imsg)) {
-				rte_spinlock_lock(&vq->access_lock);
+				rte_rwlock_write_lock(&vq->access_lock);
 				translate_ring_addresses(&dev, &vq);
 				*pdev = dev;
-				rte_spinlock_unlock(&vq->access_lock);
+				rte_rwlock_write_unlock(&vq->access_lock);
 			}
 		}
 		break;
@@ -2667,9 +2667,9 @@ vhost_user_iotlb_msg(struct virtio_net **pdev,
 					imsg->size);
 
 			if (is_vring_iotlb(dev, vq, imsg)) {
-				rte_spinlock_lock(&vq->access_lock);
+				rte_rwlock_write_lock(&vq->access_lock);
 				vring_invalidate(dev, vq);
-				rte_spinlock_unlock(&vq->access_lock);
+				rte_rwlock_write_unlock(&vq->access_lock);
 			}
 		}
 		break;
@@ -3030,7 +3030,7 @@ vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
 		struct vhost_virtqueue *vq = dev->virtqueue[i];
 
 		if (vq) {
-			rte_spinlock_lock(&vq->access_lock);
+			rte_rwlock_write_lock(&vq->access_lock);
 			vq_num++;
 		}
 		i++;
@@ -3048,7 +3048,7 @@ vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
 		struct vhost_virtqueue *vq = dev->virtqueue[i];
 
 		if (vq) {
-			rte_spinlock_unlock(&vq->access_lock);
+			rte_rwlock_write_unlock(&vq->access_lock);
 			vq_num++;
 		}
 		i++;
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index be28ea5151..267faaa4fd 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -55,7 +55,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
 static inline void
 vhost_queue_stats_update(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf **pkts, uint16_t count)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	struct virtqueue_stats *stats = &vq->stats;
 	int i;
@@ -102,7 +102,7 @@ static __rte_always_inline int64_t
 vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		int16_t dma_id, uint16_t vchan_id, uint16_t flag_idx,
 		struct vhost_iov_iter *pkt)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	struct async_dma_vchan_info *dma_info = &dma_copy_track[dma_id].vchans[vchan_id];
 	uint16_t ring_mask = dma_info->ring_mask;
@@ -152,7 +152,7 @@ static __rte_always_inline uint16_t
 vhost_async_dma_transfer(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		int16_t dma_id, uint16_t vchan_id, uint16_t head_idx,
 		struct vhost_iov_iter *pkts, uint16_t nr_pkts)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	struct async_dma_vchan_info *dma_info = &dma_copy_track[dma_id].vchans[vchan_id];
 	int64_t ret, nr_copies = 0;
@@ -454,7 +454,7 @@ vhost_async_shadow_enqueue_packed_batch(struct vhost_virtqueue *vq,
 
 static __rte_always_inline void
 vhost_async_shadow_dequeue_packed_batch(struct vhost_virtqueue *vq, uint16_t *ids)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	uint16_t i;
 	struct vhost_async *async = vq->async;
@@ -1131,7 +1131,7 @@ static __rte_always_inline int
 async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf *m, uint32_t mbuf_offset,
 		uint64_t buf_iova, uint32_t cpy_len, bool to_desc)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	struct vhost_async *async = vq->async;
@@ -1211,7 +1211,7 @@ static __rte_always_inline int
 mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf *m, struct buf_vector *buf_vec,
 		uint16_t nr_vec, uint16_t num_buffers, bool is_async)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint32_t vec_idx = 0;
@@ -1341,7 +1341,7 @@ vhost_enqueue_single_packed(struct virtio_net *dev,
 			    struct rte_mbuf *pkt,
 			    struct buf_vector *buf_vec,
 			    uint16_t *nr_descs)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint16_t nr_vec = 0;
@@ -1403,7 +1403,7 @@ vhost_enqueue_single_packed(struct virtio_net *dev,
 static __rte_noinline uint32_t
 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf **pkts, uint32_t count)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint32_t pkt_idx = 0;
@@ -1635,7 +1635,7 @@ static __rte_always_inline int16_t
 virtio_dev_rx_single_packed(struct virtio_net *dev,
 			    struct vhost_virtqueue *vq,
 			    struct rte_mbuf *pkt)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
@@ -1661,7 +1661,7 @@ virtio_dev_rx_packed(struct virtio_net *dev,
 		     struct vhost_virtqueue *__rte_restrict vq,
 		     struct rte_mbuf **__rte_restrict pkts,
 		     uint32_t count)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint32_t pkt_idx = 0;
@@ -1701,7 +1701,7 @@ virtio_dev_rx(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t nb_tx = 0;
 
 	VHOST_LOG_DATA(dev->ifname, DEBUG, "%s\n", __func__);
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_read_lock(&vq->access_lock);
 
 	if (unlikely(!vq->enabled))
 		goto out_access_unlock;
@@ -1727,7 +1727,7 @@ virtio_dev_rx(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	vhost_user_iotlb_rd_unlock(vq);
 
 out_access_unlock:
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_read_unlock(&vq->access_lock);
 
 	return nb_tx;
 }
@@ -1760,7 +1760,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 
 static __rte_always_inline uint16_t
 async_get_first_inflight_pkt_idx(struct vhost_virtqueue *vq)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	struct vhost_async *async = vq->async;
 
@@ -2165,7 +2165,7 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct vhost_virtqueue
 
 static __rte_always_inline void
 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	struct vhost_async *async = vq->async;
 	uint16_t nr_left = n_descs;
@@ -2198,7 +2198,7 @@ write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
 static __rte_always_inline void
 write_back_completed_descs_packed(struct vhost_virtqueue *vq,
 				uint16_t n_buffers)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	struct vhost_async *async = vq->async;
 	uint16_t from = async->last_buffer_idx_packed;
@@ -2263,7 +2263,7 @@ write_back_completed_descs_packed(struct vhost_virtqueue *vq,
 static __rte_always_inline uint16_t
 vhost_poll_enqueue_completed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf **pkts, uint16_t count, int16_t dma_id, uint16_t vchan_id)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	struct vhost_async *async = vq->async;
 	struct async_inflight_info *pkts_info = async->pkts_info;
@@ -2357,7 +2357,7 @@ rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 	vq = dev->virtqueue[queue_id];
 
-	if (!rte_spinlock_trylock(&vq->access_lock)) {
+	if (rte_rwlock_read_trylock(&vq->access_lock)) {
 		VHOST_LOG_DATA(dev->ifname, DEBUG,
 			"%s: virtqueue %u is busy.\n",
 			__func__, queue_id);
@@ -2377,7 +2377,7 @@ rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 	vq->stats.inflight_completed += n_pkts_cpl;
 
 out:
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_read_unlock(&vq->access_lock);
 
 	return n_pkts_cpl;
 }
@@ -2465,7 +2465,7 @@ rte_vhost_clear_queue(int vid, uint16_t queue_id, struct rte_mbuf **pkts,
 
 	vq = dev->virtqueue[queue_id];
 
-	if (!rte_spinlock_trylock(&vq->access_lock)) {
+	if (rte_rwlock_read_trylock(&vq->access_lock)) {
 		VHOST_LOG_DATA(dev->ifname, DEBUG, "%s: virtqueue %u is busy.\n",
 			__func__, queue_id);
 		return 0;
@@ -2495,7 +2495,7 @@ rte_vhost_clear_queue(int vid, uint16_t queue_id, struct rte_mbuf **pkts,
 	vq->stats.inflight_completed += n_pkts_cpl;
 
 out_access_unlock:
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_read_unlock(&vq->access_lock);
 
 	return n_pkts_cpl;
 }
@@ -2516,7 +2516,7 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		return 0;
 	}
 
-	rte_spinlock_lock(&vq->access_lock);
+	rte_rwlock_write_lock(&vq->access_lock);
 
 	if (unlikely(!vq->enabled || !vq->async))
 		goto out_access_unlock;
@@ -2544,7 +2544,7 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	vhost_user_iotlb_rd_unlock(vq);
 
 out_access_unlock:
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_write_unlock(&vq->access_lock);
 
 	return nb_tx;
 }
@@ -2866,7 +2866,7 @@ desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		  struct buf_vector *buf_vec, uint16_t nr_vec,
 		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
 		  bool legacy_ol_flags, uint16_t slot_idx, bool is_async)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint32_t buf_avail, buf_offset, buf_len;
@@ -3073,7 +3073,7 @@ static uint16_t
 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
 	bool legacy_ol_flags)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint16_t i;
@@ -3178,7 +3178,7 @@ static uint16_t
 virtio_dev_tx_split_legacy(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
 	struct rte_mbuf **pkts, uint16_t count)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, true);
@@ -3189,7 +3189,7 @@ static uint16_t
 virtio_dev_tx_split_compliant(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
 	struct rte_mbuf **pkts, uint16_t count)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, false);
@@ -3393,7 +3393,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev,
 			    uint16_t *buf_id,
 			    uint16_t *desc_count,
 			    bool legacy_ol_flags)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
@@ -3443,7 +3443,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev,
 			    struct rte_mempool *mbuf_pool,
 			    struct rte_mbuf *pkts,
 			    bool legacy_ol_flags)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 
@@ -3475,7 +3475,7 @@ virtio_dev_tx_packed(struct virtio_net *dev,
 		     struct rte_mbuf **__rte_restrict pkts,
 		     uint32_t count,
 		     bool legacy_ol_flags)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint32_t pkt_idx = 0;
@@ -3520,7 +3520,7 @@ static uint16_t
 virtio_dev_tx_packed_legacy(struct virtio_net *dev,
 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, true);
@@ -3531,7 +3531,7 @@ static uint16_t
 virtio_dev_tx_packed_compliant(struct virtio_net *dev,
 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, false);
@@ -3566,7 +3566,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	vq = dev->virtqueue[queue_id];
 
-	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+	if (unlikely(rte_rwlock_read_trylock(&vq->access_lock) != 0))
 		return 0;
 
 	if (unlikely(!vq->enabled)) {
@@ -3636,7 +3636,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	vhost_user_iotlb_rd_unlock(vq);
 
 out_access_unlock:
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_read_unlock(&vq->access_lock);
 
 	if (unlikely(rarp_mbuf != NULL))
 		count += 1;
@@ -3648,7 +3648,7 @@ static __rte_always_inline uint16_t
 async_poll_dequeue_completed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
 		uint16_t vchan_id, bool legacy_ol_flags)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	uint16_t start_idx, from, i;
 	uint16_t nr_cpl_pkts = 0;
@@ -3695,7 +3695,7 @@ static __rte_always_inline uint16_t
 virtio_dev_tx_async_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
 		int16_t dma_id, uint16_t vchan_id, bool legacy_ol_flags)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	static bool allocerr_warned;
@@ -3844,7 +3844,7 @@ virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
 		struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
 		struct rte_mbuf **pkts, uint16_t count,
 		int16_t dma_id, uint16_t vchan_id)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	return virtio_dev_tx_async_split(dev, vq, mbuf_pool,
@@ -3857,7 +3857,7 @@ virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
 		struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
 		struct rte_mbuf **pkts, uint16_t count,
 		int16_t dma_id, uint16_t vchan_id)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	return virtio_dev_tx_async_split(dev, vq, mbuf_pool,
@@ -3867,7 +3867,7 @@ virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
 static __rte_always_inline void
 vhost_async_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
 				uint16_t buf_id, uint16_t count)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 {
 	struct vhost_async *async = vq->async;
 	uint16_t idx = async->buffer_idx_packed;
@@ -3889,7 +3889,7 @@ virtio_dev_tx_async_single_packed(struct virtio_net *dev,
 			struct rte_mbuf *pkts,
 			uint16_t slot_idx,
 			bool legacy_ol_flags)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	int err;
@@ -3942,7 +3942,7 @@ virtio_dev_tx_async_packed_batch(struct virtio_net *dev,
 			   struct vhost_virtqueue *vq,
 			   struct rte_mbuf **pkts, uint16_t slot_idx,
 			   uint16_t dma_id, uint16_t vchan_id)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint16_t avail_idx = vq->last_avail_idx;
@@ -4000,7 +4000,7 @@ static __rte_always_inline uint16_t
 virtio_dev_tx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
 		uint16_t count, uint16_t dma_id, uint16_t vchan_id, bool legacy_ol_flags)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	uint32_t pkt_idx = 0;
@@ -4111,7 +4111,7 @@ static uint16_t
 virtio_dev_tx_async_packed_legacy(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
 		uint16_t count, uint16_t dma_id, uint16_t vchan_id)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	return virtio_dev_tx_async_packed(dev, vq, mbuf_pool,
@@ -4123,7 +4123,7 @@ static uint16_t
 virtio_dev_tx_async_packed_compliant(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
 		uint16_t count, uint16_t dma_id, uint16_t vchan_id)
-	__rte_exclusive_locks_required(&vq->access_lock)
+	__rte_shared_locks_required(&vq->access_lock)
 	__rte_shared_locks_required(&vq->iotlb_lock)
 {
 	return virtio_dev_tx_async_packed(dev, vq, mbuf_pool,
@@ -4173,7 +4173,7 @@ rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
 
 	vq = dev->virtqueue[queue_id];
 
-	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+	if (unlikely(rte_rwlock_read_trylock(&vq->access_lock) != 0))
 		return 0;
 
 	if (unlikely(vq->enabled == 0)) {
@@ -4255,7 +4255,7 @@ rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
 	vhost_user_iotlb_rd_unlock(vq);
 
 out_access_unlock:
-	rte_spinlock_unlock(&vq->access_lock);
+	rte_rwlock_read_unlock(&vq->access_lock);
 
 	if (unlikely(rarp_mbuf != NULL))
 		count += 1;



* [PATCH v2 2/3] vhost: make the guest_notifications statistic counter atomic.
  2023-04-05 12:40 [PATCH v2 0/3] vhost: add device op to offload the interrupt kick Eelco Chaudron
  2023-04-05 12:40 ` [PATCH v2 1/3] vhost: Change vhost_virtqueue access lock to a read/write one Eelco Chaudron
@ 2023-04-05 12:41 ` Eelco Chaudron
  2023-04-05 12:41 ` [PATCH v2 3/3] vhost: add device op to offload the interrupt kick Eelco Chaudron
  2023-05-08 13:58 ` [PATCH v2 0/3] " Eelco Chaudron
  3 siblings, 0 replies; 12+ messages in thread
From: Eelco Chaudron @ 2023-04-05 12:41 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia; +Cc: dev

Making the guest_notifications statistic counter atomic allows it to be
safely incremented while holding the read access_lock.
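
A minimal sketch of the pattern (the plain counter stands in for
vq->stats.guest_notifications): writers that hold the access_lock only in
shared mode use a relaxed atomic add, while readers that take the lock
exclusively, as rte_vhost_vring_stats_get() does, can keep plain accesses:

#include <stdint.h>

static uint64_t guest_notifications;

/* Datapath thread, access_lock held in shared (read) mode: */
static void
count_notification(void)
{
	__atomic_fetch_add(&guest_notifications, 1, __ATOMIC_RELAXED);
}

/* Stats reader, access_lock held in exclusive (write) mode, so the
 * plain load cannot race with the adds above: */
static uint64_t
read_notifications(void)
{
	return guest_notifications;
}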

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/vhost/vhost.c |    8 ++++++++
 lib/vhost/vhost.h |    9 ++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index 74bdbfd810..8ff6434c93 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -2086,6 +2086,10 @@ rte_vhost_vring_stats_get(int vid, uint16_t queue_id,
 
 	rte_rwlock_write_lock(&vq->access_lock);
 	for (i = 0; i < VHOST_NB_VQ_STATS; i++) {
+		/*
+		 * No need to read the atomic counters atomically here, as the
+		 * write access_lock above prevents concurrent updates.
+		 */
 		stats[i].value =
 			*(uint64_t *)(((char *)vq) + vhost_vq_stat_strings[i].offset);
 		stats[i].id = i;
@@ -2112,6 +2116,10 @@ int rte_vhost_vring_stats_reset(int vid, uint16_t queue_id)
 	vq = dev->virtqueue[queue_id];
 
 	rte_rwlock_write_lock(&vq->access_lock);
+	/*
+	 * No need to reset the atomic counters atomically here, as the
+	 * write access_lock above prevents concurrent updates.
+	 */
 	memset(&vq->stats, 0, sizeof(vq->stats));
 	rte_rwlock_write_unlock(&vq->access_lock);
 
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 5c939ef06f..37609c7c8d 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -135,11 +135,12 @@ struct virtqueue_stats {
 	uint64_t broadcast;
 	/* Size bins in array as RFC 2819, undersized [0], 64 [1], etc */
 	uint64_t size_bins[8];
-	uint64_t guest_notifications;
 	uint64_t iotlb_hits;
 	uint64_t iotlb_misses;
 	uint64_t inflight_submitted;
 	uint64_t inflight_completed;
+	/* Counters below are atomic, and should be incremented as such. */
+	uint64_t guest_notifications;
 };
 
 /**
@@ -907,7 +908,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 				unlikely(!signalled_used_valid)) {
 			eventfd_write(vq->callfd, (eventfd_t) 1);
 			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				vq->stats.guest_notifications++;
+				__atomic_fetch_add(&vq->stats.guest_notifications,
+					1, __ATOMIC_RELAXED);
 			if (dev->notify_ops->guest_notified)
 				dev->notify_ops->guest_notified(dev->vid);
 		}
@@ -917,7 +919,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 				&& (vq->callfd >= 0)) {
 			eventfd_write(vq->callfd, (eventfd_t)1);
 			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				vq->stats.guest_notifications++;
+				__atomic_fetch_add(&vq->stats.guest_notifications,
+					1, __ATOMIC_RELAXED);
 			if (dev->notify_ops->guest_notified)
 				dev->notify_ops->guest_notified(dev->vid);
 		}



* [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
  2023-04-05 12:40 [PATCH v2 0/3] vhost: add device op to offload the interrupt kick Eelco Chaudron
  2023-04-05 12:40 ` [PATCH v2 1/3] vhost: Change vhost_virtqueue access lock to a read/write one Eelco Chaudron
  2023-04-05 12:41 ` [PATCH v2 2/3] vhost: make the guest_notifications statistic counter atomic Eelco Chaudron
@ 2023-04-05 12:41 ` Eelco Chaudron
  2023-05-10 11:44   ` David Marchand
  2023-05-08 13:58 ` [PATCH v2 0/3] " Eelco Chaudron
  3 siblings, 1 reply; 12+ messages in thread
From: Eelco Chaudron @ 2023-04-05 12:41 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia; +Cc: dev

This patch adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by third-party applications, like OVS, to avoid system
calls being made from the PMD threads.
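
A minimal sketch of the consuming side, assuming the application queued the
deferred kicks from its guest_notify callback (the request queue and helper
are hypothetical):

#include <stdint.h>
#include <rte_vhost.h>

struct kick_req {
	int vid;
	uint16_t queue_id;
};

/* Hypothetical: pop a deferred kick; returns 0 on success. */
extern int app_dequeue_kick(struct kick_req *req);

/* Runs in a non-PMD (e.g. housekeeping) thread. */
static void
drain_deferred_kicks(void)
{
	struct kick_req req;

	while (app_dequeue_kick(&req) == 0)
		/* The potentially blocking eventfd_write() happens here,
		 * outside the PMD thread. */
		rte_vhost_notify_guest(req.vid, req.queue_id);
}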

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/vhost/meson.build |    2 +
 lib/vhost/rte_vhost.h |   23 +++++++++++++++-
 lib/vhost/socket.c    |   72 ++++++++++++++++++++++++++++++++++++++++++++++---
 lib/vhost/version.map |    9 ++++++
 lib/vhost/vhost.c     |   38 ++++++++++++++++++++++++++
 lib/vhost/vhost.h     |   65 +++++++++++++++++++++++++++++++-------------
 6 files changed, 184 insertions(+), 25 deletions(-)

diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
index 197a51d936..e93ba6b078 100644
--- a/lib/vhost/meson.build
+++ b/lib/vhost/meson.build
@@ -39,3 +39,5 @@ driver_sdk_headers = files(
         'vdpa_driver.h',
 )
 deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
+
+use_function_versioning = true
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 58a5d4be92..7a10bc36cf 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
 	 */
 	void (*guest_notified)(int vid);
 
-	void *reserved[1]; /**< Reserved for future extension */
+	/**
+	 * If this callback is registered, notification to the guest can
+	 * be handled by the front-end calling rte_vhost_notify_guest().
+	 * If it's not handled, 'false' should be returned. This can be used
+	 * to remove the "slow" eventfd_write() syscall from the datapath.
+	 */
+	bool (*guest_notify)(int vid, uint16_t queue_id);
 };
 
 /**
@@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Inject the offloaded interrupt into the vhost device's queue. For more
+ * details see the 'guest_notify' vhost device operation.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index
+ */
+__rte_experimental
+void rte_vhost_notify_guest(int vid, uint16_t queue_id);
+
 /**
  * Register vhost driver. path could be different for multiple
  * instance support.
diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
index 669c322e12..787d6bacf8 100644
--- a/lib/vhost/socket.c
+++ b/lib/vhost/socket.c
@@ -15,6 +15,7 @@
 #include <fcntl.h>
 #include <pthread.h>
 
+#include <rte_function_versioning.h>
 #include <rte_log.h>
 
 #include "fd_man.h"
@@ -43,6 +44,7 @@ struct vhost_user_socket {
 	bool async_copy;
 	bool net_compliant_ol_flags;
 	bool stats_enabled;
+	bool alloc_notify_ops;
 
 	/*
 	 * The "supported_features" indicates the feature bits the
@@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
 		vsocket->path = NULL;
 	}
 
+	if (vsocket && vsocket->alloc_notify_ops) {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+		free((struct rte_vhost_device_ops *)vsocket->notify_ops);
+#pragma GCC diagnostic pop
+		vsocket->notify_ops = NULL;
+	}
+
 	if (vsocket) {
 		free(vsocket);
 		vsocket = NULL;
@@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path)
 /*
  * Register ops so that we can add/remove device to data core.
  */
-int
-rte_vhost_driver_callback_register(const char *path,
-	struct rte_vhost_device_ops const * const ops)
+static int
+rte_vhost_driver_callback_register__(const char *path,
+	struct rte_vhost_device_ops const * const ops, bool ops_allocated)
 {
 	struct vhost_user_socket *vsocket;
 
 	pthread_mutex_lock(&vhost_user.mutex);
 	vsocket = find_vhost_user_socket(path);
-	if (vsocket)
+	if (vsocket) {
+		if (vsocket->alloc_notify_ops) {
+			vsocket->alloc_notify_ops = false;
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+			free((struct rte_vhost_device_ops *)vsocket->notify_ops);
+#pragma GCC diagnostic pop
+		}
 		vsocket->notify_ops = ops;
+		if (ops_allocated)
+			vsocket->alloc_notify_ops = true;
+	}
 	pthread_mutex_unlock(&vhost_user.mutex);
 
 	return vsocket ? 0 : -1;
 }
 
+int __vsym
+rte_vhost_driver_callback_register_v24(const char *path,
+	struct rte_vhost_device_ops const * const ops)
+{
+	return rte_vhost_driver_callback_register__(path, ops, false);
+}
+
+int __vsym
+rte_vhost_driver_callback_register_v23(const char *path,
+	struct rte_vhost_device_ops const * const ops)
+{
+	int ret;
+
+	/*
+	 * Although the ops structure is a const structure, we do need to
+	 * override the guest_notify operation. This is because with the
+	 * previous APIs it was "reserved" and if any garbage value was passed,
+	 * it could crash the application.
+	 */
+	if (ops && !ops->guest_notify) {
+		struct rte_vhost_device_ops *new_ops;
+
+		new_ops = malloc(sizeof(*new_ops));
+		if (new_ops == NULL)
+			return -1;
+
+		memcpy(new_ops, ops, sizeof(*new_ops));
+		new_ops->guest_notify = NULL;
+
+		ret = rte_vhost_driver_callback_register__(path, new_ops, true);
+	} else {
+		ret = rte_vhost_driver_callback_register__(path, ops, false);
+	}
+
+	return ret;
+}
+
+/* Mark the v23 function as the old version, and v24 as the default version. */
+VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
+BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
+MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
+		struct rte_vhost_device_ops const * const ops),
+		rte_vhost_driver_callback_register_v24);
+
 struct rte_vhost_device_ops const *
 vhost_driver_callback_get(const char *path)
 {
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index d322a4a888..7bcbfd12cf 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -64,6 +64,12 @@ DPDK_23 {
 	local: *;
 };
 
+DPDK_24 {
+	global:
+
+	rte_vhost_driver_callback_register;
+} DPDK_23;
+
 EXPERIMENTAL {
 	global:
 
@@ -98,6 +104,9 @@ EXPERIMENTAL {
 	# added in 22.11
 	rte_vhost_async_dma_unconfigure;
 	rte_vhost_vring_call_nonblock;
+
+	# added in 23.07
+	rte_vhost_notify_guest;
 };
 
 INTERNAL {
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index 8ff6434c93..79e88f986e 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -44,6 +44,10 @@ static const struct vhost_vq_stats_name_off vhost_vq_stat_strings[] = {
 	{"size_1024_1518_packets", offsetof(struct vhost_virtqueue, stats.size_bins[6])},
 	{"size_1519_max_packets",  offsetof(struct vhost_virtqueue, stats.size_bins[7])},
 	{"guest_notifications",    offsetof(struct vhost_virtqueue, stats.guest_notifications)},
+	{"guest_notifications_offloaded", offsetof(struct vhost_virtqueue,
+		stats.guest_notifications_offloaded)},
+	{"guest_notifications_error", offsetof(struct vhost_virtqueue,
+		stats.guest_notifications_error)},
 	{"iotlb_hits",             offsetof(struct vhost_virtqueue, stats.iotlb_hits)},
 	{"iotlb_misses",           offsetof(struct vhost_virtqueue, stats.iotlb_misses)},
 	{"inflight_submitted",     offsetof(struct vhost_virtqueue, stats.inflight_submitted)},
@@ -1467,6 +1471,40 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
 	return ret;
 }
 
+void
+rte_vhost_notify_guest(int vid, uint16_t queue_id)
+{
+	struct virtio_net *dev = get_device(vid);
+	struct vhost_virtqueue *vq;
+
+	if (!dev || queue_id >= VHOST_MAX_VRING)
+		return;
+
+	vq = dev->virtqueue[queue_id];
+	if (!vq)
+		return;
+
+	rte_rwlock_read_lock(&vq->access_lock);
+
+	if (vq->callfd >= 0) {
+		int ret = eventfd_write(vq->callfd, (eventfd_t)1);
+
+		if (ret) {
+			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+				__atomic_fetch_add(&vq->stats.guest_notifications_error,
+					1, __ATOMIC_RELAXED);
+		} else {
+			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+				__atomic_fetch_add(&vq->stats.guest_notifications,
+					1, __ATOMIC_RELAXED);
+			if (dev->notify_ops->guest_notified)
+				dev->notify_ops->guest_notified(dev->vid);
+		}
+	}
+
+	rte_rwlock_read_unlock(&vq->access_lock);
+}
+
 void
 rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
 {
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 37609c7c8d..0ab75b78b5 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -141,6 +141,8 @@ struct virtqueue_stats {
 	uint64_t inflight_completed;
 	/* Counters below are atomic, and should be incremented as such. */
 	uint64_t guest_notifications;
+	uint64_t guest_notifications_offloaded;
+	uint64_t guest_notifications_error;
 };
 
 /**
@@ -884,6 +886,35 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
 	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
 }
 
+static __rte_always_inline void
+vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	int ret;
+
+	if (dev->notify_ops->guest_notify &&
+	    dev->notify_ops->guest_notify(dev->vid, vq->index)) {
+		if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+			__atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
+				1, __ATOMIC_RELAXED);
+		return;
+	}
+
+	ret = eventfd_write(vq->callfd, (eventfd_t) 1);
+	if (ret) {
+		if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+			__atomic_fetch_add(&vq->stats.guest_notifications_error,
+				1, __ATOMIC_RELAXED);
+		return;
+	}
+
+	if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+		__atomic_fetch_add(&vq->stats.guest_notifications,
+			1, __ATOMIC_RELAXED);
+	if (dev->notify_ops->guest_notified)
+		dev->notify_ops->guest_notified(dev->vid);
+}
+
+
 static __rte_always_inline void
 vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -903,26 +934,16 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 			"%s: used_event_idx=%d, old=%d, new=%d\n",
 			__func__, vhost_used_event(vq), old, new);
 
-		if ((vhost_need_event(vhost_used_event(vq), new, old) &&
-					(vq->callfd >= 0)) ||
-				unlikely(!signalled_used_valid)) {
-			eventfd_write(vq->callfd, (eventfd_t) 1);
-			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				__atomic_fetch_add(&vq->stats.guest_notifications,
-					1, __ATOMIC_RELAXED);
-			if (dev->notify_ops->guest_notified)
-				dev->notify_ops->guest_notified(dev->vid);
+		if ((vhost_need_event(vhost_used_event(vq), new, old) ||
+		     unlikely(!signalled_used_valid)) &&
+		    vq->callfd >= 0) {
+			vhost_vring_inject_irq(dev, vq);
 		}
 	} else {
 		/* Kick the guest if necessary. */
 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 				&& (vq->callfd >= 0)) {
-			eventfd_write(vq->callfd, (eventfd_t)1);
-			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				__atomic_fetch_add(&vq->stats.guest_notifications,
-					1, __ATOMIC_RELAXED);
-			if (dev->notify_ops->guest_notified)
-				dev->notify_ops->guest_notified(dev->vid);
+			vhost_vring_inject_irq(dev, vq);
 		}
 	}
 }
@@ -974,11 +995,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	if (vhost_need_event(off, new, old))
 		kick = true;
 kick:
-	if (kick) {
-		eventfd_write(vq->callfd, (eventfd_t)1);
-		if (dev->notify_ops->guest_notified)
-			dev->notify_ops->guest_notified(dev->vid);
-	}
+	if (kick && vq->callfd >= 0)
+		vhost_vring_inject_irq(dev, vq);
 }
 
 static __rte_always_inline void
@@ -1017,4 +1035,11 @@ mbuf_is_consumed(struct rte_mbuf *m)
 
 uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
 void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
+
+/* Versioned functions */
+int rte_vhost_driver_callback_register_v23(const char *path,
+	struct rte_vhost_device_ops const * const ops);
+int rte_vhost_driver_callback_register_v24(const char *path,
+	struct rte_vhost_device_ops const * const ops);
+
 #endif /* _VHOST_NET_CDEV_H_ */



* Re: [PATCH v2 0/3] vhost: add device op to offload the interrupt kick
  2023-04-05 12:40 [PATCH v2 0/3] vhost: add device op to offload the interrupt kick Eelco Chaudron
                   ` (2 preceding siblings ...)
  2023-04-05 12:41 ` [PATCH v2 3/3] vhost: add device op to offload the interrupt kick Eelco Chaudron
@ 2023-05-08 13:58 ` Eelco Chaudron
  3 siblings, 0 replies; 12+ messages in thread
From: Eelco Chaudron @ 2023-05-08 13:58 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia; +Cc: dev



On 5 Apr 2023, at 14:40, Eelco Chaudron wrote:

> This series adds an operation callback which gets called every time the
> library wants to call eventfd_write(). This eventfd_write() call could
> result in a system call, which could potentially block the PMD thread.
>
> The callback function can decide whether it's ok to handle the
> eventfd_write() now or have the newly introduced function,
> rte_vhost_notify_guest(), called at a later time.
>
> This can be used by third-party applications, like OVS, to avoid system
> calls being made from the PMD threads.


Wondering if anyone had a chance to look at this patchset.

Cheers,

Eelco

> v2: - Used vhost_virtqueue->index to find the index for the operation.
>     - Aligned the function name to the VDUSE RFC patchset.
>     - Added error and offload statistics counters.
>     - Marked the new API as experimental.
>     - Changed the virtqueue spin lock to a read/write lock.
>     - Made shared counters atomic.
>     - Added a versioned rte_vhost_driver_callback_register() for
>       ABI compliance.
>
> Eelco Chaudron (3):
>       vhost: Change vhost_virtqueue access lock to a read/write one.
>       vhost: make the guest_notifications statistic counter atomic.
>       vhost: add device op to offload the interrupt kick
>
>
>  lib/eal/include/generic/rte_rwlock.h | 17 +++++
>  lib/vhost/meson.build                |  2 +
>  lib/vhost/rte_vhost.h                | 23 ++++++-
>  lib/vhost/socket.c                   | 72 ++++++++++++++++++++--
>  lib/vhost/version.map                |  9 +++
>  lib/vhost/vhost.c                    | 92 +++++++++++++++++++++-------
>  lib/vhost/vhost.h                    | 70 ++++++++++++++-------
>  lib/vhost/vhost_user.c               | 14 ++---
>  lib/vhost/virtio_net.c               | 90 +++++++++++++--------------
>  9 files changed, 288 insertions(+), 101 deletions(-)



* Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
  2023-04-05 12:41 ` [PATCH v2 3/3] vhost: add device op to offload the interrupt kick Eelco Chaudron
@ 2023-05-10 11:44   ` David Marchand
  2023-05-16  8:53     ` Eelco Chaudron
  0 siblings, 1 reply; 12+ messages in thread
From: David Marchand @ 2023-05-10 11:44 UTC (permalink / raw)
  To: Eelco Chaudron; +Cc: maxime.coquelin, chenbo.xia, dev

On Wed, Apr 5, 2023 at 2:42 PM Eelco Chaudron <echaudro@redhat.com> wrote:
>
> This patch adds an operation callback which gets called every time the
> library wants to call eventfd_write(). This eventfd_write() call could
> result in a system call, which could potentially block the PMD thread.
>
> The callback function can decide whether it's ok to handle the
> eventfd_write() now or have the newly introduced function,
> rte_vhost_notify_guest(), called at a later time.
>
> This can be used by third-party applications, like OVS, to avoid system
> calls being made from the PMD threads.
>
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> ---
>  lib/vhost/meson.build |    2 +
>  lib/vhost/rte_vhost.h |   23 +++++++++++++++-
>  lib/vhost/socket.c    |   72 ++++++++++++++++++++++++++++++++++++++++++++++---
>  lib/vhost/version.map |    9 ++++++
>  lib/vhost/vhost.c     |   38 ++++++++++++++++++++++++++
>  lib/vhost/vhost.h     |   65 +++++++++++++++++++++++++++++++-------------
>  6 files changed, 184 insertions(+), 25 deletions(-)
>
> diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
> index 197a51d936..e93ba6b078 100644
> --- a/lib/vhost/meson.build
> +++ b/lib/vhost/meson.build
> @@ -39,3 +39,5 @@ driver_sdk_headers = files(
>          'vdpa_driver.h',
>  )
>  deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
> +
> +use_function_versioning = true
> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> index 58a5d4be92..7a10bc36cf 100644
> --- a/lib/vhost/rte_vhost.h
> +++ b/lib/vhost/rte_vhost.h
> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>          */
>         void (*guest_notified)(int vid);
>
> -       void *reserved[1]; /**< Reserved for future extension */
> +       /**
> +        * If this callback is registered, notification to the guest can
> +        * be handled by the front-end calling rte_vhost_notify_guest().
> +        * If it's not handled, 'false' should be returned. This can be used
> +        * to remove the "slow" eventfd_write() syscall from the datapath.
> +        */
> +       bool (*guest_notify)(int vid, uint16_t queue_id);
>  };
>
>  /**
> @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
>
>  int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
>
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
> + *
> + * Inject the offloaded interrupt into the vhost device's queue. For more
> + * details see the 'guest_notify' vhost device operation.
> + *
> + * @param vid
> + *  vhost device ID
> + * @param queue_id
> + *  virtio queue index
> + */
> +__rte_experimental
> +void rte_vhost_notify_guest(int vid, uint16_t queue_id);
> +
>  /**
>   * Register vhost driver. path could be different for multiple
>   * instance support.
> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
> index 669c322e12..787d6bacf8 100644
> --- a/lib/vhost/socket.c
> +++ b/lib/vhost/socket.c
> @@ -15,6 +15,7 @@
>  #include <fcntl.h>
>  #include <pthread.h>
>
> +#include <rte_function_versioning.h>
>  #include <rte_log.h>
>
>  #include "fd_man.h"
> @@ -43,6 +44,7 @@ struct vhost_user_socket {
>         bool async_copy;
>         bool net_compliant_ol_flags;
>         bool stats_enabled;
> +       bool alloc_notify_ops;
>
>         /*
>          * The "supported_features" indicates the feature bits the
> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>                 vsocket->path = NULL;
>         }
>
> +       if (vsocket && vsocket->alloc_notify_ops) {
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wcast-qual"
> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
> +#pragma GCC diagnostic pop
> +               vsocket->notify_ops = NULL;
> +       }

Rather than select the behavior based on a boolean (and here force the
compiler to close its eyes), I would instead add a non-const pointer
to ops (let's say alloc_notify_ops) in vhost_user_socket.
The code can then unconditionally call free(vsocket->alloc_notify_ops);
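
I.e. something like this (rough, untested sketch, struct trimmed to the
relevant fields):

#include <stdlib.h>
#include <rte_vhost.h>

struct vhost_user_socket_sketch {
	const struct rte_vhost_device_ops *notify_ops;
	/* Library-owned copy; NULL when the application's const ops are
	 * used directly. */
	struct rte_vhost_device_ops *alloc_notify_ops;
};

static void
socket_ops_free(struct vhost_user_socket_sketch *vsocket)
{
	/* free(NULL) is a no-op, so no flag and no cast are needed. */
	free(vsocket->alloc_notify_ops);
	vsocket->alloc_notify_ops = NULL;
	vsocket->notify_ops = NULL;
}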


> +
>         if (vsocket) {
>                 free(vsocket);
>                 vsocket = NULL;
> @@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path)
>  /*
>   * Register ops so that we can add/remove device to data core.
>   */
> -int
> -rte_vhost_driver_callback_register(const char *path,
> -       struct rte_vhost_device_ops const * const ops)
> +static int
> +rte_vhost_driver_callback_register__(const char *path,

No need for a rte_ prefix on static symbol.
And we can simply call this internal helper vhost_driver_callback_register().


> +       struct rte_vhost_device_ops const * const ops, bool ops_allocated)
>  {
>         struct vhost_user_socket *vsocket;
>
>         pthread_mutex_lock(&vhost_user.mutex);
>         vsocket = find_vhost_user_socket(path);
> -       if (vsocket)
> +       if (vsocket) {
> +               if (vsocket->alloc_notify_ops) {
> +                       vsocket->alloc_notify_ops = false;
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wcast-qual"
> +                       free((struct rte_vhost_device_ops *)vsocket->notify_ops);
> +#pragma GCC diagnostic pop
> +               }
>                 vsocket->notify_ops = ops;
> +               if (ops_allocated)
> +                       vsocket->alloc_notify_ops = true;
> +       }
>         pthread_mutex_unlock(&vhost_user.mutex);
>
>         return vsocket ? 0 : -1;
>  }
>
> +int __vsym
> +rte_vhost_driver_callback_register_v24(const char *path,
> +       struct rte_vhost_device_ops const * const ops)
> +{
> +       return rte_vhost_driver_callback_register__(path, ops, false);
> +}
> +
> +int __vsym
> +rte_vhost_driver_callback_register_v23(const char *path,
> +       struct rte_vhost_device_ops const * const ops)
> +{
> +       int ret;
> +
> +       /*
> +        * Although the ops structure is a const structure, we do need to
> +        * override the guest_notify operation. This is because with the
> +        * previous APIs it was "reserved" and if any garbage value was passed,
> +        * it could crash the application.
> +        */
> +       if (ops && !ops->guest_notify) {

Hum, as described in the comment above, I don't think we should look
at the ops->guest_notify value at all.
Checking ops != NULL should be enough.


> +               struct rte_vhost_device_ops *new_ops;
> +
> +               new_ops = malloc(sizeof(*new_ops));

Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc().
I am unclear about the impact, though.
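
If it matters, one possible (untested) mitigation is allocating from the
DPDK heap on the registering thread's NUMA node; note this may still
differ from the node "ops" itself lives on:

#include <string.h>
#include <rte_lcore.h>
#include <rte_malloc.h>
#include <rte_vhost.h>

static struct rte_vhost_device_ops *
dup_ops_local(const struct rte_vhost_device_ops *ops)
{
	struct rte_vhost_device_ops *copy;

	copy = rte_malloc_socket("vhost_ops", sizeof(*copy), 0,
			rte_socket_id());
	if (copy != NULL)
		memcpy(copy, ops, sizeof(*copy));
	/* Must be released with rte_free(), not free(). */
	return copy;
}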


> +               if (new_ops == NULL)
> +                       return -1;
> +
> +               memcpy(new_ops, ops, sizeof(*new_ops));
> +               new_ops->guest_notify = NULL;
> +
> +               ret = rte_vhost_driver_callback_register__(path, new_ops, true);
> +       } else {
> +               ret = rte_vhost_driver_callback_register__(path, ops, false);
> +       }
> +
> +       return ret;
> +}
> +
> +/* Mark the v23 function as the old version, and v24 as the default version. */
> +VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
> +BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
> +MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
> +               struct rte_vhost_device_ops const * const ops),
> +               rte_vhost_driver_callback_register_v24);
> +
>  struct rte_vhost_device_ops const *
>  vhost_driver_callback_get(const char *path)
>  {
> diff --git a/lib/vhost/version.map b/lib/vhost/version.map
> index d322a4a888..7bcbfd12cf 100644
> --- a/lib/vhost/version.map
> +++ b/lib/vhost/version.map
> @@ -64,6 +64,12 @@ DPDK_23 {
>         local: *;
>  };
>
> +DPDK_24 {
> +       global:
> +
> +       rte_vhost_driver_callback_register;
> +} DPDK_23;
> +
>  EXPERIMENTAL {
>         global:
>
> @@ -98,6 +104,9 @@ EXPERIMENTAL {
>         # added in 22.11
>         rte_vhost_async_dma_unconfigure;
>         rte_vhost_vring_call_nonblock;
> +
> +        # added in 23.07
> +       rte_vhost_notify_guest;
>  };
>
>  INTERNAL {
> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
> index 8ff6434c93..79e88f986e 100644
> --- a/lib/vhost/vhost.c
> +++ b/lib/vhost/vhost.c
> @@ -44,6 +44,10 @@ static const struct vhost_vq_stats_name_off vhost_vq_stat_strings[] = {
>         {"size_1024_1518_packets", offsetof(struct vhost_virtqueue, stats.size_bins[6])},
>         {"size_1519_max_packets",  offsetof(struct vhost_virtqueue, stats.size_bins[7])},
>         {"guest_notifications",    offsetof(struct vhost_virtqueue, stats.guest_notifications)},
> +       {"guest_notifications_offloaded", offsetof(struct vhost_virtqueue,
> +               stats.guest_notifications_offloaded)},
> +       {"guest_notifications_error", offsetof(struct vhost_virtqueue,
> +               stats.guest_notifications_error)},
>         {"iotlb_hits",             offsetof(struct vhost_virtqueue, stats.iotlb_hits)},
>         {"iotlb_misses",           offsetof(struct vhost_virtqueue, stats.iotlb_misses)},
>         {"inflight_submitted",     offsetof(struct vhost_virtqueue, stats.inflight_submitted)},
> @@ -1467,6 +1471,40 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
>         return ret;
>  }
>
> +void
> +rte_vhost_notify_guest(int vid, uint16_t queue_id)
> +{
> +       struct virtio_net *dev = get_device(vid);
> +       struct vhost_virtqueue *vq;
> +
> +       if (!dev || queue_id >= VHOST_MAX_VRING)
> +               return;
> +
> +       vq = dev->virtqueue[queue_id];
> +       if (!vq)
> +               return;
> +
> +       rte_rwlock_read_lock(&vq->access_lock);
> +
> +       if (vq->callfd >= 0) {
> +               int ret = eventfd_write(vq->callfd, (eventfd_t)1);
> +
> +               if (ret) {
> +                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +                               __atomic_fetch_add(&vq->stats.guest_notifications_error,
> +                                       1, __ATOMIC_RELAXED);
> +               } else {
> +                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +                               __atomic_fetch_add(&vq->stats.guest_notifications,
> +                                       1, __ATOMIC_RELAXED);
> +                       if (dev->notify_ops->guest_notified)
> +                               dev->notify_ops->guest_notified(dev->vid);
> +               }
> +       }
> +
> +       rte_rwlock_read_unlock(&vq->access_lock);
> +}
> +
>  void
>  rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
>  {
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index 37609c7c8d..0ab75b78b5 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -141,6 +141,8 @@ struct virtqueue_stats {
>         uint64_t inflight_completed;
>         /* Counters below are atomic, and should be incremented as such. */
>         uint64_t guest_notifications;
> +       uint64_t guest_notifications_offloaded;
> +       uint64_t guest_notifications_error;
>  };
>
>  /**
> @@ -884,6 +886,35 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
>         return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
>  }
>
> +static __rte_always_inline void
> +vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
> +       int ret;
> +
> +       if (dev->notify_ops->guest_notify &&
> +           dev->notify_ops->guest_notify(dev->vid, vq->index)) {
> +               if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +                       __atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
> +                               1, __ATOMIC_RELAXED);
> +               return;
> +       }
> +
> +       ret = eventfd_write(vq->callfd, (eventfd_t) 1);
> +       if (ret) {
> +               if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +                       __atomic_fetch_add(&vq->stats.guest_notifications_error,
> +                               1, __ATOMIC_RELAXED);
> +               return;
> +       }
> +
> +       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +               __atomic_fetch_add(&vq->stats.guest_notifications,
> +                       1, __ATOMIC_RELAXED);
> +       if (dev->notify_ops->guest_notified)
> +               dev->notify_ops->guest_notified(dev->vid);
> +}
> +
> +

One empty line is enough.

>  static __rte_always_inline void
>  vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
> @@ -903,26 +934,16 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>                         "%s: used_event_idx=%d, old=%d, new=%d\n",
>                         __func__, vhost_used_event(vq), old, new);
>
> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
> -                                       (vq->callfd >= 0)) ||
> -                               unlikely(!signalled_used_valid)) {
> -                       eventfd_write(vq->callfd, (eventfd_t) 1);
> -                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> -                               __atomic_fetch_add(&vq->stats.guest_notifications,
> -                                       1, __ATOMIC_RELAXED);
> -                       if (dev->notify_ops->guest_notified)
> -                               dev->notify_ops->guest_notified(dev->vid);
> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
> +                    unlikely(!signalled_used_valid)) &&
> + 		            vq->callfd >= 0) {

A bit hard to read (even before your change).

After the change, indentation does not comply with dpdk coding style.
http://doc.dpdk.org/guides/contributing/coding_style.html#c-indentation


But putting indentation aside, is this change equivalent?
-               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
-                                       (vq->callfd >= 0)) ||
-                               unlikely(!signalled_used_valid)) {
+               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
+                               unlikely(!signalled_used_valid)) &&
+                               vq->callfd >= 0) {


> +                       vhost_vring_inject_irq(dev, vq);
>                 }
>         } else {
>                 /* Kick the guest if necessary. */
>                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>                                 && (vq->callfd >= 0)) {
> -                       eventfd_write(vq->callfd, (eventfd_t)1);
> -                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> -                               __atomic_fetch_add(&vq->stats.guest_notifications,
> -                                       1, __ATOMIC_RELAXED);
> -                       if (dev->notify_ops->guest_notified)
> -                               dev->notify_ops->guest_notified(dev->vid);
> +                       vhost_vring_inject_irq(dev, vq);
>                 }
>         }
>  }
> @@ -974,11 +995,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
>         if (vhost_need_event(off, new, old))
>                 kick = true;
>  kick:
> -       if (kick) {
> -               eventfd_write(vq->callfd, (eventfd_t)1);
> -               if (dev->notify_ops->guest_notified)
> -                       dev->notify_ops->guest_notified(dev->vid);
> -       }
> +       if (kick && vq->callfd >= 0)
> +               vhost_vring_inject_irq(dev, vq);
>  }
>
>  static __rte_always_inline void
> @@ -1017,4 +1035,11 @@ mbuf_is_consumed(struct rte_mbuf *m)
>
>  uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
>  void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
> +
> +/* Versioned functions */
> +int rte_vhost_driver_callback_register_v23(const char *path,
> +       struct rte_vhost_device_ops const * const ops);
> +int rte_vhost_driver_callback_register_v24(const char *path,
> +       struct rte_vhost_device_ops const * const ops);
> +
>  #endif /* _VHOST_NET_CDEV_H_ */
>


-- 
David Marchand


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
  2023-05-10 11:44   ` David Marchand
@ 2023-05-16  8:53     ` Eelco Chaudron
  2023-05-16 10:12       ` David Marchand
  0 siblings, 1 reply; 12+ messages in thread
From: Eelco Chaudron @ 2023-05-16  8:53 UTC (permalink / raw)
  To: David Marchand; +Cc: maxime.coquelin, chenbo.xia, dev



On 10 May 2023, at 13:44, David Marchand wrote:

> On Wed, Apr 5, 2023 at 2:42 PM Eelco Chaudron <echaudro@redhat.com> wrote:
>>
>> This patch adds an operation callback which gets called every time the
>> library wants to call eventfd_write(). This eventfd_write() call could
>> result in a system call, which could potentially block the PMD thread.
>>
>> The callback function can decide whether it's ok to handle the
>> eventfd_write() now or have the newly introduced function,
>> rte_vhost_notify_guest(), called at a later time.
>>
>> This can be used by 3rd party applications, like OVS, to avoid system
>> calls being called as part of the PMD threads.
>>
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>>  lib/vhost/meson.build |    2 +
>>  lib/vhost/rte_vhost.h |   23 +++++++++++++++-
>>  lib/vhost/socket.c    |   72 ++++++++++++++++++++++++++++++++++++++++++++++---
>>  lib/vhost/version.map |    9 ++++++
>>  lib/vhost/vhost.c     |   38 ++++++++++++++++++++++++++
>>  lib/vhost/vhost.h     |   65 +++++++++++++++++++++++++++++++-------------
>>  6 files changed, 184 insertions(+), 25 deletions(-)
>>
>> diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
>> index 197a51d936..e93ba6b078 100644
>> --- a/lib/vhost/meson.build
>> +++ b/lib/vhost/meson.build
>> @@ -39,3 +39,5 @@ driver_sdk_headers = files(
>>          'vdpa_driver.h',
>>  )
>>  deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
>> +
>> +use_function_versioning = true
>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>> index 58a5d4be92..7a10bc36cf 100644
>> --- a/lib/vhost/rte_vhost.h
>> +++ b/lib/vhost/rte_vhost.h
>> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>>          */
>>         void (*guest_notified)(int vid);
>>
>> -       void *reserved[1]; /**< Reserved for future extension */
>> +       /**
>> +        * If this callback is registered, notification to the guest can
>> +        * be handled by the front-end calling rte_vhost_notify_guest().
>> +        * If it's not handled, 'false' should be returned. This can be used
>> +        * to remove the "slow" eventfd_write() syscall from the datapath.
>> +        */
>> +       bool (*guest_notify)(int vid, uint16_t queue_id);
>>  };
>>
>>  /**
>> @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
>>
>>  int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
>>
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
>> + *
>> + * Inject the offloaded interrupt into the vhost device's queue. For more
>> + * details see the 'guest_notify' vhost device operation.
>> + *
>> + * @param vid
>> + *  vhost device ID
>> + * @param queue_id
>> + *  virtio queue index
>> + */
>> +__rte_experimental
>> +void rte_vhost_notify_guest(int vid, uint16_t queue_id);
>> +
>>  /**
>>   * Register vhost driver. path could be different for multiple
>>   * instance support.
>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>> index 669c322e12..787d6bacf8 100644
>> --- a/lib/vhost/socket.c
>> +++ b/lib/vhost/socket.c
>> @@ -15,6 +15,7 @@
>>  #include <fcntl.h>
>>  #include <pthread.h>
>>
>> +#include <rte_function_versioning.h>
>>  #include <rte_log.h>
>>
>>  #include "fd_man.h"
>> @@ -43,6 +44,7 @@ struct vhost_user_socket {
>>         bool async_copy;
>>         bool net_compliant_ol_flags;
>>         bool stats_enabled;
>> +       bool alloc_notify_ops;
>>
>>         /*
>>          * The "supported_features" indicates the feature bits the
>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>                 vsocket->path = NULL;
>>         }
>>
>> +       if (vsocket && vsocket->alloc_notify_ops) {
>> +#pragma GCC diagnostic push
>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>> +#pragma GCC diagnostic pop
>> +               vsocket->notify_ops = NULL;
>> +       }
>
> Rather than select the behavior based on a boolean (and here force the
> compiler to close its eyes), I would instead add a non-const pointer
> to ops (let's say alloc_notify_ops) in vhost_user_socket.
> The code can then unconditionally call free(vsocket->alloc_notify_ops);

Good idea, I will make the change in v3.
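
Something like this minimal sketch is what I have in mind (field name
tentative and trimmed to the relevant parts, so not the final v3 code):

#include <stdlib.h>

struct rte_vhost_device_ops;    /* real definition lives in rte_vhost.h */

struct vhost_user_socket_sketch {
        const struct rte_vhost_device_ops *notify_ops;
        /* Non-NULL only when notify_ops points at memory we allocated. */
        struct rte_vhost_device_ops *malloc_notify_ops;
};

static void
notify_ops_free(struct vhost_user_socket_sketch *vsocket)
{
        free(vsocket->malloc_notify_ops);       /* free(NULL) is a no-op */
        vsocket->malloc_notify_ops = NULL;
        vsocket->notify_ops = NULL;
}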

>> +
>>         if (vsocket) {
>>                 free(vsocket);
>>                 vsocket = NULL;
>> @@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path)
>>  /*
>>   * Register ops so that we can add/remove device to data core.
>>   */
>> -int
>> -rte_vhost_driver_callback_register(const char *path,
>> -       struct rte_vhost_device_ops const * const ops)
>> +static int
>> +rte_vhost_driver_callback_register__(const char *path,
>
> No need for a rte_ prefix on static symbol.
> And we can simply call this internal helper vhost_driver_callback_register().

ACK.

>> +       struct rte_vhost_device_ops const * const ops, bool ops_allocated)
>>  {
>>         struct vhost_user_socket *vsocket;
>>
>>         pthread_mutex_lock(&vhost_user.mutex);
>>         vsocket = find_vhost_user_socket(path);
>> -       if (vsocket)
>> +       if (vsocket) {
>> +               if (vsocket->alloc_notify_ops) {
>> +                       vsocket->alloc_notify_ops = false;
>> +#pragma GCC diagnostic push
>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>> +                       free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>> +#pragma GCC diagnostic pop
>> +               }
>>                 vsocket->notify_ops = ops;
>> +               if (ops_allocated)
>> +                       vsocket->alloc_notify_ops = true;
>> +       }
>>         pthread_mutex_unlock(&vhost_user.mutex);
>>
>>         return vsocket ? 0 : -1;
>>  }
>>
>> +int __vsym
>> +rte_vhost_driver_callback_register_v24(const char *path,
>> +       struct rte_vhost_device_ops const * const ops)
>> +{
>> +       return rte_vhost_driver_callback_register__(path, ops, false);
>> +}
>> +
>> +int __vsym
>> +rte_vhost_driver_callback_register_v23(const char *path,
>> +       struct rte_vhost_device_ops const * const ops)
>> +{
>> +       int ret;
>> +
>> +       /*
>> +        * Although the ops structure is a const structure, we do need to
>> +        * override the guest_notify operation. This is because with the
>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>> +        * it could crash the application.
>> +        */
>> +       if (ops && !ops->guest_notify) {
>
> Hum, as described in the comment above, I don't think we should look
> at ops->guest_notify value at all.
> Checking ops != NULL should be enough.

Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.

I guess your comment would be true if we were introducing a new field in the data structure, not replacing a reserved one.

>> +               struct rte_vhost_device_ops *new_ops;
>> +
>> +               new_ops = malloc(sizeof(*new_ops));
>
> Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc().
> I am unclear about the impact, though.

I don’t think there is a portable API that we can use to determine the NUMA node of the ops memory and then allocate on the same node, is there?

Any thoughts or ideas on how to solve this? I hope most people memset() the ops structure so the reserved[1] part is zero, but it might become a problem in the future as more extensions get added.
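
For what it’s worth, applications using designated initializers are
already safe, since every member not explicitly named is zeroed; a
hypothetical example (callback names made up):

#include <rte_vhost.h>

static int app_new_device(int vid)      { (void)vid; return 0; }
static void app_destroy_device(int vid) { (void)vid; }

/* The old reserved slot that guest_notify replaces is implicitly
 * zeroed here, so the v23 compat check sees NULL.
 */
static const struct rte_vhost_device_ops app_ops = {
        .new_device = app_new_device,
        .destroy_device = app_destroy_device,
};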

>> +               if (new_ops == NULL)
>> +                       return -1;
>> +
>> +               memcpy(new_ops, ops, sizeof(*new_ops));
>> +               new_ops->guest_notify = NULL;
>> +
>> +               ret = rte_vhost_driver_callback_register__(path, new_ops, true);
>> +       } else {
>> +               ret = rte_vhost_driver_callback_register__(path, ops, false);
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>> +/* Mark the v23 function as the old version, and v24 as the default version. */
>> +VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
>> +BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
>> +MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
>> +               struct rte_vhost_device_ops const * const ops),
>> +               rte_vhost_driver_callback_register_v24);
>> +
>>  struct rte_vhost_device_ops const *
>>  vhost_driver_callback_get(const char *path)
>>  {
>> diff --git a/lib/vhost/version.map b/lib/vhost/version.map
>> index d322a4a888..7bcbfd12cf 100644
>> --- a/lib/vhost/version.map
>> +++ b/lib/vhost/version.map
>> @@ -64,6 +64,12 @@ DPDK_23 {
>>         local: *;
>>  };
>>
>> +DPDK_24 {
>> +       global:
>> +
>> +       rte_vhost_driver_callback_register;
>> +} DPDK_23;
>> +
>>  EXPERIMENTAL {
>>         global:
>>
>> @@ -98,6 +104,9 @@ EXPERIMENTAL {
>>         # added in 22.11
>>         rte_vhost_async_dma_unconfigure;
>>         rte_vhost_vring_call_nonblock;
>> +
>> +        # added in 23.07
>> +       rte_vhost_notify_guest;
>>  };
>>
>>  INTERNAL {
>> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
>> index 8ff6434c93..79e88f986e 100644
>> --- a/lib/vhost/vhost.c
>> +++ b/lib/vhost/vhost.c
>> @@ -44,6 +44,10 @@ static const struct vhost_vq_stats_name_off vhost_vq_stat_strings[] = {
>>         {"size_1024_1518_packets", offsetof(struct vhost_virtqueue, stats.size_bins[6])},
>>         {"size_1519_max_packets",  offsetof(struct vhost_virtqueue, stats.size_bins[7])},
>>         {"guest_notifications",    offsetof(struct vhost_virtqueue, stats.guest_notifications)},
>> +       {"guest_notifications_offloaded", offsetof(struct vhost_virtqueue,
>> +               stats.guest_notifications_offloaded)},
>> +       {"guest_notifications_error", offsetof(struct vhost_virtqueue,
>> +               stats.guest_notifications_error)},
>>         {"iotlb_hits",             offsetof(struct vhost_virtqueue, stats.iotlb_hits)},
>>         {"iotlb_misses",           offsetof(struct vhost_virtqueue, stats.iotlb_misses)},
>>         {"inflight_submitted",     offsetof(struct vhost_virtqueue, stats.inflight_submitted)},
>> @@ -1467,6 +1471,40 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
>>         return ret;
>>  }
>>
>> +void
>> +rte_vhost_notify_guest(int vid, uint16_t queue_id)
>> +{
>> +       struct virtio_net *dev = get_device(vid);
>> +       struct vhost_virtqueue *vq;
>> +
>> +       if (!dev || queue_id >= VHOST_MAX_VRING)
>> +               return;
>> +
>> +       vq = dev->virtqueue[queue_id];
>> +       if (!vq)
>> +               return;
>> +
>> +       rte_rwlock_read_lock(&vq->access_lock);
>> +
>> +       if (vq->callfd >= 0) {
>> +               int ret = eventfd_write(vq->callfd, (eventfd_t)1);
>> +
>> +               if (ret) {
>> +                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +                               __atomic_fetch_add(&vq->stats.guest_notifications_error,
>> +                                       1, __ATOMIC_RELAXED);
>> +               } else {
>> +                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +                               __atomic_fetch_add(&vq->stats.guest_notifications,
>> +                                       1, __ATOMIC_RELAXED);
>> +                       if (dev->notify_ops->guest_notified)
>> +                               dev->notify_ops->guest_notified(dev->vid);
>> +               }
>> +       }
>> +
>> +       rte_rwlock_read_unlock(&vq->access_lock);
>> +}
>> +
>>  void
>>  rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
>>  {
>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>> index 37609c7c8d..0ab75b78b5 100644
>> --- a/lib/vhost/vhost.h
>> +++ b/lib/vhost/vhost.h
>> @@ -141,6 +141,8 @@ struct virtqueue_stats {
>>         uint64_t inflight_completed;
>>         /* Counters below are atomic, and should be incremented as such. */
>>         uint64_t guest_notifications;
>> +       uint64_t guest_notifications_offloaded;
>> +       uint64_t guest_notifications_error;
>>  };
>>
>>  /**
>> @@ -884,6 +886,35 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
>>         return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
>>  }
>>
>> +static __rte_always_inline void
>> +vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> +       int ret;
>> +
>> +       if (dev->notify_ops->guest_notify &&
>> +           dev->notify_ops->guest_notify(dev->vid, vq->index)) {
>> +               if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +                       __atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
>> +                               1, __ATOMIC_RELAXED);
>> +               return;
>> +       }
>> +
>> +       ret = eventfd_write(vq->callfd, (eventfd_t) 1);
>> +       if (ret) {
>> +               if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +                       __atomic_fetch_add(&vq->stats.guest_notifications_error,
>> +                               1, __ATOMIC_RELAXED);
>> +               return;
>> +       }
>> +
>> +       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +               __atomic_fetch_add(&vq->stats.guest_notifications,
>> +                       1, __ATOMIC_RELAXED);
>> +       if (dev->notify_ops->guest_notified)
>> +               dev->notify_ops->guest_notified(dev->vid);
>> +}
>> +
>> +
>
> One empty line is enough.

Will remove.

>>  static __rte_always_inline void
>>  vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>  {
>> @@ -903,26 +934,16 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>                         "%s: used_event_idx=%d, old=%d, new=%d\n",
>>                         __func__, vhost_used_event(vq), old, new);
>>
>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>> -                                       (vq->callfd >= 0)) ||
>> -                               unlikely(!signalled_used_valid)) {
>> -                       eventfd_write(vq->callfd, (eventfd_t) 1);
>> -                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> -                               __atomic_fetch_add(&vq->stats.guest_notifications,
>> -                                       1, __ATOMIC_RELAXED);
>> -                       if (dev->notify_ops->guest_notified)
>> -                               dev->notify_ops->guest_notified(dev->vid);
>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>> +                    unlikely(!signalled_used_valid)) &&
>> + 		            vq->callfd >= 0) {
>
> A bit hard to read (even before your change).
>
> After the change, indentation does not comply with dpdk coding style.
> http://doc.dpdk.org/guides/contributing/coding_style.html#c-indentation

Yes, I always mess up the DPDK indentation style :(
>
> But putting indentation aside, is this change equivalent?
> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
> -                                       (vq->callfd >= 0)) ||
> -                               unlikely(!signalled_used_valid)) {
> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
> +                               unlikely(!signalled_used_valid)) &&
> +                               vq->callfd >= 0) {

They are not equivalent, but in the past eventfd_write() should also not have been called with callfd < 0; I guess this was an existing bug ;)
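
To spell out the difference: the two conditions only diverge when there
is no valid callfd and signalled_used_valid is false. A quick standalone
check (need/valid/fd_ok stand in for vhost_need_event(),
signalled_used_valid and vq->callfd >= 0):

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
        for (int need = 0; need <= 1; need++)
                for (int valid = 0; valid <= 1; valid++)
                        for (int fd_ok = 0; fd_ok <= 1; fd_ok++) {
                                bool old_c = (need && fd_ok) || !valid;
                                bool new_c = (need || !valid) && fd_ok;

                                if (old_c != new_c)
                                        printf("need=%d valid=%d fd_ok=%d: old=%d new=%d\n",
                                                need, valid, fd_ok, old_c, new_c);
                        }
        /* Only the valid=0, fd_ok=0 rows print: the old code would have
         * called eventfd_write() on an invalid fd in exactly those cases.
         */
        return 0;
}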

>> +                       vhost_vring_inject_irq(dev, vq);
>>                 }
>>         } else {
>>                 /* Kick the guest if necessary. */
>>                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>>                                 && (vq->callfd >= 0)) {
>> -                       eventfd_write(vq->callfd, (eventfd_t)1);
>> -                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> -                               __atomic_fetch_add(&vq->stats.guest_notifications,
>> -                                       1, __ATOMIC_RELAXED);
>> -                       if (dev->notify_ops->guest_notified)
>> -                               dev->notify_ops->guest_notified(dev->vid);
>> +                       vhost_vring_inject_irq(dev, vq);
>>                 }
>>         }
>>  }
>> @@ -974,11 +995,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>         if (vhost_need_event(off, new, old))
>>                 kick = true;
>>  kick:
>> -       if (kick) {
>> -               eventfd_write(vq->callfd, (eventfd_t)1);
>> -               if (dev->notify_ops->guest_notified)
>> -                       dev->notify_ops->guest_notified(dev->vid);
>> -       }
>> +       if (kick && vq->callfd >= 0)
>> +               vhost_vring_inject_irq(dev, vq);
>>  }
>>
>>  static __rte_always_inline void
>> @@ -1017,4 +1035,11 @@ mbuf_is_consumed(struct rte_mbuf *m)
>>
>>  uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
>>  void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
>> +
>> +/* Versioned functions */
>> +int rte_vhost_driver_callback_register_v23(const char *path,
>> +       struct rte_vhost_device_ops const * const ops);
>> +int rte_vhost_driver_callback_register_v24(const char *path,
>> +       struct rte_vhost_device_ops const * const ops);
>> +
>>  #endif /* _VHOST_NET_CDEV_H_ */
>>
>
>
> -- 
> David Marchand


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
  2023-05-16  8:53     ` Eelco Chaudron
@ 2023-05-16 10:12       ` David Marchand
  2023-05-16 11:36         ` Eelco Chaudron
  0 siblings, 1 reply; 12+ messages in thread
From: David Marchand @ 2023-05-16 10:12 UTC (permalink / raw)
  To: Eelco Chaudron, maxime.coquelin, chenbo.xia; +Cc: dev

On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
> On 10 May 2023, at 13:44, David Marchand wrote:

[snip]

> >> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
> >>                 vsocket->path = NULL;
> >>         }
> >>
> >> +       if (vsocket && vsocket->alloc_notify_ops) {
> >> +#pragma GCC diagnostic push
> >> +#pragma GCC diagnostic ignored "-Wcast-qual"
> >> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
> >> +#pragma GCC diagnostic pop
> >> +               vsocket->notify_ops = NULL;
> >> +       }
> >
> > Rather than select the behavior based on a boolean (and here force the
> > compiler to close its eyes), I would instead add a non-const pointer
> > to ops (let's say alloc_notify_ops) in vhost_user_socket.
> > The code can then unconditionally call free(vsocket->alloc_notify_ops);
>
> Good idea, I will make the change in v3.

Feel free to use a better name for this field :-).

>
> >> +
> >>         if (vsocket) {
> >>                 free(vsocket);
> >>                 vsocket = NULL;

[snip]

> >> +       /*
> >> +        * Although the ops structure is a const structure, we do need to
> >> +        * override the guest_notify operation. This is because with the
> >> +        * previous APIs it was "reserved" and if any garbage value was passed,
> >> +        * it could crash the application.
> >> +        */
> >> +       if (ops && !ops->guest_notify) {
> >
> > Hum, as described in the comment above, I don't think we should look
> > at ops->guest_notify value at all.
> > Checking ops != NULL should be enough.
>
> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>
> I guess your comment would be true if we were introducing a new field in the data structure, not replacing a reserved one.

Hum, I don't understand my comment either o_O'.
Too many days off... or maybe my evil twin took over the keyboard.


>
> >> +               struct rte_vhost_device_ops *new_ops;
> >> +
> >> +               new_ops = malloc(sizeof(*new_ops));
> >
> > Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc().
> > I am unclear about the impact, though.
>
> I don’t think there is a portable API that we can use to determine the NUMA node of the ops memory and then allocate on the same node, is there?
>
> Any thoughts or ideas on how to solve this? I hope most people memset() the ops structure so the reserved[1] part is zero, but it might become a problem in the future as more extensions get added.

Determining the current NUMA node is doable, by calling
get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR) on 'ops', like what is done
for vq in numa_realloc().
The problem is how to allocate on that node with the libc allocator,
for which I have no idea...
We could go with the dpdk allocator (again, like numa_realloc()).
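
Untested sketch of the lookup side (assumes libnuma's <numaif.h> is
available; the copy could then be placed with
rte_malloc_socket(NULL, size, 0, node)):

#include <stdint.h>
#include <numaif.h>     /* get_mempolicy(); link with -lnuma */

/* Find the NUMA node backing an existing allocation, the same trick
 * numa_realloc() uses for the virtqueues.
 */
static int
addr_to_numa_node(const void *addr)
{
        int node;

        if (get_mempolicy(&node, NULL, 0, (void *)(uintptr_t)addr,
                        MPOL_F_NODE | MPOL_F_ADDR) < 0)
                return -1;      /* unknown, fall back to plain malloc() */

        return node;
}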


In practice, the passed ops will probably be from a const variable in
the program .data section (for which I think fields are set to 0
unless explicitly initialised), or a memset() will be called for a
dynamic allocation from good citizens.
So we can probably live with the current proposal.
Plus, this is only for one release, since in 23.11 with the ABI bump,
we will drop this compat code.

Maxime, Chenbo, what do you think?


[snip]

> >
> > But putting indentation aside, is this change equivalent?
> > -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
> > -                                       (vq->callfd >= 0)) ||
> > -                               unlikely(!signalled_used_valid)) {
> > +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
> > +                               unlikely(!signalled_used_valid)) &&
> > +                               vq->callfd >= 0) {
>
> They are not equivalent, but in the past eventfd_write() should also not have been called with callfd < 0; I guess this was an existing bug ;)

I think this should be a separate fix.

>
> >> +                       vhost_vring_inject_irq(dev, vq);


-- 
David Marchand


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
  2023-05-16 10:12       ` David Marchand
@ 2023-05-16 11:36         ` Eelco Chaudron
  2023-05-16 11:45           ` Maxime Coquelin
  2023-05-17  9:18           ` Eelco Chaudron
  0 siblings, 2 replies; 12+ messages in thread
From: Eelco Chaudron @ 2023-05-16 11:36 UTC (permalink / raw)
  To: David Marchand; +Cc: maxime.coquelin, chenbo.xia, dev



On 16 May 2023, at 12:12, David Marchand wrote:

> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>> On 10 May 2023, at 13:44, David Marchand wrote:
>
> [snip]
>
>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>>>                 vsocket->path = NULL;
>>>>         }
>>>>
>>>> +       if (vsocket && vsocket->alloc_notify_ops) {
>>>> +#pragma GCC diagnostic push
>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>> +#pragma GCC diagnostic pop
>>>> +               vsocket->notify_ops = NULL;
>>>> +       }
>>>
>>> Rather than select the behavior based on a boolean (and here force the
>>> compiler to close its eyes), I would instead add a non-const pointer
>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>> The code can then unconditionally call free(vsocket->alloc_notify_ops);
>>
>> Good idea, I will make the change in v3.
>
> Feel free to use a better name for this field :-).
>
>>
>>>> +
>>>>         if (vsocket) {
>>>>                 free(vsocket);
>>>>                 vsocket = NULL;
>
> [snip]
>
>>>> +       /*
>>>> +        * Although the ops structure is a const structure, we do need to
>>>> +        * override the guest_notify operation. This is because with the
>>>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>>>> +        * it could crash the application.
>>>> +        */
>>>> +       if (ops && !ops->guest_notify) {
>>>
>>> Hum, as described in the comment above, I don't think we should look
>>> at ops->guest_notify value at all.
>>> Checking ops != NULL should be enough.
>>
>> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>>
>> I guess your comment would be true if we were introducing a new field in the data structure, not replacing a reserved one.
>
> Hum, I don't understand my comment either o_O'.
> Too many days off... or maybe my evil twin took over the keyboard.
>
>
>>
>>>> +               struct rte_vhost_device_ops *new_ops;
>>>> +
>>>> +               new_ops = malloc(sizeof(*new_ops));
>>>
>>> Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc().
>>> I am unclear about the impact, though.
>>
>> I don’t think there is a portable API that we can use to determine the NUMA node of the ops memory and then allocate on the same node, is there?
>>
>> Any thoughts or ideas on how to solve this? I hope most people memset() the ops structure so the reserved[1] part is zero, but it might become a problem in the future as more extensions get added.
>
> Determining the current NUMA node is doable, by calling
> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR) on 'ops', like what is done
> for vq in numa_realloc().
> The problem is how to allocate on that node with the libc allocator,
> for which I have no idea...
> We could go with the dpdk allocator (again, like numa_realloc()).
>
>
> In practice, the passed ops will probably be from a const variable in
> the program .data section (for which I think fields are set to 0
> unless explicitly initialised), or a memset() will be called for a
> dynamic allocation from good citizens.
> So we can probably live with the current proposal.
> Plus, this is only for one release, since in 23.11 with the ABI bump,
> we will drop this compat code.
>
> Maxime, Chenbo, what do you think?

I'll wait for their response, but for now I assume we can just keep the NUMA-unaware malloc().

>
> [snip]
>
>>>
>>> But putting indentation aside, is this change equivalent?
>>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>> -                                       (vq->callfd >= 0)) ||
>>> -                               unlikely(!signalled_used_valid)) {
>>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>> +                               unlikely(!signalled_used_valid)) &&
>>> +                               vq->callfd >= 0) {
>>
>> They are not equivalent, but in the past eventfd_write() should also not have been called with callfd < 0; I guess this was an existing bug ;)
>
> I think this should be a separate fix.

ACK, will add a separate patch in this series to fix it.

>
>>
>>>> +                       vhost_vring_inject_irq(dev, vq);
>
>
> -- 
> David Marchand


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
  2023-05-16 11:36         ` Eelco Chaudron
@ 2023-05-16 11:45           ` Maxime Coquelin
  2023-05-16 12:07             ` Eelco Chaudron
  2023-05-17  9:18           ` Eelco Chaudron
  1 sibling, 1 reply; 12+ messages in thread
From: Maxime Coquelin @ 2023-05-16 11:45 UTC (permalink / raw)
  To: Eelco Chaudron, David Marchand; +Cc: chenbo.xia, dev



On 5/16/23 13:36, Eelco Chaudron wrote:
> 
> 
> On 16 May 2023, at 12:12, David Marchand wrote:
> 
>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>> On 10 May 2023, at 13:44, David Marchand wrote:
>>
>> [snip]
>>
>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>>>>                  vsocket->path = NULL;
>>>>>          }
>>>>>
>>>>> +       if (vsocket && vsocket->alloc_notify_ops) {
>>>>> +#pragma GCC diagnostic push
>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>>> +#pragma GCC diagnostic pop
>>>>> +               vsocket->notify_ops = NULL;
>>>>> +       }
>>>>
>>>> Rather than select the behavior based on a boolean (and here force the
>>>> compiler to close its eyes), I would instead add a non-const pointer
>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>>> The code can then unconditionally call free(vsocket->alloc_notify_ops);
>>>
>>> Good idea, I will make the change in v3.
>>
>> Feel free to use a better name for this field :-).
>>
>>>
>>>>> +
>>>>>          if (vsocket) {
>>>>>                  free(vsocket);
>>>>>                  vsocket = NULL;
>>
>> [snip]
>>
>>>>> +       /*
>>>>> +        * Although the ops structure is a const structure, we do need to
>>>>> +        * override the guest_notify operation. This is because with the
>>>>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>>>>> +        * it could crash the application.
>>>>> +        */
>>>>> +       if (ops && !ops->guest_notify) {
>>>>
>>>> Hum, as described in the comment above, I don't think we should look
>>>> at ops->guest_notify value at all.
>>>> Checking ops != NULL should be enough.
>>>
>>> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>>>
>>> I guess your comment would be true if we were introducing a new field in the data structure, not replacing a reserved one.
>>
>> Hum, I don't understand my comment either o_O'.
>> Too many days off... or maybe my evil twin took over the keyboard.
>>
>>
>>>
>>>>> +               struct rte_vhost_device_ops *new_ops;
>>>>> +
>>>>> +               new_ops = malloc(sizeof(*new_ops));
>>>>
>>>> Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc().
>>>> I am unclear about the impact, though.
>>>
>>> I don’t think there is a portable API that we can use to determine the NUMA node of the ops memory and then allocate on the same node, is there?
>>>
>>> Any thoughts or ideas on how to solve this? I hope most people memset() the ops structure so the reserved[1] part is zero, but it might become a problem in the future as more extensions get added.
>>
>> Determining the current NUMA node is doable, by calling
>> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR) on 'ops', like what is done
>> for vq in numa_realloc().
>> The problem is how to allocate on that node with the libc allocator,
>> for which I have no idea...
>> We could go with the dpdk allocator (again, like numa_realloc()).
>>
>>
>> In practice, the passed ops will probably be from a const variable in
>> the program .data section (for which I think fields are set to 0
>> unless explicitly initialised), or a memset() will be called for a
>> dynamic allocation from good citizens.
>> So we can probably live with the current proposal.
>> Plus, this is only for one release, since in 23.11 with the ABI bump,
>> we will drop this compat code.
>>
>> Maxime, Chenbo, what do you think?
> 
> I'll wait for their response, but for now I assume we can just keep the NUMA-unaware malloc().

Let's keep it as is, as we'll get rid of it in 23.11.

>>
>> [snip]
>>
>>>>
>>>> But putting indentation aside, is this change equivalent?
>>>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>>> -                                       (vq->callfd >= 0)) ||
>>>> -                               unlikely(!signalled_used_valid)) {
>>>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>>> +                               unlikely(!signalled_used_valid)) &&
>>>> +                               vq->callfd >= 0) {
>>>
>>> They are not equivalent, but in the past eventfd_write() should also not have been called with callfd < 0; I guess this was an existing bug ;)
>>
>> I think this should be a separate fix.
> 
> ACK, will add a separate patch in this series to fix it.

I also caught & fixed it while implementing my VDUSE series [0].
You can pick it into your series, and I will rebase my series on top of
it.

Thanks,
Maxime

[0]: 
https://gitlab.com/mcoquelin/dpdk-next-virtio/-/commit/b976e1f226db5c09834148847d994045eb89be93


> 
>>
>>>
>>>>> +                       vhost_vring_inject_irq(dev, vq);
>>
>>
>> -- 
>> David Marchand
> 


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
  2023-05-16 11:45           ` Maxime Coquelin
@ 2023-05-16 12:07             ` Eelco Chaudron
  0 siblings, 0 replies; 12+ messages in thread
From: Eelco Chaudron @ 2023-05-16 12:07 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: David Marchand, chenbo.xia, dev



On 16 May 2023, at 13:45, Maxime Coquelin wrote:

> On 5/16/23 13:36, Eelco Chaudron wrote:
>>
>>
>> On 16 May 2023, at 12:12, David Marchand wrote:
>>
>>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>>> On 10 May 2023, at 13:44, David Marchand wrote:
>>>
>>> [snip]
>>>
>>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>>>>>                  vsocket->path = NULL;
>>>>>>          }
>>>>>>
>>>>>> +       if (vsocket && vsocket->alloc_notify_ops) {
>>>>>> +#pragma GCC diagnostic push
>>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>>>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>>>> +#pragma GCC diagnostic pop
>>>>>> +               vsocket->notify_ops = NULL;
>>>>>> +       }
>>>>>
>>>>> Rather than select the behavior based on a boolean (and here force the
>>>>> compiler to close its eyes), I would instead add a non-const pointer
>>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>>>> The code can then unconditionally call free(vsocket->alloc_notify_ops);
>>>>
>>>> Good idea, I will make the change in v3.
>>>
>>> Feel free to use a better name for this field :-).
>>>
>>>>
>>>>>> +
>>>>>>          if (vsocket) {
>>>>>>                  free(vsocket);
>>>>>>                  vsocket = NULL;
>>>
>>> [snip]
>>>
>>>>>> +       /*
>>>>>> +        * Although the ops structure is a const structure, we do need to
>>>>>> +        * override the guest_notify operation. This is because with the
>>>>>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>>>>>> +        * it could crash the application.
>>>>>> +        */
>>>>>> +       if (ops && !ops->guest_notify) {
>>>>>
>>>>> Hum, as described in the comment above, I don't think we should look
>>>>> at ops->guest_notify value at all.
>>>>> Checking ops != NULL should be enough.
>>>>
>>>> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>>>>
>>>> I guess your comment would be true if we were introducing a new field in the data structure, not replacing a reserved one.
>>>
>>> Hum, I don't understand my comment either o_O'.
>>> Too many days off... or maybe my evil twin took over the keyboard.
>>>
>>>
>>>>
>>>>>> +               struct rte_vhost_device_ops *new_ops;
>>>>>> +
>>>>>> +               new_ops = malloc(sizeof(*new_ops));
>>>>>
>>>>> Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc().
>>>>> I am unclear about the impact, though.
>>>>
>>>> I don’t think there is a portable API that we can use to determine the NUMA node of the ops memory and then allocate on the same node, is there?
>>>>
>>>> Any thoughts or ideas on how to solve this? I hope most people memset() the ops structure so the reserved[1] part is zero, but it might become a problem in the future as more extensions get added.
>>>
>>> Determining the current NUMA node is doable, by calling
>>> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR) on 'ops', like what is done
>>> for vq in numa_realloc().
>>> The problem is how to allocate on that node with the libc allocator,
>>> for which I have no idea...
>>> We could go with the dpdk allocator (again, like numa_realloc()).
>>>
>>>
>>> In practice, the passed ops will probably be from a const variable in
>>> the program .data section (for which I think fields are set to 0
>>> unless explicitly initialised), or a memset() will be called for a
>>> dynamic allocation from good citizens.
>>> So we can probably live with the current proposal.
>>> Plus, this is only for one release, since in 23.11 with the ABI bump,
>>> we will drop this compat code.
>>>
>>> Maxime, Chenbo, what do you think?
>>
>> I'll wait for their response, but for now I assume we can just keep the NUMA-unaware malloc().
>
> Let's keep it as is, as we'll get rid of it in 23.11.

Thanks for confirming.

>>>
>>> [snip]
>>>
>>>>>
>>>>> But putting indentation aside, is this change equivalent?
>>>>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>>>> -                                       (vq->callfd >= 0)) ||
>>>>> -                               unlikely(!signalled_used_valid)) {
>>>>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>>>> +                               unlikely(!signalled_used_valid)) &&
>>>>> +                               vq->callfd >= 0) {
>>>>
>>>> They are not equivalent, but in the past eventfd_write() should also not have been called with callfd < 0; I guess this was an existing bug ;)
>>>
>>> I think this should be a separate fix.
>>
>> ACK, will add a separate patch in this series to fix it.
>
> I also caught & fixed it while implementing my VDUSE series [0].
> You can pick it into your series, and I will rebase my series on top of
> it.

Thanks for the details, I’ll include your patch in my series.

I will send out a new revision soon (after testing the changes with OVS).
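
For reference, the intended application-side usage could look roughly
like this (simplified, with made-up names, not the actual OVS code):

#include <stdbool.h>
#include <stdint.h>
#include <rte_vhost.h>

struct kick_request {
        int vid;
        uint16_t queue_id;
};

/* Hypothetical non-blocking handoff queue provided by the application. */
extern bool app_enqueue_kick(const struct kick_request *req);

/* Registered via rte_vhost_device_ops::guest_notify and called from the
 * PMD thread: return true only when we took ownership of the
 * notification, so the library skips the eventfd_write().
 */
static bool
app_guest_notify(int vid, uint16_t queue_id)
{
        struct kick_request req = { .vid = vid, .queue_id = queue_id };

        return app_enqueue_kick(&req);
}

/* Later, from a dedicated (non-PMD) thread, flush the queued kicks: */
static void
app_handle_kick(const struct kick_request *req)
{
        rte_vhost_notify_guest(req->vid, req->queue_id);
}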

Thanks,

Eelco

> Thanks,
> Maxime
>
> [0]: https://gitlab.com/mcoquelin/dpdk-next-virtio/-/commit/b976e1f226db5c09834148847d994045eb89be93
>
>
>>
>>>
>>>>
>>>>>> +                       vhost_vring_inject_irq(dev, vq);
>>>
>>>
>>> -- 
>>> David Marchand
>>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] vhost: add device op to offload the interrupt kick
  2023-05-16 11:36         ` Eelco Chaudron
  2023-05-16 11:45           ` Maxime Coquelin
@ 2023-05-17  9:18           ` Eelco Chaudron
  1 sibling, 0 replies; 12+ messages in thread
From: Eelco Chaudron @ 2023-05-17  9:18 UTC (permalink / raw)
  To: David Marchand; +Cc: maxime.coquelin, chenbo.xia, dev



On 16 May 2023, at 13:36, Eelco Chaudron wrote:

> On 16 May 2023, at 12:12, David Marchand wrote:
>
>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>> On 10 May 2023, at 13:44, David Marchand wrote:
>>
>> [snip]
>>
>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>>>>                 vsocket->path = NULL;
>>>>>         }
>>>>>
>>>>> +       if (vsocket && vsocket->alloc_notify_ops) {
>>>>> +#pragma GCC diagnostic push
>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>>> +#pragma GCC diagnostic pop
>>>>> +               vsocket->notify_ops = NULL;
>>>>> +       }
>>>>
>>>> Rather than select the behavior based on a boolean (and here force the
>>>> compiler to close its eyes), I would instead add a non-const pointer
>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>>> The code can then unconditionally call free(vsocket->alloc_notify_ops);
>>>
>>> Good idea, I will make the change in v3.
>>
>> Feel free to use a better name for this field :-).
>>
>>>
>>>>> +
>>>>>         if (vsocket) {
>>>>>                 free(vsocket);
>>>>>                 vsocket = NULL;
>>
>> [snip]
>>
>>>>> +       /*
>>>>> +        * Although the ops structure is a const structure, we do need to
>>>>> +        * override the guest_notify operation. This is because with the
>>>>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>>>>> +        * it could crash the application.
>>>>> +        */
>>>>> +       if (ops && !ops->guest_notify) {
>>>>
>>>> Hum, as described in the comment above, I don't think we should look
>>>> at ops->guest_notify value at all.
>>>> Checking ops != NULL should be enough.
>>>
>>> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>>>
>>> I guess your comment would be true if we were introducing a new field in the data structure, not replacing a reserved one.
>>
>> Hum, I don't understand my comment either o_O'.
>> Too many days off... or maybe my evil twin took over the keyboard.
>>
>>
>>>
>>>>> +               struct rte_vhost_device_ops *new_ops;
>>>>> +
>>>>> +               new_ops = malloc(sizeof(*new_ops));
>>>>
>>>> Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc().
>>>> I am unclear about the impact, though.
>>>
>>> I don’t think there is a portable API that we can use to determine the NUMA node of the ops memory and then allocate on the same node, is there?
>>>
>>> Any thoughts or ideas on how to solve this? I hope most people memset() the ops structure so the reserved[1] part is zero, but it might become a problem in the future as more extensions get added.
>>
>> Determining the current NUMA node is doable, by calling
>> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR) on 'ops', like what is done
>> for vq in numa_realloc().
>> The problem is how to allocate on that node with the libc allocator,
>> for which I have no idea...
>> We could go with the dpdk allocator (again, like numa_realloc()).
>>
>>
>> In practice, the passed ops will probably be from a const variable in
>> the program .data section (for which I think fields are set to 0
>> unless explicitly initialised), or a memset() will be called for a
>> dynamic allocation from good citizens.
>> So we can probably live with the current proposal.
>> Plus, this is only for one release, since in 23.11 with the ABI bump,
>> we will drop this compat code.
>>
>> Maxime, Chenbo, what do you think?
>
> I'll wait for their response, but for now I assume we can just keep the NUMA-unaware malloc().
>
>>
>> [snip]
>>
>>>>
>>>> But putting indentation aside, is this change equivalent?
>>>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>>> -                                       (vq->callfd >= 0)) ||
>>>> -                               unlikely(!signalled_used_valid)) {
>>>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>>> +                               unlikely(!signalled_used_valid)) &&
>>>> +                               vq->callfd >= 0) {
>>>
>>> They are not equivalent, but in the past eventfd_write() should also not have been called with callfd < 0; I guess this was an existing bug ;)
>>
>> I think this should be a separate fix.
>
> ACK, will add a separate patch in this series to fix it.

FYI I sent out the v3 patch.

//Eelco


^ permalink raw reply	[flat|nested] 12+ messages in thread
