* [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes
@ 2017-12-20 14:37 Victor Kaplansky
2017-12-20 19:06 ` Stephen Hemminger
0 siblings, 1 reply; 5+ messages in thread
From: Victor Kaplansky @ 2017-12-20 14:37 UTC (permalink / raw)
To: dev
Cc: stable, Jens Freimann, Maxime Coquelin, Yuanhan Liu, Tiwei Bie,
Tan, Jianfeng, Victor Kaplansky
When performing live migration or memory hot-plugging,
the changes to the device and vrings made by the message handler
are done independently of vring usage by the PMD threads.
This causes, for example, segfaults during live migration
with MQ enabled, but in general virtually any request
sent by qemu changing the state of the device can cause
problems.
These patches fix all of the above issues by adding a spinlock
to every vring and requiring the message handler to start an operation
only after ensuring that all PMD threads related to the device
are out of the critical section accessing the vring data.
Each vring has its own lock in order to avoid creating contention
between PMD threads of different vrings and to prevent
performance degradation when scaling the queue pair number.
See https://bugzilla.redhat.com/show_bug.cgi?id=1450680
Signed-off-by: Victor Kaplansky <victork@redhat.com>
---
v4:
o moved access_unlock before accessing enable flag and
access_unlock after iommu_unlock consistently.
o cosmetics: removed blank line.
o the access_lock variable moved to be in the same
cache line with enable and access_ok flags.
o dequeue path is now guarded with trylock and returning
zero if unsuccessful.
o GET_VRING_BASE operation is not guarded by access lock
to avoid deadlock with device_destroy. See the comment
in the code.
o Fixed error path exit from enqueue and dequeue carefully
unlocking access and iommu locks as appropriate.
v3:
o Added locking to enqueue flow.
o Enqueue path guarded as well as dequeue path.
o Changed name of active_lock.
o Added initialization of guarding spinlock.
o Reworked functions skimming over all virt-queues.
o Performance measurements done by Maxime Coquelin shows
no degradation in bandwidth and throughput.
o Spelling.
o Taking lock only on set operations.
o IOMMU messages are not guarded by access lock.
v2:
o Fixed checkpatch complaints.
o Added Signed-off-by.
o Refined placement of guard to exclude IOMMU messages.
o TODO: performance degradation measurement.
lib/librte_vhost/vhost.h | 25 +++++++++++++--
lib/librte_vhost/vhost.c | 1 +
lib/librte_vhost/vhost_user.c | 71 +++++++++++++++++++++++++++++++++++++++++++
lib/librte_vhost/virtio_net.c | 28 ++++++++++++++---
4 files changed, 119 insertions(+), 6 deletions(-)
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 1cc81c17..f3e43e95 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -108,12 +108,14 @@ struct vhost_virtqueue {
/* Backend value to determine if device should started/stopped */
int backend;
+ int enabled;
+ int access_ok;
+ rte_spinlock_t access_lock;
+
/* Used to notify the guest (trigger interrupt) */
int callfd;
/* Currently unused as polling mode is enabled */
int kickfd;
- int enabled;
- int access_ok;
/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
@@ -302,6 +304,25 @@ vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
vhost_log_write(dev, vq->log_guest_addr + offset, len);
}
+static __rte_always_inline int
+vhost_user_access_trylock(struct vhost_virtqueue *vq)
+{
+ return rte_spinlock_trylock(&vq->access_lock);
+}
+
+static __rte_always_inline void
+vhost_user_access_lock(struct vhost_virtqueue *vq)
+{
+ rte_spinlock_lock(&vq->access_lock);
+}
+
+static __rte_always_inline void
+vhost_user_access_unlock(struct vhost_virtqueue *vq)
+{
+ rte_spinlock_unlock(&vq->access_lock);
+}
+
+
/* Macros for printing using RTE_LOG */
#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 4f8b73a0..dcc42fc7 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -259,6 +259,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
dev->virtqueue[vring_idx] = vq;
init_vring_queue(dev, vring_idx);
+ rte_spinlock_init(&vq->access_lock);
dev->nr_vring += 1;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index f4c7ce46..a10cbdcc 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -1190,12 +1190,48 @@ vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
return alloc_vring_queue(dev, vring_idx);
}
+static void
+vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
+{
+ unsigned int i = 0;
+ unsigned int vq_num = 0;
+
+ while (vq_num < dev->nr_vring) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+ if (vq) {
+ vhost_user_access_lock(vq);
+ vq_num++;
+ }
+ i++;
+ }
+}
+
+static void
+vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
+{
+ unsigned int i = 0;
+ unsigned int vq_num = 0;
+
+ while (vq_num < dev->nr_vring) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+ if (vq) {
+ vhost_user_access_unlock(vq);
+ vq_num++;
+ }
+ i++;
+ }
+}
+
int
vhost_user_msg_handler(int vid, int fd)
{
struct virtio_net *dev;
struct VhostUserMsg msg;
int ret;
+ int unlock_required = 0;
+
dev = get_device(vid);
if (dev == NULL)
@@ -1241,6 +1277,38 @@ vhost_user_msg_handler(int vid, int fd)
return -1;
}
+ /*
+ * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE,
+ * since it is sent when virtio stops and device is destroyed.
+ * destroy_device waits for queues to be inactive, so it is safe.
+ * Otherwise taking the access_lock would cause a deadlock.
+ */
+ switch (msg.request.master) {
+ case VHOST_USER_SET_FEATURES:
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ case VHOST_USER_SET_OWNER:
+ case VHOST_USER_RESET_OWNER:
+ case VHOST_USER_SET_MEM_TABLE:
+ case VHOST_USER_SET_LOG_BASE:
+ case VHOST_USER_SET_LOG_FD:
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_ADDR:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_VRING_ERR:
+ case VHOST_USER_SET_VRING_ENABLE:
+ case VHOST_USER_SEND_RARP:
+ case VHOST_USER_NET_SET_MTU:
+ case VHOST_USER_SET_SLAVE_REQ_FD:
+ vhost_user_lock_all_queue_pairs(dev);
+ unlock_required = 1;
+ break;
+ default:
+ break;
+
+ }
+
switch (msg.request.master) {
case VHOST_USER_GET_FEATURES:
msg.payload.u64 = vhost_user_get_features(dev);
@@ -1342,6 +1410,9 @@ vhost_user_msg_handler(int vid, int fd)
}
+ if (unlock_required)
+ vhost_user_unlock_all_queue_pairs(dev);
+
if (msg.flags & VHOST_USER_NEED_REPLY) {
msg.payload.u64 = !!ret;
msg.size = sizeof(msg.payload.u64);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 6fee16e5..d379b102 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -44,6 +44,7 @@
#include <rte_udp.h>
#include <rte_sctp.h>
#include <rte_arp.h>
+#include <rte_spinlock.h>
#include "iotlb.h"
#include "vhost.h"
@@ -326,8 +327,11 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
}
vq = dev->virtqueue[queue_id];
+
+ vhost_user_access_lock(vq);
+
if (unlikely(vq->enabled == 0))
- return 0;
+ goto out_access_unlock;
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_lock(vq);
@@ -419,6 +423,9 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_unlock(vq);
+out_access_unlock:
+ vhost_user_access_unlock(vq);
+
return count;
}
@@ -651,8 +658,11 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
}
vq = dev->virtqueue[queue_id];
+
+ vhost_user_access_lock(vq);
+
if (unlikely(vq->enabled == 0))
- return 0;
+ goto out_access_unlock;
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_lock(vq);
@@ -715,6 +725,9 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_unlock(vq);
+out_access_unlock:
+ vhost_user_access_unlock(vq);
+
return pkt_idx;
}
@@ -1180,9 +1193,13 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
}
vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+
+ if (unlikely(vhost_user_access_trylock(vq) == 0))
return 0;
+ if (unlikely(vq->enabled == 0))
+ goto out_access_unlock;
+
vq->batch_copy_nb_elems = 0;
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
@@ -1240,7 +1257,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
if (rarp_mbuf == NULL) {
RTE_LOG(ERR, VHOST_DATA,
"Failed to allocate memory for mbuf.\n");
- return 0;
+ goto out;
}
if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
@@ -1356,6 +1373,9 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_unlock(vq);
+out_access_unlock:
+ vhost_user_access_unlock(vq);
+
if (unlikely(rarp_mbuf != NULL)) {
/*
* Inject it to the head of "pkts" array, so that switch's mac
--
Victor
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes
2017-12-20 14:37 [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes Victor Kaplansky
@ 2017-12-20 19:06 ` Stephen Hemminger
2017-12-20 20:06 ` Victor Kaplansky
0 siblings, 1 reply; 5+ messages in thread
From: Stephen Hemminger @ 2017-12-20 19:06 UTC (permalink / raw)
To: Victor Kaplansky
Cc: dev, stable, Jens Freimann, Maxime Coquelin, Yuanhan Liu,
Tiwei Bie, Tan, Jianfeng
On Wed, 20 Dec 2017 16:37:52 +0200
Victor Kaplansky <victork@redhat.com> wrote:
> When performing live migration or memory hot-plugging,
> the changes to the device and vrings made by message handler
> done independently from vring usage by PMD threads.
>
> This causes for example segfaults during live-migration
> with MQ enable, but in general virtually any request
> sent by qemu changing the state of device can cause
> problems.
>
> These patches fixes all above issues by adding a spinlock
> to every vring and requiring message handler to start operation
> only after ensuring that all PMD threads related to the device
> are out of critical section accessing the vring data.
>
> Each vring has its own lock in order to not create contention
> between PMD threads of different vrings and to prevent
> performance degradation by scaling queue pair number.
>
> See https://bugzilla.redhat.com/show_bug.cgi?id=1450680
>
> Signed-off-by: Victor Kaplansky <victork@redhat.com>
> ---
>
> v4:
>
> o moved access_unlock before accessing enable flag and
> access_unlock after iommu_unlock consistently.
> o cosmetics: removed blank line.
> o the access_lock variable moved to be in the same
> cache line with enable and access_ok flags.
> o dequeue path is now guarded with trylock and returning
> zero if unsuccessful.
> o GET_VRING_BASE operation is not guarded by access lock
> to avoid deadlock with device_destroy. See the comment
> in the code.
> o Fixed error path exit from enqueue and dequeue carefully
> unlocking access and iommu locks as appropriate.
>
> v3:
> o Added locking to enqueue flow.
> o Enqueue path guarded as well as dequeue path.
> o Changed name of active_lock.
> o Added initialization of guarding spinlock.
> o Reworked functions skimming over all virt-queues.
> o Performance measurements done by Maxime Coquelin shows
> no degradation in bandwidth and throughput.
> o Spelling.
> o Taking lock only on set operations.
> o IOMMU messages are not guarded by access lock.
>
> v2:
> o Fixed checkpatch complains.
> o Added Signed-off-by.
> o Refined placement of guard to exclude IOMMU messages.
> o TODO: performance degradation measurement.
>
>
>
> lib/librte_vhost/vhost.h | 25 +++++++++++++--
> lib/librte_vhost/vhost.c | 1 +
> lib/librte_vhost/vhost_user.c | 71 +++++++++++++++++++++++++++++++++++++++++++
> lib/librte_vhost/virtio_net.c | 28 ++++++++++++++---
> 4 files changed, 119 insertions(+), 6 deletions(-)
>
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 1cc81c17..f3e43e95 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -108,12 +108,14 @@ struct vhost_virtqueue {
>
> /* Backend value to determine if device should started/stopped */
> int backend;
> + int enabled;
> + int access_ok;
> + rte_spinlock_t access_lock;
Maybe these int's should be bool?
> +
> /* Used to notify the guest (trigger interrupt) */
> int callfd;
> /* Currently unused as polling mode is enabled */
> int kickfd;
> - int enabled;
> - int access_ok;
>
> /* Physical address of used ring, for logging */
> uint64_t log_guest_addr;
> @@ -302,6 +304,25 @@ vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> vhost_log_write(dev, vq->log_guest_addr + offset, len);
> }
>
> +static __rte_always_inline int
> +vhost_user_access_trylock(struct vhost_virtqueue *vq)
> +{
> + return rte_spinlock_trylock(&vq->access_lock);
> +}
> +
> +static __rte_always_inline void
> +vhost_user_access_lock(struct vhost_virtqueue *vq)
> +{
> + rte_spinlock_lock(&vq->access_lock);
> +}
> +
> +static __rte_always_inline void
> +vhost_user_access_unlock(struct vhost_virtqueue *vq)
> +{
> + rte_spinlock_unlock(&vq->access_lock);
> +}
> +
> +
Wrapping locking inline's adds nothing and makes life harder
for static analysis tools.
The bigger problem is that doing locking on all enqueue/dequeue
can have a visible performance impact. Did you measure that?
Could you invent an RCUish mechanism using compiler barriers?
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes
2017-12-20 19:06 ` Stephen Hemminger
@ 2017-12-20 20:06 ` Victor Kaplansky
2017-12-20 20:19 ` Stephen Hemminger
0 siblings, 1 reply; 5+ messages in thread
From: Victor Kaplansky @ 2017-12-20 20:06 UTC (permalink / raw)
To: Stephen Hemminger
Cc: dev, stable, Jens Freimann, Maxime Coquelin, Yuanhan Liu,
Tiwei Bie, Jianfeng Tan
----- Original Message -----
> From: "Stephen Hemminger" <stephen@networkplumber.org>
> To: "Victor Kaplansky" <victork@redhat.com>
> Cc: dev@dpdk.org, stable@dpdk.org, "Jens Freimann" <jfreiman@redhat.com>, "Maxime Coquelin"
> <maxime.coquelin@redhat.com>, "Yuanhan Liu" <yliu@fridaylinux.org>, "Tiwei Bie" <tiwei.bie@intel.com>, "Jianfeng
> Tan" <jianfeng.tan@intel.com>
> Sent: Wednesday, December 20, 2017 9:06:16 PM
> Subject: Re: [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes
>
> On Wed, 20 Dec 2017 16:37:52 +0200
> Victor Kaplansky <victork@redhat.com> wrote:
>
> > When performing live migration or memory hot-plugging,
> > the changes to the device and vrings made by message handler
> > done independently from vring usage by PMD threads.
> >
> > This causes for example segfaults during live-migration
> > with MQ enable, but in general virtually any request
> > sent by qemu changing the state of device can cause
> > problems.
> >
> > These patches fixes all above issues by adding a spinlock
> > to every vring and requiring message handler to start operation
> > only after ensuring that all PMD threads related to the device
> > are out of critical section accessing the vring data.
> >
> > Each vring has its own lock in order to not create contention
> > between PMD threads of different vrings and to prevent
> > performance degradation by scaling queue pair number.
> >
> > See https://bugzilla.redhat.com/show_bug.cgi?id=1450680
> >
> > Signed-off-by: Victor Kaplansky <victork@redhat.com>
> > ---
> >
> > v4:
> >
> > o moved access_unlock before accessing enable flag and
> > access_unlock after iommu_unlock consistently.
> > o cosmetics: removed blank line.
> > o the access_lock variable moved to be in the same
> > cache line with enable and access_ok flags.
> > o dequeue path is now guarded with trylock and returning
> > zero if unsuccessful.
> > o GET_VRING_BASE operation is not guarded by access lock
> > to avoid deadlock with device_destroy. See the comment
> > in the code.
> > o Fixed error path exit from enqueue and dequeue carefully
> > unlocking access and iommu locks as appropriate.
> >
> > v3:
> > o Added locking to enqueue flow.
> > o Enqueue path guarded as well as dequeue path.
> > o Changed name of active_lock.
> > o Added initialization of guarding spinlock.
> > o Reworked functions skimming over all virt-queues.
> > o Performance measurements done by Maxime Coquelin shows
> > no degradation in bandwidth and throughput.
> > o Spelling.
> > o Taking lock only on set operations.
> > o IOMMU messages are not guarded by access lock.
> >
> > v2:
> > o Fixed checkpatch complains.
> > o Added Signed-off-by.
> > o Refined placement of guard to exclude IOMMU messages.
> > o TODO: performance degradation measurement.
> >
> >
> >
> > lib/librte_vhost/vhost.h | 25 +++++++++++++--
> > lib/librte_vhost/vhost.c | 1 +
> > lib/librte_vhost/vhost_user.c | 71
> > +++++++++++++++++++++++++++++++++++++++++++
> > lib/librte_vhost/virtio_net.c | 28 ++++++++++++++---
> > 4 files changed, 119 insertions(+), 6 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index 1cc81c17..f3e43e95 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -108,12 +108,14 @@ struct vhost_virtqueue {
> >
> > /* Backend value to determine if device should started/stopped */
> > int backend;
> > + int enabled;
> > + int access_ok;
> > + rte_spinlock_t access_lock;
>
> Maybe these int's should be bool?
In this patch I only moved the location of "enabled" and "access_ok",
without changing their type. Changing to bool could be done by another
patch. Anyway, from C/C++ standard point of view, bool type is only promised
to be large enough to store the values 0 and 1, thus if we want a better
control on memory layout, it would we better to leave them as int (or change
to char).
>
> > +
> > /* Used to notify the guest (trigger interrupt) */
> > int callfd;
> > /* Currently unused as polling mode is enabled */
> > int kickfd;
> > - int enabled;
> > - int access_ok;
> >
> > /* Physical address of used ring, for logging */
> > uint64_t log_guest_addr;
> > @@ -302,6 +304,25 @@ vhost_log_used_vring(struct virtio_net *dev, struct
> > vhost_virtqueue *vq,
> > vhost_log_write(dev, vq->log_guest_addr + offset, len);
> > }
> >
> > +static __rte_always_inline int
> > +vhost_user_access_trylock(struct vhost_virtqueue *vq)
> > +{
> > + return rte_spinlock_trylock(&vq->access_lock);
> > +}
> > +
> > +static __rte_always_inline void
> > +vhost_user_access_lock(struct vhost_virtqueue *vq)
> > +{
> > + rte_spinlock_lock(&vq->access_lock);
> > +}
> > +
> > +static __rte_always_inline void
> > +vhost_user_access_unlock(struct vhost_virtqueue *vq)
> > +{
> > + rte_spinlock_unlock(&vq->access_lock);
> > +}
> > +
> > +
>
> Wrapping locking inline's adds nothing and makes life harder
> for static analysis tools.
Yep. In this case it inhibits the details of how the locking is
implemented (e.g. the name of the lock). It also facilitates
replacement of locking mechanism, by another implementation.
See below.
>
> The bigger problem is that doing locking on all enqueue/dequeue
> can have a visible performance impact. Did you measure that?
>
> Could you invent an RCUish mechanism using compiler barriers?
>
I've played a bit with measuring the performance impact. A successful
lock acquisition adds on average about 30 cycles on my Haswell CPU
(and it succeeds 99.999...% of the time).
I can investigate it more, but my initial feeling is that adding a
memory barrier (the real one, not the compiler barrier) would add
about the same overhead.
By the way, the way update_queuing_status() in
drivers/net/vhost/rte_eth_vhost.c tries to avoid contention with
the active queue by playing with "allow_queuing" and "while_queuing"
seems to be broken, since memory barriers are missing.
--
Victor
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes
2017-12-20 20:06 ` Victor Kaplansky
@ 2017-12-20 20:19 ` Stephen Hemminger
2017-12-21 12:41 ` Victor Kaplansky
0 siblings, 1 reply; 5+ messages in thread
From: Stephen Hemminger @ 2017-12-20 20:19 UTC (permalink / raw)
To: Victor Kaplansky
Cc: dev, stable, Jens Freimann, Maxime Coquelin, Yuanhan Liu,
Tiwei Bie, Jianfeng Tan
On Wed, 20 Dec 2017 15:06:30 -0500 (EST)
Victor Kaplansky <vkaplans@redhat.com> wrote:
> > Wrapping locking inline's adds nothing and makes life harder
> > for static analysis tools.
>
> Yep. In this case it inhibits the details of how the locking is
> implemented (e.g. the name of the lock). It also facilitates
> replacement of locking mechanism, by another implementation.
> See below.
YAGNI You aren't gonna need it.
Don't build infrastructure for things that you foresee.
> >
> > The bigger problem is that doing locking on all enqueue/dequeue
> > can have a visible performance impact. Did you measure that?
> >
> > Could you invent an RCUish mechanism using compiler barriers?
> >
>
> I've played a bit with measuring performance impact. Successful
> lock adds on the average about 30 cycles on my Haswell cpu.
> (and it successes 99.999...% of time).
>
> I can investigate it more, but my initial feeling is that adding a
> memory barrier (the real one, not the compiler barrier) would add
> about the same overhead.
>
> By the way, the way update_queuing_status() in
> drivers/net/vhost/rte_eth_vhost.c tries to avoid contention with
> the active queue by playing with "allow_queuing" and "while_queuing"
> seems to be broken, since memory barriers are missing.
CPU cycles alone don't matter on modern x86.
What matters is cache and instructions per cycle.
In this case locking requires locked instruction which causes the cpu
prefetching and instruction pipeline to stall.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes
2017-12-20 20:19 ` Stephen Hemminger
@ 2017-12-21 12:41 ` Victor Kaplansky
0 siblings, 0 replies; 5+ messages in thread
From: Victor Kaplansky @ 2017-12-21 12:41 UTC (permalink / raw)
To: Stephen Hemminger
Cc: dev, stable, Jens Freimann, Maxime Coquelin, Yuanhan Liu,
Tiwei Bie, Jianfeng Tan, Chao Zhu, Roman Dementiev
----- Original Message -----
> From: "Stephen Hemminger" <stephen@networkplumber.org>
> To: "Victor Kaplansky" <vkaplans@redhat.com>
> Cc: dev@dpdk.org, stable@dpdk.org, "Jens Freimann" <jfreiman@redhat.com>, "Maxime Coquelin"
> <maxime.coquelin@redhat.com>, "Yuanhan Liu" <yliu@fridaylinux.org>, "Tiwei Bie" <tiwei.bie@intel.com>, "Jianfeng
> Tan" <jianfeng.tan@intel.com>
> Sent: Wednesday, December 20, 2017 10:19:45 PM
> Subject: Re: [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes
>
> On Wed, 20 Dec 2017 15:06:30 -0500 (EST)
> Victor Kaplansky <vkaplans@redhat.com> wrote:
>
> > > Wrapping locking inline's adds nothing and makes life harder
> > > for static analysis tools.
> >
> > Yep. In this case it inhibits the details of how the locking is
> > implemented (e.g. the name of the lock). It also facilitates
> > replacement of locking mechanism, by another implementation.
> > See below.
>
> YAGNI You aren't gonna need it.
>
> Don't build infrastructure for things that you forsee.
Good point, thanks. I'll simplify this.
>
>
>
>
> > >
> > > The bigger problem is that doing locking on all enqueue/dequeue
> > > can have a visible performance impact. Did you measure that?
> > >
> > > Could you invent an RCUish mechanism using compiler barriers?
> > >
> >
> > I've played a bit with measuring performance impact. Successful
> > lock adds on the average about 30 cycles on my Haswell cpu.
> > (and it successes 99.999...% of time).
> >
> > I can investigate it more, but my initial feeling is that adding a
> > memory barrier (the real one, not the compiler barrier) would add
> > about the same overhead.
> >
> > By the way, the way update_queuing_status() in
> > drivers/net/vhost/rte_eth_vhost.c tries to avoid contention with
> > the active queue by playing with "allow_queuing" and "while_queuing"
> > seems to be broken, since memory barriers are missing.
>
> CPU cycles alone don't matter on modern x86.
> What matters is cache and instructions per cycle.
> In this case locking requires locked instruction which causes the cpu
> prefetching and instruction pipeline to stall.
>
I agree. I've measured the total overhead of the added lock/unlock pair, and
it appears to be around 28 cycles per lock/unlock pair on my 3.5GHz Haswell.
From "Intel® 64 and IA-32 Architectures Software Developer’s Manual
Volume 3A: System Programming Guide, Part 1":
In the Pentium 4, Intel Xeon, and P6 family processors, the
locking operation is handled with either a cache lock or bus
lock. If a memory access is cacheable and affects only a
single cache line, a cache lock is invoked and the system
bus and the actual memory location in system memory are not
locked during the operation. Here, other Pentium 4, Intel
Xeon, or P6 family processors on the bus write-back any
modified data and invalidate their caches as necessary to
maintain system memory coherency. If the memory access is
not cacheable and/or it crosses a cache line boundary, the
processor’s LOCK# signal is asserted and the processor does
not respond to requests for bus control during the locked
operation.
So, the whole memory bus is locked only if the memory access crosses
a cache line boundary (or is not cacheable).
Anyway, I'm open to ways to reduce this overhead. This patch fixes a host
of critical bugs reported in Bugzilla, so if we could pull this fix in now
and optimize it later with a subsequent patch, that would be great.
See below a quick test program I've used to test and measure the overhead.
It also demonstrates the problem I'm trying to fix. Do you have any idea
about using RCU of how to reduce the overhead?
BTW, our implementation of rte_spinlock_unlock() could be slightly faster
if we used a regular move instead of an xchg instruction.
Also, our implementation of rte_spinlock_lock() could be faster if we
optimized it for the success path by making the conditional branch fall through,
or, even better, if we reimplemented the spinlock using gcc builtins.
--
Victor
diff --git a/tlock.c b/tlock.c
deleted file mode 100644
index 62c68852..00000000
--- a/tlock.c
+++ /dev/null
@@ -1,91 +0,0 @@
-#include <pthread.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <stdio.h>
-
-/* build me with:
- gcc -march=native -std=gnu11 -O3 -lpthread tlock.c -o tlock
-*/
-
-
-
-typedef struct {
- volatile int locked; /**< lock status 0 = unlocked, 1 = locked */
-} rte_spinlock_t;
-
-static inline void
-rte_spinlock_lock(rte_spinlock_t *sl)
-{
- int lock_val = 1;
- asm volatile (
- "1:\n"
- "xchg %[locked], %[lv]\n"
- "test %[lv], %[lv]\n"
- "jz 3f\n"
- "2:\n"
- "pause\n"
- "cmpl $0, %[locked]\n"
- "jnz 2b\n"
- "jmp 1b\n"
- "3:\n"
- : [locked] "=m" (sl->locked), [lv] "=q" (lock_val)
- : "[lv]" (lock_val)
- : "memory");
-}
-
-static inline void
-rte_spinlock_unlock (rte_spinlock_t *sl)
-{
- int unlock_val = 0;
- asm volatile (
- "xchg %[locked], %[ulv]\n"
- : [locked] "=m" (sl->locked), [ulv] "=q" (unlock_val)
- : "[ulv]" (unlock_val)
- : "memory");
-}
-
-static unsigned * volatile pointer;
-static rte_spinlock_t reader_access;
-
-void *
-worker(void *unused)
-{
- int i = 0;
- while (1) {
- unsigned *new_pointer = (unsigned *) mmap(NULL, 4096, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS, -1, 0);
- unsigned *old_pointer = pointer;
-
- rte_spinlock_lock(&reader_access);
- pointer = new_pointer;
- rte_spinlock_unlock(&reader_access);
-
- munmap (old_pointer, 4096);
-
- usleep(10000);
-
- }
- return 0;
-
-}
-
-int main()
-{
- pthread_t t;
- pointer = (unsigned *) mmap(NULL, 4096, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-
- pthread_create(&t, 0, worker, NULL);
-
- unsigned n = 400000000;
-
- while (n--) {
- rte_spinlock_lock(&reader_access);
- *pointer = 1;
- rte_spinlock_unlock(&reader_access);
- }
-
- pthread_cancel(t);
- return 0;
-
-}
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2017-12-21 12:41 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-20 14:37 [dpdk-dev] [PATCH v4] vhost_user: protect active rings from async ring changes Victor Kaplansky
2017-12-20 19:06 ` Stephen Hemminger
2017-12-20 20:06 ` Victor Kaplansky
2017-12-20 20:19 ` Stephen Hemminger
2017-12-21 12:41 ` Victor Kaplansky
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).