DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [RFC PATCH] vfio: move eventfd/interrupt pairing at setup time
@ 2019-07-02 12:48 David Marchand
  2019-07-09 13:40 ` [dpdk-dev] [EXT] " Shahed Shaikh
  2019-07-10 12:33 ` [dpdk-dev] [PATCH] vfio: fix interrupts race condition David Marchand
  0 siblings, 2 replies; 8+ messages in thread
From: David Marchand @ 2019-07-02 12:48 UTC (permalink / raw)
  To: dev; +Cc: anatoly.burakov, alex.williamson, maxime.coquelin, thomas, stephen

Populating the eventfd in rte_intr_enable in each request to vfio
triggers a reconfiguration of the interrupt handler on the kernel side.
The problem is that rte_intr_enable is often used to re-enable masked
interrupts from drivers interrupt handlers.

This reconfiguration leaves a window during which a device could send
an interrupt and then the kernel logs this unsolicited interrupt:
[158764.159833] do_IRQ: 9.34 No irq handler for vector

VFIO api makes it possible to set the fd at setup time.
Make use of this and then we only need to ask for masking/unmasking
legacy interrupts and we have nothing to do for MSI/MSIX.

"rxtx" interrupts are left untouched but are most likely subject to the
same issue.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1654824
Signed-off-by: David Marchand <david.marchand@redhat.com>
---
Sending as a RFC patch since it is not trivial and can eat your babies.
Could people that know well this parts have a look at this?
Thanks!

---
 drivers/bus/pci/linux/pci_vfio.c          |  78 ++++++------
 lib/librte_eal/linux/eal/eal_interrupts.c | 197 +++++++-----------------------
 2 files changed, 86 insertions(+), 189 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index ebf6ccd..e4c32c4 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -186,8 +186,11 @@
 static int
 pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
 {
-	int i, ret, intr_idx;
+	char irq_set_buf[sizeof(struct vfio_irq_set) + sizeof(int)];
+	struct vfio_irq_set *irq_set;
 	enum rte_intr_mode intr_mode;
+	int i, ret, intr_idx;
+	int fd;
 
 	/* default to invalid index */
 	intr_idx = VFIO_PCI_NUM_IRQS;
@@ -219,7 +222,6 @@
 	/* start from MSI-X interrupt type */
 	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
 		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
-		int fd = -1;
 
 		/* skip interrupt modes we don't want */
 		if (intr_mode != RTE_INTR_MODE_NONE &&
@@ -235,51 +237,51 @@
 			return -1;
 		}
 
+		/* found a usable interrupt mode */
+		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) != 0)
+			break;
+
 		/* if this vector cannot be used with eventfd, fail if we explicitly
 		 * specified interrupt type, otherwise continue */
-		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
-			if (intr_mode != RTE_INTR_MODE_NONE) {
-				RTE_LOG(ERR, EAL,
-						"  interrupt vector does not support eventfd!\n");
-				return -1;
-			} else
-				continue;
-		}
-
-		/* set up an eventfd for interrupts */
-		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
-		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
-					"error %i (%s)\n", errno, strerror(errno));
+		if (intr_mode != RTE_INTR_MODE_NONE) {
+			RTE_LOG(ERR, EAL, "  interrupt vector does not support eventfd!\n");
 			return -1;
 		}
+	}
 
-		dev->intr_handle.fd = fd;
-		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+	if (i < 0)
+		return -1;
 
-		switch (i) {
-		case VFIO_PCI_MSIX_IRQ_INDEX:
-			intr_mode = RTE_INTR_MODE_MSIX;
-			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
-			break;
-		case VFIO_PCI_MSI_IRQ_INDEX:
-			intr_mode = RTE_INTR_MODE_MSI;
-			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
-			break;
-		case VFIO_PCI_INTX_IRQ_INDEX:
-			intr_mode = RTE_INTR_MODE_LEGACY;
-			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
-			break;
-		default:
-			RTE_LOG(ERR, EAL, "  unknown interrupt type!\n");
-			return -1;
-		}
+	fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "  cannot set up eventfd, error %i (%s)\n",
+			errno, strerror(errno));
+		return -1;
+	}
 
-		return 0;
+	irq_set = (struct vfio_irq_set *)irq_set_buf;
+	irq_set->argsz = sizeof(irq_set_buf);
+	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD|VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = i;
+	irq_set->start = 0;
+	irq_set->count = 1;
+	memcpy(&irq_set->data, &fd, sizeof(int));
+	if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set) < 0) {
+		RTE_LOG(ERR, EAL, "  error configuring interrupt\n");
+		close(fd);
+		return -1;
 	}
 
-	/* if we're here, we haven't found a suitable interrupt vector */
-	return -1;
+	dev->intr_handle.fd = fd;
+	dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+	if (i == VFIO_PCI_MSIX_IRQ_INDEX)
+		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+	else if (i == VFIO_PCI_MSI_IRQ_INDEX)
+		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
+	else if (i == VFIO_PCI_INTX_IRQ_INDEX)
+		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
+
+	return 0;
 }
 
 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c
index 79ad5e8..27976b3 100644
--- a/lib/librte_eal/linux/eal/eal_interrupts.c
+++ b/lib/librte_eal/linux/eal/eal_interrupts.c
@@ -109,42 +109,19 @@ struct rte_intr_source {
 
 /* enable legacy (INTx) interrupts */
 static int
-vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
-	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
-	int len, ret;
-	int *fd_ptr;
-
-	len = sizeof(irq_set_buf);
-
-	/* enable INTx */
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-	irq_set->start = 0;
-	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
+vfio_enable_intx(const struct rte_intr_handle *intr_handle)
+{
+	struct vfio_irq_set irq_set;
+	int ret;
 
-	/* unmask INTx after enabling */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
-	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-	irq_set->start = 0;
+	/* unmask INTx */
+	irq_set.argsz = sizeof(irq_set);
+	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set.start = 0;
+	irq_set.count = 1;
 
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 
 	if (ret) {
 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
@@ -156,128 +133,51 @@ struct rte_intr_source {
 
 /* disable legacy (INTx) interrupts */
 static int
-vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
-	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
-	int len, ret;
-
-	len = sizeof(struct vfio_irq_set);
+vfio_disable_intx(const struct rte_intr_handle *intr_handle)
+{
+	struct vfio_irq_set irq_set;
+	int ret;
 
-	/* mask interrupts before disabling */
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
-	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-	irq_set->start = 0;
+	/* mask interrupts */
+	irq_set.argsz = sizeof(irq_set);
+	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
+	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set.start = 0;
+	irq_set.count = 1;
 
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 
 	if (ret) {
 		RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
 						intr_handle->fd);
 		return -1;
 	}
-
-	/* disable INTx*/
-	memset(irq_set, 0, len);
-	irq_set->argsz = len;
-	irq_set->count = 0;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
-		return -1;
-	}
-	return 0;
-}
-
-/* enable MSI interrupts */
-static int
-vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
-	int len, ret;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
-	struct vfio_irq_set *irq_set;
-	int *fd_ptr;
-
-	len = sizeof(irq_set_buf);
-
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-	irq_set->start = 0;
-	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
-/* disable MSI interrupts */
-static int
-vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
-	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
-	int len, ret;
-
-	len = sizeof(struct vfio_irq_set);
-
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 0;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret)
-		RTE_LOG(ERR, EAL,
-			"Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
-
-	return ret;
-}
-
 /* enable MSI-X interrupts */
 static int
-vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
-	int len, ret;
+vfio_enable_msix(const struct rte_intr_handle *intr_handle)
+{
 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
-	int *fd_ptr;
+	int len, ret;
+
+	if (intr_handle->nb_efd == 0)
+		return 0;
 
 	len = sizeof(irq_set_buf);
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	/* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
-	irq_set->count = intr_handle->max_intr ?
-		(intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
-		RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
-	fd_ptr = (int *) &irq_set->data;
-	/* INTR vector offset 0 reserve for non-efds mapping */
-	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
-	memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
-		sizeof(*intr_handle->efds) * intr_handle->nb_efd);
+	irq_set->start = RTE_INTR_VEC_RXTX_OFFSET;
+	irq_set->count = intr_handle->nb_efd;
+	memcpy(&irq_set->data, intr_handle->efds,
+	       sizeof(*intr_handle->efds) * intr_handle->nb_efd);
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
 	if (ret) {
 		RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
 						intr_handle->fd);
@@ -289,22 +189,21 @@ struct rte_intr_source {
 
 /* disable MSI-X interrupts */
 static int
-vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
-	struct vfio_irq_set *irq_set;
-	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
-	int len, ret;
-
-	len = sizeof(struct vfio_irq_set);
+vfio_disable_msix(const struct rte_intr_handle *intr_handle)
+{
+	struct vfio_irq_set irq_set;
+	int ret;
 
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 0;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
+	if (intr_handle->nb_efd == 0)
+		return 0;
 
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+	irq_set.argsz = sizeof(irq_set);
+	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set.index = VFIO_PCI_MSIX_IRQ_INDEX;
+	irq_set.start = RTE_INTR_VEC_RXTX_OFFSET;
+	irq_set.count = intr_handle->nb_efd;
 
+	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 	if (ret)
 		RTE_LOG(ERR, EAL,
 			"Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
@@ -665,9 +564,7 @@ struct rte_intr_source {
 			return -1;
 		break;
 	case RTE_INTR_HANDLE_VFIO_MSI:
-		if (vfio_enable_msi(intr_handle))
-			return -1;
-		break;
+		return 0;
 	case RTE_INTR_HANDLE_VFIO_LEGACY:
 		if (vfio_enable_intx(intr_handle))
 			return -1;
@@ -721,9 +618,7 @@ struct rte_intr_source {
 			return -1;
 		break;
 	case RTE_INTR_HANDLE_VFIO_MSI:
-		if (vfio_disable_msi(intr_handle))
-			return -1;
-		break;
+		return 0;
 	case RTE_INTR_HANDLE_VFIO_LEGACY:
 		if (vfio_disable_intx(intr_handle))
 			return -1;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [EXT] [RFC PATCH] vfio: move eventfd/interrupt pairing at setup time
  2019-07-02 12:48 [dpdk-dev] [RFC PATCH] vfio: move eventfd/interrupt pairing at setup time David Marchand
@ 2019-07-09 13:40 ` Shahed Shaikh
  2019-07-10 12:33 ` [dpdk-dev] [PATCH] vfio: fix interrupts race condition David Marchand
  1 sibling, 0 replies; 8+ messages in thread
From: Shahed Shaikh @ 2019-07-09 13:40 UTC (permalink / raw)
  To: David Marchand, dev
  Cc: anatoly.burakov, alex.williamson, maxime.coquelin, thomas, stephen

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of David Marchand
> Sent: Tuesday, July 2, 2019 6:18 PM
> To: dev@dpdk.org
> Cc: anatoly.burakov@intel.com; alex.williamson@redhat.com;
> maxime.coquelin@redhat.com; thomas@monjalon.net;
> stephen@networkplumber.org
> Subject: [EXT] [dpdk-dev] [RFC PATCH] vfio: move eventfd/interrupt pairing at
> setup time
> 
> 
> ----------------------------------------------------------------------
> Populating the eventfd in rte_intr_enable in each request to vfio triggers a
> reconfiguration of the interrupt handler on the kernel side.
> The problem is that rte_intr_enable is often used to re-enable masked interrupts
> from drivers interrupt handlers.
> 
> This reconfiguration leaves a window during which a device could send an
> interrupt and then the kernel logs this unsolicited interrupt:
> [158764.159833] do_IRQ: 9.34 No irq handler for vector
> 
> VFIO api makes it possible to set the fd at setup time.
> Make use of this and then we only need to ask for masking/unmasking legacy
> interrupts and we have nothing to do for MSI/MSIX.
> 
> "rxtx" interrupts are left untouched but are most likely subject to the same issue.
> 
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1654824
> Signed-off-by: David Marchand <david.marchand@redhat.com>

Thanks David for this patch.
I have tested this patch with FastLinq adapters on which original issue is reported.
Tested this with MSI-x and legacy interrupt mode.

Tested-by: Shahed Shaikh <shshaikh@marvell.com>

> ---
> Sending as a RFC patch since it is not trivial and can eat your babies.
> Could people that know well this parts have a look at this?
> Thanks!
> 
> ---
>  drivers/bus/pci/linux/pci_vfio.c          |  78 ++++++------
>  lib/librte_eal/linux/eal/eal_interrupts.c | 197 +++++++-----------------------
>  2 files changed, 86 insertions(+), 189 deletions(-)
> 
> diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
> index ebf6ccd..e4c32c4 100644
> --- a/drivers/bus/pci/linux/pci_vfio.c
> +++ b/drivers/bus/pci/linux/pci_vfio.c
> @@ -186,8 +186,11 @@
>  static int
>  pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)  {
> -	int i, ret, intr_idx;
> +	char irq_set_buf[sizeof(struct vfio_irq_set) + sizeof(int)];
> +	struct vfio_irq_set *irq_set;
>  	enum rte_intr_mode intr_mode;
> +	int i, ret, intr_idx;
> +	int fd;
> 
>  	/* default to invalid index */
>  	intr_idx = VFIO_PCI_NUM_IRQS;
> @@ -219,7 +222,6 @@
>  	/* start from MSI-X interrupt type */
>  	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
>  		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
> -		int fd = -1;
> 
>  		/* skip interrupt modes we don't want */
>  		if (intr_mode != RTE_INTR_MODE_NONE && @@ -235,51
> +237,51 @@
>  			return -1;
>  		}
> 
> +		/* found a usable interrupt mode */
> +		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) != 0)
> +			break;
> +
>  		/* if this vector cannot be used with eventfd, fail if we explicitly
>  		 * specified interrupt type, otherwise continue */
> -		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
> -			if (intr_mode != RTE_INTR_MODE_NONE) {
> -				RTE_LOG(ERR, EAL,
> -						"  interrupt vector does not
> support eventfd!\n");
> -				return -1;
> -			} else
> -				continue;
> -		}
> -
> -		/* set up an eventfd for interrupts */
> -		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> -		if (fd < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
> -					"error %i (%s)\n", errno,
> strerror(errno));
> +		if (intr_mode != RTE_INTR_MODE_NONE) {
> +			RTE_LOG(ERR, EAL, "  interrupt vector does not support
> eventfd!\n");
>  			return -1;
>  		}
> +	}
> 
> -		dev->intr_handle.fd = fd;
> -		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
> +	if (i < 0)
> +		return -1;
> 
> -		switch (i) {
> -		case VFIO_PCI_MSIX_IRQ_INDEX:
> -			intr_mode = RTE_INTR_MODE_MSIX;
> -			dev->intr_handle.type =
> RTE_INTR_HANDLE_VFIO_MSIX;
> -			break;
> -		case VFIO_PCI_MSI_IRQ_INDEX:
> -			intr_mode = RTE_INTR_MODE_MSI;
> -			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
> -			break;
> -		case VFIO_PCI_INTX_IRQ_INDEX:
> -			intr_mode = RTE_INTR_MODE_LEGACY;
> -			dev->intr_handle.type =
> RTE_INTR_HANDLE_VFIO_LEGACY;
> -			break;
> -		default:
> -			RTE_LOG(ERR, EAL, "  unknown interrupt type!\n");
> -			return -1;
> -		}
> +	fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> +	if (fd < 0) {
> +		RTE_LOG(ERR, EAL, "  cannot set up eventfd, error %i (%s)\n",
> +			errno, strerror(errno));
> +		return -1;
> +	}
> 
> -		return 0;
> +	irq_set = (struct vfio_irq_set *)irq_set_buf;
> +	irq_set->argsz = sizeof(irq_set_buf);
> +	irq_set->flags =
> VFIO_IRQ_SET_DATA_EVENTFD|VFIO_IRQ_SET_ACTION_TRIGGER;
> +	irq_set->index = i;
> +	irq_set->start = 0;
> +	irq_set->count = 1;
> +	memcpy(&irq_set->data, &fd, sizeof(int));
> +	if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set) < 0) {
> +		RTE_LOG(ERR, EAL, "  error configuring interrupt\n");
> +		close(fd);
> +		return -1;
>  	}
> 
> -	/* if we're here, we haven't found a suitable interrupt vector */
> -	return -1;
> +	dev->intr_handle.fd = fd;
> +	dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
> +	if (i == VFIO_PCI_MSIX_IRQ_INDEX)
> +		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
> +	else if (i == VFIO_PCI_MSI_IRQ_INDEX)
> +		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
> +	else if (i == VFIO_PCI_INTX_IRQ_INDEX)
> +		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
> +
> +	return 0;
>  }
> 
>  #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
> diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c
> b/lib/librte_eal/linux/eal/eal_interrupts.c
> index 79ad5e8..27976b3 100644
> --- a/lib/librte_eal/linux/eal/eal_interrupts.c
> +++ b/lib/librte_eal/linux/eal/eal_interrupts.c
> @@ -109,42 +109,19 @@ struct rte_intr_source {
> 
>  /* enable legacy (INTx) interrupts */
>  static int
> -vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
> -	struct vfio_irq_set *irq_set;
> -	char irq_set_buf[IRQ_SET_BUF_LEN];
> -	int len, ret;
> -	int *fd_ptr;
> -
> -	len = sizeof(irq_set_buf);
> -
> -	/* enable INTx */
> -	irq_set = (struct vfio_irq_set *) irq_set_buf;
> -	irq_set->argsz = len;
> -	irq_set->count = 1;
> -	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> VFIO_IRQ_SET_ACTION_TRIGGER;
> -	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> -	irq_set->start = 0;
> -	fd_ptr = (int *) &irq_set->data;
> -	*fd_ptr = intr_handle->fd;
> -
> -	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> -
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd
> %d\n",
> -						intr_handle->fd);
> -		return -1;
> -	}
> +vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
> +	struct vfio_irq_set irq_set;
> +	int ret;
> 
> -	/* unmask INTx after enabling */
> -	memset(irq_set, 0, len);
> -	len = sizeof(struct vfio_irq_set);
> -	irq_set->argsz = len;
> -	irq_set->count = 1;
> -	irq_set->flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_UNMASK;
> -	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> -	irq_set->start = 0;
> +	/* unmask INTx */
> +	irq_set.argsz = sizeof(irq_set);
> +	irq_set.flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_UNMASK;
> +	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
> +	irq_set.start = 0;
> +	irq_set.count = 1;
> 
> -	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> 
>  	if (ret) {
>  		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd
> %d\n", @@ -156,128 +133,51 @@ struct rte_intr_source {
> 
>  /* disable legacy (INTx) interrupts */
>  static int
> -vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
> -	struct vfio_irq_set *irq_set;
> -	char irq_set_buf[IRQ_SET_BUF_LEN];
> -	int len, ret;
> -
> -	len = sizeof(struct vfio_irq_set);
> +vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
> +	struct vfio_irq_set irq_set;
> +	int ret;
> 
> -	/* mask interrupts before disabling */
> -	irq_set = (struct vfio_irq_set *) irq_set_buf;
> -	irq_set->argsz = len;
> -	irq_set->count = 1;
> -	irq_set->flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_MASK;
> -	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> -	irq_set->start = 0;
> +	/* mask interrupts */
> +	irq_set.argsz = sizeof(irq_set);
> +	irq_set.flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_MASK;
> +	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
> +	irq_set.start = 0;
> +	irq_set.count = 1;
> 
> -	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> 
>  	if (ret) {
>  		RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
>  						intr_handle->fd);
>  		return -1;
>  	}
> -
> -	/* disable INTx*/
> -	memset(irq_set, 0, len);
> -	irq_set->argsz = len;
> -	irq_set->count = 0;
> -	irq_set->flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_TRIGGER;
> -	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> -	irq_set->start = 0;
> -
> -	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> -
> -	if (ret) {
> -		RTE_LOG(ERR, EAL,
> -			"Error disabling INTx interrupts for fd %d\n",
> intr_handle->fd);
> -		return -1;
> -	}
> -	return 0;
> -}
> -
> -/* enable MSI interrupts */
> -static int
> -vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
> -	int len, ret;
> -	char irq_set_buf[IRQ_SET_BUF_LEN];
> -	struct vfio_irq_set *irq_set;
> -	int *fd_ptr;
> -
> -	len = sizeof(irq_set_buf);
> -
> -	irq_set = (struct vfio_irq_set *) irq_set_buf;
> -	irq_set->argsz = len;
> -	irq_set->count = 1;
> -	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> VFIO_IRQ_SET_ACTION_TRIGGER;
> -	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
> -	irq_set->start = 0;
> -	fd_ptr = (int *) &irq_set->data;
> -	*fd_ptr = intr_handle->fd;
> -
> -	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> -
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
> -						intr_handle->fd);
> -		return -1;
> -	}
>  	return 0;
>  }
> 
> -/* disable MSI interrupts */
> -static int
> -vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
> -	struct vfio_irq_set *irq_set;
> -	char irq_set_buf[IRQ_SET_BUF_LEN];
> -	int len, ret;
> -
> -	len = sizeof(struct vfio_irq_set);
> -
> -	irq_set = (struct vfio_irq_set *) irq_set_buf;
> -	irq_set->argsz = len;
> -	irq_set->count = 0;
> -	irq_set->flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_TRIGGER;
> -	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
> -	irq_set->start = 0;
> -
> -	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> -
> -	if (ret)
> -		RTE_LOG(ERR, EAL,
> -			"Error disabling MSI interrupts for fd %d\n",
> intr_handle->fd);
> -
> -	return ret;
> -}
> -
>  /* enable MSI-X interrupts */
>  static int
> -vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
> -	int len, ret;
> +vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
>  	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
>  	struct vfio_irq_set *irq_set;
> -	int *fd_ptr;
> +	int len, ret;
> +
> +	if (intr_handle->nb_efd == 0)
> +		return 0;
> 
>  	len = sizeof(irq_set_buf);
> 
>  	irq_set = (struct vfio_irq_set *) irq_set_buf;
>  	irq_set->argsz = len;
> -	/* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
> -	irq_set->count = intr_handle->max_intr ?
> -		(intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
> -		RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
>  	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> VFIO_IRQ_SET_ACTION_TRIGGER;
>  	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
> -	irq_set->start = 0;
> -	fd_ptr = (int *) &irq_set->data;
> -	/* INTR vector offset 0 reserve for non-efds mapping */
> -	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
> -	memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
> -		sizeof(*intr_handle->efds) * intr_handle->nb_efd);
> +	irq_set->start = RTE_INTR_VEC_RXTX_OFFSET;
> +	irq_set->count = intr_handle->nb_efd;
> +	memcpy(&irq_set->data, intr_handle->efds,
> +	       sizeof(*intr_handle->efds) * intr_handle->nb_efd);
> 
>  	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> -
>  	if (ret) {
>  		RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd
> %d\n",
>  						intr_handle->fd);
> @@ -289,22 +189,21 @@ struct rte_intr_source {
> 
>  /* disable MSI-X interrupts */
>  static int
> -vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
> -	struct vfio_irq_set *irq_set;
> -	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
> -	int len, ret;
> -
> -	len = sizeof(struct vfio_irq_set);
> +vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
> +	struct vfio_irq_set irq_set;
> +	int ret;
> 
> -	irq_set = (struct vfio_irq_set *) irq_set_buf;
> -	irq_set->argsz = len;
> -	irq_set->count = 0;
> -	irq_set->flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_TRIGGER;
> -	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
> -	irq_set->start = 0;
> +	if (intr_handle->nb_efd == 0)
> +		return 0;
> 
> -	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +	irq_set.argsz = sizeof(irq_set);
> +	irq_set.flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_TRIGGER;
> +	irq_set.index = VFIO_PCI_MSIX_IRQ_INDEX;
> +	irq_set.start = RTE_INTR_VEC_RXTX_OFFSET;
> +	irq_set.count = intr_handle->nb_efd;
> 
> +	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
>  	if (ret)
>  		RTE_LOG(ERR, EAL,
>  			"Error disabling MSI-X interrupts for fd %d\n",
> intr_handle->fd); @@ -665,9 +564,7 @@ struct rte_intr_source {
>  			return -1;
>  		break;
>  	case RTE_INTR_HANDLE_VFIO_MSI:
> -		if (vfio_enable_msi(intr_handle))
> -			return -1;
> -		break;
> +		return 0;
>  	case RTE_INTR_HANDLE_VFIO_LEGACY:
>  		if (vfio_enable_intx(intr_handle))
>  			return -1;
> @@ -721,9 +618,7 @@ struct rte_intr_source {
>  			return -1;
>  		break;
>  	case RTE_INTR_HANDLE_VFIO_MSI:
> -		if (vfio_disable_msi(intr_handle))
> -			return -1;
> -		break;
> +		return 0;
>  	case RTE_INTR_HANDLE_VFIO_LEGACY:
>  		if (vfio_disable_intx(intr_handle))
>  			return -1;
> --
> 1.8.3.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH] vfio: fix interrupts race condition
  2019-07-02 12:48 [dpdk-dev] [RFC PATCH] vfio: move eventfd/interrupt pairing at setup time David Marchand
  2019-07-09 13:40 ` [dpdk-dev] [EXT] " Shahed Shaikh
@ 2019-07-10 12:33 ` David Marchand
  2019-07-10 21:20   ` Thomas Monjalon
  1 sibling, 1 reply; 8+ messages in thread
From: David Marchand @ 2019-07-10 12:33 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, alex.williamson, maxime.coquelin, thomas,
	stephen, igor.russkikh, pavel.belous, allain.legacy, matt.peters,
	ravi1.kumar, rmody, shshaikh, ajit.khaparde, somnath.kotur,
	hemant.agrawal, shreyansh.jain, wenzhuo.lu, mw, mk, gtzalik,
	evgenys, johndale, hyonkim, qi.z.zhang, xiao.w.wang, xuanziyang2,
	cloud.wangxiaoyun, zhouguoyang, beilei.xing, jingjing.wu,
	qiming.yang, konstantin.ananyev, alejandro.lucero, arybchenko,
	tiwei.bie, zhihong.wang, yongwang, stable

Populating the eventfd in rte_intr_enable in each request to vfio
triggers a reconfiguration of the interrupt handler on the kernel side.
The problem is that rte_intr_enable is often used to re-enable masked
interrupts from drivers interrupt handlers.

This reconfiguration leaves a window during which a device could send
an interrupt and then the kernel logs this (unsolicited from the kernel
point of view) interrupt:
[158764.159833] do_IRQ: 9.34 No irq handler for vector

VFIO api makes it possible to set the fd at setup time.
Make use of this and then we only need to ask for masking/unmasking
legacy interrupts and we have nothing to do for MSI/MSIX.

"rxtx" interrupts are left untouched but are most likely subject to the
same issue.

Fixes: 5c782b3928b8 ("vfio: interrupts")
Cc: stable@dpdk.org

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1654824
Signed-off-by: David Marchand <david.marchand@redhat.com>
Tested-by: Shahed Shaikh <shshaikh@marvell.com>
---
 drivers/bus/pci/linux/pci_vfio.c          |  78 ++++++------
 lib/librte_eal/linux/eal/eal_interrupts.c | 197 +++++++-----------------------
 2 files changed, 86 insertions(+), 189 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 1ceb1c0..ee31239 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -187,8 +187,11 @@
 static int
 pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
 {
-	int i, ret, intr_idx;
+	char irq_set_buf[sizeof(struct vfio_irq_set) + sizeof(int)];
+	struct vfio_irq_set *irq_set;
 	enum rte_intr_mode intr_mode;
+	int i, ret, intr_idx;
+	int fd;
 
 	/* default to invalid index */
 	intr_idx = VFIO_PCI_NUM_IRQS;
@@ -220,7 +223,6 @@
 	/* start from MSI-X interrupt type */
 	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
 		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
-		int fd = -1;
 
 		/* skip interrupt modes we don't want */
 		if (intr_mode != RTE_INTR_MODE_NONE &&
@@ -236,51 +238,51 @@
 			return -1;
 		}
 
+		/* found a usable interrupt mode */
+		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) != 0)
+			break;
+
 		/* if this vector cannot be used with eventfd, fail if we explicitly
 		 * specified interrupt type, otherwise continue */
-		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
-			if (intr_mode != RTE_INTR_MODE_NONE) {
-				RTE_LOG(ERR, EAL,
-						"  interrupt vector does not support eventfd!\n");
-				return -1;
-			} else
-				continue;
-		}
-
-		/* set up an eventfd for interrupts */
-		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
-		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
-					"error %i (%s)\n", errno, strerror(errno));
+		if (intr_mode != RTE_INTR_MODE_NONE) {
+			RTE_LOG(ERR, EAL, "  interrupt vector does not support eventfd!\n");
 			return -1;
 		}
+	}
 
-		dev->intr_handle.fd = fd;
-		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+	if (i < 0)
+		return -1;
 
-		switch (i) {
-		case VFIO_PCI_MSIX_IRQ_INDEX:
-			intr_mode = RTE_INTR_MODE_MSIX;
-			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
-			break;
-		case VFIO_PCI_MSI_IRQ_INDEX:
-			intr_mode = RTE_INTR_MODE_MSI;
-			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
-			break;
-		case VFIO_PCI_INTX_IRQ_INDEX:
-			intr_mode = RTE_INTR_MODE_LEGACY;
-			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
-			break;
-		default:
-			RTE_LOG(ERR, EAL, "  unknown interrupt type!\n");
-			return -1;
-		}
+	fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "  cannot set up eventfd, error %i (%s)\n",
+			errno, strerror(errno));
+		return -1;
+	}
 
-		return 0;
+	irq_set = (struct vfio_irq_set *)irq_set_buf;
+	irq_set->argsz = sizeof(irq_set_buf);
+	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD|VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = i;
+	irq_set->start = 0;
+	irq_set->count = 1;
+	memcpy(&irq_set->data, &fd, sizeof(int));
+	if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set) < 0) {
+		RTE_LOG(ERR, EAL, "  error configuring interrupt\n");
+		close(fd);
+		return -1;
 	}
 
-	/* if we're here, we haven't found a suitable interrupt vector */
-	return -1;
+	dev->intr_handle.fd = fd;
+	dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+	if (i == VFIO_PCI_MSIX_IRQ_INDEX)
+		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+	else if (i == VFIO_PCI_MSI_IRQ_INDEX)
+		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
+	else if (i == VFIO_PCI_INTX_IRQ_INDEX)
+		dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
+
+	return 0;
 }
 
 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c
index 79ad5e8..27976b3 100644
--- a/lib/librte_eal/linux/eal/eal_interrupts.c
+++ b/lib/librte_eal/linux/eal/eal_interrupts.c
@@ -109,42 +109,19 @@ struct rte_intr_source {
 
 /* enable legacy (INTx) interrupts */
 static int
-vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
-	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
-	int len, ret;
-	int *fd_ptr;
-
-	len = sizeof(irq_set_buf);
-
-	/* enable INTx */
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-	irq_set->start = 0;
-	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
+vfio_enable_intx(const struct rte_intr_handle *intr_handle)
+{
+	struct vfio_irq_set irq_set;
+	int ret;
 
-	/* unmask INTx after enabling */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
-	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-	irq_set->start = 0;
+	/* unmask INTx */
+	irq_set.argsz = sizeof(irq_set);
+	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set.start = 0;
+	irq_set.count = 1;
 
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 
 	if (ret) {
 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
@@ -156,128 +133,51 @@ struct rte_intr_source {
 
 /* disable legacy (INTx) interrupts */
 static int
-vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
-	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
-	int len, ret;
-
-	len = sizeof(struct vfio_irq_set);
+vfio_disable_intx(const struct rte_intr_handle *intr_handle)
+{
+	struct vfio_irq_set irq_set;
+	int ret;
 
-	/* mask interrupts before disabling */
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
-	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-	irq_set->start = 0;
+	/* mask interrupts */
+	irq_set.argsz = sizeof(irq_set);
+	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
+	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set.start = 0;
+	irq_set.count = 1;
 
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 
 	if (ret) {
 		RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
 						intr_handle->fd);
 		return -1;
 	}
-
-	/* disable INTx*/
-	memset(irq_set, 0, len);
-	irq_set->argsz = len;
-	irq_set->count = 0;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
-		return -1;
-	}
-	return 0;
-}
-
-/* enable MSI interrupts */
-static int
-vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
-	int len, ret;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
-	struct vfio_irq_set *irq_set;
-	int *fd_ptr;
-
-	len = sizeof(irq_set_buf);
-
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-	irq_set->start = 0;
-	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
-/* disable MSI interrupts */
-static int
-vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
-	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
-	int len, ret;
-
-	len = sizeof(struct vfio_irq_set);
-
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 0;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret)
-		RTE_LOG(ERR, EAL,
-			"Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
-
-	return ret;
-}
-
 /* enable MSI-X interrupts */
 static int
-vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
-	int len, ret;
+vfio_enable_msix(const struct rte_intr_handle *intr_handle)
+{
 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
-	int *fd_ptr;
+	int len, ret;
+
+	if (intr_handle->nb_efd == 0)
+		return 0;
 
 	len = sizeof(irq_set_buf);
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	/* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
-	irq_set->count = intr_handle->max_intr ?
-		(intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
-		RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
-	fd_ptr = (int *) &irq_set->data;
-	/* INTR vector offset 0 reserve for non-efds mapping */
-	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
-	memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
-		sizeof(*intr_handle->efds) * intr_handle->nb_efd);
+	irq_set->start = RTE_INTR_VEC_RXTX_OFFSET;
+	irq_set->count = intr_handle->nb_efd;
+	memcpy(&irq_set->data, intr_handle->efds,
+	       sizeof(*intr_handle->efds) * intr_handle->nb_efd);
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
 	if (ret) {
 		RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
 						intr_handle->fd);
@@ -289,22 +189,21 @@ struct rte_intr_source {
 
 /* disable MSI-X interrupts */
 static int
-vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
-	struct vfio_irq_set *irq_set;
-	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
-	int len, ret;
-
-	len = sizeof(struct vfio_irq_set);
+vfio_disable_msix(const struct rte_intr_handle *intr_handle)
+{
+	struct vfio_irq_set irq_set;
+	int ret;
 
-	irq_set = (struct vfio_irq_set *) irq_set_buf;
-	irq_set->argsz = len;
-	irq_set->count = 0;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
+	if (intr_handle->nb_efd == 0)
+		return 0;
 
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+	irq_set.argsz = sizeof(irq_set);
+	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set.index = VFIO_PCI_MSIX_IRQ_INDEX;
+	irq_set.start = RTE_INTR_VEC_RXTX_OFFSET;
+	irq_set.count = intr_handle->nb_efd;
 
+	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 	if (ret)
 		RTE_LOG(ERR, EAL,
 			"Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
@@ -665,9 +564,7 @@ struct rte_intr_source {
 			return -1;
 		break;
 	case RTE_INTR_HANDLE_VFIO_MSI:
-		if (vfio_enable_msi(intr_handle))
-			return -1;
-		break;
+		return 0;
 	case RTE_INTR_HANDLE_VFIO_LEGACY:
 		if (vfio_enable_intx(intr_handle))
 			return -1;
@@ -721,9 +618,7 @@ struct rte_intr_source {
 			return -1;
 		break;
 	case RTE_INTR_HANDLE_VFIO_MSI:
-		if (vfio_disable_msi(intr_handle))
-			return -1;
-		break;
+		return 0;
 	case RTE_INTR_HANDLE_VFIO_LEGACY:
 		if (vfio_disable_intx(intr_handle))
 			return -1;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH] vfio: fix interrupts race condition
  2019-07-10 12:33 ` [dpdk-dev] [PATCH] vfio: fix interrupts race condition David Marchand
@ 2019-07-10 21:20   ` Thomas Monjalon
  2019-07-14  5:10     ` Hyong Youb Kim (hyonkim)
  0 siblings, 1 reply; 8+ messages in thread
From: Thomas Monjalon @ 2019-07-10 21:20 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, anatoly.burakov, alex.williamson, maxime.coquelin, stephen,
	igor.russkikh, pavel.belous, allain.legacy, matt.peters,
	ravi1.kumar, rmody, shshaikh, ajit.khaparde, somnath.kotur,
	hemant.agrawal, shreyansh.jain, wenzhuo.lu, mw, mk, gtzalik,
	evgenys, johndale, hyonkim, qi.z.zhang, xiao.w.wang, xuanziyang2,
	cloud.wangxiaoyun, zhouguoyang, beilei.xing, jingjing.wu,
	qiming.yang, konstantin.ananyev, alejandro.lucero, arybchenko,
	tiwei.bie, zhihong.wang, yongwang, stable

10/07/2019 14:33, David Marchand:
> Populating the eventfd in rte_intr_enable in each request to vfio
> triggers a reconfiguration of the interrupt handler on the kernel side.
> The problem is that rte_intr_enable is often used to re-enable masked
> interrupts from drivers interrupt handlers.
> 
> This reconfiguration leaves a window during which a device could send
> an interrupt and then the kernel logs this (unsolicited from the kernel
> point of view) interrupt:
> [158764.159833] do_IRQ: 9.34 No irq handler for vector
> 
> VFIO api makes it possible to set the fd at setup time.
> Make use of this and then we only need to ask for masking/unmasking
> legacy interrupts and we have nothing to do for MSI/MSIX.
> 
> "rxtx" interrupts are left untouched but are most likely subject to the
> same issue.
> 
> Fixes: 5c782b3928b8 ("vfio: interrupts")
> Cc: stable@dpdk.org
> 
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1654824
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> Tested-by: Shahed Shaikh <shshaikh@marvell.com>

This is a real bug which should be fixed in this release.
As the patch is quite big and needs a strong validation,
I prefer merging it quickly to give a lot of time before
releasing 19.08-rc2.
The maintainers of all concerned PMDs are Cc.
Please make sure the interrupts are still working well with VFIO.

Applied, thanks



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH] vfio: fix interrupts race condition
  2019-07-10 21:20   ` Thomas Monjalon
@ 2019-07-14  5:10     ` Hyong Youb Kim (hyonkim)
  2019-07-14 11:19       ` Thomas Monjalon
  0 siblings, 1 reply; 8+ messages in thread
From: Hyong Youb Kim (hyonkim) @ 2019-07-14  5:10 UTC (permalink / raw)
  To: Thomas Monjalon, David Marchand
  Cc: dev, anatoly.burakov, alex.williamson, maxime.coquelin, stephen,
	igor.russkikh, pavel.belous, allain.legacy, matt.peters,
	ravi1.kumar, rmody, shshaikh, ajit.khaparde, somnath.kotur,
	hemant.agrawal, shreyansh.jain, wenzhuo.lu, mw, mk, gtzalik,
	evgenys, John Daley (johndale),
	qi.z.zhang, xiao.w.wang, xuanziyang2, cloud.wangxiaoyun,
	zhouguoyang, beilei.xing, jingjing.wu, qiming.yang,
	konstantin.ananyev, alejandro.lucero, arybchenko, tiwei.bie,
	zhihong.wang, yongwang, stable

> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Thursday, July 11, 2019 6:21 AM
[...]
> Subject: Re: [dpdk-dev] [PATCH] vfio: fix interrupts race condition
> 
> 10/07/2019 14:33, David Marchand:
> > Populating the eventfd in rte_intr_enable in each request to vfio
> > triggers a reconfiguration of the interrupt handler on the kernel side.
> > The problem is that rte_intr_enable is often used to re-enable masked
> > interrupts from drivers interrupt handlers.
> >
> > This reconfiguration leaves a window during which a device could send
> > an interrupt and then the kernel logs this (unsolicited from the kernel
> > point of view) interrupt:
> > [158764.159833] do_IRQ: 9.34 No irq handler for vector
> >
> > VFIO api makes it possible to set the fd at setup time.
> > Make use of this and then we only need to ask for masking/unmasking
> > legacy interrupts and we have nothing to do for MSI/MSIX.
> >
> > "rxtx" interrupts are left untouched but are most likely subject to the
> > same issue.
> >
> > Fixes: 5c782b3928b8 ("vfio: interrupts")
> > Cc: stable@dpdk.org
> >
> > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1654824
> > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > Tested-by: Shahed Shaikh <shshaikh@marvell.com>
> 
> This is a real bug which should be fixed in this release.
> As the patch is quite big and needs a strong validation,
> I prefer merging it quickly to give a lot of time before
> releasing 19.08-rc2.
> The maintainers of all concerned PMDs are Cc.
> Please make sure the interrupts are still working well with VFIO.
> 
> Applied, thanks
> 

[Apologies in advance if email format gets messed up. Forced to use
outlook for the first time..]

Hi,

This commit breaks MSI-X + rxq interrupts. I think others are seeing
the same error?

sudo ~/dpdk/examples/l3fwd-power/build/l3fwd-power \
-c 0x1e -n 4 -w 0000:1a:00.0 --log-level=pmd,debug -- -p 0x1 -P --config "(0,0,2),(0,1,3),(0,2,4)"
[...]
EAL: Error enabling MSI-X interrupts for fd 35

A rough sequence of events goes like this. The above test is using 3
rxqs (3 interrupts).

1. During probe, pci_vfio_setup_interrupts() runs.
This now does ioctl(VFIO_DEVICE_SET_IRQS) for the 1st efd
(intr_handle->fd).

ioctl does:
- pci_enable_msix(1 vector) because this is the first time enabling
  interrupts.
- request_irq(vector 0)

2. App configs
The app sets port_conf.intr_conf.rxq=1, configs 3 rxqs, etc.

3. rte_eth_dev_start()
PMD calls:
- rte_intr_efd_enable()
  This creates 3 efds (intr_handle->nb_efd = 3).
- rte_intr_enable() => vfio_enable_msix()
  This does ioctl(VFIO_DEVICE_SET_IRQS) for the 3 efds.

ioctl now needs to request_irq() for vectors 1, 2, 3 for the 3 new
efds. It does not do another pci_enable_msix() as it has been done
earlier. Before calling request_irq(), it sees that only 1 vector was
enabled in earlier pci_enable_msix(), so it fails with EINVAL.

We would need pci_enable_msix(4 vectors) for this to work
(intr_handle->fd + 3 efds).

Prior to this patch, VFIO_DEVICE_SET_IRQS is done only in
vfio_enable_msix(). So, ioctl ends up doing pci_enable_msix(4 vectors)
and request_irq() for each of the 4 efds, which completes
successfully.

Not an expert in this area.. Perhaps, defer enabling 1st efd
(intr_handle->fd) until the first invocation of vfio_enable_msix(), so
it knows the app wants to use 4 vectors in total?

Also, vfio_disable_msix() looks a bit wrong.

        irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set.index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set.start = RTE_INTR_VEC_RXTX_OFFSET;
        irq_set.count = intr_handle->nb_efd;

This tells vfio-pci to simulate interrupts by triggering efds? To
free_irq() specific efds, I think we need DATA_EVENTFD and set fd =
-1.

flags = DATA_EVENTFD | ACTION_TRIGGER
data = [fd(-1), fd(-1), ...]

I have not tested this part myself yet.

Thanks..
-Hyong


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH] vfio: fix interrupts race condition
  2019-07-14  5:10     ` Hyong Youb Kim (hyonkim)
@ 2019-07-14 11:19       ` Thomas Monjalon
  2019-07-15  5:35         ` Jerin Jacob Kollanukkaran
  0 siblings, 1 reply; 8+ messages in thread
From: Thomas Monjalon @ 2019-07-14 11:19 UTC (permalink / raw)
  To: Hyong Youb Kim (hyonkim)
  Cc: David Marchand, dev, anatoly.burakov, alex.williamson,
	maxime.coquelin, stephen, igor.russkikh, pavel.belous,
	allain.legacy, matt.peters, ravi1.kumar, rmody, shshaikh,
	ajit.khaparde, somnath.kotur, hemant.agrawal, shreyansh.jain,
	wenzhuo.lu, mw, mk, gtzalik, evgenys, John Daley (johndale),
	qi.z.zhang, xiao.w.wang, xuanziyang2, cloud.wangxiaoyun,
	zhouguoyang, beilei.xing, jingjing.wu, qiming.yang,
	konstantin.ananyev, alejandro.lucero, arybchenko, tiwei.bie,
	zhihong.wang, yongwang, stable

14/07/2019 07:10, Hyong Youb Kim (hyonkim):
> > -----Original Message-----
> > From: Thomas Monjalon <thomas@monjalon.net>
> > Sent: Thursday, July 11, 2019 6:21 AM
> [...]
> > Subject: Re: [dpdk-dev] [PATCH] vfio: fix interrupts race condition
> > 
> > 10/07/2019 14:33, David Marchand:
> > > Populating the eventfd in rte_intr_enable in each request to vfio
> > > triggers a reconfiguration of the interrupt handler on the kernel side.
> > > The problem is that rte_intr_enable is often used to re-enable masked
> > > interrupts from drivers interrupt handlers.
> > >
> > > This reconfiguration leaves a window during which a device could send
> > > an interrupt and then the kernel logs this (unsolicited from the kernel
> > > point of view) interrupt:
> > > [158764.159833] do_IRQ: 9.34 No irq handler for vector
> > >
> > > VFIO api makes it possible to set the fd at setup time.
> > > Make use of this and then we only need to ask for masking/unmasking
> > > legacy interrupts and we have nothing to do for MSI/MSIX.
> > >
> > > "rxtx" interrupts are left untouched but are most likely subject to the
> > > same issue.
> > >
> > > Fixes: 5c782b3928b8 ("vfio: interrupts")
> > > Cc: stable@dpdk.org
> > >
> > > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1654824
> > > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > > Tested-by: Shahed Shaikh <shshaikh@marvell.com>
> > 
> > This is a real bug which should be fixed in this release.
> > As the patch is quite big and needs a strong validation,
> > I prefer merging it quickly to give a lot of time before
> > releasing 19.08-rc2.
> > The maintainers of all concerned PMDs are Cc.
> > Please make sure the interrupts are still working well with VFIO.
> > 
> > Applied, thanks
> > 
> 
> [Apologies in advance if email format gets messed up. Forced to use
> outlook for the first time..]
> 
> Hi,
> 
> This commit breaks MSI-X + rxq interrupts. I think others are seeing
> the same error?
> 
> sudo ~/dpdk/examples/l3fwd-power/build/l3fwd-power \
> -c 0x1e -n 4 -w 0000:1a:00.0 --log-level=pmd,debug -- -p 0x1 -P --config "(0,0,2),(0,1,3),(0,2,4)"
> [...]
> EAL: Error enabling MSI-X interrupts for fd 35
> 
> A rough sequence of events goes like this. The above test is using 3
> rxqs (3 interrupts).
> 
> 1. During probe, pci_vfio_setup_interrupts() runs.
> This now does ioctl(VFIO_DEVICE_SET_IRQS) for the 1st efd
> (intr_handle->fd).
> 
> ioctl does:
> - pci_enable_msix(1 vector) because this is the first time enabling
>   interrupts.
> - request_irq(vector 0)
> 
> 2. App configs
> The app sets port_conf.intr_conf.rxq=1, configs 3 rxqs, etc.
> 
> 3. rte_eth_dev_start()
> PMD calls:
> - rte_intr_efd_enable()
>   This creates 3 efds (intr_handle->nb_efd = 3).
> - rte_intr_enable() => vfio_enable_msix()
>   This does ioctl(VFIO_DEVICE_SET_IRQS) for the 3 efds.
> 
> ioctl now needs to request_irq() for vectors 1, 2, 3 for the 3 new
> efds. It does not do another pci_enable_msix() as it has been done
> earlier. Before calling request_irq(), it sees that only 1 vector was
> enabled in earlier pci_enable_msix(), so it fails with EINVAL.
> 
> We would need pci_enable_msix(4 vectors) for this to work
> (intr_handle->fd + 3 efds).
> 
> Prior to this patch, VFIO_DEVICE_SET_IRQS is done only in
> vfio_enable_msix(). So, ioctl ends up doing pci_enable_msix(4 vectors)
> and request_irq() for each of the 4 efds, which completes
> successfully.
> 
> Not an expert in this area.. Perhaps, defer enabling 1st efd
> (intr_handle->fd) until the first invocation of vfio_enable_msix(), so
> it knows the app wants to use 4 vectors in total?
> 
> Also, vfio_disable_msix() looks a bit wrong.
> 
>         irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
>         irq_set.index = VFIO_PCI_MSIX_IRQ_INDEX;
>         irq_set.start = RTE_INTR_VEC_RXTX_OFFSET;
>         irq_set.count = intr_handle->nb_efd;
> 
> This tells vfio-pci to simulate interrupts by triggering efds? To
> free_irq() specific efds, I think we need DATA_EVENTFD and set fd =
> -1.
> 
> flags = DATA_EVENTFD | ACTION_TRIGGER
> data = [fd(-1), fd(-1), ...]
> 
> I have not tested this part myself yet.

Thanks for your detailed report Hyong.
Would you be able to propose a fix?



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH] vfio: fix interrupts race condition
  2019-07-14 11:19       ` Thomas Monjalon
@ 2019-07-15  5:35         ` Jerin Jacob Kollanukkaran
  2019-07-15  7:20           ` Hyong Youb Kim (hyonkim)
  0 siblings, 1 reply; 8+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-15  5:35 UTC (permalink / raw)
  To: Thomas Monjalon, Hyong Youb Kim (hyonkim)
  Cc: David Marchand, dev, anatoly.burakov, alex.williamson,
	maxime.coquelin, stephen, igor.russkikh, pavel.belous,
	allain.legacy, matt.peters, ravi1.kumar, Rasesh Mody,
	Shahed Shaikh, ajit.khaparde, somnath.kotur, hemant.agrawal,
	shreyansh.jain, wenzhuo.lu, mw, mk, gtzalik, evgenys,
	John Daley (johndale),
	qi.z.zhang, xiao.w.wang, xuanziyang2, cloud.wangxiaoyun,
	zhouguoyang, beilei.xing, jingjing.wu, qiming.yang,
	konstantin.ananyev, alejandro.lucero, arybchenko, tiwei.bie,
	zhihong.wang, yongwang, stable, Nithin Kumar Dabilpuram

> > >
> > > This is a real bug which should be fixed in this release.
> > > As the patch is quite big and needs a strong validation, I prefer
> > > merging it quickly to give a lot of time before releasing 19.08-rc2.
> > > The maintainers of all concerned PMDs are Cc.
> > > Please make sure the interrupts are still working well with VFIO.
> > >
> > > Applied, thanks
> > >
> >
> > [Apologies in advance if email format gets messed up. Forced to use
> > outlook for the first time..]
> >
> > Hi,
> >
> > This commit breaks MSI-X + rxq interrupts. I think others are seeing
> > the same error?
> >
> > sudo ~/dpdk/examples/l3fwd-power/build/l3fwd-power \ -c 0x1e -n 4 -w
> > 0000:1a:00.0 --log-level=pmd,debug -- -p 0x1 -P --config
> "(0,0,2),(0,1,3),(0,2,4)"
> > [...]
> > EAL: Error enabling MSI-X interrupts for fd 35
> >
> > A rough sequence of events goes like this. The above test is using 3
> > rxqs (3 interrupts).
> >
> > 1. During probe, pci_vfio_setup_interrupts() runs.
> > This now does ioctl(VFIO_DEVICE_SET_IRQS) for the 1st efd
> > (intr_handle->fd).
> >
> > ioctl does:
> > - pci_enable_msix(1 vector) because this is the first time enabling
> >   interrupts.
> > - request_irq(vector 0)
> >
> > 2. App configs
> > The app sets port_conf.intr_conf.rxq=1, configs 3 rxqs, etc.
> >
> > 3. rte_eth_dev_start()
> > PMD calls:
> > - rte_intr_efd_enable()
> >   This creates 3 efds (intr_handle->nb_efd = 3).
> > - rte_intr_enable() => vfio_enable_msix()
> >   This does ioctl(VFIO_DEVICE_SET_IRQS) for the 3 efds.
> >
> > ioctl now needs to request_irq() for vectors 1, 2, 3 for the 3 new
> > efds. It does not do another pci_enable_msix() as it has been done
> > earlier. Before calling request_irq(), it sees that only 1 vector was
> > enabled in earlier pci_enable_msix(), so it fails with EINVAL.
> >
> > We would need pci_enable_msix(4 vectors) for this to work
> > (intr_handle->fd + 3 efds).
> >
> > Prior to this patch, VFIO_DEVICE_SET_IRQS is done only in
> > vfio_enable_msix(). So, ioctl ends up doing pci_enable_msix(4 vectors)
> > and request_irq() for each of the 4 efds, which completes
> > successfully.
> >
> > Not an expert in this area.. Perhaps, defer enabling 1st efd
> > (intr_handle->fd) until the first invocation of vfio_enable_msix(), so
> > it knows the app wants to use 4 vectors in total?
> >
> > Also, vfio_disable_msix() looks a bit wrong.
> >
> >         irq_set.flags = VFIO_IRQ_SET_DATA_NONE |
> VFIO_IRQ_SET_ACTION_TRIGGER;
> >         irq_set.index = VFIO_PCI_MSIX_IRQ_INDEX;
> >         irq_set.start = RTE_INTR_VEC_RXTX_OFFSET;
> >         irq_set.count = intr_handle->nb_efd;
> >
> > This tells vfio-pci to simulate interrupts by triggering efds? To
> > free_irq() specific efds, I think we need DATA_EVENTFD and set fd =
> > -1.
> >
> > flags = DATA_EVENTFD | ACTION_TRIGGER
> > data = [fd(-1), fd(-1), ...]
> >
> > I have not tested this part myself yet.


We do see the following failure[1] on octeontx2 PMD with this patch.
We will try to find a fix.

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = len;
        irq_set->start = 0;
        irq_set->count = intr_handle->max_intr;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                        VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;

        fd_ptr = (int32_t *)&irq_set->data[0];
        for (i = 0; i < irq_set->count; i++)
                fd_ptr[i] = -1;

        rc = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (rc)
                otx2_err("Failed to set irqs vector rc=%d", rc);
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[1]			
			
				


> 
> Thanks for your detailed report Hyong.
> Would you be able to propose a fix?
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH] vfio: fix interrupts race condition
  2019-07-15  5:35         ` Jerin Jacob Kollanukkaran
@ 2019-07-15  7:20           ` Hyong Youb Kim (hyonkim)
  0 siblings, 0 replies; 8+ messages in thread
From: Hyong Youb Kim (hyonkim) @ 2019-07-15  7:20 UTC (permalink / raw)
  To: Jerin Jacob Kollanukkaran, Thomas Monjalon
  Cc: David Marchand, dev, anatoly.burakov, alex.williamson,
	maxime.coquelin, stephen, igor.russkikh, pavel.belous,
	allain.legacy, matt.peters, ravi1.kumar, Rasesh Mody,
	Shahed Shaikh, ajit.khaparde, somnath.kotur, hemant.agrawal,
	shreyansh.jain, wenzhuo.lu, mw, mk, gtzalik, evgenys,
	John Daley (johndale),
	qi.z.zhang, xiao.w.wang, xuanziyang2, cloud.wangxiaoyun,
	zhouguoyang, beilei.xing, jingjing.wu, qiming.yang,
	konstantin.ananyev, alejandro.lucero, arybchenko, tiwei.bie,
	zhihong.wang, yongwang, stable, Nithin Kumar Dabilpuram

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Monday, July 15, 2019 2:35 PM
[...]
> Subject: RE: [dpdk-dev] [PATCH] vfio: fix interrupts race condition
> 
> > > >
> > > > This is a real bug which should be fixed in this release.
> > > > As the patch is quite big and needs a strong validation, I prefer
> > > > merging it quickly to give a lot of time before releasing 19.08-rc2.
> > > > The maintainers of all concerned PMDs are Cc.
> > > > Please make sure the interrupts are still working well with VFIO.
> > > >
> > > > Applied, thanks
> > > >
> > >
> > > [Apologies in advance if email format gets messed up. Forced to use
> > > outlook for the first time..]
> > >
> > > Hi,
> > >
> > > This commit breaks MSI-X + rxq interrupts. I think others are seeing
> > > the same error?
> > >
> > > sudo ~/dpdk/examples/l3fwd-power/build/l3fwd-power \ -c 0x1e -n 4 -
> w
> > > 0000:1a:00.0 --log-level=pmd,debug -- -p 0x1 -P --config
> > "(0,0,2),(0,1,3),(0,2,4)"
> > > [...]
> > > EAL: Error enabling MSI-X interrupts for fd 35
> > >
> > > A rough sequence of events goes like this. The above test is using 3
> > > rxqs (3 interrupts).
> > >
> > > 1. During probe, pci_vfio_setup_interrupts() runs.
> > > This now does ioctl(VFIO_DEVICE_SET_IRQS) for the 1st efd
> > > (intr_handle->fd).
> > >
> > > ioctl does:
> > > - pci_enable_msix(1 vector) because this is the first time enabling
> > >   interrupts.
> > > - request_irq(vector 0)
> > >
> > > 2. App configs
> > > The app sets port_conf.intr_conf.rxq=1, configs 3 rxqs, etc.
> > >
> > > 3. rte_eth_dev_start()
> > > PMD calls:
> > > - rte_intr_efd_enable()
> > >   This creates 3 efds (intr_handle->nb_efd = 3).
> > > - rte_intr_enable() => vfio_enable_msix()
> > >   This does ioctl(VFIO_DEVICE_SET_IRQS) for the 3 efds.
> > >
> > > ioctl now needs to request_irq() for vectors 1, 2, 3 for the 3 new
> > > efds. It does not do another pci_enable_msix() as it has been done
> > > earlier. Before calling request_irq(), it sees that only 1 vector was
> > > enabled in earlier pci_enable_msix(), so it fails with EINVAL.
> > >
> > > We would need pci_enable_msix(4 vectors) for this to work
> > > (intr_handle->fd + 3 efds).
> > >
> > > Prior to this patch, VFIO_DEVICE_SET_IRQS is done only in
> > > vfio_enable_msix(). So, ioctl ends up doing pci_enable_msix(4 vectors)
> > > and request_irq() for each of the 4 efds, which completes
> > > successfully.
> > >
> > > Not an expert in this area.. Perhaps, defer enabling 1st efd
> > > (intr_handle->fd) until the first invocation of vfio_enable_msix(), so
> > > it knows the app wants to use 4 vectors in total?
> > >
> > > Also, vfio_disable_msix() looks a bit wrong.
> > >
> > >         irq_set.flags = VFIO_IRQ_SET_DATA_NONE |
> > VFIO_IRQ_SET_ACTION_TRIGGER;
> > >         irq_set.index = VFIO_PCI_MSIX_IRQ_INDEX;
> > >         irq_set.start = RTE_INTR_VEC_RXTX_OFFSET;
> > >         irq_set.count = intr_handle->nb_efd;
> > >
> > > This tells vfio-pci to simulate interrupts by triggering efds? To
> > > free_irq() specific efds, I think we need DATA_EVENTFD and set fd =
> > > -1.
> > >
> > > flags = DATA_EVENTFD | ACTION_TRIGGER
> > > data = [fd(-1), fd(-1), ...]
> > >
> > > I have not tested this part myself yet.
> 
> 
> We do see the following failure[1] on octeontx2 PMD with this patch.
> We will try to find a fix.
> 
>         irq_set = (struct vfio_irq_set *)irq_set_buf;
>         irq_set->argsz = len;
>         irq_set->start = 0;
>         irq_set->count = intr_handle->max_intr;
>         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
>                         VFIO_IRQ_SET_ACTION_TRIGGER;
>         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
> 
>         fd_ptr = (int32_t *)&irq_set->data[0];
>         for (i = 0; i < irq_set->count; i++)
>                 fd_ptr[i] = -1;
> 
>         rc = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
>         if (rc)
>                 otx2_err("Failed to set irqs vector rc=%d", rc);
> 	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[1]
> 
> 
> 
> 
> 
> >
> > Thanks for your detailed report Hyong.
> > Would you be able to propose a fix?
> >

Did more digging. Another lengthy email..

It feels tricky to fix the problem properly, as it is getting late..
Here is a recap of the problem, as best I can.

1. INTx:
Both vfio-pci and igb_uio mask the interrupt (i.e. write 1 to
Interrupt Disable in PCI config) and then trigger callback to PMD.
PMD needs to unmask the interrupt when exiting its irq callback. This
is currently achieved by calling rte_intr_enable. Several PMDs use
this pattern.

irq_callback() {
  take_action()
  rte_intr_enable()
}

2. MSI/MSI-X:
No automatic masking done within vfio-pci/igb_uio before triggering
callback to PMD. No need to "re-enable" interrupt by calling
rte_intr_enable. If we forget INTx, we can simply remove
rte_intr_enable from all PMD irq callbacks..

The current vfio commit effectively turns rte_intr_enable into no-op
for MSI/MSI-X (ignore rxq interrupts for now).. So it is equivalent to
removing rte_intr_enable from irq callback.

Prior to this commit, rte_intr_enable ends up re-doing irq setup:
free_irq() -> request_irq(). In the bugzilla issue (qede), an
interrupt arrives in between these, and gets "lost", which causes
something that is waiting for it to timeout, etc. In the bz, note that
INTx has no issues, probably because it is level-triggered.


Now, about this commit (beware unorganized thoughts from me)..

I think David wants to turn rte_intr_enable/disable into unmask/mask,
and avoid free_irq()/request_irq() "post probe". 3 cases to
consider...

1. INTx:
Mission accomplished via ACTION_(UN)MASK.

2. MSI/MSI-X and 1st vector (intr_handle->fd):
rte_intr_enable/disable is now no-op. This is not quite right, since
interrupt remains enabled even after a call to rte_intr_disable. For
MSI/MSI-X, ACTION_(UN)MASK is no-op (unimpl) in vfio-pci, so no way to
mask. It's been that way ever since, as far as I can tell.

Prior to this commit, rte_intr_disable() does free_irq(), so interrupt
does get disabled.

3. MSI/MSI-X and rxq vectors (intr_handle->efds):
Broken as reported earlier.


If we limit scope to only qede, then a variation of David's earlier
patch (self NACKed) would be sufficient.

http://patchwork.dpdk.org/patch/55310/

qede has separate handlers for INTx and MSI/MSI-X. So, just need to
remove rte_intr_enable() from the MSI/MSI-X handler.

--- a/drivers/net/qede/qede_ethdev.c
+++ b/drivers/net/qede/qede_ethdev.c
@@ -261,8 +261,6 @@ qede_interrupt_handler(void *param)
        struct ecore_dev *edev = &qdev->edev;

        qede_interrupt_action(ECORE_LEADING_HWFN(edev));
-       if (rte_intr_enable(eth_dev->intr_handle))
-               DP_ERR(edev, "rte_intr_enable failed\n");
 }


Trying to see if the following works. Do not have a patch yet.
- Revert pci_vfio.c so we do not enable interrupt during probe
- In eal_interrupts.c
  - Add state bit so we know if interrupt is enabled
  - For INTx, if enabled, use David's code to mask/unmask
  - For MSI/MSI-X, if enabled, do not enable again
    (i.e. do not do VFIO_DEVICE_SET_IRQS)


Jerin or others, do not let me stop you. Kinda reluctant to be the
owner of this issue at the moment :-)

Thank you.
-Hyong


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-07-15  7:20 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-02 12:48 [dpdk-dev] [RFC PATCH] vfio: move eventfd/interrupt pairing at setup time David Marchand
2019-07-09 13:40 ` [dpdk-dev] [EXT] " Shahed Shaikh
2019-07-10 12:33 ` [dpdk-dev] [PATCH] vfio: fix interrupts race condition David Marchand
2019-07-10 21:20   ` Thomas Monjalon
2019-07-14  5:10     ` Hyong Youb Kim (hyonkim)
2019-07-14 11:19       ` Thomas Monjalon
2019-07-15  5:35         ` Jerin Jacob Kollanukkaran
2019-07-15  7:20           ` Hyong Youb Kim (hyonkim)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).