Re: [dpdk-dev] [PATCH] vfio: add hotplug support

DPDK patches and discussions
 help / color / mirror / Atom feed

From: "Burakov, Anatoly" <anatoly.burakov@intel.com>
To: Alejandro Lucero <alejandro.lucero@netronome.com>
Cc: "dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [dpdk-dev] [PATCH] vfio: add hotplug support
Date: Wed, 22 Mar 2017 13:49:15 +0000	[thread overview]
Message-ID: <C6ECDF3AB251BE4894318F4E45123697821EF8A5@IRSMSX109.ger.corp.intel.com> (raw)
In-Reply-To: <1489661496-12818-1-git-send-email-alejandro.lucero@netronome.com>

Hi Alejandro,

Just a general comment, I think there's now a rule to not split error messages onto multiple lines
even if going above the 80 char limit, to make grepping easier.

> Current device hotplug is just when using UIO. This patch adds
> same functionality with VFIO.
> 
> It has been validated through tests using IOMMU and also with VFIO
> and no-iommu mode.
> 

<...>

>  /* map VFIO resource prototype */
>  int pci_vfio_map_resource(struct rte_pci_device *dev);
> +int pci_vfio_unmap_resource(struct rte_pci_device *dev);

Probably worth it to fix the comment here?

> @@ -517,7 +521,7 @@
>  		}
> 
>  		/* set bus mastering for the device */
> -		if (pci_vfio_set_bus_master(vfio_dev_fd)) {
> +		if (pci_vfio_set_bus_master(vfio_dev_fd, 1)) {

"true"? Doesn't make a difference, but if you're accepting bool, you might as well
use bool values when calling :)

>  			RTE_LOG(ERR, EAL, "  %s cannot set up bus
> mastering!\n", pci_addr);
>  			close(vfio_dev_fd);
>  			rte_free(vfio_res);
> @@ -535,6 +539,76 @@
>  }
> 
>  int
> +pci_vfio_unmap_resource(struct rte_pci_device *dev)
> +{
> +	char pci_addr[PATH_MAX] = {0};
> +	struct rte_pci_addr *loc = &dev->addr;
> +	int i, ret;
> +	struct mapped_pci_resource *vfio_res = NULL;
> +	struct mapped_pci_res_list *vfio_res_list;
> +
> +	struct pci_map *maps;
> +
> +	/* store PCI address string */
> +	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
> +			loc->domain, loc->bus, loc->devid, loc->function);
> +
> +
> +	if (close(dev->intr_handle.fd) < 0) {
> +		RTE_LOG(INFO, EAL, "Error when Closing eventfd file descriptor for %s\n",

"closing"

> +			pci_addr);
> +		return -1;
> +	}
> +
> +	if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, 0)) {

"false"?

> +		RTE_LOG(ERR, EAL, "  %s cannot unset bus mastering for PCI
> device!\n",
> +				pci_addr);
> +		return -1;
> +	}
> +
> +	ret = vfio_release_device(pci_get_sysfs_path(), pci_addr,
> +				  dev->intr_handle.vfio_dev_fd);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL,
> +			"%s(): cannot release device\n", __func__);
> +		return ret;
> +	}
> +
> +	vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head,
> mapped_pci_res_list);
> +	/* Get vfio_res */
> +	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
> +		if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev-
> >addr)))
> +			continue;
> +		break;
> +	}
> +	/* if we haven't found our tailq entry, something's wrong */
> +	if (vfio_res == NULL) {
> +		RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI
> device!\n",
> +				pci_addr);
> +		return -1;
> +	}
> +
> +	/* map BARs */
> +	maps = vfio_res->maps;

Fix comment

> +
> +	RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
> +		pci_addr);
> +	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
> +
> +		/* TODO: what about the maps offset field? */

You probably need to perform the same calculations mapping function does if you
are unmapping an MSI-X BAR. As far as I understand, the address space is mapped
in whole (i.e. the maps[i].addr covers the whole BAR), but the things that map into
that address space are mapped using two separate mmap calls. So it's probably a
matter of 1) figuring out if maps[i] points to an MSI-X BAR, 2) figuring out if there
is other stuff in the register that can be mapped, and 3) unmapping that. We pretty
much map those unconditionally, so if the device is up, they're guaranteed to be mapped.

> +		if (maps[i].addr) {
> +			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource
> for %s at %p\n",
> +				pci_addr, maps[i].addr);
> +			pci_unmap_resource(maps[i].addr, maps[i].size);
> +		}
> +	}
> +
> +	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
> +
> +	return 0;
> +}
> +
> +int
>  pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
>  		    struct rte_pci_ioport *p)
>  {
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 9377a66..42e0f62 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -68,13 +68,32 @@
>  {
>  	int i;
>  	int vfio_group_fd;
> +	int group_idx = -1;
>  	char filename[PATH_MAX];
> 
>  	/* check if we already have the group descriptor open */
> -	for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
> +	for (i = 0; i < VFIO_MAX_GROUPS; i++)
>  		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
>  			return vfio_cfg.vfio_groups[i].fd;
> 
> +	/* Lets see first if there is room for a new group */
> +	if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
> +		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups
> reached!\n");
> +		return -1;
> +	}
> +
> +	/* Now lets get an index for the new group */
> +	for (i = 0; i < VFIO_MAX_GROUPS; i++)
> +		if (vfio_cfg.vfio_groups[i].group_no == -1) {
> +			group_idx = i;
> +			break;
> +		}
> +
> +	/* This should not happen */
> +	if (group_idx == -1) {
> +		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
> +		return -1;
> +	}
>  	/* if primary, try to open the group */
>  	if (internal_config.process_type == RTE_PROC_PRIMARY) {
>  		/* try regular group format */
> @@ -104,14 +123,9 @@
>  			/* noiommu group found */
>  		}
> 
> -		/* if the fd is valid, create a new group for it */
> -		if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
> -			RTE_LOG(ERR, EAL, "Maximum number of VFIO
> groups reached!\n");
> -			close(vfio_group_fd);
> -			return -1;
> -		}
> -		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no =
> iommu_group_no;
> -		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd =
> vfio_group_fd;
> +		vfio_cfg.vfio_groups[group_idx].group_no =
> iommu_group_no;
> +		vfio_cfg.vfio_groups[group_idx].fd = vfio_group_fd;
> +		vfio_cfg.vfio_active_groups++;
>  		return vfio_group_fd;
>  	}
>  	/* if we're in a secondary process, request group fd from the primary
> @@ -158,14 +172,62 @@
>  	return -1;
>  }
> 
> -static void
> -clear_current_group(void)
> +int
> +clear_group(int vfio_group_fd)
>  {
> -	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
> -	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
> +	int i;
> +	int socket_fd, ret;
> +
> +	if (internal_config.process_type == RTE_PROC_PRIMARY) {
> +
> +		for (i = 0; i < VFIO_MAX_GROUPS; i++)
> +			if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd) {
> +				vfio_cfg.vfio_groups[i].group_no = -1;
> +				vfio_cfg.vfio_groups[i].fd = -1;
> +				vfio_cfg.vfio_active_groups--;
> +				return 0;
> +			}
> +		return -1;
> +	}
> +
> +	/* This is just for SECONDARY processes */
> +	socket_fd = vfio_mp_sync_connect_to_primary();
> +
> +	if (socket_fd < 0) {
> +		RTE_LOG(ERR, EAL, "  cannot connect to primary
> process!\n");
> +		return -1;
> +	}
> +
> +	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) <
> 0) {
> +		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> +		close(socket_fd);
> +		return -1;
> +	}
> +
> +	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
> +		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
> +		close(socket_fd);
> +		return -1;
> +	}
> +
> +	ret = vfio_mp_sync_receive_request(socket_fd);
> +	switch (ret) {
> +	case SOCKET_NO_FD:
> +		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> +		close(socket_fd);
> +		break;
> +	case SOCKET_OK:
> +		close(socket_fd);
> +		return 0;
> +	default:
> +		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
> +		close(socket_fd);

Technically, SOCKET_ERR isn't an "UNKNOWN" reply, so maybe handle it explicitly?

That said, please forgive me my ignorance on hotplug, but does secondary process
need to notify primary about group clearing at all? I mean, doesn't primary process
also get the hotplug event? I would guess secondary process just needs to unmap
the device and that's it, no?

> +	}
> +	return -1;
>  }
> 
> -int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
> +int
> +vfio_setup_device(const char *sysfs_base, const char *dev_addr,
>  		int *vfio_dev_fd, struct vfio_device_info *device_info)
>  {
>  	struct vfio_group_status group_status = {

<...>

> +		 */
> +
> +		RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s
> failed\n",
>  				dev_addr);
> -		return 1;
> +		close(vfio_group_fd);
> +		clear_group(vfio_group_fd);
> +		return -1;

I've had suspicions that it would be a bug, but I tested it on both modern and old kernels (where
ports on the same NIC are assigned same IOMMU group), and it works as it should. I think there wasn't
a "which kernel driver" EAL PCI code back when this was written, so returning 1 was necessary to not fail
when multiple ports shared the same IOMMU group, but only one of them was bound to VFIO.

>  	}
> 
>  	/* test and setup the device */
>  	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
>  	if (ret) {
>  		RTE_LOG(ERR, EAL, "  %s cannot get device info, "
> -				"error %i (%s)\n", dev_addr, errno,
> strerror(errno));

<...>

> @@ -155,6 +154,10 @@ struct vfio_iommu_type {
>  int
>  vfio_get_group_fd(int iommu_group_no);
> 
> +/* remove group fd from internal VFIO group fd array */
> +int
> +clear_group(int vfio_group_fd);
> +
>  /**
>   * Setup vfio_cfg for the device identified by its address. It discovers
>   * the configured I/O MMU groups or sets a new one for the device. If a new
> @@ -165,6 +168,9 @@ struct vfio_iommu_type {
>  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
>  		int *vfio_dev_fd, struct vfio_device_info *device_info);
> 
> +int vfio_release_device(const char *sysfs_base, const char *dev_addr, int
> fd);
> +
> +int vfio_enable(const char *modname);
>  int vfio_enable(const char *modname);

Duplicate vfio_enable declaration?

>  int vfio_is_enabled(const char *modname);
> 
> @@ -175,6 +181,7 @@ int vfio_setup_device(const char *sysfs_base, const
> char *dev_addr,
> 
>  #define SOCKET_REQ_CONTAINER 0x100
>  #define SOCKET_REQ_GROUP 0x200
> +#define SOCKET_CLR_GROUP 0x300
>  #define SOCKET_OK 0x0
>  #define SOCKET_NO_FD 0x1
>  #define SOCKET_ERR 0xFF

Thanks,
Anatoly

next prev parent reply	other threads:[~2017-03-22 13:49 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-03-16 10:51 Alejandro Lucero
2017-03-17 11:07 ` Eelco Chaudron
2017-03-22 13:49 ` Burakov, Anatoly [this message]
2017-03-23 12:08   ` Burakov, Anatoly
2017-03-24 16:39     ` Alejandro Lucero
2017-03-24 16:38   ` Alejandro Lucero
2017-03-24 16:56     ` Burakov, Anatoly

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=C6ECDF3AB251BE4894318F4E45123697821EF8A5@IRSMSX109.ger.corp.intel.com \
    --to=anatoly.burakov@intel.com \
    --cc=alejandro.lucero@netronome.com \
    --cc=dev@dpdk.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).