DPDK patches and discussions
 help / color / mirror / Atom feed
From: "Qiu, Michael" <michael.qiu@intel.com>
To: "Zhou, Danny" <danny.zhou@intel.com>, "dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [dpdk-dev] [RFC PATCH] Enable uio_pci_generic support
Date: Tue, 28 Oct 2014 02:56:52 +0000	[thread overview]
Message-ID: <533710CFB86FA344BFBF2D6802E60286C7C559@SHSMSX101.ccr.corp.intel.com> (raw)
In-Reply-To: <1414432082-30428-1-git-send-email-danny.zhou@intel.com>

Tested-by: Michael Qiu <michael.qiu@intel.com>

在 10/28/2014 1:49 AM, Danny Zhou 写道:
> Linux kernel provides UIO as well as VFIO mechanism to support writing user
> space device driver. Comparing to UIO which is available since 2.6.32 kernel,
> the VFIO is introduced into kernel since version 3.6.0 with better interrupt
> and memory protection (build on top of Intel VT-d technology) supports.
> Basically, UIO and VFIO do two common things below:
> 1) Map PCIe device's I/O memory space to user space driver
> 2) Support device interrupt notification mechanism allows user space
>    driver/application is notified when device interrupt triggers.
>
> To run an DPDK application and make use of VFIO, two in_kernel modules
> vfio and vfio-pci module must be loaded. But to use UIO, a DPDK kernel
> module igb_uio, which was there since DPDK is invented, must be loaded to
> attach to in_kernel uio module. As an possible solution to deprecate igb_uio, 
> this RFC patch leverages uio_pci_generic this in_kernel module to support
> DPDK user space PMD in a generic fashion (like how VFIO works today), to
> partially (DPDK KNI module rte_kni still sits in kernel) remove DPDK dependency
> on GPL code igb_uio in kernel. Tests with uio_pci_generic shows performance
> remains same and LSC interrupts works as before.
>
> Example to bind Network Ports to uio_pci_generic:
> 	modprobe uio
> 	modprobe uio_pci_generic
> 	/* to bind device 08:00.0, to the uio_pci_generic driver */
> 	./tools/dpdk_nic_bind.py -b uio_pci_generic 08:00.0
>
> Note: this RFC patch does not completely remove igb_uio support due to
> igb_uio supports creating maximum number of SR-IOV VFs (Virtual Functions)
> by using max_vfs kernel parameter on older kernels (kernel 3.7.x and below).
> Specifically, igb_uio explicitly calls pci_enable_sriov() to create VFs, while
> it is not invoked in either uio or uio_pci_generic. On kernel 3.8.x
> and above, user can use the standard sysfs to enable VFs. For examples:
>
> #echo $num_vf_enabled > /sys/class/net/$dev/device/sriov_numvfs   //enable VFs
> #echo 0 > /sys/class/net/$dev/device/sriov_numvfs                 //disable VFs
>
> ---
>  lib/librte_eal/common/include/rte_pci.h            |   1 +
>  lib/librte_eal/linuxapp/eal/eal_interrupts.c       |  67 +++++--
>  lib/librte_eal/linuxapp/eal/eal_pci.c              |   2 +-
>  lib/librte_eal/linuxapp/eal/eal_pci_uio.c          | 203 ++++++++++++++++-----
>  .../linuxapp/eal/include/exec-env/rte_interrupts.h |   1 +
>  tools/dpdk_nic_bind.py                             |   2 +-
>  6 files changed, 214 insertions(+), 62 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
> index 66ed793..71ca882 100644
> --- a/lib/librte_eal/common/include/rte_pci.h
> +++ b/lib/librte_eal/common/include/rte_pci.h
> @@ -148,6 +148,7 @@ struct rte_pci_device {
>  	struct rte_pci_id id;                   /**< PCI ID. */
>  	struct rte_pci_resource mem_resource[PCI_MAX_RESOURCE];   /**< PCI Memory Resource */
>  	struct rte_intr_handle intr_handle;     /**< Interrupt handle */
> +	char driver_name[BUFSIZ];               /**< driver name */
>  	const struct rte_pci_driver *driver;    /**< Associated driver */
>  	uint16_t max_vfs;                       /**< sriov enable if not zero */
>  	int numa_node;                          /**< NUMA node connection */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> index dc2668a..f2da778 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> @@ -361,6 +361,53 @@ vfio_disable_msix(struct rte_intr_handle *intr_handle) {
>  }
>  #endif
>  
> +static int
> +uio_intr_enable(struct rte_intr_handle *intr_handle)
> +{
> +	unsigned char command_high;
> +
> +	/* use config file descriptor for uio_pci_generic */
> +	if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> +		RTE_LOG(ERR, EAL,
> +			"Error reading interrupts status for fd %d\n",
> +			intr_handle->uio_cfg_fd);
> +		return -1;
> +	}
> +	/* disable interrupts */
> +	command_high |= 0x4;
> +	if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> +		RTE_LOG(ERR, EAL,
> +			"Error disabling interrupts for fd %d\n",
> +			intr_handle->uio_cfg_fd);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +uio_intr_disable(struct rte_intr_handle *intr_handle)
> +{
> +	unsigned char command_high;
> +
> +	if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> +		RTE_LOG(ERR, EAL,
> +			"Error reading interrupts status for fd %d\n",
> +			intr_handle->uio_cfg_fd);
> +		return -1;
> +	}
> +	/* enable interrupts */
> +	command_high &= ~0x4;
> +	if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> +		RTE_LOG(ERR, EAL,
> +			"Error enabling interrupts for fd %d\n",
> +			intr_handle->uio_cfg_fd);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
>  int
>  rte_intr_callback_register(struct rte_intr_handle *intr_handle,
>  			rte_intr_callback_fn cb, void *cb_arg)
> @@ -500,20 +547,14 @@ rte_intr_callback_unregister(struct rte_intr_handle *intr_handle,
>  int
>  rte_intr_enable(struct rte_intr_handle *intr_handle)
>  {
> -	const int value = 1;
> -
> -	if (!intr_handle || intr_handle->fd < 0)
> +	if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd)
>  		return -1;
>  
>  	switch (intr_handle->type){
>  	/* write to the uio fd to enable the interrupt */
>  	case RTE_INTR_HANDLE_UIO:
> -		if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
> -			RTE_LOG(ERR, EAL,
> -				"Error enabling interrupts for fd %d\n",
> -							intr_handle->fd);
> +		if (uio_intr_enable(intr_handle))
>  			return -1;
> -		}
>  		break;
>  	/* not used at this moment */
>  	case RTE_INTR_HANDLE_ALARM:
> @@ -546,20 +587,14 @@ rte_intr_enable(struct rte_intr_handle *intr_handle)
>  int
>  rte_intr_disable(struct rte_intr_handle *intr_handle)
>  {
> -	const int value = 0;
> -
> -	if (!intr_handle || intr_handle->fd < 0)
> +	if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd)
>  		return -1;
>  
>  	switch (intr_handle->type){
>  	/* write to the uio fd to disable the interrupt */
>  	case RTE_INTR_HANDLE_UIO:
> -		if (write(intr_handle->fd, &value, sizeof(value)) < 0){
> -			RTE_LOG(ERR, EAL,
> -				"Error disabling interrupts for fd %d\n",
> -							intr_handle->fd);
> +		if (uio_intr_disable(intr_handle))
>  			return -1;
> -		}
>  		break;
>  	/* not used at this moment */
>  	case RTE_INTR_HANDLE_ALARM:
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
> index 5fe3961..5c73908 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
> @@ -498,7 +498,7 @@ pci_map_device(struct rte_pci_device *dev)
>  			return ret;
>  	}
>  #endif
> -	/* map resources for devices that use igb_uio */
> +	/* map resources for devices that use uio_pci_generic or igb_uio */
>  	if (!mapped) {
>  		ret = pci_uio_map_resource(dev);
>  		if (ret != 0)
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> index 7e62266..4b159e2 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> @@ -35,6 +35,7 @@
>  #include <fcntl.h>
>  #include <dirent.h>
>  #include <sys/stat.h>
> +#include <linux/pci_regs.h>
>  
>  #include <rte_log.h>
>  #include <rte_pci.h>
> @@ -46,72 +47,130 @@
>  #include "eal_filesystem.h"
>  #include "eal_pci_init.h"
>  
> +#define IORESOURCE_MEM        0x00000200
> +#define UIO_PCI_GENERIC_NAME  "uio_pci_generic"
> +
>  static int pci_parse_sysfs_value(const char *filename, uint64_t *val);
>  
>  
>  #define OFF_MAX              ((uint64_t)(off_t)-1)
>  static int
> -pci_uio_get_mappings(const char *devname, struct pci_map maps[], int nb_maps)
> +pci_uio_get_mappings(struct rte_pci_device *dev, const char *devname,
> +				struct pci_map maps[], int nb_maps)
>  {
> -	int i;
> +	struct rte_pci_addr *loc = &dev->addr;
> +	int i = 0;
>  	char dirname[PATH_MAX];
>  	char filename[PATH_MAX];
>  	uint64_t offset, size;
> +	unsigned long long start_addr, end_addr, flags;
> +	FILE *f;
>  
> -	for (i = 0; i != nb_maps; i++) {
> -
> -		/* check if map directory exists */
> -		snprintf(dirname, sizeof(dirname),
> -			"%s/maps/map%u", devname, i);
> -
> -		if (access(dirname, F_OK) != 0)
> -			break;
> -
> -		/* get mapping offset */
> +	if (strncmp(dev->driver_name, UIO_PCI_GENERIC_NAME,
> +			strlen(UIO_PCI_GENERIC_NAME)) == 0) {
>  		snprintf(filename, sizeof(filename),
> -			"%s/offset", dirname);
> -		if (pci_parse_sysfs_value(filename, &offset) < 0) {
> -			RTE_LOG(ERR, EAL,
> -				"%s(): cannot parse offset of %s\n",
> -				__func__, dirname);
> -			return -1;
> -		}
> +			SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource",
> +			loc->domain, loc->bus, loc->devid, loc->function);
>  
> -		/* get mapping size */
> -		snprintf(filename, sizeof(filename),
> -			"%s/size", dirname);
> -		if (pci_parse_sysfs_value(filename, &size) < 0) {
> +		f = fopen(filename, "r");
> +		if (f == NULL) {
>  			RTE_LOG(ERR, EAL,
> -				"%s(): cannot parse size of %s\n",
> -				__func__, dirname);
> +			"%s(): cannot open sysfs %s\n",
> +			__func__, filename);
>  			return -1;
>  		}
>  
> -		/* get mapping physical address */
> -		snprintf(filename, sizeof(filename),
> -			"%s/addr", dirname);
> -		if (pci_parse_sysfs_value(filename, &maps[i].phaddr) < 0) {
> -			RTE_LOG(ERR, EAL,
> -				"%s(): cannot parse addr of %s\n",
> -				__func__, dirname);
> -			return -1;
> +		while (fscanf(f, "%llx %llx %llx", &start_addr,
> +				&end_addr, &flags) == 3 && i < nb_maps) {
> +			if (flags & IORESOURCE_MEM) {
> +				maps[i].offset = 0x0;
> +				maps[i].size = end_addr - start_addr + 1;
> +				maps[i].phaddr = start_addr;
> +				i++;
> +			}
>  		}
> +		fclose(f);
> +	} else {
> +		for (i = 0; i != nb_maps; i++) {
> +			/* check if map directory exists */
> +			snprintf(dirname, sizeof(dirname),
> +				"%s/maps/map%u", devname, i);
> +
> +			if (access(dirname, F_OK) != 0)
> +				break;
> +
> +			/* get mapping offset */
> +			snprintf(filename, sizeof(filename),
> +				"%s/offset", dirname);
> +			if (pci_parse_sysfs_value(filename, &offset) < 0) {
> +				RTE_LOG(ERR, EAL,
> +					"%s(): cannot parse offset of %s\n",
> +					__func__, dirname);
> +				return -1;
> +			}
>  
> -		if ((offset > OFF_MAX) || (size > SIZE_MAX)) {
> -			RTE_LOG(ERR, EAL,
> -				"%s(): offset/size exceed system max value\n",
> -				__func__);
> -			return -1;
> -		}
> +			/* get mapping size */
> +			snprintf(filename, sizeof(filename),
> +				"%s/size", dirname);
> +			if (pci_parse_sysfs_value(filename, &size) < 0) {
> +				RTE_LOG(ERR, EAL,
> +					"%s(): cannot parse size of %s\n",
> +					__func__, dirname);
> +				return -1;
> +			}
>  
> -		maps[i].offset = offset;
> -		maps[i].size = size;
> +			/* get mapping physical address */
> +			snprintf(filename, sizeof(filename),
> +				"%s/addr", dirname);
> +			if (pci_parse_sysfs_value(filename,
> +						&maps[i].phaddr) < 0) {
> +				RTE_LOG(ERR, EAL,
> +					"%s(): cannot parse addr of %s\n",
> +					__func__, dirname);
> +				return -1;
> +			}
> +
> +			if ((offset > OFF_MAX) || (size > SIZE_MAX)) {
> +				RTE_LOG(ERR, EAL,
> +					"%s(): offset/size exceed system max value\n",
> +					__func__);
> +				return -1;
> +			}
> +
> +			maps[i].offset = offset;
> +			maps[i].size = size;
> +		}
>  	}
>  
>  	return i;
>  }
>  
>  static int
> +pci_uio_set_bus_master(int dev_fd)
> +{
> +	uint16_t reg;
> +	int ret;
> +
> +	ret = pread(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
> +	if (ret != sizeof(reg)) {
> +		RTE_LOG(ERR, EAL,
> +			"Cannot read command from PCI config space!\n");
> +		return -1;
> +	}
> +
> +	reg |= PCI_COMMAND_MASTER;
> +
> +	ret = pwrite(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
> +	if (ret != sizeof(reg)) {
> +		RTE_LOG(ERR, EAL,
> +			"Cannot write command to PCI config space!\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
>  pci_uio_map_secondary(struct rte_pci_device *dev)
>  {
>  	int fd, i;
> @@ -210,6 +269,10 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
>  	struct dirent *e;
>  	DIR *dir;
>  	char dirname[PATH_MAX];
> +	FILE *f;
> +	char fullpath[PATH_MAX];
> +	char buf[BUFSIZ];
> +	char *s;
>  
>  	/* depending on kernel version, uio can be located in uio/uioX
>  	 * or uio:uioX */
> @@ -265,6 +328,30 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
>  	if (e == NULL)
>  		return -1;
>  
> +	/* remember driver name of uio device */
> +	snprintf(fullpath, sizeof(fullpath),
> +			SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/uio/%s/name" ,
> +			loc->domain, loc->bus, loc->devid,
> +			loc->function, e->d_name);
> +
> +	f = fopen(fullpath, "r");
> +	if (f == NULL) {
> +		RTE_LOG(ERR, EAL,
> +			"%s(): cannot open sysfs to get driver name\n",
> +			__func__);
> +		return -1;
> +	}
> +	s = fgets(buf, sizeof(buf), f);
> +	if (s == NULL) {
> +		RTE_LOG(ERR, EAL,
> +			"%s(): cannot read sysfs to get driver name\n",
> +			__func__);
> +		fclose(f);
> +		return -1;
> +	}
> +	strncpy(dev->driver_name, buf, sizeof(buf));
> +	fclose(f);
> +
>  	/* create uio device if we've been asked to */
>  	if (internal_config.create_uio_dev &&
>  			pci_mknod_uio_dev(dstbuf, uio_num) < 0)
> @@ -279,6 +366,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>  {
>  	int i, j;
>  	char dirname[PATH_MAX];
> +	char cfgname[PATH_MAX];
>  	char devname[PATH_MAX]; /* contains the /dev/uioX */
>  	void *mapaddr;
>  	int uio_num;
> @@ -291,6 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>  	struct pci_map *maps;
>  
>  	dev->intr_handle.fd = -1;
> +	dev->intr_handle.uio_cfg_fd = -1;
>  	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
>  
>  	/* secondary processes - use already recorded details */
> @@ -315,6 +404,33 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>  	}
>  	dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
>  
> +	snprintf(cfgname, sizeof(cfgname),
> +			"/sys/class/uio/uio%u/device/config", uio_num);
> +	dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
> +	if (dev->intr_handle.uio_cfg_fd < 0) {
> +		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> +			cfgname, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* handle Misc. stuff for uio_pci_generic  */
> +	if (strncmp(dev->driver_name, UIO_PCI_GENERIC_NAME,
> +			strlen(UIO_PCI_GENERIC_NAME)) == 0) {
> +		/* update devname for uio_pci_generic  */
> +		snprintf(devname, sizeof(devname),
> +			SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource%d",
> +			loc->domain, loc->bus, loc->devid, loc->function, 0);
> +
> +		/* set bus master that is not done by uio_pci_generic */
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +			if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
> +				RTE_LOG(ERR, EAL,
> +					"Cannot set up bus mastering!\n");
> +				return -1;
> +			}
> +		}
> +	}
> +
>  	/* allocate the mapping details for secondary processes*/
>  	uio_res = rte_zmalloc("UIO_RES", sizeof(*uio_res), 0);
>  	if (uio_res == NULL) {
> @@ -327,13 +443,12 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>  	memcpy(&uio_res->pci_addr, &dev->addr, sizeof(uio_res->pci_addr));
>  
>  	/* collect info about device mappings */
> -	nb_maps = pci_uio_get_mappings(dirname, uio_res->maps,
> -				       RTE_DIM(uio_res->maps));
> +	nb_maps = pci_uio_get_mappings(dev, dirname, uio_res->maps,
> +						RTE_DIM(uio_res->maps));
>  	if (nb_maps < 0) {
>  		rte_free(uio_res);
>  		return nb_maps;
>  	}
> -
>  	uio_res->nb_maps = nb_maps;
>  
>  	/* Map all BARs */
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> index 23eafd9..151323d 100644
> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> @@ -51,6 +51,7 @@ enum rte_intr_handle_type {
>  /** Handle for interrupts. */
>  struct rte_intr_handle {
>  	int vfio_dev_fd;                 /**< VFIO device file descriptor */
> +	int uio_cfg_fd;                  /**< UIO config file descriptor */
>  	int fd;                          /**< file descriptor */
>  	enum rte_intr_handle_type type;  /**< handle type */
>  };
> diff --git a/tools/dpdk_nic_bind.py b/tools/dpdk_nic_bind.py
> index 812b6a1..2483056 100755
> --- a/tools/dpdk_nic_bind.py
> +++ b/tools/dpdk_nic_bind.py
> @@ -43,7 +43,7 @@ ETHERNET_CLASS = "0200"
>  # Each device within this is itself a dictionary of device properties
>  devices = {}
>  # list of supported DPDK drivers
> -dpdk_drivers = [ "igb_uio", "vfio-pci" ]
> +dpdk_drivers = [ "igb_uio", "vfio-pci", "uio_pci_generic" ]
>  
>  # command-line arg flags
>  b_flag = None


  reply	other threads:[~2014-10-28  2:50 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-10-27 17:48 Danny Zhou
2014-10-28  2:56 ` Qiu, Michael [this message]
2014-11-17 11:52 ` Bruce Richardson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=533710CFB86FA344BFBF2D6802E60286C7C559@SHSMSX101.ccr.corp.intel.com \
    --to=michael.qiu@intel.com \
    --cc=danny.zhou@intel.com \
    --cc=dev@dpdk.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).