From: "Qiu, Michael" <michael.qiu@intel.com>
To: "Zhou, Danny" <danny.zhou@intel.com>, "dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [dpdk-dev] [RFC PATCH] Enable uio_pci_generic support
Date: Tue, 28 Oct 2014 02:56:52 +0000 [thread overview]
Message-ID: <533710CFB86FA344BFBF2D6802E60286C7C559@SHSMSX101.ccr.corp.intel.com> (raw)
In-Reply-To: <1414432082-30428-1-git-send-email-danny.zhou@intel.com>
Tested-by: Michael Qiu <michael.qiu@intel.com>
在 10/28/2014 1:49 AM, Danny Zhou 写道:
> Linux kernel provides UIO as well as VFIO mechanism to support writing user
> space device driver. Comparing to UIO which is available since 2.6.32 kernel,
> the VFIO is introduced into kernel since version 3.6.0 with better interrupt
> and memory protection (build on top of Intel VT-d technology) supports.
> Basically, UIO and VFIO do two common things below:
> 1) Map PCIe device's I/O memory space to user space driver
> 2) Support device interrupt notification mechanism allows user space
> driver/application is notified when device interrupt triggers.
>
> To run an DPDK application and make use of VFIO, two in_kernel modules
> vfio and vfio-pci module must be loaded. But to use UIO, a DPDK kernel
> module igb_uio, which was there since DPDK is invented, must be loaded to
> attach to in_kernel uio module. As an possible solution to deprecate igb_uio,
> this RFC patch leverages uio_pci_generic this in_kernel module to support
> DPDK user space PMD in a generic fashion (like how VFIO works today), to
> partially (DPDK KNI module rte_kni still sits in kernel) remove DPDK dependency
> on GPL code igb_uio in kernel. Tests with uio_pci_generic shows performance
> remains same and LSC interrupts works as before.
>
> Example to bind Network Ports to uio_pci_generic:
> modprobe uio
> modprobe uio_pci_generic
> /* to bind device 08:00.0, to the uio_pci_generic driver */
> ./tools/dpdk_nic_bind.py -b uio_pci_generic 08:00.0
>
> Note: this RFC patch does not completely remove igb_uio support due to
> igb_uio supports creating maximum number of SR-IOV VFs (Virtual Functions)
> by using max_vfs kernel parameter on older kernels (kernel 3.7.x and below).
> Specifically, igb_uio explicitly calls pci_enable_sriov() to create VFs, while
> it is not invoked in either uio or uio_pci_generic. On kernel 3.8.x
> and above, user can use the standard sysfs to enable VFs. For examples:
>
> #echo $num_vf_enabled > /sys/class/net/$dev/device/sriov_numvfs //enable VFs
> #echo 0 > /sys/class/net/$dev/device/sriov_numvfs //disable VFs
>
> ---
> lib/librte_eal/common/include/rte_pci.h | 1 +
> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 67 +++++--
> lib/librte_eal/linuxapp/eal/eal_pci.c | 2 +-
> lib/librte_eal/linuxapp/eal/eal_pci_uio.c | 203 ++++++++++++++++-----
> .../linuxapp/eal/include/exec-env/rte_interrupts.h | 1 +
> tools/dpdk_nic_bind.py | 2 +-
> 6 files changed, 214 insertions(+), 62 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
> index 66ed793..71ca882 100644
> --- a/lib/librte_eal/common/include/rte_pci.h
> +++ b/lib/librte_eal/common/include/rte_pci.h
> @@ -148,6 +148,7 @@ struct rte_pci_device {
> struct rte_pci_id id; /**< PCI ID. */
> struct rte_pci_resource mem_resource[PCI_MAX_RESOURCE]; /**< PCI Memory Resource */
> struct rte_intr_handle intr_handle; /**< Interrupt handle */
> + char driver_name[BUFSIZ]; /**< driver name */
> const struct rte_pci_driver *driver; /**< Associated driver */
> uint16_t max_vfs; /**< sriov enable if not zero */
> int numa_node; /**< NUMA node connection */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> index dc2668a..f2da778 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> @@ -361,6 +361,53 @@ vfio_disable_msix(struct rte_intr_handle *intr_handle) {
> }
> #endif
>
> +static int
> +uio_intr_enable(struct rte_intr_handle *intr_handle)
> +{
> + unsigned char command_high;
> +
> + /* use config file descriptor for uio_pci_generic */
> + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> + RTE_LOG(ERR, EAL,
> + "Error reading interrupts status for fd %d\n",
> + intr_handle->uio_cfg_fd);
> + return -1;
> + }
> + /* disable interrupts */
> + command_high |= 0x4;
> + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> + RTE_LOG(ERR, EAL,
> + "Error disabling interrupts for fd %d\n",
> + intr_handle->uio_cfg_fd);
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int
> +uio_intr_disable(struct rte_intr_handle *intr_handle)
> +{
> + unsigned char command_high;
> +
> + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> + RTE_LOG(ERR, EAL,
> + "Error reading interrupts status for fd %d\n",
> + intr_handle->uio_cfg_fd);
> + return -1;
> + }
> + /* enable interrupts */
> + command_high &= ~0x4;
> + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> + RTE_LOG(ERR, EAL,
> + "Error enabling interrupts for fd %d\n",
> + intr_handle->uio_cfg_fd);
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> int
> rte_intr_callback_register(struct rte_intr_handle *intr_handle,
> rte_intr_callback_fn cb, void *cb_arg)
> @@ -500,20 +547,14 @@ rte_intr_callback_unregister(struct rte_intr_handle *intr_handle,
> int
> rte_intr_enable(struct rte_intr_handle *intr_handle)
> {
> - const int value = 1;
> -
> - if (!intr_handle || intr_handle->fd < 0)
> + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd)
> return -1;
>
> switch (intr_handle->type){
> /* write to the uio fd to enable the interrupt */
> case RTE_INTR_HANDLE_UIO:
> - if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
> - RTE_LOG(ERR, EAL,
> - "Error enabling interrupts for fd %d\n",
> - intr_handle->fd);
> + if (uio_intr_enable(intr_handle))
> return -1;
> - }
> break;
> /* not used at this moment */
> case RTE_INTR_HANDLE_ALARM:
> @@ -546,20 +587,14 @@ rte_intr_enable(struct rte_intr_handle *intr_handle)
> int
> rte_intr_disable(struct rte_intr_handle *intr_handle)
> {
> - const int value = 0;
> -
> - if (!intr_handle || intr_handle->fd < 0)
> + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd)
> return -1;
>
> switch (intr_handle->type){
> /* write to the uio fd to disable the interrupt */
> case RTE_INTR_HANDLE_UIO:
> - if (write(intr_handle->fd, &value, sizeof(value)) < 0){
> - RTE_LOG(ERR, EAL,
> - "Error disabling interrupts for fd %d\n",
> - intr_handle->fd);
> + if (uio_intr_disable(intr_handle))
> return -1;
> - }
> break;
> /* not used at this moment */
> case RTE_INTR_HANDLE_ALARM:
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
> index 5fe3961..5c73908 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
> @@ -498,7 +498,7 @@ pci_map_device(struct rte_pci_device *dev)
> return ret;
> }
> #endif
> - /* map resources for devices that use igb_uio */
> + /* map resources for devices that use uio_pci_generic or igb_uio */
> if (!mapped) {
> ret = pci_uio_map_resource(dev);
> if (ret != 0)
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> index 7e62266..4b159e2 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> @@ -35,6 +35,7 @@
> #include <fcntl.h>
> #include <dirent.h>
> #include <sys/stat.h>
> +#include <linux/pci_regs.h>
>
> #include <rte_log.h>
> #include <rte_pci.h>
> @@ -46,72 +47,130 @@
> #include "eal_filesystem.h"
> #include "eal_pci_init.h"
>
> +#define IORESOURCE_MEM 0x00000200
> +#define UIO_PCI_GENERIC_NAME "uio_pci_generic"
> +
> static int pci_parse_sysfs_value(const char *filename, uint64_t *val);
>
>
> #define OFF_MAX ((uint64_t)(off_t)-1)
> static int
> -pci_uio_get_mappings(const char *devname, struct pci_map maps[], int nb_maps)
> +pci_uio_get_mappings(struct rte_pci_device *dev, const char *devname,
> + struct pci_map maps[], int nb_maps)
> {
> - int i;
> + struct rte_pci_addr *loc = &dev->addr;
> + int i = 0;
> char dirname[PATH_MAX];
> char filename[PATH_MAX];
> uint64_t offset, size;
> + unsigned long long start_addr, end_addr, flags;
> + FILE *f;
>
> - for (i = 0; i != nb_maps; i++) {
> -
> - /* check if map directory exists */
> - snprintf(dirname, sizeof(dirname),
> - "%s/maps/map%u", devname, i);
> -
> - if (access(dirname, F_OK) != 0)
> - break;
> -
> - /* get mapping offset */
> + if (strncmp(dev->driver_name, UIO_PCI_GENERIC_NAME,
> + strlen(UIO_PCI_GENERIC_NAME)) == 0) {
> snprintf(filename, sizeof(filename),
> - "%s/offset", dirname);
> - if (pci_parse_sysfs_value(filename, &offset) < 0) {
> - RTE_LOG(ERR, EAL,
> - "%s(): cannot parse offset of %s\n",
> - __func__, dirname);
> - return -1;
> - }
> + SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource",
> + loc->domain, loc->bus, loc->devid, loc->function);
>
> - /* get mapping size */
> - snprintf(filename, sizeof(filename),
> - "%s/size", dirname);
> - if (pci_parse_sysfs_value(filename, &size) < 0) {
> + f = fopen(filename, "r");
> + if (f == NULL) {
> RTE_LOG(ERR, EAL,
> - "%s(): cannot parse size of %s\n",
> - __func__, dirname);
> + "%s(): cannot open sysfs %s\n",
> + __func__, filename);
> return -1;
> }
>
> - /* get mapping physical address */
> - snprintf(filename, sizeof(filename),
> - "%s/addr", dirname);
> - if (pci_parse_sysfs_value(filename, &maps[i].phaddr) < 0) {
> - RTE_LOG(ERR, EAL,
> - "%s(): cannot parse addr of %s\n",
> - __func__, dirname);
> - return -1;
> + while (fscanf(f, "%llx %llx %llx", &start_addr,
> + &end_addr, &flags) == 3 && i < nb_maps) {
> + if (flags & IORESOURCE_MEM) {
> + maps[i].offset = 0x0;
> + maps[i].size = end_addr - start_addr + 1;
> + maps[i].phaddr = start_addr;
> + i++;
> + }
> }
> + fclose(f);
> + } else {
> + for (i = 0; i != nb_maps; i++) {
> + /* check if map directory exists */
> + snprintf(dirname, sizeof(dirname),
> + "%s/maps/map%u", devname, i);
> +
> + if (access(dirname, F_OK) != 0)
> + break;
> +
> + /* get mapping offset */
> + snprintf(filename, sizeof(filename),
> + "%s/offset", dirname);
> + if (pci_parse_sysfs_value(filename, &offset) < 0) {
> + RTE_LOG(ERR, EAL,
> + "%s(): cannot parse offset of %s\n",
> + __func__, dirname);
> + return -1;
> + }
>
> - if ((offset > OFF_MAX) || (size > SIZE_MAX)) {
> - RTE_LOG(ERR, EAL,
> - "%s(): offset/size exceed system max value\n",
> - __func__);
> - return -1;
> - }
> + /* get mapping size */
> + snprintf(filename, sizeof(filename),
> + "%s/size", dirname);
> + if (pci_parse_sysfs_value(filename, &size) < 0) {
> + RTE_LOG(ERR, EAL,
> + "%s(): cannot parse size of %s\n",
> + __func__, dirname);
> + return -1;
> + }
>
> - maps[i].offset = offset;
> - maps[i].size = size;
> + /* get mapping physical address */
> + snprintf(filename, sizeof(filename),
> + "%s/addr", dirname);
> + if (pci_parse_sysfs_value(filename,
> + &maps[i].phaddr) < 0) {
> + RTE_LOG(ERR, EAL,
> + "%s(): cannot parse addr of %s\n",
> + __func__, dirname);
> + return -1;
> + }
> +
> + if ((offset > OFF_MAX) || (size > SIZE_MAX)) {
> + RTE_LOG(ERR, EAL,
> + "%s(): offset/size exceed system max value\n",
> + __func__);
> + return -1;
> + }
> +
> + maps[i].offset = offset;
> + maps[i].size = size;
> + }
> }
>
> return i;
> }
>
> static int
> +pci_uio_set_bus_master(int dev_fd)
> +{
> + uint16_t reg;
> + int ret;
> +
> + ret = pread(dev_fd, ®, sizeof(reg), PCI_COMMAND);
> + if (ret != sizeof(reg)) {
> + RTE_LOG(ERR, EAL,
> + "Cannot read command from PCI config space!\n");
> + return -1;
> + }
> +
> + reg |= PCI_COMMAND_MASTER;
> +
> + ret = pwrite(dev_fd, ®, sizeof(reg), PCI_COMMAND);
> + if (ret != sizeof(reg)) {
> + RTE_LOG(ERR, EAL,
> + "Cannot write command to PCI config space!\n");
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int
> pci_uio_map_secondary(struct rte_pci_device *dev)
> {
> int fd, i;
> @@ -210,6 +269,10 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
> struct dirent *e;
> DIR *dir;
> char dirname[PATH_MAX];
> + FILE *f;
> + char fullpath[PATH_MAX];
> + char buf[BUFSIZ];
> + char *s;
>
> /* depending on kernel version, uio can be located in uio/uioX
> * or uio:uioX */
> @@ -265,6 +328,30 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
> if (e == NULL)
> return -1;
>
> + /* remember driver name of uio device */
> + snprintf(fullpath, sizeof(fullpath),
> + SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/uio/%s/name" ,
> + loc->domain, loc->bus, loc->devid,
> + loc->function, e->d_name);
> +
> + f = fopen(fullpath, "r");
> + if (f == NULL) {
> + RTE_LOG(ERR, EAL,
> + "%s(): cannot open sysfs to get driver name\n",
> + __func__);
> + return -1;
> + }
> + s = fgets(buf, sizeof(buf), f);
> + if (s == NULL) {
> + RTE_LOG(ERR, EAL,
> + "%s(): cannot read sysfs to get driver name\n",
> + __func__);
> + fclose(f);
> + return -1;
> + }
> + strncpy(dev->driver_name, buf, sizeof(buf));
> + fclose(f);
> +
> /* create uio device if we've been asked to */
> if (internal_config.create_uio_dev &&
> pci_mknod_uio_dev(dstbuf, uio_num) < 0)
> @@ -279,6 +366,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
> {
> int i, j;
> char dirname[PATH_MAX];
> + char cfgname[PATH_MAX];
> char devname[PATH_MAX]; /* contains the /dev/uioX */
> void *mapaddr;
> int uio_num;
> @@ -291,6 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
> struct pci_map *maps;
>
> dev->intr_handle.fd = -1;
> + dev->intr_handle.uio_cfg_fd = -1;
> dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
>
> /* secondary processes - use already recorded details */
> @@ -315,6 +404,33 @@ pci_uio_map_resource(struct rte_pci_device *dev)
> }
> dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
>
> + snprintf(cfgname, sizeof(cfgname),
> + "/sys/class/uio/uio%u/device/config", uio_num);
> + dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
> + if (dev->intr_handle.uio_cfg_fd < 0) {
> + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> + cfgname, strerror(errno));
> + return -1;
> + }
> +
> + /* handle Misc. stuff for uio_pci_generic */
> + if (strncmp(dev->driver_name, UIO_PCI_GENERIC_NAME,
> + strlen(UIO_PCI_GENERIC_NAME)) == 0) {
> + /* update devname for uio_pci_generic */
> + snprintf(devname, sizeof(devname),
> + SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource%d",
> + loc->domain, loc->bus, loc->devid, loc->function, 0);
> +
> + /* set bus master that is not done by uio_pci_generic */
> + if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> + if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
> + RTE_LOG(ERR, EAL,
> + "Cannot set up bus mastering!\n");
> + return -1;
> + }
> + }
> + }
> +
> /* allocate the mapping details for secondary processes*/
> uio_res = rte_zmalloc("UIO_RES", sizeof(*uio_res), 0);
> if (uio_res == NULL) {
> @@ -327,13 +443,12 @@ pci_uio_map_resource(struct rte_pci_device *dev)
> memcpy(&uio_res->pci_addr, &dev->addr, sizeof(uio_res->pci_addr));
>
> /* collect info about device mappings */
> - nb_maps = pci_uio_get_mappings(dirname, uio_res->maps,
> - RTE_DIM(uio_res->maps));
> + nb_maps = pci_uio_get_mappings(dev, dirname, uio_res->maps,
> + RTE_DIM(uio_res->maps));
> if (nb_maps < 0) {
> rte_free(uio_res);
> return nb_maps;
> }
> -
> uio_res->nb_maps = nb_maps;
>
> /* Map all BARs */
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> index 23eafd9..151323d 100644
> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> @@ -51,6 +51,7 @@ enum rte_intr_handle_type {
> /** Handle for interrupts. */
> struct rte_intr_handle {
> int vfio_dev_fd; /**< VFIO device file descriptor */
> + int uio_cfg_fd; /**< UIO config file descriptor */
> int fd; /**< file descriptor */
> enum rte_intr_handle_type type; /**< handle type */
> };
> diff --git a/tools/dpdk_nic_bind.py b/tools/dpdk_nic_bind.py
> index 812b6a1..2483056 100755
> --- a/tools/dpdk_nic_bind.py
> +++ b/tools/dpdk_nic_bind.py
> @@ -43,7 +43,7 @@ ETHERNET_CLASS = "0200"
> # Each device within this is itself a dictionary of device properties
> devices = {}
> # list of supported DPDK drivers
> -dpdk_drivers = [ "igb_uio", "vfio-pci" ]
> +dpdk_drivers = [ "igb_uio", "vfio-pci", "uio_pci_generic" ]
>
> # command-line arg flags
> b_flag = None
next prev parent reply other threads:[~2014-10-28 2:50 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-10-27 17:48 Danny Zhou
2014-10-28 2:56 ` Qiu, Michael [this message]
2014-11-17 11:52 ` Bruce Richardson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=533710CFB86FA344BFBF2D6802E60286C7C559@SHSMSX101.ccr.corp.intel.com \
--to=michael.qiu@intel.com \
--cc=danny.zhou@intel.com \
--cc=dev@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).