From: "Burakov, Anatoly" <anatoly.burakov@intel.com>
To: Takeshi Yoshimura <t.yoshimura8869@gmail.com>
Cc: dev@dpdk.org, Jerin Jacob <jerin.jacob@caviumnetworks.com>,
thomas@monjalon.net
Subject: Re: [dpdk-dev] [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it
Date: Tue, 31 Jul 2018 12:24:02 +0100 [thread overview]
Message-ID: <664ca8bf-d5e2-50b4-8e28-e5b15d82cb80@intel.com> (raw)
In-Reply-To: <CALBPOTXJraeQScnJs6t0Tk=m+gPB1cghnBXVs_Zu_LXBfJrmQA@mail.gmail.com>
On 31-Jul-18 10:38 AM, Takeshi Yoshimura wrote:
> 2018-07-30 20:17 GMT+09:00 Anatoly Burakov <anatoly.burakov@intel.com>:
>> Currently, DPDK will skip mapping some areas (or even an entire BAR)
>> if MSI-X happens to be in it but is smaller than page address.
>>
>> Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
>> as a capability flag. Capability flags themselves are also only
>> supported since kernel 4.6 [2].
>>
>> This commit will introduce support for checking VFIO capabilities,
>> and will use it to check if we are allowed to map BARs with MSI-X
>> tables in them, along with backwards compatibility for older
>> kernels, including a workaround for a variable rename in VFIO
>> region info structure [3].
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6
>>
>> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279
>>
>> [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
>> linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4
>>
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>> ---
>> drivers/bus/pci/linux/pci_vfio.c | 127 ++++++++++++++++++++---
>> lib/librte_eal/common/include/rte_vfio.h | 26 +++++
>> 2 files changed, 140 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
>> index 686386d6a..e7765ee11 100644
>> --- a/drivers/bus/pci/linux/pci_vfio.c
>> +++ b/drivers/bus/pci/linux/pci_vfio.c
>> @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
>> return 0;
>> }
>>
>> +/*
>> + * region info may contain capability headers, so we need to keep reallocating
>> + * the memory until we match allocated memory size with argsz.
>> + */
>> +static int
>> +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
>> + int region)
>> +{
>> + struct vfio_region_info *ri;
>> + size_t argsz = sizeof(*ri);
>> + int ret;
>> +
>> + ri = malloc(sizeof(*ri));
>> + if (ri == NULL) {
>> + RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
>> + return -1;
>> + }
>> +again:
>> + memset(ri, 0, argsz);
>> + ri->argsz = argsz;
>> + ri->index = region;
>> +
>> + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info);
>> + if (ret) {
>> + free(ri);
>> + return ret;
>> + }
>> + if (ri->argsz != argsz) {
>> + argsz = ri->argsz;
>> + ri = realloc(ri, argsz);
>> +
>> + if (ri == NULL) {
>> + RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
>> + return -1;
>> + }
>> + goto again;
>> + }
>> + *info = ri;
>> +
>> + return 0;
>> +}
>> +
>> +static struct vfio_info_cap_header *
>> +pci_vfio_info_cap(struct vfio_region_info *info, int cap)
>> +{
>> + struct vfio_info_cap_header *h;
>> + size_t offset;
>> +
>> + if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
>> + /* VFIO info does not advertise capabilities */
>> + return NULL;
>> + }
>> +
>> + offset = VFIO_CAP_OFFSET(info);
>> + while (offset != 0) {
>> + h = RTE_PTR_ADD(info, offset);
>> + if (h->id == cap)
>> + return h;
>> + offset = h->next;
>> + }
>> + return NULL;
>> +}
>> +
>> +static int
>> +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
>> +{
>> + struct vfio_region_info *info;
>> + int ret;
>> +
>> + ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
>> + if (ret < 0)
>> + return -1;
>> +
>> + ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
>> +
>> + /* cleanup */
>> + free(info);
>> +
>> + return ret;
>> +}
>> +
>> +
>> static int
>> pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>> {
>> @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
>> if (ret < 0) {
>> RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n",
>> pci_addr);
>> - goto err_vfio_dev_fd;
>> + goto err_vfio_res;
>> + }
>> + /* if we found our MSI-X BAR region, check if we can mmap it */
>> + if (vfio_res->msix_table.bar_index != -1) {
>> + int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
>> + vfio_res->msix_table.bar_index);
>> + if (ret < 0) {
>> + RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
>> + goto err_vfio_res;
>> + } else if (ret != 0) {
>> + /* we can map it, so we don't care where it is */
>> + RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
>> + vfio_res->msix_table.bar_index = -1;
>> + }
>> }
>>
>> for (i = 0; i < (int) vfio_res->nb_maps; i++) {
>> - struct vfio_region_info reg = { .argsz = sizeof(reg) };
>> + struct vfio_region_info *reg;
>> void *bar_addr;
>>
>> - reg.index = i;
>> -
>> - ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®);
>> - if (ret) {
>> + ret = pci_vfio_get_region_info(vfio_dev_fd, ®, i);
>> + if (ret < 0) {
>> RTE_LOG(ERR, EAL, " %s cannot get device region info "
>> - "error %i (%s)\n", pci_addr, errno, strerror(errno));
>> + "error %i (%s)\n", pci_addr, errno,
>> + strerror(errno));
>> goto err_vfio_res;
>> }
>>
>> /* chk for io port region */
>> ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
>> - if (ret < 0)
>> + if (ret < 0) {
>> + free(reg);
>> goto err_vfio_res;
>> - else if (ret) {
>> + } else if (ret) {
>> RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
>> i);
>> + free(reg);
>> continue;
>> }
>>
>> /* skip non-mmapable BARs */
>> - if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
>> + if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
>> + free(reg);
>> continue;
>> + }
>>
>> /* try mapping somewhere close to the end of hugepages */
>> if (pci_map_addr == NULL)
>> pci_map_addr = pci_find_max_end_va();
>>
>> bar_addr = pci_map_addr;
>> - pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
>> + pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
>>
>> maps[i].addr = bar_addr;
>> - maps[i].offset = reg.offset;
>> - maps[i].size = reg.size;
>> + maps[i].offset = reg->offset;
>> + maps[i].size = reg->size;
>> maps[i].path = NULL; /* vfio doesn't have per-resource paths */
>>
>> ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
>> if (ret < 0) {
>> RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
>> pci_addr, i, strerror(errno));
>> + free(reg);
>> goto err_vfio_res;
>> }
>>
>> dev->mem_resource[i].addr = maps[i].addr;
>> +
>> + free(reg);
>> }
>>
>> if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
>> diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
>> index 5ca13fcce..f6617e004 100644
>> --- a/lib/librte_eal/common/include/rte_vfio.h
>> +++ b/lib/librte_eal/common/include/rte_vfio.h
>> @@ -14,6 +14,8 @@
>> extern "C" {
>> #endif
>>
>> +#include <stdint.h>
>> +
>> /*
>> * determine if VFIO is present on the system
>> */
>> @@ -44,6 +46,30 @@ extern "C" {
>> #define RTE_VFIO_NOIOMMU 8
>> #endif
>>
>> +/*
>> + * capabilities are only supported on kernel 4.6+. there were also some API
>> + * changes as well, so add a macro to get cap offset.
>> + */
>> +#ifdef VFIO_REGION_INFO_FLAG_CAPS
>> +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
>> +#define VFIO_CAP_OFFSET(x) (x->cap_offset)
>> +#else
>> +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
>> +#define VFIO_CAP_OFFSET(x) (x->resv)
>> +struct vfio_info_cap_header {
>> + uint16_t id;
>> + uint16_t version;
>> + uint32_t next;
>> +};
>> +#endif
>> +
>> +/* kernels 4.16+ can map BAR containing MSI-X table */
>> +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
>> +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
>> +#else
>> +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
>> +#endif
>> +
>> #else /* not VFIO_PRESENT */
>>
>> /* we don't need an actual definition, only pointer is used */
>> --
>> 2.17.1
>
> Hi Anatoly,
> I have tested the patch on our ppc64le machine, but the
> ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info) in
> pci_vfio_get_region_info() failed.
> This may be an issue of ppc64le VFIO implementation. Let me investigate more...
>
> Thanks,
> Takeshi
>
Hi Takeshi, i think there's a bug in my patch. I'll submit a v2.
--
Thanks,
Anatoly
next prev parent reply other threads:[~2018-07-31 11:24 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-07-30 11:17 Anatoly Burakov
2018-07-31 9:38 ` Takeshi Yoshimura
2018-07-31 11:24 ` Burakov, Anatoly [this message]
2018-07-31 11:28 ` [dpdk-dev] [PATCH 18.11 v2] " Anatoly Burakov
2018-08-02 6:47 ` Takeshi Yoshimura
2018-08-02 8:17 ` Burakov, Anatoly
2018-09-20 13:11 ` [dpdk-dev] [PATCH v3] " Anatoly Burakov
2018-10-03 22:40 ` Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=664ca8bf-d5e2-50b4-8e28-e5b15d82cb80@intel.com \
--to=anatoly.burakov@intel.com \
--cc=dev@dpdk.org \
--cc=jerin.jacob@caviumnetworks.com \
--cc=t.yoshimura8869@gmail.com \
--cc=thomas@monjalon.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).