* [dpdk-dev] [PATCH 01/12] eal: Make rte_eal_using_phys_addrs work sooner
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class Ben Walker
` (12 subsequent siblings)
13 siblings, 1 reply; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
This function only returned the correct answer after
a call to initialize the memory subsystem. Make it work
prior to that.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: I8f3c5128fbf5da884a956bbcc72c5a13564825d5
---
lib/librte_eal/linux/eal/eal_memory.c | 63 ++++++++++++---------------
1 file changed, 28 insertions(+), 35 deletions(-)
diff --git a/lib/librte_eal/linux/eal/eal_memory.c b/lib/librte_eal/linux/eal/eal_memory.c
index 416dad898..0c07bb946 100644
--- a/lib/librte_eal/linux/eal/eal_memory.c
+++ b/lib/librte_eal/linux/eal/eal_memory.c
@@ -66,34 +66,8 @@
* zone as well as a physical contiguous zone.
*/
-static bool phys_addrs_available = true;
-
#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
-static void
-test_phys_addrs_available(void)
-{
- uint64_t tmp = 0;
- phys_addr_t physaddr;
-
- if (!rte_eal_has_hugepages()) {
- RTE_LOG(ERR, EAL,
- "Started without hugepages support, physical addresses not available\n");
- phys_addrs_available = false;
- return;
- }
-
- physaddr = rte_mem_virt2phy(&tmp);
- if (physaddr == RTE_BAD_PHYS_ADDR) {
- if (rte_eal_iova_mode() == RTE_IOVA_PA)
- RTE_LOG(ERR, EAL,
- "Cannot obtain physical addresses: %s. "
- "Only vfio will function.\n",
- strerror(errno));
- phys_addrs_available = false;
- }
-}
-
/*
* Get physical address of any mapped virtual address in the current process.
*/
@@ -107,7 +81,7 @@ rte_mem_virt2phy(const void *virtaddr)
off_t offset;
/* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
- if (!phys_addrs_available)
+ if (!rte_eal_using_phys_addrs())
return RTE_BAD_IOVA;
/* standard page size */
@@ -1336,8 +1310,6 @@ eal_legacy_hugepage_init(void)
int nr_hugefiles, nr_hugepages = 0;
void *addr;
- test_phys_addrs_available();
-
memset(used_hp, 0, sizeof(used_hp));
/* get pointer to global configuration */
@@ -1516,7 +1488,7 @@ eal_legacy_hugepage_init(void)
continue;
}
- if (phys_addrs_available &&
+ if (rte_eal_using_phys_addrs() &&
rte_eal_iova_mode() != RTE_IOVA_VA) {
/* find physical addresses for each hugepage */
if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
@@ -1735,8 +1707,6 @@ eal_hugepage_init(void)
uint64_t memory[RTE_MAX_NUMA_NODES];
int hp_sz_idx, socket_id;
- test_phys_addrs_available();
-
memset(used_hp, 0, sizeof(used_hp));
for (hp_sz_idx = 0;
@@ -1879,8 +1849,6 @@ eal_legacy_hugepage_attach(void)
"into secondary processes\n");
}
- test_phys_addrs_available();
-
fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
if (fd_hugepage < 0) {
RTE_LOG(ERR, EAL, "Could not open %s\n",
@@ -2020,7 +1988,32 @@ rte_eal_hugepage_attach(void)
int
rte_eal_using_phys_addrs(void)
{
- return phys_addrs_available;
+ static int using_phys_addrs = -1;
+ uint64_t tmp = 0;
+ phys_addr_t physaddr;
+
+ if (using_phys_addrs != -1)
+ return using_phys_addrs;
+
+ /* Set the default to 1 */
+ using_phys_addrs = 1;
+
+ if (!rte_eal_has_hugepages()) {
+ RTE_LOG(ERR, EAL,
+ "Started without hugepages support, physical addresses not available\n");
+ using_phys_addrs = 0;
+ return using_phys_addrs;
+ }
+
+ physaddr = rte_mem_virt2phy(&tmp);
+ if (physaddr == RTE_BAD_PHYS_ADDR) {
+ if (rte_eal_iova_mode() == RTE_IOVA_PA)
+ RTE_LOG(ERR, EAL,
+ "Cannot obtain physical addresses. Only vfio will function.\n");
+ using_phys_addrs = 0;
+ }
+
+ return using_phys_addrs;
}
static int __rte_unused
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 01/12] eal: Make rte_eal_using_phys_addrs work sooner
2019-05-30 17:48 ` [dpdk-dev] [PATCH 01/12] eal: Make rte_eal_using_phys_addrs work sooner Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class Ben Walker
` (10 more replies)
0 siblings, 11 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
This function only returned the correct answer after
a call to initialize the memory subsystem. Make it work
prior to that.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
lib/librte_eal/linux/eal/eal_memory.c | 63 ++++++++++++---------------
1 file changed, 28 insertions(+), 35 deletions(-)
diff --git a/lib/librte_eal/linux/eal/eal_memory.c b/lib/librte_eal/linux/eal/eal_memory.c
index 416dad898..0c07bb946 100644
--- a/lib/librte_eal/linux/eal/eal_memory.c
+++ b/lib/librte_eal/linux/eal/eal_memory.c
@@ -66,34 +66,8 @@
* zone as well as a physical contiguous zone.
*/
-static bool phys_addrs_available = true;
-
#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
-static void
-test_phys_addrs_available(void)
-{
- uint64_t tmp = 0;
- phys_addr_t physaddr;
-
- if (!rte_eal_has_hugepages()) {
- RTE_LOG(ERR, EAL,
- "Started without hugepages support, physical addresses not available\n");
- phys_addrs_available = false;
- return;
- }
-
- physaddr = rte_mem_virt2phy(&tmp);
- if (physaddr == RTE_BAD_PHYS_ADDR) {
- if (rte_eal_iova_mode() == RTE_IOVA_PA)
- RTE_LOG(ERR, EAL,
- "Cannot obtain physical addresses: %s. "
- "Only vfio will function.\n",
- strerror(errno));
- phys_addrs_available = false;
- }
-}
-
/*
* Get physical address of any mapped virtual address in the current process.
*/
@@ -107,7 +81,7 @@ rte_mem_virt2phy(const void *virtaddr)
off_t offset;
/* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
- if (!phys_addrs_available)
+ if (!rte_eal_using_phys_addrs())
return RTE_BAD_IOVA;
/* standard page size */
@@ -1336,8 +1310,6 @@ eal_legacy_hugepage_init(void)
int nr_hugefiles, nr_hugepages = 0;
void *addr;
- test_phys_addrs_available();
-
memset(used_hp, 0, sizeof(used_hp));
/* get pointer to global configuration */
@@ -1516,7 +1488,7 @@ eal_legacy_hugepage_init(void)
continue;
}
- if (phys_addrs_available &&
+ if (rte_eal_using_phys_addrs() &&
rte_eal_iova_mode() != RTE_IOVA_VA) {
/* find physical addresses for each hugepage */
if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
@@ -1735,8 +1707,6 @@ eal_hugepage_init(void)
uint64_t memory[RTE_MAX_NUMA_NODES];
int hp_sz_idx, socket_id;
- test_phys_addrs_available();
-
memset(used_hp, 0, sizeof(used_hp));
for (hp_sz_idx = 0;
@@ -1879,8 +1849,6 @@ eal_legacy_hugepage_attach(void)
"into secondary processes\n");
}
- test_phys_addrs_available();
-
fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
if (fd_hugepage < 0) {
RTE_LOG(ERR, EAL, "Could not open %s\n",
@@ -2020,7 +1988,32 @@ rte_eal_hugepage_attach(void)
int
rte_eal_using_phys_addrs(void)
{
- return phys_addrs_available;
+ static int using_phys_addrs = -1;
+ uint64_t tmp = 0;
+ phys_addr_t physaddr;
+
+ if (using_phys_addrs != -1)
+ return using_phys_addrs;
+
+ /* Set the default to 1 */
+ using_phys_addrs = 1;
+
+ if (!rte_eal_has_hugepages()) {
+ RTE_LOG(ERR, EAL,
+ "Started without hugepages support, physical addresses not available\n");
+ using_phys_addrs = 0;
+ return using_phys_addrs;
+ }
+
+ physaddr = rte_mem_virt2phy(&tmp);
+ if (physaddr == RTE_BAD_PHYS_ADDR) {
+ if (rte_eal_iova_mode() == RTE_IOVA_PA)
+ RTE_LOG(ERR, EAL,
+ "Cannot obtain physical addresses. Only vfio will function.\n");
+ using_phys_addrs = 0;
+ }
+
+ return using_phys_addrs;
}
static int __rte_unused
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 03/12] eal/pci: Rework loops in rte_pci_get_iommu_class Ben Walker
` (9 subsequent siblings)
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
This is in preparation for future simplifications. The
functions are simply inlined for now.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 176 +++++++++++++++---------------------
1 file changed, 71 insertions(+), 105 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index c99d523f0..d3177916a 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -497,86 +497,6 @@ rte_pci_scan(void)
return -1;
}
-/*
- * Is pci device bound to any kdrv
- */
-static inline int
-pci_one_device_is_bound(void)
-{
- struct rte_pci_device *dev = NULL;
- int ret = 0;
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (dev->kdrv == RTE_KDRV_UNKNOWN ||
- dev->kdrv == RTE_KDRV_NONE) {
- continue;
- } else {
- ret = 1;
- break;
- }
- }
- return ret;
-}
-
-/*
- * Any one of the device bound to uio
- */
-static inline int
-pci_one_device_bound_uio(void)
-{
- struct rte_pci_device *dev = NULL;
- struct rte_devargs *devargs;
- int need_check;
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- devargs = dev->device.devargs;
-
- need_check = 0;
- switch (rte_pci_bus.bus.conf.scan_mode) {
- case RTE_BUS_SCAN_WHITELIST:
- if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
- need_check = 1;
- break;
- case RTE_BUS_SCAN_UNDEFINED:
- case RTE_BUS_SCAN_BLACKLIST:
- if (devargs == NULL ||
- devargs->policy != RTE_DEV_BLACKLISTED)
- need_check = 1;
- break;
- }
-
- if (!need_check)
- continue;
-
- if (dev->kdrv == RTE_KDRV_IGB_UIO ||
- dev->kdrv == RTE_KDRV_UIO_GENERIC) {
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * Any one of the device has iova as va
- */
-static inline int
-pci_one_device_has_iova_va(void)
-{
- struct rte_pci_device *dev = NULL;
- struct rte_pci_driver *drv = NULL;
-
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (drv && drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (dev->kdrv == RTE_KDRV_VFIO &&
- rte_pci_match(drv, dev))
- return 1;
- }
- }
- }
- return 0;
-}
-
#if defined(RTE_ARCH_X86)
static bool
pci_one_device_iommu_support_va(struct rte_pci_device *dev)
@@ -641,14 +561,76 @@ pci_one_device_iommu_support_va(__rte_unused struct rte_pci_device *dev)
#endif
/*
- * All devices IOMMUs support VA as IOVA
+ * Get iommu class of PCI devices on the bus.
*/
-static bool
-pci_devices_iommu_support_va(void)
+enum rte_iova_mode
+rte_pci_get_iommu_class(void)
{
+ bool is_bound = false;
+ bool is_vfio_noiommu_enabled = true;
+ bool has_iova_va = false;
+ bool is_bound_uio = false;
+ bool iommu_no_va = false;
+ bool break_out;
+ bool need_check;
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
+ struct rte_devargs *devargs;
+
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (dev->kdrv == RTE_KDRV_UNKNOWN ||
+ dev->kdrv == RTE_KDRV_NONE) {
+ continue;
+ } else {
+ is_bound = true;
+ break;
+ }
+ }
+ if (!is_bound)
+ return RTE_IOVA_DC;
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
+ if (drv && drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (dev->kdrv == RTE_KDRV_VFIO &&
+ rte_pci_match(drv, dev)) {
+ has_iova_va = true;
+ break;
+ }
+ }
+
+ if (has_iova_va)
+ break;
+ }
+ }
+
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ devargs = dev->device.devargs;
+
+ need_check = false;
+ switch (rte_pci_bus.bus.conf.scan_mode) {
+ case RTE_BUS_SCAN_WHITELIST:
+ if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
+ need_check = true;
+ break;
+ case RTE_BUS_SCAN_UNDEFINED:
+ case RTE_BUS_SCAN_BLACKLIST:
+ if (devargs == NULL ||
+ devargs->policy != RTE_DEV_BLACKLISTED)
+ need_check = true;
+ break;
+ }
+
+ if (!need_check)
+ continue;
+
+ if (dev->kdrv == RTE_KDRV_IGB_UIO ||
+ dev->kdrv == RTE_KDRV_UIO_GENERIC) {
+ is_bound_uio = true;
+ }
+ }
+
+ break_out = false;
FOREACH_DRIVER_ON_PCIBUS(drv) {
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (!rte_pci_match(drv, dev))
@@ -657,31 +639,15 @@ pci_devices_iommu_support_va(void)
* just one PCI device needs to be checked out because
* the IOMMU hardware is the same for all of them.
*/
- return pci_one_device_iommu_support_va(dev);
+ iommu_no_va = !pci_one_device_iommu_support_va(dev);
+ break_out = true;
+ break;
}
- }
- return true;
-}
-/*
- * Get iommu class of PCI devices on the bus.
- */
-enum rte_iova_mode
-rte_pci_get_iommu_class(void)
-{
- bool is_bound;
- bool is_vfio_noiommu_enabled = true;
- bool has_iova_va;
- bool is_bound_uio;
- bool iommu_no_va;
-
- is_bound = pci_one_device_is_bound();
- if (!is_bound)
- return RTE_IOVA_DC;
+ if (break_out)
+ break;
+ }
- has_iova_va = pci_one_device_has_iova_va();
- is_bound_uio = pci_one_device_bound_uio();
- iommu_no_va = !pci_devices_iommu_support_va();
#ifdef VFIO_PRESENT
is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
true : false;
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 03/12] eal/pci: Rework loops in rte_pci_get_iommu_class
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 04/12] eal/pci: Collapse two " Ben Walker
` (8 subsequent siblings)
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
Make all of the loops first iterate over devices, then
drivers. This is in preparation for combining them
into a single loop.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index d3177916a..70815e4f0 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -589,10 +589,10 @@ rte_pci_get_iommu_class(void)
if (!is_bound)
return RTE_IOVA_DC;
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (drv && drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (dev->kdrv == RTE_KDRV_VFIO &&
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (dev->kdrv == RTE_KDRV_VFIO) {
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
+ if (drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA &&
rte_pci_match(drv, dev)) {
has_iova_va = true;
break;
@@ -631,8 +631,8 @@ rte_pci_get_iommu_class(void)
}
break_out = false;
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- FOREACH_DEVICE_ON_PCIBUS(dev) {
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
continue;
/*
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 04/12] eal/pci: Collapse two loops in rte_pci_get_iommu_class
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 03/12] eal/pci: Rework loops in rte_pci_get_iommu_class Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 05/12] eal/pci: Add function pci_ignore_device Ben Walker
` (7 subsequent siblings)
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
Two of these loops easily collapse into a single loop.
This sets the stage for future simplifications.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 31 ++++++++++---------------------
1 file changed, 10 insertions(+), 21 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 70815e4f0..29ffae77f 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -571,7 +571,6 @@ rte_pci_get_iommu_class(void)
bool has_iova_va = false;
bool is_bound_uio = false;
bool iommu_no_va = false;
- bool break_out;
bool need_check;
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
@@ -592,8 +591,16 @@ rte_pci_get_iommu_class(void)
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (dev->kdrv == RTE_KDRV_VFIO) {
FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA &&
- rte_pci_match(drv, dev)) {
+ if (!rte_pci_match(drv, dev))
+ continue;
+
+ /*
+ * just one PCI device needs to be checked out because
+ * the IOMMU hardware is the same for all of them.
+ */
+ iommu_no_va = !pci_one_device_iommu_support_va(dev);
+
+ if (drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
has_iova_va = true;
break;
}
@@ -630,24 +637,6 @@ rte_pci_get_iommu_class(void)
}
}
- break_out = false;
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (!rte_pci_match(drv, dev))
- continue;
- /*
- * just one PCI device needs to be checked out because
- * the IOMMU hardware is the same for all of them.
- */
- iommu_no_va = !pci_one_device_iommu_support_va(dev);
- break_out = true;
- break;
- }
-
- if (break_out)
- break;
- }
-
#ifdef VFIO_PRESENT
is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
true : false;
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 05/12] eal/pci: Add function pci_ignore_device
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
` (2 preceding siblings ...)
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 04/12] eal/pci: Collapse two " Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 06/12] eal/pci: Correctly test whitelist/blacklist in rte_pci_get_iommu_class Ben Walker
` (6 subsequent siblings)
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
This performs a check for whether the device should be ignored
due to whitelist or blacklist. This check eventually needs
to apply to all of the other checks in rte_pci_get_iommu_class.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 44 +++++++++++++++++++++----------------
1 file changed, 25 insertions(+), 19 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 29ffae77f..6d311f4e0 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -560,6 +560,29 @@ pci_one_device_iommu_support_va(__rte_unused struct rte_pci_device *dev)
}
#endif
+static bool
+pci_ignore_device(struct rte_pci_device *dev)
+{
+ struct rte_devargs *devargs;
+
+ devargs = dev->device.devargs;
+
+ switch (rte_pci_bus.bus.conf.scan_mode) {
+ case RTE_BUS_SCAN_WHITELIST:
+ if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
+ return false;
+ break;
+ case RTE_BUS_SCAN_UNDEFINED:
+ case RTE_BUS_SCAN_BLACKLIST:
+ if (devargs == NULL ||
+ devargs->policy != RTE_DEV_BLACKLISTED)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
/*
* Get iommu class of PCI devices on the bus.
*/
@@ -571,10 +594,9 @@ rte_pci_get_iommu_class(void)
bool has_iova_va = false;
bool is_bound_uio = false;
bool iommu_no_va = false;
- bool need_check;
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
- struct rte_devargs *devargs;
+
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (dev->kdrv == RTE_KDRV_UNKNOWN ||
@@ -612,23 +634,7 @@ rte_pci_get_iommu_class(void)
}
FOREACH_DEVICE_ON_PCIBUS(dev) {
- devargs = dev->device.devargs;
-
- need_check = false;
- switch (rte_pci_bus.bus.conf.scan_mode) {
- case RTE_BUS_SCAN_WHITELIST:
- if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
- need_check = true;
- break;
- case RTE_BUS_SCAN_UNDEFINED:
- case RTE_BUS_SCAN_BLACKLIST:
- if (devargs == NULL ||
- devargs->policy != RTE_DEV_BLACKLISTED)
- need_check = true;
- break;
- }
-
- if (!need_check)
+ if (pci_ignore_device(dev))
continue;
if (dev->kdrv == RTE_KDRV_IGB_UIO ||
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 06/12] eal/pci: Correctly test whitelist/blacklist in rte_pci_get_iommu_class
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
` (3 preceding siblings ...)
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 05/12] eal/pci: Add function pci_ignore_device Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 07/12] eal/pci: Reverse if check " Ben Walker
` (5 subsequent siblings)
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
All of the checks should respect the white and black lists.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 6d311f4e0..d2464d2ae 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -597,8 +597,10 @@ rte_pci_get_iommu_class(void)
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
-
FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (pci_ignore_device(dev))
+ continue;
+
if (dev->kdrv == RTE_KDRV_UNKNOWN ||
dev->kdrv == RTE_KDRV_NONE) {
continue;
@@ -611,6 +613,9 @@ rte_pci_get_iommu_class(void)
return RTE_IOVA_DC;
FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (pci_ignore_device(dev))
+ continue;
+
if (dev->kdrv == RTE_KDRV_VFIO) {
FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 07/12] eal/pci: Reverse if check in rte_pci_get_iommu_class
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
` (4 preceding siblings ...)
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 06/12] eal/pci: Correctly test whitelist/blacklist in rte_pci_get_iommu_class Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 08/12] eal/pci: Collapse loops " Ben Walker
` (4 subsequent siblings)
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
It's simpler to reverse the if statement here, especially
with an upcoming simplification.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index d2464d2ae..549d61e74 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -601,10 +601,8 @@ rte_pci_get_iommu_class(void)
if (pci_ignore_device(dev))
continue;
- if (dev->kdrv == RTE_KDRV_UNKNOWN ||
- dev->kdrv == RTE_KDRV_NONE) {
- continue;
- } else {
+ if (dev->kdrv != RTE_KDRV_UNKNOWN &&
+ dev->kdrv != RTE_KDRV_NONE) {
is_bound = true;
break;
}
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 08/12] eal/pci: Collapse loops in rte_pci_get_iommu_class
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
` (5 preceding siblings ...)
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 07/12] eal/pci: Reverse if check " Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 09/12] eal/pci: Simplify rte_pci_get_iommu class by using a switch Ben Walker
` (3 subsequent siblings)
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
The three loops can now be easily combined into one.
This is slightly less efficient than before because it
doesn't break out early. But that can be addressed
later.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 19 +++----------------
1 file changed, 3 insertions(+), 16 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 549d61e74..765c473e8 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -604,15 +604,7 @@ rte_pci_get_iommu_class(void)
if (dev->kdrv != RTE_KDRV_UNKNOWN &&
dev->kdrv != RTE_KDRV_NONE) {
is_bound = true;
- break;
}
- }
- if (!is_bound)
- return RTE_IOVA_DC;
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (pci_ignore_device(dev))
- continue;
if (dev->kdrv == RTE_KDRV_VFIO) {
FOREACH_DRIVER_ON_PCIBUS(drv) {
@@ -630,15 +622,7 @@ rte_pci_get_iommu_class(void)
break;
}
}
-
- if (has_iova_va)
- break;
}
- }
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (pci_ignore_device(dev))
- continue;
if (dev->kdrv == RTE_KDRV_IGB_UIO ||
dev->kdrv == RTE_KDRV_UIO_GENERIC) {
@@ -646,6 +630,9 @@ rte_pci_get_iommu_class(void)
}
}
+ if (!is_bound)
+ return RTE_IOVA_DC;
+
#ifdef VFIO_PRESENT
is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
true : false;
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 09/12] eal/pci: Simplify rte_pci_get_iommu class by using a switch
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
` (6 preceding siblings ...)
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 08/12] eal/pci: Collapse loops " Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 10/12] eal/pci: Finding a device bound to UIO does not force PA Ben Walker
` (2 subsequent siblings)
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
Take several independent if statements and convert to a
switch statement.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 765c473e8..5e61f46c8 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -601,12 +601,12 @@ rte_pci_get_iommu_class(void)
if (pci_ignore_device(dev))
continue;
- if (dev->kdrv != RTE_KDRV_UNKNOWN &&
- dev->kdrv != RTE_KDRV_NONE) {
+ switch (dev->kdrv) {
+ case RTE_KDRV_UNKNOWN:
+ case RTE_KDRV_NONE:
+ break;
+ case RTE_KDRV_VFIO:
is_bound = true;
- }
-
- if (dev->kdrv == RTE_KDRV_VFIO) {
FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
continue;
@@ -622,11 +622,14 @@ rte_pci_get_iommu_class(void)
break;
}
}
- }
-
- if (dev->kdrv == RTE_KDRV_IGB_UIO ||
- dev->kdrv == RTE_KDRV_UIO_GENERIC) {
+ break;
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ case RTE_KDRV_NIC_UIO:
+ is_bound = true;
is_bound_uio = true;
+ break;
+
}
}
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 10/12] eal/pci: Finding a device bound to UIO does not force PA
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
` (7 preceding siblings ...)
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 09/12] eal/pci: Simplify rte_pci_get_iommu class by using a switch Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 11/12] eal/pci: rte_pci_get_iommu_class handles no drivers Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 12/12] eal: If bus can't decide PA or VA, try to access PA Ben Walker
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
If a device is found that is bound to the UIO driver,
only force IOVA_PA if there is a driver registered to use it.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 5e61f46c8..a71c66380 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -627,7 +627,13 @@ rte_pci_get_iommu_class(void)
case RTE_KDRV_UIO_GENERIC:
case RTE_KDRV_NIC_UIO:
is_bound = true;
- is_bound_uio = true;
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
+ if (!rte_pci_match(drv, dev))
+ continue;
+
+ is_bound_uio = true;
+ break;
+ }
break;
}
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 11/12] eal/pci: rte_pci_get_iommu_class handles no drivers
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
` (8 preceding siblings ...)
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 10/12] eal/pci: Finding a device bound to UIO does not force PA Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 12/12] eal: If bus can't decide PA or VA, try to access PA Ben Walker
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
In the case where no drivers are registered with the system,
rte_pci_get_iommu_class should return RTE_IOVA_DC.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
drivers/bus/pci/linux/pci.c | 91 ++++++++++++++++++++-----------------
1 file changed, 50 insertions(+), 41 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index a71c66380..60424932e 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -589,49 +589,80 @@ pci_ignore_device(struct rte_pci_device *dev)
enum rte_iova_mode
rte_pci_get_iommu_class(void)
{
- bool is_bound = false;
- bool is_vfio_noiommu_enabled = true;
- bool has_iova_va = false;
- bool is_bound_uio = false;
- bool iommu_no_va = false;
- struct rte_pci_device *dev = NULL;
- struct rte_pci_driver *drv = NULL;
+ struct rte_pci_device *dev;
+ struct rte_pci_driver *drv;
+ struct rte_pci_addr *addr;
+ enum rte_iova_mode iova_mode;
+
+ iova_mode = RTE_IOVA_DC;
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (pci_ignore_device(dev))
continue;
+ addr = &dev->addr;
+
switch (dev->kdrv) {
case RTE_KDRV_UNKNOWN:
case RTE_KDRV_NONE:
break;
case RTE_KDRV_VFIO:
- is_bound = true;
FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
continue;
- /*
- * just one PCI device needs to be checked out because
- * the IOMMU hardware is the same for all of them.
- */
- iommu_no_va = !pci_one_device_iommu_support_va(dev);
+ if ((drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0)
+ continue;
- if (drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
- has_iova_va = true;
- break;
+ if (!pci_one_device_iommu_support_va(dev)) {
+ RTE_LOG(WARNING, EAL, "Device " PCI_PRI_FMT
+ " wanted IOVA as VA, but ",
+ addr->domain, addr->bus, addr->devid,
+ addr->function);
+ RTE_LOG(WARNING, EAL,
+ "IOMMU does not support it.\n");
+ iova_mode = RTE_IOVA_PA;
+ }
+#ifdef VFIO_PRESENT
+ else if (rte_vfio_noiommu_is_enabled()) {
+ RTE_LOG(WARNING, EAL, "Device " PCI_PRI_FMT
+ " wanted IOVA as VA, but ",
+ addr->domain, addr->bus, addr->devid,
+ addr->function);
+ RTE_LOG(WARNING, EAL,
+ "vfio-noiommu is enabled.\n");
+ iova_mode = RTE_IOVA_PA;
+#endif
+ } else if (iova_mode == RTE_IOVA_PA) {
+ RTE_LOG(WARNING, EAL, "Device " PCI_PRI_FMT
+ " wanted IOVA as VA, but ",
+ addr->domain, addr->bus, addr->devid,
+ addr->function);
+ RTE_LOG(WARNING, EAL,
+ "other devices require PA.\n");
+ } else {
+ iova_mode = RTE_IOVA_VA;
}
}
break;
case RTE_KDRV_IGB_UIO:
case RTE_KDRV_UIO_GENERIC:
case RTE_KDRV_NIC_UIO:
- is_bound = true;
FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
continue;
- is_bound_uio = true;
+ if (iova_mode == RTE_IOVA_VA) {
+ RTE_LOG(WARNING, EAL,
+ "Some devices wanted IOVA as VA, but ");
+ RTE_LOG(WARNING, EAL, "device " PCI_PRI_FMT
+ " requires PA.\n",
+ addr->domain, addr->bus, addr->devid,
+ addr->function);
+
+ }
+
+ iova_mode = RTE_IOVA_PA;
break;
}
break;
@@ -639,29 +670,7 @@ rte_pci_get_iommu_class(void)
}
}
- if (!is_bound)
- return RTE_IOVA_DC;
-
-#ifdef VFIO_PRESENT
- is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
- true : false;
-#endif
-
- if (has_iova_va && !is_bound_uio && !is_vfio_noiommu_enabled &&
- !iommu_no_va)
- return RTE_IOVA_VA;
-
- if (has_iova_va) {
- RTE_LOG(WARNING, EAL, "Some devices want iova as va but pa will be used because.. ");
- if (is_vfio_noiommu_enabled)
- RTE_LOG(WARNING, EAL, "vfio-noiommu mode configured\n");
- if (is_bound_uio)
- RTE_LOG(WARNING, EAL, "few device bound to UIO\n");
- if (iommu_no_va)
- RTE_LOG(WARNING, EAL, "IOMMU does not support IOVA as VA\n");
- }
-
- return RTE_IOVA_PA;
+ return iova_mode;
}
/* Read PCI config space. */
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 12/12] eal: If bus can't decide PA or VA, try to access PA
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 " Ben Walker
` (9 preceding siblings ...)
2019-05-30 21:29 ` [dpdk-dev] [PATCH v2 11/12] eal/pci: rte_pci_get_iommu_class handles no drivers Ben Walker
@ 2019-05-30 21:29 ` Ben Walker
10 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 21:29 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
If the bus can't determine a preference for IOVA_PA vs.
IOVA_VA by looking at the devices and drivers, as a
last resort test if physical addresses are even accessible
in /proc/self/pagemap. If they are, use IOVA_PA. If they
are not, use IOVA_VA.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
---
lib/librte_eal/common/eal_common_bus.c | 4 ----
lib/librte_eal/linux/eal/eal.c | 30 ++++++++++++++++++++------
2 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c
index c8f1901f0..77f1be1b4 100644
--- a/lib/librte_eal/common/eal_common_bus.c
+++ b/lib/librte_eal/common/eal_common_bus.c
@@ -237,10 +237,6 @@ rte_bus_get_iommu_class(void)
mode |= bus->get_iommu_class();
}
- if (mode != RTE_IOVA_VA) {
- /* Use default IOVA mode */
- mode = RTE_IOVA_PA;
- }
return mode;
}
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 161399619..af9c003a7 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -948,6 +948,7 @@ rte_eal_init(int argc, char **argv)
static char logid[PATH_MAX];
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ enum rte_iova_mode iova_mode;
/* checks if the machine is adequate */
if (!rte_cpu_is_supported()) {
@@ -1037,18 +1038,33 @@ rte_eal_init(int argc, char **argv)
/* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
if (internal_config.iova_mode == RTE_IOVA_DC) {
- /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
- rte_eal_get_configuration()->iova_mode =
- rte_bus_get_iommu_class();
+ /* autodetect the IOVA mapping mode */
+ iova_mode = rte_bus_get_iommu_class();
/* Workaround for KNI which requires physical address to work */
- if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
+ if (iova_mode == RTE_IOVA_VA &&
rte_eal_check_module("rte_kni") == 1) {
- rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
+ iova_mode = RTE_IOVA_PA;
RTE_LOG(WARNING, EAL,
- "Some devices want IOVA as VA but PA will be used because.. "
- "KNI module inserted\n");
+ "Some devices want IOVA as VA but PA will be"
+ " used because KNI module inserted\n");
+ }
+
+ if (iova_mode == RTE_IOVA_DC) {
+ /* If the bus doesn't care, check if physical addresses are
+ * accessible.
+ */
+ if (rte_eal_using_phys_addrs()) {
+ /* Physical addresses are available, so the safest
+ * choice is to use those.
+ */
+ iova_mode = RTE_IOVA_PA;
+ } else {
+ iova_mode = RTE_IOVA_VA;
+ }
}
+
+ rte_eal_get_configuration()->iova_mode = iova_mode;
} else {
rte_eal_get_configuration()->iova_mode =
internal_config.iova_mode;
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 01/12] eal: Make rte_eal_using_phys_addrs work sooner Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:57 ` Stephen Hemminger
2019-05-30 17:48 ` [dpdk-dev] [PATCH 03/12] eal/pci: Rework loops in rte_pci_get_iommu_class Ben Walker
` (11 subsequent siblings)
13 siblings, 1 reply; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
This is in preparation for future simplifications. The
functions are simply inlined for now.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: I129992c9b44f4575a28cc05b78297e15b6be4249
---
drivers/bus/pci/linux/pci.c | 176 +++++++++++++++---------------------
1 file changed, 71 insertions(+), 105 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index c99d523f0..d3177916a 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -497,86 +497,6 @@ rte_pci_scan(void)
return -1;
}
-/*
- * Is pci device bound to any kdrv
- */
-static inline int
-pci_one_device_is_bound(void)
-{
- struct rte_pci_device *dev = NULL;
- int ret = 0;
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (dev->kdrv == RTE_KDRV_UNKNOWN ||
- dev->kdrv == RTE_KDRV_NONE) {
- continue;
- } else {
- ret = 1;
- break;
- }
- }
- return ret;
-}
-
-/*
- * Any one of the device bound to uio
- */
-static inline int
-pci_one_device_bound_uio(void)
-{
- struct rte_pci_device *dev = NULL;
- struct rte_devargs *devargs;
- int need_check;
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- devargs = dev->device.devargs;
-
- need_check = 0;
- switch (rte_pci_bus.bus.conf.scan_mode) {
- case RTE_BUS_SCAN_WHITELIST:
- if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
- need_check = 1;
- break;
- case RTE_BUS_SCAN_UNDEFINED:
- case RTE_BUS_SCAN_BLACKLIST:
- if (devargs == NULL ||
- devargs->policy != RTE_DEV_BLACKLISTED)
- need_check = 1;
- break;
- }
-
- if (!need_check)
- continue;
-
- if (dev->kdrv == RTE_KDRV_IGB_UIO ||
- dev->kdrv == RTE_KDRV_UIO_GENERIC) {
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * Any one of the device has iova as va
- */
-static inline int
-pci_one_device_has_iova_va(void)
-{
- struct rte_pci_device *dev = NULL;
- struct rte_pci_driver *drv = NULL;
-
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (drv && drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (dev->kdrv == RTE_KDRV_VFIO &&
- rte_pci_match(drv, dev))
- return 1;
- }
- }
- }
- return 0;
-}
-
#if defined(RTE_ARCH_X86)
static bool
pci_one_device_iommu_support_va(struct rte_pci_device *dev)
@@ -641,14 +561,76 @@ pci_one_device_iommu_support_va(__rte_unused struct rte_pci_device *dev)
#endif
/*
- * All devices IOMMUs support VA as IOVA
+ * Get iommu class of PCI devices on the bus.
*/
-static bool
-pci_devices_iommu_support_va(void)
+enum rte_iova_mode
+rte_pci_get_iommu_class(void)
{
+ bool is_bound = false;
+ bool is_vfio_noiommu_enabled = true;
+ bool has_iova_va = false;
+ bool is_bound_uio = false;
+ bool iommu_no_va = false;
+ bool break_out;
+ bool need_check;
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
+ struct rte_devargs *devargs;
+
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (dev->kdrv == RTE_KDRV_UNKNOWN ||
+ dev->kdrv == RTE_KDRV_NONE) {
+ continue;
+ } else {
+ is_bound = true;
+ break;
+ }
+ }
+ if (!is_bound)
+ return RTE_IOVA_DC;
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
+ if (drv && drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (dev->kdrv == RTE_KDRV_VFIO &&
+ rte_pci_match(drv, dev)) {
+ has_iova_va = true;
+ break;
+ }
+ }
+
+ if (has_iova_va)
+ break;
+ }
+ }
+
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ devargs = dev->device.devargs;
+
+ need_check = false;
+ switch (rte_pci_bus.bus.conf.scan_mode) {
+ case RTE_BUS_SCAN_WHITELIST:
+ if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
+ need_check = true;
+ break;
+ case RTE_BUS_SCAN_UNDEFINED:
+ case RTE_BUS_SCAN_BLACKLIST:
+ if (devargs == NULL ||
+ devargs->policy != RTE_DEV_BLACKLISTED)
+ need_check = true;
+ break;
+ }
+
+ if (!need_check)
+ continue;
+
+ if (dev->kdrv == RTE_KDRV_IGB_UIO ||
+ dev->kdrv == RTE_KDRV_UIO_GENERIC) {
+ is_bound_uio = true;
+ }
+ }
+
+ break_out = false;
FOREACH_DRIVER_ON_PCIBUS(drv) {
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (!rte_pci_match(drv, dev))
@@ -657,31 +639,15 @@ pci_devices_iommu_support_va(void)
* just one PCI device needs to be checked out because
* the IOMMU hardware is the same for all of them.
*/
- return pci_one_device_iommu_support_va(dev);
+ iommu_no_va = !pci_one_device_iommu_support_va(dev);
+ break_out = true;
+ break;
}
- }
- return true;
-}
-/*
- * Get iommu class of PCI devices on the bus.
- */
-enum rte_iova_mode
-rte_pci_get_iommu_class(void)
-{
- bool is_bound;
- bool is_vfio_noiommu_enabled = true;
- bool has_iova_va;
- bool is_bound_uio;
- bool iommu_no_va;
-
- is_bound = pci_one_device_is_bound();
- if (!is_bound)
- return RTE_IOVA_DC;
+ if (break_out)
+ break;
+ }
- has_iova_va = pci_one_device_has_iova_va();
- is_bound_uio = pci_one_device_bound_uio();
- iommu_no_va = !pci_devices_iommu_support_va();
#ifdef VFIO_PRESENT
is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
true : false;
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class
2019-05-30 17:48 ` [dpdk-dev] [PATCH 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class Ben Walker
@ 2019-05-30 17:57 ` Stephen Hemminger
2019-05-30 18:09 ` Walker, Benjamin
0 siblings, 1 reply; 47+ messages in thread
From: Stephen Hemminger @ 2019-05-30 17:57 UTC (permalink / raw)
To: Ben Walker; +Cc: dev
On Thu, 30 May 2019 10:48:09 -0700
Ben Walker <benjamin.walker@intel.com> wrote:
> This is in preparation for future simplifications. The
> functions are simply inlined for now.
>
> Signed-off-by: Ben Walker <benjamin.walker@intel.com>
> Change-Id: I129992c9b44f4575a28cc05b78297e15b6be4249
Please don't inline any functions that are not in the fast path.
The compiler will do it anyway.
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class
2019-05-30 17:57 ` Stephen Hemminger
@ 2019-05-30 18:09 ` Walker, Benjamin
0 siblings, 0 replies; 47+ messages in thread
From: Walker, Benjamin @ 2019-05-30 18:09 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: dev
> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Thursday, May 30, 2019 10:57 AM
> To: Walker, Benjamin <benjamin.walker@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 02/12] eal/pci: Inline several functions into
> rte_pci_get_iommu_class
>
> On Thu, 30 May 2019 10:48:09 -0700
> Ben Walker <benjamin.walker@intel.com> wrote:
>
> > This is in preparation for future simplifications. The functions are
> > simply inlined for now.
> >
> > Signed-off-by: Ben Walker <benjamin.walker@intel.com>
> > Change-Id: I129992c9b44f4575a28cc05b78297e15b6be4249
>
> Please don't inline any functions that are not in the fast path.
> The compiler will do it anyway.
That's not what I mean by inline. I didn't mark the functions inline - I copied
their contents into the single place they are called. This patch is a set up patch
for a later one that refactors the way this function works, and doing this makes
the diff easier to read.
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 03/12] eal/pci: Rework loops in rte_pci_get_iommu_class
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 01/12] eal: Make rte_eal_using_phys_addrs work sooner Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 02/12] eal/pci: Inline several functions into rte_pci_get_iommu_class Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 04/12] eal/pci: Collapse two " Ben Walker
` (10 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
Make all of the loops first iterate over devices, then
drivers. This is in preparation for combining them
into a single loop.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: Ifb2bfcc60570a5d5a13481be3da0fc74bf00ef1f
---
drivers/bus/pci/linux/pci.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index d3177916a..70815e4f0 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -589,10 +589,10 @@ rte_pci_get_iommu_class(void)
if (!is_bound)
return RTE_IOVA_DC;
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (drv && drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (dev->kdrv == RTE_KDRV_VFIO &&
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (dev->kdrv == RTE_KDRV_VFIO) {
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
+ if (drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA &&
rte_pci_match(drv, dev)) {
has_iova_va = true;
break;
@@ -631,8 +631,8 @@ rte_pci_get_iommu_class(void)
}
break_out = false;
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- FOREACH_DEVICE_ON_PCIBUS(dev) {
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
continue;
/*
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 04/12] eal/pci: Collapse two loops in rte_pci_get_iommu_class
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (2 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 03/12] eal/pci: Rework loops in rte_pci_get_iommu_class Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 05/12] eal/pci: Add function pci_ignore_device Ben Walker
` (9 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
Two of these loops easily collapse into a single loop.
This sets the stage for future simplifications.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: I3353f2e3585808cebff3f11805f96e4a1cc7fb3a
---
drivers/bus/pci/linux/pci.c | 31 ++++++++++---------------------
1 file changed, 10 insertions(+), 21 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 70815e4f0..b7a66d717 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -571,7 +571,6 @@ rte_pci_get_iommu_class(void)
bool has_iova_va = false;
bool is_bound_uio = false;
bool iommu_no_va = false;
- bool break_out;
bool need_check;
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
@@ -592,8 +591,16 @@ rte_pci_get_iommu_class(void)
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (dev->kdrv == RTE_KDRV_VFIO) {
FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA &&
- rte_pci_match(drv, dev)) {
+ if (!rte_pci_match(drv, dev))
+ continue;
+
+ /*
+ * just one PCI device needs to be checked out because
+ * the IOMMU hardware is the same for all of them.
+ */
+ iommu_no_va = !pci_one_device_iommu_support_va(dev);
+
+ if (drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
has_iova_va = true;
break;
}
@@ -630,24 +637,6 @@ rte_pci_get_iommu_class(void)
}
}
- break_out = false;
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (!rte_pci_match(drv, dev))
- continue;
- /*
- * just one PCI device needs to be checked out because
- * the IOMMU hardware is the same for all of them.
- */
- iommu_no_va = !pci_one_device_iommu_support_va(dev);
- break_out = true;
- break;
- }
-
- if (break_out)
- break;
- }
-
#ifdef VFIO_PRESENT
is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
true : false;
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 05/12] eal/pci: Add function pci_ignore_device
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (3 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 04/12] eal/pci: Collapse two " Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 06/12] eal/pci: Correctly test whitelist/blacklist in rte_pci_get_iommu_class Ben Walker
` (8 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
This performs a check for whether the device should be ignored
due to whitelist or blacklist. This check eventually needs
to apply to all of the other checks in rte_pci_get_iommu_class.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: I8e63e4c2e4199f34561ea1d911e13d6d74a47322
---
drivers/bus/pci/linux/pci.c | 44 +++++++++++++++++++++----------------
1 file changed, 25 insertions(+), 19 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index b7a66d717..f269b6a64 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -560,6 +560,29 @@ pci_one_device_iommu_support_va(__rte_unused struct rte_pci_device *dev)
}
#endif
+static bool
+pci_ignore_device(struct rte_pci_device *dev)
+{
+ struct rte_devargs *devargs;
+
+ devargs = dev->device.devargs;
+
+ switch (rte_pci_bus.bus.conf.scan_mode) {
+ case RTE_BUS_SCAN_WHITELIST:
+ if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
+ return false;
+ break;
+ case RTE_BUS_SCAN_UNDEFINED:
+ case RTE_BUS_SCAN_BLACKLIST:
+ if (devargs == NULL ||
+ devargs->policy != RTE_DEV_BLACKLISTED)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
/*
* Get iommu class of PCI devices on the bus.
*/
@@ -571,10 +594,9 @@ rte_pci_get_iommu_class(void)
bool has_iova_va = false;
bool is_bound_uio = false;
bool iommu_no_va = false;
- bool need_check;
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
- struct rte_devargs *devargs;
+
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (dev->kdrv == RTE_KDRV_UNKNOWN ||
@@ -612,23 +634,7 @@ rte_pci_get_iommu_class(void)
}
FOREACH_DEVICE_ON_PCIBUS(dev) {
- devargs = dev->device.devargs;
-
- need_check = false;
- switch (rte_pci_bus.bus.conf.scan_mode) {
- case RTE_BUS_SCAN_WHITELIST:
- if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
- need_check = true;
- break;
- case RTE_BUS_SCAN_UNDEFINED:
- case RTE_BUS_SCAN_BLACKLIST:
- if (devargs == NULL ||
- devargs->policy != RTE_DEV_BLACKLISTED)
- need_check = true;
- break;
- }
-
- if (!need_check)
+ if (pci_ignore_device(dev))
continue;
if (dev->kdrv == RTE_KDRV_IGB_UIO ||
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 06/12] eal/pci: Correctly test whitelist/blacklist in rte_pci_get_iommu_class
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (4 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 05/12] eal/pci: Add function pci_ignore_device Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 07/12] eal/pci: Reverse if check " Ben Walker
` (7 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
All of the checks should respect the white and black lists.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: Ie66176bea49987d1fc0a03dbee2638d9dd6efbc5
---
drivers/bus/pci/linux/pci.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index f269b6a64..ebe62f140 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -597,8 +597,10 @@ rte_pci_get_iommu_class(void)
struct rte_pci_device *dev = NULL;
struct rte_pci_driver *drv = NULL;
-
FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (pci_ignore_device(dev))
+ continue;
+
if (dev->kdrv == RTE_KDRV_UNKNOWN ||
dev->kdrv == RTE_KDRV_NONE) {
continue;
@@ -611,6 +613,9 @@ rte_pci_get_iommu_class(void)
return RTE_IOVA_DC;
FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (pci_ignore_device(dev))
+ continue;
+
if (dev->kdrv == RTE_KDRV_VFIO) {
FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 07/12] eal/pci: Reverse if check in rte_pci_get_iommu_class
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (5 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 06/12] eal/pci: Correctly test whitelist/blacklist in rte_pci_get_iommu_class Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 08/12] eal/pci: Collapse loops " Ben Walker
` (6 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
It's simpler to reverse the if statement here, especially
with an upcoming simplification.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: I6cff80231032304f3f865fdf38157554fad7fd07
---
drivers/bus/pci/linux/pci.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index ebe62f140..f678d2318 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -601,10 +601,8 @@ rte_pci_get_iommu_class(void)
if (pci_ignore_device(dev))
continue;
- if (dev->kdrv == RTE_KDRV_UNKNOWN ||
- dev->kdrv == RTE_KDRV_NONE) {
- continue;
- } else {
+ if (dev->kdrv != RTE_KDRV_UNKNOWN &&
+ dev->kdrv != RTE_KDRV_NONE) {
is_bound = true;
break;
}
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 08/12] eal/pci: Collapse loops in rte_pci_get_iommu_class
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (6 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 07/12] eal/pci: Reverse if check " Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 09/12] eal/pci: Simplify rte_pci_get_iommu class by using a switch Ben Walker
` (5 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
The three loops can now be easily combined into one.
This is slightly less efficient than before because it
doesn't break out early. But that can be addressed
later.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: Ic97155bb478dddbcbeaa6d51947684ffef219a52
---
drivers/bus/pci/linux/pci.c | 19 +++----------------
1 file changed, 3 insertions(+), 16 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index f678d2318..11e2e4d1b 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -604,15 +604,7 @@ rte_pci_get_iommu_class(void)
if (dev->kdrv != RTE_KDRV_UNKNOWN &&
dev->kdrv != RTE_KDRV_NONE) {
is_bound = true;
- break;
}
- }
- if (!is_bound)
- return RTE_IOVA_DC;
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (pci_ignore_device(dev))
- continue;
if (dev->kdrv == RTE_KDRV_VFIO) {
FOREACH_DRIVER_ON_PCIBUS(drv) {
@@ -630,15 +622,7 @@ rte_pci_get_iommu_class(void)
break;
}
}
-
- if (has_iova_va)
- break;
}
- }
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (pci_ignore_device(dev))
- continue;
if (dev->kdrv == RTE_KDRV_IGB_UIO ||
dev->kdrv == RTE_KDRV_UIO_GENERIC) {
@@ -646,6 +630,9 @@ rte_pci_get_iommu_class(void)
}
}
+ if (!is_bound)
+ return RTE_IOVA_DC;
+
#ifdef VFIO_PRESENT
is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
true : false;
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 09/12] eal/pci: Simplify rte_pci_get_iommu class by using a switch
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (7 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 08/12] eal/pci: Collapse loops " Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 10/12] eal/pci: Finding a device bound to UIO does not force PA Ben Walker
` (4 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
Take several independent if statements and convert to a
switch statement.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: Ia77c88ea484b529e8b0c9e09e8ef22cf3210e669
---
drivers/bus/pci/linux/pci.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 11e2e4d1b..41fd82988 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -601,12 +601,12 @@ rte_pci_get_iommu_class(void)
if (pci_ignore_device(dev))
continue;
- if (dev->kdrv != RTE_KDRV_UNKNOWN &&
- dev->kdrv != RTE_KDRV_NONE) {
+ switch (dev->kdrv) {
+ case RTE_KDRV_UNKNOWN:
+ case RTE_KDRV_NONE:
+ break;
+ case RTE_KDRV_VFIO:
is_bound = true;
- }
-
- if (dev->kdrv == RTE_KDRV_VFIO) {
FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
continue;
@@ -622,11 +622,14 @@ rte_pci_get_iommu_class(void)
break;
}
}
- }
-
- if (dev->kdrv == RTE_KDRV_IGB_UIO ||
- dev->kdrv == RTE_KDRV_UIO_GENERIC) {
+ break;
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ case RTE_KDRV_NIC_UIO:
+ is_bound = true;
is_bound_uio = true;
+ break;
+
}
}
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 10/12] eal/pci: Finding a device bound to UIO does not force PA
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (8 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 09/12] eal/pci: Simplify rte_pci_get_iommu class by using a switch Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 11/12] eal/pci: rte_pci_get_iommu_class handles no drivers Ben Walker
` (3 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
If a device is found that is bound to the UIO driver,
only force IOVA_PA if there is a driver registered to use it.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: I8015f11a33ab1b7662bf374d6944eff8d7a74a07
---
drivers/bus/pci/linux/pci.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 41fd82988..09af66571 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -627,7 +627,13 @@ rte_pci_get_iommu_class(void)
case RTE_KDRV_UIO_GENERIC:
case RTE_KDRV_NIC_UIO:
is_bound = true;
- is_bound_uio = true;
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
+ if (!rte_pci_match(drv, dev))
+ continue;
+
+ is_bound_uio = true;
+ break;
+ }
break;
}
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 11/12] eal/pci: rte_pci_get_iommu_class handles no drivers
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (9 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 10/12] eal/pci: Finding a device bound to UIO does not force PA Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-05-30 17:48 ` [dpdk-dev] [PATCH 12/12] eal: If bus can't decide PA or VA, try to access PA Ben Walker
` (2 subsequent siblings)
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
In the case where no drivers are registered with the system,
rte_pci_get_iommu_class should return RTE_IOVA_DC.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Change-Id: Ia5b0cae100cfcfe46a9e4996328f9746ce33cfd3
---
drivers/bus/pci/linux/pci.c | 79 ++++++++++++++++++-------------------
1 file changed, 38 insertions(+), 41 deletions(-)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 09af66571..abc21061f 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -589,49 +589,68 @@ pci_ignore_device(struct rte_pci_device *dev)
enum rte_iova_mode
rte_pci_get_iommu_class(void)
{
- bool is_bound = false;
- bool is_vfio_noiommu_enabled = true;
- bool has_iova_va = false;
- bool is_bound_uio = false;
- bool iommu_no_va = false;
- struct rte_pci_device *dev = NULL;
- struct rte_pci_driver *drv = NULL;
+ struct rte_pci_device *dev;
+ struct rte_pci_driver *drv;
+ struct rte_pci_addr *addr;
+ enum rte_iova_mode iova_mode;
+
+ iova_mode = RTE_IOVA_DC;
FOREACH_DEVICE_ON_PCIBUS(dev) {
if (pci_ignore_device(dev))
continue;
+ addr = &dev->addr;
+
switch (dev->kdrv) {
case RTE_KDRV_UNKNOWN:
case RTE_KDRV_NONE:
break;
case RTE_KDRV_VFIO:
- is_bound = true;
FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
continue;
- /*
- * just one PCI device needs to be checked out because
- * the IOMMU hardware is the same for all of them.
- */
- iommu_no_va = !pci_one_device_iommu_support_va(dev);
+ if ((drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0)
+ continue;
- if (drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
- has_iova_va = true;
- break;
+ if (!pci_one_device_iommu_support_va(dev)) {
+ RTE_LOG(WARNING, EAL, "Device " PCI_PRI_FMT " wanted IOVA as VA, but ",
+ addr->domain, addr->bus, addr->devid, addr->function);
+ RTE_LOG(WARNING, EAL, "IOMMU does not support it.\n");
+ iova_mode = RTE_IOVA_PA;
+ }
+#ifdef VFIO_PRESENT
+ else if (rte_vfio_noiommu_is_enabled()) {
+ RTE_LOG(WARNING, EAL, "Device " PCI_PRI_FMT " wanted IOVA as VA, but ",
+ addr->domain, addr->bus, addr->devid, addr->function);
+ RTE_LOG(WARNING, EAL, "vfio-noiommu is enabled.\n");
+ iova_mode = RTE_IOVA_PA;
+#endif
+ } else if (iova_mode == RTE_IOVA_PA) {
+ RTE_LOG(WARNING, EAL, "Device " PCI_PRI_FMT " wanted IOVA as VA, but ",
+ addr->domain, addr->bus, addr->devid, addr->function);
+ RTE_LOG(WARNING, EAL, "other devices require PA.\n");
+ } else {
+ iova_mode = RTE_IOVA_VA;
}
}
break;
case RTE_KDRV_IGB_UIO:
case RTE_KDRV_UIO_GENERIC:
case RTE_KDRV_NIC_UIO:
- is_bound = true;
FOREACH_DRIVER_ON_PCIBUS(drv) {
if (!rte_pci_match(drv, dev))
continue;
- is_bound_uio = true;
+ if (iova_mode == RTE_IOVA_VA) {
+ RTE_LOG(WARNING, EAL, "Some devices wanted IOVA as VA, but ");
+ RTE_LOG(WARNING, EAL, "device " PCI_PRI_FMT " requires PA.\n",
+ addr->domain, addr->bus, addr->devid, addr->function);
+
+ }
+
+ iova_mode = RTE_IOVA_PA;
break;
}
break;
@@ -639,29 +658,7 @@ rte_pci_get_iommu_class(void)
}
}
- if (!is_bound)
- return RTE_IOVA_DC;
-
-#ifdef VFIO_PRESENT
- is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
- true : false;
-#endif
-
- if (has_iova_va && !is_bound_uio && !is_vfio_noiommu_enabled &&
- !iommu_no_va)
- return RTE_IOVA_VA;
-
- if (has_iova_va) {
- RTE_LOG(WARNING, EAL, "Some devices want iova as va but pa will be used because.. ");
- if (is_vfio_noiommu_enabled)
- RTE_LOG(WARNING, EAL, "vfio-noiommu mode configured\n");
- if (is_bound_uio)
- RTE_LOG(WARNING, EAL, "few device bound to UIO\n");
- if (iommu_no_va)
- RTE_LOG(WARNING, EAL, "IOMMU does not support IOVA as VA\n");
- }
-
- return RTE_IOVA_PA;
+ return iova_mode;
}
/* Read PCI config space. */
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH 12/12] eal: If bus can't decide PA or VA, try to access PA
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (10 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 11/12] eal/pci: rte_pci_get_iommu_class handles no drivers Ben Walker
@ 2019-05-30 17:48 ` Ben Walker
2019-06-03 10:48 ` [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode David Marchand
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 0/3] " David Marchand
13 siblings, 0 replies; 47+ messages in thread
From: Ben Walker @ 2019-05-30 17:48 UTC (permalink / raw)
To: dev; +Cc: Ben Walker
If the bus can't determine a preference for IOVA_PA vs.
IOVA_VA by looking at the devices and drivers, as a
last resort test if physical addresses are even accessible
in /proc/self/pagemap. If they are, use IOVA_PA. If they
are not, use IOVA_VA.
Change-Id: If1eeb723283b80b24bd973987054fdad62f59cbd
---
lib/librte_eal/common/eal_common_bus.c | 4 ----
lib/librte_eal/linux/eal/eal.c | 28 +++++++++++++++++++-------
2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c
index c8f1901f0..77f1be1b4 100644
--- a/lib/librte_eal/common/eal_common_bus.c
+++ b/lib/librte_eal/common/eal_common_bus.c
@@ -237,10 +237,6 @@ rte_bus_get_iommu_class(void)
mode |= bus->get_iommu_class();
}
- if (mode != RTE_IOVA_VA) {
- /* Use default IOVA mode */
- mode = RTE_IOVA_PA;
- }
return mode;
}
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 161399619..283aae120 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -948,6 +948,7 @@ rte_eal_init(int argc, char **argv)
static char logid[PATH_MAX];
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ enum rte_iova_mode iova_mode;
/* checks if the machine is adequate */
if (!rte_cpu_is_supported()) {
@@ -1037,18 +1038,31 @@ rte_eal_init(int argc, char **argv)
/* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
if (internal_config.iova_mode == RTE_IOVA_DC) {
- /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
- rte_eal_get_configuration()->iova_mode =
- rte_bus_get_iommu_class();
+ /* autodetect the IOVA mapping mode */
+ iova_mode = rte_bus_get_iommu_class();
/* Workaround for KNI which requires physical address to work */
- if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
+ if (iova_mode == RTE_IOVA_VA &&
rte_eal_check_module("rte_kni") == 1) {
- rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
+ iova_mode = RTE_IOVA_PA;
RTE_LOG(WARNING, EAL,
- "Some devices want IOVA as VA but PA will be used because.. "
- "KNI module inserted\n");
+ "Some devices want IOVA as VA but PA will be"
+ " used because KNI module inserted\n");
+ }
+
+ if (iova_mode == RTE_IOVA_DC) {
+ /* If the bus doesn't care, check if physical addresses are
+ * accessible. */
+ if (rte_eal_using_phys_addrs()) {
+ /* Physical addresses are available, so the safest
+ * choice is to use those. */
+ iova_mode = RTE_IOVA_PA;
+ } else {
+ iova_mode = RTE_IOVA_VA;
+ }
}
+
+ rte_eal_get_configuration()->iova_mode = iova_mode;
} else {
rte_eal_get_configuration()->iova_mode =
internal_config.iova_mode;
--
2.20.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (11 preceding siblings ...)
2019-05-30 17:48 ` [dpdk-dev] [PATCH 12/12] eal: If bus can't decide PA or VA, try to access PA Ben Walker
@ 2019-06-03 10:48 ` David Marchand
2019-06-03 16:44 ` Walker, Benjamin
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 0/3] " David Marchand
13 siblings, 1 reply; 47+ messages in thread
From: David Marchand @ 2019-06-03 10:48 UTC (permalink / raw)
To: Ben Walker, Jerin Jacob Kollanukkaran; +Cc: dev, Burakov, Anatoly
Hello,
On Thu, May 30, 2019 at 7:48 PM Ben Walker <benjamin.walker@intel.com>
wrote:
> In SPDK, not all drivers are registered with DPDK at start up time.
> Previously, that meant DPDK always chose to set itself up in IOVA_PA
> mode. Instead, when the correct iova choice is unclear based on the
> devices and drivers known to DPDK at start up time, use other heuristics
> (such as whether /proc/self/pagemap is accessible) to make a better
> choice.
>
> This enables SPDK to run as an unprivileged user again without requiring
> users to explicitly set the iova mode on the command line.
>
>
Interesting, I got a bz on something similar the day you sent this patchset
;-)
- When a dpdk process is started, either it has access to physical
addresses or not, and this won't change for the rest of its life.
Your fix on defaulting to VA based on a rte_eal_using_phys_addrs() check
makes sense to me.
It is the most encountered situation when running ovs as non root on recent
kernels.
- However, I fail to see the need for all of this detection code wrt
drivers and devices.
On one side of the equation, when dpdk starts, it checks physical address
availability.
On the other side of the equation, we have the drivers that will be invoked
when probing devices (either at dpdk init, or when hotplugging a device).
At this point, the probing call should check the driver requirement wrt to
the kernel driver the device is attached to.
If this requirement is not fulfilled, then the probing fails.
- This leaves the --iova-va forcing option.
Why do we need it?
If we don't have access to physical addresses, no choice but run in VA mode.
If we have access to physical addresses, the only case would be that you
want to downgrade from PA to VA.
But well, your process can still access it, not sure what the benefit is.
Jerin, I can see in the history you worked on this.
What did I miss?
Is there something wrong with dropping the detection code?
--
David Marchand
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode
2019-06-03 10:48 ` [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode David Marchand
@ 2019-06-03 16:44 ` Walker, Benjamin
2019-06-14 8:42 ` David Marchand
0 siblings, 1 reply; 47+ messages in thread
From: Walker, Benjamin @ 2019-06-03 16:44 UTC (permalink / raw)
To: david.marchand, jerinj; +Cc: Burakov, Anatoly, dev
On Mon, 2019-06-03 at 12:48 +0200, David Marchand wrote:
> Hello,
>
> On Thu, May 30, 2019 at 7:48 PM Ben Walker <benjamin.walker@intel.com> wrote:
> > In SPDK, not all drivers are registered with DPDK at start up time.
> > Previously, that meant DPDK always chose to set itself up in IOVA_PA
> > mode. Instead, when the correct iova choice is unclear based on the
> > devices and drivers known to DPDK at start up time, use other heuristics
> > (such as whether /proc/self/pagemap is accessible) to make a better
> > choice.
> >
> > This enables SPDK to run as an unprivileged user again without requiring
> > users to explicitly set the iova mode on the command line.
> >
>
> Interesting, I got a bz on something similar the day you sent this patchset ;-
> )
>
>
> - When a dpdk process is started, either it has access to physical addresses
> or not, and this won't change for the rest of its life.
> Your fix on defaulting to VA based on a rte_eal_using_phys_addrs() check makes
> sense to me.
> It is the most encountered situation when running ovs as non root on recent
> kernels.
>
>
> - However, I fail to see the need for all of this detection code wrt drivers
> and devices.
>
> On one side of the equation, when dpdk starts, it checks physical address
> availability.
> On the other side of the equation, we have the drivers that will be invoked
> when probing devices (either at dpdk init, or when hotplugging a device).
>
> At this point, the probing call should check the driver requirement wrt to the
> kernel driver the device is attached to.
> If this requirement is not fulfilled, then the probing fails.
>
>
> - This leaves the --iova-va forcing option.
> Why do we need it?
> If we don't have access to physical addresses, no choice but run in VA mode.
> If we have access to physical addresses, the only case would be that you want
> to downgrade from PA to VA.
> But well, your process can still access it, not sure what the benefit is.
All of the complexity here, at least as far as I understand it, stems from
supporting hot insert of devices. This is very important to SPDK because storage
devices get hot inserted all the time, so we very much appreciate that DPDK has
put in so much effort in this area and continues to accept our patches to
improve it. I know hot insert is not nearly as important for network devices.
When DPDK starts up, it needs to select whether to use virtual addresses or
physical addresses in its memory maps. It can do that by answering the following
questions:
1. Does the system only have buses that support an IOMMU?
2. Is the IOMMU sufficiently fast for the use case?
3. Will all of the devices that will be used with DPDK throughout the
application's lifetime work with an IOMMU?
If these three things are true, then the best choice is to use virtual addresses
in the memory translations. However, if any of the above are not true it needs
to fall back to physical addresses.
#1 is checked by simply asking all of the buses, which are known up front. #2 is
just assumed to be true. But #3 is not possible to check fully because of hot
insert.
The code currently approximates the #3 check by looking at the devices present
at initialization time. If a device exists that's bound to vfio-pci, and no
other devices exist that are bound to a uio driver, and DPDK has a registered
driver that's actually going to load against the vfio-pci devices, then it will
elect to use virtual addresses. This is purely a heuristic - it's not a
definitive answer because the user could later hot insert a device that gets
bound to uio.
The user, of course, knows the answer to which addressing scheme to use
typically. For example, these checks assume #2 is true, but there may be
hardware implementations where it is not and the user wants to force physical
addresses. Or the user may know that they are going to hot insert a device at
run time that doesn't work with the IOMMU. That's why it's important to maintain
the ability for the user to override the default heuristic's decision via the
command line.
My patch series is simply improving the heuristic in a few ways. First,
previously each bus when queried would return either virtual or physical
addresses as its choice. However, often the bus just does not have enough
information to formulate any preference at all (and PCI was defaulting to
physical addresses in this case). Instead, I made it so that the bus can return
that it doesn't care, which pushes the decision up to a higher level. That
higher level then makes the decision by checking whether it can access
/proc/self/pagemap. Second, I narrowed the uio check such that physical
addresses will only be selected if a device bound to uio exists and there is a
driver registered to use it. Previously if any device was bound to uio it would
select physical addresses, even if DPDK never ended up loading against that
device.
I think these two things make the heuristic choose the right thing more often,
but it still won't always get it right so the command line option needs to
remain.
Thanks,
Ben
>
>
> Jerin, I can see in the history you worked on this.
> What did I miss?
> Is there something wrong with dropping the detection code?
>
>
>
> --
> David Marchand
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode
2019-06-03 16:44 ` Walker, Benjamin
@ 2019-06-14 8:42 ` David Marchand
0 siblings, 0 replies; 47+ messages in thread
From: David Marchand @ 2019-06-14 8:42 UTC (permalink / raw)
To: Walker, Benjamin
Cc: jerinj, Burakov, Anatoly, dev, Maxime Coquelin, hemant.agrawal,
shreyansh.jain, Rosen Xu, Gaetan Rivet, Stephen Hemminger, Yigit,
Ferruh, Thomas Monjalon, Yongseok Koh
On Mon, Jun 3, 2019 at 6:44 PM Walker, Benjamin <benjamin.walker@intel.com>
wrote:
> On Mon, 2019-06-03 at 12:48 +0200, David Marchand wrote:
> > Hello,
> >
> > On Thu, May 30, 2019 at 7:48 PM Ben Walker <benjamin.walker@intel.com>
> wrote:
> > > In SPDK, not all drivers are registered with DPDK at start up time.
> > > Previously, that meant DPDK always chose to set itself up in IOVA_PA
> > > mode. Instead, when the correct iova choice is unclear based on the
> > > devices and drivers known to DPDK at start up time, use other
> heuristics
> > > (such as whether /proc/self/pagemap is accessible) to make a better
> > > choice.
> > >
> > > This enables SPDK to run as an unprivileged user again without
> requiring
> > > users to explicitly set the iova mode on the command line.
> > >
> >
> > Interesting, I got a bz on something similar the day you sent this
> patchset ;-
> > )
> >
> >
> > - When a dpdk process is started, either it has access to physical
> addresses
> > or not, and this won't change for the rest of its life.
> > Your fix on defaulting to VA based on a rte_eal_using_phys_addrs() check
> makes
> > sense to me.
> > It is the most encountered situation when running ovs as non root on
> recent
> > kernels.
> >
> >
> > - However, I fail to see the need for all of this detection code wrt
> drivers
> > and devices.
> >
> > On one side of the equation, when dpdk starts, it checks physical address
> > availability.
> > On the other side of the equation, we have the drivers that will be
> invoked
> > when probing devices (either at dpdk init, or when hotplugging a device).
> >
> > At this point, the probing call should check the driver requirement wrt
> to the
> > kernel driver the device is attached to.
> > If this requirement is not fulfilled, then the probing fails.
> >
> >
> > - This leaves the --iova-va forcing option.
> > Why do we need it?
> > If we don't have access to physical addresses, no choice but run in VA
> mode.
> > If we have access to physical addresses, the only case would be that you
> want
> > to downgrade from PA to VA.
> > But well, your process can still access it, not sure what the benefit is.
>
> All of the complexity here, at least as far as I understand it, stems from
> supporting hot insert of devices. This is very important to SPDK because
> storage
> devices get hot inserted all the time, so we very much appreciate that
> DPDK has
> put in so much effort in this area and continues to accept our patches to
> improve it. I know hot insert is not nearly as important for network
> devices.
>
> When DPDK starts up, it needs to select whether to use virtual addresses or
> physical addresses in its memory maps. It can do that by answering the
> following
> questions:
>
> 1. Does the system only have buses that support an IOMMU?
> 2. Is the IOMMU sufficiently fast for the use case?
> 3. Will all of the devices that will be used with DPDK throughout the
> application's lifetime work with an IOMMU?
>
> If these three things are true, then the best choice is to use virtual
> addresses
> in the memory translations. However, if any of the above are not true it
> needs
> to fall back to physical addresses.
>
> #1 is checked by simply asking all of the buses, which are known up front.
> #2 is
> just assumed to be true. But #3 is not possible to check fully because of
> hot
> insert.
>
> The code currently approximates the #3 check by looking at the devices
> present
> at initialization time. If a device exists that's bound to vfio-pci, and no
> other devices exist that are bound to a uio driver, and DPDK has a
> registered
> driver that's actually going to load against the vfio-pci devices, then it
> will
> elect to use virtual addresses. This is purely a heuristic - it's not a
> definitive answer because the user could later hot insert a device that
> gets
> bound to uio.
>
> The user, of course, knows the answer to which addressing scheme to use
> typically. For example, these checks assume #2 is true, but there may be
> hardware implementations where it is not and the user wants to force
> physical
> addresses. Or the user may know that they are going to hot insert a device
> at
> run time that doesn't work with the IOMMU. That's why it's important to
> maintain
> the ability for the user to override the default heuristic's decision via
> the
> command line.
>
> My patch series is simply improving the heuristic in a few ways. First,
> previously each bus when queried would return either virtual or physical
> addresses as its choice. However, often the bus just does not have enough
> information to formulate any preference at all (and PCI was defaulting to
> physical addresses in this case). Instead, I made it so that the bus can
> return
> that it doesn't care, which pushes the decision up to a higher level. That
> higher level then makes the decision by checking whether it can access
> /proc/self/pagemap. Second, I narrowed the uio check such that physical
> addresses will only be selected if a device bound to uio exists and there
> is a
> driver registered to use it. Previously if any device was bound to uio it
> would
> select physical addresses, even if DPDK never ended up loading against that
> device.
>
> I think these two things make the heuristic choose the right thing more
> often,
> but it still won't always get it right so the command line option needs to
> remain.
>
>
After some exchanges offlist, on irc and taking some time looking at the
code, here are my conclusions.
Copying bus drivers maintainers/connaisseurs.
We have cases where we prefer using VA even if PA are available (for fslmc
where translating from iova as PA to VA is more costly).
I worked on Ben patches and summarised it as two main issues with the
current code:
- physical addresses availability is not taken into account early enough in
EAL init, and we end up with memory subsystem complaining later which is
not that user friendly.
A collateral is that the init could have fallen back to using VA in most
cases if there were no strong requirement on PA.
- pci bus driver looks at all devices on the system, with no consideration
on the pci white/blacklist and no consideration on the fact that dpdk has a
driver that supports the device
I prepared a new series that I will send shortly.
I am currently considering the backport potential for it.
Thoughts?
Else, reviews are welcome.
Thanks.
--
David Marchand
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 0/3] Improve automatic selection of IOVA mode
2019-05-30 17:48 [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode Ben Walker
` (12 preceding siblings ...)
2019-06-03 10:48 ` [dpdk-dev] eal/pci: Improve automatic selection of IOVA mode David Marchand
@ 2019-06-14 9:39 ` David Marchand
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 1/3] kni: refuse to initialise when IOVA is not PA David Marchand
` (4 more replies)
13 siblings, 5 replies; 47+ messages in thread
From: David Marchand @ 2019-06-14 9:39 UTC (permalink / raw)
To: dev; +Cc: benjamin.walker, jerinj, anatoly.burakov, maxime.coquelin, thomas
In SPDK, not all drivers are registered with DPDK at start up time.
Previously, that meant DPDK always chose to set itself up in IOVA_PA
mode. Instead, when the correct iova choice is unclear based on the
devices and drivers known to DPDK at start up time, use other heuristics
(such as whether /proc/self/pagemap is accessible) to make a better
choice.
This enables SPDK to run as an unprivileged user again without requiring
users to explicitly set the iova mode on the command line.
Changelog since v1:
- I took over the series following experiments and discussions with Ben
and others, squashed Ben patches as two patches focusing on the main
issues,
- introduced a fix on KNI,
- on the EAL bits,
- added log on which IOVA mode has been selected,
- updated BSD EAL,
- in Linux EAL, moved KNI special case after IOVA selection,
- in Linux EAL, added check on forced mode wrt physical addresses
availability,
- on the PCI bus driver bits,
- enforced the checks in the common code of the PCI bus,
- added debug logs to track why a iova mode has been chosen per device,
- added BSD part,
- in Linux part, checked that VFIO is enabled,
- in Linux part, defaulted to DC if a driver supports both PA and VA,
--
David Marchand
Ben Walker (2):
eal: compute IOVA mode based on PA availability
bus/pci: only consider usable devices to select IOVA mode
David Marchand (1):
kni: refuse to initialise when IOVA is not PA
drivers/bus/pci/bsd/pci.c | 9 +-
drivers/bus/pci/linux/pci.c | 191 +++++++++-----------------------
drivers/bus/pci/pci_common.c | 65 +++++++++++
drivers/bus/pci/private.h | 8 ++
lib/librte_eal/common/eal_common_bus.c | 4 -
lib/librte_eal/common/include/rte_bus.h | 2 +-
lib/librte_eal/freebsd/eal/eal.c | 10 +-
lib/librte_eal/linux/eal/eal.c | 38 +++++--
lib/librte_eal/linux/eal/eal_memory.c | 46 ++------
lib/librte_kni/rte_kni.c | 5 +
10 files changed, 187 insertions(+), 191 deletions(-)
--
1.8.3.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 1/3] kni: refuse to initialise when IOVA is not PA
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 0/3] " David Marchand
@ 2019-06-14 9:39 ` David Marchand
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 2/3] eal: compute IOVA mode based on PA availability David Marchand
` (3 subsequent siblings)
4 siblings, 0 replies; 47+ messages in thread
From: David Marchand @ 2019-06-14 9:39 UTC (permalink / raw)
To: dev
Cc: benjamin.walker, jerinj, anatoly.burakov, maxime.coquelin,
thomas, stable, Ferruh Yigit
If a forced iova-mode has been passed at init, kni is not supposed to
work.
Fixes: 075b182b54ce ("eal: force IOVA to a particular mode")
Cc: stable@dpdk.org
Signed-off-by: David Marchand <david.marchand@redhat.com>
---
lib/librte_kni/rte_kni.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index a0f1e37..a6bf323 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -97,6 +97,11 @@ enum kni_ops_status {
int
rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
{
+ if (rte_eal_iova_mode() != RTE_IOVA_PA) {
+ RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
+ return -1;
+ }
+
/* Check FD and open */
if (kni_fd < 0) {
kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
--
1.8.3.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 2/3] eal: compute IOVA mode based on PA availability
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 0/3] " David Marchand
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 1/3] kni: refuse to initialise when IOVA is not PA David Marchand
@ 2019-06-14 9:39 ` David Marchand
2019-07-03 10:17 ` Burakov, Anatoly
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode David Marchand
` (2 subsequent siblings)
4 siblings, 1 reply; 47+ messages in thread
From: David Marchand @ 2019-06-14 9:39 UTC (permalink / raw)
To: dev
Cc: benjamin.walker, jerinj, anatoly.burakov, maxime.coquelin,
thomas, Bruce Richardson
From: Ben Walker <benjamin.walker@intel.com>
Currently, if the bus selects IOVA as PA, the memory init can fail when
lacking access to physical addresses.
This can be quite hard for normal users to understand what is wrong
since this is the default behavior.
Catch this situation earlier in eal init by validating physical addresses
availability, or select IOVA when no clear preferrence had been expressed.
The bus code is changed so that it reports when it does not care about
the IOVA mode and let the eal init decide.
In Linux implementation, rework rte_eal_using_phys_addrs() so that it can
be called earlier but still avoid a circular dependency with
rte_mem_virt2phys().
In FreeBSD implementation, rte_eal_using_phys_addrs() always returns
false, so the detection part is left as is.
If librte_kni is compiled in and the KNI kmod is loaded,
- if the buses requested VA, force to PA if physical addresses are
available as it was done before,
- else, keep iova as VA, KNI init will fail later.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Signed-off-by: David Marchand <david.marchand@redhat.com>
---
lib/librte_eal/common/eal_common_bus.c | 4 ---
lib/librte_eal/common/include/rte_bus.h | 2 +-
lib/librte_eal/freebsd/eal/eal.c | 10 +++++--
lib/librte_eal/linux/eal/eal.c | 38 +++++++++++++++++++++------
lib/librte_eal/linux/eal/eal_memory.c | 46 +++++++++------------------------
5 files changed, 51 insertions(+), 49 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c
index c8f1901..77f1be1 100644
--- a/lib/librte_eal/common/eal_common_bus.c
+++ b/lib/librte_eal/common/eal_common_bus.c
@@ -237,10 +237,6 @@ enum rte_iova_mode
mode |= bus->get_iommu_class();
}
- if (mode != RTE_IOVA_VA) {
- /* Use default IOVA mode */
- mode = RTE_IOVA_PA;
- }
return mode;
}
diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h
index 4faf2d2..90fe4e9 100644
--- a/lib/librte_eal/common/include/rte_bus.h
+++ b/lib/librte_eal/common/include/rte_bus.h
@@ -392,7 +392,7 @@ struct rte_bus *rte_bus_find(const struct rte_bus *start, rte_bus_cmp_t cmp,
/**
* Get the common iommu class of devices bound on to buses available in the
- * system. The default mode is PA.
+ * system. RTE_IOVA_DC means that no preferrence has been expressed.
*
* @return
* enum rte_iova_mode value.
diff --git a/lib/librte_eal/freebsd/eal/eal.c b/lib/librte_eal/freebsd/eal/eal.c
index 4eaa531..231f1dc 100644
--- a/lib/librte_eal/freebsd/eal/eal.c
+++ b/lib/librte_eal/freebsd/eal/eal.c
@@ -689,13 +689,19 @@ static void rte_eal_init_alert(const char *msg)
/* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
if (internal_config.iova_mode == RTE_IOVA_DC) {
/* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
- rte_eal_get_configuration()->iova_mode =
- rte_bus_get_iommu_class();
+ enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
+
+ if (iova_mode == RTE_IOVA_DC)
+ iova_mode = RTE_IOVA_PA;
+ rte_eal_get_configuration()->iova_mode = iova_mode;
} else {
rte_eal_get_configuration()->iova_mode =
internal_config.iova_mode;
}
+ RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
+ rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
+
if (internal_config.no_hugetlbfs == 0) {
/* rte_config isn't initialized yet */
ret = internal_config.process_type == RTE_PROC_PRIMARY ?
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 3e1d6eb..785ed2b 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -948,6 +948,7 @@ static void rte_eal_init_alert(const char *msg)
static char logid[PATH_MAX];
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ bool phys_addrs;
/* checks if the machine is adequate */
if (!rte_cpu_is_supported()) {
@@ -1035,25 +1036,46 @@ static void rte_eal_init_alert(const char *msg)
return -1;
}
+ phys_addrs = rte_eal_using_phys_addrs() != 0;
+
/* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
if (internal_config.iova_mode == RTE_IOVA_DC) {
- /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
- rte_eal_get_configuration()->iova_mode =
- rte_bus_get_iommu_class();
+ /* autodetect the IOVA mapping mode */
+ enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
+ if (iova_mode == RTE_IOVA_DC) {
+ iova_mode = phys_addrs ? RTE_IOVA_PA : RTE_IOVA_VA;
+ RTE_LOG(DEBUG, EAL,
+ "Buses did not request a specific IOVA mode, using '%s' based on physical addresses availability.\n",
+ phys_addrs ? "PA" : "VA");
+ }
+#ifdef RTE_LIBRTE_KNI
/* Workaround for KNI which requires physical address to work */
- if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
+ if (iova_mode == RTE_IOVA_VA &&
rte_eal_check_module("rte_kni") == 1) {
- rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
- RTE_LOG(WARNING, EAL,
- "Some devices want IOVA as VA but PA will be used because.. "
- "KNI module inserted\n");
+ if (phys_addrs) {
+ iova_mode = RTE_IOVA_PA;
+ RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
+ } else {
+ RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
+ }
}
+#endif
+ rte_eal_get_configuration()->iova_mode = iova_mode;
} else {
rte_eal_get_configuration()->iova_mode =
internal_config.iova_mode;
}
+ if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) {
+ rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available");
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
+ rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
+
if (internal_config.no_hugetlbfs == 0) {
/* rte_config isn't initialized yet */
ret = internal_config.process_type == RTE_PROC_PRIMARY ?
diff --git a/lib/librte_eal/linux/eal/eal_memory.c b/lib/librte_eal/linux/eal/eal_memory.c
index 1853ace..25c4145 100644
--- a/lib/librte_eal/linux/eal/eal_memory.c
+++ b/lib/librte_eal/linux/eal/eal_memory.c
@@ -65,34 +65,10 @@
* zone as well as a physical contiguous zone.
*/
-static bool phys_addrs_available = true;
+static int phys_addrs_available = -1;
#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
-static void
-test_phys_addrs_available(void)
-{
- uint64_t tmp = 0;
- phys_addr_t physaddr;
-
- if (!rte_eal_has_hugepages()) {
- RTE_LOG(ERR, EAL,
- "Started without hugepages support, physical addresses not available\n");
- phys_addrs_available = false;
- return;
- }
-
- physaddr = rte_mem_virt2phy(&tmp);
- if (physaddr == RTE_BAD_PHYS_ADDR) {
- if (rte_eal_iova_mode() == RTE_IOVA_PA)
- RTE_LOG(ERR, EAL,
- "Cannot obtain physical addresses: %s. "
- "Only vfio will function.\n",
- strerror(errno));
- phys_addrs_available = false;
- }
-}
-
/*
* Get physical address of any mapped virtual address in the current process.
*/
@@ -105,8 +81,7 @@
int page_size;
off_t offset;
- /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
- if (!phys_addrs_available)
+ if (phys_addrs_available == 0)
return RTE_BAD_IOVA;
/* standard page size */
@@ -1336,8 +1311,6 @@ void numa_error(char *where)
int nr_hugefiles, nr_hugepages = 0;
void *addr;
- test_phys_addrs_available();
-
memset(used_hp, 0, sizeof(used_hp));
/* get pointer to global configuration */
@@ -1516,7 +1489,7 @@ void numa_error(char *where)
continue;
}
- if (phys_addrs_available &&
+ if (rte_eal_using_phys_addrs() &&
rte_eal_iova_mode() != RTE_IOVA_VA) {
/* find physical addresses for each hugepage */
if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
@@ -1735,8 +1708,6 @@ void numa_error(char *where)
uint64_t memory[RTE_MAX_NUMA_NODES];
int hp_sz_idx, socket_id;
- test_phys_addrs_available();
-
memset(used_hp, 0, sizeof(used_hp));
for (hp_sz_idx = 0;
@@ -1879,8 +1850,6 @@ void numa_error(char *where)
"into secondary processes\n");
}
- test_phys_addrs_available();
-
fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
if (fd_hugepage < 0) {
RTE_LOG(ERR, EAL, "Could not open %s\n",
@@ -2020,6 +1989,15 @@ void numa_error(char *where)
int
rte_eal_using_phys_addrs(void)
{
+ if (phys_addrs_available == -1) {
+ uint64_t tmp = 0;
+
+ if (rte_eal_has_hugepages() != 0 &&
+ rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR)
+ phys_addrs_available = 1;
+ else
+ phys_addrs_available = 0;
+ }
return phys_addrs_available;
}
--
1.8.3.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/3] eal: compute IOVA mode based on PA availability
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 2/3] eal: compute IOVA mode based on PA availability David Marchand
@ 2019-07-03 10:17 ` Burakov, Anatoly
2019-07-04 7:13 ` David Marchand
0 siblings, 1 reply; 47+ messages in thread
From: Burakov, Anatoly @ 2019-07-03 10:17 UTC (permalink / raw)
To: David Marchand, dev
Cc: benjamin.walker, jerinj, maxime.coquelin, thomas, Bruce Richardson
On 14-Jun-19 10:39 AM, David Marchand wrote:
> From: Ben Walker <benjamin.walker@intel.com>
>
> Currently, if the bus selects IOVA as PA, the memory init can fail when
> lacking access to physical addresses.
> This can be quite hard for normal users to understand what is wrong
> since this is the default behavior.
>
> Catch this situation earlier in eal init by validating physical addresses
> availability, or select IOVA when no clear preferrence had been expressed.
>
> The bus code is changed so that it reports when it does not care about
> the IOVA mode and let the eal init decide.
>
> In Linux implementation, rework rte_eal_using_phys_addrs() so that it can
> be called earlier but still avoid a circular dependency with
> rte_mem_virt2phys().
> In FreeBSD implementation, rte_eal_using_phys_addrs() always returns
> false, so the detection part is left as is.
>
> If librte_kni is compiled in and the KNI kmod is loaded,
> - if the buses requested VA, force to PA if physical addresses are
> available as it was done before,
> - else, keep iova as VA, KNI init will fail later.
>
> Signed-off-by: Ben Walker <benjamin.walker@intel.com>
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
<snip>
> + /* autodetect the IOVA mapping mode */
> + enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
>
> + if (iova_mode == RTE_IOVA_DC) {
> + iova_mode = phys_addrs ? RTE_IOVA_PA : RTE_IOVA_VA;
> + RTE_LOG(DEBUG, EAL,
> + "Buses did not request a specific IOVA mode, using '%s' based on physical addresses availability.\n",
> + phys_addrs ? "PA" : "VA");
> + }
> +#ifdef RTE_LIBRTE_KNI
> /* Workaround for KNI which requires physical address to work */
> - if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
> + if (iova_mode == RTE_IOVA_VA &&
> rte_eal_check_module("rte_kni") == 1) {
> - rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
> - RTE_LOG(WARNING, EAL,
> - "Some devices want IOVA as VA but PA will be used because.. "
> - "KNI module inserted\n");
> + if (phys_addrs) {
> + iova_mode = RTE_IOVA_PA;
> + RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
> + } else {
> + RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
> + }
> }
> +#endif
Why the ifdefs? I don't think there's something specific to KNI there,
rte_eal_check_module() works absent of KNI.
Otherwise LGTM,
Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>
I wish we would make IOVA as VA the default already, but at least
picking it automatically when physical addresses aren't available is a
good start :)
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/3] eal: compute IOVA mode based on PA availability
2019-07-03 10:17 ` Burakov, Anatoly
@ 2019-07-04 7:13 ` David Marchand
0 siblings, 0 replies; 47+ messages in thread
From: David Marchand @ 2019-07-04 7:13 UTC (permalink / raw)
To: Burakov, Anatoly
Cc: dev, Ben Walker, Jerin Jacob Kollanukkaran, Maxime Coquelin,
Thomas Monjalon, Bruce Richardson
On Wed, Jul 3, 2019 at 12:17 PM Burakov, Anatoly <anatoly.burakov@intel.com>
wrote:
> On 14-Jun-19 10:39 AM, David Marchand wrote:
> > From: Ben Walker <benjamin.walker@intel.com>
> >
> > Currently, if the bus selects IOVA as PA, the memory init can fail when
> > lacking access to physical addresses.
> > This can be quite hard for normal users to understand what is wrong
> > since this is the default behavior.
> >
> > Catch this situation earlier in eal init by validating physical addresses
> > availability, or select IOVA when no clear preferrence had been
> expressed.
> >
> > The bus code is changed so that it reports when it does not care about
> > the IOVA mode and let the eal init decide.
> >
> > In Linux implementation, rework rte_eal_using_phys_addrs() so that it can
> > be called earlier but still avoid a circular dependency with
> > rte_mem_virt2phys().
> > In FreeBSD implementation, rte_eal_using_phys_addrs() always returns
> > false, so the detection part is left as is.
> >
> > If librte_kni is compiled in and the KNI kmod is loaded,
> > - if the buses requested VA, force to PA if physical addresses are
> > available as it was done before,
> > - else, keep iova as VA, KNI init will fail later.
> >
> > Signed-off-by: Ben Walker <benjamin.walker@intel.com>
> > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > ---
>
> <snip>
>
> > + /* autodetect the IOVA mapping mode */
> > + enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
> >
> > + if (iova_mode == RTE_IOVA_DC) {
> > + iova_mode = phys_addrs ? RTE_IOVA_PA : RTE_IOVA_VA;
> > + RTE_LOG(DEBUG, EAL,
> > + "Buses did not request a specific IOVA
> mode, using '%s' based on physical addresses availability.\n",
> > + phys_addrs ? "PA" : "VA");
> > + }
> > +#ifdef RTE_LIBRTE_KNI
> > /* Workaround for KNI which requires physical address to
> work */
> > - if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA
> &&
> > + if (iova_mode == RTE_IOVA_VA &&
> > rte_eal_check_module("rte_kni") == 1) {
> > - rte_eal_get_configuration()->iova_mode =
> RTE_IOVA_PA;
> > - RTE_LOG(WARNING, EAL,
> > - "Some devices want IOVA as VA but PA will
> be used because.. "
> > - "KNI module inserted\n");
> > + if (phys_addrs) {
> > + iova_mode = RTE_IOVA_PA;
> > + RTE_LOG(WARNING, EAL, "Forcing IOVA as
> 'PA' because KNI module is loaded\n");
> > + } else {
> > + RTE_LOG(DEBUG, EAL, "KNI can not work
> since physical addresses are unavailable\n");
> > + }
> > }
> > +#endif
>
> Why the ifdefs? I don't think there's something specific to KNI there,
> rte_eal_check_module() works absent of KNI.
>
The sole presence of the kni kmod does not mean you have librte_kni in your
binary.
So if you consider the case where you would have two dpdk applications
running, one with kni, the other one without it.
It does not makes sense to force PA for the one that does not have
librte_kni.
Otherwise LGTM,
>
> Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>
>
Thanks for the review !
--
David Marchand
^ permalink raw reply [flat|nested] 47+ messages in thread
* [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 0/3] " David Marchand
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 1/3] kni: refuse to initialise when IOVA is not PA David Marchand
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 2/3] eal: compute IOVA mode based on PA availability David Marchand
@ 2019-06-14 9:39 ` David Marchand
2019-07-03 10:45 ` Burakov, Anatoly
2019-07-04 17:14 ` Stephen Hemminger
2019-06-27 17:05 ` [dpdk-dev] [PATCH v2 0/3] Improve automatic selection of " Thomas Monjalon
2019-07-05 14:57 ` Thomas Monjalon
4 siblings, 2 replies; 47+ messages in thread
From: David Marchand @ 2019-06-14 9:39 UTC (permalink / raw)
To: dev; +Cc: benjamin.walker, jerinj, anatoly.burakov, maxime.coquelin, thomas
From: Ben Walker <benjamin.walker@intel.com>
When selecting the preferred IOVA mode of the pci bus, the current
heuristic ("are devices bound?", "are devices bound to UIO?", "are pmd
drivers supporting IOVA as VA?" etc..) should honor the device
white/blacklist so that an unwanted device does not impact the decision.
There is no reason to consider a device which has no driver available.
This applies to all OS, so implements this in common code then call a
OS specific callback.
On Linux side:
- the VFIO special considerations should be evaluated only if VFIO
support is built,
- there is no strong requirement on using VA rather than PA if a driver
supports VA, so defaulting to DC in such a case.
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Signed-off-by: David Marchand <david.marchand@redhat.com>
---
drivers/bus/pci/bsd/pci.c | 9 +-
drivers/bus/pci/linux/pci.c | 191 ++++++++++++-------------------------------
drivers/bus/pci/pci_common.c | 65 +++++++++++++++
drivers/bus/pci/private.h | 8 ++
4 files changed, 131 insertions(+), 142 deletions(-)
diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c
index c7b90cb..a2de709 100644
--- a/drivers/bus/pci/bsd/pci.c
+++ b/drivers/bus/pci/bsd/pci.c
@@ -376,13 +376,14 @@
return -1;
}
-/*
- * Get iommu class of PCI devices on the bus.
- */
enum rte_iova_mode
-rte_pci_get_iommu_class(void)
+pci_device_iova_mode(const struct rte_pci_driver *pdrv __rte_unused,
+ const struct rte_pci_device *pdev)
{
/* Supports only RTE_KDRV_NIC_UIO */
+ if (pdev->kdrv != RTE_KDRV_NIC_UIO)
+ RTE_LOG(DEBUG, EAL, "Unsupported kernel driver? Defaulting to IOVA as 'PA'\n");
+
return RTE_IOVA_PA;
}
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index b931cf9..33c8ea7 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -500,95 +500,14 @@
return -1;
}
-/*
- * Is pci device bound to any kdrv
- */
-static inline int
-pci_one_device_is_bound(void)
-{
- struct rte_pci_device *dev = NULL;
- int ret = 0;
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (dev->kdrv == RTE_KDRV_UNKNOWN ||
- dev->kdrv == RTE_KDRV_NONE) {
- continue;
- } else {
- ret = 1;
- break;
- }
- }
- return ret;
-}
-
-/*
- * Any one of the device bound to uio
- */
-static inline int
-pci_one_device_bound_uio(void)
-{
- struct rte_pci_device *dev = NULL;
- struct rte_devargs *devargs;
- int need_check;
-
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- devargs = dev->device.devargs;
-
- need_check = 0;
- switch (rte_pci_bus.bus.conf.scan_mode) {
- case RTE_BUS_SCAN_WHITELIST:
- if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
- need_check = 1;
- break;
- case RTE_BUS_SCAN_UNDEFINED:
- case RTE_BUS_SCAN_BLACKLIST:
- if (devargs == NULL ||
- devargs->policy != RTE_DEV_BLACKLISTED)
- need_check = 1;
- break;
- }
-
- if (!need_check)
- continue;
-
- if (dev->kdrv == RTE_KDRV_IGB_UIO ||
- dev->kdrv == RTE_KDRV_UIO_GENERIC) {
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * Any one of the device has iova as va
- */
-static inline int
-pci_one_device_has_iova_va(void)
-{
- struct rte_pci_device *dev = NULL;
- struct rte_pci_driver *drv = NULL;
-
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- if (drv && drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if ((dev->kdrv == RTE_KDRV_VFIO ||
- dev->kdrv == RTE_KDRV_NIC_MLX) &&
- rte_pci_match(drv, dev))
- return 1;
- }
- }
- }
- return 0;
-}
-
#if defined(RTE_ARCH_X86)
static bool
-pci_one_device_iommu_support_va(struct rte_pci_device *dev)
+pci_one_device_iommu_support_va(const struct rte_pci_device *dev)
{
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3fULL << VTD_CAP_MGAW_SHIFT)
#define X86_VA_WIDTH 47 /* From Documentation/x86/x86_64/mm.txt */
- struct rte_pci_addr *addr = &dev->addr;
+ const struct rte_pci_addr *addr = &dev->addr;
char filename[PATH_MAX];
FILE *fp;
uint64_t mgaw, vtd_cap_reg = 0;
@@ -632,80 +551,76 @@
}
#elif defined(RTE_ARCH_PPC_64)
static bool
-pci_one_device_iommu_support_va(__rte_unused struct rte_pci_device *dev)
+pci_one_device_iommu_support_va(__rte_unused const struct rte_pci_device *dev)
{
return false;
}
#else
static bool
-pci_one_device_iommu_support_va(__rte_unused struct rte_pci_device *dev)
+pci_one_device_iommu_support_va(__rte_unused const struct rte_pci_device *dev)
{
return true;
}
#endif
-/*
- * All devices IOMMUs support VA as IOVA
- */
-static bool
-pci_devices_iommu_support_va(void)
+enum rte_iova_mode
+pci_device_iova_mode(const struct rte_pci_driver *pdrv,
+ const struct rte_pci_device *pdev)
{
- struct rte_pci_device *dev = NULL;
- struct rte_pci_driver *drv = NULL;
+ enum rte_iova_mode iova_mode = RTE_IOVA_DC;
+ static int iommu_no_va = -1;
- FOREACH_DRIVER_ON_PCIBUS(drv) {
- FOREACH_DEVICE_ON_PCIBUS(dev) {
- if (!rte_pci_match(drv, dev))
- continue;
- /*
- * just one PCI device needs to be checked out because
- * the IOMMU hardware is the same for all of them.
- */
- return pci_one_device_iommu_support_va(dev);
+ switch (pdev->kdrv) {
+ case RTE_KDRV_VFIO: {
+#ifdef VFIO_PRESENT
+ static int is_vfio_noiommu_enabled = -1;
+
+ if (is_vfio_noiommu_enabled == -1) {
+ if (rte_vfio_noiommu_is_enabled() == 1)
+ is_vfio_noiommu_enabled = 1;
+ else
+ is_vfio_noiommu_enabled = 0;
+ }
+ if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
+ iova_mode = RTE_IOVA_PA;
+ } else if (is_vfio_noiommu_enabled != 0) {
+ RTE_LOG(DEBUG, EAL, "Forcing to 'PA', vfio-noiommu mode configured\n");
+ iova_mode = RTE_IOVA_PA;
}
+#endif
+ break;
}
- return true;
-}
-/*
- * Get iommu class of PCI devices on the bus.
- */
-enum rte_iova_mode
-rte_pci_get_iommu_class(void)
-{
- bool is_bound;
- bool is_vfio_noiommu_enabled = true;
- bool has_iova_va;
- bool is_bound_uio;
- bool iommu_no_va;
-
- is_bound = pci_one_device_is_bound();
- if (!is_bound)
- return RTE_IOVA_DC;
-
- has_iova_va = pci_one_device_has_iova_va();
- is_bound_uio = pci_one_device_bound_uio();
- iommu_no_va = !pci_devices_iommu_support_va();
-#ifdef VFIO_PRESENT
- is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
- true : false;
-#endif
+ case RTE_KDRV_NIC_MLX:
+ if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0)
+ iova_mode = RTE_IOVA_PA;
+ break;
- if (has_iova_va && !is_bound_uio && !is_vfio_noiommu_enabled &&
- !iommu_no_va)
- return RTE_IOVA_VA;
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ iova_mode = RTE_IOVA_PA;
+ break;
- if (has_iova_va) {
- RTE_LOG(WARNING, EAL, "Some devices want iova as va but pa will be used because.. ");
- if (is_vfio_noiommu_enabled)
- RTE_LOG(WARNING, EAL, "vfio-noiommu mode configured\n");
- if (is_bound_uio)
- RTE_LOG(WARNING, EAL, "few device bound to UIO\n");
- if (iommu_no_va)
- RTE_LOG(WARNING, EAL, "IOMMU does not support IOVA as VA\n");
+ default:
+ RTE_LOG(DEBUG, EAL, "Unsupported kernel driver? Defaulting to IOVA as 'PA'\n");
+ iova_mode = RTE_IOVA_PA;
+ break;
}
- return RTE_IOVA_PA;
+ if (iova_mode != RTE_IOVA_PA) {
+ /*
+ * We can check this only once, because the IOMMU hardware is
+ * the same for all of them.
+ */
+ if (iommu_no_va == -1)
+ iommu_no_va = pci_one_device_iommu_support_va(pdev)
+ ? 0 : 1;
+ if (iommu_no_va != 0) {
+ RTE_LOG(DEBUG, EAL, "Forcing to 'PA', IOMMU does not support IOVA as 'VA'\n");
+ iova_mode = RTE_IOVA_PA;
+ }
+ }
+ return iova_mode;
}
/* Read PCI config space. */
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index 704b9d7..d2af472 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -574,6 +574,71 @@ static struct rte_devargs *pci_devargs_lookup(struct rte_pci_device *dev)
return -1;
}
+static bool
+pci_ignore_device(const struct rte_pci_device *dev)
+{
+ struct rte_devargs *devargs = dev->device.devargs;
+
+ switch (rte_pci_bus.bus.conf.scan_mode) {
+ case RTE_BUS_SCAN_WHITELIST:
+ if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
+ return false;
+ break;
+ case RTE_BUS_SCAN_UNDEFINED:
+ case RTE_BUS_SCAN_BLACKLIST:
+ if (devargs == NULL ||
+ devargs->policy != RTE_DEV_BLACKLISTED)
+ return false;
+ break;
+ }
+ return true;
+}
+
+enum rte_iova_mode
+rte_pci_get_iommu_class(void)
+{
+ enum rte_iova_mode iova_mode = RTE_IOVA_DC;
+ const struct rte_pci_device *dev;
+ const struct rte_pci_driver *drv;
+ bool devices_want_va = false;
+ bool devices_want_pa = false;
+
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (pci_ignore_device(dev))
+ continue;
+ if (dev->kdrv == RTE_KDRV_UNKNOWN ||
+ dev->kdrv == RTE_KDRV_NONE)
+ continue;
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
+ enum rte_iova_mode dev_iova_mode;
+
+ if (!rte_pci_match(drv, dev))
+ continue;
+
+ dev_iova_mode = pci_device_iova_mode(drv, dev);
+ RTE_LOG(DEBUG, EAL, "PCI driver %s for device "
+ PCI_PRI_FMT " wants IOVA as '%s'\n",
+ drv->driver.name,
+ dev->addr.domain, dev->addr.bus,
+ dev->addr.devid, dev->addr.function,
+ dev_iova_mode == RTE_IOVA_DC ? "DC" :
+ (dev_iova_mode == RTE_IOVA_PA ? "PA" : "VA"));
+ if (dev_iova_mode == RTE_IOVA_PA)
+ devices_want_pa = true;
+ else if (dev_iova_mode == RTE_IOVA_VA)
+ devices_want_va = true;
+ }
+ }
+ if (devices_want_pa) {
+ iova_mode = RTE_IOVA_PA;
+ if (devices_want_va)
+ RTE_LOG(WARNING, EAL, "Some devices want 'VA' but forcing 'PA' because other devices want it\n");
+ } else if (devices_want_va) {
+ iova_mode = RTE_IOVA_VA;
+ }
+ return iova_mode;
+}
+
struct rte_pci_bus rte_pci_bus = {
.bus = {
.scan = rte_pci_scan,
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 13c3324..8a55240 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -173,6 +173,14 @@ int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
const struct rte_pci_device *pci_dev);
/**
+ * OS specific callback for rte_pci_get_iommu_class
+ *
+ */
+enum rte_iova_mode
+pci_device_iova_mode(const struct rte_pci_driver *pci_drv,
+ const struct rte_pci_device *pci_dev);
+
+/**
* Get iommu class of PCI devices on the bus.
* And return their preferred iova mapping mode.
*
--
1.8.3.1
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode David Marchand
@ 2019-07-03 10:45 ` Burakov, Anatoly
2019-07-04 9:18 ` David Marchand
2019-07-04 17:14 ` Stephen Hemminger
1 sibling, 1 reply; 47+ messages in thread
From: Burakov, Anatoly @ 2019-07-03 10:45 UTC (permalink / raw)
To: David Marchand, dev; +Cc: benjamin.walker, jerinj, maxime.coquelin, thomas
On 14-Jun-19 10:39 AM, David Marchand wrote:
> From: Ben Walker <benjamin.walker@intel.com>
>
> When selecting the preferred IOVA mode of the pci bus, the current
> heuristic ("are devices bound?", "are devices bound to UIO?", "are pmd
> drivers supporting IOVA as VA?" etc..) should honor the device
> white/blacklist so that an unwanted device does not impact the decision.
>
> There is no reason to consider a device which has no driver available.
>
> This applies to all OS, so implements this in common code then call a
> OS specific callback.
>
> On Linux side:
> - the VFIO special considerations should be evaluated only if VFIO
> support is built,
> - there is no strong requirement on using VA rather than PA if a driver
> supports VA, so defaulting to DC in such a case.
>
> Signed-off-by: Ben Walker <benjamin.walker@intel.com>
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
<snip>
> + const struct rte_pci_device *pdev)
> {
> - struct rte_pci_device *dev = NULL;
> - struct rte_pci_driver *drv = NULL;
> + enum rte_iova_mode iova_mode = RTE_IOVA_DC;
> + static int iommu_no_va = -1;
>
> - FOREACH_DRIVER_ON_PCIBUS(drv) {
> - FOREACH_DEVICE_ON_PCIBUS(dev) {
> - if (!rte_pci_match(drv, dev))
> - continue;
> - /*
> - * just one PCI device needs to be checked out because
> - * the IOMMU hardware is the same for all of them.
> - */
> - return pci_one_device_iommu_support_va(dev);
> + switch (pdev->kdrv) {
> + case RTE_KDRV_VFIO: {
> +#ifdef VFIO_PRESENT
> + static int is_vfio_noiommu_enabled = -1;
> +
> + if (is_vfio_noiommu_enabled == -1) {
> + if (rte_vfio_noiommu_is_enabled() == 1)
> + is_vfio_noiommu_enabled = 1;
> + else
> + is_vfio_noiommu_enabled = 0;
> + }
> + if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
> + iova_mode = RTE_IOVA_PA;
> + } else if (is_vfio_noiommu_enabled != 0) {
> + RTE_LOG(DEBUG, EAL, "Forcing to 'PA', vfio-noiommu mode configured\n");
> + iova_mode = RTE_IOVA_PA;
> }
> +#endif
> + break;
I'm not too well-versed in bus code, so please excuse my ignorance of
this codebase.
It seems that we would be ignoring drv_flags in case VFIO wasn't
compiled - if the driver has no RTE_PCI_DRV_IOVA_AS_VA flag, i'm pretty
sure we can set IOVA mode to PA without caring about VFIO at all. I
think it would be better to have something like this:
if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
iova_mode = RTE_IOVA_PA;
break; // early exit
}
#ifdef VFIO_PRESENT
static int is_vfio_noiommu_enabled = -1;
if (is_vfio_noiommu_enabled == -1) {
if (rte_vfio_noiommu_is_enabled() == 1)
is_vfio_noiommu_enabled = 1;
else
is_vfio_noiommu_enabled = 0;
}
if (is_vfio_noiommu_enabled != 0) {
iova_mode = RTE_IOVA_PA;
}
#endif
break;
In fact, could we not check if devices support both flags, and do an
early exit in case they don't?
Something like this, for example:
if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
return RTE_IOVA_PA; // early exit - device only wants PA
}
// device supports both PA and VA modes, so do more checks
switch (pdev->kdrv) {
...
}
Unless i'm missing something, this would look much simpler and easier to
understand.
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-07-03 10:45 ` Burakov, Anatoly
@ 2019-07-04 9:18 ` David Marchand
2019-07-04 10:43 ` Burakov, Anatoly
0 siblings, 1 reply; 47+ messages in thread
From: David Marchand @ 2019-07-04 9:18 UTC (permalink / raw)
To: Burakov, Anatoly
Cc: dev, Ben Walker, Jerin Jacob Kollanukkaran, Maxime Coquelin,
Thomas Monjalon
On Wed, Jul 3, 2019 at 12:45 PM Burakov, Anatoly <anatoly.burakov@intel.com>
wrote:
> On 14-Jun-19 10:39 AM, David Marchand wrote:
> > From: Ben Walker <benjamin.walker@intel.com>
> >
> > When selecting the preferred IOVA mode of the pci bus, the current
> > heuristic ("are devices bound?", "are devices bound to UIO?", "are pmd
> > drivers supporting IOVA as VA?" etc..) should honor the device
> > white/blacklist so that an unwanted device does not impact the decision.
> >
> > There is no reason to consider a device which has no driver available.
> >
> > This applies to all OS, so implements this in common code then call a
> > OS specific callback.
> >
> > On Linux side:
> > - the VFIO special considerations should be evaluated only if VFIO
> > support is built,
> > - there is no strong requirement on using VA rather than PA if a driver
> > supports VA, so defaulting to DC in such a case.
> >
> > Signed-off-by: Ben Walker <benjamin.walker@intel.com>
> > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > ---
>
> <snip>
>
> > + const struct rte_pci_device *pdev)
> > {
> > - struct rte_pci_device *dev = NULL;
> > - struct rte_pci_driver *drv = NULL;
> > + enum rte_iova_mode iova_mode = RTE_IOVA_DC;
> > + static int iommu_no_va = -1;
> >
> > - FOREACH_DRIVER_ON_PCIBUS(drv) {
> > - FOREACH_DEVICE_ON_PCIBUS(dev) {
> > - if (!rte_pci_match(drv, dev))
> > - continue;
> > - /*
> > - * just one PCI device needs to be checked out
> because
> > - * the IOMMU hardware is the same for all of them.
> > - */
> > - return pci_one_device_iommu_support_va(dev);
> > + switch (pdev->kdrv) {
> > + case RTE_KDRV_VFIO: {
> > +#ifdef VFIO_PRESENT
> > + static int is_vfio_noiommu_enabled = -1;
> > +
> > + if (is_vfio_noiommu_enabled == -1) {
> > + if (rte_vfio_noiommu_is_enabled() == 1)
> > + is_vfio_noiommu_enabled = 1;
> > + else
> > + is_vfio_noiommu_enabled = 0;
> > + }
> > + if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
> > + iova_mode = RTE_IOVA_PA;
> > + } else if (is_vfio_noiommu_enabled != 0) {
> > + RTE_LOG(DEBUG, EAL, "Forcing to 'PA', vfio-noiommu
> mode configured\n");
> > + iova_mode = RTE_IOVA_PA;
> > }
> > +#endif
> > + break;
>
> I'm not too well-versed in bus code, so please excuse my ignorance of
> this codebase.
>
> It seems that we would be ignoring drv_flags in case VFIO wasn't
> compiled - if the driver has no RTE_PCI_DRV_IOVA_AS_VA flag, i'm pretty
> sure we can set IOVA mode to PA without caring about VFIO at all. I
> think it would be better to have something like this:
>
> if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
> iova_mode = RTE_IOVA_PA;
> break; // early exit
> }
>
If the device is bound to VFIO, but the dpdk binary has no vfio support, we
don't need to consider this device in the decision.
Did I miss something in what you suggest?
--
David Marchand
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-07-04 9:18 ` David Marchand
@ 2019-07-04 10:43 ` Burakov, Anatoly
2019-07-04 10:47 ` David Marchand
0 siblings, 1 reply; 47+ messages in thread
From: Burakov, Anatoly @ 2019-07-04 10:43 UTC (permalink / raw)
To: David Marchand
Cc: dev, Ben Walker, Jerin Jacob Kollanukkaran, Maxime Coquelin,
Thomas Monjalon
On 04-Jul-19 10:18 AM, David Marchand wrote:
>
>
> On Wed, Jul 3, 2019 at 12:45 PM Burakov, Anatoly
> <anatoly.burakov@intel.com <mailto:anatoly.burakov@intel.com>> wrote:
>
> On 14-Jun-19 10:39 AM, David Marchand wrote:
> > From: Ben Walker <benjamin.walker@intel.com
> <mailto:benjamin.walker@intel.com>>
> >
> > When selecting the preferred IOVA mode of the pci bus, the current
> > heuristic ("are devices bound?", "are devices bound to UIO?",
> "are pmd
> > drivers supporting IOVA as VA?" etc..) should honor the device
> > white/blacklist so that an unwanted device does not impact the
> decision.
> >
> > There is no reason to consider a device which has no driver
> available.
> >
> > This applies to all OS, so implements this in common code then call a
> > OS specific callback.
> >
> > On Linux side:
> > - the VFIO special considerations should be evaluated only if VFIO
> > support is built,
> > - there is no strong requirement on using VA rather than PA if a
> driver
> > supports VA, so defaulting to DC in such a case.
> >
> > Signed-off-by: Ben Walker <benjamin.walker@intel.com
> <mailto:benjamin.walker@intel.com>>
> > Signed-off-by: David Marchand <david.marchand@redhat.com
> <mailto:david.marchand@redhat.com>>
> > ---
>
> <snip>
>
> > + const struct rte_pci_device *pdev)
> > {
> > - struct rte_pci_device *dev = NULL;
> > - struct rte_pci_driver *drv = NULL;
> > + enum rte_iova_mode iova_mode = RTE_IOVA_DC;
> > + static int iommu_no_va = -1;
> >
> > - FOREACH_DRIVER_ON_PCIBUS(drv) {
> > - FOREACH_DEVICE_ON_PCIBUS(dev) {
> > - if (!rte_pci_match(drv, dev))
> > - continue;
> > - /*
> > - * just one PCI device needs to be checked
> out because
> > - * the IOMMU hardware is the same for all
> of them.
> > - */
> > - return pci_one_device_iommu_support_va(dev);
> > + switch (pdev->kdrv) {
> > + case RTE_KDRV_VFIO: {
> > +#ifdef VFIO_PRESENT
> > + static int is_vfio_noiommu_enabled = -1;
> > +
> > + if (is_vfio_noiommu_enabled == -1) {
> > + if (rte_vfio_noiommu_is_enabled() == 1)
> > + is_vfio_noiommu_enabled = 1;
> > + else
> > + is_vfio_noiommu_enabled = 0;
> > + }
> > + if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
> > + iova_mode = RTE_IOVA_PA;
> > + } else if (is_vfio_noiommu_enabled != 0) {
> > + RTE_LOG(DEBUG, EAL, "Forcing to 'PA',
> vfio-noiommu mode configured\n");
> > + iova_mode = RTE_IOVA_PA;
> > }
> > +#endif
> > + break;
>
> I'm not too well-versed in bus code, so please excuse my ignorance of
> this codebase.
>
> It seems that we would be ignoring drv_flags in case VFIO wasn't
> compiled - if the driver has no RTE_PCI_DRV_IOVA_AS_VA flag, i'm pretty
> sure we can set IOVA mode to PA without caring about VFIO at all. I
> think it would be better to have something like this:
>
> if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
> iova_mode = RTE_IOVA_PA;
> break; // early exit
> }
>
>
> If the device is bound to VFIO, but the dpdk binary has no vfio support,
> we don't need to consider this device in the decision.
> Did I miss something in what you suggest?
>
Yep, you're correct :)
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>
> --
> David Marchand
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-07-04 10:43 ` Burakov, Anatoly
@ 2019-07-04 10:47 ` David Marchand
0 siblings, 0 replies; 47+ messages in thread
From: David Marchand @ 2019-07-04 10:47 UTC (permalink / raw)
To: Burakov, Anatoly
Cc: dev, Ben Walker, Jerin Jacob Kollanukkaran, Maxime Coquelin,
Thomas Monjalon
On Thu, Jul 4, 2019 at 12:44 PM Burakov, Anatoly <anatoly.burakov@intel.com>
wrote:
> On 04-Jul-19 10:18 AM, David Marchand wrote:
> >
> >
> > On Wed, Jul 3, 2019 at 12:45 PM Burakov, Anatoly
> > <anatoly.burakov@intel.com <mailto:anatoly.burakov@intel.com>> wrote:
> >
> > On 14-Jun-19 10:39 AM, David Marchand wrote:
> > > From: Ben Walker <benjamin.walker@intel.com
> > <mailto:benjamin.walker@intel.com>>
> > >
> > > When selecting the preferred IOVA mode of the pci bus, the current
> > > heuristic ("are devices bound?", "are devices bound to UIO?",
> > "are pmd
> > > drivers supporting IOVA as VA?" etc..) should honor the device
> > > white/blacklist so that an unwanted device does not impact the
> > decision.
> > >
> > > There is no reason to consider a device which has no driver
> > available.
> > >
> > > This applies to all OS, so implements this in common code then
> call a
> > > OS specific callback.
> > >
> > > On Linux side:
> > > - the VFIO special considerations should be evaluated only if VFIO
> > > support is built,
> > > - there is no strong requirement on using VA rather than PA if a
> > driver
> > > supports VA, so defaulting to DC in such a case.
> > >
> > > Signed-off-by: Ben Walker <benjamin.walker@intel.com
> > <mailto:benjamin.walker@intel.com>>
> > > Signed-off-by: David Marchand <david.marchand@redhat.com
> > <mailto:david.marchand@redhat.com>>
> > > ---
> >
> > <snip>
> >
> > > + const struct rte_pci_device *pdev)
> > > {
> > > - struct rte_pci_device *dev = NULL;
> > > - struct rte_pci_driver *drv = NULL;
> > > + enum rte_iova_mode iova_mode = RTE_IOVA_DC;
> > > + static int iommu_no_va = -1;
> > >
> > > - FOREACH_DRIVER_ON_PCIBUS(drv) {
> > > - FOREACH_DEVICE_ON_PCIBUS(dev) {
> > > - if (!rte_pci_match(drv, dev))
> > > - continue;
> > > - /*
> > > - * just one PCI device needs to be checked
> > out because
> > > - * the IOMMU hardware is the same for all
> > of them.
> > > - */
> > > - return pci_one_device_iommu_support_va(dev);
> > > + switch (pdev->kdrv) {
> > > + case RTE_KDRV_VFIO: {
> > > +#ifdef VFIO_PRESENT
> > > + static int is_vfio_noiommu_enabled = -1;
> > > +
> > > + if (is_vfio_noiommu_enabled == -1) {
> > > + if (rte_vfio_noiommu_is_enabled() == 1)
> > > + is_vfio_noiommu_enabled = 1;
> > > + else
> > > + is_vfio_noiommu_enabled = 0;
> > > + }
> > > + if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) ==
> 0) {
> > > + iova_mode = RTE_IOVA_PA;
> > > + } else if (is_vfio_noiommu_enabled != 0) {
> > > + RTE_LOG(DEBUG, EAL, "Forcing to 'PA',
> > vfio-noiommu mode configured\n");
> > > + iova_mode = RTE_IOVA_PA;
> > > }
> > > +#endif
> > > + break;
> >
> > I'm not too well-versed in bus code, so please excuse my ignorance of
> > this codebase.
> >
> > It seems that we would be ignoring drv_flags in case VFIO wasn't
> > compiled - if the driver has no RTE_PCI_DRV_IOVA_AS_VA flag, i'm
> pretty
> > sure we can set IOVA mode to PA without caring about VFIO at all. I
> > think it would be better to have something like this:
> >
> > if ((pdrv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) == 0) {
> > iova_mode = RTE_IOVA_PA;
> > break; // early exit
> > }
> >
> >
> > If the device is bound to VFIO, but the dpdk binary has no vfio support,
> > we don't need to consider this device in the decision.
> > Did I miss something in what you suggest?
> >
>
> Yep, you're correct :)
>
> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>
Cool, thanks Anatoly!
--
David Marchand
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode David Marchand
2019-07-03 10:45 ` Burakov, Anatoly
@ 2019-07-04 17:14 ` Stephen Hemminger
2019-07-05 7:58 ` David Marchand
2019-07-05 8:26 ` Thomas Monjalon
1 sibling, 2 replies; 47+ messages in thread
From: Stephen Hemminger @ 2019-07-04 17:14 UTC (permalink / raw)
To: David Marchand
Cc: dev, benjamin.walker, jerinj, anatoly.burakov, maxime.coquelin, thomas
On Fri, 14 Jun 2019 11:39:17 +0200
David Marchand <david.marchand@redhat.com> wrote:
> /* Supports only RTE_KDRV_NIC_UIO */
> + if (pdev->kdrv != RTE_KDRV_NIC_UIO)
> + RTE_LOG(DEBUG, EAL, "Unsupported kernel driver? Defaulting to IOVA as 'PA'\n");
Maybe NOTICE level, rather than DEBUG which is usually suppressed.
> + } else if (is_vfio_noiommu_enabled != 0) {
> + RTE_LOG(DEBUG, EAL, "Forcing to 'PA', vfio-noiommu mode configured\n");
ditto
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-07-04 17:14 ` Stephen Hemminger
@ 2019-07-05 7:58 ` David Marchand
2019-07-05 16:27 ` Stephen Hemminger
2019-07-05 8:26 ` Thomas Monjalon
1 sibling, 1 reply; 47+ messages in thread
From: David Marchand @ 2019-07-05 7:58 UTC (permalink / raw)
To: Stephen Hemminger
Cc: dev, Ben Walker, Jerin Jacob Kollanukkaran, Burakov, Anatoly,
Maxime Coquelin, Thomas Monjalon
On Thu, Jul 4, 2019 at 7:14 PM Stephen Hemminger <stephen@networkplumber.org>
wrote:
> On Fri, 14 Jun 2019 11:39:17 +0200
> David Marchand <david.marchand@redhat.com> wrote:
>
> > /* Supports only RTE_KDRV_NIC_UIO */
> > + if (pdev->kdrv != RTE_KDRV_NIC_UIO)
> > + RTE_LOG(DEBUG, EAL, "Unsupported kernel driver? Defaulting
> to IOVA as 'PA'\n");
>
> Maybe NOTICE level, rather than DEBUG which is usually suppressed.
>
What do you mean by "suppressed" ?
We had a hardwired log level some time ago, but it has been removed and all
logs are in the binaries now.
https://git.dpdk.org/dpdk/commit/?id=5d8f0baf69ea
> > + } else if (is_vfio_noiommu_enabled != 0) {
> > + RTE_LOG(DEBUG, EAL, "Forcing to 'PA', vfio-noiommu
> mode configured\n");
> ditto
>
>
--
David Marchand
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-07-05 7:58 ` David Marchand
@ 2019-07-05 16:27 ` Stephen Hemminger
0 siblings, 0 replies; 47+ messages in thread
From: Stephen Hemminger @ 2019-07-05 16:27 UTC (permalink / raw)
To: David Marchand
Cc: dev, Ben Walker, Jerin Jacob Kollanukkaran, Burakov, Anatoly,
Maxime Coquelin, Thomas Monjalon
On Fri, 5 Jul 2019 09:58:44 +0200
David Marchand <david.marchand@redhat.com> wrote:
> On Thu, Jul 4, 2019 at 7:14 PM Stephen Hemminger <stephen@networkplumber.org>
> wrote:
>
> > On Fri, 14 Jun 2019 11:39:17 +0200
> > David Marchand <david.marchand@redhat.com> wrote:
> >
> > > /* Supports only RTE_KDRV_NIC_UIO */
> > > + if (pdev->kdrv != RTE_KDRV_NIC_UIO)
> > > + RTE_LOG(DEBUG, EAL, "Unsupported kernel driver? Defaulting
> > to IOVA as 'PA'\n");
> >
> > Maybe NOTICE level, rather than DEBUG which is usually suppressed.
> >
>
> What do you mean by "suppressed" ?
>
> We had a hardwired log level some time ago, but it has been removed and all
> logs are in the binaries now.
> https://git.dpdk.org/dpdk/commit/?id=5d8f0baf69ea
Unless user increases the log level, the default is to not print debug logs.
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode
2019-07-04 17:14 ` Stephen Hemminger
2019-07-05 7:58 ` David Marchand
@ 2019-07-05 8:26 ` Thomas Monjalon
1 sibling, 0 replies; 47+ messages in thread
From: Thomas Monjalon @ 2019-07-05 8:26 UTC (permalink / raw)
To: Stephen Hemminger, David Marchand
Cc: dev, benjamin.walker, jerinj, anatoly.burakov, maxime.coquelin
04/07/2019 19:14, Stephen Hemminger:
> On Fri, 14 Jun 2019 11:39:17 +0200
> David Marchand <david.marchand@redhat.com> wrote:
>
> > /* Supports only RTE_KDRV_NIC_UIO */
> > + if (pdev->kdrv != RTE_KDRV_NIC_UIO)
> > + RTE_LOG(DEBUG, EAL, "Unsupported kernel driver? Defaulting to IOVA as 'PA'\n");
>
> Maybe NOTICE level, rather than DEBUG which is usually suppressed.
DEBUG may be enough here, as it is not something unexpected.
The IOVA choice will be printed in INFO level:
RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/3] Improve automatic selection of IOVA mode
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 0/3] " David Marchand
` (2 preceding siblings ...)
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 3/3] bus/pci: only consider usable devices to select IOVA mode David Marchand
@ 2019-06-27 17:05 ` Thomas Monjalon
2019-07-02 14:18 ` Thomas Monjalon
2019-07-05 14:57 ` Thomas Monjalon
4 siblings, 1 reply; 47+ messages in thread
From: Thomas Monjalon @ 2019-06-27 17:05 UTC (permalink / raw)
To: dev
Cc: David Marchand, benjamin.walker, jerinj, anatoly.burakov,
maxime.coquelin, hemant.agrawal, honnappa.nagarahalli,
bruce.richardson, ferruh.yigit
Help in review would be much appreciated here, thanks.
14/06/2019 11:39, David Marchand:
> In SPDK, not all drivers are registered with DPDK at start up time.
> Previously, that meant DPDK always chose to set itself up in IOVA_PA
> mode. Instead, when the correct iova choice is unclear based on the
> devices and drivers known to DPDK at start up time, use other heuristics
> (such as whether /proc/self/pagemap is accessible) to make a better
> choice.
>
> This enables SPDK to run as an unprivileged user again without requiring
> users to explicitly set the iova mode on the command line.
>
>
> Changelog since v1:
> - I took over the series following experiments and discussions with Ben
> and others, squashed Ben patches as two patches focusing on the main
> issues,
> - introduced a fix on KNI,
> - on the EAL bits,
>
> - added log on which IOVA mode has been selected,
> - updated BSD EAL,
> - in Linux EAL, moved KNI special case after IOVA selection,
> - in Linux EAL, added check on forced mode wrt physical addresses
> availability,
>
> - on the PCI bus driver bits,
>
> - enforced the checks in the common code of the PCI bus,
> - added debug logs to track why a iova mode has been chosen per device,
> - added BSD part,
> - in Linux part, checked that VFIO is enabled,
> - in Linux part, defaulted to DC if a driver supports both PA and VA,
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/3] Improve automatic selection of IOVA mode
2019-06-27 17:05 ` [dpdk-dev] [PATCH v2 0/3] Improve automatic selection of " Thomas Monjalon
@ 2019-07-02 14:18 ` Thomas Monjalon
0 siblings, 0 replies; 47+ messages in thread
From: Thomas Monjalon @ 2019-07-02 14:18 UTC (permalink / raw)
To: dev
Cc: David Marchand, benjamin.walker, jerinj, anatoly.burakov,
maxime.coquelin, hemant.agrawal, honnappa.nagarahalli,
bruce.richardson, ferruh.yigit
No review at all? I can merge then?
27/06/2019 19:05, Thomas Monjalon:
> Help in review would be much appreciated here, thanks.
>
> 14/06/2019 11:39, David Marchand:
> > In SPDK, not all drivers are registered with DPDK at start up time.
> > Previously, that meant DPDK always chose to set itself up in IOVA_PA
> > mode. Instead, when the correct iova choice is unclear based on the
> > devices and drivers known to DPDK at start up time, use other heuristics
> > (such as whether /proc/self/pagemap is accessible) to make a better
> > choice.
> >
> > This enables SPDK to run as an unprivileged user again without requiring
> > users to explicitly set the iova mode on the command line.
> >
> >
> > Changelog since v1:
> > - I took over the series following experiments and discussions with Ben
> > and others, squashed Ben patches as two patches focusing on the main
> > issues,
> > - introduced a fix on KNI,
> > - on the EAL bits,
> >
> > - added log on which IOVA mode has been selected,
> > - updated BSD EAL,
> > - in Linux EAL, moved KNI special case after IOVA selection,
> > - in Linux EAL, added check on forced mode wrt physical addresses
> > availability,
> >
> > - on the PCI bus driver bits,
> >
> > - enforced the checks in the common code of the PCI bus,
> > - added debug logs to track why a iova mode has been chosen per device,
> > - added BSD part,
> > - in Linux part, checked that VFIO is enabled,
> > - in Linux part, defaulted to DC if a driver supports both PA and VA,
>
>
>
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/3] Improve automatic selection of IOVA mode
2019-06-14 9:39 ` [dpdk-dev] [PATCH v2 0/3] " David Marchand
` (3 preceding siblings ...)
2019-06-27 17:05 ` [dpdk-dev] [PATCH v2 0/3] Improve automatic selection of " Thomas Monjalon
@ 2019-07-05 14:57 ` Thomas Monjalon
4 siblings, 0 replies; 47+ messages in thread
From: Thomas Monjalon @ 2019-07-05 14:57 UTC (permalink / raw)
To: David Marchand
Cc: dev, benjamin.walker, jerinj, anatoly.burakov, maxime.coquelin
14/06/2019 11:39, David Marchand:
> In SPDK, not all drivers are registered with DPDK at start up time.
> Previously, that meant DPDK always chose to set itself up in IOVA_PA
> mode. Instead, when the correct iova choice is unclear based on the
> devices and drivers known to DPDK at start up time, use other heuristics
> (such as whether /proc/self/pagemap is accessible) to make a better
> choice.
>
> This enables SPDK to run as an unprivileged user again without requiring
> users to explicitly set the iova mode on the command line.
Applied, thanks
^ permalink raw reply [flat|nested] 47+ messages in thread