DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support
@ 2018-05-25 16:35 Adrien Mazarguil
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 1/7] net/mlx5: rename confusing object in probe code Adrien Mazarguil
                   ` (7 more replies)
  0 siblings, 8 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-05-25 16:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This series adds support for port (VF) representors to the mlx5 PMD, which
can be instantiated using the standard "representor" device parameter.

Note the PMD only probes existing representors which exist as Verbs devices;
their creation is part of the host system configuration.

Adrien Mazarguil (7):
  net/mlx5: rename confusing object in probe code
  net/mlx5: remove redundant objects in probe code
  net/mlx5: split PCI from generic probing code
  net/mlx5: re-indent generic probing function
  net/mlx5: add port representor awareness
  net/mlx5: probe all port representors
  net/mlx5: add parameter for port representors

 doc/guides/nics/mlx5.rst                |   12 +
 doc/guides/prog_guide/poll_mode_drv.rst |    2 +
 drivers/net/mlx5/mlx5.c                 | 1040 +++++++++++++++-----------
 drivers/net/mlx5/mlx5.h                 |    8 +-
 drivers/net/mlx5/mlx5_ethdev.c          |  145 +++-
 drivers/net/mlx5/mlx5_mac.c             |    2 +-
 drivers/net/mlx5/mlx5_stats.c           |    6 +-
 7 files changed, 764 insertions(+), 451 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH 1/7] net/mlx5: rename confusing object in probe code
  2018-05-25 16:35 [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support Adrien Mazarguil
@ 2018-05-25 16:35 ` Adrien Mazarguil
  2018-06-10 11:00   ` Xueming(Steven) Li
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 2/7] net/mlx5: remove redundant objects " Adrien Mazarguil
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-05-25 16:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

There are several attribute objects in this function:

- IB device attributes (struct ibv_device_attr_ex device_attr).
- Direct Verbs attributes (struct mlx5dv_context attrs_out).
- Port attributes (struct ibv_port_attr).
- IB device attributes again (struct ibv_device_attr_ex device_attr_ex).

"attrs_out" is both odd and initialized using a nonstandard syntax. Rename
it "dv_attr" for consistency.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 9b78f9879..602f952ca 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -652,6 +652,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
 	struct ibv_context *attr_ctx = NULL;
 	struct ibv_device_attr_ex device_attr;
@@ -668,7 +669,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
 	int i;
-	struct mlx5dv_context attrs_out = {0};
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc;
 #endif
@@ -734,21 +734,21 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	ibv_dev = list[i];
 	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
 	/*
 	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
 	 * as all ConnectX-5 devices.
 	 */
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
-	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
-		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+	mlx5_glue->dv_query_device(ctx, &dv_attr);
+	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
 			mps = MLX5_MPW_ENHANCED;
 		} else {
@@ -760,14 +760,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		mps = MLX5_MPW_DISABLED;
 	}
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
-		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
+		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
-			attrs_out.striding_rq_caps;
+			dv_attr.striding_rq_caps;
 
 		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
 			mprq_caps.min_single_stride_log_num_of_bytes);
@@ -792,15 +792,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
-	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
+	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
-		tunnel_en = ((attrs_out.tunnel_offloads_caps &
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
+		tunnel_en = ((dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
-			     (attrs_out.tunnel_offloads_caps &
+			     (dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
 	}
 	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
@@ -810,9 +810,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
-	mpls_en = ((attrs_out.tunnel_offloads_caps &
+	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
-		   (attrs_out.tunnel_offloads_caps &
+		   (dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
 	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
 		mpls_en ? "" : "not ");
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH 2/7] net/mlx5: remove redundant objects in probe code
  2018-05-25 16:35 [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support Adrien Mazarguil
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 1/7] net/mlx5: rename confusing object in probe code Adrien Mazarguil
@ 2018-05-25 16:35 ` Adrien Mazarguil
  2018-06-10 11:00   ` Xueming(Steven) Li
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code Adrien Mazarguil
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-05-25 16:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This patch gets rid of redundant calls to open the device and query its
attributes in order to simplify the code.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.c | 60 +++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 32 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 602f952ca..41a542ebc 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -652,10 +652,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct ibv_context *ctx = NULL;
+	struct ibv_device_attr_ex attr;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
-	struct ibv_context *attr_ctx = NULL;
-	struct ibv_device_attr_ex device_attr;
 	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
@@ -712,12 +712,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
 		      (pci_dev->id.device_id ==
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		attr_ctx = mlx5_glue->open_device(list[i]);
+		ctx = mlx5_glue->open_device(list[i]);
 		rte_errno = errno;
 		err = rte_errno;
 		break;
 	}
-	if (attr_ctx == NULL) {
+	if (ctx == NULL) {
 		switch (err) {
 		case 0:
 			DRV_LOG(ERR,
@@ -820,23 +820,20 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
+	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	if (err) {
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected",
-		device_attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
 		char name[RTE_ETH_NAME_MAX_LEN];
 		int len;
 		uint32_t port = i + 1; /* ports are indexed from one */
-		struct ibv_context *ctx = NULL;
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
 		struct rte_eth_dev *eth_dev = NULL;
-		struct ibv_device_attr_ex device_attr_ex;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -863,7 +860,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
 			 pci_dev->addr.domain, pci_dev->addr.bus,
 			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (device_attr.orig_attr.phys_port_cnt > 1)
+		if (attr.orig_attr.phys_port_cnt > 1)
 			snprintf(name + len, sizeof(name), " port %u", i);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
@@ -905,7 +902,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			continue;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		ctx = mlx5_glue->open_device(ibv_dev);
+		if (!ctx)
+			ctx = mlx5_glue->open_device(ibv_dev);
 		if (ctx == NULL) {
 			err = ENODEV;
 			goto port_error;
@@ -947,7 +945,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = device_attr;
+		priv->device_attr = attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
@@ -958,17 +956,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				strerror(rte_errno));
 			goto port_error;
 		}
-		err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
-		if (err) {
-			DRV_LOG(ERR, "ibv_query_device_ex() failed");
-			goto port_error;
-		}
-		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
+		config.hw_csum = !!(attr.device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!(device_attr.max_counter_sets);
+		config.flow_counter_en = !!attr.max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -976,7 +969,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
+			attr.rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -984,29 +977,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
+		config.hw_padding = !!attr.rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
-			      (device_attr_ex.tso_caps.supported_qpts &
+		config.tso = (attr.tso_caps.max_tso > 0 &&
+			      (attr.tso_caps.supported_qpts &
 			      (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz =
-					device_attr_ex.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr.tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1153,14 +1145,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
+		/*
+		 * Each eth_dev instance is assigned its own Verbs context,
+		 * since this one is consumed, let the next iteration open
+		 * another.
+		 */
+		ctx = NULL;
 		continue;
 port_error:
 		if (priv)
 			rte_free(priv);
 		if (pd)
 			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (ctx)
-			claim_zero(mlx5_glue->close_device(ctx));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
 		break;
@@ -1172,8 +1168,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	 * way to enumerate the registered ethdevs to free the previous ones.
 	 */
 error:
-	if (attr_ctx)
-		claim_zero(mlx5_glue->close_device(attr_ctx));
+	if (ctx)
+		claim_zero(mlx5_glue->close_device(ctx));
 	if (list)
 		mlx5_glue->free_device_list(list);
 	if (err) {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code
  2018-05-25 16:35 [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support Adrien Mazarguil
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 1/7] net/mlx5: rename confusing object in probe code Adrien Mazarguil
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 2/7] net/mlx5: remove redundant objects " Adrien Mazarguil
@ 2018-05-25 16:35 ` Adrien Mazarguil
  2018-06-10 12:59   ` Xueming(Steven) Li
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 4/7] net/mlx5: re-indent generic probing function Adrien Mazarguil
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-05-25 16:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

All the generic probing code needs is an IB device. While this device is
currently supplied by a PCI lookup, other methods will be added soon.

This patch divides the original function, which has become huge over time,
as follows:

1. PCI-specific (mlx5_pci_probe()).
2. All ports of a Verbs device (mlx5_dev_spawn()).
3. A given port of a Verbs device (mlx5_dev_spawn_one()).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.c | 332 ++++++++++++++++++++++++++-----------------
 1 file changed, 201 insertions(+), 131 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 41a542ebc..7a812ef93 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -633,30 +633,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 }
 
 /**
- * DPDK callback to register a PCI device.
- *
- * This function creates an Ethernet device for each port of a given
- * PCI device.
+ * Spawn an Ethernet device from Verbs information.
  *
- * @param[in] pci_drv
- *   PCI driver structure (mlx5_driver).
- * @param[in] pci_dev
- *   PCI device information.
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ * @param[in] attr
+ *   Verbs device attributes.
+ * @param port
+ *   Verbs port to use (indexed from 1).
  *
  * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
+ *   A valid Ethernet device object on success, NULL otherwise and rte_errno
+ *   is set.
  */
-static int
-mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
-	       struct rte_pci_device *pci_dev)
+static struct rte_eth_dev *
+mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
+		   struct ibv_device *ibv_dev,
+		   int vf,
+		   const struct ibv_device_attr_ex *attr,
+		   unsigned int port)
 {
-	struct ibv_device **list = NULL;
-	struct ibv_device *ibv_dev;
-	struct ibv_context *ctx = NULL;
-	struct ibv_device_attr_ex attr;
+	struct ibv_context *ctx;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct rte_eth_dev *eth_dev = NULL;
 	int err = 0;
-	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
 	unsigned int tunnel_en = 0;
@@ -668,71 +672,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_max_stride_size_n = 0;
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
-	int i;
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc;
 #endif
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
-	assert(pci_drv == &mlx5_driver);
-	list = mlx5_glue->get_device_list(&i);
-	if (list == NULL) {
-		assert(errno);
-		err = errno;
-		if (errno == ENOSYS)
-			DRV_LOG(ERR,
-				"cannot list devices, is ib_uverbs loaded?");
-		goto error;
-	}
-	assert(i >= 0);
-	/*
-	 * For each listed device, check related sysfs entry against
-	 * the provided PCI ID.
-	 */
-	while (i != 0) {
-		struct rte_pci_addr pci_addr;
-
-		--i;
-		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
-		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
-			continue;
-		if ((pci_dev->addr.domain != pci_addr.domain) ||
-		    (pci_dev->addr.bus != pci_addr.bus) ||
-		    (pci_dev->addr.devid != pci_addr.devid) ||
-		    (pci_dev->addr.function != pci_addr.function))
-			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
-			list[i]->name);
-		vf = ((pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		ctx = mlx5_glue->open_device(list[i]);
-		rte_errno = errno;
-		err = rte_errno;
-		break;
-	}
-	if (ctx == NULL) {
-		switch (err) {
-		case 0:
-			DRV_LOG(ERR,
-				"cannot access device, is mlx5_ib loaded?");
-			err = ENODEV;
-			break;
-		case EINVAL:
-			DRV_LOG(ERR,
-				"cannot use device, are drivers up to date?");
-			break;
-		}
-		goto error;
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		return NULL;
 	}
-	ibv_dev = list[i];
-	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -820,20 +771,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
-	if (err) {
-		DEBUG("ibv_query_device_ex() failed");
-		goto error;
-	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+	{
 		char name[RTE_ETH_NAME_MAX_LEN];
-		int len;
-		uint32_t port = i + 1; /* ports are indexed from one */
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
-		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -857,11 +799,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
-			 pci_dev->addr.domain, pci_dev->addr.bus,
-			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (attr.orig_attr.phys_port_cnt > 1)
-			snprintf(name + len, sizeof(name), " port %u", i);
+		if (attr->orig_attr.phys_port_cnt > 1)
+			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
+		else
+			snprintf(name, sizeof(name), "%s port %u",
+				 dpdk_dev->name, port);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -870,7 +812,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				err = rte_errno;
 				goto error;
 			}
-			eth_dev->device = &pci_dev->device;
+			eth_dev->device = dpdk_dev;
 			eth_dev->dev_ops = &mlx5_dev_sec_ops;
 			err = mlx5_uar_init_secondary(eth_dev);
 			if (err) {
@@ -898,16 +840,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				mlx5_select_rx_function(eth_dev);
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
-			rte_eth_dev_probing_finish(eth_dev);
-			continue;
+			mlx5_glue->close_device(ctx);
+			return eth_dev;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		if (!ctx)
-			ctx = mlx5_glue->open_device(ibv_dev);
-		if (ctx == NULL) {
-			err = ENODEV;
-			goto port_error;
-		}
 		/* Check port status. */
 		err = mlx5_glue->query_port(ctx, port, &port_attr);
 		if (err) {
@@ -945,23 +881,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = attr;
+		priv->device_attr = *attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, pci_dev->device.devargs);
+		err = mlx5_args(&config, dpdk_dev->devargs);
 		if (err) {
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
 				strerror(rte_errno));
 			goto port_error;
 		}
-		config.hw_csum = !!(attr.device_cap_flags_ex &
+		config.hw_csum = !!(attr->device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!attr.max_counter_sets;
+		config.flow_counter_en = !!attr->max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -969,7 +905,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			attr.rss_caps.max_rwq_indirection_table_size;
+			attr->rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -977,28 +913,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(attr.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr->raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(attr.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr->raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!attr.rx_pad_end_addr_align;
+		config.hw_padding = !!attr->rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = (attr.tso_caps.max_tso > 0 &&
-			      (attr.tso_caps.supported_qpts &
+		config.tso = (attr->tso_caps.max_tso > 0 &&
+			      (attr->tso_caps.supported_qpts &
 			      (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz = attr.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr->tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1039,8 +975,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
 		eth_dev->data->mac_addrs = priv->mac;
-		eth_dev->device = &pci_dev->device;
-		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		eth_dev->device = dpdk_dev;
 		eth_dev->device->driver = &mlx5_driver.driver;
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
@@ -1145,13 +1080,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
-		/*
-		 * Each eth_dev instance is assigned its own Verbs context,
-		 * since this one is consumed, let the next iteration open
-		 * another.
-		 */
-		ctx = NULL;
-		continue;
+		return eth_dev;
 port_error:
 		if (priv)
 			rte_free(priv);
@@ -1159,24 +1088,165 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			claim_zero(mlx5_glue->dealloc_pd(pd));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
-		break;
 	}
-	/*
-	 * XXX if something went wrong in the loop above, there is a resource
-	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
-	 * long as the dpdk does not provide a way to deallocate a ethdev and a
-	 * way to enumerate the registered ethdevs to free the previous ones.
-	 */
 error:
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
-	if (list)
-		mlx5_glue->free_device_list(list);
-	if (err) {
-		rte_errno = err;
+	assert(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * Spawn Ethernet devices from Verbs information, one per detected port.
+ *
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ *
+ * @return
+ *   A NULL-terminated list of Ethernet device objects on success, NULL
+ *   otherwise and rte_errno is set. Caller is expected to release list
+ *   memory through free().
+ */
+static struct rte_eth_dev **
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+	       struct ibv_device *ibv_dev,
+	       int vf)
+{
+	struct rte_eth_dev **eth_list = NULL;
+	struct ibv_context *ctx;
+	struct ibv_device_attr_ex attr;
+	unsigned int i;
+	int ret;
+
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		if (rte_errno == ENODEV)
+			DRV_LOG(ERR,
+				"cannot access device, is mlx5_ib loaded?");
+		else
+			DRV_LOG(ERR,
+				"cannot use device, are drivers up to date?");
+		return NULL;
+	}
+	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
+	mlx5_glue->close_device(ctx);
+	if (ret) {
+		rte_errno = ret;
+		DRV_LOG(ERR, "unable to query device information: %s",
+			strerror(rte_errno));
+		return NULL;
+	}
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	eth_list = malloc(sizeof(*eth_list) *
+			  (attr.orig_attr.phys_port_cnt + 1));
+	if (!eth_list) {
+		rte_errno = errno;
+		return NULL;
+	}
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
+		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
+						 &attr, i + 1);
+		if (eth_list[i])
+			continue;
+		/* Save rte_errno and roll back in case of failure. */
+		ret = rte_errno;
+		while (i--) {
+			mlx5_dev_close(eth_list[i]);
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				rte_free(eth_list[i]->data->dev_private);
+			claim_zero(rte_eth_dev_release_port(eth_list[i]));
+		}
+		free(eth_list);
+		rte_errno = ret;
+		return NULL;
+	}
+	eth_list[i] = NULL;
+	return eth_list;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function creates an Ethernet device for each port of a given
+ * PCI device.
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+	       struct rte_pci_device *pci_dev)
+{
+	struct ibv_device **ibv_list;
+	struct rte_eth_dev **eth_list = NULL;
+	int vf;
+	int ret;
+
+	assert(pci_drv == &mlx5_driver);
+	switch (pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+		vf = 1;
+		break;
+	default:
+		vf = 0;
+	}
+	errno = 0;
+	ibv_list = mlx5_glue->get_device_list(&ret);
+	if (!ibv_list) {
+		rte_errno = errno ? errno : ENOSYS;
+		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
-	return 0;
+	while (ret-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
+			continue;
+		if (pci_dev->addr.domain != pci_addr.domain ||
+		    pci_dev->addr.bus != pci_addr.bus ||
+		    pci_dev->addr.devid != pci_addr.devid ||
+		    pci_dev->addr.function != pci_addr.function)
+			continue;
+		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+			ibv_list[ret]->name);
+		break;
+	}
+	if (ret >= 0)
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	mlx5_glue->free_device_list(ibv_list);
+	if (!eth_list || !*eth_list) {
+		DRV_LOG(WARNING,
+			"no Verbs device matches PCI " PCI_PRI_FMT ","
+			" are kernel drivers loaded?",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+		rte_errno = ENOENT;
+		ret = -rte_errno;
+	} else {
+		for (ret = 0; eth_list[ret]; ++ret) {
+			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
+			rte_eth_dev_probing_finish(eth_list[ret]);
+		}
+		ret = 0;
+	}
+	free(eth_list);
+	return ret;
 }
 
 static const struct rte_pci_id mlx5_pci_id_map[] = {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH 4/7] net/mlx5: re-indent generic probing function
  2018-05-25 16:35 [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support Adrien Mazarguil
                   ` (2 preceding siblings ...)
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code Adrien Mazarguil
@ 2018-05-25 16:35 ` Adrien Mazarguil
  2018-06-11 11:42   ` Xueming(Steven) Li
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 5/7] net/mlx5: add port representor awareness Adrien Mazarguil
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-05-25 16:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Since commit "net/mlx5: split PCI from generic probing code" extracted the
inner loop to a separate function, mlx5_dev_spawn_one() is left with an
unnecessary indent level.

This patch eliminates a block, moves its local variables to function scope,
and re-indents its contents.

No functional impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.c | 589 +++++++++++++++++++++----------------------
 1 file changed, 286 insertions(+), 303 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7a812ef93..d57e8118c 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -658,8 +658,27 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		   unsigned int port)
 {
 	struct ibv_context *ctx;
+	struct ibv_port_attr port_attr;
+	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct mlx5_dev_config config = {
+		.vf = !!vf,
+		.tx_vec_en = 1,
+		.rx_vec_en = 1,
+		.mpw_hdr_dseg = 0,
+		.txq_inline = MLX5_ARG_UNSET,
+		.txqs_inline = MLX5_ARG_UNSET,
+		.inline_max_packet_sz = MLX5_ARG_UNSET,
+		.vf_nl_en = 1,
+		.mprq = {
+			.enabled = 0,
+			.stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
+			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
+			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
+		},
+	};
 	struct rte_eth_dev *eth_dev = NULL;
+	struct priv *priv = NULL;
 	int err = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
@@ -675,6 +694,8 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc;
 #endif
+	struct ether_addr mac;
+	char name[RTE_ETH_NAME_MAX_LEN];
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
@@ -710,11 +731,13 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		DRV_LOG(DEBUG, "MPW isn't supported");
 		mps = MLX5_MPW_DISABLED;
 	}
+	config.mps = mps;
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
 		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
+	config.swp = !!swp;
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
@@ -740,6 +763,8 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 			mprq_caps.min_single_wqe_log_num_of_strides;
 		mprq_max_stride_num_n =
 			mprq_caps.max_single_wqe_log_num_of_strides;
+		config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+						   mprq_min_stride_num_n);
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
@@ -747,6 +772,7 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
+	config.cqe_comp = cqe_comp;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
 		tunnel_en = ((dv_attr.tunnel_offloads_caps &
@@ -760,6 +786,7 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	DRV_LOG(WARNING,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
+	config.tunnel_en = tunnel_en;
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
 	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
@@ -771,325 +798,281 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	{
-		char name[RTE_ETH_NAME_MAX_LEN];
-		struct ibv_port_attr port_attr;
-		struct ibv_pd *pd = NULL;
-		struct priv *priv = NULL;
-		struct ether_addr mac;
-		struct mlx5_dev_config config = {
-			.cqe_comp = cqe_comp,
-			.mps = mps,
-			.tunnel_en = tunnel_en,
-			.mpls_en = mpls_en,
-			.tx_vec_en = 1,
-			.rx_vec_en = 1,
-			.mpw_hdr_dseg = 0,
-			.txq_inline = MLX5_ARG_UNSET,
-			.txqs_inline = MLX5_ARG_UNSET,
-			.inline_max_packet_sz = MLX5_ARG_UNSET,
-			.vf_nl_en = 1,
-			.swp = !!swp,
-			.mprq = {
-				.enabled = 0, /* Disabled by default. */
-				.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
-							mprq_min_stride_num_n),
-				.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
-				.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
-			},
-		};
-
-		if (attr->orig_attr.phys_port_cnt > 1)
-			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
-		else
-			snprintf(name, sizeof(name), "%s port %u",
-				 dpdk_dev->name, port);
-		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-			eth_dev = rte_eth_dev_attach_secondary(name);
-			if (eth_dev == NULL) {
-				DRV_LOG(ERR, "can not attach rte ethdev");
-				rte_errno = ENOMEM;
-				err = rte_errno;
-				goto error;
-			}
-			eth_dev->device = dpdk_dev;
-			eth_dev->dev_ops = &mlx5_dev_sec_ops;
-			err = mlx5_uar_init_secondary(eth_dev);
-			if (err) {
-				err = rte_errno;
-				goto error;
-			}
-			/* Receive command fd from primary process */
-			err = mlx5_socket_connect(eth_dev);
-			if (err < 0) {
-				err = rte_errno;
-				goto error;
-			}
-			/* Remap UAR for Tx queues. */
-			err = mlx5_tx_uar_remap(eth_dev, err);
-			if (err) {
-				err = rte_errno;
-				goto error;
-			}
-			/*
-			 * Ethdev pointer is still required as input since
-			 * the primary device is not accessible from the
-			 * secondary process.
-			 */
-			eth_dev->rx_pkt_burst =
-				mlx5_select_rx_function(eth_dev);
-			eth_dev->tx_pkt_burst =
-				mlx5_select_tx_function(eth_dev);
-			mlx5_glue->close_device(ctx);
-			return eth_dev;
+	config.mpls_en = mpls_en;
+	if (attr->orig_attr.phys_port_cnt > 1)
+		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
+	else
+		snprintf(name, sizeof(name), "%s port %u",
+			 dpdk_dev->name, port);
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		eth_dev = rte_eth_dev_attach_secondary(name);
+		if (eth_dev == NULL) {
+			DRV_LOG(ERR, "can not attach rte ethdev");
+			rte_errno = ENOMEM;
+			err = rte_errno;
+			goto error;
 		}
-		DRV_LOG(DEBUG, "using port %u", port);
-		/* Check port status. */
-		err = mlx5_glue->query_port(ctx, port, &port_attr);
+		eth_dev->device = dpdk_dev;
+		eth_dev->dev_ops = &mlx5_dev_sec_ops;
+		err = mlx5_uar_init_secondary(eth_dev);
 		if (err) {
-			DRV_LOG(ERR, "port query failed: %s", strerror(err));
-			goto port_error;
-		}
-		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-			DRV_LOG(ERR,
-				"port %d is not configured in Ethernet mode",
-				port);
-			err = EINVAL;
-			goto port_error;
-		}
-		if (port_attr.state != IBV_PORT_ACTIVE)
-			DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
-				port,
-				mlx5_glue->port_state_str(port_attr.state),
-				port_attr.state);
-		/* Allocate protection domain. */
-		pd = mlx5_glue->alloc_pd(ctx);
-		if (pd == NULL) {
-			DRV_LOG(ERR, "PD allocation failure");
-			err = ENOMEM;
-			goto port_error;
+			err = rte_errno;
+			goto error;
 		}
-		/* from rte_ethdev.c */
-		priv = rte_zmalloc("ethdev private structure",
-				   sizeof(*priv),
-				   RTE_CACHE_LINE_SIZE);
-		if (priv == NULL) {
-			DRV_LOG(ERR, "priv allocation failure");
-			err = ENOMEM;
-			goto port_error;
+		/* Receive command fd from primary process */
+		err = mlx5_socket_connect(eth_dev);
+		if (err < 0) {
+			err = rte_errno;
+			goto error;
 		}
-		priv->ctx = ctx;
-		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
-			sizeof(priv->ibdev_path));
-		priv->device_attr = *attr;
-		priv->port = port;
-		priv->pd = pd;
-		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, dpdk_dev->devargs);
+		/* Remap UAR for Tx queues. */
+		err = mlx5_tx_uar_remap(eth_dev, err);
 		if (err) {
 			err = rte_errno;
-			DRV_LOG(ERR, "failed to process device arguments: %s",
-				strerror(rte_errno));
-			goto port_error;
+			goto error;
 		}
-		config.hw_csum = !!(attr->device_cap_flags_ex &
-				    IBV_DEVICE_RAW_IP_CSUM);
-		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
-			(config.hw_csum ? "" : "not "));
+		/*
+		 * Ethdev pointer is still required as input since
+		 * the primary device is not accessible from the
+		 * secondary process.
+		 */
+		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
+		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
+		mlx5_glue->close_device(ctx);
+		return eth_dev;
+	}
+	DRV_LOG(DEBUG, "using port %u", port);
+	/* Check port status. */
+	err = mlx5_glue->query_port(ctx, port, &port_attr);
+	if (err) {
+		DRV_LOG(ERR, "port query failed: %s", strerror(err));
+		goto error;
+	}
+	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		DRV_LOG(ERR, "port %d is not configured in Ethernet mode",
+			port);
+		err = EINVAL;
+		goto error;
+	}
+	if (port_attr.state != IBV_PORT_ACTIVE)
+		DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
+			port, mlx5_glue->port_state_str(port_attr.state),
+			port_attr.state);
+	/* Allocate protection domain. */
+	pd = mlx5_glue->alloc_pd(ctx);
+	if (pd == NULL) {
+		DRV_LOG(ERR, "PD allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv = rte_zmalloc("ethdev private structure",
+			   sizeof(*priv),
+			   RTE_CACHE_LINE_SIZE);
+	if (priv == NULL) {
+		DRV_LOG(ERR, "priv allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv->ctx = ctx;
+	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
+		sizeof(priv->ibdev_path));
+	priv->device_attr = *attr;
+	priv->port = port;
+	priv->pd = pd;
+	priv->mtu = ETHER_MTU;
+	err = mlx5_args(&config, dpdk_dev->devargs);
+	if (err) {
+		err = rte_errno;
+		DRV_LOG(ERR, "failed to process device arguments: %s",
+			strerror(rte_errno));
+		goto error;
+	}
+	config.hw_csum = !!(attr->device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM);
+	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
+		(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!attr->max_counter_sets;
-		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
-		DRV_LOG(DEBUG,
-			"counter type = %d, num of cs = %ld, attributes = %d",
-			cs_desc.counter_type, cs_desc.num_of_cs,
-			cs_desc.attributes);
+	config.flow_counter_en = !!attr->max_counter_sets;
+	mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
+	DRV_LOG(DEBUG, "counter type = %d, num of cs = %ld, attributes = %d",
+		cs_desc.counter_type, cs_desc.num_of_cs,
+		cs_desc.attributes);
 #endif
-		config.ind_table_max_size =
-			attr->rss_caps.max_rwq_indirection_table_size;
-		/* Remove this check once DPDK supports larger/variable
-		 * indirection tables. */
-		if (config.ind_table_max_size >
-				(unsigned int)ETH_RSS_RETA_SIZE_512)
-			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
-		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
-			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(attr->raw_packet_caps &
-					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
-		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
-			(config.hw_vlan_strip ? "" : "not "));
-
-		config.hw_fcs_strip = !!(attr->raw_packet_caps &
-					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
-		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
-			(config.hw_fcs_strip ? "" : "not "));
-
+	config.ind_table_max_size =
+		attr->rss_caps.max_rwq_indirection_table_size;
+	/*
+	 * Remove this check once DPDK supports larger/variable
+	 * indirection tables.
+	 */
+	if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
+		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
+	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
+		config.ind_table_max_size);
+	config.hw_vlan_strip = !!(attr->raw_packet_caps &
+				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
+	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
+		(config.hw_vlan_strip ? "" : "not "));
+	config.hw_fcs_strip = !!(attr->raw_packet_caps &
+				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
+	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
+		(config.hw_fcs_strip ? "" : "not "));
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!attr->rx_pad_end_addr_align;
+	config.hw_padding = !!attr->rx_pad_end_addr_align;
 #endif
-		DRV_LOG(DEBUG,
-			"hardware Rx end alignment padding is %ssupported",
-			(config.hw_padding ? "" : "not "));
-		config.vf = vf;
-		config.tso = (attr->tso_caps.max_tso > 0 &&
-			      (attr->tso_caps.supported_qpts &
-			      (1 << IBV_QPT_RAW_PACKET)));
-		if (config.tso)
-			config.tso_max_payload_sz = attr->tso_caps.max_tso;
-		if (config.mps && !mps) {
-			DRV_LOG(ERR,
-				"multi-packet send not supported on this device"
-				" (" MLX5_TXQ_MPW_EN ")");
-			err = ENOTSUP;
-			goto port_error;
-		}
-		DRV_LOG(INFO, "%s MPS is %s",
-			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
-			config.mps != MLX5_MPW_DISABLED ? "enabled" :
-			"disabled");
-		if (config.cqe_comp && !cqe_comp) {
-			DRV_LOG(WARNING, "Rx CQE compression isn't supported");
-			config.cqe_comp = 0;
-		}
-		config.mprq.enabled = config.mprq.enabled && mprq;
-		if (config.mprq.enabled) {
-			if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
-			    config.mprq.stride_num_n < mprq_min_stride_num_n) {
-				config.mprq.stride_num_n =
-					RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
-						mprq_min_stride_num_n);
-				DRV_LOG(WARNING,
-					"the number of strides"
-					" for Multi-Packet RQ is out of range,"
-					" setting default value (%u)",
-					1 << config.mprq.stride_num_n);
-			}
-			config.mprq.min_stride_size_n = mprq_min_stride_size_n;
-			config.mprq.max_stride_size_n = mprq_max_stride_size_n;
-		}
-		eth_dev = rte_eth_dev_allocate(name);
-		if (eth_dev == NULL) {
-			DRV_LOG(ERR, "can not allocate rte ethdev");
-			err = ENOMEM;
-			goto port_error;
-		}
-		eth_dev->data->dev_private = priv;
-		priv->dev_data = eth_dev->data;
-		eth_dev->data->mac_addrs = priv->mac;
-		eth_dev->device = dpdk_dev;
-		eth_dev->device->driver = &mlx5_driver.driver;
-		err = mlx5_uar_init_primary(eth_dev);
-		if (err) {
-			err = rte_errno;
-			goto port_error;
-		}
-		/* Configure the first MAC address by default. */
-		if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
-			DRV_LOG(ERR,
-				"port %u cannot get MAC address, is mlx5_en"
-				" loaded? (errno: %s)",
-				eth_dev->data->port_id, strerror(rte_errno));
-			err = ENODEV;
-			goto port_error;
+	DRV_LOG(DEBUG, "hardware Rx end alignment padding is %ssupported",
+		(config.hw_padding ? "" : "not "));
+	config.tso = (attr->tso_caps.max_tso > 0 &&
+		      (attr->tso_caps.supported_qpts &
+		       (1 << IBV_QPT_RAW_PACKET)));
+	if (config.tso)
+		config.tso_max_payload_sz = attr->tso_caps.max_tso;
+	if (config.mps && !mps) {
+		DRV_LOG(ERR,
+			"multi-packet send not supported on this device"
+			" (" MLX5_TXQ_MPW_EN ")");
+		err = ENOTSUP;
+		goto error;
+	}
+	DRV_LOG(INFO, "%sMPS is %s",
+		config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
+		config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
+	if (config.cqe_comp && !cqe_comp) {
+		DRV_LOG(WARNING, "Rx CQE compression isn't supported");
+		config.cqe_comp = 0;
+	}
+	config.mprq.enabled = config.mprq.enabled && mprq;
+	if (config.mprq.enabled) {
+		if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
+		    config.mprq.stride_num_n < mprq_min_stride_num_n) {
+			config.mprq.stride_num_n =
+				RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+					mprq_min_stride_num_n);
+			DRV_LOG(WARNING,
+				"the number of strides"
+				" for Multi-Packet RQ is out of range,"
+				" setting default value (%u)",
+				1 << config.mprq.stride_num_n);
 		}
-		DRV_LOG(INFO,
-			"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
-			eth_dev->data->port_id,
-			mac.addr_bytes[0], mac.addr_bytes[1],
-			mac.addr_bytes[2], mac.addr_bytes[3],
-			mac.addr_bytes[4], mac.addr_bytes[5]);
+		config.mprq.min_stride_size_n = mprq_min_stride_size_n;
+		config.mprq.max_stride_size_n = mprq_max_stride_size_n;
+	}
+	eth_dev = rte_eth_dev_allocate(name);
+	if (eth_dev == NULL) {
+		DRV_LOG(ERR, "can not allocate rte ethdev");
+		err = ENOMEM;
+		goto error;
+	}
+	eth_dev->data->dev_private = priv;
+	priv->dev_data = eth_dev->data;
+	eth_dev->data->mac_addrs = priv->mac;
+	eth_dev->device = dpdk_dev;
+	eth_dev->device->driver = &mlx5_driver.driver;
+	err = mlx5_uar_init_primary(eth_dev);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	/* Configure the first MAC address by default. */
+	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
+		DRV_LOG(ERR,
+			"port %u cannot get MAC address, is mlx5_en"
+			" loaded? (errno: %s)",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = ENODEV;
+		goto error;
+	}
+	DRV_LOG(INFO,
+		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
+		eth_dev->data->port_id,
+		mac.addr_bytes[0], mac.addr_bytes[1],
+		mac.addr_bytes[2], mac.addr_bytes[3],
+		mac.addr_bytes[4], mac.addr_bytes[5]);
 #ifndef NDEBUG
-		{
-			char ifname[IF_NAMESIZE];
-
-			if (mlx5_get_ifname(eth_dev, &ifname) == 0)
-				DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
-					eth_dev->data->port_id, ifname);
-			else
-				DRV_LOG(DEBUG, "port %u ifname is unknown",
-					eth_dev->data->port_id);
-		}
+	{
+		char ifname[IF_NAMESIZE];
+
+		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
+			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
+				eth_dev->data->port_id, ifname);
+		else
+			DRV_LOG(DEBUG, "port %u ifname is unknown",
+				eth_dev->data->port_id);
+	}
 #endif
-		/* Get actual MTU if possible. */
-		err = mlx5_get_mtu(eth_dev, &priv->mtu);
-		if (err) {
-			err = rte_errno;
-			goto port_error;
-		}
-		DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
-			priv->mtu);
-		/*
-		 * Initialize burst functions to prevent crashes before link-up.
-		 */
-		eth_dev->rx_pkt_burst = removed_rx_burst;
-		eth_dev->tx_pkt_burst = removed_tx_burst;
-		eth_dev->dev_ops = &mlx5_dev_ops;
-		/* Register MAC address. */
-		claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
-		priv->nl_socket = -1;
-		priv->nl_sn = 0;
-		if (vf && config.vf_nl_en) {
-			priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
-			if (priv->nl_socket < 0)
-				priv->nl_socket = -1;
-			mlx5_nl_mac_addr_sync(eth_dev);
-		}
-		TAILQ_INIT(&priv->flows);
-		TAILQ_INIT(&priv->ctrl_flows);
-		/* Hint libmlx5 to use PMD allocator for data plane resources */
-		struct mlx5dv_ctx_allocators alctr = {
-			.alloc = &mlx5_alloc_verbs_buf,
-			.free = &mlx5_free_verbs_buf,
-			.data = priv,
-		};
-		mlx5_glue->dv_set_context_attr(ctx,
-					       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
-					       (void *)((uintptr_t)&alctr));
-		/* Bring Ethernet device up. */
-		DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
-			eth_dev->data->port_id);
-		mlx5_set_link_up(eth_dev);
-		/*
-		 * Even though the interrupt handler is not installed yet,
-		 * interrupts will still trigger on the asyn_fd from
-		 * Verbs context returned by ibv_open_device().
-		 */
-		mlx5_link_update(eth_dev, 0);
-		/* Store device configuration on private structure. */
-		priv->config = config;
-		/* Create drop queue. */
-		err = mlx5_flow_create_drop_queue(eth_dev);
-		if (err) {
-			DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
-				eth_dev->data->port_id, strerror(rte_errno));
-			err = rte_errno;
-			goto port_error;
-		}
-		/* Supported Verbs flow priority number detection. */
-		if (verb_priorities == 0)
-			verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
-		if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
-			DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
-				eth_dev->data->port_id, verb_priorities);
-			err = ENOTSUP;
-			goto port_error;
-		}
-		priv->config.max_verbs_prio = verb_priorities;
-		/* Add device to memory callback list. */
-		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-		LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
-				 priv, mem_event_cb);
-		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
-		rte_eth_dev_probing_finish(eth_dev);
-		return eth_dev;
-port_error:
-		if (priv)
-			rte_free(priv);
-		if (pd)
-			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
-			rte_eth_dev_release_port(eth_dev);
+	/* Get actual MTU if possible. */
+	err = mlx5_get_mtu(eth_dev, &priv->mtu);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
+		priv->mtu);
+	/* Initialize burst functions to prevent crashes before link-up. */
+	eth_dev->rx_pkt_burst = removed_rx_burst;
+	eth_dev->tx_pkt_burst = removed_tx_burst;
+	eth_dev->dev_ops = &mlx5_dev_ops;
+	/* Register MAC address. */
+	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
+	priv->nl_socket = -1;
+	priv->nl_sn = 0;
+	if (vf && config.vf_nl_en) {
+		priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
+		if (priv->nl_socket < 0)
+			priv->nl_socket = -1;
+		mlx5_nl_mac_addr_sync(eth_dev);
+	}
+	TAILQ_INIT(&priv->flows);
+	TAILQ_INIT(&priv->ctrl_flows);
+	/* Hint libmlx5 to use PMD allocator for data plane resources */
+	struct mlx5dv_ctx_allocators alctr = {
+		.alloc = &mlx5_alloc_verbs_buf,
+		.free = &mlx5_free_verbs_buf,
+		.data = priv,
+	};
+	mlx5_glue->dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+				       (void *)((uintptr_t)&alctr));
+	/* Bring Ethernet device up. */
+	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
+		eth_dev->data->port_id);
+	mlx5_set_link_up(eth_dev);
+	/*
+	 * Even though the interrupt handler is not installed yet,
+	 * interrupts will still trigger on the asyn_fd from
+	 * Verbs context returned by ibv_open_device().
+	 */
+	mlx5_link_update(eth_dev, 0);
+	/* Store device configuration on private structure. */
+	priv->config = config;
+	/* Create drop queue. */
+	err = mlx5_flow_create_drop_queue(eth_dev);
+	if (err) {
+		DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = rte_errno;
+		goto error;
+	}
+	/* Supported Verbs flow priority number detection. */
+	if (verb_priorities == 0)
+		verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
+	if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
+		DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
+			eth_dev->data->port_id, verb_priorities);
+		err = ENOTSUP;
+		goto error;
 	}
+	priv->config.max_verbs_prio = verb_priorities;
+	/* Add device to memory callback list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
+			 priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+	return eth_dev;
 error:
+	if (priv)
+		rte_free(priv);
+	if (pd)
+		claim_zero(mlx5_glue->dealloc_pd(pd));
+	if (eth_dev)
+		rte_eth_dev_release_port(eth_dev);
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
 	assert(err > 0);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH 5/7] net/mlx5: add port representor awareness
  2018-05-25 16:35 [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support Adrien Mazarguil
                   ` (3 preceding siblings ...)
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 4/7] net/mlx5: re-indent generic probing function Adrien Mazarguil
@ 2018-05-25 16:35 ` Adrien Mazarguil
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 6/7] net/mlx5: probe all port representors Adrien Mazarguil
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-05-25 16:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

The current PCI probing method is not aware of Verbs port representors,
which appear as standard Verbs devices bound to the same PCI address and
cannot be distinguished.

Problem is that more often than not, the wrong Verbs device is used,
resulting in unexpected traffic.

This patch adds necessary heuristics to bind affected driver instances to
the intended (i.e. non-representor) device.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.c | 61 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 4 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d57e8118c..d3a298332 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1155,6 +1155,32 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 }
 
 /**
+ * Comparison callback to sort Verbs device names.
+ *
+ * This is meant to be used with qsort().
+ *
+ * @param a[in]
+ *   Pointer to pointer to first Verbs device.
+ * @param b[in]
+ *   Pointer to pointer to second Verbs device.
+ *
+ * @return
+ *   0 if both names are equal, less than 0 if the first argument is less
+ *   than the second, greater than 0 otherwise.
+ */
+static int
+mlx5_cmp_ibv_name(const void *a, const void *b)
+{
+	const char *name_a = (*(const struct ibv_device *const *)a)->name;
+	const char *name_b = (*(const struct ibv_device *const *)b)->name;
+	size_t i = 0;
+
+	while (name_a[i] && name_a[i] == name_b[i])
+		++i;
+	return atoi(name_a + i) - atoi(name_b + i);
+}
+
+/**
  * DPDK callback to register a PCI device.
  *
  * This function creates an Ethernet device for each port of a given
@@ -1174,6 +1200,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **ibv_list;
 	struct rte_eth_dev **eth_list = NULL;
+	int n = 0;
 	int vf;
 	int ret;
 
@@ -1195,6 +1222,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
+
+	struct ibv_device *ibv_match[ret + 1];
+
 	while (ret-- > 0) {
 		struct rte_pci_addr pci_addr;
 
@@ -1206,12 +1236,35 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		    pci_dev->addr.devid != pci_addr.devid ||
 		    pci_dev->addr.function != pci_addr.function)
 			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
 			ibv_list[ret]->name);
-		break;
+		ibv_match[n++] = ibv_list[ret];
+	}
+	ibv_match[n] = NULL;
+	if (n > 1) {
+		/*
+		 * The existence of several matching entries means port
+		 * representors have been instantiated. No existing Verbs
+		 * call nor /sys entries can tell them apart at this point.
+		 *
+		 * While definitely hackish, assume their names are numbered
+		 * based on order of creation with master device first,
+		 * followed by first port representor, followed by the
+		 * second one and so on.
+		 */
+		DRV_LOG(WARNING,
+			"probing device with port representors involves"
+			" heuristics with uncertain outcome");
+		qsort(ibv_match, n, sizeof(*ibv_match), mlx5_cmp_ibv_name);
+		DRV_LOG(WARNING, "assuming \"%s\" is the master device",
+			ibv_match[0]->name);
+		for (ret = 1; ret < n; ++ret)
+			DRV_LOG(WARNING,
+				"assuming \"%s\" is port representor #%d",
+				ibv_match[ret]->name, ret - 1);
 	}
-	if (ret >= 0)
-		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	if (n)
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
 	mlx5_glue->free_device_list(ibv_list);
 	if (!eth_list || !*eth_list) {
 		DRV_LOG(WARNING,
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH 6/7] net/mlx5: probe all port representors
  2018-05-25 16:35 [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support Adrien Mazarguil
                   ` (4 preceding siblings ...)
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 5/7] net/mlx5: add port representor awareness Adrien Mazarguil
@ 2018-05-25 16:35 ` Adrien Mazarguil
  2018-06-12  6:42   ` Xueming(Steven) Li
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for " Adrien Mazarguil
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-05-25 16:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Probe existing port representors in addition to their master device and
associate them automatically.

To avoid name collision between Ethernet devices, their names use the same
convention as ixgbe and i40e PMDs, that is, instead of only a PCI address
in DBDF notation:

- "net_{DBDF}_0" for master/switch devices.
- "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
  representors.

Both optionally suffixed with "_port_{num}" instead of " port {num}" for
devices that expose several Verbs ports (note this is never the case on
mlx5, but kept for historical reasons for the time being).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.c        | 119 ++++++++++++++++++++---------
 drivers/net/mlx5/mlx5.h        |   8 +-
 drivers/net/mlx5/mlx5_ethdev.c | 145 ++++++++++++++++++++++++++++++++----
 drivers/net/mlx5/mlx5_mac.c    |   2 +-
 drivers/net/mlx5/mlx5_stats.c  |   6 +-
 5 files changed, 226 insertions(+), 54 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d3a298332..09afca63c 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -301,6 +301,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	if (ret)
 		DRV_LOG(WARNING, "port %u some flows still remain",
 			dev->data->port_id);
+	if (!priv->representor &&
+	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 	memset(priv, 0, sizeof(*priv));
 }
 
@@ -645,6 +648,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
  *   Verbs device attributes.
  * @param port
  *   Verbs port to use (indexed from 1).
+ * @param master
+ *   Master device in case @p ibv_dev is a port representor.
+ * @param rep_id
+ *   Representor identifier when @p master is non-NULL.
  *
  * @return
  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
@@ -655,7 +662,9 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		   struct ibv_device *ibv_dev,
 		   int vf,
 		   const struct ibv_device_attr_ex *attr,
-		   unsigned int port)
+		   unsigned int port,
+		   struct rte_eth_dev *master,
+		   unsigned int rep_id)
 {
 	struct ibv_context *ctx;
 	struct ibv_port_attr port_attr;
@@ -799,11 +808,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		" old OFED/rdma-core version or firmware configuration");
 #endif
 	config.mpls_en = mpls_en;
-	if (attr->orig_attr.phys_port_cnt > 1)
-		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
+	if (!master)
+		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name);
 	else
-		snprintf(name, sizeof(name), "%s port %u",
-			 dpdk_dev->name, port);
+		snprintf(name, sizeof(name), "net_%s_representor_%u",
+			 dpdk_dev->name, rep_id);
+	if (attr->orig_attr.phys_port_cnt > 1)
+		snprintf(name, sizeof(name), "%s_port_%u", name, port);
+	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
@@ -880,6 +892,27 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	priv->port = port;
 	priv->pd = pd;
 	priv->mtu = ETHER_MTU;
+	/*
+	 * Allocate a switch domain for master devices and share it with
+	 * port representors.
+	 */
+	if (!master) {
+		priv->representor = 0;
+		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+		priv->rep_id = 0;
+		err = rte_eth_switch_domain_alloc(&priv->domain_id);
+		if (err) {
+			err = rte_errno;
+			DRV_LOG(ERR, "unable to allocate switch domain: %s",
+				strerror(rte_errno));
+			goto error;
+		}
+	} else {
+		priv->representor = 1;
+		priv->domain_id =
+			((struct priv *)master->data->dev_private)->domain_id;
+		priv->rep_id = rep_id;
+	}
 	err = mlx5_args(&config, dpdk_dev->devargs);
 	if (err) {
 		err = rte_errno;
@@ -1067,8 +1100,12 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	return eth_dev;
 error:
-	if (priv)
+	if (priv) {
+		if (!priv->representor &&
+		    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
+	}
 	if (pd)
 		claim_zero(mlx5_glue->dealloc_pd(pd));
 	if (eth_dev)
@@ -1081,12 +1118,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 }
 
 /**
- * Spawn Ethernet devices from Verbs information, one per detected port.
+ * Spawn Ethernet devices from Verbs information, one per detected port and
+ * port representor.
  *
  * @param dpdk_dev
  *   Backing DPDK device.
  * @param ibv_dev
- *   Verbs device.
+ *   NULL-terminated list of Verbs devices. First entry is the master device
+ *   (mandatory), followed by optional representors.
  * @param vf
  *   If nonzero, enable VF-specific features.
  *
@@ -1097,17 +1136,21 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
  */
 static struct rte_eth_dev **
 mlx5_dev_spawn(struct rte_device *dpdk_dev,
-	       struct ibv_device *ibv_dev,
+	       struct ibv_device **ibv_dev,
 	       int vf)
 {
 	struct rte_eth_dev **eth_list = NULL;
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
+	void *tmp;
 	unsigned int i;
+	unsigned int j = 0;
+	unsigned int n = 0;
 	int ret;
 
+next:
 	errno = 0;
-	ctx = mlx5_glue->open_device(ibv_dev);
+	ctx = mlx5_glue->open_device(ibv_dev[j]);
 	if (!ctx) {
 		rte_errno = errno ? errno : ENODEV;
 		if (rte_errno == ENODEV)
@@ -1116,7 +1159,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		else
 			DRV_LOG(ERR,
 				"cannot use device, are drivers up to date?");
-		return NULL;
+		goto error;
 	}
 	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	mlx5_glue->close_device(ctx);
@@ -1124,34 +1167,42 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		rte_errno = ret;
 		DRV_LOG(ERR, "unable to query device information: %s",
 			strerror(rte_errno));
-		return NULL;
+		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	eth_list = malloc(sizeof(*eth_list) *
-			  (attr.orig_attr.phys_port_cnt + 1));
-	if (!eth_list) {
+	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
+		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
+	tmp = realloc(eth_list, sizeof(*eth_list) *
+		      (n + attr.orig_attr.phys_port_cnt + 1));
+	if (!tmp) {
 		rte_errno = errno;
-		return NULL;
+		goto error;
 	}
+	eth_list = tmp;
 	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
-		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
-						 &attr, i + 1);
-		if (eth_list[i])
-			continue;
-		/* Save rte_errno and roll back in case of failure. */
-		ret = rte_errno;
-		while (i--) {
-			mlx5_dev_close(eth_list[i]);
-			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
-				rte_free(eth_list[i]->data->dev_private);
-			claim_zero(rte_eth_dev_release_port(eth_list[i]));
-		}
-		free(eth_list);
-		rte_errno = ret;
-		return NULL;
+		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j], vf,
+						 &attr, i + 1,
+						 j ? eth_list[0] : NULL,
+						 j - 1);
+		if (!eth_list[n])
+			goto error;
+		++n;
 	}
-	eth_list[i] = NULL;
+	if (ibv_dev[++j])
+		goto next;
+	eth_list[n] = NULL;
 	return eth_list;
+error:
+	/* Save rte_errno and roll back in case of failure. */
+	ret = rte_errno;
+	while (n--) {
+		mlx5_dev_close(eth_list[n]);
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+			rte_free(eth_list[n]->data->dev_private);
+		claim_zero(rte_eth_dev_release_port(eth_list[n]));
+	}
+	free(eth_list);
+	rte_errno = ret;
+	return NULL;
 }
 
 /**
@@ -1264,7 +1315,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				ibv_match[ret]->name, ret - 1);
 	}
 	if (n)
-		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match, vf);
 	mlx5_glue->free_device_list(ibv_list);
 	if (!eth_list || !*eth_list) {
 		DRV_LOG(WARNING,
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 997b04a33..b38cb37a9 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -161,6 +161,9 @@ struct priv {
 	uint16_t mtu; /* Configured MTU. */
 	uint8_t port; /* Physical port number. */
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
+	unsigned int representor:1; /* Device is a port representor. */
+	uint16_t domain_id; /* Switch domain identifier. */
+	unsigned int rep_id; /* Port representor identifier. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
@@ -209,9 +212,12 @@ int mlx5_getenv_int(const char *);
 
 /* mlx5_ethdev.c */
 
+int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+			   char (*ifname)[IF_NAMESIZE]);
 int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);
 int mlx5_ifindex(const struct rte_eth_dev *dev);
-int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
+int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	       int master);
 int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);
 int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
 		   unsigned int flags);
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index f6cebae41..361b7ee4c 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -93,7 +93,7 @@ struct ethtool_link_settings {
 #endif
 
 /**
- * Get interface name from private structure.
+ * Get master interface name from private structure.
  *
  * @param[in] dev
  *   Pointer to Ethernet device.
@@ -104,7 +104,8 @@ struct ethtool_link_settings {
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+		       char (*ifname)[IF_NAMESIZE])
 {
 	struct priv *priv = dev->data->dev_private;
 	DIR *dir;
@@ -179,6 +180,113 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 }
 
 /**
+ * Get interface name from private structure.
+ *
+ * This is a port representor-aware version of mlx5_get_master_ifname().
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] ifname
+ *   Interface name output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+{
+	struct priv *priv = dev->data->dev_private;
+	int ret;
+	char master[IF_NAMESIZE];
+	FILE *file;
+	DIR *dir;
+	uint64_t phys_switch_id;
+
+	if (!priv->representor)
+		return mlx5_get_master_ifname(dev, ifname);
+	ret = mlx5_get_master_ifname(dev, &master);
+	if (ret)
+		return ret;
+	{
+		MKSTR(path, "%s/device/net/%s/phys_switch_id",
+		      priv->ibdev_path, master);
+
+		file = fopen(path, "rb");
+	}
+	if (!file) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	ret = fscanf(file, "%" SCNx64, &phys_switch_id);
+	fclose(file);
+	if (ret != 1) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	{
+		MKSTR(path, "%s/device/net/%s/subsystem",
+		      priv->ibdev_path, master);
+
+		dir = opendir(path);
+	}
+	if (!dir) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	/*
+	 * Scan network interfaces to find one with matching phys_switch_id
+	 * and phys_switch_name.
+	 */
+	do {
+		struct dirent *dent;
+		uint64_t phys_switch_id_rep;
+		int rep_id;
+
+		ret = -ENOENT;
+		dent = readdir(dir);
+		if (!dent)
+			break;
+		{
+			MKSTR(path,
+			      "%s/device/net/%s/subsystem/%s/phys_switch_id",
+			      priv->ibdev_path, master, dent->d_name);
+
+			file = fopen(path, "rb");
+		}
+		if (!file)
+			continue;
+		ret = fscanf(file, "%" SCNx64, &phys_switch_id_rep);
+		fclose(file);
+		if (ret != 1)
+			continue;
+		if (phys_switch_id_rep != phys_switch_id)
+			continue;
+		{
+			MKSTR(path,
+			      "%s/device/net/%s/subsystem/%s/phys_port_name",
+			      priv->ibdev_path, master, dent->d_name);
+
+			file = fopen(path, "rb");
+		}
+		if (!file)
+			continue;
+		ret = fscanf(file, "%d", &rep_id);
+		fclose(file);
+		if (ret != 1)
+			continue;
+		if (rep_id < 0 || (unsigned int)rep_id != priv->rep_id)
+			continue;
+		strlcpy(*ifname, dent->d_name, sizeof(*ifname));
+		ret = 0;
+		break;
+	} while (1);
+	closedir(dir);
+	if (ret)
+		rte_errno = -ret;
+	return ret;
+}
+
+/**
  * Get the interface index from device name.
  *
  * @param[in] dev
@@ -214,12 +322,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
  *   Request number to pass to ioctl().
  * @param[out] ifr
  *   Interface request structure output buffer.
+ * @param master
+ *   When device is a port representor, perform request on master device
+ *   instead.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	   int master)
 {
 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
 	int ret = 0;
@@ -228,7 +340,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
 		rte_errno = errno;
 		return -rte_errno;
 	}
-	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
+	if (master)
+		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
+	else
+		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
 	if (ret)
 		goto error;
 	ret = ioctl(sock, req, ifr);
@@ -258,7 +373,7 @@ int
 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
 
 	if (ret)
 		return ret;
@@ -282,7 +397,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 {
 	struct ifreq request = { .ifr_mtu = mtu, };
 
-	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
+	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
 }
 
 /**
@@ -302,13 +417,13 @@ int
 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
 
 	if (ret)
 		return ret;
 	request.ifr_flags &= keep;
 	request.ifr_flags |= flags & ~keep;
-	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
+	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
 }
 
 /**
@@ -551,7 +666,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	int link_speed = 0;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -561,7 +676,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&edata;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
@@ -622,7 +737,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	uint64_t sc;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -632,7 +747,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&gcmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -649,7 +764,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 
 	*ecmd = gcmd;
 	ifr.ifr_data = (void *)ecmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -812,7 +927,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	int ret;
 
 	ifr.ifr_data = (void *)&ethpause;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
@@ -865,7 +980,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 		ethpause.tx_pause = 1;
 	else
 		ethpause.tx_pause = 0;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
index 672a47619..12ee37f55 100644
--- a/drivers/net/mlx5/mlx5_mac.c
+++ b/drivers/net/mlx5/mlx5_mac.c
@@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
 	struct ifreq request;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
+	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
 	if (ret)
 		return ret;
 	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c
index 875dd1027..91f3d474a 100644
--- a/drivers/net/mlx5/mlx5_stats.c
+++ b/drivers/net/mlx5/mlx5_stats.c
@@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
 	et_stats->cmd = ETHTOOL_GSTATS;
 	et_stats->n_stats = xstats_ctrl->stats_n;
 	ifr.ifr_data = (caddr_t)et_stats;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u unable to read statistic values from device",
@@ -194,7 +194,7 @@ mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
 
 	drvinfo.cmd = ETHTOOL_GDRVINFO;
 	ifr.ifr_data = (caddr_t)&drvinfo;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to query number of statistics",
 			dev->data->port_id);
@@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
 	strings->string_set = ETH_SS_STATS;
 	strings->len = dev_stats_n;
 	ifr.ifr_data = (caddr_t)strings;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to get statistic names",
 			dev->data->port_id);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
  2018-05-25 16:35 [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support Adrien Mazarguil
                   ` (5 preceding siblings ...)
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 6/7] net/mlx5: probe all port representors Adrien Mazarguil
@ 2018-05-25 16:35 ` Adrien Mazarguil
  2018-06-12  8:02   ` Xueming(Steven) Li
  2018-06-12 14:44   ` Xueming(Steven) Li
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
  7 siblings, 2 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-05-25 16:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Prior to this patch, all port representors detected on a given device were
probed and Ethernet devices instantiated for each of them.

This patch adds support for the standard "representor" parameter, which
implies that port representors are not probed by default anymore, except
for the list provided through device arguments.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 doc/guides/nics/mlx5.rst                | 12 ++++++++++++
 doc/guides/prog_guide/poll_mode_drv.rst |  2 ++
 drivers/net/mlx5/mlx5.c                 | 25 +++++++++++++++++++++++++
 3 files changed, 39 insertions(+)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 79c982e29..5229e546c 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -388,6 +388,18 @@ Run-time configuration
 
   Disabled by default.
 
+- ``representor`` parameter [list]
+
+  This parameter can be used to instantiate DPDK Ethernet devices from
+  existing port (or VF) representors configured on the device.
+
+  It is a standard parameter whose format is described in
+  :ref:`ethernet_device_standard_device_arguments`.
+
+  For instance, to probe port representors 0 through 2::
+
+    representor=[0-2]
+
 Firmware configuration
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/prog_guide/poll_mode_drv.rst b/doc/guides/prog_guide/poll_mode_drv.rst
index af82352a0..58d49ba0f 100644
--- a/doc/guides/prog_guide/poll_mode_drv.rst
+++ b/doc/guides/prog_guide/poll_mode_drv.rst
@@ -365,6 +365,8 @@ Ethernet Device API
 
 The Ethernet device API exported by the Ethernet PMDs is described in the *DPDK API Reference*.
 
+.. _ethernet_device_standard_device_arguments:
+
 Ethernet Device Standard Device Arguments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 09afca63c..216753ba6 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -90,6 +90,9 @@
 /* Activate Netlink support in VF mode. */
 #define MLX5_VF_NL_EN "vf_nl_en"
 
+/* Select port representors to instantiate. */
+#define MLX5_REPRESENTOR "representor"
+
 #ifndef HAVE_IBV_MLX5_MOD_MPW
 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
@@ -420,6 +423,9 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	struct mlx5_dev_config *config = opaque;
 	unsigned long tmp;
 
+	/* No-op, port representors are processed in mlx5_dev_spawn(). */
+	if (!strcmp(MLX5_REPRESENTOR, key))
+		return 0;
 	errno = 0;
 	tmp = strtoul(val, NULL, 0);
 	if (errno) {
@@ -492,6 +498,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
 		MLX5_VF_NL_EN,
+		MLX5_REPRESENTOR,
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
@@ -1142,13 +1149,30 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	struct rte_eth_dev **eth_list = NULL;
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
+	struct rte_eth_devargs eth_da;
 	void *tmp;
 	unsigned int i;
 	unsigned int j = 0;
 	unsigned int n = 0;
 	int ret;
 
+	if (dpdk_dev->devargs) {
+		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
+		if (ret)
+			goto error;
+	} else {
+		memset(&eth_da, 0, sizeof(eth_da));
+	}
 next:
+	if (j) {
+		unsigned int k;
+
+		for (k = 0; k < eth_da.nb_representor_ports; ++k)
+			if (eth_da.representor_ports[k] == j - 1)
+				break;
+		if (k == eth_da.nb_representor_ports)
+			goto skip;
+	}
 	errno = 0;
 	ctx = mlx5_glue->open_device(ibv_dev[j]);
 	if (!ctx) {
@@ -1187,6 +1211,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			goto error;
 		++n;
 	}
+skip:
 	if (ibv_dev[++j])
 		goto next;
 	eth_list[n] = NULL;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 1/7] net/mlx5: rename confusing object in probe code
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 1/7] net/mlx5: rename confusing object in probe code Adrien Mazarguil
@ 2018-06-10 11:00   ` Xueming(Steven) Li
  2018-06-12 13:19     ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-10 11:00 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Ack except one minor question below.

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 1/7] net/mlx5: rename confusing object in probe code
> 
> There are several attribute objects in this function:
> 
> - IB device attributes (struct ibv_device_attr_ex device_attr).
> - Direct Verbs attributes (struct mlx5dv_context attrs_out).
> - Port attributes (struct ibv_port_attr).
> - IB device attributes again (struct ibv_device_attr_ex device_attr_ex).
> 
> "attrs_out" is both odd and initialized using a nonstandard syntax. Rename it "dv_attr" for
> consistency.
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  drivers/net/mlx5/mlx5.c | 34 +++++++++++++++++-----------------
>  1 file changed, 17 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 9b78f9879..602f952ca 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -652,6 +652,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,  {
>  	struct ibv_device **list = NULL;
>  	struct ibv_device *ibv_dev;
> +	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
>  	int err = 0;
>  	struct ibv_context *attr_ctx = NULL;
>  	struct ibv_device_attr_ex device_attr; @@ -668,7 +669,6 @@ mlx5_pci_probe(struct rte_pci_driver
> *pci_drv __rte_unused,
>  	unsigned int mprq_min_stride_num_n = 0;
>  	unsigned int mprq_max_stride_num_n = 0;
>  	int i;
> -	struct mlx5dv_context attrs_out = {0};
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
>  	struct ibv_counter_set_description cs_desc;  #endif @@ -734,21 +734,21 @@ mlx5_pci_probe(struct
> rte_pci_driver *pci_drv __rte_unused,
>  	ibv_dev = list[i];
>  	DRV_LOG(DEBUG, "device opened");
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
> -	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
> +	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
>  #endif
>  	/*
>  	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
>  	 * as all ConnectX-5 devices.
>  	 */
>  #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
> -	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
> +	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
>  #endif
>  #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
> -	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
> +	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
>  #endif
> -	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
> -	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
> -		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
> +	mlx5_glue->dv_query_device(ctx, &dv_attr);

Should ctx be attr_ctx?.

> +	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
> +		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
>  			DRV_LOG(DEBUG, "enhanced MPW is supported");
>  			mps = MLX5_MPW_ENHANCED;
>  		} else {
> @@ -760,14 +760,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		mps = MLX5_MPW_DISABLED;
>  	}
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
> -	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
> -		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
> +	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
> +		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
>  	DRV_LOG(DEBUG, "SWP support: %u", swp);  #endif  #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
> -	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
> +	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
>  		struct mlx5dv_striding_rq_caps mprq_caps =
> -			attrs_out.striding_rq_caps;
> +			dv_attr.striding_rq_caps;
> 
>  		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
>  			mprq_caps.min_single_stride_log_num_of_bytes);
> @@ -792,15 +792,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	}
>  #endif
>  	if (RTE_CACHE_LINE_SIZE == 128 &&
> -	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
> +	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
>  		cqe_comp = 0;
>  	else
>  		cqe_comp = 1;
>  #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
> -	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
> -		tunnel_en = ((attrs_out.tunnel_offloads_caps &
> +	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
> +		tunnel_en = ((dv_attr.tunnel_offloads_caps &
>  			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
> -			     (attrs_out.tunnel_offloads_caps &
> +			     (dv_attr.tunnel_offloads_caps &
>  			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
>  	}
>  	DRV_LOG(DEBUG, "tunnel offloading is %ssupported", @@ -810,9 +810,9 @@ mlx5_pci_probe(struct
> rte_pci_driver *pci_drv __rte_unused,
>  		"tunnel offloading disabled due to old OFED/rdma-core version");  #endif  #ifdef
> HAVE_IBV_DEVICE_MPLS_SUPPORT
> -	mpls_en = ((attrs_out.tunnel_offloads_caps &
> +	mpls_en = ((dv_attr.tunnel_offloads_caps &
>  		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
> -		   (attrs_out.tunnel_offloads_caps &
> +		   (dv_attr.tunnel_offloads_caps &
>  		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
>  	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
>  		mpls_en ? "" : "not ");
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 2/7] net/mlx5: remove redundant objects in probe code
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 2/7] net/mlx5: remove redundant objects " Adrien Mazarguil
@ 2018-06-10 11:00   ` Xueming(Steven) Li
  2018-06-12 13:19     ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-10 11:00 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Ack. Trivial issue related to other patch found , not sure whether it good to fix it here.

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 2/7] net/mlx5: remove redundant objects in probe code
> 
> This patch gets rid of redundant calls to open the device and query its attributes in order to
> simplify the code.
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  drivers/net/mlx5/mlx5.c | 60 +++++++++++++++++++++-----------------------
>  1 file changed, 28 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 602f952ca..41a542ebc 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -652,10 +652,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,  {
>  	struct ibv_device **list = NULL;
>  	struct ibv_device *ibv_dev;
> +	struct ibv_context *ctx = NULL;
> +	struct ibv_device_attr_ex attr;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
>  	int err = 0;
> -	struct ibv_context *attr_ctx = NULL;
> -	struct ibv_device_attr_ex device_attr;
>  	unsigned int vf = 0;
>  	unsigned int mps;
>  	unsigned int cqe_comp;
> @@ -712,12 +712,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
>  		      (pci_dev->id.device_id ==
>  		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> -		attr_ctx = mlx5_glue->open_device(list[i]);
> +		ctx = mlx5_glue->open_device(list[i]);
>  		rte_errno = errno;
>  		err = rte_errno;
>  		break;
>  	}
> -	if (attr_ctx == NULL) {
> +	if (ctx == NULL) {
>  		switch (err) {
>  		case 0:
>  			DRV_LOG(ERR,
> @@ -820,23 +820,20 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
>  		" old OFED/rdma-core version or firmware configuration");  #endif
> -	err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
> +	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
>  	if (err) {
>  		DEBUG("ibv_query_device_ex() failed");
>  		goto error;
>  	}
> -	DRV_LOG(INFO, "%u port(s) detected",
> -		device_attr.orig_attr.phys_port_cnt);
> -	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
> +	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> +	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
>  		char name[RTE_ETH_NAME_MAX_LEN];
>  		int len;
>  		uint32_t port = i + 1; /* ports are indexed from one */
> -		struct ibv_context *ctx = NULL;
>  		struct ibv_port_attr port_attr;
>  		struct ibv_pd *pd = NULL;
>  		struct priv *priv = NULL;
>  		struct rte_eth_dev *eth_dev = NULL;
> -		struct ibv_device_attr_ex device_attr_ex;
>  		struct ether_addr mac;
>  		struct mlx5_dev_config config = {
>  			.cqe_comp = cqe_comp,
> @@ -863,7 +860,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
>  			 pci_dev->addr.domain, pci_dev->addr.bus,
>  			 pci_dev->addr.devid, pci_dev->addr.function);
> -		if (device_attr.orig_attr.phys_port_cnt > 1)
> +		if (attr.orig_attr.phys_port_cnt > 1)
>  			snprintf(name + len, sizeof(name), " port %u", i);
>  		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  			eth_dev = rte_eth_dev_attach_secondary(name);
> @@ -905,7 +902,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			continue;
>  		}
>  		DRV_LOG(DEBUG, "using port %u", port);
> -		ctx = mlx5_glue->open_device(ibv_dev);
> +		if (!ctx)
> +			ctx = mlx5_glue->open_device(ibv_dev);
>  		if (ctx == NULL) {
>  			err = ENODEV;
>  			goto port_error;
> @@ -947,7 +945,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		priv->ctx = ctx;
>  		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>  			sizeof(priv->ibdev_path));
> -		priv->device_attr = device_attr;
> +		priv->device_attr = attr;
>  		priv->port = port;
>  		priv->pd = pd;
>  		priv->mtu = ETHER_MTU;
> @@ -958,17 +956,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				strerror(rte_errno));
>  			goto port_error;
>  		}
> -		err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
> -		if (err) {
> -			DRV_LOG(ERR, "ibv_query_device_ex() failed");
> -			goto port_error;
> -		}
> -		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
> +		config.hw_csum = !!(attr.device_cap_flags_ex &
>  				    IBV_DEVICE_RAW_IP_CSUM);
>  		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>  			(config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -		config.flow_counter_en = !!(device_attr.max_counter_sets);
> +		config.flow_counter_en = !!attr.max_counter_sets;
>  		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
>  		DRV_LOG(DEBUG,
>  			"counter type = %d, num of cs = %ld, attributes = %d", @@ -976,7 +969,7 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			cs_desc.attributes);
>  #endif
>  		config.ind_table_max_size =
> -			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
> +			attr.rss_caps.max_rwq_indirection_table_size;
>  		/* Remove this check once DPDK supports larger/variable
>  		 * indirection tables. */
>  		if (config.ind_table_max_size >
> @@ -984,29 +977,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
>  		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>  			config.ind_table_max_size);
> -		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
> +		config.hw_vlan_strip = !!(attr.raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>  		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>  			(config.hw_vlan_strip ? "" : "not "));
> 
> -		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
> +		config.hw_fcs_strip = !!(attr.raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
>  			(config.hw_fcs_strip ? "" : "not "));
> 
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -		config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
> +		config.hw_padding = !!attr.rx_pad_end_addr_align;
>  #endif
>  		DRV_LOG(DEBUG,
>  			"hardware Rx end alignment padding is %ssupported",
>  			(config.hw_padding ? "" : "not "));
>  		config.vf = vf;
> -		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
> -			      (device_attr_ex.tso_caps.supported_qpts &
> +		config.tso = (attr.tso_caps.max_tso > 0 &&
> +			      (attr.tso_caps.supported_qpts &
>  			      (1 << IBV_QPT_RAW_PACKET)));

Not related to this patch, wrong indent.

>  		if (config.tso)
> -			config.tso_max_payload_sz =
> -					device_attr_ex.tso_caps.max_tso;
> +			config.tso_max_payload_sz = attr.tso_caps.max_tso;
>  		if (config.mps && !mps) {
>  			DRV_LOG(ERR,
>  				"multi-packet send not supported on this device"
> @@ -1153,14 +1145,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				 priv, mem_event_cb);
>  		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  		rte_eth_dev_probing_finish(eth_dev);
> +		/*
> +		 * Each eth_dev instance is assigned its own Verbs context,
> +		 * since this one is consumed, let the next iteration open
> +		 * another.
> +		 */
> +		ctx = NULL;
>  		continue;
>  port_error:
>  		if (priv)
>  			rte_free(priv);
>  		if (pd)
>  			claim_zero(mlx5_glue->dealloc_pd(pd));
> -		if (ctx)
> -			claim_zero(mlx5_glue->close_device(ctx));
>  		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
>  			rte_eth_dev_release_port(eth_dev);
>  		break;
> @@ -1172,8 +1168,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	 * way to enumerate the registered ethdevs to free the previous ones.
>  	 */
>  error:
> -	if (attr_ctx)
> -		claim_zero(mlx5_glue->close_device(attr_ctx));
> +	if (ctx)
> +		claim_zero(mlx5_glue->close_device(ctx));
>  	if (list)
>  		mlx5_glue->free_device_list(list);
>  	if (err) {
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code Adrien Mazarguil
@ 2018-06-10 12:59   ` Xueming(Steven) Li
  2018-06-12 13:20     ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-10 12:59 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Hi Adrien,

The logic looks much more clear now with the split.

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code
> 
> All the generic probing code needs is an IB device. While this device is currently supplied by a PCI
> lookup, other methods will be added soon.
> 
> This patch divides the original function, which has become huge over time, as follows:
> 
> 1. PCI-specific (mlx5_pci_probe()).
> 2. All ports of a Verbs device (mlx5_dev_spawn()).
> 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  drivers/net/mlx5/mlx5.c | 332 ++++++++++++++++++++++++++-----------------
>  1 file changed, 201 insertions(+), 131 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 41a542ebc..7a812ef93 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -633,30 +633,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)  }
> 
>  /**
> - * DPDK callback to register a PCI device.
> - *
> - * This function creates an Ethernet device for each port of a given
> - * PCI device.
> + * Spawn an Ethernet device from Verbs information.
>   *
> - * @param[in] pci_drv
> - *   PCI driver structure (mlx5_driver).
> - * @param[in] pci_dev
> - *   PCI device information.
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + * @param[in] attr
> + *   Verbs device attributes.
> + * @param port
> + *   Verbs port to use (indexed from 1).
>   *
>   * @return
> - *   0 on success, a negative errno value otherwise and rte_errno is set.
> + *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> + *   is set.
>   */
> -static int
> -mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> -	       struct rte_pci_device *pci_dev)
> +static struct rte_eth_dev *
> +mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> +		   struct ibv_device *ibv_dev,
> +		   int vf,
> +		   const struct ibv_device_attr_ex *attr,
> +		   unsigned int port)
>  {
> -	struct ibv_device **list = NULL;
> -	struct ibv_device *ibv_dev;
> -	struct ibv_context *ctx = NULL;
> -	struct ibv_device_attr_ex attr;
> +	struct ibv_context *ctx;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
> +	struct rte_eth_dev *eth_dev = NULL;
>  	int err = 0;
> -	unsigned int vf = 0;
>  	unsigned int mps;
>  	unsigned int cqe_comp;
>  	unsigned int tunnel_en = 0;
> @@ -668,71 +672,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	unsigned int mprq_max_stride_size_n = 0;
>  	unsigned int mprq_min_stride_num_n = 0;
>  	unsigned int mprq_max_stride_num_n = 0;
> -	int i;
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
>  	struct ibv_counter_set_description cs_desc;  #endif
> 
>  	/* Prepare shared data between primary and secondary process. */
>  	mlx5_prepare_shared_data();
> -	assert(pci_drv == &mlx5_driver);
> -	list = mlx5_glue->get_device_list(&i);
> -	if (list == NULL) {
> -		assert(errno);
> -		err = errno;
> -		if (errno == ENOSYS)
> -			DRV_LOG(ERR,
> -				"cannot list devices, is ib_uverbs loaded?");
> -		goto error;
> -	}
> -	assert(i >= 0);
> -	/*
> -	 * For each listed device, check related sysfs entry against
> -	 * the provided PCI ID.
> -	 */
> -	while (i != 0) {
> -		struct rte_pci_addr pci_addr;
> -
> -		--i;
> -		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
> -		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
> -			continue;
> -		if ((pci_dev->addr.domain != pci_addr.domain) ||
> -		    (pci_dev->addr.bus != pci_addr.bus) ||
> -		    (pci_dev->addr.devid != pci_addr.devid) ||
> -		    (pci_dev->addr.function != pci_addr.function))
> -			continue;
> -		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> -			list[i]->name);
> -		vf = ((pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> -		ctx = mlx5_glue->open_device(list[i]);
> -		rte_errno = errno;
> -		err = rte_errno;
> -		break;
> -	}
> -	if (ctx == NULL) {
> -		switch (err) {
> -		case 0:
> -			DRV_LOG(ERR,
> -				"cannot access device, is mlx5_ib loaded?");
> -			err = ENODEV;
> -			break;
> -		case EINVAL:
> -			DRV_LOG(ERR,
> -				"cannot use device, are drivers up to date?");
> -			break;
> -		}
> -		goto error;
> +	errno = 0;
> +	ctx = mlx5_glue->open_device(ibv_dev);
> +	if (!ctx) {
> +		rte_errno = errno ? errno : ENODEV;
> +		return NULL;
>  	}
> -	ibv_dev = list[i];
> -	DRV_LOG(DEBUG, "device opened");
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
>  	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;  #endif @@ -820,20 +771,11 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
>  		" old OFED/rdma-core version or firmware configuration");  #endif
> -	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> -	if (err) {
> -		DEBUG("ibv_query_device_ex() failed");
> -		goto error;
> -	}
> -	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> -	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
> +	{
>  		char name[RTE_ETH_NAME_MAX_LEN];
> -		int len;
> -		uint32_t port = i + 1; /* ports are indexed from one */
>  		struct ibv_port_attr port_attr;
>  		struct ibv_pd *pd = NULL;
>  		struct priv *priv = NULL;
> -		struct rte_eth_dev *eth_dev = NULL;
>  		struct ether_addr mac;
>  		struct mlx5_dev_config config = {
>  			.cqe_comp = cqe_comp,
> @@ -857,11 +799,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			},
>  		};
> 
> -		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
> -			 pci_dev->addr.domain, pci_dev->addr.bus,
> -			 pci_dev->addr.devid, pci_dev->addr.function);
> -		if (attr.orig_attr.phys_port_cnt > 1)
> -			snprintf(name + len, sizeof(name), " port %u", i);
> +		if (attr->orig_attr.phys_port_cnt > 1)
> +			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> +		else
> +			snprintf(name, sizeof(name), "%s port %u",
> +				 dpdk_dev->name, port);

Name contains port only if phys_port_cnt > 1 in previous logic, are you sure?

>  		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  			eth_dev = rte_eth_dev_attach_secondary(name);
>  			if (eth_dev == NULL) {
> @@ -870,7 +812,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				err = rte_errno;
>  				goto error;
>  			}
> -			eth_dev->device = &pci_dev->device;
> +			eth_dev->device = dpdk_dev;
>  			eth_dev->dev_ops = &mlx5_dev_sec_ops;
>  			err = mlx5_uar_init_secondary(eth_dev);
>  			if (err) {
> @@ -898,16 +840,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				mlx5_select_rx_function(eth_dev);
>  			eth_dev->tx_pkt_burst =
>  				mlx5_select_tx_function(eth_dev);
> -			rte_eth_dev_probing_finish(eth_dev);
> -			continue;
> +			mlx5_glue->close_device(ctx);
> +			return eth_dev;
>  		}
>  		DRV_LOG(DEBUG, "using port %u", port);
> -		if (!ctx)
> -			ctx = mlx5_glue->open_device(ibv_dev);
> -		if (ctx == NULL) {
> -			err = ENODEV;
> -			goto port_error;
> -		}
>  		/* Check port status. */
>  		err = mlx5_glue->query_port(ctx, port, &port_attr);
>  		if (err) {
> @@ -945,23 +881,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		priv->ctx = ctx;
>  		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>  			sizeof(priv->ibdev_path));
> -		priv->device_attr = attr;
> +		priv->device_attr = *attr;
>  		priv->port = port;
>  		priv->pd = pd;
>  		priv->mtu = ETHER_MTU;
> -		err = mlx5_args(&config, pci_dev->device.devargs);
> +		err = mlx5_args(&config, dpdk_dev->devargs);
>  		if (err) {
>  			err = rte_errno;
>  			DRV_LOG(ERR, "failed to process device arguments: %s",
>  				strerror(rte_errno));
>  			goto port_error;
>  		}
> -		config.hw_csum = !!(attr.device_cap_flags_ex &
> +		config.hw_csum = !!(attr->device_cap_flags_ex &
>  				    IBV_DEVICE_RAW_IP_CSUM);
>  		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>  			(config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -		config.flow_counter_en = !!attr.max_counter_sets;
> +		config.flow_counter_en = !!attr->max_counter_sets;
>  		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
>  		DRV_LOG(DEBUG,
>  			"counter type = %d, num of cs = %ld, attributes = %d", @@ -969,7 +905,7 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			cs_desc.attributes);
>  #endif
>  		config.ind_table_max_size =
> -			attr.rss_caps.max_rwq_indirection_table_size;
> +			attr->rss_caps.max_rwq_indirection_table_size;
>  		/* Remove this check once DPDK supports larger/variable
>  		 * indirection tables. */
>  		if (config.ind_table_max_size >
> @@ -977,28 +913,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
>  		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>  			config.ind_table_max_size);
> -		config.hw_vlan_strip = !!(attr.raw_packet_caps &
> +		config.hw_vlan_strip = !!(attr->raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>  		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>  			(config.hw_vlan_strip ? "" : "not "));
> 
> -		config.hw_fcs_strip = !!(attr.raw_packet_caps &
> +		config.hw_fcs_strip = !!(attr->raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
>  			(config.hw_fcs_strip ? "" : "not "));
> 
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -		config.hw_padding = !!attr.rx_pad_end_addr_align;
> +		config.hw_padding = !!attr->rx_pad_end_addr_align;
>  #endif
>  		DRV_LOG(DEBUG,
>  			"hardware Rx end alignment padding is %ssupported",
>  			(config.hw_padding ? "" : "not "));
>  		config.vf = vf;
> -		config.tso = (attr.tso_caps.max_tso > 0 &&
> -			      (attr.tso_caps.supported_qpts &
> +		config.tso = (attr->tso_caps.max_tso > 0 &&
> +			      (attr->tso_caps.supported_qpts &
>  			      (1 << IBV_QPT_RAW_PACKET)));
>  		if (config.tso)
> -			config.tso_max_payload_sz = attr.tso_caps.max_tso;
> +			config.tso_max_payload_sz = attr->tso_caps.max_tso;
>  		if (config.mps && !mps) {
>  			DRV_LOG(ERR,
>  				"multi-packet send not supported on this device"
> @@ -1039,8 +975,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		eth_dev->data->dev_private = priv;
>  		priv->dev_data = eth_dev->data;
>  		eth_dev->data->mac_addrs = priv->mac;
> -		eth_dev->device = &pci_dev->device;
> -		rte_eth_copy_pci_info(eth_dev, pci_dev);
> +		eth_dev->device = dpdk_dev;
>  		eth_dev->device->driver = &mlx5_driver.driver;
>  		err = mlx5_uar_init_primary(eth_dev);
>  		if (err) {
> @@ -1145,13 +1080,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				 priv, mem_event_cb);
>  		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  		rte_eth_dev_probing_finish(eth_dev);
> -		/*
> -		 * Each eth_dev instance is assigned its own Verbs context,
> -		 * since this one is consumed, let the next iteration open
> -		 * another.
> -		 */
> -		ctx = NULL;
> -		continue;
> +		return eth_dev;
>  port_error:
>  		if (priv)
>  			rte_free(priv);
> @@ -1159,24 +1088,165 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			claim_zero(mlx5_glue->dealloc_pd(pd));
>  		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
>  			rte_eth_dev_release_port(eth_dev);
> -		break;
>  	}
> -	/*
> -	 * XXX if something went wrong in the loop above, there is a resource
> -	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
> -	 * long as the dpdk does not provide a way to deallocate a ethdev and a
> -	 * way to enumerate the registered ethdevs to free the previous ones.
> -	 */
>  error:
>  	if (ctx)
>  		claim_zero(mlx5_glue->close_device(ctx));
> -	if (list)
> -		mlx5_glue->free_device_list(list);
> -	if (err) {
> -		rte_errno = err;
> +	assert(err > 0);
> +	rte_errno = err;
> +	return NULL;
> +}
> +
> +/**
> + * Spawn Ethernet devices from Verbs information, one per detected port.
> + *
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + *
> + * @return
> + *   A NULL-terminated list of Ethernet device objects on success, NULL
> + *   otherwise and rte_errno is set. Caller is expected to release list
> + *   memory through free().
> + */
> +static struct rte_eth_dev **
> +mlx5_dev_spawn(struct rte_device *dpdk_dev,
> +	       struct ibv_device *ibv_dev,
> +	       int vf)
> +{
> +	struct rte_eth_dev **eth_list = NULL;
> +	struct ibv_context *ctx;
> +	struct ibv_device_attr_ex attr;
> +	unsigned int i;
> +	int ret;
> +
> +	errno = 0;
> +	ctx = mlx5_glue->open_device(ibv_dev);
> +	if (!ctx) {
> +		rte_errno = errno ? errno : ENODEV;
> +		if (rte_errno == ENODEV)
> +			DRV_LOG(ERR,
> +				"cannot access device, is mlx5_ib loaded?");
> +		else
> +			DRV_LOG(ERR,
> +				"cannot use device, are drivers up to date?");
> +		return NULL;
> +	}
> +	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> +	mlx5_glue->close_device(ctx);
> +	if (ret) {
> +		rte_errno = ret;
> +		DRV_LOG(ERR, "unable to query device information: %s",
> +			strerror(rte_errno));
> +		return NULL;
> +	}
> +	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> +	eth_list = malloc(sizeof(*eth_list) *
> +			  (attr.orig_attr.phys_port_cnt + 1));
> +	if (!eth_list) {
> +		rte_errno = errno;
> +		return NULL;
> +	}
> +	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> +		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> +						 &attr, i + 1);
> +		if (eth_list[i])
> +			continue;
> +		/* Save rte_errno and roll back in case of failure. */
> +		ret = rte_errno;
> +		while (i--) {
> +			mlx5_dev_close(eth_list[i]);
> +			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +				rte_free(eth_list[i]->data->dev_private);
> +			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> +		}
> +		free(eth_list);
> +		rte_errno = ret;
> +		return NULL;

The code is correct, but I personally prefer to move complex error handling to 
dedicate "error:" block to make the code clear.

> +	}
> +	eth_list[i] = NULL;
> +	return eth_list;
> +}
> +
> +/**
> + * DPDK callback to register a PCI device.
> + *
> + * This function creates an Ethernet device for each port of a given
> + * PCI device.
> + *
> + * @param[in] pci_drv
> + *   PCI driver structure (mlx5_driver).
> + * @param[in] pci_dev
> + *   PCI device information.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> +	       struct rte_pci_device *pci_dev) {
> +	struct ibv_device **ibv_list;
> +	struct rte_eth_dev **eth_list = NULL;
> +	int vf;
> +	int ret;
> +
> +	assert(pci_drv == &mlx5_driver);
> +	switch (pci_dev->id.device_id) {
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> +		vf = 1;
> +		break;
> +	default:
> +		vf = 0;
> +	}

How about use a macro for vf detection and invoke in mlx5_dev_spawn_one().
Seems it not used in outer callers.
	
> +	errno = 0;
> +	ibv_list = mlx5_glue->get_device_list(&ret);
> +	if (!ibv_list) {
> +		rte_errno = errno ? errno : ENOSYS;
> +		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
>  		return -rte_errno;
>  	}
> -	return 0;
> +	while (ret-- > 0) {
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
> +		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> +			continue;
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> +			ibv_list[ret]->name);
> +		break;
> +	}
> +	if (ret >= 0)
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
> +	mlx5_glue->free_device_list(ibv_list);
> +	if (!eth_list || !*eth_list) {
> +		DRV_LOG(WARNING,
> +			"no Verbs device matches PCI " PCI_PRI_FMT ","
> +			" are kernel drivers loaded?",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function);
> +		rte_errno = ENOENT;
> +		ret = -rte_errno;
> +	} else {
> +		for (ret = 0; eth_list[ret]; ++ret) {
> +			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +			rte_eth_dev_probing_finish(eth_list[ret]);
> +		}
> +		ret = 0;
> +	}
> +	free(eth_list);
> +	return ret;
>  }
> 
>  static const struct rte_pci_id mlx5_pci_id_map[] = {
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 4/7] net/mlx5: re-indent generic probing function
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 4/7] net/mlx5: re-indent generic probing function Adrien Mazarguil
@ 2018-06-11 11:42   ` Xueming(Steven) Li
  0 siblings, 0 replies; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-11 11:42 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Reviewed-by: Xueming(Steven) Li <xuemingl@mellanox.com>

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 4/7] net/mlx5: re-indent generic probing function
> 
> Since commit "net/mlx5: split PCI from generic probing code" extracted the inner loop to a separate
> function, mlx5_dev_spawn_one() is left with an unnecessary indent level.
> 
> This patch eliminates a block, moves its local variables to function scope, and re-indents its
> contents.
> 
> No functional impact.
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  drivers/net/mlx5/mlx5.c | 589 +++++++++++++++++++++----------------------
>  1 file changed, 286 insertions(+), 303 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 7a812ef93..d57e8118c 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -658,8 +658,27 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		   unsigned int port)
>  {
>  	struct ibv_context *ctx;
> +	struct ibv_port_attr port_attr;
> +	struct ibv_pd *pd = NULL;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
> +	struct mlx5_dev_config config = {
> +		.vf = !!vf,
> +		.tx_vec_en = 1,
> +		.rx_vec_en = 1,
> +		.mpw_hdr_dseg = 0,
> +		.txq_inline = MLX5_ARG_UNSET,
> +		.txqs_inline = MLX5_ARG_UNSET,
> +		.inline_max_packet_sz = MLX5_ARG_UNSET,
> +		.vf_nl_en = 1,
> +		.mprq = {
> +			.enabled = 0,
> +			.stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
> +			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
> +			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
> +		},
> +	};
>  	struct rte_eth_dev *eth_dev = NULL;
> +	struct priv *priv = NULL;
>  	int err = 0;
>  	unsigned int mps;
>  	unsigned int cqe_comp;
> @@ -675,6 +694,8 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,  #ifdef
> HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
>  	struct ibv_counter_set_description cs_desc;  #endif
> +	struct ether_addr mac;
> +	char name[RTE_ETH_NAME_MAX_LEN];
> 
>  	/* Prepare shared data between primary and secondary process. */
>  	mlx5_prepare_shared_data();
> @@ -710,11 +731,13 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		DRV_LOG(DEBUG, "MPW isn't supported");
>  		mps = MLX5_MPW_DISABLED;
>  	}
> +	config.mps = mps;
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
>  	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
>  		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
>  	DRV_LOG(DEBUG, "SWP support: %u", swp);  #endif
> +	config.swp = !!swp;
>  #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
>  	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
>  		struct mlx5dv_striding_rq_caps mprq_caps = @@ -740,6 +763,8 @@ mlx5_dev_spawn_one(struct
> rte_device *dpdk_dev,
>  			mprq_caps.min_single_wqe_log_num_of_strides;
>  		mprq_max_stride_num_n =
>  			mprq_caps.max_single_wqe_log_num_of_strides;
> +		config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
> +						   mprq_min_stride_num_n);
>  	}
>  #endif
>  	if (RTE_CACHE_LINE_SIZE == 128 &&
> @@ -747,6 +772,7 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		cqe_comp = 0;
>  	else
>  		cqe_comp = 1;
> +	config.cqe_comp = cqe_comp;
>  #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
>  	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
>  		tunnel_en = ((dv_attr.tunnel_offloads_caps & @@ -760,6 +786,7 @@ mlx5_dev_spawn_one(struct
> rte_device *dpdk_dev,
>  	DRV_LOG(WARNING,
>  		"tunnel offloading disabled due to old OFED/rdma-core version");  #endif
> +	config.tunnel_en = tunnel_en;
>  #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
>  	mpls_en = ((dv_attr.tunnel_offloads_caps &
>  		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && @@ -771,325 +798,281 @@
> mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
>  		" old OFED/rdma-core version or firmware configuration");  #endif
> -	{
> -		char name[RTE_ETH_NAME_MAX_LEN];
> -		struct ibv_port_attr port_attr;
> -		struct ibv_pd *pd = NULL;
> -		struct priv *priv = NULL;
> -		struct ether_addr mac;
> -		struct mlx5_dev_config config = {
> -			.cqe_comp = cqe_comp,
> -			.mps = mps,
> -			.tunnel_en = tunnel_en,
> -			.mpls_en = mpls_en,
> -			.tx_vec_en = 1,
> -			.rx_vec_en = 1,
> -			.mpw_hdr_dseg = 0,
> -			.txq_inline = MLX5_ARG_UNSET,
> -			.txqs_inline = MLX5_ARG_UNSET,
> -			.inline_max_packet_sz = MLX5_ARG_UNSET,
> -			.vf_nl_en = 1,
> -			.swp = !!swp,
> -			.mprq = {
> -				.enabled = 0, /* Disabled by default. */
> -				.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
> -							mprq_min_stride_num_n),
> -				.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
> -				.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
> -			},
> -		};
> -
> -		if (attr->orig_attr.phys_port_cnt > 1)
> -			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> -		else
> -			snprintf(name, sizeof(name), "%s port %u",
> -				 dpdk_dev->name, port);
> -		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> -			eth_dev = rte_eth_dev_attach_secondary(name);
> -			if (eth_dev == NULL) {
> -				DRV_LOG(ERR, "can not attach rte ethdev");
> -				rte_errno = ENOMEM;
> -				err = rte_errno;
> -				goto error;
> -			}
> -			eth_dev->device = dpdk_dev;
> -			eth_dev->dev_ops = &mlx5_dev_sec_ops;
> -			err = mlx5_uar_init_secondary(eth_dev);
> -			if (err) {
> -				err = rte_errno;
> -				goto error;
> -			}
> -			/* Receive command fd from primary process */
> -			err = mlx5_socket_connect(eth_dev);
> -			if (err < 0) {
> -				err = rte_errno;
> -				goto error;
> -			}
> -			/* Remap UAR for Tx queues. */
> -			err = mlx5_tx_uar_remap(eth_dev, err);
> -			if (err) {
> -				err = rte_errno;
> -				goto error;
> -			}
> -			/*
> -			 * Ethdev pointer is still required as input since
> -			 * the primary device is not accessible from the
> -			 * secondary process.
> -			 */
> -			eth_dev->rx_pkt_burst =
> -				mlx5_select_rx_function(eth_dev);
> -			eth_dev->tx_pkt_burst =
> -				mlx5_select_tx_function(eth_dev);
> -			mlx5_glue->close_device(ctx);
> -			return eth_dev;
> +	config.mpls_en = mpls_en;
> +	if (attr->orig_attr.phys_port_cnt > 1)
> +		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> +	else
> +		snprintf(name, sizeof(name), "%s port %u",
> +			 dpdk_dev->name, port);
> +	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +		eth_dev = rte_eth_dev_attach_secondary(name);
> +		if (eth_dev == NULL) {
> +			DRV_LOG(ERR, "can not attach rte ethdev");
> +			rte_errno = ENOMEM;
> +			err = rte_errno;
> +			goto error;
>  		}
> -		DRV_LOG(DEBUG, "using port %u", port);
> -		/* Check port status. */
> -		err = mlx5_glue->query_port(ctx, port, &port_attr);
> +		eth_dev->device = dpdk_dev;
> +		eth_dev->dev_ops = &mlx5_dev_sec_ops;
> +		err = mlx5_uar_init_secondary(eth_dev);
>  		if (err) {
> -			DRV_LOG(ERR, "port query failed: %s", strerror(err));
> -			goto port_error;
> -		}
> -		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
> -			DRV_LOG(ERR,
> -				"port %d is not configured in Ethernet mode",
> -				port);
> -			err = EINVAL;
> -			goto port_error;
> -		}
> -		if (port_attr.state != IBV_PORT_ACTIVE)
> -			DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
> -				port,
> -				mlx5_glue->port_state_str(port_attr.state),
> -				port_attr.state);
> -		/* Allocate protection domain. */
> -		pd = mlx5_glue->alloc_pd(ctx);
> -		if (pd == NULL) {
> -			DRV_LOG(ERR, "PD allocation failure");
> -			err = ENOMEM;
> -			goto port_error;
> +			err = rte_errno;
> +			goto error;
>  		}
> -		/* from rte_ethdev.c */
> -		priv = rte_zmalloc("ethdev private structure",
> -				   sizeof(*priv),
> -				   RTE_CACHE_LINE_SIZE);
> -		if (priv == NULL) {
> -			DRV_LOG(ERR, "priv allocation failure");
> -			err = ENOMEM;
> -			goto port_error;
> +		/* Receive command fd from primary process */
> +		err = mlx5_socket_connect(eth_dev);
> +		if (err < 0) {
> +			err = rte_errno;
> +			goto error;
>  		}
> -		priv->ctx = ctx;
> -		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
> -			sizeof(priv->ibdev_path));
> -		priv->device_attr = *attr;
> -		priv->port = port;
> -		priv->pd = pd;
> -		priv->mtu = ETHER_MTU;
> -		err = mlx5_args(&config, dpdk_dev->devargs);
> +		/* Remap UAR for Tx queues. */
> +		err = mlx5_tx_uar_remap(eth_dev, err);
>  		if (err) {
>  			err = rte_errno;
> -			DRV_LOG(ERR, "failed to process device arguments: %s",
> -				strerror(rte_errno));
> -			goto port_error;
> +			goto error;
>  		}
> -		config.hw_csum = !!(attr->device_cap_flags_ex &
> -				    IBV_DEVICE_RAW_IP_CSUM);
> -		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
> -			(config.hw_csum ? "" : "not "));
> +		/*
> +		 * Ethdev pointer is still required as input since
> +		 * the primary device is not accessible from the
> +		 * secondary process.
> +		 */
> +		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
> +		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
> +		mlx5_glue->close_device(ctx);
> +		return eth_dev;
> +	}
> +	DRV_LOG(DEBUG, "using port %u", port);
> +	/* Check port status. */
> +	err = mlx5_glue->query_port(ctx, port, &port_attr);
> +	if (err) {
> +		DRV_LOG(ERR, "port query failed: %s", strerror(err));
> +		goto error;
> +	}
> +	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
> +		DRV_LOG(ERR, "port %d is not configured in Ethernet mode",
> +			port);
> +		err = EINVAL;
> +		goto error;
> +	}
> +	if (port_attr.state != IBV_PORT_ACTIVE)
> +		DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
> +			port, mlx5_glue->port_state_str(port_attr.state),
> +			port_attr.state);
> +	/* Allocate protection domain. */
> +	pd = mlx5_glue->alloc_pd(ctx);
> +	if (pd == NULL) {
> +		DRV_LOG(ERR, "PD allocation failure");
> +		err = ENOMEM;
> +		goto error;
> +	}
> +	priv = rte_zmalloc("ethdev private structure",
> +			   sizeof(*priv),
> +			   RTE_CACHE_LINE_SIZE);
> +	if (priv == NULL) {
> +		DRV_LOG(ERR, "priv allocation failure");
> +		err = ENOMEM;
> +		goto error;
> +	}
> +	priv->ctx = ctx;
> +	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
> +		sizeof(priv->ibdev_path));
> +	priv->device_attr = *attr;
> +	priv->port = port;
> +	priv->pd = pd;
> +	priv->mtu = ETHER_MTU;
> +	err = mlx5_args(&config, dpdk_dev->devargs);
> +	if (err) {
> +		err = rte_errno;
> +		DRV_LOG(ERR, "failed to process device arguments: %s",
> +			strerror(rte_errno));
> +		goto error;
> +	}
> +	config.hw_csum = !!(attr->device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM);
> +	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
> +		(config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -		config.flow_counter_en = !!attr->max_counter_sets;
> -		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
> -		DRV_LOG(DEBUG,
> -			"counter type = %d, num of cs = %ld, attributes = %d",
> -			cs_desc.counter_type, cs_desc.num_of_cs,
> -			cs_desc.attributes);
> +	config.flow_counter_en = !!attr->max_counter_sets;
> +	mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
> +	DRV_LOG(DEBUG, "counter type = %d, num of cs = %ld, attributes = %d",
> +		cs_desc.counter_type, cs_desc.num_of_cs,
> +		cs_desc.attributes);
>  #endif
> -		config.ind_table_max_size =
> -			attr->rss_caps.max_rwq_indirection_table_size;
> -		/* Remove this check once DPDK supports larger/variable
> -		 * indirection tables. */
> -		if (config.ind_table_max_size >
> -				(unsigned int)ETH_RSS_RETA_SIZE_512)
> -			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
> -		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
> -			config.ind_table_max_size);
> -		config.hw_vlan_strip = !!(attr->raw_packet_caps &
> -					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
> -		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
> -			(config.hw_vlan_strip ? "" : "not "));
> -
> -		config.hw_fcs_strip = !!(attr->raw_packet_caps &
> -					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
> -		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
> -			(config.hw_fcs_strip ? "" : "not "));
> -
> +	config.ind_table_max_size =
> +		attr->rss_caps.max_rwq_indirection_table_size;
> +	/*
> +	 * Remove this check once DPDK supports larger/variable
> +	 * indirection tables.
> +	 */
> +	if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
> +		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
> +	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
> +		config.ind_table_max_size);
> +	config.hw_vlan_strip = !!(attr->raw_packet_caps &
> +				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
> +	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
> +		(config.hw_vlan_strip ? "" : "not "));
> +	config.hw_fcs_strip = !!(attr->raw_packet_caps &
> +				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
> +	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
> +		(config.hw_fcs_strip ? "" : "not "));
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -		config.hw_padding = !!attr->rx_pad_end_addr_align;
> +	config.hw_padding = !!attr->rx_pad_end_addr_align;
>  #endif
> -		DRV_LOG(DEBUG,
> -			"hardware Rx end alignment padding is %ssupported",
> -			(config.hw_padding ? "" : "not "));
> -		config.vf = vf;
> -		config.tso = (attr->tso_caps.max_tso > 0 &&
> -			      (attr->tso_caps.supported_qpts &
> -			      (1 << IBV_QPT_RAW_PACKET)));
> -		if (config.tso)
> -			config.tso_max_payload_sz = attr->tso_caps.max_tso;
> -		if (config.mps && !mps) {
> -			DRV_LOG(ERR,
> -				"multi-packet send not supported on this device"
> -				" (" MLX5_TXQ_MPW_EN ")");
> -			err = ENOTSUP;
> -			goto port_error;
> -		}
> -		DRV_LOG(INFO, "%s MPS is %s",
> -			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
> -			config.mps != MLX5_MPW_DISABLED ? "enabled" :
> -			"disabled");
> -		if (config.cqe_comp && !cqe_comp) {
> -			DRV_LOG(WARNING, "Rx CQE compression isn't supported");
> -			config.cqe_comp = 0;
> -		}
> -		config.mprq.enabled = config.mprq.enabled && mprq;
> -		if (config.mprq.enabled) {
> -			if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
> -			    config.mprq.stride_num_n < mprq_min_stride_num_n) {
> -				config.mprq.stride_num_n =
> -					RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
> -						mprq_min_stride_num_n);
> -				DRV_LOG(WARNING,
> -					"the number of strides"
> -					" for Multi-Packet RQ is out of range,"
> -					" setting default value (%u)",
> -					1 << config.mprq.stride_num_n);
> -			}
> -			config.mprq.min_stride_size_n = mprq_min_stride_size_n;
> -			config.mprq.max_stride_size_n = mprq_max_stride_size_n;
> -		}
> -		eth_dev = rte_eth_dev_allocate(name);
> -		if (eth_dev == NULL) {
> -			DRV_LOG(ERR, "can not allocate rte ethdev");
> -			err = ENOMEM;
> -			goto port_error;
> -		}
> -		eth_dev->data->dev_private = priv;
> -		priv->dev_data = eth_dev->data;
> -		eth_dev->data->mac_addrs = priv->mac;
> -		eth_dev->device = dpdk_dev;
> -		eth_dev->device->driver = &mlx5_driver.driver;
> -		err = mlx5_uar_init_primary(eth_dev);
> -		if (err) {
> -			err = rte_errno;
> -			goto port_error;
> -		}
> -		/* Configure the first MAC address by default. */
> -		if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
> -			DRV_LOG(ERR,
> -				"port %u cannot get MAC address, is mlx5_en"
> -				" loaded? (errno: %s)",
> -				eth_dev->data->port_id, strerror(rte_errno));
> -			err = ENODEV;
> -			goto port_error;
> +	DRV_LOG(DEBUG, "hardware Rx end alignment padding is %ssupported",
> +		(config.hw_padding ? "" : "not "));
> +	config.tso = (attr->tso_caps.max_tso > 0 &&
> +		      (attr->tso_caps.supported_qpts &
> +		       (1 << IBV_QPT_RAW_PACKET)));
> +	if (config.tso)
> +		config.tso_max_payload_sz = attr->tso_caps.max_tso;
> +	if (config.mps && !mps) {
> +		DRV_LOG(ERR,
> +			"multi-packet send not supported on this device"
> +			" (" MLX5_TXQ_MPW_EN ")");
> +		err = ENOTSUP;
> +		goto error;
> +	}
> +	DRV_LOG(INFO, "%sMPS is %s",
> +		config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
> +		config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
> +	if (config.cqe_comp && !cqe_comp) {
> +		DRV_LOG(WARNING, "Rx CQE compression isn't supported");
> +		config.cqe_comp = 0;
> +	}
> +	config.mprq.enabled = config.mprq.enabled && mprq;
> +	if (config.mprq.enabled) {
> +		if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
> +		    config.mprq.stride_num_n < mprq_min_stride_num_n) {
> +			config.mprq.stride_num_n =
> +				RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
> +					mprq_min_stride_num_n);
> +			DRV_LOG(WARNING,
> +				"the number of strides"
> +				" for Multi-Packet RQ is out of range,"
> +				" setting default value (%u)",
> +				1 << config.mprq.stride_num_n);
>  		}
> -		DRV_LOG(INFO,
> -			"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
> -			eth_dev->data->port_id,
> -			mac.addr_bytes[0], mac.addr_bytes[1],
> -			mac.addr_bytes[2], mac.addr_bytes[3],
> -			mac.addr_bytes[4], mac.addr_bytes[5]);
> +		config.mprq.min_stride_size_n = mprq_min_stride_size_n;
> +		config.mprq.max_stride_size_n = mprq_max_stride_size_n;
> +	}
> +	eth_dev = rte_eth_dev_allocate(name);
> +	if (eth_dev == NULL) {
> +		DRV_LOG(ERR, "can not allocate rte ethdev");
> +		err = ENOMEM;
> +		goto error;
> +	}
> +	eth_dev->data->dev_private = priv;
> +	priv->dev_data = eth_dev->data;
> +	eth_dev->data->mac_addrs = priv->mac;
> +	eth_dev->device = dpdk_dev;
> +	eth_dev->device->driver = &mlx5_driver.driver;
> +	err = mlx5_uar_init_primary(eth_dev);
> +	if (err) {
> +		err = rte_errno;
> +		goto error;
> +	}
> +	/* Configure the first MAC address by default. */
> +	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
> +		DRV_LOG(ERR,
> +			"port %u cannot get MAC address, is mlx5_en"
> +			" loaded? (errno: %s)",
> +			eth_dev->data->port_id, strerror(rte_errno));
> +		err = ENODEV;
> +		goto error;
> +	}
> +	DRV_LOG(INFO,
> +		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
> +		eth_dev->data->port_id,
> +		mac.addr_bytes[0], mac.addr_bytes[1],
> +		mac.addr_bytes[2], mac.addr_bytes[3],
> +		mac.addr_bytes[4], mac.addr_bytes[5]);
>  #ifndef NDEBUG
> -		{
> -			char ifname[IF_NAMESIZE];
> -
> -			if (mlx5_get_ifname(eth_dev, &ifname) == 0)
> -				DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
> -					eth_dev->data->port_id, ifname);
> -			else
> -				DRV_LOG(DEBUG, "port %u ifname is unknown",
> -					eth_dev->data->port_id);
> -		}
> +	{
> +		char ifname[IF_NAMESIZE];
> +
> +		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
> +			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
> +				eth_dev->data->port_id, ifname);
> +		else
> +			DRV_LOG(DEBUG, "port %u ifname is unknown",
> +				eth_dev->data->port_id);
> +	}
>  #endif
> -		/* Get actual MTU if possible. */
> -		err = mlx5_get_mtu(eth_dev, &priv->mtu);
> -		if (err) {
> -			err = rte_errno;
> -			goto port_error;
> -		}
> -		DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
> -			priv->mtu);
> -		/*
> -		 * Initialize burst functions to prevent crashes before link-up.
> -		 */
> -		eth_dev->rx_pkt_burst = removed_rx_burst;
> -		eth_dev->tx_pkt_burst = removed_tx_burst;
> -		eth_dev->dev_ops = &mlx5_dev_ops;
> -		/* Register MAC address. */
> -		claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
> -		priv->nl_socket = -1;
> -		priv->nl_sn = 0;
> -		if (vf && config.vf_nl_en) {
> -			priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
> -			if (priv->nl_socket < 0)
> -				priv->nl_socket = -1;
> -			mlx5_nl_mac_addr_sync(eth_dev);
> -		}
> -		TAILQ_INIT(&priv->flows);
> -		TAILQ_INIT(&priv->ctrl_flows);
> -		/* Hint libmlx5 to use PMD allocator for data plane resources */
> -		struct mlx5dv_ctx_allocators alctr = {
> -			.alloc = &mlx5_alloc_verbs_buf,
> -			.free = &mlx5_free_verbs_buf,
> -			.data = priv,
> -		};
> -		mlx5_glue->dv_set_context_attr(ctx,
> -					       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
> -					       (void *)((uintptr_t)&alctr));
> -		/* Bring Ethernet device up. */
> -		DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
> -			eth_dev->data->port_id);
> -		mlx5_set_link_up(eth_dev);
> -		/*
> -		 * Even though the interrupt handler is not installed yet,
> -		 * interrupts will still trigger on the asyn_fd from
> -		 * Verbs context returned by ibv_open_device().
> -		 */
> -		mlx5_link_update(eth_dev, 0);
> -		/* Store device configuration on private structure. */
> -		priv->config = config;
> -		/* Create drop queue. */
> -		err = mlx5_flow_create_drop_queue(eth_dev);
> -		if (err) {
> -			DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
> -				eth_dev->data->port_id, strerror(rte_errno));
> -			err = rte_errno;
> -			goto port_error;
> -		}
> -		/* Supported Verbs flow priority number detection. */
> -		if (verb_priorities == 0)
> -			verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
> -		if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
> -			DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
> -				eth_dev->data->port_id, verb_priorities);
> -			err = ENOTSUP;
> -			goto port_error;
> -		}
> -		priv->config.max_verbs_prio = verb_priorities;
> -		/* Add device to memory callback list. */
> -		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> -		LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
> -				 priv, mem_event_cb);
> -		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> -		rte_eth_dev_probing_finish(eth_dev);
> -		return eth_dev;
> -port_error:
> -		if (priv)
> -			rte_free(priv);
> -		if (pd)
> -			claim_zero(mlx5_glue->dealloc_pd(pd));
> -		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
> -			rte_eth_dev_release_port(eth_dev);
> +	/* Get actual MTU if possible. */
> +	err = mlx5_get_mtu(eth_dev, &priv->mtu);
> +	if (err) {
> +		err = rte_errno;
> +		goto error;
> +	}
> +	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
> +		priv->mtu);
> +	/* Initialize burst functions to prevent crashes before link-up. */
> +	eth_dev->rx_pkt_burst = removed_rx_burst;
> +	eth_dev->tx_pkt_burst = removed_tx_burst;
> +	eth_dev->dev_ops = &mlx5_dev_ops;
> +	/* Register MAC address. */
> +	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
> +	priv->nl_socket = -1;
> +	priv->nl_sn = 0;
> +	if (vf && config.vf_nl_en) {
> +		priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
> +		if (priv->nl_socket < 0)
> +			priv->nl_socket = -1;
> +		mlx5_nl_mac_addr_sync(eth_dev);
> +	}
> +	TAILQ_INIT(&priv->flows);
> +	TAILQ_INIT(&priv->ctrl_flows);
> +	/* Hint libmlx5 to use PMD allocator for data plane resources */
> +	struct mlx5dv_ctx_allocators alctr = {
> +		.alloc = &mlx5_alloc_verbs_buf,
> +		.free = &mlx5_free_verbs_buf,
> +		.data = priv,
> +	};
> +	mlx5_glue->dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
> +				       (void *)((uintptr_t)&alctr));
> +	/* Bring Ethernet device up. */
> +	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
> +		eth_dev->data->port_id);
> +	mlx5_set_link_up(eth_dev);
> +	/*
> +	 * Even though the interrupt handler is not installed yet,
> +	 * interrupts will still trigger on the asyn_fd from
> +	 * Verbs context returned by ibv_open_device().
> +	 */
> +	mlx5_link_update(eth_dev, 0);
> +	/* Store device configuration on private structure. */
> +	priv->config = config;
> +	/* Create drop queue. */
> +	err = mlx5_flow_create_drop_queue(eth_dev);
> +	if (err) {
> +		DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
> +			eth_dev->data->port_id, strerror(rte_errno));
> +		err = rte_errno;
> +		goto error;
> +	}
> +	/* Supported Verbs flow priority number detection. */
> +	if (verb_priorities == 0)
> +		verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
> +	if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
> +		DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
> +			eth_dev->data->port_id, verb_priorities);
> +		err = ENOTSUP;
> +		goto error;
>  	}
> +	priv->config.max_verbs_prio = verb_priorities;
> +	/* Add device to memory callback list. */
> +	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> +	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
> +			 priv, mem_event_cb);
> +	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> +	return eth_dev;
>  error:
> +	if (priv)
> +		rte_free(priv);
> +	if (pd)
> +		claim_zero(mlx5_glue->dealloc_pd(pd));
> +	if (eth_dev)
> +		rte_eth_dev_release_port(eth_dev);
>  	if (ctx)
>  		claim_zero(mlx5_glue->close_device(ctx));
>  	assert(err > 0);
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 6/7] net/mlx5: probe all port representors
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 6/7] net/mlx5: probe all port representors Adrien Mazarguil
@ 2018-06-12  6:42   ` Xueming(Steven) Li
  2018-06-12 13:20     ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-12  6:42 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Hi Adrien,

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 6/7] net/mlx5: probe all port representors
> 
> Probe existing port representors in addition to their master device and associate them automatically.
> 
> To avoid name collision between Ethernet devices, their names use the same convention as ixgbe and
> i40e PMDs, that is, instead of only a PCI address in DBDF notation:
> 
> - "net_{DBDF}_0" for master/switch devices.
> - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
>   representors.
> 
> Both optionally suffixed with "_port_{num}" instead of " port {num}" for devices that expose several
> Verbs ports (note this is never the case on mlx5, but kept for historical reasons for the time being).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  drivers/net/mlx5/mlx5.c        | 119 ++++++++++++++++++++---------
>  drivers/net/mlx5/mlx5.h        |   8 +-
>  drivers/net/mlx5/mlx5_ethdev.c | 145 ++++++++++++++++++++++++++++++++----
>  drivers/net/mlx5/mlx5_mac.c    |   2 +-
>  drivers/net/mlx5/mlx5_stats.c  |   6 +-
>  5 files changed, 226 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index d3a298332..09afca63c 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -301,6 +301,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
>  	if (ret)
>  		DRV_LOG(WARNING, "port %u some flows still remain",
>  			dev->data->port_id);
> +	if (!priv->representor &&
> +	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> +		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
>  	memset(priv, 0, sizeof(*priv));
>  }
> 
> @@ -645,6 +648,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
>   *   Verbs device attributes.
>   * @param port
>   *   Verbs port to use (indexed from 1).
> + * @param master
> + *   Master device in case @p ibv_dev is a port representor.
> + * @param rep_id
> + *   Representor identifier when @p master is non-NULL.
>   *
>   * @return
>   *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> @@ -655,7 +662,9 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		   struct ibv_device *ibv_dev,
>  		   int vf,
>  		   const struct ibv_device_attr_ex *attr,
> -		   unsigned int port)
> +		   unsigned int port,
> +		   struct rte_eth_dev *master,
> +		   unsigned int rep_id)
>  {
>  	struct ibv_context *ctx;
>  	struct ibv_port_attr port_attr;
> @@ -799,11 +808,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		" old OFED/rdma-core version or firmware configuration");  #endif
>  	config.mpls_en = mpls_en;
> -	if (attr->orig_attr.phys_port_cnt > 1)
> -		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> +	if (!master)
> +		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name);
>  	else
> -		snprintf(name, sizeof(name), "%s port %u",
> -			 dpdk_dev->name, port);
> +		snprintf(name, sizeof(name), "net_%s_representor_%u",
> +			 dpdk_dev->name, rep_id);
> +	if (attr->orig_attr.phys_port_cnt > 1)
> +		snprintf(name, sizeof(name), "%s_port_%u", name, port);
> +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
>  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  		eth_dev = rte_eth_dev_attach_secondary(name);
>  		if (eth_dev == NULL) {
> @@ -880,6 +892,27 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  	priv->port = port;
>  	priv->pd = pd;
>  	priv->mtu = ETHER_MTU;
> +	/*
> +	 * Allocate a switch domain for master devices and share it with
> +	 * port representors.
> +	 */
> +	if (!master) {
> +		priv->representor = 0;
> +		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> +		priv->rep_id = 0;
> +		err = rte_eth_switch_domain_alloc(&priv->domain_id);

So domain_id is used to identify relation between PF and representor port?

> +		if (err) {
> +			err = rte_errno;
> +			DRV_LOG(ERR, "unable to allocate switch domain: %s",
> +				strerror(rte_errno));
> +			goto error;
> +		}
> +	} else {
> +		priv->representor = 1;
> +		priv->domain_id =
> +			((struct priv *)master->data->dev_private)->domain_id;
> +		priv->rep_id = rep_id;
> +	}
>  	err = mlx5_args(&config, dpdk_dev->devargs);
>  	if (err) {
>  		err = rte_errno;
> @@ -1067,8 +1100,12 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  	return eth_dev;
>  error:
> -	if (priv)
> +	if (priv) {
> +		if (!priv->representor &&
> +		    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> +			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
>  		rte_free(priv);
> +	}
>  	if (pd)
>  		claim_zero(mlx5_glue->dealloc_pd(pd));
>  	if (eth_dev)
> @@ -1081,12 +1118,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,  }
> 
>  /**
> - * Spawn Ethernet devices from Verbs information, one per detected port.
> + * Spawn Ethernet devices from Verbs information, one per detected port
> + and
> + * port representor.
>   *
>   * @param dpdk_dev
>   *   Backing DPDK device.
>   * @param ibv_dev
> - *   Verbs device.
> + *   NULL-terminated list of Verbs devices. First entry is the master device
> + *   (mandatory), followed by optional representors.
>   * @param vf
>   *   If nonzero, enable VF-specific features.
>   *
> @@ -1097,17 +1136,21 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>   */
>  static struct rte_eth_dev **
>  mlx5_dev_spawn(struct rte_device *dpdk_dev,
> -	       struct ibv_device *ibv_dev,
> +	       struct ibv_device **ibv_dev,
>  	       int vf)
>  {
>  	struct rte_eth_dev **eth_list = NULL;
>  	struct ibv_context *ctx;
>  	struct ibv_device_attr_ex attr;
> +	void *tmp;
>  	unsigned int i;
> +	unsigned int j = 0;
> +	unsigned int n = 0;
>  	int ret;
> 
> +next:
>  	errno = 0;
> -	ctx = mlx5_glue->open_device(ibv_dev);
> +	ctx = mlx5_glue->open_device(ibv_dev[j]);
>  	if (!ctx) {
>  		rte_errno = errno ? errno : ENODEV;
>  		if (rte_errno == ENODEV)
> @@ -1116,7 +1159,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		else
>  			DRV_LOG(ERR,
>  				"cannot use device, are drivers up to date?");
> -		return NULL;
> +		goto error;
>  	}
>  	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
>  	mlx5_glue->close_device(ctx);
> @@ -1124,34 +1167,42 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		rte_errno = ret;
>  		DRV_LOG(ERR, "unable to query device information: %s",
>  			strerror(rte_errno));
> -		return NULL;
> +		goto error;
>  	}
> -	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> -	eth_list = malloc(sizeof(*eth_list) *
> -			  (attr.orig_attr.phys_port_cnt + 1));
> -	if (!eth_list) {
> +	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
> +		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
> +	tmp = realloc(eth_list, sizeof(*eth_list) *
> +		      (n + attr.orig_attr.phys_port_cnt + 1));
> +	if (!tmp) {
>  		rte_errno = errno;
> -		return NULL;
> +		goto error;
>  	}
> +	eth_list = tmp;
>  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {

Is there any mlx5 device that support more than physical ports on same PCI id?
I remember this is major difference between mlx5 and mlx4.

> -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> -						 &attr, i + 1);
> -		if (eth_list[i])
> -			continue;
> -		/* Save rte_errno and roll back in case of failure. */
> -		ret = rte_errno;
> -		while (i--) {
> -			mlx5_dev_close(eth_list[i]);
> -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> -				rte_free(eth_list[i]->data->dev_private);
> -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> -		}
> -		free(eth_list);
> -		rte_errno = ret;
> -		return NULL;
> +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j], vf,
> +						 &attr, i + 1,
> +						 j ? eth_list[0] : NULL,
> +						 j - 1);
> +		if (!eth_list[n])
> +			goto error;
> +		++n;
>  	}
> -	eth_list[i] = NULL;
> +	if (ibv_dev[++j])
> +		goto next;
> +	eth_list[n] = NULL;
>  	return eth_list;
> +error:
> +	/* Save rte_errno and roll back in case of failure. */
> +	ret = rte_errno;
> +	while (n--) {
> +		mlx5_dev_close(eth_list[n]);
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +			rte_free(eth_list[n]->data->dev_private);
> +		claim_zero(rte_eth_dev_release_port(eth_list[n]));
> +	}
> +	free(eth_list);
> +	rte_errno = ret;
> +	return NULL;
>  }
> 
>  /**
> @@ -1264,7 +1315,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				ibv_match[ret]->name, ret - 1);
>  	}
>  	if (n)
> -		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match, vf);
>  	mlx5_glue->free_device_list(ibv_list);
>  	if (!eth_list || !*eth_list) {
>  		DRV_LOG(WARNING,
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 997b04a33..b38cb37a9 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -161,6 +161,9 @@ struct priv {
>  	uint16_t mtu; /* Configured MTU. */
>  	uint8_t port; /* Physical port number. */
>  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> +	unsigned int representor:1; /* Device is a port representor. */
> +	uint16_t domain_id; /* Switch domain identifier. */
> +	unsigned int rep_id; /* Port representor identifier. */
>  	/* RX/TX queues. */
>  	unsigned int rxqs_n; /* RX queues array size. */
>  	unsigned int txqs_n; /* TX queues array size. */ @@ -209,9 +212,12 @@ int mlx5_getenv_int(const
> char *);
> 
>  /* mlx5_ethdev.c */
> 
> +int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> +			   char (*ifname)[IF_NAMESIZE]);
>  int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);  int
> mlx5_ifindex(const struct rte_eth_dev *dev); -int mlx5_ifreq(const struct rte_eth_dev *dev, int req,
> struct ifreq *ifr);
> +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> +	       int master);
>  int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);  int mlx5_set_flags(struct rte_eth_dev *dev,
> unsigned int keep,
>  		   unsigned int flags);
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index
> f6cebae41..361b7ee4c 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -93,7 +93,7 @@ struct ethtool_link_settings {  #endif
> 
>  /**
> - * Get interface name from private structure.
> + * Get master interface name from private structure.
>   *
>   * @param[in] dev
>   *   Pointer to Ethernet device.
> @@ -104,7 +104,8 @@ struct ethtool_link_settings {
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
> +mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> +		       char (*ifname)[IF_NAMESIZE])
>  {
>  	struct priv *priv = dev->data->dev_private;
>  	DIR *dir;
> @@ -179,6 +180,113 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])  }
> 
>  /**
> + * Get interface name from private structure.
> + *
> + * This is a port representor-aware version of mlx5_get_master_ifname().
> + *
> + * @param[in] dev
> + *   Pointer to Ethernet device.
> + * @param[out] ifname
> + *   Interface name output buffer.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +int
> +mlx5_get_ifname(const struct rte_eth_dev *dev, char
> +(*ifname)[IF_NAMESIZE]) {
> +	struct priv *priv = dev->data->dev_private;
> +	int ret;
> +	char master[IF_NAMESIZE];
> +	FILE *file;
> +	DIR *dir;
> +	uint64_t phys_switch_id;
> +
> +	if (!priv->representor)
> +		return mlx5_get_master_ifname(dev, ifname);
> +	ret = mlx5_get_master_ifname(dev, &master);
> +	if (ret)
> +		return ret;
> +	{
> +		MKSTR(path, "%s/device/net/%s/phys_switch_id",
> +		      priv->ibdev_path, master);
> +
> +		file = fopen(path, "rb");
> +	}
> +	if (!file) {
> +		rte_errno = errno;
> +		return -rte_errno;
> +	}
> +	ret = fscanf(file, "%" SCNx64, &phys_switch_id);
> +	fclose(file);
> +	if (ret != 1) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	{
> +		MKSTR(path, "%s/device/net/%s/subsystem",
> +		      priv->ibdev_path, master);
> +
> +		dir = opendir(path);
> +	}
> +	if (!dir) {
> +		rte_errno = errno;
> +		return -rte_errno;
> +	}
> +	/*
> +	 * Scan network interfaces to find one with matching phys_switch_id
> +	 * and phys_switch_name.
> +	 */
> +	do {
> +		struct dirent *dent;
> +		uint64_t phys_switch_id_rep;
> +		int rep_id;
> +
> +		ret = -ENOENT;
> +		dent = readdir(dir);
> +		if (!dent)
> +			break;
> +		{
> +			MKSTR(path,
> +			      "%s/device/net/%s/subsystem/%s/phys_switch_id",
> +			      priv->ibdev_path, master, dent->d_name);
> +
> +			file = fopen(path, "rb");
> +		}
> +		if (!file)
> +			continue;
> +		ret = fscanf(file, "%" SCNx64, &phys_switch_id_rep);
> +		fclose(file);
> +		if (ret != 1)
> +			continue;
> +		if (phys_switch_id_rep != phys_switch_id)
> +			continue;
> +		{
> +			MKSTR(path,
> +			      "%s/device/net/%s/subsystem/%s/phys_port_name",
> +			      priv->ibdev_path, master, dent->d_name);
> +
> +			file = fopen(path, "rb");
> +		}
> +		if (!file)
> +			continue;
> +		ret = fscanf(file, "%d", &rep_id);
> +		fclose(file);
> +		if (ret != 1)
> +			continue;
> +		if (rep_id < 0 || (unsigned int)rep_id != priv->rep_id)
> +			continue;
> +		strlcpy(*ifname, dent->d_name, sizeof(*ifname));
> +		ret = 0;
> +		break;
> +	} while (1);
> +	closedir(dir);
> +	if (ret)
> +		rte_errno = -ret;
> +	return ret;
> +}
> +
> +/**
>   * Get the interface index from device name.
>   *
>   * @param[in] dev
> @@ -214,12 +322,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
>   *   Request number to pass to ioctl().
>   * @param[out] ifr
>   *   Interface request structure output buffer.
> + * @param master
> + *   When device is a port representor, perform request on master device
> + *   instead.
>   *
>   * @return
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
> +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> +	   int master)
>  {
>  	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
>  	int ret = 0;
> @@ -228,7 +340,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
>  		rte_errno = errno;
>  		return -rte_errno;
>  	}
> -	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
> +	if (master)
> +		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
> +	else
> +		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
>  	if (ret)
>  		goto error;
>  	ret = ioctl(sock, req, ifr);
> @@ -258,7 +373,7 @@ int
>  mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)  {
>  	struct ifreq request;
> -	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
> +	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
> 
>  	if (ret)
>  		return ret;
> @@ -282,7 +397,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)  {
>  	struct ifreq request = { .ifr_mtu = mtu, };
> 
> -	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
> +	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
>  }
> 
>  /**
> @@ -302,13 +417,13 @@ int
>  mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)  {
>  	struct ifreq request;
> -	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
> +	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
> 
>  	if (ret)
>  		return ret;
>  	request.ifr_flags &= keep;
>  	request.ifr_flags |= flags & ~keep;
> -	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
> +	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
>  }
> 
>  /**
> @@ -551,7 +666,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
>  	int link_speed = 0;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
>  			dev->data->port_id, strerror(rte_errno)); @@ -561,7 +676,7 @@
> mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
>  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
>  				(ifr.ifr_flags & IFF_RUNNING));
>  	ifr.ifr_data = (void *)&edata;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", @@ -622,7 +737,7 @@
> mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
>  	uint64_t sc;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
>  			dev->data->port_id, strerror(rte_errno)); @@ -632,7 +747,7 @@
> mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
>  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
>  				(ifr.ifr_flags & IFF_RUNNING));
>  	ifr.ifr_data = (void *)&gcmd;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(DEBUG,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
> @@ -649,7 +764,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
> 
>  	*ecmd = gcmd;
>  	ifr.ifr_data = (void *)ecmd;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(DEBUG,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
> @@ -812,7 +927,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
>  	int ret;
> 
>  	ifr.ifr_data = (void *)&ethpause;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
> @@ -865,7 +980,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
>  		ethpause.tx_pause = 1;
>  	else
>  		ethpause.tx_pause = 0;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
> diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index 672a47619..12ee37f55
> 100644
> --- a/drivers/net/mlx5/mlx5_mac.c
> +++ b/drivers/net/mlx5/mlx5_mac.c
> @@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
>  	struct ifreq request;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
> +	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
>  	if (ret)
>  		return ret;
>  	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); diff --git
> a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index 875dd1027..91f3d474a 100644
> --- a/drivers/net/mlx5/mlx5_stats.c
> +++ b/drivers/net/mlx5/mlx5_stats.c
> @@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
>  	et_stats->cmd = ETHTOOL_GSTATS;
>  	et_stats->n_stats = xstats_ctrl->stats_n;
>  	ifr.ifr_data = (caddr_t)et_stats;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u unable to read statistic values from device", @@ -194,7 +194,7 @@
> mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
> 
>  	drvinfo.cmd = ETHTOOL_GDRVINFO;
>  	ifr.ifr_data = (caddr_t)&drvinfo;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u unable to query number of statistics",
>  			dev->data->port_id);
> @@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
>  	strings->string_set = ETH_SS_STATS;
>  	strings->len = dev_stats_n;
>  	ifr.ifr_data = (caddr_t)strings;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u unable to get statistic names",
>  			dev->data->port_id);
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for " Adrien Mazarguil
@ 2018-06-12  8:02   ` Xueming(Steven) Li
  2018-06-12 13:20     ` Adrien Mazarguil
  2018-06-12 14:44   ` Xueming(Steven) Li
  1 sibling, 1 reply; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-12  8:02 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Hi Adrien,

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
> 
> Prior to this patch, all port representors detected on a given device were probed and Ethernet devices
> instantiated for each of them.
> 
> This patch adds support for the standard "representor" parameter, which implies that port representors
> are not probed by default anymore, except for the list provided through device arguments.
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  doc/guides/nics/mlx5.rst                | 12 ++++++++++++
>  doc/guides/prog_guide/poll_mode_drv.rst |  2 ++
>  drivers/net/mlx5/mlx5.c                 | 25 +++++++++++++++++++++++++
>  3 files changed, 39 insertions(+)
> 
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index 79c982e29..5229e546c 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -388,6 +388,18 @@ Run-time configuration
> 
>    Disabled by default.
> 
> +- ``representor`` parameter [list]
> +
> +  This parameter can be used to instantiate DPDK Ethernet devices from
> + existing port (or VF) representors configured on the device.
> +
> +  It is a standard parameter whose format is described in
> + :ref:`ethernet_device_standard_device_arguments`.
> +
> +  For instance, to probe port representors 0 through 2::
> +
> +    representor=[0-2]
> +
>  Firmware configuration
>  ~~~~~~~~~~~~~~~~~~~~~~
> 
> diff --git a/doc/guides/prog_guide/poll_mode_drv.rst b/doc/guides/prog_guide/poll_mode_drv.rst
> index af82352a0..58d49ba0f 100644
> --- a/doc/guides/prog_guide/poll_mode_drv.rst
> +++ b/doc/guides/prog_guide/poll_mode_drv.rst
> @@ -365,6 +365,8 @@ Ethernet Device API
> 
>  The Ethernet device API exported by the Ethernet PMDs is described in the *DPDK API Reference*.
> 
> +.. _ethernet_device_standard_device_arguments:
> +
>  Ethernet Device Standard Device Arguments  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 09afca63c..216753ba6 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -90,6 +90,9 @@
>  /* Activate Netlink support in VF mode. */  #define MLX5_VF_NL_EN "vf_nl_en"
> 
> +/* Select port representors to instantiate. */ #define MLX5_REPRESENTOR
> +"representor"
> +
>  #ifndef HAVE_IBV_MLX5_MOD_MPW
>  #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)  #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
> @@ -420,6 +423,9 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
>  	struct mlx5_dev_config *config = opaque;
>  	unsigned long tmp;
> 
> +	/* No-op, port representors are processed in mlx5_dev_spawn(). */
> +	if (!strcmp(MLX5_REPRESENTOR, key))
> +		return 0;
>  	errno = 0;
>  	tmp = strtoul(val, NULL, 0);
>  	if (errno) {
> @@ -492,6 +498,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
>  		MLX5_RX_VEC_EN,
>  		MLX5_L3_VXLAN_EN,
>  		MLX5_VF_NL_EN,
> +		MLX5_REPRESENTOR,
>  		NULL,
>  	};
>  	struct rte_kvargs *kvlist;
> @@ -1142,13 +1149,30 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  	struct rte_eth_dev **eth_list = NULL;
>  	struct ibv_context *ctx;
>  	struct ibv_device_attr_ex attr;
> +	struct rte_eth_devargs eth_da;

Not related to this patch, from this data structure, maximum representor count is 32, 
customer might use VF on container environment, 32 is far from requirement. We need
additional work here. A workaround is that users call this api multiple times with different
representor IDs.

>  	void *tmp;
>  	unsigned int i;
>  	unsigned int j = 0;
>  	unsigned int n = 0;
>  	int ret;
> 
> +	if (dpdk_dev->devargs) {
> +		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
> +		if (ret)
> +			goto error;
> +	} else {
> +		memset(&eth_da, 0, sizeof(eth_da));
> +	}
>  next:
> +	if (j) {
> +		unsigned int k;
> +
> +		for (k = 0; k < eth_da.nb_representor_ports; ++k)
> +			if (eth_da.representor_ports[k] == j - 1)
> +				break;
> +		if (k == eth_da.nb_representor_ports)
> +			goto skip;
> +	}
>  	errno = 0;
>  	ctx = mlx5_glue->open_device(ibv_dev[j]);

Need a range check for j here.

>  	if (!ctx) {
> @@ -1187,6 +1211,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  			goto error;
>  		++n;
>  	}
> +skip:
>  	if (ibv_dev[++j])
>  		goto next;
>  	eth_list[n] = NULL;
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 1/7] net/mlx5: rename confusing object in probe code
  2018-06-10 11:00   ` Xueming(Steven) Li
@ 2018-06-12 13:19     ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-12 13:19 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Sun, Jun 10, 2018 at 11:00:57AM +0000, Xueming(Steven) Li wrote:
> Ack except one minor question below.
<snip>
> > -	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
> > -	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
> > -		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
> > +	mlx5_glue->dv_query_device(ctx, &dv_attr);
> 
> Should ctx be attr_ctx?.

Indeed, seems like I didn't validate this patch on its own after splitting
it from another. Will fix in v2, thanks.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 2/7] net/mlx5: remove redundant objects in probe code
  2018-06-10 11:00   ` Xueming(Steven) Li
@ 2018-06-12 13:19     ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-12 13:19 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Sun, Jun 10, 2018 at 11:00:59AM +0000, Xueming(Steven) Li wrote:
> Ack. Trivial issue related to other patch found , not sure whether it good to fix it here.
<snip>
> > -		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
> > -			      (device_attr_ex.tso_caps.supported_qpts &
> > +		config.tso = (attr.tso_caps.max_tso > 0 &&
> > +			      (attr.tso_caps.supported_qpts &
> >  			      (1 << IBV_QPT_RAW_PACKET)));
> 
> Not related to this patch, wrong indent.

No problem, I'll add an extra space for v2.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code
  2018-06-10 12:59   ` Xueming(Steven) Li
@ 2018-06-12 13:20     ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-12 13:20 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Sun, Jun 10, 2018 at 12:59:06PM +0000, Xueming(Steven) Li wrote:
> Hi Adrien,
> 
> The logic looks much more clear now with the split.
<snip>
> > -		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
> > -			 pci_dev->addr.domain, pci_dev->addr.bus,
> > -			 pci_dev->addr.devid, pci_dev->addr.function);
> > -		if (attr.orig_attr.phys_port_cnt > 1)
> > -			snprintf(name + len, sizeof(name), " port %u", i);
> > +		if (attr->orig_attr.phys_port_cnt > 1)
> > +			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> > +		else
> > +			snprintf(name, sizeof(name), "%s port %u",
> > +				 dpdk_dev->name, port);
> 
> Name contains port only if phys_port_cnt > 1 in previous logic, are you sure?

Nice catch, will fix it for v2. This wasn't noticed because this code is
replaced in a subsequent patch of the series.

<snip>
> > +	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > +		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > +						 &attr, i + 1);
> > +		if (eth_list[i])
> > +			continue;
> > +		/* Save rte_errno and roll back in case of failure. */
> > +		ret = rte_errno;
> > +		while (i--) {
> > +			mlx5_dev_close(eth_list[i]);
> > +			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > +				rte_free(eth_list[i]->data->dev_private);
> > +			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > +		}
> > +		free(eth_list);
> > +		rte_errno = ret;
> > +		return NULL;
> 
> The code is correct, but I personally prefer to move complex error handling to 
> dedicate "error:" block to make the code clear.

Since it's the only place where this failure can occur, I'll leave it as is
on the basis that doing so saves a goto statement. Those should be avoided
where possible. It would have been a different story if the same error
handling code was called from multiple places.

<snip>
> > +static int
> > +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> > +	       struct rte_pci_device *pci_dev) {
> > +	struct ibv_device **ibv_list;
> > +	struct rte_eth_dev **eth_list = NULL;
> > +	int vf;
> > +	int ret;
> > +
> > +	assert(pci_drv == &mlx5_driver);
> > +	switch (pci_dev->id.device_id) {
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> > +		vf = 1;
> > +		break;
> > +	default:
> > +		vf = 0;
> > +	}
> 
> How about use a macro for vf detection and invoke in mlx5_dev_spawn_one().
> Seems it not used in outer callers.

mlx5_dev_spawn_one() can be invoked with IB devices not backed by PCI
(e.g. vdevs), for which the caller may still knowingly ask for VF behavior,
either by user request or through other means.

In this case, the caller happens to be a PCI probing function which, based
on the device ID, easily determines whether VF behavior shall be requested.

This is basically the only place where PCI device ID can be checked. Adding
a macro here would only obfuscate this check. Adding it in
mlx5_dev_spawn_one() would entangle PCI and generic code again, the opposite
of the purpose of this patch, therefore I'll leave it unmodified for v2.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 6/7] net/mlx5: probe all port representors
  2018-06-12  6:42   ` Xueming(Steven) Li
@ 2018-06-12 13:20     ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-12 13:20 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Tue, Jun 12, 2018 at 06:42:38AM +0000, Xueming(Steven) Li wrote:
> Hi Adrien,
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > Sent: Saturday, May 26, 2018 12:35 AM
> > To: Shahaf Shuler <shahafs@mellanox.com>
> > Cc: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH 6/7] net/mlx5: probe all port representors
> > 
> > Probe existing port representors in addition to their master device and associate them automatically.
> > 
> > To avoid name collision between Ethernet devices, their names use the same convention as ixgbe and
> > i40e PMDs, that is, instead of only a PCI address in DBDF notation:
> > 
> > - "net_{DBDF}_0" for master/switch devices.
> > - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> >   representors.
> > 
> > Both optionally suffixed with "_port_{num}" instead of " port {num}" for devices that expose several
> > Verbs ports (note this is never the case on mlx5, but kept for historical reasons for the time being).
> > 
> > (Patch based on prior work from Yuanhan Liu)
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
<snip>
> > +	/*
> > +	 * Allocate a switch domain for master devices and share it with
> > +	 * port representors.
> > +	 */
> > +	if (!master) {
> > +		priv->representor = 0;
> > +		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> > +		priv->rep_id = 0;
> > +		err = rte_eth_switch_domain_alloc(&priv->domain_id);
> 
> So domain_id is used to identify relation between PF and representor port?

Right, as described by the API [1]. What's missing in this patch is that
this information is not reported through dev_infos_get(). I'll add it for
v2.

[1] https://www.dpdk.org/doc/guides/prog_guide/switch_representation.html#port-representors

<snip>
> > +	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
> > +		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
> > +	tmp = realloc(eth_list, sizeof(*eth_list) *
> > +		      (n + attr.orig_attr.phys_port_cnt + 1));
> > +	if (!tmp) {
> >  		rte_errno = errno;
> > -		return NULL;
> > +		goto error;
> >  	}
> > +	eth_list = tmp;
> >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> 
> Is there any mlx5 device that support more than physical ports on same PCI id?
> I remember this is major difference between mlx5 and mlx4.

Unlike mlx4, no known mlx5 adapter exposes more than a single port per PCI
address. This is kept for historical reasons (i.e. because it's always been
there) and the fact that since Verbs exposes it, phys_port_cnt should at
least be checked, if only to return an error when somehow multiple ports are
detected.

This series just makes easier to drop this intermediate loop if deemed
necessary later, simply by calling mlx5_dev_spawn_one() directly. In the
meantime, separation of low-level PCI from Verbs device instantiation means
there is an extra step to iterate on all possible IB ports. Same behavior as
usual, modifying it is not the purpose of this series.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
  2018-06-12  8:02   ` Xueming(Steven) Li
@ 2018-06-12 13:20     ` Adrien Mazarguil
  2018-06-12 13:43       ` Xueming(Steven) Li
  0 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-12 13:20 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Tue, Jun 12, 2018 at 08:02:17AM +0000, Xueming(Steven) Li wrote:
> Hi Adrien,
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > Sent: Saturday, May 26, 2018 12:35 AM
> > To: Shahaf Shuler <shahafs@mellanox.com>
> > Cc: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
> > 
> > Prior to this patch, all port representors detected on a given device were probed and Ethernet devices
> > instantiated for each of them.
> > 
> > This patch adds support for the standard "representor" parameter, which implies that port representors
> > are not probed by default anymore, except for the list provided through device arguments.
> > 
> > (Patch based on prior work from Yuanhan Liu)
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
<snip>
> > @@ -1142,13 +1149,30 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> >  	struct rte_eth_dev **eth_list = NULL;
> >  	struct ibv_context *ctx;
> >  	struct ibv_device_attr_ex attr;
> > +	struct rte_eth_devargs eth_da;
> 
> Not related to this patch, from this data structure, maximum representor count is 32, 
> customer might use VF on container environment, 32 is far from requirement. We need
> additional work here. A workaround is that users call this api multiple times with different
> representor IDs.

32 ought to be enough for anybody!

Not sure I understand your concern actually. One can't instantiate more
representors than there are DPDK ports because the limit for both is
RTE_MAX_ETHPORTS (i.e. 1 representor = 1 DPDK port). Users who want to spawn
more than 32 DPDK ports overall must increase RTE_MAX_ETHPORTS regardless.

> >  	void *tmp;
> >  	unsigned int i;
> >  	unsigned int j = 0;
> >  	unsigned int n = 0;
> >  	int ret;
> > 
> > +	if (dpdk_dev->devargs) {
> > +		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
> > +		if (ret)
> > +			goto error;
> > +	} else {
> > +		memset(&eth_da, 0, sizeof(eth_da));
> > +	}
> >  next:
> > +	if (j) {
> > +		unsigned int k;
> > +
> > +		for (k = 0; k < eth_da.nb_representor_ports; ++k)
> > +			if (eth_da.representor_ports[k] == j - 1)
> > +				break;
> > +		if (k == eth_da.nb_representor_ports)
> > +			goto skip;
> > +	}
> >  	errno = 0;
> >  	ctx = mlx5_glue->open_device(ibv_dev[j]);
> 
> Need a range check for j here.

I think it's properly checked. j == 0 stands for "master device", always
found at index 0 and probed. Representors devices, if any, start at index 1
which triggers the previous block. This block makes sure that a given
representor is indeed enabled before either spawning the related device
(pass through with a valid "j") or skipping it altogether (goto skip).

I intend to leave this patch as is for v2.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
  2018-06-12 13:20     ` Adrien Mazarguil
@ 2018-06-12 13:43       ` Xueming(Steven) Li
  2018-06-14  8:01         ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-12 13:43 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: Shahaf Shuler, dev



> -----Original Message-----
> From: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Sent: Tuesday, June 12, 2018 9:21 PM
> To: Xueming(Steven) Li <xuemingl@mellanox.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
> 
> On Tue, Jun 12, 2018 at 08:02:17AM +0000, Xueming(Steven) Li wrote:
> > Hi Adrien,
> >
> > > -----Original Message-----
> > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > Sent: Saturday, May 26, 2018 12:35 AM
> > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > Cc: dev@dpdk.org
> > > Subject: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port
> > > representors
> > >
> > > Prior to this patch, all port representors detected on a given
> > > device were probed and Ethernet devices instantiated for each of them.
> > >
> > > This patch adds support for the standard "representor" parameter,
> > > which implies that port representors are not probed by default anymore, except for the list
> provided through device arguments.
> > >
> > > (Patch based on prior work from Yuanhan Liu)
> > >
> > > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> <snip>
> > > @@ -1142,13 +1149,30 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > >  	struct rte_eth_dev **eth_list = NULL;
> > >  	struct ibv_context *ctx;
> > >  	struct ibv_device_attr_ex attr;
> > > +	struct rte_eth_devargs eth_da;
> >
> > Not related to this patch, from this data structure, maximum
> > representor count is 32, customer might use VF on container
> > environment, 32 is far from requirement. We need additional work here.
> > A workaround is that users call this api multiple times with different representor IDs.
> 
> 32 ought to be enough for anybody!
> 
> Not sure I understand your concern actually. One can't instantiate more representors than there are
> DPDK ports because the limit for both is RTE_MAX_ETHPORTS (i.e. 1 representor = 1 DPDK port). Users
> who want to spawn more than 32 DPDK ports overall must increase RTE_MAX_ETHPORTS regardless.

ConnectX-5 support 127 VFs, but as you said, increasing RTE_MAX_ETHPORTS should work.

> 
> > >  	void *tmp;
> > >  	unsigned int i;
> > >  	unsigned int j = 0;
> > >  	unsigned int n = 0;
> > >  	int ret;
> > >
> > > +	if (dpdk_dev->devargs) {
> > > +		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
> > > +		if (ret)
> > > +			goto error;
> > > +	} else {
> > > +		memset(&eth_da, 0, sizeof(eth_da));
> > > +	}
> > >  next:
> > > +	if (j) {
> > > +		unsigned int k;
> > > +
> > > +		for (k = 0; k < eth_da.nb_representor_ports; ++k)
> > > +			if (eth_da.representor_ports[k] == j - 1)
> > > +				break;
> > > +		if (k == eth_da.nb_representor_ports)
> > > +			goto skip;
> > > +	}
> > >  	errno = 0;
> > >  	ctx = mlx5_glue->open_device(ibv_dev[j]);
> >
> > Need a range check for j here.
> 
> I think it's properly checked. j == 0 stands for "master device", always found at index 0 and probed.
> Representors devices, if any, start at index 1 which triggers the previous block. This block makes
> sure that a given representor is indeed enabled before either spawning the related device (pass
> through with a valid "j") or skipping it altogether (goto skip).

Yes, this code looks good. What I wanted to ask what if dev args specify an invalid rep id, e.g. 33.
This code walk through silently w/o warning, it works, but it better to have a warning if input id out of range.

> 
> I intend to leave this patch as is for v2.
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for " Adrien Mazarguil
  2018-06-12  8:02   ` Xueming(Steven) Li
@ 2018-06-12 14:44   ` Xueming(Steven) Li
  2018-06-13 13:11     ` Adrien Mazarguil
  1 sibling, 1 reply; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-12 14:44 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev



> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
> 
> Prior to this patch, all port representors detected on a given device were probed and Ethernet devices
> instantiated for each of them.
> 
> This patch adds support for the standard "representor" parameter, which implies that port representors
> are not probed by default anymore, except for the list provided through device arguments.
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  doc/guides/nics/mlx5.rst                | 12 ++++++++++++
>  doc/guides/prog_guide/poll_mode_drv.rst |  2 ++
>  drivers/net/mlx5/mlx5.c                 | 25 +++++++++++++++++++++++++
>  3 files changed, 39 insertions(+)
> 
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index 79c982e29..5229e546c 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -388,6 +388,18 @@ Run-time configuration
> 
>    Disabled by default.
> 
> +- ``representor`` parameter [list]
> +
> +  This parameter can be used to instantiate DPDK Ethernet devices from
> + existing port (or VF) representors configured on the device.
> +
> +  It is a standard parameter whose format is described in
> + :ref:`ethernet_device_standard_device_arguments`.
> +
> +  For instance, to probe port representors 0 through 2::
> +
> +    representor=[0-2]
> +
>  Firmware configuration
>  ~~~~~~~~~~~~~~~~~~~~~~
> 
> diff --git a/doc/guides/prog_guide/poll_mode_drv.rst b/doc/guides/prog_guide/poll_mode_drv.rst
> index af82352a0..58d49ba0f 100644
> --- a/doc/guides/prog_guide/poll_mode_drv.rst
> +++ b/doc/guides/prog_guide/poll_mode_drv.rst
> @@ -365,6 +365,8 @@ Ethernet Device API
> 
>  The Ethernet device API exported by the Ethernet PMDs is described in the *DPDK API Reference*.
> 
> +.. _ethernet_device_standard_device_arguments:
> +
>  Ethernet Device Standard Device Arguments  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 09afca63c..216753ba6 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -90,6 +90,9 @@
>  /* Activate Netlink support in VF mode. */  #define MLX5_VF_NL_EN "vf_nl_en"
> 
> +/* Select port representors to instantiate. */ #define MLX5_REPRESENTOR
> +"representor"
> +
>  #ifndef HAVE_IBV_MLX5_MOD_MPW
>  #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)  #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
> @@ -420,6 +423,9 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
>  	struct mlx5_dev_config *config = opaque;
>  	unsigned long tmp;
> 
> +	/* No-op, port representors are processed in mlx5_dev_spawn(). */
> +	if (!strcmp(MLX5_REPRESENTOR, key))
> +		return 0;
>  	errno = 0;
>  	tmp = strtoul(val, NULL, 0);
>  	if (errno) {
> @@ -492,6 +498,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
>  		MLX5_RX_VEC_EN,
>  		MLX5_L3_VXLAN_EN,
>  		MLX5_VF_NL_EN,
> +		MLX5_REPRESENTOR,
>  		NULL,
>  	};
>  	struct rte_kvargs *kvlist;
> @@ -1142,13 +1149,30 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  	struct rte_eth_dev **eth_list = NULL;
>  	struct ibv_context *ctx;
>  	struct ibv_device_attr_ex attr;
> +	struct rte_eth_devargs eth_da;
>  	void *tmp;
>  	unsigned int i;
>  	unsigned int j = 0;
>  	unsigned int n = 0;
>  	int ret;
> 
> +	if (dpdk_dev->devargs) {
> +		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
> +		if (ret)
> +			goto error;
> +	} else {
> +		memset(&eth_da, 0, sizeof(eth_da));
> +	}
>  next:
> +	if (j) {
> +		unsigned int k;
> +
> +		for (k = 0; k < eth_da.nb_representor_ports; ++k)
> +			if (eth_da.representor_ports[k] == j - 1)
> +				break;
> +		if (k == eth_da.nb_representor_ports)
> +			goto skip;
> +	}
>  	errno = 0;
>  	ctx = mlx5_glue->open_device(ibv_dev[j]);
>  	if (!ctx) {
> @@ -1187,6 +1211,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  			goto error;
>  		++n;
>  	}
> +skip:
>  	if (ibv_dev[++j])
>  		goto next;

int rte_eth_dev_attach(const char *devargs, uint16_t *port_id);
The rte_eth_dev_attach api attach one device a time as only one *port_id parameter.
Dev argument "82:0.0,representer[a-b] will register multiple devices in one call,
is this correct behavior? I ask this because this caused testpmd CLI "port attach" 
crash due to only the last registered port id returned.

>  	eth_list[n] = NULL;
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
  2018-06-12 14:44   ` Xueming(Steven) Li
@ 2018-06-13 13:11     ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-13 13:11 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Tue, Jun 12, 2018 at 02:44:12PM +0000, Xueming(Steven) Li wrote:
<snip>
> > +	if (dpdk_dev->devargs) {
> > +		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
> > +		if (ret)
> > +			goto error;
> > +	} else {
> > +		memset(&eth_da, 0, sizeof(eth_da));
> > +	}
> >  next:
> > +	if (j) {
> > +		unsigned int k;
> > +
> > +		for (k = 0; k < eth_da.nb_representor_ports; ++k)
> > +			if (eth_da.representor_ports[k] == j - 1)
> > +				break;
> > +		if (k == eth_da.nb_representor_ports)
> > +			goto skip;
> > +	}
> >  	errno = 0;
> >  	ctx = mlx5_glue->open_device(ibv_dev[j]);
> >  	if (!ctx) {
> > @@ -1187,6 +1211,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> >  			goto error;
> >  		++n;
> >  	}
> > +skip:
> >  	if (ibv_dev[++j])
> >  		goto next;
> 
> int rte_eth_dev_attach(const char *devargs, uint16_t *port_id);
> The rte_eth_dev_attach api attach one device a time as only one *port_id parameter.
> Dev argument "82:0.0,representer[a-b] will register multiple devices in one call,
> is this correct behavior?

Yes, this is how the representor argument is documented and supposed to be
used. This probing approach is obviously not compatible with representors
hot-plugging, for which something will have to be devised if needed.

> I ask this because this caused testpmd CLI "port attach" 
> crash due to only the last registered port id returned.

I reproduced this crash and determined it is caused by a bug in
testpmd. I'll submit a separate fix for it.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for port representors
  2018-06-12 13:43       ` Xueming(Steven) Li
@ 2018-06-14  8:01         ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:01 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Tue, Jun 12, 2018 at 01:43:18PM +0000, Xueming(Steven) Li wrote:
<snip>
> > > >  	void *tmp;
> > > >  	unsigned int i;
> > > >  	unsigned int j = 0;
> > > >  	unsigned int n = 0;
> > > >  	int ret;
> > > >
> > > > +	if (dpdk_dev->devargs) {
> > > > +		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
> > > > +		if (ret)
> > > > +			goto error;
> > > > +	} else {
> > > > +		memset(&eth_da, 0, sizeof(eth_da));
> > > > +	}
> > > >  next:
> > > > +	if (j) {
> > > > +		unsigned int k;
> > > > +
> > > > +		for (k = 0; k < eth_da.nb_representor_ports; ++k)
> > > > +			if (eth_da.representor_ports[k] == j - 1)
> > > > +				break;
> > > > +		if (k == eth_da.nb_representor_ports)
> > > > +			goto skip;
> > > > +	}
> > > >  	errno = 0;
> > > >  	ctx = mlx5_glue->open_device(ibv_dev[j]);
> > >
> > > Need a range check for j here.
> > 
> > I think it's properly checked. j == 0 stands for "master device", always found at index 0 and probed.
> > Representors devices, if any, start at index 1 which triggers the previous block. This block makes
> > sure that a given representor is indeed enabled before either spawning the related device (pass
> > through with a valid "j") or skipping it altogether (goto skip).
> 
> Yes, this code looks good. What I wanted to ask what if dev args specify an invalid rep id, e.g. 33.
> This code walk through silently w/o warning, it works, but it better to have a warning if input id out of range.

You're right. On the other hand this provides a means to spawn all
representors without necessarily knowing how many can be instantiated first,
e.g. by always providing a "representor=[0-31]" argument, since no special
keyword is defined to request them all.

Not saying it's a good or bad thing, but somewhat harmless. Just like
specifying "-w {DBDF}" arguments with invalid addresses, nonexistent
representors are silently ignored.

In any case, this can be improved later. We're already seeing a couple of
limitations with the representor argument, namely the lack of hot-plug
support, which will need to be addressed as well.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support
  2018-05-25 16:35 [dpdk-dev] [PATCH 0/7] net/mlx5: add port representor support Adrien Mazarguil
                   ` (6 preceding siblings ...)
  2018-05-25 16:35 ` [dpdk-dev] [PATCH 7/7] net/mlx5: add parameter for " Adrien Mazarguil
@ 2018-06-14  8:34 ` Adrien Mazarguil
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe code Adrien Mazarguil
                     ` (7 more replies)
  7 siblings, 8 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:34 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This series adds support for port (VF) representors to the mlx5 PMD, which
can be instantiated using the standard "representor" device parameter.

Note the PMD only probes existing representors which exist as Verbs devices;
their creation is part of the host system configuration.

v2 changes:

- See individual patches for details.
- Rebased series.

Adrien Mazarguil (7):
  net/mlx5: rename confusing object in probe code
  net/mlx5: remove redundant objects in probe code
  net/mlx5: split PCI from generic probing code
  net/mlx5: re-indent generic probing function
  net/mlx5: add port representor awareness
  net/mlx5: probe all port representors
  net/mlx5: add parameter for port representors

 doc/guides/nics/mlx5.rst                |   12 +
 doc/guides/prog_guide/poll_mode_drv.rst |    2 +
 drivers/net/mlx5/mlx5.c                 | 1100 +++++++++++++++-----------
 drivers/net/mlx5/mlx5.h                 |    9 +-
 drivers/net/mlx5/mlx5_ethdev.c          |  151 +++-
 drivers/net/mlx5/mlx5_mac.c             |    2 +-
 drivers/net/mlx5/mlx5_stats.c           |    6 +-
 7 files changed, 818 insertions(+), 464 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe code
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
@ 2018-06-14  8:34   ` Adrien Mazarguil
  2018-06-16  8:24     ` Xueming(Steven) Li
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects " Adrien Mazarguil
                     ` (6 subsequent siblings)
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:34 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

There are several attribute objects in this function:

- IB device attributes (struct ibv_device_attr_ex device_attr).
- Direct Verbs attributes (struct mlx5dv_context attrs_out).
- Port attributes (struct ibv_port_attr).
- IB device attributes again (struct ibv_device_attr_ex device_attr_ex).

"attrs_out" is both odd and initialized using a nonstandard syntax. Rename
it "dv_attr" for consistency.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v2 changes:

- Fixed ctx -> attr_ctx in mlx5_pci_probe().
---
 drivers/net/mlx5/mlx5.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3e0a1b186..3bdcb3970 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -654,6 +654,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
 	struct ibv_context *attr_ctx = NULL;
 	struct ibv_device_attr_ex device_attr;
@@ -670,7 +671,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
 	int i;
-	struct mlx5dv_context attrs_out = {0};
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
@@ -736,21 +736,21 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	ibv_dev = list[i];
 	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
 	/*
 	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
 	 * as all ConnectX-5 devices.
 	 */
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
-	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
-		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
+	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
 			mps = MLX5_MPW_ENHANCED;
 		} else {
@@ -762,14 +762,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		mps = MLX5_MPW_DISABLED;
 	}
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
-		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
+		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
-			attrs_out.striding_rq_caps;
+			dv_attr.striding_rq_caps;
 
 		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
 			mprq_caps.min_single_stride_log_num_of_bytes);
@@ -794,15 +794,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
-	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
+	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
-		tunnel_en = ((attrs_out.tunnel_offloads_caps &
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
+		tunnel_en = ((dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
-			     (attrs_out.tunnel_offloads_caps &
+			     (dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
 	}
 	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
@@ -812,9 +812,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
-	mpls_en = ((attrs_out.tunnel_offloads_caps &
+	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
-		   (attrs_out.tunnel_offloads_caps &
+		   (dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
 	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
 		mpls_en ? "" : "not ");
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe code Adrien Mazarguil
@ 2018-06-14  8:34   ` Adrien Mazarguil
  2018-06-16  8:27     ` Xueming(Steven) Li
  2018-06-17 10:14     ` Shahaf Shuler
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code Adrien Mazarguil
                     ` (5 subsequent siblings)
  7 siblings, 2 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:34 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This patch gets rid of redundant calls to open the device and query its
attributes in order to simplify the code.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v2 changes:

- Minor indent fix on existing code.
---
 drivers/net/mlx5/mlx5.c | 64 +++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3bdcb3970..1a5391e63 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -654,10 +654,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct ibv_context *ctx = NULL;
+	struct ibv_device_attr_ex attr;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
-	struct ibv_context *attr_ctx = NULL;
-	struct ibv_device_attr_ex device_attr;
 	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
@@ -714,12 +714,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
 		      (pci_dev->id.device_id ==
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		attr_ctx = mlx5_glue->open_device(list[i]);
+		ctx = mlx5_glue->open_device(list[i]);
 		rte_errno = errno;
 		err = rte_errno;
 		break;
 	}
-	if (attr_ctx == NULL) {
+	if (ctx == NULL) {
 		switch (err) {
 		case 0:
 			DRV_LOG(ERR,
@@ -748,7 +748,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
+	mlx5_glue->dv_query_device(ctx, &dv_attr);
 	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
 		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
@@ -822,23 +822,20 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
+	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	if (err) {
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected",
-		device_attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
 		char name[RTE_ETH_NAME_MAX_LEN];
 		int len;
 		uint32_t port = i + 1; /* ports are indexed from one */
-		struct ibv_context *ctx = NULL;
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
 		struct rte_eth_dev *eth_dev = NULL;
-		struct ibv_device_attr_ex device_attr_ex;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -865,7 +862,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
 			 pci_dev->addr.domain, pci_dev->addr.bus,
 			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (device_attr.orig_attr.phys_port_cnt > 1)
+		if (attr.orig_attr.phys_port_cnt > 1)
 			snprintf(name + len, sizeof(name), " port %u", i);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
@@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			continue;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		ctx = mlx5_glue->open_device(ibv_dev);
+		if (!ctx)
+			ctx = mlx5_glue->open_device(ibv_dev);
 		if (ctx == NULL) {
 			err = ENODEV;
 			goto port_error;
@@ -949,7 +947,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = device_attr;
+		priv->device_attr = attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
@@ -960,17 +958,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				strerror(rte_errno));
 			goto port_error;
 		}
-		err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
-		if (err) {
-			DRV_LOG(ERR, "ibv_query_device_ex() failed");
-			goto port_error;
-		}
-		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
+		config.hw_csum = !!(attr.device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!(device_attr.max_counter_sets);
+		config.flow_counter_en = !!attr.max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -978,7 +971,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
+			attr.rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -986,29 +979,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
+		config.hw_padding = !!attr.rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
-			      (device_attr_ex.tso_caps.supported_qpts &
-			      (1 << IBV_QPT_RAW_PACKET)));
+		config.tso = (attr.tso_caps.max_tso > 0 &&
+			      (attr.tso_caps.supported_qpts &
+			       (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz =
-					device_attr_ex.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr.tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1168,14 +1160,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
+		/*
+		 * Each eth_dev instance is assigned its own Verbs context,
+		 * since this one is consumed, let the next iteration open
+		 * another.
+		 */
+		ctx = NULL;
 		continue;
 port_error:
 		if (priv)
 			rte_free(priv);
 		if (pd)
 			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (ctx)
-			claim_zero(mlx5_glue->close_device(ctx));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
 		break;
@@ -1187,8 +1183,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	 * way to enumerate the registered ethdevs to free the previous ones.
 	 */
 error:
-	if (attr_ctx)
-		claim_zero(mlx5_glue->close_device(attr_ctx));
+	if (ctx)
+		claim_zero(mlx5_glue->close_device(ctx));
 	if (list)
 		mlx5_glue->free_device_list(list);
 	if (err) {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe code Adrien Mazarguil
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects " Adrien Mazarguil
@ 2018-06-14  8:34   ` Adrien Mazarguil
  2018-06-16  8:29     ` Xueming(Steven) Li
  2018-06-17 10:14     ` Shahaf Shuler
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 4/7] net/mlx5: re-indent generic probing function Adrien Mazarguil
                     ` (4 subsequent siblings)
  7 siblings, 2 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:34 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

All the generic probing code needs is an IB device. While this device is
currently supplied by a PCI lookup, other methods will be added soon.

This patch divides the original function, which has become huge over time,
as follows:

1. PCI-specific (mlx5_pci_probe()).
2. All ports of a Verbs device (mlx5_dev_spawn()).
3. A given port of a Verbs device (mlx5_dev_spawn_one()).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v2 changes:

- Fixed device naming. A port suffix is now appended only if several IB
  ports happen to be detected.
- Added separate message to distinguish missing kernel drivers from other
  initialization errors, as it was confusing.
---
 drivers/net/mlx5/mlx5.c | 340 ++++++++++++++++++++++++++-----------------
 1 file changed, 209 insertions(+), 131 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1a5391e63..01dcf25b9 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -635,30 +635,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 }
 
 /**
- * DPDK callback to register a PCI device.
- *
- * This function creates an Ethernet device for each port of a given
- * PCI device.
+ * Spawn an Ethernet device from Verbs information.
  *
- * @param[in] pci_drv
- *   PCI driver structure (mlx5_driver).
- * @param[in] pci_dev
- *   PCI device information.
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ * @param[in] attr
+ *   Verbs device attributes.
+ * @param port
+ *   Verbs port to use (indexed from 1).
  *
  * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
+ *   A valid Ethernet device object on success, NULL otherwise and rte_errno
+ *   is set.
  */
-static int
-mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
-	       struct rte_pci_device *pci_dev)
+static struct rte_eth_dev *
+mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
+		   struct ibv_device *ibv_dev,
+		   int vf,
+		   const struct ibv_device_attr_ex *attr,
+		   unsigned int port)
 {
-	struct ibv_device **list = NULL;
-	struct ibv_device *ibv_dev;
-	struct ibv_context *ctx = NULL;
-	struct ibv_device_attr_ex attr;
+	struct ibv_context *ctx;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct rte_eth_dev *eth_dev = NULL;
 	int err = 0;
-	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
 	unsigned int tunnel_en = 0;
@@ -670,71 +674,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_max_stride_size_n = 0;
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
-	int i;
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
-	assert(pci_drv == &mlx5_driver);
-	list = mlx5_glue->get_device_list(&i);
-	if (list == NULL) {
-		assert(errno);
-		err = errno;
-		if (errno == ENOSYS)
-			DRV_LOG(ERR,
-				"cannot list devices, is ib_uverbs loaded?");
-		goto error;
-	}
-	assert(i >= 0);
-	/*
-	 * For each listed device, check related sysfs entry against
-	 * the provided PCI ID.
-	 */
-	while (i != 0) {
-		struct rte_pci_addr pci_addr;
-
-		--i;
-		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
-		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
-			continue;
-		if ((pci_dev->addr.domain != pci_addr.domain) ||
-		    (pci_dev->addr.bus != pci_addr.bus) ||
-		    (pci_dev->addr.devid != pci_addr.devid) ||
-		    (pci_dev->addr.function != pci_addr.function))
-			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
-			list[i]->name);
-		vf = ((pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		ctx = mlx5_glue->open_device(list[i]);
-		rte_errno = errno;
-		err = rte_errno;
-		break;
-	}
-	if (ctx == NULL) {
-		switch (err) {
-		case 0:
-			DRV_LOG(ERR,
-				"cannot access device, is mlx5_ib loaded?");
-			err = ENODEV;
-			break;
-		case EINVAL:
-			DRV_LOG(ERR,
-				"cannot use device, are drivers up to date?");
-			break;
-		}
-		goto error;
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		return NULL;
 	}
-	ibv_dev = list[i];
-	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -822,20 +773,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
-	if (err) {
-		DEBUG("ibv_query_device_ex() failed");
-		goto error;
-	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+	{
 		char name[RTE_ETH_NAME_MAX_LEN];
-		int len;
-		uint32_t port = i + 1; /* ports are indexed from one */
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
-		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -859,11 +801,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
-			 pci_dev->addr.domain, pci_dev->addr.bus,
-			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (attr.orig_attr.phys_port_cnt > 1)
-			snprintf(name + len, sizeof(name), " port %u", i);
+		if (attr->orig_attr.phys_port_cnt > 1)
+			snprintf(name, sizeof(name), "%s port %u",
+				 dpdk_dev->name, port);
+		else
+			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -872,7 +814,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				err = rte_errno;
 				goto error;
 			}
-			eth_dev->device = &pci_dev->device;
+			eth_dev->device = dpdk_dev;
 			eth_dev->dev_ops = &mlx5_dev_sec_ops;
 			err = mlx5_uar_init_secondary(eth_dev);
 			if (err) {
@@ -900,16 +842,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				mlx5_select_rx_function(eth_dev);
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
-			rte_eth_dev_probing_finish(eth_dev);
-			continue;
+			mlx5_glue->close_device(ctx);
+			return eth_dev;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		if (!ctx)
-			ctx = mlx5_glue->open_device(ibv_dev);
-		if (ctx == NULL) {
-			err = ENODEV;
-			goto port_error;
-		}
 		/* Check port status. */
 		err = mlx5_glue->query_port(ctx, port, &port_attr);
 		if (err) {
@@ -947,23 +883,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = attr;
+		priv->device_attr = *attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, pci_dev->device.devargs);
+		err = mlx5_args(&config, dpdk_dev->devargs);
 		if (err) {
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
 				strerror(rte_errno));
 			goto port_error;
 		}
-		config.hw_csum = !!(attr.device_cap_flags_ex &
+		config.hw_csum = !!(attr->device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!attr.max_counter_sets;
+		config.flow_counter_en = !!attr->max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -971,7 +907,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			attr.rss_caps.max_rwq_indirection_table_size;
+			attr->rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -979,28 +915,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(attr.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr->raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(attr.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr->raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!attr.rx_pad_end_addr_align;
+		config.hw_padding = !!attr->rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = (attr.tso_caps.max_tso > 0 &&
-			      (attr.tso_caps.supported_qpts &
+		config.tso = (attr->tso_caps.max_tso > 0 &&
+			      (attr->tso_caps.supported_qpts &
 			       (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz = attr.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr->tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1041,8 +977,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
 		eth_dev->data->mac_addrs = priv->mac;
-		eth_dev->device = &pci_dev->device;
-		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		eth_dev->device = dpdk_dev;
 		eth_dev->device->driver = &mlx5_driver.driver;
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
@@ -1160,13 +1095,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
-		/*
-		 * Each eth_dev instance is assigned its own Verbs context,
-		 * since this one is consumed, let the next iteration open
-		 * another.
-		 */
-		ctx = NULL;
-		continue;
+		return eth_dev;
 port_error:
 		if (priv)
 			rte_free(priv);
@@ -1174,24 +1103,173 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			claim_zero(mlx5_glue->dealloc_pd(pd));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
-		break;
 	}
-	/*
-	 * XXX if something went wrong in the loop above, there is a resource
-	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
-	 * long as the dpdk does not provide a way to deallocate a ethdev and a
-	 * way to enumerate the registered ethdevs to free the previous ones.
-	 */
 error:
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
-	if (list)
-		mlx5_glue->free_device_list(list);
-	if (err) {
-		rte_errno = err;
+	assert(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * Spawn Ethernet devices from Verbs information, one per detected port.
+ *
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ *
+ * @return
+ *   A NULL-terminated list of Ethernet device objects on success, NULL
+ *   otherwise and rte_errno is set. Caller is expected to release list
+ *   memory through free().
+ */
+static struct rte_eth_dev **
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+	       struct ibv_device *ibv_dev,
+	       int vf)
+{
+	struct rte_eth_dev **eth_list = NULL;
+	struct ibv_context *ctx;
+	struct ibv_device_attr_ex attr;
+	unsigned int i;
+	int ret;
+
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		if (rte_errno == ENODEV)
+			DRV_LOG(ERR,
+				"cannot access device, is mlx5_ib loaded?");
+		else
+			DRV_LOG(ERR,
+				"cannot use device, are drivers up to date?");
+		return NULL;
+	}
+	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
+	mlx5_glue->close_device(ctx);
+	if (ret) {
+		rte_errno = ret;
+		DRV_LOG(ERR, "unable to query device information: %s",
+			strerror(rte_errno));
+		return NULL;
+	}
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	eth_list = malloc(sizeof(*eth_list) *
+			  (attr.orig_attr.phys_port_cnt + 1));
+	if (!eth_list) {
+		rte_errno = errno;
+		return NULL;
+	}
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
+		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
+						 &attr, i + 1);
+		if (eth_list[i])
+			continue;
+		/* Save rte_errno and roll back in case of failure. */
+		ret = rte_errno;
+		while (i--) {
+			mlx5_dev_close(eth_list[i]);
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				rte_free(eth_list[i]->data->dev_private);
+			claim_zero(rte_eth_dev_release_port(eth_list[i]));
+		}
+		free(eth_list);
+		rte_errno = ret;
+		return NULL;
+	}
+	eth_list[i] = NULL;
+	return eth_list;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function creates an Ethernet device for each port of a given
+ * PCI device.
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+	       struct rte_pci_device *pci_dev)
+{
+	struct ibv_device **ibv_list;
+	struct rte_eth_dev **eth_list = NULL;
+	int vf;
+	int ret;
+
+	assert(pci_drv == &mlx5_driver);
+	switch (pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+		vf = 1;
+		break;
+	default:
+		vf = 0;
+	}
+	errno = 0;
+	ibv_list = mlx5_glue->get_device_list(&ret);
+	if (!ibv_list) {
+		rte_errno = errno ? errno : ENOSYS;
+		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
-	return 0;
+	while (ret-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
+			continue;
+		if (pci_dev->addr.domain != pci_addr.domain ||
+		    pci_dev->addr.bus != pci_addr.bus ||
+		    pci_dev->addr.devid != pci_addr.devid ||
+		    pci_dev->addr.function != pci_addr.function)
+			continue;
+		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+			ibv_list[ret]->name);
+		break;
+	}
+	if (ret >= 0)
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	mlx5_glue->free_device_list(ibv_list);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"no Verbs device matches PCI device " PCI_PRI_FMT ","
+			" are kernel drivers loaded?",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+		rte_errno = ENOENT;
+		ret = -rte_errno;
+	} else if (!eth_list || !*eth_list) {
+		DRV_LOG(ERR,
+			"probe of PCI device " PCI_PRI_FMT " aborted after"
+			" encountering an error: %s",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function,
+			strerror(rte_errno));
+		ret = -rte_errno;
+	} else {
+		for (ret = 0; eth_list[ret]; ++ret) {
+			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
+			rte_eth_dev_probing_finish(eth_list[ret]);
+		}
+		ret = 0;
+	}
+	free(eth_list);
+	return ret;
 }
 
 static const struct rte_pci_id mlx5_pci_id_map[] = {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v2 4/7] net/mlx5: re-indent generic probing function
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
                     ` (2 preceding siblings ...)
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code Adrien Mazarguil
@ 2018-06-14  8:34   ` Adrien Mazarguil
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness Adrien Mazarguil
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:34 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Since commit "net/mlx5: split PCI from generic probing code" extracted the
inner loop to a separate function, mlx5_dev_spawn_one() is left with an
unnecessary indent level.

This patch eliminates a block, moves its local variables to function scope,
and re-indents its contents.

No functional impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming(Steven) Li <xuemingl@mellanox.com>
---
 drivers/net/mlx5/mlx5.c | 615 +++++++++++++++++++++----------------------
 1 file changed, 299 insertions(+), 316 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 01dcf25b9..c9815d721 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -660,8 +660,27 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		   unsigned int port)
 {
 	struct ibv_context *ctx;
+	struct ibv_port_attr port_attr;
+	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct mlx5_dev_config config = {
+		.vf = !!vf,
+		.tx_vec_en = 1,
+		.rx_vec_en = 1,
+		.mpw_hdr_dseg = 0,
+		.txq_inline = MLX5_ARG_UNSET,
+		.txqs_inline = MLX5_ARG_UNSET,
+		.inline_max_packet_sz = MLX5_ARG_UNSET,
+		.vf_nl_en = 1,
+		.mprq = {
+			.enabled = 0,
+			.stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
+			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
+			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
+		},
+	};
 	struct rte_eth_dev *eth_dev = NULL;
+	struct priv *priv = NULL;
 	int err = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
@@ -677,6 +696,8 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
+	struct ether_addr mac;
+	char name[RTE_ETH_NAME_MAX_LEN];
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
@@ -712,11 +733,13 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		DRV_LOG(DEBUG, "MPW isn't supported");
 		mps = MLX5_MPW_DISABLED;
 	}
+	config.mps = mps;
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
 		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
+	config.swp = !!swp;
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
@@ -742,6 +765,8 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 			mprq_caps.min_single_wqe_log_num_of_strides;
 		mprq_max_stride_num_n =
 			mprq_caps.max_single_wqe_log_num_of_strides;
+		config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+						   mprq_min_stride_num_n);
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
@@ -749,6 +774,7 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
+	config.cqe_comp = cqe_comp;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
 		tunnel_en = ((dv_attr.tunnel_offloads_caps &
@@ -762,6 +788,7 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	DRV_LOG(WARNING,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
+	config.tunnel_en = tunnel_en;
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
 	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
@@ -773,338 +800,294 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	{
-		char name[RTE_ETH_NAME_MAX_LEN];
-		struct ibv_port_attr port_attr;
-		struct ibv_pd *pd = NULL;
-		struct priv *priv = NULL;
-		struct ether_addr mac;
-		struct mlx5_dev_config config = {
-			.cqe_comp = cqe_comp,
-			.mps = mps,
-			.tunnel_en = tunnel_en,
-			.mpls_en = mpls_en,
-			.tx_vec_en = 1,
-			.rx_vec_en = 1,
-			.mpw_hdr_dseg = 0,
-			.txq_inline = MLX5_ARG_UNSET,
-			.txqs_inline = MLX5_ARG_UNSET,
-			.inline_max_packet_sz = MLX5_ARG_UNSET,
-			.vf_nl_en = 1,
-			.swp = !!swp,
-			.mprq = {
-				.enabled = 0, /* Disabled by default. */
-				.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
-							mprq_min_stride_num_n),
-				.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
-				.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
-			},
-		};
-
-		if (attr->orig_attr.phys_port_cnt > 1)
-			snprintf(name, sizeof(name), "%s port %u",
-				 dpdk_dev->name, port);
-		else
-			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
-		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-			eth_dev = rte_eth_dev_attach_secondary(name);
-			if (eth_dev == NULL) {
-				DRV_LOG(ERR, "can not attach rte ethdev");
-				rte_errno = ENOMEM;
-				err = rte_errno;
-				goto error;
-			}
-			eth_dev->device = dpdk_dev;
-			eth_dev->dev_ops = &mlx5_dev_sec_ops;
-			err = mlx5_uar_init_secondary(eth_dev);
-			if (err) {
-				err = rte_errno;
-				goto error;
-			}
-			/* Receive command fd from primary process */
-			err = mlx5_socket_connect(eth_dev);
-			if (err < 0) {
-				err = rte_errno;
-				goto error;
-			}
-			/* Remap UAR for Tx queues. */
-			err = mlx5_tx_uar_remap(eth_dev, err);
-			if (err) {
-				err = rte_errno;
-				goto error;
-			}
-			/*
-			 * Ethdev pointer is still required as input since
-			 * the primary device is not accessible from the
-			 * secondary process.
-			 */
-			eth_dev->rx_pkt_burst =
-				mlx5_select_rx_function(eth_dev);
-			eth_dev->tx_pkt_burst =
-				mlx5_select_tx_function(eth_dev);
-			mlx5_glue->close_device(ctx);
-			return eth_dev;
-		}
-		DRV_LOG(DEBUG, "using port %u", port);
-		/* Check port status. */
-		err = mlx5_glue->query_port(ctx, port, &port_attr);
-		if (err) {
-			DRV_LOG(ERR, "port query failed: %s", strerror(err));
-			goto port_error;
-		}
-		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-			DRV_LOG(ERR,
-				"port %d is not configured in Ethernet mode",
-				port);
-			err = EINVAL;
-			goto port_error;
-		}
-		if (port_attr.state != IBV_PORT_ACTIVE)
-			DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
-				port,
-				mlx5_glue->port_state_str(port_attr.state),
-				port_attr.state);
-		/* Allocate protection domain. */
-		pd = mlx5_glue->alloc_pd(ctx);
-		if (pd == NULL) {
-			DRV_LOG(ERR, "PD allocation failure");
-			err = ENOMEM;
-			goto port_error;
-		}
-		/* from rte_ethdev.c */
-		priv = rte_zmalloc("ethdev private structure",
-				   sizeof(*priv),
-				   RTE_CACHE_LINE_SIZE);
-		if (priv == NULL) {
-			DRV_LOG(ERR, "priv allocation failure");
-			err = ENOMEM;
-			goto port_error;
-		}
-		priv->ctx = ctx;
-		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
-			sizeof(priv->ibdev_path));
-		priv->device_attr = *attr;
-		priv->port = port;
-		priv->pd = pd;
-		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, dpdk_dev->devargs);
-		if (err) {
-			err = rte_errno;
-			DRV_LOG(ERR, "failed to process device arguments: %s",
-				strerror(rte_errno));
-			goto port_error;
-		}
-		config.hw_csum = !!(attr->device_cap_flags_ex &
-				    IBV_DEVICE_RAW_IP_CSUM);
-		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
-			(config.hw_csum ? "" : "not "));
-#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!attr->max_counter_sets;
-		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
-		DRV_LOG(DEBUG,
-			"counter type = %d, num of cs = %ld, attributes = %d",
-			cs_desc.counter_type, cs_desc.num_of_cs,
-			cs_desc.attributes);
-#endif
-		config.ind_table_max_size =
-			attr->rss_caps.max_rwq_indirection_table_size;
-		/* Remove this check once DPDK supports larger/variable
-		 * indirection tables. */
-		if (config.ind_table_max_size >
-				(unsigned int)ETH_RSS_RETA_SIZE_512)
-			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
-		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
-			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(attr->raw_packet_caps &
-					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
-		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
-			(config.hw_vlan_strip ? "" : "not "));
-
-		config.hw_fcs_strip = !!(attr->raw_packet_caps &
-					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
-		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
-			(config.hw_fcs_strip ? "" : "not "));
-
-#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!attr->rx_pad_end_addr_align;
-#endif
-		DRV_LOG(DEBUG,
-			"hardware Rx end alignment padding is %ssupported",
-			(config.hw_padding ? "" : "not "));
-		config.vf = vf;
-		config.tso = (attr->tso_caps.max_tso > 0 &&
-			      (attr->tso_caps.supported_qpts &
-			       (1 << IBV_QPT_RAW_PACKET)));
-		if (config.tso)
-			config.tso_max_payload_sz = attr->tso_caps.max_tso;
-		if (config.mps && !mps) {
-			DRV_LOG(ERR,
-				"multi-packet send not supported on this device"
-				" (" MLX5_TXQ_MPW_EN ")");
-			err = ENOTSUP;
-			goto port_error;
-		}
-		DRV_LOG(INFO, "%s MPS is %s",
-			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
-			config.mps != MLX5_MPW_DISABLED ? "enabled" :
-			"disabled");
-		if (config.cqe_comp && !cqe_comp) {
-			DRV_LOG(WARNING, "Rx CQE compression isn't supported");
-			config.cqe_comp = 0;
-		}
-		config.mprq.enabled = config.mprq.enabled && mprq;
-		if (config.mprq.enabled) {
-			if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
-			    config.mprq.stride_num_n < mprq_min_stride_num_n) {
-				config.mprq.stride_num_n =
-					RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
-						mprq_min_stride_num_n);
-				DRV_LOG(WARNING,
-					"the number of strides"
-					" for Multi-Packet RQ is out of range,"
-					" setting default value (%u)",
-					1 << config.mprq.stride_num_n);
-			}
-			config.mprq.min_stride_size_n = mprq_min_stride_size_n;
-			config.mprq.max_stride_size_n = mprq_max_stride_size_n;
-		}
-		eth_dev = rte_eth_dev_allocate(name);
+	config.mpls_en = mpls_en;
+	if (attr->orig_attr.phys_port_cnt > 1)
+		snprintf(name, sizeof(name), "%s port %u",
+			 dpdk_dev->name, port);
+	else
+		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
-			DRV_LOG(ERR, "can not allocate rte ethdev");
-			err = ENOMEM;
-			goto port_error;
+			DRV_LOG(ERR, "can not attach rte ethdev");
+			rte_errno = ENOMEM;
+			err = rte_errno;
+			goto error;
 		}
-		eth_dev->data->dev_private = priv;
-		priv->dev_data = eth_dev->data;
-		eth_dev->data->mac_addrs = priv->mac;
 		eth_dev->device = dpdk_dev;
-		eth_dev->device->driver = &mlx5_driver.driver;
-		err = mlx5_uar_init_primary(eth_dev);
+		eth_dev->dev_ops = &mlx5_dev_sec_ops;
+		err = mlx5_uar_init_secondary(eth_dev);
 		if (err) {
 			err = rte_errno;
-			goto port_error;
-		}
-		/* Configure the first MAC address by default. */
-		if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
-			DRV_LOG(ERR,
-				"port %u cannot get MAC address, is mlx5_en"
-				" loaded? (errno: %s)",
-				eth_dev->data->port_id, strerror(rte_errno));
-			err = ENODEV;
-			goto port_error;
-		}
-		DRV_LOG(INFO,
-			"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
-			eth_dev->data->port_id,
-			mac.addr_bytes[0], mac.addr_bytes[1],
-			mac.addr_bytes[2], mac.addr_bytes[3],
-			mac.addr_bytes[4], mac.addr_bytes[5]);
-#ifndef NDEBUG
-		{
-			char ifname[IF_NAMESIZE];
-
-			if (mlx5_get_ifname(eth_dev, &ifname) == 0)
-				DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
-					eth_dev->data->port_id, ifname);
-			else
-				DRV_LOG(DEBUG, "port %u ifname is unknown",
-					eth_dev->data->port_id);
+			goto error;
 		}
-#endif
-		/* Get actual MTU if possible. */
-		err = mlx5_get_mtu(eth_dev, &priv->mtu);
-		if (err) {
+		/* Receive command fd from primary process */
+		err = mlx5_socket_connect(eth_dev);
+		if (err < 0) {
 			err = rte_errno;
-			goto port_error;
-		}
-		DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
-			priv->mtu);
-		/*
-		 * Initialize burst functions to prevent crashes before link-up.
-		 */
-		eth_dev->rx_pkt_burst = removed_rx_burst;
-		eth_dev->tx_pkt_burst = removed_tx_burst;
-		eth_dev->dev_ops = &mlx5_dev_ops;
-		/* Register MAC address. */
-		claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
-		priv->nl_socket = -1;
-		priv->nl_sn = 0;
-		if (vf && config.vf_nl_en) {
-			priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
-			if (priv->nl_socket < 0)
-				priv->nl_socket = -1;
-			mlx5_nl_mac_addr_sync(eth_dev);
+			goto error;
 		}
-		TAILQ_INIT(&priv->flows);
-		TAILQ_INIT(&priv->ctrl_flows);
-		/* Hint libmlx5 to use PMD allocator for data plane resources */
-		struct mlx5dv_ctx_allocators alctr = {
-			.alloc = &mlx5_alloc_verbs_buf,
-			.free = &mlx5_free_verbs_buf,
-			.data = priv,
-		};
-		mlx5_glue->dv_set_context_attr(ctx,
-					       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
-					       (void *)((uintptr_t)&alctr));
-		/* Bring Ethernet device up. */
-		DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
-			eth_dev->data->port_id);
-		mlx5_set_link_up(eth_dev);
-		/*
-		 * Even though the interrupt handler is not installed yet,
-		 * interrupts will still trigger on the asyn_fd from
-		 * Verbs context returned by ibv_open_device().
-		 */
-		mlx5_link_update(eth_dev, 0);
-		/* Store device configuration on private structure. */
-		priv->config = config;
-		/* Create drop queue. */
-		err = mlx5_flow_create_drop_queue(eth_dev);
+		/* Remap UAR for Tx queues. */
+		err = mlx5_tx_uar_remap(eth_dev, err);
 		if (err) {
-			DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
-				eth_dev->data->port_id, strerror(rte_errno));
 			err = rte_errno;
-			goto port_error;
-		}
-		/* Supported Verbs flow priority number detection. */
-		if (verb_priorities == 0)
-			verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
-		if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
-			DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
-				eth_dev->data->port_id, verb_priorities);
-			err = ENOTSUP;
-			goto port_error;
+			goto error;
 		}
-		priv->config.max_verbs_prio = verb_priorities;
 		/*
-		 * Once the device is added to the list of memory event
-		 * callback, its global MR cache table cannot be expanded
-		 * on the fly because of deadlock. If it overflows, lookup
-		 * should be done by searching MR list linearly, which is slow.
+		 * Ethdev pointer is still required as input since
+		 * the primary device is not accessible from the
+		 * secondary process.
 		 */
-		err = mlx5_mr_btree_init(&priv->mr.cache,
-					 MLX5_MR_BTREE_CACHE_N * 2,
-					 eth_dev->device->numa_node);
-		if (err) {
-			err = rte_errno;
-			goto port_error;
-		}
-		/* Add device to memory callback list. */
-		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-		LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
-				 priv, mem_event_cb);
-		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
-		rte_eth_dev_probing_finish(eth_dev);
+		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
+		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
+		mlx5_glue->close_device(ctx);
 		return eth_dev;
-port_error:
-		if (priv)
-			rte_free(priv);
-		if (pd)
-			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
-			rte_eth_dev_release_port(eth_dev);
 	}
+	DRV_LOG(DEBUG, "using port %u", port);
+	/* Check port status. */
+	err = mlx5_glue->query_port(ctx, port, &port_attr);
+	if (err) {
+		DRV_LOG(ERR, "port query failed: %s", strerror(err));
+		goto error;
+	}
+	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		DRV_LOG(ERR, "port %d is not configured in Ethernet mode",
+			port);
+		err = EINVAL;
+		goto error;
+	}
+	if (port_attr.state != IBV_PORT_ACTIVE)
+		DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
+			port, mlx5_glue->port_state_str(port_attr.state),
+			port_attr.state);
+	/* Allocate protection domain. */
+	pd = mlx5_glue->alloc_pd(ctx);
+	if (pd == NULL) {
+		DRV_LOG(ERR, "PD allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv = rte_zmalloc("ethdev private structure",
+			   sizeof(*priv),
+			   RTE_CACHE_LINE_SIZE);
+	if (priv == NULL) {
+		DRV_LOG(ERR, "priv allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv->ctx = ctx;
+	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
+		sizeof(priv->ibdev_path));
+	priv->device_attr = *attr;
+	priv->port = port;
+	priv->pd = pd;
+	priv->mtu = ETHER_MTU;
+	err = mlx5_args(&config, dpdk_dev->devargs);
+	if (err) {
+		err = rte_errno;
+		DRV_LOG(ERR, "failed to process device arguments: %s",
+			strerror(rte_errno));
+		goto error;
+	}
+	config.hw_csum = !!(attr->device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM);
+	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
+		(config.hw_csum ? "" : "not "));
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+	config.flow_counter_en = !!attr->max_counter_sets;
+	mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
+	DRV_LOG(DEBUG, "counter type = %d, num of cs = %ld, attributes = %d",
+		cs_desc.counter_type, cs_desc.num_of_cs,
+		cs_desc.attributes);
+#endif
+	config.ind_table_max_size =
+		attr->rss_caps.max_rwq_indirection_table_size;
+	/*
+	 * Remove this check once DPDK supports larger/variable
+	 * indirection tables.
+	 */
+	if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
+		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
+	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
+		config.ind_table_max_size);
+	config.hw_vlan_strip = !!(attr->raw_packet_caps &
+				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
+	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
+		(config.hw_vlan_strip ? "" : "not "));
+	config.hw_fcs_strip = !!(attr->raw_packet_caps &
+				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
+	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
+		(config.hw_fcs_strip ? "" : "not "));
+#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
+	config.hw_padding = !!attr->rx_pad_end_addr_align;
+#endif
+	DRV_LOG(DEBUG, "hardware Rx end alignment padding is %ssupported",
+		(config.hw_padding ? "" : "not "));
+	config.tso = (attr->tso_caps.max_tso > 0 &&
+		      (attr->tso_caps.supported_qpts &
+		       (1 << IBV_QPT_RAW_PACKET)));
+	if (config.tso)
+		config.tso_max_payload_sz = attr->tso_caps.max_tso;
+	if (config.mps && !mps) {
+		DRV_LOG(ERR,
+			"multi-packet send not supported on this device"
+			" (" MLX5_TXQ_MPW_EN ")");
+		err = ENOTSUP;
+		goto error;
+	}
+	DRV_LOG(INFO, "%sMPS is %s",
+		config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
+		config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
+	if (config.cqe_comp && !cqe_comp) {
+		DRV_LOG(WARNING, "Rx CQE compression isn't supported");
+		config.cqe_comp = 0;
+	}
+	config.mprq.enabled = config.mprq.enabled && mprq;
+	if (config.mprq.enabled) {
+		if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
+		    config.mprq.stride_num_n < mprq_min_stride_num_n) {
+			config.mprq.stride_num_n =
+				RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+					mprq_min_stride_num_n);
+			DRV_LOG(WARNING,
+				"the number of strides"
+				" for Multi-Packet RQ is out of range,"
+				" setting default value (%u)",
+				1 << config.mprq.stride_num_n);
+		}
+		config.mprq.min_stride_size_n = mprq_min_stride_size_n;
+		config.mprq.max_stride_size_n = mprq_max_stride_size_n;
+	}
+	eth_dev = rte_eth_dev_allocate(name);
+	if (eth_dev == NULL) {
+		DRV_LOG(ERR, "can not allocate rte ethdev");
+		err = ENOMEM;
+		goto error;
+	}
+	eth_dev->data->dev_private = priv;
+	priv->dev_data = eth_dev->data;
+	eth_dev->data->mac_addrs = priv->mac;
+	eth_dev->device = dpdk_dev;
+	eth_dev->device->driver = &mlx5_driver.driver;
+	err = mlx5_uar_init_primary(eth_dev);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	/* Configure the first MAC address by default. */
+	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
+		DRV_LOG(ERR,
+			"port %u cannot get MAC address, is mlx5_en"
+			" loaded? (errno: %s)",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = ENODEV;
+		goto error;
+	}
+	DRV_LOG(INFO,
+		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
+		eth_dev->data->port_id,
+		mac.addr_bytes[0], mac.addr_bytes[1],
+		mac.addr_bytes[2], mac.addr_bytes[3],
+		mac.addr_bytes[4], mac.addr_bytes[5]);
+#ifndef NDEBUG
+	{
+		char ifname[IF_NAMESIZE];
+
+		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
+			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
+				eth_dev->data->port_id, ifname);
+		else
+			DRV_LOG(DEBUG, "port %u ifname is unknown",
+				eth_dev->data->port_id);
+	}
+#endif
+	/* Get actual MTU if possible. */
+	err = mlx5_get_mtu(eth_dev, &priv->mtu);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
+		priv->mtu);
+	/* Initialize burst functions to prevent crashes before link-up. */
+	eth_dev->rx_pkt_burst = removed_rx_burst;
+	eth_dev->tx_pkt_burst = removed_tx_burst;
+	eth_dev->dev_ops = &mlx5_dev_ops;
+	/* Register MAC address. */
+	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
+	priv->nl_socket = -1;
+	priv->nl_sn = 0;
+	if (vf && config.vf_nl_en) {
+		priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
+		if (priv->nl_socket < 0)
+			priv->nl_socket = -1;
+		mlx5_nl_mac_addr_sync(eth_dev);
+	}
+	TAILQ_INIT(&priv->flows);
+	TAILQ_INIT(&priv->ctrl_flows);
+	/* Hint libmlx5 to use PMD allocator for data plane resources */
+	struct mlx5dv_ctx_allocators alctr = {
+		.alloc = &mlx5_alloc_verbs_buf,
+		.free = &mlx5_free_verbs_buf,
+		.data = priv,
+	};
+	mlx5_glue->dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+				       (void *)((uintptr_t)&alctr));
+	/* Bring Ethernet device up. */
+	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
+		eth_dev->data->port_id);
+	mlx5_set_link_up(eth_dev);
+	/*
+	 * Even though the interrupt handler is not installed yet,
+	 * interrupts will still trigger on the asyn_fd from
+	 * Verbs context returned by ibv_open_device().
+	 */
+	mlx5_link_update(eth_dev, 0);
+	/* Store device configuration on private structure. */
+	priv->config = config;
+	/* Create drop queue. */
+	err = mlx5_flow_create_drop_queue(eth_dev);
+	if (err) {
+		DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = rte_errno;
+		goto error;
+	}
+	/* Supported Verbs flow priority number detection. */
+	if (verb_priorities == 0)
+		verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
+	if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
+		DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
+			eth_dev->data->port_id, verb_priorities);
+		err = ENOTSUP;
+		goto error;
+	}
+	priv->config.max_verbs_prio = verb_priorities;
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 */
+	err = mlx5_mr_btree_init(&priv->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 eth_dev->device->numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	/* Add device to memory callback list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
+			 priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+	return eth_dev;
 error:
+	if (priv)
+		rte_free(priv);
+	if (pd)
+		claim_zero(mlx5_glue->dealloc_pd(pd));
+	if (eth_dev)
+		rte_eth_dev_release_port(eth_dev);
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
 	assert(err > 0);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
                     ` (3 preceding siblings ...)
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 4/7] net/mlx5: re-indent generic probing function Adrien Mazarguil
@ 2018-06-14  8:34   ` Adrien Mazarguil
  2018-06-16  8:37     ` Xueming(Steven) Li
  2018-06-14  8:35   ` [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors Adrien Mazarguil
                     ` (2 subsequent siblings)
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:34 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

The current PCI probing method is not aware of Verbs port representors,
which appear as standard Verbs devices bound to the same PCI address and
cannot be distinguished.

Problem is that more often than not, the wrong Verbs device is used,
resulting in unexpected traffic.

This patch adds necessary heuristics to bind affected driver instances to
the intended (i.e. non-representor) device.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v2 changes:

- Fixed digit detection in mlx5_cmp_ibv_name() so that "foo1" and "foo10"
  are compared on the integer conversion of "1" against "10" instead of ""
  and "0".
---
 drivers/net/mlx5/mlx5.c | 66 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index c9815d721..498f80c89 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -3,6 +3,7 @@
  * Copyright 2015 Mellanox Technologies, Ltd
  */
 
+#include <ctype.h>
 #include <stddef.h>
 #include <unistd.h>
 #include <string.h>
@@ -1170,6 +1171,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 }
 
 /**
+ * Comparison callback to sort Verbs device names.
+ *
+ * This is meant to be used with qsort().
+ *
+ * @param a[in]
+ *   Pointer to pointer to first Verbs device.
+ * @param b[in]
+ *   Pointer to pointer to second Verbs device.
+ *
+ * @return
+ *   0 if both names are equal, less than 0 if the first argument is less
+ *   than the second, greater than 0 otherwise.
+ */
+static int
+mlx5_cmp_ibv_name(const void *a, const void *b)
+{
+	const char *name_a = (*(const struct ibv_device *const *)a)->name;
+	const char *name_b = (*(const struct ibv_device *const *)b)->name;
+	size_t i = 0;
+
+	while (name_a[i] && name_a[i] == name_b[i])
+		++i;
+	while (i && isdigit(name_a[i - 1]) && isdigit(name_b[i - 1]))
+		--i;
+	return atoi(name_a + i) - atoi(name_b + i);
+}
+
+/**
  * DPDK callback to register a PCI device.
  *
  * This function creates an Ethernet device for each port of a given
@@ -1189,6 +1218,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **ibv_list;
 	struct rte_eth_dev **eth_list = NULL;
+	int n = 0;
 	int vf;
 	int ret;
 
@@ -1210,6 +1240,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
+
+	struct ibv_device *ibv_match[ret + 1];
+
 	while (ret-- > 0) {
 		struct rte_pci_addr pci_addr;
 
@@ -1221,14 +1254,37 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		    pci_dev->addr.devid != pci_addr.devid ||
 		    pci_dev->addr.function != pci_addr.function)
 			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
 			ibv_list[ret]->name);
-		break;
+		ibv_match[n++] = ibv_list[ret];
+	}
+	ibv_match[n] = NULL;
+	if (n > 1) {
+		/*
+		 * The existence of several matching entries means port
+		 * representors have been instantiated. No existing Verbs
+		 * call nor /sys entries can tell them apart at this point.
+		 *
+		 * While definitely hackish, assume their names are numbered
+		 * based on order of creation with master device first,
+		 * followed by first port representor, followed by the
+		 * second one and so on.
+		 */
+		DRV_LOG(WARNING,
+			"probing device with port representors involves"
+			" heuristics with uncertain outcome");
+		qsort(ibv_match, n, sizeof(*ibv_match), mlx5_cmp_ibv_name);
+		DRV_LOG(WARNING, "assuming \"%s\" is the master device",
+			ibv_match[0]->name);
+		for (ret = 1; ret < n; ++ret)
+			DRV_LOG(WARNING,
+				"assuming \"%s\" is port representor #%d",
+				ibv_match[ret]->name, ret - 1);
 	}
-	if (ret >= 0)
-		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	if (n)
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
 	mlx5_glue->free_device_list(ibv_list);
-	if (!ret) {
+	if (!n) {
 		DRV_LOG(WARNING,
 			"no Verbs device matches PCI device " PCI_PRI_FMT ","
 			" are kernel drivers loaded?",
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
                     ` (4 preceding siblings ...)
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness Adrien Mazarguil
@ 2018-06-14  8:35   ` Adrien Mazarguil
  2018-06-16  8:57     ` Xueming(Steven) Li
  2018-06-14  8:35   ` [dpdk-dev] [PATCH v2 7/7] net/mlx5: add parameter for " Adrien Mazarguil
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Probe existing port representors in addition to their master device and
associate them automatically.

To avoid name collision between Ethernet devices, their names use the same
convention as ixgbe and i40e PMDs, that is, instead of only a PCI address
in DBDF notation:

- "net_{DBDF}_0" for master/switch devices.
- "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
  representors.

Both optionally suffixed with "_port_{num}" instead of " port {num}" for
devices that expose several Verbs ports (note this is never the case on
mlx5, but kept for historical reasons for the time being).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v2 changes:

- Added representor information to dev_infos_get(). DPDK port ID of master
  device is now stored in the private structure to retrieve it
  conveniently.
- Master device is assigned dummy representor ID value -1 to better
  distinguish from the the first actual representor reported by
  dev_infos_get() as those are indexed from 0.
- Added RTE_ETH_DEV_REPRESENTOR device flag.
---
 drivers/net/mlx5/mlx5.c        | 138 ++++++++++++++++++++++++--------
 drivers/net/mlx5/mlx5.h        |   9 ++-
 drivers/net/mlx5/mlx5_ethdev.c | 151 ++++++++++++++++++++++++++++++++----
 drivers/net/mlx5/mlx5_mac.c    |   2 +-
 drivers/net/mlx5/mlx5_stats.c  |   6 +-
 5 files changed, 252 insertions(+), 54 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 498f80c89..716c9d9a5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -304,6 +304,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	if (ret)
 		DRV_LOG(WARNING, "port %u some flows still remain",
 			dev->data->port_id);
+	if (!priv->representor &&
+	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 	memset(priv, 0, sizeof(*priv));
 }
 
@@ -648,6 +651,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
  *   Verbs device attributes.
  * @param port
  *   Verbs port to use (indexed from 1).
+ * @param master
+ *   Master device in case @p ibv_dev is a port representor.
+ * @param rep_id
+ *   Representor identifier when @p master is non-NULL.
  *
  * @return
  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
@@ -658,7 +665,9 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		   struct ibv_device *ibv_dev,
 		   int vf,
 		   const struct ibv_device_attr_ex *attr,
-		   unsigned int port)
+		   unsigned int port,
+		   struct rte_eth_dev *master,
+		   unsigned int rep_id)
 {
 	struct ibv_context *ctx;
 	struct ibv_port_attr port_attr;
@@ -802,11 +811,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		" old OFED/rdma-core version or firmware configuration");
 #endif
 	config.mpls_en = mpls_en;
-	if (attr->orig_attr.phys_port_cnt > 1)
-		snprintf(name, sizeof(name), "%s port %u",
-			 dpdk_dev->name, port);
+	if (!master)
+		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name);
 	else
-		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
+		snprintf(name, sizeof(name), "net_%s_representor_%u",
+			 dpdk_dev->name, rep_id);
+	if (attr->orig_attr.phys_port_cnt > 1)
+		snprintf(name, sizeof(name), "%s_port_%u", name, port);
+	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
@@ -883,6 +895,30 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	priv->port = port;
 	priv->pd = pd;
 	priv->mtu = ETHER_MTU;
+	/*
+	 * Allocate a switch domain for master devices and share it with
+	 * port representors.
+	 */
+	if (!master) {
+		priv->representor = 0;
+		priv->master_id = -1; /* Updated once known. */
+		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+		priv->rep_id = -1; /* Dummy unique value. */
+		err = rte_eth_switch_domain_alloc(&priv->domain_id);
+		if (err) {
+			err = rte_errno;
+			DRV_LOG(ERR, "unable to allocate switch domain: %s",
+				strerror(rte_errno));
+			goto error;
+		}
+	} else {
+		priv->representor = 1;
+		priv->master_id =
+			((struct priv *)master->data->dev_private)->master_id;
+		priv->domain_id =
+			((struct priv *)master->data->dev_private)->domain_id;
+		priv->rep_id = rep_id;
+	}
 	err = mlx5_args(&config, dpdk_dev->devargs);
 	if (err) {
 		err = rte_errno;
@@ -964,6 +1000,18 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		err = ENOMEM;
 		goto error;
 	}
+	/*
+	 * Now that eth_dev is allocated and its port ID is known, make
+	 * non-representor ports target their own port ID as master for
+	 * convenience.
+	 *
+	 * Master port ID is already set for actual representors. Those only
+	 * need the right device flag.
+	 */
+	if (!master)
+		priv->master_id = eth_dev->data->port_id;
+	else
+		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
 	eth_dev->data->dev_private = priv;
 	priv->dev_data = eth_dev->data;
 	eth_dev->data->mac_addrs = priv->mac;
@@ -1083,8 +1131,12 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	return eth_dev;
 error:
-	if (priv)
+	if (priv) {
+		if (!priv->representor &&
+		    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
+	}
 	if (pd)
 		claim_zero(mlx5_glue->dealloc_pd(pd));
 	if (eth_dev)
@@ -1097,12 +1149,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 }
 
 /**
- * Spawn Ethernet devices from Verbs information, one per detected port.
+ * Spawn Ethernet devices from Verbs information, one per detected port and
+ * port representor.
  *
  * @param dpdk_dev
  *   Backing DPDK device.
  * @param ibv_dev
- *   Verbs device.
+ *   NULL-terminated list of Verbs devices. First entry is the master device
+ *   (mandatory), followed by optional representors.
  * @param vf
  *   If nonzero, enable VF-specific features.
  *
@@ -1113,17 +1167,21 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
  */
 static struct rte_eth_dev **
 mlx5_dev_spawn(struct rte_device *dpdk_dev,
-	       struct ibv_device *ibv_dev,
+	       struct ibv_device **ibv_dev,
 	       int vf)
 {
 	struct rte_eth_dev **eth_list = NULL;
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
+	void *tmp;
 	unsigned int i;
+	unsigned int j = 0;
+	unsigned int n = 0;
 	int ret;
 
+next:
 	errno = 0;
-	ctx = mlx5_glue->open_device(ibv_dev);
+	ctx = mlx5_glue->open_device(ibv_dev[j]);
 	if (!ctx) {
 		rte_errno = errno ? errno : ENODEV;
 		if (rte_errno == ENODEV)
@@ -1132,7 +1190,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		else
 			DRV_LOG(ERR,
 				"cannot use device, are drivers up to date?");
-		return NULL;
+		goto error;
 	}
 	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	mlx5_glue->close_device(ctx);
@@ -1140,34 +1198,42 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		rte_errno = ret;
 		DRV_LOG(ERR, "unable to query device information: %s",
 			strerror(rte_errno));
-		return NULL;
+		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	eth_list = malloc(sizeof(*eth_list) *
-			  (attr.orig_attr.phys_port_cnt + 1));
-	if (!eth_list) {
+	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
+		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
+	tmp = realloc(eth_list, sizeof(*eth_list) *
+		      (n + attr.orig_attr.phys_port_cnt + 1));
+	if (!tmp) {
 		rte_errno = errno;
-		return NULL;
+		goto error;
 	}
+	eth_list = tmp;
 	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
-		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
-						 &attr, i + 1);
-		if (eth_list[i])
-			continue;
-		/* Save rte_errno and roll back in case of failure. */
-		ret = rte_errno;
-		while (i--) {
-			mlx5_dev_close(eth_list[i]);
-			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
-				rte_free(eth_list[i]->data->dev_private);
-			claim_zero(rte_eth_dev_release_port(eth_list[i]));
-		}
-		free(eth_list);
-		rte_errno = ret;
-		return NULL;
+		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j], vf,
+						 &attr, i + 1,
+						 j ? eth_list[0] : NULL,
+						 j - 1);
+		if (!eth_list[n])
+			goto error;
+		++n;
 	}
-	eth_list[i] = NULL;
+	if (ibv_dev[++j])
+		goto next;
+	eth_list[n] = NULL;
 	return eth_list;
+error:
+	/* Save rte_errno and roll back in case of failure. */
+	ret = rte_errno;
+	while (n--) {
+		mlx5_dev_close(eth_list[n]);
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+			rte_free(eth_list[n]->data->dev_private);
+		claim_zero(rte_eth_dev_release_port(eth_list[n]));
+	}
+	free(eth_list);
+	rte_errno = ret;
+	return NULL;
 }
 
 /**
@@ -1282,7 +1348,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				ibv_match[ret]->name, ret - 1);
 	}
 	if (n)
-		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match, vf);
 	mlx5_glue->free_device_list(ibv_list);
 	if (!n) {
 		DRV_LOG(WARNING,
@@ -1302,7 +1368,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		ret = -rte_errno;
 	} else {
 		for (ret = 0; eth_list[ret]; ++ret) {
+			uint32_t restore = eth_list[ret]->data->dev_flags;
+
 			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
+			/* Restore non-PCI flags cleared by the above call. */
+			eth_list[ret]->data->dev_flags |= restore;
 			rte_eth_dev_probing_finish(eth_list[ret]);
 		}
 		ret = 0;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 997b04a33..0fe467140 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -161,6 +161,10 @@ struct priv {
 	uint16_t mtu; /* Configured MTU. */
 	uint8_t port; /* Physical port number. */
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
+	unsigned int representor:1; /* Device is a port representor. */
+	uint16_t master_id; /* DPDK port ID of switch domain master. */
+	uint16_t domain_id; /* Switch domain identifier. */
+	unsigned int rep_id; /* Port representor identifier. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
@@ -209,9 +213,12 @@ int mlx5_getenv_int(const char *);
 
 /* mlx5_ethdev.c */
 
+int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+			   char (*ifname)[IF_NAMESIZE]);
 int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);
 int mlx5_ifindex(const struct rte_eth_dev *dev);
-int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
+int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	       int master);
 int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);
 int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
 		   unsigned int flags);
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 90488af33..9d579659e 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -93,7 +93,7 @@ struct ethtool_link_settings {
 #endif
 
 /**
- * Get interface name from private structure.
+ * Get master interface name from private structure.
  *
  * @param[in] dev
  *   Pointer to Ethernet device.
@@ -104,7 +104,8 @@ struct ethtool_link_settings {
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+		       char (*ifname)[IF_NAMESIZE])
 {
 	struct priv *priv = dev->data->dev_private;
 	DIR *dir;
@@ -179,6 +180,113 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 }
 
 /**
+ * Get interface name from private structure.
+ *
+ * This is a port representor-aware version of mlx5_get_master_ifname().
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] ifname
+ *   Interface name output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+{
+	struct priv *priv = dev->data->dev_private;
+	int ret;
+	char master[IF_NAMESIZE];
+	FILE *file;
+	DIR *dir;
+	uint64_t phys_switch_id;
+
+	if (!priv->representor)
+		return mlx5_get_master_ifname(dev, ifname);
+	ret = mlx5_get_master_ifname(dev, &master);
+	if (ret)
+		return ret;
+	{
+		MKSTR(path, "%s/device/net/%s/phys_switch_id",
+		      priv->ibdev_path, master);
+
+		file = fopen(path, "rb");
+	}
+	if (!file) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	ret = fscanf(file, "%" SCNx64, &phys_switch_id);
+	fclose(file);
+	if (ret != 1) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	{
+		MKSTR(path, "%s/device/net/%s/subsystem",
+		      priv->ibdev_path, master);
+
+		dir = opendir(path);
+	}
+	if (!dir) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	/*
+	 * Scan network interfaces to find one with matching phys_switch_id
+	 * and phys_switch_name.
+	 */
+	do {
+		struct dirent *dent;
+		uint64_t phys_switch_id_rep;
+		int rep_id;
+
+		ret = -ENOENT;
+		dent = readdir(dir);
+		if (!dent)
+			break;
+		{
+			MKSTR(path,
+			      "%s/device/net/%s/subsystem/%s/phys_switch_id",
+			      priv->ibdev_path, master, dent->d_name);
+
+			file = fopen(path, "rb");
+		}
+		if (!file)
+			continue;
+		ret = fscanf(file, "%" SCNx64, &phys_switch_id_rep);
+		fclose(file);
+		if (ret != 1)
+			continue;
+		if (phys_switch_id_rep != phys_switch_id)
+			continue;
+		{
+			MKSTR(path,
+			      "%s/device/net/%s/subsystem/%s/phys_port_name",
+			      priv->ibdev_path, master, dent->d_name);
+
+			file = fopen(path, "rb");
+		}
+		if (!file)
+			continue;
+		ret = fscanf(file, "%d", &rep_id);
+		fclose(file);
+		if (ret != 1)
+			continue;
+		if (rep_id < 0 || (unsigned int)rep_id != priv->rep_id)
+			continue;
+		strlcpy(*ifname, dent->d_name, sizeof(*ifname));
+		ret = 0;
+		break;
+	} while (1);
+	closedir(dir);
+	if (ret)
+		rte_errno = -ret;
+	return ret;
+}
+
+/**
  * Get the interface index from device name.
  *
  * @param[in] dev
@@ -214,12 +322,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
  *   Request number to pass to ioctl().
  * @param[out] ifr
  *   Interface request structure output buffer.
+ * @param master
+ *   When device is a port representor, perform request on master device
+ *   instead.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	   int master)
 {
 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
 	int ret = 0;
@@ -228,7 +340,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
 		rte_errno = errno;
 		return -rte_errno;
 	}
-	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
+	if (master)
+		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
+	else
+		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
 	if (ret)
 		goto error;
 	ret = ioctl(sock, req, ifr);
@@ -258,7 +373,7 @@ int
 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
 
 	if (ret)
 		return ret;
@@ -282,7 +397,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 {
 	struct ifreq request = { .ifr_mtu = mtu, };
 
-	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
+	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
 }
 
 /**
@@ -302,13 +417,13 @@ int
 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
 
 	if (ret)
 		return ret;
 	request.ifr_flags &= keep;
 	request.ifr_flags |= flags & ~keep;
-	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
+	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
 }
 
 /**
@@ -477,6 +592,12 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	info->speed_capa = priv->link_speed_capa;
 	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
 	mlx5_set_default_params(dev, info);
+	if (rte_eth_dev_is_valid_port(priv->master_id)) {
+		info->switch_info.name =
+			rte_eth_devices[priv->master_id].data->name;
+		info->switch_info.domain_id = priv->domain_id;
+		info->switch_info.port_id = priv->rep_id;
+	}
 }
 
 /**
@@ -540,7 +661,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	int link_speed = 0;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -550,7 +671,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&edata;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
@@ -611,7 +732,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	uint64_t sc;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -621,7 +742,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&gcmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -638,7 +759,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 
 	*ecmd = gcmd;
 	ifr.ifr_data = (void *)ecmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -801,7 +922,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	int ret;
 
 	ifr.ifr_data = (void *)&ethpause;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
@@ -854,7 +975,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 		ethpause.tx_pause = 1;
 	else
 		ethpause.tx_pause = 0;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
index 672a47619..12ee37f55 100644
--- a/drivers/net/mlx5/mlx5_mac.c
+++ b/drivers/net/mlx5/mlx5_mac.c
@@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
 	struct ifreq request;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
+	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
 	if (ret)
 		return ret;
 	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c
index 875dd1027..91f3d474a 100644
--- a/drivers/net/mlx5/mlx5_stats.c
+++ b/drivers/net/mlx5/mlx5_stats.c
@@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
 	et_stats->cmd = ETHTOOL_GSTATS;
 	et_stats->n_stats = xstats_ctrl->stats_n;
 	ifr.ifr_data = (caddr_t)et_stats;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u unable to read statistic values from device",
@@ -194,7 +194,7 @@ mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
 
 	drvinfo.cmd = ETHTOOL_GDRVINFO;
 	ifr.ifr_data = (caddr_t)&drvinfo;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to query number of statistics",
 			dev->data->port_id);
@@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
 	strings->string_set = ETH_SS_STATS;
 	strings->len = dev_stats_n;
 	ifr.ifr_data = (caddr_t)strings;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to get statistic names",
 			dev->data->port_id);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v2 7/7] net/mlx5: add parameter for port representors
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
                     ` (5 preceding siblings ...)
  2018-06-14  8:35   ` [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors Adrien Mazarguil
@ 2018-06-14  8:35   ` Adrien Mazarguil
  2018-06-16  8:59     ` Xueming(Steven) Li
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
  7 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-14  8:35 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Prior to this patch, all port representors detected on a given device were
probed and Ethernet devices instantiated for each of them.

This patch adds support for the standard "representor" parameter, which
implies that port representors are not probed by default anymore, except
for the list provided through device arguments.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v2 changes:

- Added error message for when rte_eth_devargs_parse() fails.
---
 doc/guides/nics/mlx5.rst                | 12 ++++++++++++
 doc/guides/prog_guide/poll_mode_drv.rst |  2 ++
 drivers/net/mlx5/mlx5.c                 | 29 ++++++++++++++++++++++++++++
 3 files changed, 43 insertions(+)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 79c982e29..5229e546c 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -388,6 +388,18 @@ Run-time configuration
 
   Disabled by default.
 
+- ``representor`` parameter [list]
+
+  This parameter can be used to instantiate DPDK Ethernet devices from
+  existing port (or VF) representors configured on the device.
+
+  It is a standard parameter whose format is described in
+  :ref:`ethernet_device_standard_device_arguments`.
+
+  For instance, to probe port representors 0 through 2::
+
+    representor=[0-2]
+
 Firmware configuration
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/prog_guide/poll_mode_drv.rst b/doc/guides/prog_guide/poll_mode_drv.rst
index af82352a0..58d49ba0f 100644
--- a/doc/guides/prog_guide/poll_mode_drv.rst
+++ b/doc/guides/prog_guide/poll_mode_drv.rst
@@ -365,6 +365,8 @@ Ethernet Device API
 
 The Ethernet device API exported by the Ethernet PMDs is described in the *DPDK API Reference*.
 
+.. _ethernet_device_standard_device_arguments:
+
 Ethernet Device Standard Device Arguments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 716c9d9a5..26e61d99d 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -91,6 +91,9 @@
 /* Activate Netlink support in VF mode. */
 #define MLX5_VF_NL_EN "vf_nl_en"
 
+/* Select port representors to instantiate. */
+#define MLX5_REPRESENTOR "representor"
+
 #ifndef HAVE_IBV_MLX5_MOD_MPW
 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
@@ -423,6 +426,9 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	struct mlx5_dev_config *config = opaque;
 	unsigned long tmp;
 
+	/* No-op, port representors are processed in mlx5_dev_spawn(). */
+	if (!strcmp(MLX5_REPRESENTOR, key))
+		return 0;
 	errno = 0;
 	tmp = strtoul(val, NULL, 0);
 	if (errno) {
@@ -495,6 +501,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
 		MLX5_VF_NL_EN,
+		MLX5_REPRESENTOR,
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
@@ -1173,13 +1180,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	struct rte_eth_dev **eth_list = NULL;
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
+	struct rte_eth_devargs eth_da;
 	void *tmp;
 	unsigned int i;
 	unsigned int j = 0;
 	unsigned int n = 0;
 	int ret;
 
+	if (dpdk_dev->devargs) {
+		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
+		if (ret) {
+			rte_errno = -ret;
+			DRV_LOG(ERR, "failed to process device arguments: %s",
+				strerror(rte_errno));
+			goto error;
+		}
+	} else {
+		memset(&eth_da, 0, sizeof(eth_da));
+	}
 next:
+	if (j) {
+		unsigned int k;
+
+		for (k = 0; k < eth_da.nb_representor_ports; ++k)
+			if (eth_da.representor_ports[k] == j - 1)
+				break;
+		if (k == eth_da.nb_representor_ports)
+			goto skip;
+	}
 	errno = 0;
 	ctx = mlx5_glue->open_device(ibv_dev[j]);
 	if (!ctx) {
@@ -1218,6 +1246,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			goto error;
 		++n;
 	}
+skip:
 	if (ibv_dev[++j])
 		goto next;
 	eth_list[n] = NULL;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe code
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe code Adrien Mazarguil
@ 2018-06-16  8:24     ` Xueming(Steven) Li
  0 siblings, 0 replies; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-16  8:24 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev


> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe code
> 
> There are several attribute objects in this function:
> 
> - IB device attributes (struct ibv_device_attr_ex device_attr).
> - Direct Verbs attributes (struct mlx5dv_context attrs_out).
> - Port attributes (struct ibv_port_attr).
> - IB device attributes again (struct ibv_device_attr_ex device_attr_ex).
> 
> "attrs_out" is both odd and initialized using a nonstandard syntax. Rename it "dv_attr" for
> consistency.
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Fixed ctx -> attr_ctx in mlx5_pci_probe().
> ---
>  drivers/net/mlx5/mlx5.c | 34 +++++++++++++++++-----------------
>  1 file changed, 17 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 3e0a1b186..3bdcb3970 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -654,6 +654,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,  {
>  	struct ibv_device **list = NULL;
>  	struct ibv_device *ibv_dev;
> +	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
>  	int err = 0;
>  	struct ibv_context *attr_ctx = NULL;
>  	struct ibv_device_attr_ex device_attr; @@ -670,7 +671,6 @@ mlx5_pci_probe(struct rte_pci_driver
> *pci_drv __rte_unused,
>  	unsigned int mprq_min_stride_num_n = 0;
>  	unsigned int mprq_max_stride_num_n = 0;
>  	int i;
> -	struct mlx5dv_context attrs_out = {0};
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
>  	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };  #endif @@ -736,21 +736,21
> @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	ibv_dev = list[i];
>  	DRV_LOG(DEBUG, "device opened");
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
> -	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
> +	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
>  #endif
>  	/*
>  	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
>  	 * as all ConnectX-5 devices.
>  	 */
>  #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
> -	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
> +	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
>  #endif
>  #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
> -	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
> +	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
>  #endif
> -	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
> -	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
> -		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
> +	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
> +	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
> +		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
>  			DRV_LOG(DEBUG, "enhanced MPW is supported");
>  			mps = MLX5_MPW_ENHANCED;
>  		} else {
> @@ -762,14 +762,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		mps = MLX5_MPW_DISABLED;
>  	}
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
> -	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
> -		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
> +	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
> +		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
>  	DRV_LOG(DEBUG, "SWP support: %u", swp);  #endif  #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
> -	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
> +	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
>  		struct mlx5dv_striding_rq_caps mprq_caps =
> -			attrs_out.striding_rq_caps;
> +			dv_attr.striding_rq_caps;
> 
>  		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
>  			mprq_caps.min_single_stride_log_num_of_bytes);
> @@ -794,15 +794,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	}
>  #endif
>  	if (RTE_CACHE_LINE_SIZE == 128 &&
> -	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
> +	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
>  		cqe_comp = 0;
>  	else
>  		cqe_comp = 1;
>  #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
> -	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
> -		tunnel_en = ((attrs_out.tunnel_offloads_caps &
> +	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
> +		tunnel_en = ((dv_attr.tunnel_offloads_caps &
>  			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
> -			     (attrs_out.tunnel_offloads_caps &
> +			     (dv_attr.tunnel_offloads_caps &
>  			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
>  	}
>  	DRV_LOG(DEBUG, "tunnel offloading is %ssupported", @@ -812,9 +812,9 @@ mlx5_pci_probe(struct
> rte_pci_driver *pci_drv __rte_unused,
>  		"tunnel offloading disabled due to old OFED/rdma-core version");  #endif  #ifdef
> HAVE_IBV_DEVICE_MPLS_SUPPORT
> -	mpls_en = ((attrs_out.tunnel_offloads_caps &
> +	mpls_en = ((dv_attr.tunnel_offloads_caps &
>  		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
> -		   (attrs_out.tunnel_offloads_caps &
> +		   (dv_attr.tunnel_offloads_caps &
>  		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
>  	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
>  		mpls_en ? "" : "not ");
> --
> 2.11.0

Reviewed-by: Xueming Li <xuemingl@mellanox.com>


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects " Adrien Mazarguil
@ 2018-06-16  8:27     ` Xueming(Steven) Li
  2018-06-17 10:14     ` Shahaf Shuler
  1 sibling, 0 replies; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-16  8:27 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev



> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
> 
> This patch gets rid of redundant calls to open the device and query its attributes in order to
> simplify the code.
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Minor indent fix on existing code.
> ---
>  drivers/net/mlx5/mlx5.c | 64 +++++++++++++++++++++-----------------------
>  1 file changed, 30 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 3bdcb3970..1a5391e63 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -654,10 +654,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,  {
>  	struct ibv_device **list = NULL;
>  	struct ibv_device *ibv_dev;
> +	struct ibv_context *ctx = NULL;
> +	struct ibv_device_attr_ex attr;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
>  	int err = 0;
> -	struct ibv_context *attr_ctx = NULL;
> -	struct ibv_device_attr_ex device_attr;
>  	unsigned int vf = 0;
>  	unsigned int mps;
>  	unsigned int cqe_comp;
> @@ -714,12 +714,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
>  		      (pci_dev->id.device_id ==
>  		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> -		attr_ctx = mlx5_glue->open_device(list[i]);
> +		ctx = mlx5_glue->open_device(list[i]);
>  		rte_errno = errno;
>  		err = rte_errno;
>  		break;
>  	}
> -	if (attr_ctx == NULL) {
> +	if (ctx == NULL) {
>  		switch (err) {
>  		case 0:
>  			DRV_LOG(ERR,
> @@ -748,7 +748,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,  #ifdef
> HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
>  	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;  #endif
> -	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
> +	mlx5_glue->dv_query_device(ctx, &dv_attr);
>  	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
>  		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
>  			DRV_LOG(DEBUG, "enhanced MPW is supported"); @@ -822,23 +822,20 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
>  		" old OFED/rdma-core version or firmware configuration");  #endif
> -	err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
> +	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
>  	if (err) {
>  		DEBUG("ibv_query_device_ex() failed");
>  		goto error;
>  	}
> -	DRV_LOG(INFO, "%u port(s) detected",
> -		device_attr.orig_attr.phys_port_cnt);
> -	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
> +	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> +	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
>  		char name[RTE_ETH_NAME_MAX_LEN];
>  		int len;
>  		uint32_t port = i + 1; /* ports are indexed from one */
> -		struct ibv_context *ctx = NULL;
>  		struct ibv_port_attr port_attr;
>  		struct ibv_pd *pd = NULL;
>  		struct priv *priv = NULL;
>  		struct rte_eth_dev *eth_dev = NULL;
> -		struct ibv_device_attr_ex device_attr_ex;
>  		struct ether_addr mac;
>  		struct mlx5_dev_config config = {
>  			.cqe_comp = cqe_comp,
> @@ -865,7 +862,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
>  			 pci_dev->addr.domain, pci_dev->addr.bus,
>  			 pci_dev->addr.devid, pci_dev->addr.function);
> -		if (device_attr.orig_attr.phys_port_cnt > 1)
> +		if (attr.orig_attr.phys_port_cnt > 1)
>  			snprintf(name + len, sizeof(name), " port %u", i);
>  		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  			eth_dev = rte_eth_dev_attach_secondary(name);
> @@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			continue;
>  		}
>  		DRV_LOG(DEBUG, "using port %u", port);
> -		ctx = mlx5_glue->open_device(ibv_dev);
> +		if (!ctx)
> +			ctx = mlx5_glue->open_device(ibv_dev);
>  		if (ctx == NULL) {
>  			err = ENODEV;
>  			goto port_error;
> @@ -949,7 +947,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		priv->ctx = ctx;
>  		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>  			sizeof(priv->ibdev_path));
> -		priv->device_attr = device_attr;
> +		priv->device_attr = attr;
>  		priv->port = port;
>  		priv->pd = pd;
>  		priv->mtu = ETHER_MTU;
> @@ -960,17 +958,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				strerror(rte_errno));
>  			goto port_error;
>  		}
> -		err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
> -		if (err) {
> -			DRV_LOG(ERR, "ibv_query_device_ex() failed");
> -			goto port_error;
> -		}
> -		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
> +		config.hw_csum = !!(attr.device_cap_flags_ex &
>  				    IBV_DEVICE_RAW_IP_CSUM);
>  		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>  			(config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -		config.flow_counter_en = !!(device_attr.max_counter_sets);
> +		config.flow_counter_en = !!attr.max_counter_sets;
>  		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
>  		DRV_LOG(DEBUG,
>  			"counter type = %d, num of cs = %ld, attributes = %d", @@ -978,7 +971,7 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			cs_desc.attributes);
>  #endif
>  		config.ind_table_max_size =
> -			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
> +			attr.rss_caps.max_rwq_indirection_table_size;
>  		/* Remove this check once DPDK supports larger/variable
>  		 * indirection tables. */
>  		if (config.ind_table_max_size >
> @@ -986,29 +979,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
>  		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>  			config.ind_table_max_size);
> -		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
> +		config.hw_vlan_strip = !!(attr.raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>  		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>  			(config.hw_vlan_strip ? "" : "not "));
> 
> -		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
> +		config.hw_fcs_strip = !!(attr.raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
>  			(config.hw_fcs_strip ? "" : "not "));
> 
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -		config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
> +		config.hw_padding = !!attr.rx_pad_end_addr_align;
>  #endif
>  		DRV_LOG(DEBUG,
>  			"hardware Rx end alignment padding is %ssupported",
>  			(config.hw_padding ? "" : "not "));
>  		config.vf = vf;
> -		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
> -			      (device_attr_ex.tso_caps.supported_qpts &
> -			      (1 << IBV_QPT_RAW_PACKET)));
> +		config.tso = (attr.tso_caps.max_tso > 0 &&
> +			      (attr.tso_caps.supported_qpts &
> +			       (1 << IBV_QPT_RAW_PACKET)));
>  		if (config.tso)
> -			config.tso_max_payload_sz =
> -					device_attr_ex.tso_caps.max_tso;
> +			config.tso_max_payload_sz = attr.tso_caps.max_tso;
>  		if (config.mps && !mps) {
>  			DRV_LOG(ERR,
>  				"multi-packet send not supported on this device"
> @@ -1168,14 +1160,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				 priv, mem_event_cb);
>  		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  		rte_eth_dev_probing_finish(eth_dev);
> +		/*
> +		 * Each eth_dev instance is assigned its own Verbs context,
> +		 * since this one is consumed, let the next iteration open
> +		 * another.
> +		 */
> +		ctx = NULL;
>  		continue;
>  port_error:
>  		if (priv)
>  			rte_free(priv);
>  		if (pd)
>  			claim_zero(mlx5_glue->dealloc_pd(pd));
> -		if (ctx)
> -			claim_zero(mlx5_glue->close_device(ctx));
>  		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
>  			rte_eth_dev_release_port(eth_dev);
>  		break;
> @@ -1187,8 +1183,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	 * way to enumerate the registered ethdevs to free the previous ones.
>  	 */
>  error:
> -	if (attr_ctx)
> -		claim_zero(mlx5_glue->close_device(attr_ctx));
> +	if (ctx)
> +		claim_zero(mlx5_glue->close_device(ctx));
>  	if (list)
>  		mlx5_glue->free_device_list(list);
>  	if (err) {
> --
> 2.11.0

Reviewed-by: Xueming Li <xuemingl@mellanox.com>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code Adrien Mazarguil
@ 2018-06-16  8:29     ` Xueming(Steven) Li
  2018-06-17 10:14     ` Shahaf Shuler
  1 sibling, 0 replies; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-16  8:29 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Reviewed-by: Xueming Li <xuemingl@mellanox.com>

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
> 
> All the generic probing code needs is an IB device. While this device is currently supplied by a PCI
> lookup, other methods will be added soon.
> 
> This patch divides the original function, which has become huge over time, as follows:
> 
> 1. PCI-specific (mlx5_pci_probe()).
> 2. All ports of a Verbs device (mlx5_dev_spawn()).
> 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Fixed device naming. A port suffix is now appended only if several IB
>   ports happen to be detected.
> - Added separate message to distinguish missing kernel drivers from other
>   initialization errors, as it was confusing.
> ---
>  drivers/net/mlx5/mlx5.c | 340 ++++++++++++++++++++++++++-----------------
>  1 file changed, 209 insertions(+), 131 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 1a5391e63..01dcf25b9 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -635,30 +635,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)  }
> 
>  /**
> - * DPDK callback to register a PCI device.
> - *
> - * This function creates an Ethernet device for each port of a given
> - * PCI device.
> + * Spawn an Ethernet device from Verbs information.
>   *
> - * @param[in] pci_drv
> - *   PCI driver structure (mlx5_driver).
> - * @param[in] pci_dev
> - *   PCI device information.
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + * @param[in] attr
> + *   Verbs device attributes.
> + * @param port
> + *   Verbs port to use (indexed from 1).
>   *
>   * @return
> - *   0 on success, a negative errno value otherwise and rte_errno is set.
> + *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> + *   is set.
>   */
> -static int
> -mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> -	       struct rte_pci_device *pci_dev)
> +static struct rte_eth_dev *
> +mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> +		   struct ibv_device *ibv_dev,
> +		   int vf,
> +		   const struct ibv_device_attr_ex *attr,
> +		   unsigned int port)
>  {
> -	struct ibv_device **list = NULL;
> -	struct ibv_device *ibv_dev;
> -	struct ibv_context *ctx = NULL;
> -	struct ibv_device_attr_ex attr;
> +	struct ibv_context *ctx;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
> +	struct rte_eth_dev *eth_dev = NULL;
>  	int err = 0;
> -	unsigned int vf = 0;
>  	unsigned int mps;
>  	unsigned int cqe_comp;
>  	unsigned int tunnel_en = 0;
> @@ -670,71 +674,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	unsigned int mprq_max_stride_size_n = 0;
>  	unsigned int mprq_min_stride_num_n = 0;
>  	unsigned int mprq_max_stride_num_n = 0;
> -	int i;
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
>  	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };  #endif
> 
>  	/* Prepare shared data between primary and secondary process. */
>  	mlx5_prepare_shared_data();
> -	assert(pci_drv == &mlx5_driver);
> -	list = mlx5_glue->get_device_list(&i);
> -	if (list == NULL) {
> -		assert(errno);
> -		err = errno;
> -		if (errno == ENOSYS)
> -			DRV_LOG(ERR,
> -				"cannot list devices, is ib_uverbs loaded?");
> -		goto error;
> -	}
> -	assert(i >= 0);
> -	/*
> -	 * For each listed device, check related sysfs entry against
> -	 * the provided PCI ID.
> -	 */
> -	while (i != 0) {
> -		struct rte_pci_addr pci_addr;
> -
> -		--i;
> -		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
> -		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
> -			continue;
> -		if ((pci_dev->addr.domain != pci_addr.domain) ||
> -		    (pci_dev->addr.bus != pci_addr.bus) ||
> -		    (pci_dev->addr.devid != pci_addr.devid) ||
> -		    (pci_dev->addr.function != pci_addr.function))
> -			continue;
> -		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> -			list[i]->name);
> -		vf = ((pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> -		ctx = mlx5_glue->open_device(list[i]);
> -		rte_errno = errno;
> -		err = rte_errno;
> -		break;
> -	}
> -	if (ctx == NULL) {
> -		switch (err) {
> -		case 0:
> -			DRV_LOG(ERR,
> -				"cannot access device, is mlx5_ib loaded?");
> -			err = ENODEV;
> -			break;
> -		case EINVAL:
> -			DRV_LOG(ERR,
> -				"cannot use device, are drivers up to date?");
> -			break;
> -		}
> -		goto error;
> +	errno = 0;
> +	ctx = mlx5_glue->open_device(ibv_dev);
> +	if (!ctx) {
> +		rte_errno = errno ? errno : ENODEV;
> +		return NULL;
>  	}
> -	ibv_dev = list[i];
> -	DRV_LOG(DEBUG, "device opened");
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
>  	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;  #endif @@ -822,20 +773,11 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
>  		" old OFED/rdma-core version or firmware configuration");  #endif
> -	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> -	if (err) {
> -		DEBUG("ibv_query_device_ex() failed");
> -		goto error;
> -	}
> -	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> -	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
> +	{
>  		char name[RTE_ETH_NAME_MAX_LEN];
> -		int len;
> -		uint32_t port = i + 1; /* ports are indexed from one */
>  		struct ibv_port_attr port_attr;
>  		struct ibv_pd *pd = NULL;
>  		struct priv *priv = NULL;
> -		struct rte_eth_dev *eth_dev = NULL;
>  		struct ether_addr mac;
>  		struct mlx5_dev_config config = {
>  			.cqe_comp = cqe_comp,
> @@ -859,11 +801,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			},
>  		};
> 
> -		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
> -			 pci_dev->addr.domain, pci_dev->addr.bus,
> -			 pci_dev->addr.devid, pci_dev->addr.function);
> -		if (attr.orig_attr.phys_port_cnt > 1)
> -			snprintf(name + len, sizeof(name), " port %u", i);
> +		if (attr->orig_attr.phys_port_cnt > 1)
> +			snprintf(name, sizeof(name), "%s port %u",
> +				 dpdk_dev->name, port);
> +		else
> +			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
>  		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  			eth_dev = rte_eth_dev_attach_secondary(name);
>  			if (eth_dev == NULL) {
> @@ -872,7 +814,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				err = rte_errno;
>  				goto error;
>  			}
> -			eth_dev->device = &pci_dev->device;
> +			eth_dev->device = dpdk_dev;
>  			eth_dev->dev_ops = &mlx5_dev_sec_ops;
>  			err = mlx5_uar_init_secondary(eth_dev);
>  			if (err) {
> @@ -900,16 +842,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				mlx5_select_rx_function(eth_dev);
>  			eth_dev->tx_pkt_burst =
>  				mlx5_select_tx_function(eth_dev);
> -			rte_eth_dev_probing_finish(eth_dev);
> -			continue;
> +			mlx5_glue->close_device(ctx);
> +			return eth_dev;
>  		}
>  		DRV_LOG(DEBUG, "using port %u", port);
> -		if (!ctx)
> -			ctx = mlx5_glue->open_device(ibv_dev);
> -		if (ctx == NULL) {
> -			err = ENODEV;
> -			goto port_error;
> -		}
>  		/* Check port status. */
>  		err = mlx5_glue->query_port(ctx, port, &port_attr);
>  		if (err) {
> @@ -947,23 +883,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		priv->ctx = ctx;
>  		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>  			sizeof(priv->ibdev_path));
> -		priv->device_attr = attr;
> +		priv->device_attr = *attr;
>  		priv->port = port;
>  		priv->pd = pd;
>  		priv->mtu = ETHER_MTU;
> -		err = mlx5_args(&config, pci_dev->device.devargs);
> +		err = mlx5_args(&config, dpdk_dev->devargs);
>  		if (err) {
>  			err = rte_errno;
>  			DRV_LOG(ERR, "failed to process device arguments: %s",
>  				strerror(rte_errno));
>  			goto port_error;
>  		}
> -		config.hw_csum = !!(attr.device_cap_flags_ex &
> +		config.hw_csum = !!(attr->device_cap_flags_ex &
>  				    IBV_DEVICE_RAW_IP_CSUM);
>  		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>  			(config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -		config.flow_counter_en = !!attr.max_counter_sets;
> +		config.flow_counter_en = !!attr->max_counter_sets;
>  		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
>  		DRV_LOG(DEBUG,
>  			"counter type = %d, num of cs = %ld, attributes = %d", @@ -971,7 +907,7 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			cs_desc.attributes);
>  #endif
>  		config.ind_table_max_size =
> -			attr.rss_caps.max_rwq_indirection_table_size;
> +			attr->rss_caps.max_rwq_indirection_table_size;
>  		/* Remove this check once DPDK supports larger/variable
>  		 * indirection tables. */
>  		if (config.ind_table_max_size >
> @@ -979,28 +915,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
>  		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>  			config.ind_table_max_size);
> -		config.hw_vlan_strip = !!(attr.raw_packet_caps &
> +		config.hw_vlan_strip = !!(attr->raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>  		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>  			(config.hw_vlan_strip ? "" : "not "));
> 
> -		config.hw_fcs_strip = !!(attr.raw_packet_caps &
> +		config.hw_fcs_strip = !!(attr->raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
>  			(config.hw_fcs_strip ? "" : "not "));
> 
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -		config.hw_padding = !!attr.rx_pad_end_addr_align;
> +		config.hw_padding = !!attr->rx_pad_end_addr_align;
>  #endif
>  		DRV_LOG(DEBUG,
>  			"hardware Rx end alignment padding is %ssupported",
>  			(config.hw_padding ? "" : "not "));
>  		config.vf = vf;
> -		config.tso = (attr.tso_caps.max_tso > 0 &&
> -			      (attr.tso_caps.supported_qpts &
> +		config.tso = (attr->tso_caps.max_tso > 0 &&
> +			      (attr->tso_caps.supported_qpts &
>  			       (1 << IBV_QPT_RAW_PACKET)));
>  		if (config.tso)
> -			config.tso_max_payload_sz = attr.tso_caps.max_tso;
> +			config.tso_max_payload_sz = attr->tso_caps.max_tso;
>  		if (config.mps && !mps) {
>  			DRV_LOG(ERR,
>  				"multi-packet send not supported on this device"
> @@ -1041,8 +977,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		eth_dev->data->dev_private = priv;
>  		priv->dev_data = eth_dev->data;
>  		eth_dev->data->mac_addrs = priv->mac;
> -		eth_dev->device = &pci_dev->device;
> -		rte_eth_copy_pci_info(eth_dev, pci_dev);
> +		eth_dev->device = dpdk_dev;
>  		eth_dev->device->driver = &mlx5_driver.driver;
>  		err = mlx5_uar_init_primary(eth_dev);
>  		if (err) {
> @@ -1160,13 +1095,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				 priv, mem_event_cb);
>  		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  		rte_eth_dev_probing_finish(eth_dev);
> -		/*
> -		 * Each eth_dev instance is assigned its own Verbs context,
> -		 * since this one is consumed, let the next iteration open
> -		 * another.
> -		 */
> -		ctx = NULL;
> -		continue;
> +		return eth_dev;
>  port_error:
>  		if (priv)
>  			rte_free(priv);
> @@ -1174,24 +1103,173 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			claim_zero(mlx5_glue->dealloc_pd(pd));
>  		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
>  			rte_eth_dev_release_port(eth_dev);
> -		break;
>  	}
> -	/*
> -	 * XXX if something went wrong in the loop above, there is a resource
> -	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
> -	 * long as the dpdk does not provide a way to deallocate a ethdev and a
> -	 * way to enumerate the registered ethdevs to free the previous ones.
> -	 */
>  error:
>  	if (ctx)
>  		claim_zero(mlx5_glue->close_device(ctx));
> -	if (list)
> -		mlx5_glue->free_device_list(list);
> -	if (err) {
> -		rte_errno = err;
> +	assert(err > 0);
> +	rte_errno = err;
> +	return NULL;
> +}
> +
> +/**
> + * Spawn Ethernet devices from Verbs information, one per detected port.
> + *
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + *
> + * @return
> + *   A NULL-terminated list of Ethernet device objects on success, NULL
> + *   otherwise and rte_errno is set. Caller is expected to release list
> + *   memory through free().
> + */
> +static struct rte_eth_dev **
> +mlx5_dev_spawn(struct rte_device *dpdk_dev,
> +	       struct ibv_device *ibv_dev,
> +	       int vf)
> +{
> +	struct rte_eth_dev **eth_list = NULL;
> +	struct ibv_context *ctx;
> +	struct ibv_device_attr_ex attr;
> +	unsigned int i;
> +	int ret;
> +
> +	errno = 0;
> +	ctx = mlx5_glue->open_device(ibv_dev);
> +	if (!ctx) {
> +		rte_errno = errno ? errno : ENODEV;
> +		if (rte_errno == ENODEV)
> +			DRV_LOG(ERR,
> +				"cannot access device, is mlx5_ib loaded?");
> +		else
> +			DRV_LOG(ERR,
> +				"cannot use device, are drivers up to date?");
> +		return NULL;
> +	}
> +	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> +	mlx5_glue->close_device(ctx);
> +	if (ret) {
> +		rte_errno = ret;
> +		DRV_LOG(ERR, "unable to query device information: %s",
> +			strerror(rte_errno));
> +		return NULL;
> +	}
> +	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> +	eth_list = malloc(sizeof(*eth_list) *
> +			  (attr.orig_attr.phys_port_cnt + 1));
> +	if (!eth_list) {
> +		rte_errno = errno;
> +		return NULL;
> +	}
> +	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> +		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> +						 &attr, i + 1);
> +		if (eth_list[i])
> +			continue;
> +		/* Save rte_errno and roll back in case of failure. */
> +		ret = rte_errno;
> +		while (i--) {
> +			mlx5_dev_close(eth_list[i]);
> +			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +				rte_free(eth_list[i]->data->dev_private);
> +			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> +		}
> +		free(eth_list);
> +		rte_errno = ret;
> +		return NULL;
> +	}
> +	eth_list[i] = NULL;
> +	return eth_list;
> +}
> +
> +/**
> + * DPDK callback to register a PCI device.
> + *
> + * This function creates an Ethernet device for each port of a given
> + * PCI device.
> + *
> + * @param[in] pci_drv
> + *   PCI driver structure (mlx5_driver).
> + * @param[in] pci_dev
> + *   PCI device information.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> +	       struct rte_pci_device *pci_dev) {
> +	struct ibv_device **ibv_list;
> +	struct rte_eth_dev **eth_list = NULL;
> +	int vf;
> +	int ret;
> +
> +	assert(pci_drv == &mlx5_driver);
> +	switch (pci_dev->id.device_id) {
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> +		vf = 1;
> +		break;
> +	default:
> +		vf = 0;
> +	}
> +	errno = 0;
> +	ibv_list = mlx5_glue->get_device_list(&ret);
> +	if (!ibv_list) {
> +		rte_errno = errno ? errno : ENOSYS;
> +		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
>  		return -rte_errno;
>  	}
> -	return 0;
> +	while (ret-- > 0) {
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
> +		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> +			continue;
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> +			ibv_list[ret]->name);
> +		break;
> +	}
> +	if (ret >= 0)
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
> +	mlx5_glue->free_device_list(ibv_list);
> +	if (!ret) {
> +		DRV_LOG(WARNING,
> +			"no Verbs device matches PCI device " PCI_PRI_FMT ","
> +			" are kernel drivers loaded?",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function);
> +		rte_errno = ENOENT;
> +		ret = -rte_errno;
> +	} else if (!eth_list || !*eth_list) {
> +		DRV_LOG(ERR,
> +			"probe of PCI device " PCI_PRI_FMT " aborted after"
> +			" encountering an error: %s",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function,
> +			strerror(rte_errno));
> +		ret = -rte_errno;
> +	} else {
> +		for (ret = 0; eth_list[ret]; ++ret) {
> +			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +			rte_eth_dev_probing_finish(eth_list[ret]);
> +		}
> +		ret = 0;
> +	}
> +	free(eth_list);
> +	return ret;
>  }
> 
>  static const struct rte_pci_id mlx5_pci_id_map[] = {
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness Adrien Mazarguil
@ 2018-06-16  8:37     ` Xueming(Steven) Li
  2018-06-27 13:32       ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-16  8:37 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Reviewed-by: Xueming Li <xuemingl@mellanox.com>

One minor issue we should be able to ignore.

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness
> 
> The current PCI probing method is not aware of Verbs port representors, which appear as standard Verbs
> devices bound to the same PCI address and cannot be distinguished.
> 
> Problem is that more often than not, the wrong Verbs device is used, resulting in unexpected traffic.
> 
> This patch adds necessary heuristics to bind affected driver instances to the intended (i.e. non-
> representor) device.
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Fixed digit detection in mlx5_cmp_ibv_name() so that "foo1" and "foo10"
>   are compared on the integer conversion of "1" against "10" instead of ""
>   and "0".
> ---
>  drivers/net/mlx5/mlx5.c | 66 ++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 61 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index c9815d721..498f80c89 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -3,6 +3,7 @@
>   * Copyright 2015 Mellanox Technologies, Ltd
>   */
> 
> +#include <ctype.h>
>  #include <stddef.h>
>  #include <unistd.h>
>  #include <string.h>
> @@ -1170,6 +1171,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,  }
> 
>  /**
> + * Comparison callback to sort Verbs device names.
> + *
> + * This is meant to be used with qsort().
> + *
> + * @param a[in]
> + *   Pointer to pointer to first Verbs device.
> + * @param b[in]
> + *   Pointer to pointer to second Verbs device.
> + *
> + * @return
> + *   0 if both names are equal, less than 0 if the first argument is less
> + *   than the second, greater than 0 otherwise.
> + */
> +static int
> +mlx5_cmp_ibv_name(const void *a, const void *b) {
> +	const char *name_a = (*(const struct ibv_device *const *)a)->name;
> +	const char *name_b = (*(const struct ibv_device *const *)b)->name;
> +	size_t i = 0;
> +
> +	while (name_a[i] && name_a[i] == name_b[i])
> +		++i;
> +	while (i && isdigit(name_a[i - 1]) && isdigit(name_b[i - 1]))

name_a[i - 1] and name_b[i - 1] must be same here.

> +		--i;
> +	return atoi(name_a + i) - atoi(name_b + i); }
> +
> +/**
>   * DPDK callback to register a PCI device.
>   *
>   * This function creates an Ethernet device for each port of a given @@ -1189,6 +1218,7 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,  {
>  	struct ibv_device **ibv_list;
>  	struct rte_eth_dev **eth_list = NULL;
> +	int n = 0;
>  	int vf;
>  	int ret;
> 
> @@ -1210,6 +1240,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
>  		return -rte_errno;
>  	}
> +
> +	struct ibv_device *ibv_match[ret + 1];
> +
>  	while (ret-- > 0) {
>  		struct rte_pci_addr pci_addr;
> 
> @@ -1221,14 +1254,37 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		    pci_dev->addr.devid != pci_addr.devid ||
>  		    pci_dev->addr.function != pci_addr.function)
>  			continue;
> -		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> +		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
>  			ibv_list[ret]->name);
> -		break;
> +		ibv_match[n++] = ibv_list[ret];
> +	}
> +	ibv_match[n] = NULL;
> +	if (n > 1) {
> +		/*
> +		 * The existence of several matching entries means port
> +		 * representors have been instantiated. No existing Verbs
> +		 * call nor /sys entries can tell them apart at this point.
> +		 *
> +		 * While definitely hackish, assume their names are numbered
> +		 * based on order of creation with master device first,
> +		 * followed by first port representor, followed by the
> +		 * second one and so on.
> +		 */
> +		DRV_LOG(WARNING,
> +			"probing device with port representors involves"
> +			" heuristics with uncertain outcome");
> +		qsort(ibv_match, n, sizeof(*ibv_match), mlx5_cmp_ibv_name);
> +		DRV_LOG(WARNING, "assuming \"%s\" is the master device",
> +			ibv_match[0]->name);
> +		for (ret = 1; ret < n; ++ret)
> +			DRV_LOG(WARNING,
> +				"assuming \"%s\" is port representor #%d",
> +				ibv_match[ret]->name, ret - 1);
>  	}
> -	if (ret >= 0)
> -		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
> +	if (n)
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
>  	mlx5_glue->free_device_list(ibv_list);
> -	if (!ret) {
> +	if (!n) {
>  		DRV_LOG(WARNING,
>  			"no Verbs device matches PCI device " PCI_PRI_FMT ","
>  			" are kernel drivers loaded?",
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-14  8:35   ` [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors Adrien Mazarguil
@ 2018-06-16  8:57     ` Xueming(Steven) Li
  2018-06-17 10:15       ` Shahaf Shuler
  2018-06-27 13:32       ` Adrien Mazarguil
  0 siblings, 2 replies; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-16  8:57 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Reviewed-by: Xueming Li <xuemingl@mellanox.com>

Minor comments inside:

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> Probe existing port representors in addition to their master device and associate them automatically.
> 
> To avoid name collision between Ethernet devices, their names use the same convention as ixgbe and
> i40e PMDs, that is, instead of only a PCI address in DBDF notation:
> 
> - "net_{DBDF}_0" for master/switch devices.
> - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
>   representors.
> 
> Both optionally suffixed with "_port_{num}" instead of " port {num}" for devices that expose several
> Verbs ports (note this is never the case on mlx5, but kept for historical reasons for the time being).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Added representor information to dev_infos_get(). DPDK port ID of master
>   device is now stored in the private structure to retrieve it
>   conveniently.
> - Master device is assigned dummy representor ID value -1 to better
>   distinguish from the the first actual representor reported by
>   dev_infos_get() as those are indexed from 0.
> - Added RTE_ETH_DEV_REPRESENTOR device flag.
> ---
>  drivers/net/mlx5/mlx5.c        | 138 ++++++++++++++++++++++++--------
>  drivers/net/mlx5/mlx5.h        |   9 ++-
>  drivers/net/mlx5/mlx5_ethdev.c | 151 ++++++++++++++++++++++++++++++++----
>  drivers/net/mlx5/mlx5_mac.c    |   2 +-
>  drivers/net/mlx5/mlx5_stats.c  |   6 +-
>  5 files changed, 252 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 498f80c89..716c9d9a5 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -304,6 +304,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
>  	if (ret)
>  		DRV_LOG(WARNING, "port %u some flows still remain",
>  			dev->data->port_id);
> +	if (!priv->representor &&
> +	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> +		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
>  	memset(priv, 0, sizeof(*priv));
>  }
> 
> @@ -648,6 +651,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
>   *   Verbs device attributes.
>   * @param port
>   *   Verbs port to use (indexed from 1).
> + * @param master
> + *   Master device in case @p ibv_dev is a port representor.
> + * @param rep_id
> + *   Representor identifier when @p master is non-NULL.
>   *
>   * @return
>   *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> @@ -658,7 +665,9 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		   struct ibv_device *ibv_dev,
>  		   int vf,
>  		   const struct ibv_device_attr_ex *attr,
> -		   unsigned int port)
> +		   unsigned int port,
> +		   struct rte_eth_dev *master,
> +		   unsigned int rep_id)
>  {
>  	struct ibv_context *ctx;
>  	struct ibv_port_attr port_attr;
> @@ -802,11 +811,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		" old OFED/rdma-core version or firmware configuration");  #endif
>  	config.mpls_en = mpls_en;
> -	if (attr->orig_attr.phys_port_cnt > 1)
> -		snprintf(name, sizeof(name), "%s port %u",
> -			 dpdk_dev->name, port);
> +	if (!master)
> +		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name);
>  	else
> -		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> +		snprintf(name, sizeof(name), "net_%s_representor_%u",
> +			 dpdk_dev->name, rep_id);
> +	if (attr->orig_attr.phys_port_cnt > 1)
> +		snprintf(name, sizeof(name), "%s_port_%u", name, port);
> +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
>  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  		eth_dev = rte_eth_dev_attach_secondary(name);
>  		if (eth_dev == NULL) {
> @@ -883,6 +895,30 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  	priv->port = port;
>  	priv->pd = pd;
>  	priv->mtu = ETHER_MTU;
> +	/*
> +	 * Allocate a switch domain for master devices and share it with
> +	 * port representors.
> +	 */
> +	if (!master) {
> +		priv->representor = 0;
> +		priv->master_id = -1; /* Updated once known. */
> +		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;

Domain_id will override below.

> +		priv->rep_id = -1; /* Dummy unique value. */
> +		err = rte_eth_switch_domain_alloc(&priv->domain_id);
> +		if (err) {
> +			err = rte_errno;
> +			DRV_LOG(ERR, "unable to allocate switch domain: %s",
> +				strerror(rte_errno));
> +			goto error;
> +		}
> +	} else {
> +		priv->representor = 1;
> +		priv->master_id =
> +			((struct priv *)master->data->dev_private)->master_id;
> +		priv->domain_id =
> +			((struct priv *)master->data->dev_private)->domain_id;
> +		priv->rep_id = rep_id;
> +	}

Do you think such information should be set as well in secondary process?

>  	err = mlx5_args(&config, dpdk_dev->devargs);
>  	if (err) {
>  		err = rte_errno;
> @@ -964,6 +1000,18 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		err = ENOMEM;
>  		goto error;
>  	}
> +	/*
> +	 * Now that eth_dev is allocated and its port ID is known, make
> +	 * non-representor ports target their own port ID as master for
> +	 * convenience.
> +	 *
> +	 * Master port ID is already set for actual representors. Those only
> +	 * need the right device flag.
> +	 */
> +	if (!master)
> +		priv->master_id = eth_dev->data->port_id;
> +	else
> +		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
>  	eth_dev->data->dev_private = priv;
>  	priv->dev_data = eth_dev->data;
>  	eth_dev->data->mac_addrs = priv->mac;
> @@ -1083,8 +1131,12 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  	return eth_dev;
>  error:
> -	if (priv)
> +	if (priv) {
> +		if (!priv->representor &&
> +		    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> +			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
>  		rte_free(priv);
> +	}
>  	if (pd)
>  		claim_zero(mlx5_glue->dealloc_pd(pd));
>  	if (eth_dev)
> @@ -1097,12 +1149,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,  }
> 
>  /**
> - * Spawn Ethernet devices from Verbs information, one per detected port.
> + * Spawn Ethernet devices from Verbs information, one per detected port
> + and
> + * port representor.
>   *
>   * @param dpdk_dev
>   *   Backing DPDK device.
>   * @param ibv_dev
> - *   Verbs device.
> + *   NULL-terminated list of Verbs devices. First entry is the master device
> + *   (mandatory), followed by optional representors.
>   * @param vf
>   *   If nonzero, enable VF-specific features.
>   *
> @@ -1113,17 +1167,21 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>   */
>  static struct rte_eth_dev **
>  mlx5_dev_spawn(struct rte_device *dpdk_dev,
> -	       struct ibv_device *ibv_dev,
> +	       struct ibv_device **ibv_dev,
>  	       int vf)
>  {
>  	struct rte_eth_dev **eth_list = NULL;
>  	struct ibv_context *ctx;
>  	struct ibv_device_attr_ex attr;
> +	void *tmp;
>  	unsigned int i;
> +	unsigned int j = 0;
> +	unsigned int n = 0;
>  	int ret;
> 
> +next:
>  	errno = 0;
> -	ctx = mlx5_glue->open_device(ibv_dev);
> +	ctx = mlx5_glue->open_device(ibv_dev[j]);
>  	if (!ctx) {
>  		rte_errno = errno ? errno : ENODEV;
>  		if (rte_errno == ENODEV)
> @@ -1132,7 +1190,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		else
>  			DRV_LOG(ERR,
>  				"cannot use device, are drivers up to date?");
> -		return NULL;
> +		goto error;
>  	}
>  	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
>  	mlx5_glue->close_device(ctx);
> @@ -1140,34 +1198,42 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		rte_errno = ret;
>  		DRV_LOG(ERR, "unable to query device information: %s",
>  			strerror(rte_errno));
> -		return NULL;
> +		goto error;
>  	}
> -	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> -	eth_list = malloc(sizeof(*eth_list) *
> -			  (attr.orig_attr.phys_port_cnt + 1));
> -	if (!eth_list) {
> +	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
> +		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
> +	tmp = realloc(eth_list, sizeof(*eth_list) *
> +		      (n + attr.orig_attr.phys_port_cnt + 1));
> +	if (!tmp) {
>  		rte_errno = errno;
> -		return NULL;
> +		goto error;
>  	}
> +	eth_list = tmp;
>  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> -						 &attr, i + 1);
> -		if (eth_list[i])
> -			continue;
> -		/* Save rte_errno and roll back in case of failure. */
> -		ret = rte_errno;
> -		while (i--) {
> -			mlx5_dev_close(eth_list[i]);
> -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> -				rte_free(eth_list[i]->data->dev_private);
> -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> -		}
> -		free(eth_list);
> -		rte_errno = ret;
> -		return NULL;
> +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j], vf,
> +						 &attr, i + 1,
> +						 j ? eth_list[0] : NULL,
> +						 j - 1);
> +		if (!eth_list[n])
> +			goto error;
> +		++n;
>  	}
> -	eth_list[i] = NULL;
> +	if (ibv_dev[++j])
> +		goto next;
> +	eth_list[n] = NULL;
>  	return eth_list;
> +error:
> +	/* Save rte_errno and roll back in case of failure. */
> +	ret = rte_errno;
> +	while (n--) {
> +		mlx5_dev_close(eth_list[n]);
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +			rte_free(eth_list[n]->data->dev_private);
> +		claim_zero(rte_eth_dev_release_port(eth_list[n]));
> +	}
> +	free(eth_list);
> +	rte_errno = ret;
> +	return NULL;
>  }
> 
>  /**
> @@ -1282,7 +1348,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				ibv_match[ret]->name, ret - 1);
>  	}
>  	if (n)
> -		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match, vf);
>  	mlx5_glue->free_device_list(ibv_list);
>  	if (!n) {
>  		DRV_LOG(WARNING,
> @@ -1302,7 +1368,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		ret = -rte_errno;
>  	} else {
>  		for (ret = 0; eth_list[ret]; ++ret) {
> +			uint32_t restore = eth_list[ret]->data->dev_flags;
> +
>  			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +			/* Restore non-PCI flags cleared by the above call. */
> +			eth_list[ret]->data->dev_flags |= restore;
>  			rte_eth_dev_probing_finish(eth_list[ret]);
>  		}
>  		ret = 0;
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 997b04a33..0fe467140 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -161,6 +161,10 @@ struct priv {
>  	uint16_t mtu; /* Configured MTU. */
>  	uint8_t port; /* Physical port number. */
>  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> +	unsigned int representor:1; /* Device is a port representor. */
> +	uint16_t master_id; /* DPDK port ID of switch domain master. */
> +	uint16_t domain_id; /* Switch domain identifier. */
> +	unsigned int rep_id; /* Port representor identifier. */
>  	/* RX/TX queues. */
>  	unsigned int rxqs_n; /* RX queues array size. */
>  	unsigned int txqs_n; /* TX queues array size. */ @@ -209,9 +213,12 @@ int mlx5_getenv_int(const
> char *);
> 
>  /* mlx5_ethdev.c */
> 
> +int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> +			   char (*ifname)[IF_NAMESIZE]);
>  int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);  int
> mlx5_ifindex(const struct rte_eth_dev *dev); -int mlx5_ifreq(const struct rte_eth_dev *dev, int req,
> struct ifreq *ifr);
> +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> +	       int master);
>  int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);  int mlx5_set_flags(struct rte_eth_dev *dev,
> unsigned int keep,
>  		   unsigned int flags);
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index
> 90488af33..9d579659e 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -93,7 +93,7 @@ struct ethtool_link_settings {  #endif
> 
>  /**
> - * Get interface name from private structure.
> + * Get master interface name from private structure.
>   *
>   * @param[in] dev
>   *   Pointer to Ethernet device.
> @@ -104,7 +104,8 @@ struct ethtool_link_settings {
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
> +mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> +		       char (*ifname)[IF_NAMESIZE])
>  {
>  	struct priv *priv = dev->data->dev_private;
>  	DIR *dir;
> @@ -179,6 +180,113 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])  }
> 
>  /**
> + * Get interface name from private structure.
> + *
> + * This is a port representor-aware version of mlx5_get_master_ifname().
> + *
> + * @param[in] dev
> + *   Pointer to Ethernet device.
> + * @param[out] ifname
> + *   Interface name output buffer.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +int
> +mlx5_get_ifname(const struct rte_eth_dev *dev, char
> +(*ifname)[IF_NAMESIZE]) {
> +	struct priv *priv = dev->data->dev_private;
> +	int ret;
> +	char master[IF_NAMESIZE];
> +	FILE *file;
> +	DIR *dir;
> +	uint64_t phys_switch_id;
> +
> +	if (!priv->representor)
> +		return mlx5_get_master_ifname(dev, ifname);
> +	ret = mlx5_get_master_ifname(dev, &master);
> +	if (ret)
> +		return ret;
> +	{
> +		MKSTR(path, "%s/device/net/%s/phys_switch_id",
> +		      priv->ibdev_path, master);
> +
> +		file = fopen(path, "rb");
> +	}
> +	if (!file) {
> +		rte_errno = errno;
> +		return -rte_errno;
> +	}
> +	ret = fscanf(file, "%" SCNx64, &phys_switch_id);
> +	fclose(file);
> +	if (ret != 1) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	{
> +		MKSTR(path, "%s/device/net/%s/subsystem",
> +		      priv->ibdev_path, master);
> +
> +		dir = opendir(path);
> +	}
> +	if (!dir) {
> +		rte_errno = errno;
> +		return -rte_errno;
> +	}
> +	/*
> +	 * Scan network interfaces to find one with matching phys_switch_id
> +	 * and phys_switch_name.
> +	 */
> +	do {
> +		struct dirent *dent;
> +		uint64_t phys_switch_id_rep;
> +		int rep_id;
> +
> +		ret = -ENOENT;
> +		dent = readdir(dir);
> +		if (!dent)
> +			break;
> +		{
> +			MKSTR(path,
> +			      "%s/device/net/%s/subsystem/%s/phys_switch_id",
> +			      priv->ibdev_path, master, dent->d_name);
> +
> +			file = fopen(path, "rb");
> +		}
> +		if (!file)
> +			continue;
> +		ret = fscanf(file, "%" SCNx64, &phys_switch_id_rep);
> +		fclose(file);
> +		if (ret != 1)
> +			continue;
> +		if (phys_switch_id_rep != phys_switch_id)
> +			continue;
> +		{
> +			MKSTR(path,
> +			      "%s/device/net/%s/subsystem/%s/phys_port_name",
> +			      priv->ibdev_path, master, dent->d_name);
> +
> +			file = fopen(path, "rb");
> +		}
> +		if (!file)
> +			continue;
> +		ret = fscanf(file, "%d", &rep_id);
> +		fclose(file);
> +		if (ret != 1)
> +			continue;
> +		if (rep_id < 0 || (unsigned int)rep_id != priv->rep_id)
> +			continue;
> +		strlcpy(*ifname, dent->d_name, sizeof(*ifname));
> +		ret = 0;
> +		break;
> +	} while (1);
> +	closedir(dir);
> +	if (ret)
> +		rte_errno = -ret;
> +	return ret;
> +}
> +
> +/**
>   * Get the interface index from device name.
>   *
>   * @param[in] dev
> @@ -214,12 +322,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
>   *   Request number to pass to ioctl().
>   * @param[out] ifr
>   *   Interface request structure output buffer.
> + * @param master
> + *   When device is a port representor, perform request on master device
> + *   instead.
>   *
>   * @return
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
> +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> +	   int master)
>  {
>  	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
>  	int ret = 0;
> @@ -228,7 +340,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
>  		rte_errno = errno;
>  		return -rte_errno;
>  	}
> -	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
> +	if (master)
> +		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
> +	else
> +		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
>  	if (ret)
>  		goto error;
>  	ret = ioctl(sock, req, ifr);
> @@ -258,7 +373,7 @@ int
>  mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)  {
>  	struct ifreq request;
> -	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
> +	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
> 
>  	if (ret)
>  		return ret;
> @@ -282,7 +397,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)  {
>  	struct ifreq request = { .ifr_mtu = mtu, };
> 
> -	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
> +	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
>  }
> 
>  /**
> @@ -302,13 +417,13 @@ int
>  mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)  {
>  	struct ifreq request;
> -	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
> +	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
> 
>  	if (ret)
>  		return ret;
>  	request.ifr_flags &= keep;
>  	request.ifr_flags |= flags & ~keep;
> -	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
> +	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
>  }
> 
>  /**
> @@ -477,6 +592,12 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
>  	info->speed_capa = priv->link_speed_capa;
>  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
>  	mlx5_set_default_params(dev, info);
> +	if (rte_eth_dev_is_valid_port(priv->master_id)) {
> +		info->switch_info.name =
> +			rte_eth_devices[priv->master_id].data->name;
> +		info->switch_info.domain_id = priv->domain_id;
> +		info->switch_info.port_id = priv->rep_id;
> +	}
>  }
> 
>  /**
> @@ -540,7 +661,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
>  	int link_speed = 0;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
>  			dev->data->port_id, strerror(rte_errno)); @@ -550,7 +671,7 @@
> mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
>  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
>  				(ifr.ifr_flags & IFF_RUNNING));
>  	ifr.ifr_data = (void *)&edata;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", @@ -611,7 +732,7 @@
> mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
>  	uint64_t sc;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
>  			dev->data->port_id, strerror(rte_errno)); @@ -621,7 +742,7 @@
> mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
>  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
>  				(ifr.ifr_flags & IFF_RUNNING));
>  	ifr.ifr_data = (void *)&gcmd;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(DEBUG,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
> @@ -638,7 +759,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
> 
>  	*ecmd = gcmd;
>  	ifr.ifr_data = (void *)ecmd;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(DEBUG,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
> @@ -801,7 +922,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
>  	int ret;
> 
>  	ifr.ifr_data = (void *)&ethpause;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
> @@ -854,7 +975,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
>  		ethpause.tx_pause = 1;
>  	else
>  		ethpause.tx_pause = 0;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
> diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index 672a47619..12ee37f55
> 100644
> --- a/drivers/net/mlx5/mlx5_mac.c
> +++ b/drivers/net/mlx5/mlx5_mac.c
> @@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
>  	struct ifreq request;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
> +	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
>  	if (ret)
>  		return ret;
>  	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); diff --git
> a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index 875dd1027..91f3d474a 100644
> --- a/drivers/net/mlx5/mlx5_stats.c
> +++ b/drivers/net/mlx5/mlx5_stats.c
> @@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
>  	et_stats->cmd = ETHTOOL_GSTATS;
>  	et_stats->n_stats = xstats_ctrl->stats_n;
>  	ifr.ifr_data = (caddr_t)et_stats;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u unable to read statistic values from device", @@ -194,7 +194,7 @@
> mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
> 
>  	drvinfo.cmd = ETHTOOL_GDRVINFO;
>  	ifr.ifr_data = (caddr_t)&drvinfo;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u unable to query number of statistics",
>  			dev->data->port_id);
> @@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
>  	strings->string_set = ETH_SS_STATS;
>  	strings->len = dev_stats_n;
>  	ifr.ifr_data = (caddr_t)strings;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u unable to get statistic names",
>  			dev->data->port_id);
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 7/7] net/mlx5: add parameter for port representors
  2018-06-14  8:35   ` [dpdk-dev] [PATCH v2 7/7] net/mlx5: add parameter for " Adrien Mazarguil
@ 2018-06-16  8:59     ` Xueming(Steven) Li
  0 siblings, 0 replies; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-16  8:59 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev

Reviewed-by: Xueming Li <xuemingl@mellanox.com>

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 7/7] net/mlx5: add parameter for port representors
> 
> Prior to this patch, all port representors detected on a given device were probed and Ethernet devices
> instantiated for each of them.
> 
> This patch adds support for the standard "representor" parameter, which implies that port representors
> are not probed by default anymore, except for the list provided through device arguments.
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Added error message for when rte_eth_devargs_parse() fails.
> ---
>  doc/guides/nics/mlx5.rst                | 12 ++++++++++++
>  doc/guides/prog_guide/poll_mode_drv.rst |  2 ++
>  drivers/net/mlx5/mlx5.c                 | 29 ++++++++++++++++++++++++++++
>  3 files changed, 43 insertions(+)
> 
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index 79c982e29..5229e546c 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -388,6 +388,18 @@ Run-time configuration
> 
>    Disabled by default.
> 
> +- ``representor`` parameter [list]
> +
> +  This parameter can be used to instantiate DPDK Ethernet devices from
> + existing port (or VF) representors configured on the device.
> +
> +  It is a standard parameter whose format is described in
> + :ref:`ethernet_device_standard_device_arguments`.
> +
> +  For instance, to probe port representors 0 through 2::
> +
> +    representor=[0-2]
> +
>  Firmware configuration
>  ~~~~~~~~~~~~~~~~~~~~~~
> 
> diff --git a/doc/guides/prog_guide/poll_mode_drv.rst b/doc/guides/prog_guide/poll_mode_drv.rst
> index af82352a0..58d49ba0f 100644
> --- a/doc/guides/prog_guide/poll_mode_drv.rst
> +++ b/doc/guides/prog_guide/poll_mode_drv.rst
> @@ -365,6 +365,8 @@ Ethernet Device API
> 
>  The Ethernet device API exported by the Ethernet PMDs is described in the *DPDK API Reference*.
> 
> +.. _ethernet_device_standard_device_arguments:
> +
>  Ethernet Device Standard Device Arguments  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 716c9d9a5..26e61d99d 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -91,6 +91,9 @@
>  /* Activate Netlink support in VF mode. */  #define MLX5_VF_NL_EN "vf_nl_en"
> 
> +/* Select port representors to instantiate. */ #define MLX5_REPRESENTOR
> +"representor"
> +
>  #ifndef HAVE_IBV_MLX5_MOD_MPW
>  #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)  #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
> @@ -423,6 +426,9 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
>  	struct mlx5_dev_config *config = opaque;
>  	unsigned long tmp;
> 
> +	/* No-op, port representors are processed in mlx5_dev_spawn(). */
> +	if (!strcmp(MLX5_REPRESENTOR, key))
> +		return 0;
>  	errno = 0;
>  	tmp = strtoul(val, NULL, 0);
>  	if (errno) {
> @@ -495,6 +501,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
>  		MLX5_RX_VEC_EN,
>  		MLX5_L3_VXLAN_EN,
>  		MLX5_VF_NL_EN,
> +		MLX5_REPRESENTOR,
>  		NULL,
>  	};
>  	struct rte_kvargs *kvlist;
> @@ -1173,13 +1180,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  	struct rte_eth_dev **eth_list = NULL;
>  	struct ibv_context *ctx;
>  	struct ibv_device_attr_ex attr;
> +	struct rte_eth_devargs eth_da;
>  	void *tmp;
>  	unsigned int i;
>  	unsigned int j = 0;
>  	unsigned int n = 0;
>  	int ret;
> 
> +	if (dpdk_dev->devargs) {
> +		ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
> +		if (ret) {
> +			rte_errno = -ret;
> +			DRV_LOG(ERR, "failed to process device arguments: %s",
> +				strerror(rte_errno));
> +			goto error;
> +		}
> +	} else {
> +		memset(&eth_da, 0, sizeof(eth_da));
> +	}
>  next:
> +	if (j) {
> +		unsigned int k;
> +
> +		for (k = 0; k < eth_da.nb_representor_ports; ++k)
> +			if (eth_da.representor_ports[k] == j - 1)
> +				break;
> +		if (k == eth_da.nb_representor_ports)
> +			goto skip;
> +	}
>  	errno = 0;
>  	ctx = mlx5_glue->open_device(ibv_dev[j]);
>  	if (!ctx) {
> @@ -1218,6 +1246,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  			goto error;
>  		++n;
>  	}
> +skip:
>  	if (ibv_dev[++j])
>  		goto next;
>  	eth_list[n] = NULL;
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects " Adrien Mazarguil
  2018-06-16  8:27     ` Xueming(Steven) Li
@ 2018-06-17 10:14     ` Shahaf Shuler
  2018-06-27 13:30       ` Adrien Mazarguil
  1 sibling, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-06-17 10:14 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

Hi Adrien, 

Small nit, 

Thursday, June 14, 2018 11:35 AM, Adrien Mazarguil:
> Subject: [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
> 
> This patch gets rid of redundant calls to open the device and query its
> attributes in order to simplify the code.
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Minor indent fix on existing code.
> ---
>  drivers/net/mlx5/mlx5.c | 64 +++++++++++++++++++++-----------------------
>  1 file changed, 30 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 3bdcb3970..1a5391e63 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -654,10 +654,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,  {
>  	struct ibv_device **list = NULL;
>  	struct ibv_device *ibv_dev;
> +	struct ibv_context *ctx = NULL;
> +	struct ibv_device_attr_ex attr;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
>  	int err = 0;
> -	struct ibv_context *attr_ctx = NULL;
> -	struct ibv_device_attr_ex device_attr;
>  	unsigned int vf = 0;
>  	unsigned int mps;
>  	unsigned int cqe_comp;
> @@ -714,12 +714,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
>  		      (pci_dev->id.device_id ==
>  		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> -		attr_ctx = mlx5_glue->open_device(list[i]);
> +		ctx = mlx5_glue->open_device(list[i]);
>  		rte_errno = errno;
>  		err = rte_errno;
>  		break;
>  	}
> -	if (attr_ctx == NULL) {
> +	if (ctx == NULL) {
>  		switch (err) {
>  		case 0:
>  			DRV_LOG(ERR,
> @@ -748,7 +748,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,  #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
>  	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
> #endif
> -	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
> +	mlx5_glue->dv_query_device(ctx, &dv_attr);
>  	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
>  		if (dv_attr.flags &
> MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
>  			DRV_LOG(DEBUG, "enhanced MPW is supported");
> @@ -822,23 +822,20 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading
> disabled due to"
>  		" old OFED/rdma-core version or firmware configuration");
> #endif
> -	err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
> +	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
>  	if (err) {
>  		DEBUG("ibv_query_device_ex() failed");
>  		goto error;
>  	}
> -	DRV_LOG(INFO, "%u port(s) detected",
> -		device_attr.orig_attr.phys_port_cnt);
> -	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
> +	DRV_LOG(INFO, "%u port(s) detected",
> attr.orig_attr.phys_port_cnt);
> +	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
>  		char name[RTE_ETH_NAME_MAX_LEN];
>  		int len;
>  		uint32_t port = i + 1; /* ports are indexed from one */
> -		struct ibv_context *ctx = NULL;
>  		struct ibv_port_attr port_attr;
>  		struct ibv_pd *pd = NULL;
>  		struct priv *priv = NULL;
>  		struct rte_eth_dev *eth_dev = NULL;
> -		struct ibv_device_attr_ex device_attr_ex;
>  		struct ether_addr mac;
>  		struct mlx5_dev_config config = {
>  			.cqe_comp = cqe_comp,
> @@ -865,7 +862,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
>  			 pci_dev->addr.domain, pci_dev->addr.bus,
>  			 pci_dev->addr.devid, pci_dev->addr.function);
> -		if (device_attr.orig_attr.phys_port_cnt > 1)
> +		if (attr.orig_attr.phys_port_cnt > 1)
>  			snprintf(name + len, sizeof(name), " port %u", i);
>  		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  			eth_dev = rte_eth_dev_attach_secondary(name);
> @@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  			continue;
>  		}
>  		DRV_LOG(DEBUG, "using port %u", port);
> -		ctx = mlx5_glue->open_device(ibv_dev);
> +		if (!ctx)

Is it really possible for ctx to be NULL on this stage? 
Maybe assert is preferable? 

> +			ctx = mlx5_glue->open_device(ibv_dev);
>  		if (ctx == NULL) {
>  			err = ENODEV;
>  			goto port_error;
> @@ -949,7 +947,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  		priv->ctx = ctx;
>  		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>  			sizeof(priv->ibdev_path));
> -		priv->device_attr = device_attr;
> +		priv->device_attr = attr;
>  		priv->port = port;
>  		priv->pd = pd;
>  		priv->mtu = ETHER_MTU;
> @@ -960,17 +958,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  				strerror(rte_errno));
>  			goto port_error;
>  		}
> -		err = mlx5_glue->query_device_ex(ctx, NULL,
> &device_attr_ex);
> -		if (err) {
> -			DRV_LOG(ERR, "ibv_query_device_ex() failed");
> -			goto port_error;
> -		}
> -		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
> +		config.hw_csum = !!(attr.device_cap_flags_ex &
>  				    IBV_DEVICE_RAW_IP_CSUM);
>  		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>  			(config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -		config.flow_counter_en =
> !!(device_attr.max_counter_sets);
> +		config.flow_counter_en = !!attr.max_counter_sets;
>  		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
>  		DRV_LOG(DEBUG,
>  			"counter type = %d, num of cs = %ld, attributes =
> %d", @@ -978,7 +971,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  			cs_desc.attributes);
>  #endif
>  		config.ind_table_max_size =
> -
> 	device_attr_ex.rss_caps.max_rwq_indirection_table_size;
> +			attr.rss_caps.max_rwq_indirection_table_size;
>  		/* Remove this check once DPDK supports larger/variable
>  		 * indirection tables. */
>  		if (config.ind_table_max_size >
> @@ -986,29 +979,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  			config.ind_table_max_size =
> ETH_RSS_RETA_SIZE_512;
>  		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>  			config.ind_table_max_size);
> -		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
> +		config.hw_vlan_strip = !!(attr.raw_packet_caps &
> 
> IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>  		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>  			(config.hw_vlan_strip ? "" : "not "));
> 
> -		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
> +		config.hw_fcs_strip = !!(attr.raw_packet_caps &
> 
> IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DRV_LOG(DEBUG, "FCS stripping configuration is
> %ssupported",
>  			(config.hw_fcs_strip ? "" : "not "));
> 
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -		config.hw_padding =
> !!device_attr_ex.rx_pad_end_addr_align;
> +		config.hw_padding = !!attr.rx_pad_end_addr_align;
>  #endif
>  		DRV_LOG(DEBUG,
>  			"hardware Rx end alignment padding is
> %ssupported",
>  			(config.hw_padding ? "" : "not "));
>  		config.vf = vf;
> -		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
> -			      (device_attr_ex.tso_caps.supported_qpts &
> -			      (1 << IBV_QPT_RAW_PACKET)));
> +		config.tso = (attr.tso_caps.max_tso > 0 &&
> +			      (attr.tso_caps.supported_qpts &
> +			       (1 << IBV_QPT_RAW_PACKET)));
>  		if (config.tso)
> -			config.tso_max_payload_sz =
> -					device_attr_ex.tso_caps.max_tso;
> +			config.tso_max_payload_sz = attr.tso_caps.max_tso;
>  		if (config.mps && !mps) {
>  			DRV_LOG(ERR,
>  				"multi-packet send not supported on this
> device"
> @@ -1168,14 +1160,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  				 priv, mem_event_cb);
>  		rte_rwlock_write_unlock(&mlx5_shared_data-
> >mem_event_rwlock);
>  		rte_eth_dev_probing_finish(eth_dev);
> +		/*
> +		 * Each eth_dev instance is assigned its own Verbs context,
> +		 * since this one is consumed, let the next iteration open
> +		 * another.
> +		 */
> +		ctx = NULL;
>  		continue;
>  port_error:
>  		if (priv)
>  			rte_free(priv);
>  		if (pd)
>  			claim_zero(mlx5_glue->dealloc_pd(pd));
> -		if (ctx)
> -			claim_zero(mlx5_glue->close_device(ctx));
>  		if (eth_dev && rte_eal_process_type() ==
> RTE_PROC_PRIMARY)
>  			rte_eth_dev_release_port(eth_dev);
>  		break;
> @@ -1187,8 +1183,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  	 * way to enumerate the registered ethdevs to free the previous
> ones.
>  	 */
>  error:
> -	if (attr_ctx)
> -		claim_zero(mlx5_glue->close_device(attr_ctx));
> +	if (ctx)
> +		claim_zero(mlx5_glue->close_device(ctx));
>  	if (list)
>  		mlx5_glue->free_device_list(list);
>  	if (err) {
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
  2018-06-14  8:34   ` [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code Adrien Mazarguil
  2018-06-16  8:29     ` Xueming(Steven) Li
@ 2018-06-17 10:14     ` Shahaf Shuler
  2018-06-27 13:31       ` Adrien Mazarguil
  1 sibling, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-06-17 10:14 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

Thursday, June 14, 2018 11:35 AM, Adrien Mazarguil:
> Subject: [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
> 
> All the generic probing code needs is an IB device. While this device is
> currently supplied by a PCI lookup, other methods will be added soon.
> 
> This patch divides the original function, which has become huge over time, as
> follows:
> 
> 1. PCI-specific (mlx5_pci_probe()).
> 2. All ports of a Verbs device (mlx5_dev_spawn()).
> 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Fixed device naming. A port suffix is now appended only if several IB
>   ports happen to be detected.
> - Added separate message to distinguish missing kernel drivers from other
>   initialization errors, as it was confusing.

[...]

> +/**
> + * DPDK callback to register a PCI device.
> + *
> + * This function creates an Ethernet device for each port of a given
> + * PCI device.
> + *
> + * @param[in] pci_drv
> + *   PCI driver structure (mlx5_driver).
> + * @param[in] pci_dev
> + *   PCI device information.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> +	       struct rte_pci_device *pci_dev) {
> +	struct ibv_device **ibv_list;
> +	struct rte_eth_dev **eth_list = NULL;
> +	int vf;
> +	int ret;
> +
> +	assert(pci_drv == &mlx5_driver);
> +	switch (pci_dev->id.device_id) {
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> +		vf = 1;
> +		break;
> +	default:
> +		vf = 0;
> +	}

Even though I couldn't find any functional bug, I think it is logically more correct to determine if pci device is vf after we know this is Mellanox device. 
Meaning the above block should be ...

> +	errno = 0;
> +	ibv_list = mlx5_glue->get_device_list(&ret);
> +	if (!ibv_list) {
> +		rte_errno = errno ? errno : ENOSYS;
> +		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
>  		return -rte_errno;
>  	}
> -	return 0;
> +	while (ret-- > 0) {
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]-
> >name);
> +		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> +			continue;
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +		DRV_LOG(INFO, "PCI information matches, using device
> \"%s\"",
> +			ibv_list[ret]->name);
> +		break;
> +	}

Here. 

> +	if (ret >= 0)
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret],
> vf);
> +	mlx5_glue->free_device_list(ibv_list);
> +	if (!ret) {
> +		DRV_LOG(WARNING,
> +			"no Verbs device matches PCI device " PCI_PRI_FMT
> ","
> +			" are kernel drivers loaded?",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function);
> +		rte_errno = ENOENT;
> +		ret = -rte_errno;
> +	} else if (!eth_list || !*eth_list) {
> +		DRV_LOG(ERR,
> +			"probe of PCI device " PCI_PRI_FMT " aborted after"
> +			" encountering an error: %s",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function,
> +			strerror(rte_errno));
> +		ret = -rte_errno;
> +	} else {
> +		for (ret = 0; eth_list[ret]; ++ret) {
> +			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +			rte_eth_dev_probing_finish(eth_list[ret]);
> +		}
> +		ret = 0;
> +	}
> +	free(eth_list);
> +	return ret;
>  }
> 
>  static const struct rte_pci_id mlx5_pci_id_map[] = {
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-16  8:57     ` Xueming(Steven) Li
@ 2018-06-17 10:15       ` Shahaf Shuler
  2018-06-24 13:33         ` Shahaf Shuler
  2018-06-27 13:32         ` Adrien Mazarguil
  2018-06-27 13:32       ` Adrien Mazarguil
  1 sibling, 2 replies; 100+ messages in thread
From: Shahaf Shuler @ 2018-06-17 10:15 UTC (permalink / raw)
  To: Xueming(Steven) Li, Adrien Mazarguil; +Cc: dev

Hi Adrien,

Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > Sent: Thursday, June 14, 2018 4:35 PM
> > To: Shahaf Shuler <shahafs@mellanox.com>
> > Cc: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > representors
> >
> > Probe existing port representors in addition to their master device and
> associate them automatically.
> >
> > To avoid name collision between Ethernet devices, their names use the
> > same convention as ixgbe and i40e PMDs, that is, instead of only a PCI
> address in DBDF notation:
> >
> > - "net_{DBDF}_0" for master/switch devices.

This is breaking compatibility for application using the device names in order to attach them to the application (e.g. OVS-DPDK). 
Before this patch the naming scheme for non-representor port is "{DBDF}". 

Can we preserve the compatibility and add appropriate suffix for the representor case? 

> > - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> >   representors.
> >
> > Both optionally suffixed with "_port_{num}" instead of " port {num}"
> > for devices that expose several Verbs ports (note this is never the case on
> mlx5, but kept for historical reasons for the time being).
> >
> > (Patch based on prior work from Yuanhan Liu)
> >
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > --
> > v2 changes:
> >
> > - Added representor information to dev_infos_get(). DPDK port ID of
> master
> >   device is now stored in the private structure to retrieve it
> >   conveniently.
> > - Master device is assigned dummy representor ID value -1 to better
> >   distinguish from the the first actual representor reported by
> >   dev_infos_get() as those are indexed from 0.
> > - Added RTE_ETH_DEV_REPRESENTOR device flag.
> > ---
> >  drivers/net/mlx5/mlx5.c        | 138 ++++++++++++++++++++++++--------
> >  drivers/net/mlx5/mlx5.h        |   9 ++-
> >  drivers/net/mlx5/mlx5_ethdev.c | 151
> ++++++++++++++++++++++++++++++++----
> >  drivers/net/mlx5/mlx5_mac.c    |   2 +-
> >  drivers/net/mlx5/mlx5_stats.c  |   6 +-
> >  5 files changed, 252 insertions(+), 54 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > 498f80c89..716c9d9a5 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -304,6 +304,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
> >  	if (ret)
> >  		DRV_LOG(WARNING, "port %u some flows still remain",
> >  			dev->data->port_id);
> > +	if (!priv->representor &&
> > +	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > +		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
> >  	memset(priv, 0, sizeof(*priv));
> >  }
> >
> > @@ -648,6 +651,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev
> *dev)
> >   *   Verbs device attributes.
> >   * @param port
> >   *   Verbs port to use (indexed from 1).
> > + * @param master
> > + *   Master device in case @p ibv_dev is a port representor.
> > + * @param rep_id
> > + *   Representor identifier when @p master is non-NULL.
> >   *
> >   * @return
> >   *   A valid Ethernet device object on success, NULL otherwise and
> rte_errno
> > @@ -658,7 +665,9 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >  		   struct ibv_device *ibv_dev,
> >  		   int vf,
> >  		   const struct ibv_device_attr_ex *attr,
> > -		   unsigned int port)
> > +		   unsigned int port,
> > +		   struct rte_eth_dev *master,
> > +		   unsigned int rep_id)
> >  {
> >  	struct ibv_context *ctx;
> >  	struct ibv_port_attr port_attr;
> > @@ -802,11 +811,14 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >  		" old OFED/rdma-core version or firmware configuration");
> #endif
> >  	config.mpls_en = mpls_en;
> > -	if (attr->orig_attr.phys_port_cnt > 1)
> > -		snprintf(name, sizeof(name), "%s port %u",
> > -			 dpdk_dev->name, port);
> > +	if (!master)
> > +		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev-
> >name);
> >  	else
> > -		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> > +		snprintf(name, sizeof(name), "net_%s_representor_%u",
> > +			 dpdk_dev->name, rep_id);
> > +	if (attr->orig_attr.phys_port_cnt > 1)
> > +		snprintf(name, sizeof(name), "%s_port_%u", name, port);
> > +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
> >  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> >  		eth_dev = rte_eth_dev_attach_secondary(name);
> >  		if (eth_dev == NULL) {
> > @@ -883,6 +895,30 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >  	priv->port = port;
> >  	priv->pd = pd;
> >  	priv->mtu = ETHER_MTU;
> > +	/*
> > +	 * Allocate a switch domain for master devices and share it with
> > +	 * port representors.
> > +	 */
> > +	if (!master) {
> > +		priv->representor = 0;
> > +		priv->master_id = -1; /* Updated once known. */
> > +		priv->domain_id =
> RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> 
> Domain_id will override below.
> 
> > +		priv->rep_id = -1; /* Dummy unique value. */
> > +		err = rte_eth_switch_domain_alloc(&priv->domain_id);
> > +		if (err) {
> > +			err = rte_errno;
> > +			DRV_LOG(ERR, "unable to allocate switch domain:
> %s",
> > +				strerror(rte_errno));
> > +			goto error;
> > +		}
> > +	} else {
> > +		priv->representor = 1;
> > +		priv->master_id =
> > +			((struct priv *)master->data->dev_private)-
> >master_id;
> > +		priv->domain_id =
> > +			((struct priv *)master->data->dev_private)-
> >domain_id;
> > +		priv->rep_id = rep_id;
> > +	}
> 
> Do you think such information should be set as well in secondary process?
> 
> >  	err = mlx5_args(&config, dpdk_dev->devargs);
> >  	if (err) {
> >  		err = rte_errno;
> > @@ -964,6 +1000,18 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >  		err = ENOMEM;
> >  		goto error;
> >  	}
> > +	/*
> > +	 * Now that eth_dev is allocated and its port ID is known, make
> > +	 * non-representor ports target their own port ID as master for
> > +	 * convenience.
> > +	 *
> > +	 * Master port ID is already set for actual representors. Those only
> > +	 * need the right device flag.
> > +	 */
> > +	if (!master)
> > +		priv->master_id = eth_dev->data->port_id;
> > +	else
> > +		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
> >  	eth_dev->data->dev_private = priv;
> >  	priv->dev_data = eth_dev->data;
> >  	eth_dev->data->mac_addrs = priv->mac; @@ -1083,8 +1131,12 @@
> > mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> >  	rte_rwlock_write_unlock(&mlx5_shared_data-
> >mem_event_rwlock);
> >  	return eth_dev;
> >  error:
> > -	if (priv)
> > +	if (priv) {
> > +		if (!priv->representor &&
> > +		    priv->domain_id !=
> RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > +			claim_zero(rte_eth_switch_domain_free(priv-
> >domain_id));
> >  		rte_free(priv);
> > +	}
> >  	if (pd)
> >  		claim_zero(mlx5_glue->dealloc_pd(pd));
> >  	if (eth_dev)
> > @@ -1097,12 +1149,14 @@ mlx5_dev_spawn_one(struct rte_device
> > *dpdk_dev,  }
> >
> >  /**
> > - * Spawn Ethernet devices from Verbs information, one per detected
> port.
> > + * Spawn Ethernet devices from Verbs information, one per detected
> > + port and
> > + * port representor.
> >   *
> >   * @param dpdk_dev
> >   *   Backing DPDK device.
> >   * @param ibv_dev
> > - *   Verbs device.
> > + *   NULL-terminated list of Verbs devices. First entry is the master device
> > + *   (mandatory), followed by optional representors.
> >   * @param vf
> >   *   If nonzero, enable VF-specific features.
> >   *
> > @@ -1113,17 +1167,21 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >   */
> >  static struct rte_eth_dev **
> >  mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > -	       struct ibv_device *ibv_dev,
> > +	       struct ibv_device **ibv_dev,
> >  	       int vf)
> >  {
> >  	struct rte_eth_dev **eth_list = NULL;
> >  	struct ibv_context *ctx;
> >  	struct ibv_device_attr_ex attr;
> > +	void *tmp;
> >  	unsigned int i;
> > +	unsigned int j = 0;
> > +	unsigned int n = 0;
> >  	int ret;
> >
> > +next:
> >  	errno = 0;
> > -	ctx = mlx5_glue->open_device(ibv_dev);
> > +	ctx = mlx5_glue->open_device(ibv_dev[j]);
> >  	if (!ctx) {
> >  		rte_errno = errno ? errno : ENODEV;
> >  		if (rte_errno == ENODEV)
> > @@ -1132,7 +1190,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> >  		else
> >  			DRV_LOG(ERR,
> >  				"cannot use device, are drivers up to date?");
> > -		return NULL;
> > +		goto error;
> >  	}
> >  	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> >  	mlx5_glue->close_device(ctx);
> > @@ -1140,34 +1198,42 @@ mlx5_dev_spawn(struct rte_device
> *dpdk_dev,
> >  		rte_errno = ret;
> >  		DRV_LOG(ERR, "unable to query device information: %s",
> >  			strerror(rte_errno));
> > -		return NULL;
> > +		goto error;
> >  	}
> > -	DRV_LOG(INFO, "%u port(s) detected",
> attr.orig_attr.phys_port_cnt);
> > -	eth_list = malloc(sizeof(*eth_list) *
> > -			  (attr.orig_attr.phys_port_cnt + 1));
> > -	if (!eth_list) {
> > +	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
> > +		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
> > +	tmp = realloc(eth_list, sizeof(*eth_list) *
> > +		      (n + attr.orig_attr.phys_port_cnt + 1));
> > +	if (!tmp) {
> >  		rte_errno = errno;
> > -		return NULL;
> > +		goto error;
> >  	}
> > +	eth_list = tmp;
> >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > -						 &attr, i + 1);
> > -		if (eth_list[i])
> > -			continue;
> > -		/* Save rte_errno and roll back in case of failure. */
> > -		ret = rte_errno;
> > -		while (i--) {
> > -			mlx5_dev_close(eth_list[i]);
> > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > -				rte_free(eth_list[i]->data->dev_private);
> > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > -		}
> > -		free(eth_list);
> > -		rte_errno = ret;
> > -		return NULL;
> > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> vf,
> > +						 &attr, i + 1,
> > +						 j ? eth_list[0] : NULL,
> > +						 j - 1);

The representor id is according to the sort made by qsort (based on device names).
A better way may be to set it according to the sysfs information, like you do in the mlx5_get_ifname function.
What do you think? 

> > +		if (!eth_list[n])
> > +			goto error;
> > +		++n;
> >  	}
> > -	eth_list[i] = NULL;
> > +	if (ibv_dev[++j])
> > +		goto next;
> > +	eth_list[n] = NULL;
> >  	return eth_list;
> > +error:
> > +	/* Save rte_errno and roll back in case of failure. */
> > +	ret = rte_errno;
> > +	while (n--) {
> > +		mlx5_dev_close(eth_list[n]);
> > +		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > +			rte_free(eth_list[n]->data->dev_private);
> > +		claim_zero(rte_eth_dev_release_port(eth_list[n]));
> > +	}
> > +	free(eth_list);
> > +	rte_errno = ret;
> > +	return NULL;
> >  }
> >
> >  /**
> > @@ -1282,7 +1348,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
> >  				ibv_match[ret]->name, ret - 1);
> >  	}
> >  	if (n)
> > -		eth_list = mlx5_dev_spawn(&pci_dev->device,
> ibv_match[0], vf);
> > +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match,
> vf);
> >  	mlx5_glue->free_device_list(ibv_list);
> >  	if (!n) {
> >  		DRV_LOG(WARNING,
> > @@ -1302,7 +1368,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
> >  		ret = -rte_errno;
> >  	} else {
> >  		for (ret = 0; eth_list[ret]; ++ret) {
> > +			uint32_t restore = eth_list[ret]->data->dev_flags;
> > +
> >  			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> > +			/* Restore non-PCI flags cleared by the above call. */
> > +			eth_list[ret]->data->dev_flags |= restore;
> >  			rte_eth_dev_probing_finish(eth_list[ret]);
> >  		}
> >  		ret = 0;
> > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > 997b04a33..0fe467140 100644
> > --- a/drivers/net/mlx5/mlx5.h
> > +++ b/drivers/net/mlx5/mlx5.h
> > @@ -161,6 +161,10 @@ struct priv {
> >  	uint16_t mtu; /* Configured MTU. */
> >  	uint8_t port; /* Physical port number. */
> >  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> > +	unsigned int representor:1; /* Device is a port representor. */

Why we need above flag? Why can't we use RTE_ETH_DEV_REPRESENTOR from eth_dev->data->dev_flags. 

> > +	uint16_t master_id; /* DPDK port ID of switch domain master. */
> > +	uint16_t domain_id; /* Switch domain identifier. */
> > +	unsigned int rep_id; /* Port representor identifier. */
> >  	/* RX/TX queues. */
> >  	unsigned int rxqs_n; /* RX queues array size. */
> >  	unsigned int txqs_n; /* TX queues array size. */ @@ -209,9 +213,12
> > @@ int mlx5_getenv_int(const char *);
> >
> >  /* mlx5_ethdev.c */
> >
> > +int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> > +			   char (*ifname)[IF_NAMESIZE]);
> >  int mlx5_get_ifname(const struct rte_eth_dev *dev, char
> > (*ifname)[IF_NAMESIZE]);  int mlx5_ifindex(const struct rte_eth_dev
> > *dev); -int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct
> > ifreq *ifr);
> > +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> > +	       int master);
> >  int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);  int
> > mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
> >  		   unsigned int flags);
> > diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> > b/drivers/net/mlx5/mlx5_ethdev.c index 90488af33..9d579659e 100644
> > --- a/drivers/net/mlx5/mlx5_ethdev.c
> > +++ b/drivers/net/mlx5/mlx5_ethdev.c
> > @@ -93,7 +93,7 @@ struct ethtool_link_settings {  #endif
> >
> >  /**
> > - * Get interface name from private structure.
> > + * Get master interface name from private structure.
> >   *
> >   * @param[in] dev
> >   *   Pointer to Ethernet device.
> > @@ -104,7 +104,8 @@ struct ethtool_link_settings {
> >   *   0 on success, a negative errno value otherwise and rte_errno is set.
> >   */
> >  int
> > -mlx5_get_ifname(const struct rte_eth_dev *dev, char
> > (*ifname)[IF_NAMESIZE])
> > +mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> > +		       char (*ifname)[IF_NAMESIZE])
> >  {
> >  	struct priv *priv = dev->data->dev_private;
> >  	DIR *dir;
> > @@ -179,6 +180,113 @@ mlx5_get_ifname(const struct rte_eth_dev *dev,
> > char (*ifname)[IF_NAMESIZE])  }
> >
> >  /**
> > + * Get interface name from private structure.
> > + *
> > + * This is a port representor-aware version of mlx5_get_master_ifname().
> > + *
> > + * @param[in] dev
> > + *   Pointer to Ethernet device.
> > + * @param[out] ifname
> > + *   Interface name output buffer.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +int
> > +mlx5_get_ifname(const struct rte_eth_dev *dev, char
> > +(*ifname)[IF_NAMESIZE]) {
> > +	struct priv *priv = dev->data->dev_private;
> > +	int ret;
> > +	char master[IF_NAMESIZE];
> > +	FILE *file;
> > +	DIR *dir;
> > +	uint64_t phys_switch_id;
> > +
> > +	if (!priv->representor)
> > +		return mlx5_get_master_ifname(dev, ifname);
> > +	ret = mlx5_get_master_ifname(dev, &master);
> > +	if (ret)
> > +		return ret;
> > +	{
> > +		MKSTR(path, "%s/device/net/%s/phys_switch_id",
> > +		      priv->ibdev_path, master);
> > +
> > +		file = fopen(path, "rb");
> > +	}
> > +	if (!file) {
> > +		rte_errno = errno;
> > +		return -rte_errno;
> > +	}
> > +	ret = fscanf(file, "%" SCNx64, &phys_switch_id);
> > +	fclose(file);
> > +	if (ret != 1) {
> > +		rte_errno = EINVAL;
> > +		return -rte_errno;
> > +	}
> > +	{
> > +		MKSTR(path, "%s/device/net/%s/subsystem",
> > +		      priv->ibdev_path, master);
> > +
> > +		dir = opendir(path);
> > +	}
> > +	if (!dir) {
> > +		rte_errno = errno;
> > +		return -rte_errno;
> > +	}
> > +	/*
> > +	 * Scan network interfaces to find one with matching phys_switch_id
> > +	 * and phys_switch_name.
> > +	 */
> > +	do {
> > +		struct dirent *dent;
> > +		uint64_t phys_switch_id_rep;
> > +		int rep_id;
> > +
> > +		ret = -ENOENT;
> > +		dent = readdir(dir);
> > +		if (!dent)
> > +			break;
> > +		{
> > +			MKSTR(path,
> > +
> "%s/device/net/%s/subsystem/%s/phys_switch_id",
> > +			      priv->ibdev_path, master, dent->d_name);
> > +
> > +			file = fopen(path, "rb");
> > +		}
> > +		if (!file)
> > +			continue;
> > +		ret = fscanf(file, "%" SCNx64, &phys_switch_id_rep);
> > +		fclose(file);
> > +		if (ret != 1)
> > +			continue;
> > +		if (phys_switch_id_rep != phys_switch_id)
> > +			continue;
> > +		{
> > +			MKSTR(path,
> > +
> "%s/device/net/%s/subsystem/%s/phys_port_name",
> > +			      priv->ibdev_path, master, dent->d_name);
> > +
> > +			file = fopen(path, "rb");
> > +		}
> > +		if (!file)
> > +			continue;
> > +		ret = fscanf(file, "%d", &rep_id);
> > +		fclose(file);
> > +		if (ret != 1)
> > +			continue;
> > +		if (rep_id < 0 || (unsigned int)rep_id != priv->rep_id)
> > +			continue;
> > +		strlcpy(*ifname, dent->d_name, sizeof(*ifname));
> > +		ret = 0;
> > +		break;
> > +	} while (1);
> > +	closedir(dir);
> > +	if (ret)
> > +		rte_errno = -ret;
> > +	return ret;
> > +}
> > +
> > +/**
> >   * Get the interface index from device name.
> >   *
> >   * @param[in] dev
> > @@ -214,12 +322,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
> >   *   Request number to pass to ioctl().
> >   * @param[out] ifr
> >   *   Interface request structure output buffer.
> > + * @param master
> > + *   When device is a port representor, perform request on master device
> > + *   instead.
> >   *
> >   * @return
> >   *   0 on success, a negative errno value otherwise and rte_errno is set.
> >   */
> >  int
> > -mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
> > +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> > +	   int master)
> >  {
> >  	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
> >  	int ret = 0;
> > @@ -228,7 +340,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req,
> struct ifreq *ifr)
> >  		rte_errno = errno;
> >  		return -rte_errno;
> >  	}
> > -	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
> > +	if (master)
> > +		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
> > +	else
> > +		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
> >  	if (ret)
> >  		goto error;
> >  	ret = ioctl(sock, req, ifr);
> > @@ -258,7 +373,7 @@ int
> >  mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)  {
> >  	struct ifreq request;
> > -	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
> > +	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
> >
> >  	if (ret)
> >  		return ret;
> > @@ -282,7 +397,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t
> mtu)  {
> >  	struct ifreq request = { .ifr_mtu = mtu, };
> >
> > -	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
> > +	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
> >  }
> >
> >  /**
> > @@ -302,13 +417,13 @@ int
> >  mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int
> flags)  {
> >  	struct ifreq request;
> > -	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
> > +	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
> >
> >  	if (ret)
> >  		return ret;
> >  	request.ifr_flags &= keep;
> >  	request.ifr_flags |= flags & ~keep;
> > -	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
> > +	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
> >  }
> >
> >  /**
> > @@ -477,6 +592,12 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev,
> struct rte_eth_dev_info *info)
> >  	info->speed_capa = priv->link_speed_capa;
> >  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
> >  	mlx5_set_default_params(dev, info);
> > +	if (rte_eth_dev_is_valid_port(priv->master_id)) {
> > +		info->switch_info.name =
> > +			rte_eth_devices[priv->master_id].data->name;
> > +		info->switch_info.domain_id = priv->domain_id;
> > +		info->switch_info.port_id = priv->rep_id;
> > +	}
> >  }
> >
> >  /**
> > @@ -540,7 +661,7 @@ mlx5_link_update_unlocked_gset(struct
> rte_eth_dev *dev,
> >  	int link_speed = 0;
> >  	int ret;
> >
> > -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed:
> %s",
> >  			dev->data->port_id, strerror(rte_errno)); @@ -550,7
> +671,7 @@
> > mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
> >  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
> >  				(ifr.ifr_flags & IFF_RUNNING));
> >  	ifr.ifr_data = (void *)&edata;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING,
> >  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed:
> %s", @@ -611,7
> > +732,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
> >  	uint64_t sc;
> >  	int ret;
> >
> > -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed:
> %s",
> >  			dev->data->port_id, strerror(rte_errno)); @@ -621,7
> +742,7 @@
> > mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
> >  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
> >  				(ifr.ifr_flags & IFF_RUNNING));
> >  	ifr.ifr_data = (void *)&gcmd;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(DEBUG,
> >  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GLINKSETTINGS)"
> > @@ -638,7 +759,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev
> > *dev,
> >
> >  	*ecmd = gcmd;
> >  	ifr.ifr_data = (void *)ecmd;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(DEBUG,
> >  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GLINKSETTINGS)"
> > @@ -801,7 +922,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev,
> struct rte_eth_fc_conf *fc_conf)
> >  	int ret;
> >
> >  	ifr.ifr_data = (void *)&ethpause;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING,
> >  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GPAUSEPARAM) failed:"
> > @@ -854,7 +975,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev,
> struct rte_eth_fc_conf *fc_conf)
> >  		ethpause.tx_pause = 1;
> >  	else
> >  		ethpause.tx_pause = 0;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
> >  	if (ret) {
> >  		DRV_LOG(WARNING,
> >  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_SPAUSEPARAM)"
> > diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
> > index 672a47619..12ee37f55
> > 100644
> > --- a/drivers/net/mlx5/mlx5_mac.c
> > +++ b/drivers/net/mlx5/mlx5_mac.c
> > @@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t
> (*mac)[ETHER_ADDR_LEN])
> >  	struct ifreq request;
> >  	int ret;
> >
> > -	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
> > +	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
> >  	if (ret)
> >  		return ret;
> >  	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); diff -
> -git
> > a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index
> > 875dd1027..91f3d474a 100644
> > --- a/drivers/net/mlx5/mlx5_stats.c
> > +++ b/drivers/net/mlx5/mlx5_stats.c
> > @@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev,
> uint64_t *stats)
> >  	et_stats->cmd = ETHTOOL_GSTATS;
> >  	et_stats->n_stats = xstats_ctrl->stats_n;
> >  	ifr.ifr_data = (caddr_t)et_stats;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING,
> >  			"port %u unable to read statistic values from device",
> @@ -194,7
> > +194,7 @@ mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
> >
> >  	drvinfo.cmd = ETHTOOL_GDRVINFO;
> >  	ifr.ifr_data = (caddr_t)&drvinfo;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING, "port %u unable to query number of
> statistics",
> >  			dev->data->port_id);
> > @@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
> >  	strings->string_set = ETH_SS_STATS;
> >  	strings->len = dev_stats_n;
> >  	ifr.ifr_data = (caddr_t)strings;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING, "port %u unable to get statistic
> names",
> >  			dev->data->port_id);
> > --
> > 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-17 10:15       ` Shahaf Shuler
@ 2018-06-24 13:33         ` Shahaf Shuler
  2018-06-27 13:32           ` Adrien Mazarguil
  2018-06-27 13:32         ` Adrien Mazarguil
  1 sibling, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-06-24 13:33 UTC (permalink / raw)
  To: Xueming(Steven) Li, Adrien Mazarguil
  Cc: dev, Guillaume Gaudonville, Wisam Monther, Raslan Darawsheh, Olga Shern

One more input, 

Sunday, June 17, 2018 1:15 PM, Shahaf Shuler:
> Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors

[...]

> > > +	eth_list = tmp;
> > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > -						 &attr, i + 1);
> > > -		if (eth_list[i])
> > > -			continue;
> > > -		/* Save rte_errno and roll back in case of failure. */
> > > -		ret = rte_errno;
> > > -		while (i--) {
> > > -			mlx5_dev_close(eth_list[i]);
> > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > -				rte_free(eth_list[i]->data->dev_private);
> > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > -		}
> > > -		free(eth_list);
> > > -		rte_errno = ret;
> > > -		return NULL;
> > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > vf,
> > > +						 &attr, i + 1,
> > > +						 j ? eth_list[0] : NULL,
> > > +						 j - 1);
> 
> The representor id is according to the sort made by qsort (based on device
> names).
> A better way may be to set it according to the sysfs information, like you do
> in the mlx5_get_ifname function.
> What do you think?

In fact relaying on linear increasing port numbers is dangerous. In may break on special scenarios like BlueField.
In BlueField there are representors between the x86 and the ARM cores. Those are not VF representors. The phys_port_name of those is -1 and each of them belongs to different phys_switch_id.

We can argue whether it is correct/not to assign them w/ -1 value, but I think the suggested approach above can detect the right "vf_id" for those and not break the current behavior on x86.  
Let me know if you have other suggestions. 


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
  2018-06-17 10:14     ` Shahaf Shuler
@ 2018-06-27 13:30       ` Adrien Mazarguil
  2018-06-28  5:35         ` Shahaf Shuler
  0 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-27 13:30 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Hey Shahaf,

I couldn't reply earlier, sorry for that. See below.

On Sun, Jun 17, 2018 at 10:14:01AM +0000, Shahaf Shuler wrote:
> Hi Adrien, 
> 
> Small nit, 
> 
> Thursday, June 14, 2018 11:35 AM, Adrien Mazarguil:
> > Subject: [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
> > 
> > This patch gets rid of redundant calls to open the device and query its
> > attributes in order to simplify the code.
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > --
> > v2 changes:
> > 
> > - Minor indent fix on existing code.
> > ---
> >  drivers/net/mlx5/mlx5.c | 64 +++++++++++++++++++++-----------------------
> >  1 file changed, 30 insertions(+), 34 deletions(-)
> > 
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
<snip>
> > @@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> > __rte_unused,
> >  			continue;
> >  		}
> >  		DRV_LOG(DEBUG, "using port %u", port);
> > -		ctx = mlx5_glue->open_device(ibv_dev);
> > +		if (!ctx)
> 
> Is it really possible for ctx to be NULL on this stage? 
> Maybe assert is preferable? 

See below, ctx is only inherited (non-NULL) during the first iteration. It
is reset and reopened for each instance since they need their own dedicated
Verbs context.

In any case, this patch focuses on removing redundant calls in preparation
for subsequent patches in the series. This code disappears entirely later.

<snip>
> > +		/*
> > +		 * Each eth_dev instance is assigned its own Verbs context,
> > +		 * since this one is consumed, let the next iteration open
> > +		 * another.
> > +		 */
> > +		ctx = NULL;
> >  		continue;

No problem if I leave it that way?

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
  2018-06-17 10:14     ` Shahaf Shuler
@ 2018-06-27 13:31       ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-27 13:31 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

On Sun, Jun 17, 2018 at 10:14:58AM +0000, Shahaf Shuler wrote:
> Thursday, June 14, 2018 11:35 AM, Adrien Mazarguil:
> > Subject: [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
> > 
> > All the generic probing code needs is an IB device. While this device is
> > currently supplied by a PCI lookup, other methods will be added soon.
> > 
> > This patch divides the original function, which has become huge over time, as
> > follows:
> > 
> > 1. PCI-specific (mlx5_pci_probe()).
> > 2. All ports of a Verbs device (mlx5_dev_spawn()).
> > 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> > 
> > (Patch based on prior work from Yuanhan Liu)
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > --
> > v2 changes:
> > 
> > - Fixed device naming. A port suffix is now appended only if several IB
> >   ports happen to be detected.
> > - Added separate message to distinguish missing kernel drivers from other
> >   initialization errors, as it was confusing.
<snip>
> > +static int
> > +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> > +	       struct rte_pci_device *pci_dev) {
> > +	struct ibv_device **ibv_list;
> > +	struct rte_eth_dev **eth_list = NULL;
> > +	int vf;
> > +	int ret;
> > +
> > +	assert(pci_drv == &mlx5_driver);
> > +	switch (pci_dev->id.device_id) {
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> > +		vf = 1;
> > +		break;
> > +	default:
> > +		vf = 0;
> > +	}
> 
> Even though I couldn't find any functional bug, I think it is logically more correct to determine if pci device is vf after we know this is Mellanox device. 
> Meaning the above block should be ...

<snip> 
> > +	while (ret-- > 0) {
> > +		struct rte_pci_addr pci_addr;
> > +
> > +		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]-
> > >name);
> > +		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> > +			continue;
> > +		if (pci_dev->addr.domain != pci_addr.domain ||
> > +		    pci_dev->addr.bus != pci_addr.bus ||
> > +		    pci_dev->addr.devid != pci_addr.devid ||
> > +		    pci_dev->addr.function != pci_addr.function)
> > +			continue;
> > +		DRV_LOG(INFO, "PCI information matches, using device
> > \"%s\"",
> > +			ibv_list[ret]->name);
> > +		break;
> > +	}
> 
> Here. 

No problem, I will update.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness
  2018-06-16  8:37     ` Xueming(Steven) Li
@ 2018-06-27 13:32       ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-27 13:32 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Sat, Jun 16, 2018 at 08:37:14AM +0000, Xueming(Steven) Li wrote:
> Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> 
> One minor issue we should be able to ignore.
<snip>
> > +static int
> > +mlx5_cmp_ibv_name(const void *a, const void *b) {
> > +	const char *name_a = (*(const struct ibv_device *const *)a)->name;
> > +	const char *name_b = (*(const struct ibv_device *const *)b)->name;
> > +	size_t i = 0;
> > +
> > +	while (name_a[i] && name_a[i] == name_b[i])
> > +		++i;
> > +	while (i && isdigit(name_a[i - 1]) && isdigit(name_b[i - 1]))
> 
> name_a[i - 1] and name_b[i - 1] must be same here.

Indeed, I'll simplify it in v3, thanks.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-16  8:57     ` Xueming(Steven) Li
  2018-06-17 10:15       ` Shahaf Shuler
@ 2018-06-27 13:32       ` Adrien Mazarguil
  1 sibling, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-27 13:32 UTC (permalink / raw)
  To: Xueming(Steven) Li; +Cc: Shahaf Shuler, dev

On Sat, Jun 16, 2018 at 08:57:51AM +0000, Xueming(Steven) Li wrote:
> Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> 
> Minor comments inside:
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > Sent: Thursday, June 14, 2018 4:35 PM
> > To: Shahaf Shuler <shahafs@mellanox.com>
> > Cc: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> > 
> > Probe existing port representors in addition to their master device and associate them automatically.
> > 
> > To avoid name collision between Ethernet devices, their names use the same convention as ixgbe and
> > i40e PMDs, that is, instead of only a PCI address in DBDF notation:
> > 
> > - "net_{DBDF}_0" for master/switch devices.
> > - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> >   representors.
> > 
> > Both optionally suffixed with "_port_{num}" instead of " port {num}" for devices that expose several
> > Verbs ports (note this is never the case on mlx5, but kept for historical reasons for the time being).
> > 
> > (Patch based on prior work from Yuanhan Liu)
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > --
> > v2 changes:
> > 
> > - Added representor information to dev_infos_get(). DPDK port ID of master
> >   device is now stored in the private structure to retrieve it
> >   conveniently.
> > - Master device is assigned dummy representor ID value -1 to better
> >   distinguish from the the first actual representor reported by
> >   dev_infos_get() as those are indexed from 0.
> > - Added RTE_ETH_DEV_REPRESENTOR device flag.
> > ---
> >  drivers/net/mlx5/mlx5.c        | 138 ++++++++++++++++++++++++--------
> >  drivers/net/mlx5/mlx5.h        |   9 ++-
> >  drivers/net/mlx5/mlx5_ethdev.c | 151 ++++++++++++++++++++++++++++++++----
> >  drivers/net/mlx5/mlx5_mac.c    |   2 +-
> >  drivers/net/mlx5/mlx5_stats.c  |   6 +-
> >  5 files changed, 252 insertions(+), 54 deletions(-)
> > 
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 498f80c89..716c9d9a5 100644
<snip>
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -304,6 +304,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
> >  	if (ret)
> >  		DRV_LOG(WARNING, "port %u some flows still remain",
> >  			dev->data->port_id);
> > +	if (!priv->representor &&
> > +	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > +		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
> >  	memset(priv, 0, sizeof(*priv));
> >  }
> > 
> > @@ -648,6 +651,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
> >   *   Verbs device attributes.
> >   * @param port
> >   *   Verbs port to use (indexed from 1).
> > + * @param master
> > + *   Master device in case @p ibv_dev is a port representor.
> > + * @param rep_id
> > + *   Representor identifier when @p master is non-NULL.
> >   *
> >   * @return
> >   *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> > @@ -658,7 +665,9 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> >  		   struct ibv_device *ibv_dev,
> >  		   int vf,
> >  		   const struct ibv_device_attr_ex *attr,
> > -		   unsigned int port)
> > +		   unsigned int port,
> > +		   struct rte_eth_dev *master,
> > +		   unsigned int rep_id)
> >  {
> >  	struct ibv_context *ctx;
> >  	struct ibv_port_attr port_attr;
> > @@ -802,11 +811,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> >  		" old OFED/rdma-core version or firmware configuration");  #endif
> >  	config.mpls_en = mpls_en;
> > -	if (attr->orig_attr.phys_port_cnt > 1)
> > -		snprintf(name, sizeof(name), "%s port %u",
> > -			 dpdk_dev->name, port);
> > +	if (!master)
> > +		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name);
> >  	else
> > -		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> > +		snprintf(name, sizeof(name), "net_%s_representor_%u",
> > +			 dpdk_dev->name, rep_id);
> > +	if (attr->orig_attr.phys_port_cnt > 1)
> > +		snprintf(name, sizeof(name), "%s_port_%u", name, port);
> > +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
> >  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> >  		eth_dev = rte_eth_dev_attach_secondary(name);
> >  		if (eth_dev == NULL) {
> > @@ -883,6 +895,30 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> >  	priv->port = port;
> >  	priv->pd = pd;
> >  	priv->mtu = ETHER_MTU;
> > +	/*
> > +	 * Allocate a switch domain for master devices and share it with
> > +	 * port representors.
> > +	 */
> > +	if (!master) {
> > +		priv->representor = 0;
> > +		priv->master_id = -1; /* Updated once known. */
> > +		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> 
> Domain_id will override below.

It's done as a safety measure. If rte_eth_switch_domain_alloc() happened to
fail, rte_eth_switch_domain_free() would otherwise be attempted on an
uninitialized value when cleaning up priv, possibly destroying an unrelated
domain. This is prevented thanks to RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID.

> > +		priv->rep_id = -1; /* Dummy unique value. */
> > +		err = rte_eth_switch_domain_alloc(&priv->domain_id);
> > +		if (err) {
> > +			err = rte_errno;
> > +			DRV_LOG(ERR, "unable to allocate switch domain: %s",
> > +				strerror(rte_errno));
> > +			goto error;
> > +		}
> > +	} else {
> > +		priv->representor = 1;
> > +		priv->master_id =
> > +			((struct priv *)master->data->dev_private)->master_id;
> > +		priv->domain_id =
> > +			((struct priv *)master->data->dev_private)->domain_id;
> > +		priv->rep_id = rep_id;
> > +	}
> 
> Do you think such information should be set as well in secondary process?

Unless I'm mistaken, it's implicitly the case as secondaries do not allocate
their own private structure, they inherit it from the primary.

Thanks for the review.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-17 10:15       ` Shahaf Shuler
  2018-06-24 13:33         ` Shahaf Shuler
@ 2018-06-27 13:32         ` Adrien Mazarguil
  2018-06-27 17:30           ` Xueming(Steven) Li
  2018-06-28  6:01           ` Shahaf Shuler
  1 sibling, 2 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-27 13:32 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: Xueming(Steven) Li, dev

On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> Hi Adrien,
> 
> Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> > 
> > > -----Original Message-----
> > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > Sent: Thursday, June 14, 2018 4:35 PM
> > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > Cc: dev@dpdk.org
> > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> > >
> > > Probe existing port representors in addition to their master device and
> > associate them automatically.
> > >
> > > To avoid name collision between Ethernet devices, their names use the
> > > same convention as ixgbe and i40e PMDs, that is, instead of only a PCI
> > address in DBDF notation:
> > >
> > > - "net_{DBDF}_0" for master/switch devices.
> 
> This is breaking compatibility for application using the device names in order to attach them to the application (e.g. OVS-DPDK). 
> Before this patch the naming scheme for non-representor port is "{DBDF}". 
> 
> Can we preserve the compatibility and add appropriate suffix for the representor case? 

There's one issue if representors are hot-plugged. The name of the master
device, which happens to be that of the switch domain, cannot be
updated. The form "net_{DBDF}_0" seems expected for PMDs that support
representors (see ixgbe and i40e).

Now since representor hot-plugging is not supported yet, I guess we could
postpone this problem by keeping the old format in the meantime, however
ideally, these applications should not rely on it. The only safe assumption
they can make is the uniqueness of any given name among ethdevs.

PCI bus addresses, if needed, should be retrieved by looking at the
underlying bus object.

By the way, while thinking again about a past comment from Xueming [1],
maybe it's finally time to remove support for multiple Verbs ports on mlx5
after all. This should drop another unnecessary loop and the need for the
unused "port %u" suffix at all while naming the device.

So how about the following plan for v3:

- Adding a patch that drops support for multiple Verbs ports (note for
  Xueming, yes I changed my mind *again* :)

- If you really think this will break OVS (please confirm), then when no
  "representor" parameter is provided (regardless of the presence of any
  representors), name format will use the usual "{DBDF}" notation as you
  suggested.

- Otherwise as soon as a "representor" is found on the command line, the new
  format will be used, again regardless of the presence of any representors.

- In both cases, representors if any, will be named according to the format
  specified in this patch.

[1] https://mails.dpdk.org/archives/dev/2018-June/104015.html

<snip>
> > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > -						 &attr, i + 1);
> > > -		if (eth_list[i])
> > > -			continue;
> > > -		/* Save rte_errno and roll back in case of failure. */
> > > -		ret = rte_errno;
> > > -		while (i--) {
> > > -			mlx5_dev_close(eth_list[i]);
> > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > -				rte_free(eth_list[i]->data->dev_private);
> > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > -		}
> > > -		free(eth_list);
> > > -		rte_errno = ret;
> > > -		return NULL;
> > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > vf,
> > > +						 &attr, i + 1,
> > > +						 j ? eth_list[0] : NULL,
> > > +						 j - 1);
> 
> The representor id is according to the sort made by qsort (based on device names).
> A better way may be to set it according to the sysfs information, like you do in the mlx5_get_ifname function.
> What do you think? 

I agree that the current approach sucks, hence the big fat warnings I left
around (see discussion with Xueming [2]). Problem is that the needed
information is not yet known at this stage; there is no private structure to
rely on to use mlx5_get_ifname() directly.

I'd also rather see these assumptions go in any case. I'll attempt to
improve things for v3 in preparation of allowing representors to be probed
on their own anytime, possibly even before the master device.

[2] https://mails.dpdk.org/archives/dev/2018-June/104059.html

<snip>
> > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > > 997b04a33..0fe467140 100644
> > > --- a/drivers/net/mlx5/mlx5.h
> > > +++ b/drivers/net/mlx5/mlx5.h
> > > @@ -161,6 +161,10 @@ struct priv {
> > >  	uint16_t mtu; /* Configured MTU. */
> > >  	uint8_t port; /* Physical port number. */
> > >  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> > > +	unsigned int representor:1; /* Device is a port representor. */
> 
> Why we need above flag? Why can't we use RTE_ETH_DEV_REPRESENTOR from eth_dev->data->dev_flags. 

Problem is that this flag can only be set once the ethdev is fully
instantiated and can't be relied on internally where needed (e.g. during
clean up in error handling code). It's reported to applications but not used
internally.

As a device property, it's actually pretty similar to the VF bit or
offloaded capabilities where checking exposed information would be
needlessly complex.

Now maybe it could be part of struct mlx5_dev_config as well. I initially
assumed this object was only for user-provided parameters but looks like
it's not the case. I intend to move it there for v3.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-24 13:33         ` Shahaf Shuler
@ 2018-06-27 13:32           ` Adrien Mazarguil
  2018-06-28  5:57             ` Shahaf Shuler
  0 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-27 13:32 UTC (permalink / raw)
  To: Shahaf Shuler
  Cc: Xueming(Steven) Li, dev, Guillaume Gaudonville, Wisam Monther,
	Raslan Darawsheh, Olga Shern

On Sun, Jun 24, 2018 at 01:33:31PM +0000, Shahaf Shuler wrote:
> One more input, 
> 
> Sunday, June 17, 2018 1:15 PM, Shahaf Shuler:
> > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> [...]
> 
> > > > +	eth_list = tmp;
> > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > > -						 &attr, i + 1);
> > > > -		if (eth_list[i])
> > > > -			continue;
> > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > -		ret = rte_errno;
> > > > -		while (i--) {
> > > > -			mlx5_dev_close(eth_list[i]);
> > > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > > -				rte_free(eth_list[i]->data->dev_private);
> > > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > -		}
> > > > -		free(eth_list);
> > > > -		rte_errno = ret;
> > > > -		return NULL;
> > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > > vf,
> > > > +						 &attr, i + 1,
> > > > +						 j ? eth_list[0] : NULL,
> > > > +						 j - 1);
> > 
> > The representor id is according to the sort made by qsort (based on device
> > names).
> > A better way may be to set it according to the sysfs information, like you do
> > in the mlx5_get_ifname function.
> > What do you think?
> 
> In fact relaying on linear increasing port numbers is dangerous. In may break on special scenarios like BlueField.
> In BlueField there are representors between the x86 and the ARM cores. Those are not VF representors. The phys_port_name of those is -1 and each of them belongs to different phys_switch_id.
> 
> We can argue whether it is correct/not to assign them w/ -1 value, but I think the suggested approach above can detect the right "vf_id" for those and not break the current behavior on x86.  
> Let me know if you have other suggestions. 

I didn't know that. Assuming that with these, there is exactly only one
representor per device, I think we can manage, the main issue being that
"-1" will be difficult to parse as a valid "representor" argument which uses
"-" for ranges.

Anyway, I suggest to deal with Bluefield specifics in a subsequent series.
This one focuses on and is validated with VF representors only.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-27 13:32         ` Adrien Mazarguil
@ 2018-06-27 17:30           ` Xueming(Steven) Li
  2018-06-28  6:01           ` Shahaf Shuler
  1 sibling, 0 replies; 100+ messages in thread
From: Xueming(Steven) Li @ 2018-06-27 17:30 UTC (permalink / raw)
  To: Adrien Mazarguil, Shahaf Shuler; +Cc: dev



> -----Original Message-----
> From: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Sent: Wednesday, June 27, 2018 9:32 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: Xueming(Steven) Li <xuemingl@mellanox.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> > Hi Adrien,
> >
> > Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> > >
> > > > -----Original Message-----
> > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > > Sent: Thursday, June 14, 2018 4:35 PM
> > > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > > Cc: dev@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > representors
> > > >
> > > > Probe existing port representors in addition to their master
> > > > device and
> > > associate them automatically.
> > > >
> > > > To avoid name collision between Ethernet devices, their names use
> > > > the same convention as ixgbe and i40e PMDs, that is, instead of
> > > > only a PCI
> > > address in DBDF notation:
> > > >
> > > > - "net_{DBDF}_0" for master/switch devices.
> >
> > This is breaking compatibility for application using the device names in order to attach them to the
> application (e.g. OVS-DPDK).
> > Before this patch the naming scheme for non-representor port is "{DBDF}".
> >
> > Can we preserve the compatibility and add appropriate suffix for the representor case?
> 
> There's one issue if representors are hot-plugged. The name of the master device, which happens to be
> that of the switch domain, cannot be updated. The form "net_{DBDF}_0" seems expected for PMDs that
> support representors (see ixgbe and i40e).
> 
> Now since representor hot-plugging is not supported yet, I guess we could postpone this problem by
> keeping the old format in the meantime, however ideally, these applications should not rely on it. The
> only safe assumption they can make is the uniqueness of any given name among ethdevs.
> 
> PCI bus addresses, if needed, should be retrieved by looking at the underlying bus object.
> 
> By the way, while thinking again about a past comment from Xueming [1], maybe it's finally time to
> remove support for multiple Verbs ports on mlx5 after all. This should drop another unnecessary loop
> and the need for the unused "port %u" suffix at all while naming the device.
> 
> So how about the following plan for v3:
> 
> - Adding a patch that drops support for multiple Verbs ports (note for
>   Xueming, yes I changed my mind *again* :)
> 
> - If you really think this will break OVS (please confirm), then when no
>   "representor" parameter is provided (regardless of the presence of any
>   representors), name format will use the usual "{DBDF}" notation as you
>   suggested.
> 
> - Otherwise as soon as a "representor" is found on the command line, the new
>   format will be used, again regardless of the presence of any representors.

The port creation sequence of upcoming hot plug looks like this:
0000:81:00.1
0000:81:00.1,representor=0
0000:81:00.1,representor=1

So the PF attaching comes always w/o "representor" parameter.

> 
> - In both cases, representors if any, will be named according to the format
>   specified in this patch.
> 
> [1]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2
> 018-
> June%2F104015.html&data=02%7C01%7Cxuemingl%40mellanox.com%7Cad9a1b32e5e241e375d208d5dc32778b%7Ca652971
> c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657031666639542&sdata=w4oNeWXwKXS0%2BNSZsYQaneW%2BkFxvWIHZFHLoM
> fLOxkg%3D&reserved=0
> 
> <snip>
> > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > > -						 &attr, i + 1);
> > > > -		if (eth_list[i])
> > > > -			continue;
> > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > -		ret = rte_errno;
> > > > -		while (i--) {
> > > > -			mlx5_dev_close(eth_list[i]);
> > > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > > -				rte_free(eth_list[i]->data->dev_private);
> > > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > -		}
> > > > -		free(eth_list);
> > > > -		rte_errno = ret;
> > > > -		return NULL;
> > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > > vf,
> > > > +						 &attr, i + 1,
> > > > +						 j ? eth_list[0] : NULL,
> > > > +						 j - 1);
> >
> > The representor id is according to the sort made by qsort (based on device names).
> > A better way may be to set it according to the sysfs information, like you do in the mlx5_get_ifname
> function.
> > What do you think?
> 
> I agree that the current approach sucks, hence the big fat warnings I left around (see discussion with
> Xueming [2]). Problem is that the needed information is not yet known at this stage; there is no
> private structure to rely on to use mlx5_get_ifname() directly.
> 
> I'd also rather see these assumptions go in any case. I'll attempt to improve things for v3 in
> preparation of allowing representors to be probed on their own anytime, possibly even before the
> master device.
> 
> [2]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2
> 018-
> June%2F104059.html&data=02%7C01%7Cxuemingl%40mellanox.com%7Cad9a1b32e5e241e375d208d5dc32778b%7Ca652971
> c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657031666639542&sdata=5eOWb69duEB%2BkIW1ZGkv%2FLxkZfwErQOd%2FV7
> nDpN2jOg%3D&reserved=0
> 
> <snip>
> > > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> > > > index
> > > > 997b04a33..0fe467140 100644
> > > > --- a/drivers/net/mlx5/mlx5.h
> > > > +++ b/drivers/net/mlx5/mlx5.h
> > > > @@ -161,6 +161,10 @@ struct priv {
> > > >  	uint16_t mtu; /* Configured MTU. */
> > > >  	uint8_t port; /* Physical port number. */
> > > >  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> > > > +	unsigned int representor:1; /* Device is a port representor. */
> >
> > Why we need above flag? Why can't we use RTE_ETH_DEV_REPRESENTOR from eth_dev->data->dev_flags.
> 
> Problem is that this flag can only be set once the ethdev is fully instantiated and can't be relied on
> internally where needed (e.g. during clean up in error handling code). It's reported to applications
> but not used internally.
> 
> As a device property, it's actually pretty similar to the VF bit or offloaded capabilities where
> checking exposed information would be needlessly complex.
> 
> Now maybe it could be part of struct mlx5_dev_config as well. I initially assumed this object was only
> for user-provided parameters but looks like it's not the case. I intend to move it there for v3.
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
  2018-06-27 13:30       ` Adrien Mazarguil
@ 2018-06-28  5:35         ` Shahaf Shuler
  0 siblings, 0 replies; 100+ messages in thread
From: Shahaf Shuler @ 2018-06-28  5:35 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

Wednesday, June 27, 2018 4:31 PM, Adrien Mazarguil:
> Subject: Re: [PATCH v2 2/7] net/mlx5: remove redundant objects in probe
> code
> 
> Hey Shahaf,
> 
> I couldn't reply earlier, sorry for that. See below.
> 
> On Sun, Jun 17, 2018 at 10:14:01AM +0000, Shahaf Shuler wrote:
> > Hi Adrien,
> >
> > Small nit,
> >
> > Thursday, June 14, 2018 11:35 AM, Adrien Mazarguil:
> > > Subject: [PATCH v2 2/7] net/mlx5: remove redundant objects in probe
> > > code
> > >
> > > This patch gets rid of redundant calls to open the device and query
> > > its attributes in order to simplify the code.
> > >
> > > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > --
> > > v2 changes:
> > >
> > > - Minor indent fix on existing code.
> > > ---
> > >  drivers/net/mlx5/mlx5.c | 64
> > > +++++++++++++++++++++-----------------------
> > >  1 file changed, 30 insertions(+), 34 deletions(-)
> > >
> > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> <snip>
> > > @@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> > > __rte_unused,
> > >  			continue;
> > >  		}
> > >  		DRV_LOG(DEBUG, "using port %u", port);
> > > -		ctx = mlx5_glue->open_device(ibv_dev);
> > > +		if (!ctx)
> >
> > Is it really possible for ctx to be NULL on this stage?
> > Maybe assert is preferable?
> 
> See below, ctx is only inherited (non-NULL) during the first iteration. It is
> reset and reopened for each instance since they need their own dedicated
> Verbs context.
> 
> In any case, this patch focuses on removing redundant calls in preparation for
> subsequent patches in the series. This code disappears entirely later.
> 
> <snip>
> > > +		/*
> > > +		 * Each eth_dev instance is assigned its own Verbs context,
> > > +		 * since this one is consumed, let the next iteration open
> > > +		 * another.
> > > +		 */
> > > +		ctx = NULL;
> > >  		continue;
> 
> No problem if I leave it that way?

Sure. 

> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-27 13:32           ` Adrien Mazarguil
@ 2018-06-28  5:57             ` Shahaf Shuler
  2018-06-28  9:13               ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-06-28  5:57 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: Xueming(Steven) Li, dev, Guillaume Gaudonville, Wisam Monther,
	Raslan Darawsheh, Olga Shern

Wednesday, June 27, 2018 4:33 PM, Adrien Mazarguil:
> Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> On Sun, Jun 24, 2018 at 01:33:31PM +0000, Shahaf Shuler wrote:
> > One more input,
> >
> > Sunday, June 17, 2018 1:15 PM, Shahaf Shuler:
> > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> >
> > [...]
> >
> > > > > +	eth_list = tmp;
> > > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev,
> ibv_dev, vf,
> > > > > -						 &attr, i + 1);
> > > > > -		if (eth_list[i])
> > > > > -			continue;
> > > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > > -		ret = rte_errno;
> > > > > -		while (i--) {
> > > > > -			mlx5_dev_close(eth_list[i]);
> > > > > -			if (rte_eal_process_type() ==
> RTE_PROC_PRIMARY)
> > > > > -				rte_free(eth_list[i]->data-
> >dev_private);
> > > > > -
> 	claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > > -		}
> > > > > -		free(eth_list);
> > > > > -		rte_errno = ret;
> > > > > -		return NULL;
> > > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev,
> ibv_dev[j],
> > > > vf,
> > > > > +						 &attr, i + 1,
> > > > > +						 j ? eth_list[0] : NULL,
> > > > > +						 j - 1);
> > >
> > > The representor id is according to the sort made by qsort (based on
> > > device names).
> > > A better way may be to set it according to the sysfs information,
> > > like you do in the mlx5_get_ifname function.
> > > What do you think?
> >
> > In fact relaying on linear increasing port numbers is dangerous. In may
> break on special scenarios like BlueField.
> > In BlueField there are representors between the x86 and the ARM cores.
> Those are not VF representors. The phys_port_name of those is -1 and each
> of them belongs to different phys_switch_id.
> >
> > We can argue whether it is correct/not to assign them w/ -1 value, but I
> think the suggested approach above can detect the right "vf_id" for those
> and not break the current behavior on x86.
> > Let me know if you have other suggestions.
> 
> I didn't know that. Assuming that with these, there is exactly only one
> representor per device, I think we can manage, the main issue being that "-
> 1" will be difficult to parse as a valid "representor" argument which uses "-"
> for ranges.

The -1 value is not for the representor id, It is for the id of the entity which exists on the other size of the representor. 
The repesentor index is still 0, meaning the command line -w <pci_bdf>,representor=0 is correct on this case.

The problems comes from the assumption you do in your code about the representor id.
What you do currently is to receive the representors and qsort them by device name. then you assign the priv->rep_id based on the qsort output.
Later on when querying the if_name (mlx5_get_ifname) you assume that the phys_port_name of representor (which include the enumeration of what exists on its other side) is the same.

For x86 it probably works. On BlueField it breaks, as from some reason the phys_port_name is -1. 

My suggestion is to set the priv->rep_id based on the phys_port_name instead of qsort output. 

> 
> Anyway, I suggest to deal with Bluefield specifics in a subsequent series.
> This one focuses on and is validated with VF representors only.

It is related to VF representors only. BlueField is just an example. 

> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-27 13:32         ` Adrien Mazarguil
  2018-06-27 17:30           ` Xueming(Steven) Li
@ 2018-06-28  6:01           ` Shahaf Shuler
  2018-06-28  8:45             ` Adrien Mazarguil
  1 sibling, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-06-28  6:01 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: Xueming(Steven) Li, dev

Wednesday, June 27, 2018 4:32 PM, Adrien Mazarguil:
> Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> > Hi Adrien,
> >
> > Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> > >
> > > > -----Original Message-----
> > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > > Sent: Thursday, June 14, 2018 4:35 PM
> > > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > > Cc: dev@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > representors
> > > >
> > > > Probe existing port representors in addition to their master
> > > > device and
> > > associate them automatically.
> > > >
> > > > To avoid name collision between Ethernet devices, their names use
> > > > the same convention as ixgbe and i40e PMDs, that is, instead of
> > > > only a PCI
> > > address in DBDF notation:
> > > >
> > > > - "net_{DBDF}_0" for master/switch devices.
> >
> > This is breaking compatibility for application using the device names in
> order to attach them to the application (e.g. OVS-DPDK).
> > Before this patch the naming scheme for non-representor port is "{DBDF}".
> >
> > Can we preserve the compatibility and add appropriate suffix for the
> representor case?
> 
> There's one issue if representors are hot-plugged. The name of the master
> device, which happens to be that of the switch domain, cannot be updated.
> The form "net_{DBDF}_0" seems expected for PMDs that support
> representors (see ixgbe and i40e).
> 
> Now since representor hot-plugging is not supported yet, I guess we could
> postpone this problem by keeping the old format in the meantime, however
> ideally, these applications should not rely on it. The only safe assumption
> they can make is the uniqueness of any given name among ethdevs.
> 
> PCI bus addresses, if needed, should be retrieved by looking at the
> underlying bus object.

Am not sure I understand. Those application attach the device to the application based on its name, which happens to be the PCI address in case of mlx5. 

> 
> By the way, while thinking again about a past comment from Xueming [1],
> maybe it's finally time to remove support for multiple Verbs ports on mlx5
> after all. This should drop another unnecessary loop and the need for the
> unused "port %u" suffix at all while naming the device.
> 
> So how about the following plan for v3:
> 
> - Adding a patch that drops support for multiple Verbs ports (note for
>   Xueming, yes I changed my mind *again* :)

I am OK w/ that. 

> 
> - If you really think this will break OVS (please confirm),

It will. 

 then when no
>   "representor" parameter is provided (regardless of the presence of any
>   representors), name format will use the usual "{DBDF}" notation as you
>   suggested.
> 
> - Otherwise as soon as a "representor" is found on the command line, the
> new
>   format will be used, again regardless of the presence of any representors.
> 
> - In both cases, representors if any, will be named according to the format
>   specified in this patch.

Can we do the following?
In case representor is found the naming will be DBDF_representor_%d
In case no-representor naming will be DBDF

Just removing the net prefix.  

> 
> [1]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fma
> ils.dpdk.org%2Farchives%2Fdev%2F2018-
> June%2F104015.html&data=02%7C01%7Cshahafs%40mellanox.com%7C0037
> 6c6df5044ac9f8f908d5dc32778f%7Ca652971c7d2e4d9ba6a4d149256f461b%7C
> 0%7C0%7C636657031665047796&sdata=XXYtvW3J3i3Xzkn%2B8YBKYK1b2D6P
> 5eUiD2h4VqLUJD8%3D&reserved=0
> 
> <snip>
> > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > > -						 &attr, i + 1);
> > > > -		if (eth_list[i])
> > > > -			continue;
> > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > -		ret = rte_errno;
> > > > -		while (i--) {
> > > > -			mlx5_dev_close(eth_list[i]);
> > > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > > -				rte_free(eth_list[i]->data->dev_private);
> > > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > -		}
> > > > -		free(eth_list);
> > > > -		rte_errno = ret;
> > > > -		return NULL;
> > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > > vf,
> > > > +						 &attr, i + 1,
> > > > +						 j ? eth_list[0] : NULL,
> > > > +						 j - 1);
> >
> > The representor id is according to the sort made by qsort (based on device
> names).
> > A better way may be to set it according to the sysfs information, like you do
> in the mlx5_get_ifname function.
> > What do you think?
> 
> I agree that the current approach sucks, hence the big fat warnings I left
> around (see discussion with Xueming [2]). Problem is that the needed
> information is not yet known at this stage; there is no private structure to
> rely on to use mlx5_get_ifname() directly.
> 
> I'd also rather see these assumptions go in any case. I'll attempt to improve
> things for v3 in preparation of allowing representors to be probed on their
> own anytime, possibly even before the master device.
> 
> [2]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fma
> ils.dpdk.org%2Farchives%2Fdev%2F2018-
> June%2F104059.html&data=02%7C01%7Cshahafs%40mellanox.com%7C0037
> 6c6df5044ac9f8f908d5dc32778f%7Ca652971c7d2e4d9ba6a4d149256f461b%7C
> 0%7C0%7C636657031665047796&sdata=jWLFP6GMdQ6C88r1v%2BYZx7iKH3k
> ZDhVpgP4am9F11PU%3D&reserved=0
> 
> <snip>
> > > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> > > > index
> > > > 997b04a33..0fe467140 100644
> > > > --- a/drivers/net/mlx5/mlx5.h
> > > > +++ b/drivers/net/mlx5/mlx5.h
> > > > @@ -161,6 +161,10 @@ struct priv {
> > > >  	uint16_t mtu; /* Configured MTU. */
> > > >  	uint8_t port; /* Physical port number. */
> > > >  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> > > > +	unsigned int representor:1; /* Device is a port representor. */
> >
> > Why we need above flag? Why can't we use RTE_ETH_DEV_REPRESENTOR
> from eth_dev->data->dev_flags.
> 
> Problem is that this flag can only be set once the ethdev is fully instantiated
> and can't be relied on internally where needed (e.g. during clean up in error
> handling code). It's reported to applications but not used internally.
> 
> As a device property, it's actually pretty similar to the VF bit or offloaded
> capabilities where checking exposed information would be needlessly
> complex.
> 
> Now maybe it could be part of struct mlx5_dev_config as well. I initially
> assumed this object was only for user-provided parameters but looks like it's
> not the case. I intend to move it there for v3.
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-28  6:01           ` Shahaf Shuler
@ 2018-06-28  8:45             ` Adrien Mazarguil
  2018-06-28  9:06               ` Shahaf Shuler
  0 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-28  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: Xueming(Steven) Li, dev

On Thu, Jun 28, 2018 at 06:01:54AM +0000, Shahaf Shuler wrote:
> Wednesday, June 27, 2018 4:32 PM, Adrien Mazarguil:
> > Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> > 
> > On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> > > Hi Adrien,
> > >
> > > Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > representors
> > > >
> > > > > -----Original Message-----
> > > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > > > Sent: Thursday, June 14, 2018 4:35 PM
> > > > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > > > Cc: dev@dpdk.org
> > > > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > > representors
> > > > >
> > > > > Probe existing port representors in addition to their master
> > > > > device and
> > > > associate them automatically.
> > > > >
> > > > > To avoid name collision between Ethernet devices, their names use
> > > > > the same convention as ixgbe and i40e PMDs, that is, instead of
> > > > > only a PCI
> > > > address in DBDF notation:
> > > > >
> > > > > - "net_{DBDF}_0" for master/switch devices.
> > >
> > > This is breaking compatibility for application using the device names in
> > order to attach them to the application (e.g. OVS-DPDK).
> > > Before this patch the naming scheme for non-representor port is "{DBDF}".
> > >
> > > Can we preserve the compatibility and add appropriate suffix for the
> > representor case?
> > 
> > There's one issue if representors are hot-plugged. The name of the master
> > device, which happens to be that of the switch domain, cannot be updated.
> > The form "net_{DBDF}_0" seems expected for PMDs that support
> > representors (see ixgbe and i40e).
> > 
> > Now since representor hot-plugging is not supported yet, I guess we could
> > postpone this problem by keeping the old format in the meantime, however
> > ideally, these applications should not rely on it. The only safe assumption
> > they can make is the uniqueness of any given name among ethdevs.
> > 
> > PCI bus addresses, if needed, should be retrieved by looking at the
> > underlying bus object.
> 
> Am not sure I understand. Those application attach the device to the application based on its name, which happens to be the PCI address in case of mlx5. 

I'm only saying it's not future-proof seeing what happens when they rely on
it. Moreover this forces them to convert the name to some binary form
instead of e.g. simply checking RTE_DEV_TO_PCI(dev->device)->addr where
needed and only use the name as some kind of opaque identifier.

> > By the way, while thinking again about a past comment from Xueming [1],
> > maybe it's finally time to remove support for multiple Verbs ports on mlx5
> > after all. This should drop another unnecessary loop and the need for the
> > unused "port %u" suffix at all while naming the device.
> > 
> > So how about the following plan for v3:
> > 
> > - Adding a patch that drops support for multiple Verbs ports (note for
> >   Xueming, yes I changed my mind *again* :)
> 
> I am OK w/ that. 
> 
> > 
> > - If you really think this will break OVS (please confirm),
> 
> It will. 

Out of curiosity, can you point me to the relevant code in OVS? Maybe
something can be done on their side.

Either ixgbe and i40e are unaffected by the very same change, or they also
break OVS, in which case there's an issue we need to solve with the
representor interface in DPDK before it's too late.

>  then when no
> >   "representor" parameter is provided (regardless of the presence of any
> >   representors), name format will use the usual "{DBDF}" notation as you
> >   suggested.
> > 
> > - Otherwise as soon as a "representor" is found on the command line, the
> > new
> >   format will be used, again regardless of the presence of any representors.
> > 
> > - In both cases, representors if any, will be named according to the format
> >   specified in this patch.
> 
> Can we do the following?
> In case representor is found the naming will be DBDF_representor_%d
> In case no-representor naming will be DBDF
> 
> Just removing the net prefix.  

Yes, I'll remove it. We'll standardize on the naming used for ixgbe/i40e
only once the above concerns are addressed.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-28  8:45             ` Adrien Mazarguil
@ 2018-06-28  9:06               ` Shahaf Shuler
  0 siblings, 0 replies; 100+ messages in thread
From: Shahaf Shuler @ 2018-06-28  9:06 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: Xueming(Steven) Li, dev

Thursday, June 28, 2018 11:46 AM, Adrien Mazarguil:
> Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> On Thu, Jun 28, 2018 at 06:01:54AM +0000, Shahaf Shuler wrote:
> > Wednesday, June 27, 2018 4:32 PM, Adrien Mazarguil:
> > > Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> > >
> > > On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> > > > Hi Adrien,
> > > >
> > > > Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > > > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > > representors
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien
> Mazarguil
> > > > > > Sent: Thursday, June 14, 2018 4:35 PM
> > > > > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > > > > Cc: dev@dpdk.org
> > > > > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > > > representors
> > > > > >
> > > > > > Probe existing port representors in addition to their master
> > > > > > device and
> > > > > associate them automatically.
> > > > > >
> > > > > > To avoid name collision between Ethernet devices, their names
> > > > > > use the same convention as ixgbe and i40e PMDs, that is,
> > > > > > instead of only a PCI
> > > > > address in DBDF notation:
> > > > > >
> > > > > > - "net_{DBDF}_0" for master/switch devices.
> > > >
> > > > This is breaking compatibility for application using the device
> > > > names in
> > > order to attach them to the application (e.g. OVS-DPDK).
> > > > Before this patch the naming scheme for non-representor port is
> "{DBDF}".
> > > >
> > > > Can we preserve the compatibility and add appropriate suffix for
> > > > the
> > > representor case?
> > >
> > > There's one issue if representors are hot-plugged. The name of the
> > > master device, which happens to be that of the switch domain, cannot be
> updated.
> > > The form "net_{DBDF}_0" seems expected for PMDs that support
> > > representors (see ixgbe and i40e).
> > >
> > > Now since representor hot-plugging is not supported yet, I guess we
> > > could postpone this problem by keeping the old format in the
> > > meantime, however ideally, these applications should not rely on it.
> > > The only safe assumption they can make is the uniqueness of any given
> name among ethdevs.
> > >
> > > PCI bus addresses, if needed, should be retrieved by looking at the
> > > underlying bus object.
> >
> > Am not sure I understand. Those application attach the device to the
> application based on its name, which happens to be the PCI address in case
> of mlx5.
> 
> I'm only saying it's not future-proof seeing what happens when they rely on
> it. Moreover this forces them to convert the name to some binary form
> instead of e.g. simply checking RTE_DEV_TO_PCI(dev->device)->addr where
> needed and only use the name as some kind of opaque identifier.
> 
> > > By the way, while thinking again about a past comment from Xueming
> > > [1], maybe it's finally time to remove support for multiple Verbs
> > > ports on mlx5 after all. This should drop another unnecessary loop
> > > and the need for the unused "port %u" suffix at all while naming the
> device.
> > >
> > > So how about the following plan for v3:
> > >
> > > - Adding a patch that drops support for multiple Verbs ports (note for
> > >   Xueming, yes I changed my mind *again* :)
> >
> > I am OK w/ that.
> >
> > >
> > > - If you really think this will break OVS (please confirm),
> >
> > It will.
> 
> Out of curiosity, can you point me to the relevant code in OVS? Maybe
> something can be done on their side.

For example the command to add new port to the ovs bridge is done on the following syntax
ovs-vsctl add-port ovsbr0 dpdk0 \        
    -- set interface dpdk0 type=dpdk \   
    options:dpdk-devargs="0000:81:00.0" \
    ofport_request=1                    

OVS users/automation are using the PCI address for Mellanox PMDs. w/ your change they will need to use net_0000:81:00.0 to attach the port. 

> 
> Either ixgbe and i40e are unaffected by the very same change, or they also
> break OVS, in which case there's an issue we need to solve with the
> representor interface in DPDK before it's too late.
> 
> >  then when no
> > >   "representor" parameter is provided (regardless of the presence of any
> > >   representors), name format will use the usual "{DBDF}" notation as you
> > >   suggested.
> > >
> > > - Otherwise as soon as a "representor" is found on the command line,
> > > the new
> > >   format will be used, again regardless of the presence of any
> representors.
> > >
> > > - In both cases, representors if any, will be named according to the
> format
> > >   specified in this patch.
> >
> > Can we do the following?
> > In case representor is found the naming will be DBDF_representor_%d In
> > case no-representor naming will be DBDF
> >
> > Just removing the net prefix.
> 
> Yes, I'll remove it. We'll standardize on the naming used for ixgbe/i40e only
> once the above concerns are addressed.
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
  2018-06-28  5:57             ` Shahaf Shuler
@ 2018-06-28  9:13               ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-06-28  9:13 UTC (permalink / raw)
  To: Shahaf Shuler
  Cc: Xueming(Steven) Li, dev, Guillaume Gaudonville, Wisam Monther,
	Raslan Darawsheh, Olga Shern

On Thu, Jun 28, 2018 at 05:57:03AM +0000, Shahaf Shuler wrote:
> Wednesday, June 27, 2018 4:33 PM, Adrien Mazarguil:
> > Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> > 
> > On Sun, Jun 24, 2018 at 01:33:31PM +0000, Shahaf Shuler wrote:
> > > One more input,
> > >
> > > Sunday, June 17, 2018 1:15 PM, Shahaf Shuler:
> > > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > representors
> > >
> > > [...]
> > >
> > > > > > +	eth_list = tmp;
> > > > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev,
> > ibv_dev, vf,
> > > > > > -						 &attr, i + 1);
> > > > > > -		if (eth_list[i])
> > > > > > -			continue;
> > > > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > > > -		ret = rte_errno;
> > > > > > -		while (i--) {
> > > > > > -			mlx5_dev_close(eth_list[i]);
> > > > > > -			if (rte_eal_process_type() ==
> > RTE_PROC_PRIMARY)
> > > > > > -				rte_free(eth_list[i]->data-
> > >dev_private);
> > > > > > -
> > 	claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > > > -		}
> > > > > > -		free(eth_list);
> > > > > > -		rte_errno = ret;
> > > > > > -		return NULL;
> > > > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev,
> > ibv_dev[j],
> > > > > vf,
> > > > > > +						 &attr, i + 1,
> > > > > > +						 j ? eth_list[0] : NULL,
> > > > > > +						 j - 1);
> > > >
> > > > The representor id is according to the sort made by qsort (based on
> > > > device names).
> > > > A better way may be to set it according to the sysfs information,
> > > > like you do in the mlx5_get_ifname function.
> > > > What do you think?
> > >
> > > In fact relaying on linear increasing port numbers is dangerous. In may
> > break on special scenarios like BlueField.
> > > In BlueField there are representors between the x86 and the ARM cores.
> > Those are not VF representors. The phys_port_name of those is -1 and each
> > of them belongs to different phys_switch_id.
> > >
> > > We can argue whether it is correct/not to assign them w/ -1 value, but I
> > think the suggested approach above can detect the right "vf_id" for those
> > and not break the current behavior on x86.
> > > Let me know if you have other suggestions.
> > 
> > I didn't know that. Assuming that with these, there is exactly only one
> > representor per device, I think we can manage, the main issue being that "-
> > 1" will be difficult to parse as a valid "representor" argument which uses "-"
> > for ranges.
> 
> The -1 value is not for the representor id, It is for the id of the entity which exists on the other size of the representor. 
> The repesentor index is still 0, meaning the command line -w <pci_bdf>,representor=0 is correct on this case.
> 
> The problems comes from the assumption you do in your code about the representor id.
> What you do currently is to receive the representors and qsort them by device name. then you assign the priv->rep_id based on the qsort output.
> Later on when querying the if_name (mlx5_get_ifname) you assume that the phys_port_name of representor (which include the enumeration of what exists on its other side) is the same.
> 
> For x86 it probably works. On BlueField it breaks, as from some reason the phys_port_name is -1. 
> 
> My suggestion is to set the priv->rep_id based on the phys_port_name instead of qsort output. 

Yes, understood. The only drawback using this approach is that mlx5 devices
won't be usable at all if no netdevice can be associated with them (e.g. in
case it was moved to another netns). Currently all matching IB devs are
probed regardless, except they are handled as normal devices when the PMD
can't determine whether it's dealing with a representor.

> > Anyway, I suggest to deal with Bluefield specifics in a subsequent series.
> > This one focuses on and is validated with VF representors only.
> 
> It is related to VF representors only. BlueField is just an example. 

By "BlueField specifics", I mean the translation of -1 to 0 which so far is
specific to BlueField. Another patch is needed for that.

For devices where representors are properly numbered starting from 0, we must
rely on the uninterpreted phys_port_name value directly, which must be a
positive integer instead of a qsort() interpretation in order to properly
handle holes in the sequence due to missing devices (netns).

I intend to modify this patch as described.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support
  2018-06-14  8:34 ` [dpdk-dev] [PATCH v2 0/7] net/mlx5: add port representor support Adrien Mazarguil
                     ` (6 preceding siblings ...)
  2018-06-14  8:35   ` [dpdk-dev] [PATCH v2 7/7] net/mlx5: add parameter for " Adrien Mazarguil
@ 2018-07-04 17:27   ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
                       ` (10 more replies)
  7 siblings, 11 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This series adds support for port (VF) representors to the mlx5 PMD, which
can be instantiated using the standard "representor" device parameter.

Note the PMD only probes existing representors which exist as Verbs devices;
their creation is part of the host system configuration.

v3 changes:

- Added the following patches:
  - net/mlx5: drop useless support for several Verbs ports
  - net/mlx5: probe port representors in natural order
  - net/mlx5: support negative identifiers for port representors
- See individual patches for details.
- Rebased series.

v2 changes:

- See individual patches for details.
- Rebased series.

Adrien Mazarguil (10):
  net/mlx5: rename confusing object in probe code
  net/mlx5: remove redundant objects in probe code
  net/mlx5: drop useless support for several Verbs ports
  net/mlx5: split PCI from generic probing code
  net/mlx5: re-indent generic probing function
  net/mlx5: add port representor awareness
  net/mlx5: probe all port representors
  net/mlx5: probe port representors in natural order
  net/mlx5: add parameter for port representors
  net/mlx5: support negative identifiers for port representors

 doc/guides/nics/mlx5.rst                |   12 +
 doc/guides/prog_guide/poll_mode_drv.rst |    2 +
 drivers/net/mlx5/Makefile               |   30 +
 drivers/net/mlx5/mlx5.c                 | 1085 +++++++++++++++-----------
 drivers/net/mlx5/mlx5.h                 |   29 +-
 drivers/net/mlx5/mlx5_ethdev.c          |  137 +++-
 drivers/net/mlx5/mlx5_mac.c             |    2 +-
 drivers/net/mlx5/mlx5_nl.c              |  297 ++++++-
 drivers/net/mlx5/mlx5_stats.c           |    6 +-
 drivers/net/mlx5/mlx5_txq.c             |    2 +-
 10 files changed, 1127 insertions(+), 475 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 01/10] net/mlx5: rename confusing object in probe code
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 02/10] net/mlx5: remove redundant objects " Adrien Mazarguil
                       ` (9 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

There are several attribute objects in this function:

- IB device attributes (struct ibv_device_attr_ex device_attr).
- Direct Verbs attributes (struct mlx5dv_context attrs_out).
- Port attributes (struct ibv_port_attr).
- IB device attributes again (struct ibv_device_attr_ex device_attr_ex).

"attrs_out" is both odd and initialized using a nonstandard syntax. Rename
it "dv_attr" for consistency.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
--
v2 changes:

- Fixed ctx -> attr_ctx in mlx5_pci_probe().
---
 drivers/net/mlx5/mlx5.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d081bdd05..22cbce8d5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -654,6 +654,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
 	struct ibv_context *attr_ctx = NULL;
 	struct ibv_device_attr_ex device_attr;
@@ -670,7 +671,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
 	int i;
-	struct mlx5dv_context attrs_out = {0};
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
@@ -736,21 +736,21 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	ibv_dev = list[i];
 	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
 	/*
 	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
 	 * as all ConnectX-5 devices.
 	 */
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
-	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
-		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
+	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
 			mps = MLX5_MPW_ENHANCED;
 		} else {
@@ -762,14 +762,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		mps = MLX5_MPW_DISABLED;
 	}
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
-		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
+		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
-			attrs_out.striding_rq_caps;
+			dv_attr.striding_rq_caps;
 
 		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
 			mprq_caps.min_single_stride_log_num_of_bytes);
@@ -794,15 +794,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
-	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
+	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
-		tunnel_en = ((attrs_out.tunnel_offloads_caps &
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
+		tunnel_en = ((dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
-			     (attrs_out.tunnel_offloads_caps &
+			     (dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
 	}
 	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
@@ -812,9 +812,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
-	mpls_en = ((attrs_out.tunnel_offloads_caps &
+	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
-		   (attrs_out.tunnel_offloads_caps &
+		   (dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
 	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
 		mpls_en ? "" : "not ");
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 02/10] net/mlx5: remove redundant objects in probe code
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 03/10] net/mlx5: drop useless support for several Verbs ports Adrien Mazarguil
                       ` (8 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This patch gets rid of redundant calls to open the device and query its
attributes in order to simplify the code.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
--
v2 changes:

- Minor indent fix on existing code.
---
 drivers/net/mlx5/mlx5.c | 64 +++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 22cbce8d5..4e7f29f5b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -654,10 +654,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct ibv_context *ctx = NULL;
+	struct ibv_device_attr_ex attr;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
-	struct ibv_context *attr_ctx = NULL;
-	struct ibv_device_attr_ex device_attr;
 	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
@@ -714,12 +714,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
 		      (pci_dev->id.device_id ==
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		attr_ctx = mlx5_glue->open_device(list[i]);
+		ctx = mlx5_glue->open_device(list[i]);
 		rte_errno = errno;
 		err = rte_errno;
 		break;
 	}
-	if (attr_ctx == NULL) {
+	if (ctx == NULL) {
 		switch (err) {
 		case 0:
 			DRV_LOG(ERR,
@@ -748,7 +748,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
+	mlx5_glue->dv_query_device(ctx, &dv_attr);
 	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
 		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
@@ -822,23 +822,20 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
+	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	if (err) {
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected",
-		device_attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
 		char name[RTE_ETH_NAME_MAX_LEN];
 		int len;
 		uint32_t port = i + 1; /* ports are indexed from one */
-		struct ibv_context *ctx = NULL;
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
 		struct rte_eth_dev *eth_dev = NULL;
-		struct ibv_device_attr_ex device_attr_ex;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -865,7 +862,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
 			 pci_dev->addr.domain, pci_dev->addr.bus,
 			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (device_attr.orig_attr.phys_port_cnt > 1)
+		if (attr.orig_attr.phys_port_cnt > 1)
 			snprintf(name + len, sizeof(name), " port %u", i);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
@@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			continue;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		ctx = mlx5_glue->open_device(ibv_dev);
+		if (!ctx)
+			ctx = mlx5_glue->open_device(ibv_dev);
 		if (ctx == NULL) {
 			err = ENODEV;
 			goto port_error;
@@ -949,7 +947,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = device_attr;
+		priv->device_attr = attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
@@ -960,17 +958,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				strerror(rte_errno));
 			goto port_error;
 		}
-		err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
-		if (err) {
-			DRV_LOG(ERR, "ibv_query_device_ex() failed");
-			goto port_error;
-		}
-		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
+		config.hw_csum = !!(attr.device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!(device_attr.max_counter_sets);
+		config.flow_counter_en = !!attr.max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -978,7 +971,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
+			attr.rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -986,29 +979,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
+		config.hw_padding = !!attr.rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
-			      (device_attr_ex.tso_caps.supported_qpts &
-			      (1 << IBV_QPT_RAW_PACKET)));
+		config.tso = (attr.tso_caps.max_tso > 0 &&
+			      (attr.tso_caps.supported_qpts &
+			       (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz =
-					device_attr_ex.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr.tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1170,14 +1162,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
+		/*
+		 * Each eth_dev instance is assigned its own Verbs context,
+		 * since this one is consumed, let the next iteration open
+		 * another.
+		 */
+		ctx = NULL;
 		continue;
 port_error:
 		if (priv)
 			rte_free(priv);
 		if (pd)
 			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (ctx)
-			claim_zero(mlx5_glue->close_device(ctx));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
 		break;
@@ -1189,8 +1185,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	 * way to enumerate the registered ethdevs to free the previous ones.
 	 */
 error:
-	if (attr_ctx)
-		claim_zero(mlx5_glue->close_device(attr_ctx));
+	if (ctx)
+		claim_zero(mlx5_glue->close_device(ctx));
 	if (list)
 		mlx5_glue->free_device_list(list);
 	if (err) {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 03/10] net/mlx5: drop useless support for several Verbs ports
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 02/10] net/mlx5: remove redundant objects " Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 04/10] net/mlx5: split PCI from generic probing code Adrien Mazarguil
                       ` (7 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Xueming Li

Unlike mlx4 from which this capability was inherited, mlx5 devices expose
exactly one Verbs port per PCI bus address. Each physical port gets
assigned its own bus address with a single Verbs port.

While harmless, this code requires an extra loop that would get in the way
of subsequent refactoring.

No functional impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
Cc: Xueming Li <xuemingl@mellanox.com>
--
v3 changes:

This patch was not present in prior revisions. As discussed [1], it was
added after finally deciding to remove this support.

[1] https://mails.dpdk.org/archives/dev/2018-June/105661.html
---
 drivers/net/mlx5/mlx5.c        | 96 +++++++++++++------------------------
 drivers/net/mlx5/mlx5.h        |  1 -
 drivers/net/mlx5/mlx5_ethdev.c |  2 +-
 drivers/net/mlx5/mlx5_txq.c    |  2 +-
 4 files changed, 34 insertions(+), 67 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 4e7f29f5b..717d8b268 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -652,11 +652,13 @@ static int
 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	       struct rte_pci_device *pci_dev)
 {
-	struct ibv_device **list = NULL;
-	struct ibv_device *ibv_dev;
+	struct ibv_device **list;
 	struct ibv_context *ctx = NULL;
 	struct ibv_device_attr_ex attr;
+	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct rte_eth_dev *eth_dev = NULL;
+	struct priv *priv = NULL;
 	int err = 0;
 	unsigned int vf = 0;
 	unsigned int mps;
@@ -719,6 +721,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		err = rte_errno;
 		break;
 	}
+	mlx5_glue->free_device_list(list);
 	if (ctx == NULL) {
 		switch (err) {
 		case 0:
@@ -733,7 +736,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		}
 		goto error;
 	}
-	ibv_dev = list[i];
 	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
@@ -827,15 +829,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+	{
 		char name[RTE_ETH_NAME_MAX_LEN];
-		int len;
-		uint32_t port = i + 1; /* ports are indexed from one */
 		struct ibv_port_attr port_attr;
-		struct ibv_pd *pd = NULL;
-		struct priv *priv = NULL;
-		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -859,11 +855,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
+		snprintf(name, sizeof(name), PCI_PRI_FMT,
 			 pci_dev->addr.domain, pci_dev->addr.bus,
 			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (attr.orig_attr.phys_port_cnt > 1)
-			snprintf(name + len, sizeof(name), " port %u", i);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -901,31 +895,22 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
 			rte_eth_dev_probing_finish(eth_dev);
-			continue;
-		}
-		DRV_LOG(DEBUG, "using port %u", port);
-		if (!ctx)
-			ctx = mlx5_glue->open_device(ibv_dev);
-		if (ctx == NULL) {
-			err = ENODEV;
-			goto port_error;
+			claim_zero(mlx5_glue->close_device(ctx));
+			return 0;
 		}
 		/* Check port status. */
-		err = mlx5_glue->query_port(ctx, port, &port_attr);
+		err = mlx5_glue->query_port(ctx, 1, &port_attr);
 		if (err) {
 			DRV_LOG(ERR, "port query failed: %s", strerror(err));
-			goto port_error;
+			goto error;
 		}
 		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-			DRV_LOG(ERR,
-				"port %d is not configured in Ethernet mode",
-				port);
+			DRV_LOG(ERR, "port is not configured in Ethernet mode");
 			err = EINVAL;
-			goto port_error;
+			goto error;
 		}
 		if (port_attr.state != IBV_PORT_ACTIVE)
-			DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
-				port,
+			DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
 				mlx5_glue->port_state_str(port_attr.state),
 				port_attr.state);
 		/* Allocate protection domain. */
@@ -933,7 +918,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (pd == NULL) {
 			DRV_LOG(ERR, "PD allocation failure");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		/* from rte_ethdev.c */
 		priv = rte_zmalloc("ethdev private structure",
@@ -942,13 +927,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (priv == NULL) {
 			DRV_LOG(ERR, "priv allocation failure");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
 		priv->device_attr = attr;
-		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
 		err = mlx5_args(&config, pci_dev->device.devargs);
@@ -956,7 +940,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
 				strerror(rte_errno));
-			goto port_error;
+			goto error;
 		}
 		config.hw_csum = !!(attr.device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
@@ -1006,7 +990,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				"multi-packet send not supported on this device"
 				" (" MLX5_TXQ_MPW_EN ")");
 			err = ENOTSUP;
-			goto port_error;
+			goto error;
 		}
 		DRV_LOG(INFO, "%s MPS is %s",
 			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
@@ -1038,7 +1022,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (eth_dev == NULL) {
 			DRV_LOG(ERR, "can not allocate rte ethdev");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
@@ -1049,7 +1033,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
 			err = rte_errno;
-			goto port_error;
+			goto error;
 		}
 		/* Configure the first MAC address by default. */
 		if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
@@ -1058,7 +1042,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				" loaded? (errno: %s)",
 				eth_dev->data->port_id, strerror(rte_errno));
 			err = ENODEV;
-			goto port_error;
+			goto error;
 		}
 		DRV_LOG(INFO,
 			"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
@@ -1082,7 +1066,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		err = mlx5_get_mtu(eth_dev, &priv->mtu);
 		if (err) {
 			err = rte_errno;
-			goto port_error;
+			goto error;
 		}
 		DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
 			priv->mtu);
@@ -1131,7 +1115,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
 				eth_dev->data->port_id, strerror(rte_errno));
 			err = rte_errno;
-			goto port_error;
+			goto error;
 		}
 		/* Supported Verbs flow priority number detection. */
 		if (verb_priorities == 0)
@@ -1140,7 +1124,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
 				eth_dev->data->port_id, verb_priorities);
 			err = ENOTSUP;
-			goto port_error;
+			goto error;
 		}
 		priv->config.max_verbs_prio = verb_priorities;
 		/*
@@ -1154,7 +1138,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 					 eth_dev->device->numa_node);
 		if (err) {
 			err = rte_errno;
-			goto port_error;
+			goto error;
 		}
 		/* Add device to memory callback list. */
 		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
@@ -1162,33 +1146,17 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
-		/*
-		 * Each eth_dev instance is assigned its own Verbs context,
-		 * since this one is consumed, let the next iteration open
-		 * another.
-		 */
-		ctx = NULL;
-		continue;
-port_error:
-		if (priv)
-			rte_free(priv);
-		if (pd)
-			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
-			rte_eth_dev_release_port(eth_dev);
-		break;
+		return 0;
 	}
-	/*
-	 * XXX if something went wrong in the loop above, there is a resource
-	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
-	 * long as the dpdk does not provide a way to deallocate a ethdev and a
-	 * way to enumerate the registered ethdevs to free the previous ones.
-	 */
 error:
+	if (priv)
+		rte_free(priv);
+	if (pd)
+		claim_zero(mlx5_glue->dealloc_pd(pd));
+	if (eth_dev)
+		rte_eth_dev_release_port(eth_dev);
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
-	if (list)
-		mlx5_glue->free_device_list(list);
 	if (err) {
 		rte_errno = err;
 		return -rte_errno;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 997b04a33..f55ff4a21 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -159,7 +159,6 @@ struct priv {
 	unsigned int vlan_filter_n; /* Number of configured VLAN filters. */
 	/* Device properties. */
 	uint16_t mtu; /* Configured MTU. */
-	uint8_t port; /* Physical port number. */
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index ebe5cb6e3..819f5baad 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -166,7 +166,7 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 		if (dev_port == dev_port_prev)
 			goto try_dev_id;
 		dev_port_prev = dev_port;
-		if (dev_port == (priv->port - 1u))
+		if (dev_port == 0)
 			strlcpy(match, name, sizeof(match));
 	}
 	closedir(dir);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 669b91319..5057561ae 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -434,7 +434,7 @@ mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 		/* Move the QP to this state. */
 		.qp_state = IBV_QPS_INIT,
 		/* Primary port number. */
-		.port_num = priv->port
+		.port_num = 1,
 	};
 	ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod,
 				   (IBV_QP_STATE | IBV_QP_PORT));
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 04/10] net/mlx5: split PCI from generic probing code
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
                       ` (2 preceding siblings ...)
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 03/10] net/mlx5: drop useless support for several Verbs ports Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 05/10] net/mlx5: re-indent generic probing function Adrien Mazarguil
                       ` (6 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

All the generic probing code needs is an IB device. While this device is
currently supplied by a PCI lookup, other methods will be added soon.

This patch divides the original function, which has become huge over time,
as follows:

1. PCI-specific (mlx5_pci_probe()).
2. Verbs device (mlx5_dev_spawn()).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
--
v3 changes:

- Moved VF device check within mlx5_pci_probe() after identifying the
  device instead of before that.
- Merged mlx5_dev_spawn_one() with mlx5_dev_spawn() since there is no need
  anymore for an intermediate function to iterate over Verbs ports.

v2 changes:

- Fixed device naming. A port suffix is now appended only if several IB
  ports happen to be detected.
- Added separate message to distinguish missing kernel drivers from other
  initialization errors, as it was confusing.
---
 drivers/net/mlx5/mlx5.c | 195 ++++++++++++++++++++++++-------------------
 1 file changed, 109 insertions(+), 86 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 717d8b268..8916d4684 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -36,6 +36,7 @@
 #include <rte_kvargs.h>
 #include <rte_rwlock.h>
 #include <rte_spinlock.h>
+#include <rte_string_fns.h>
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -635,32 +636,31 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 }
 
 /**
- * DPDK callback to register a PCI device.
- *
- * This function creates an Ethernet device for each port of a given
- * PCI device.
+ * Spawn an Ethernet device from Verbs information.
  *
- * @param[in] pci_drv
- *   PCI driver structure (mlx5_driver).
- * @param[in] pci_dev
- *   PCI device information.
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
  *
  * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
+ *   A valid Ethernet device object on success, NULL otherwise and rte_errno
+ *   is set.
  */
-static int
-mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
-	       struct rte_pci_device *pci_dev)
+static struct rte_eth_dev *
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+	       struct ibv_device *ibv_dev,
+	       int vf)
 {
-	struct ibv_device **list;
-	struct ibv_context *ctx = NULL;
+	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
 	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	struct rte_eth_dev *eth_dev = NULL;
 	struct priv *priv = NULL;
 	int err = 0;
-	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
 	unsigned int tunnel_en = 0;
@@ -672,71 +672,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_max_stride_size_n = 0;
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
-	int i;
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
-	assert(pci_drv == &mlx5_driver);
-	list = mlx5_glue->get_device_list(&i);
-	if (list == NULL) {
-		assert(errno);
-		err = errno;
-		if (errno == ENOSYS)
-			DRV_LOG(ERR,
-				"cannot list devices, is ib_uverbs loaded?");
-		goto error;
-	}
-	assert(i >= 0);
-	/*
-	 * For each listed device, check related sysfs entry against
-	 * the provided PCI ID.
-	 */
-	while (i != 0) {
-		struct rte_pci_addr pci_addr;
-
-		--i;
-		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
-		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
-			continue;
-		if ((pci_dev->addr.domain != pci_addr.domain) ||
-		    (pci_dev->addr.bus != pci_addr.bus) ||
-		    (pci_dev->addr.devid != pci_addr.devid) ||
-		    (pci_dev->addr.function != pci_addr.function))
-			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
-			list[i]->name);
-		vf = ((pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		ctx = mlx5_glue->open_device(list[i]);
-		rte_errno = errno;
-		err = rte_errno;
-		break;
-	}
-	mlx5_glue->free_device_list(list);
-	if (ctx == NULL) {
-		switch (err) {
-		case 0:
-			DRV_LOG(ERR,
-				"cannot access device, is mlx5_ib loaded?");
-			err = ENODEV;
-			break;
-		case EINVAL:
-			DRV_LOG(ERR,
-				"cannot use device, are drivers up to date?");
-			break;
-		}
-		goto error;
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		return NULL;
 	}
-	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -855,9 +802,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		snprintf(name, sizeof(name), PCI_PRI_FMT,
-			 pci_dev->addr.domain, pci_dev->addr.bus,
-			 pci_dev->addr.devid, pci_dev->addr.function);
+		rte_strlcpy(name, dpdk_dev->name, sizeof(name));
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -866,7 +811,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				err = rte_errno;
 				goto error;
 			}
-			eth_dev->device = &pci_dev->device;
+			eth_dev->device = dpdk_dev;
 			eth_dev->dev_ops = &mlx5_dev_sec_ops;
 			err = mlx5_uar_init_secondary(eth_dev);
 			if (err) {
@@ -894,9 +839,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				mlx5_select_rx_function(eth_dev);
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
-			rte_eth_dev_probing_finish(eth_dev);
 			claim_zero(mlx5_glue->close_device(ctx));
-			return 0;
+			return eth_dev;
 		}
 		/* Check port status. */
 		err = mlx5_glue->query_port(ctx, 1, &port_attr);
@@ -935,7 +879,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->device_attr = attr;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, pci_dev->device.devargs);
+		err = mlx5_args(&config, dpdk_dev->devargs);
 		if (err) {
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
@@ -1027,8 +971,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
 		eth_dev->data->mac_addrs = priv->mac;
-		eth_dev->device = &pci_dev->device;
-		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		eth_dev->device = dpdk_dev;
 		eth_dev->device->driver = &mlx5_driver.driver;
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
@@ -1146,7 +1089,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
-		return 0;
+		return eth_dev;
 	}
 error:
 	if (priv)
@@ -1157,11 +1100,91 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		rte_eth_dev_release_port(eth_dev);
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
-	if (err) {
-		rte_errno = err;
+	assert(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function spawns an Ethernet device out of a given PCI device.
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+	       struct rte_pci_device *pci_dev)
+{
+	struct ibv_device **ibv_list;
+	struct rte_eth_dev *eth_dev = NULL;
+	int vf;
+	int ret;
+
+	assert(pci_drv == &mlx5_driver);
+	errno = 0;
+	ibv_list = mlx5_glue->get_device_list(&ret);
+	if (!ibv_list) {
+		rte_errno = errno ? errno : ENOSYS;
+		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
-	return 0;
+	while (ret-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
+			continue;
+		if (pci_dev->addr.domain != pci_addr.domain ||
+		    pci_dev->addr.bus != pci_addr.bus ||
+		    pci_dev->addr.devid != pci_addr.devid ||
+		    pci_dev->addr.function != pci_addr.function)
+			continue;
+		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+			ibv_list[ret]->name);
+		break;
+	}
+	switch (pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+		vf = 1;
+		break;
+	default:
+		vf = 0;
+	}
+	if (ret >= 0)
+		eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	mlx5_glue->free_device_list(ibv_list);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"no Verbs device matches PCI device " PCI_PRI_FMT ","
+			" are kernel drivers loaded?",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+		rte_errno = ENOENT;
+		ret = -rte_errno;
+	} else if (!eth_dev) {
+		DRV_LOG(ERR,
+			"probe of PCI device " PCI_PRI_FMT " aborted after"
+			" encountering an error: %s",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function,
+			strerror(rte_errno));
+		ret = -rte_errno;
+	} else {
+		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		rte_eth_dev_probing_finish(eth_dev);
+		ret = 0;
+	}
+	return ret;
 }
 
 static const struct rte_pci_id mlx5_pci_id_map[] = {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 05/10] net/mlx5: re-indent generic probing function
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
                       ` (3 preceding siblings ...)
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 04/10] net/mlx5: split PCI from generic probing code Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 06/10] net/mlx5: add port representor awareness Adrien Mazarguil
                       ` (5 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Since commit "net/mlx5: drop useless support for several Verbs ports"
removed an inner loop, mlx5_dev_spawn() is left with an unnecessary indent
level.

This patch eliminates a block, moves its local variables to function scope,
and re-indents its contents (diff best viewed with --ignore-all-space).

No functional impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming(Steven) Li <xuemingl@mellanox.com>
--
v3 changes:

- Reworded commit log since original patch was modified. This patch is also
  much shorter as a consequence.
---
 drivers/net/mlx5/mlx5.c | 578 +++++++++++++++++++++----------------------
 1 file changed, 282 insertions(+), 296 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 8916d4684..1054bf6d0 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -656,8 +656,25 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 {
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
+	struct ibv_port_attr port_attr;
 	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct mlx5_dev_config config = {
+		.vf = !!vf,
+		.tx_vec_en = 1,
+		.rx_vec_en = 1,
+		.mpw_hdr_dseg = 0,
+		.txq_inline = MLX5_ARG_UNSET,
+		.txqs_inline = MLX5_ARG_UNSET,
+		.inline_max_packet_sz = MLX5_ARG_UNSET,
+		.vf_nl_en = 1,
+		.mprq = {
+			.enabled = 0,
+			.stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
+			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
+			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
+		},
+	};
 	struct rte_eth_dev *eth_dev = NULL;
 	struct priv *priv = NULL;
 	int err = 0;
@@ -675,6 +692,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
+	struct ether_addr mac;
+	char name[RTE_ETH_NAME_MAX_LEN];
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
@@ -710,11 +729,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		DRV_LOG(DEBUG, "MPW isn't supported");
 		mps = MLX5_MPW_DISABLED;
 	}
+	config.mps = mps;
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
 		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
+	config.swp = !!swp;
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
@@ -740,6 +761,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			mprq_caps.min_single_wqe_log_num_of_strides;
 		mprq_max_stride_num_n =
 			mprq_caps.max_single_wqe_log_num_of_strides;
+		config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+						   mprq_min_stride_num_n);
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
@@ -747,6 +770,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
+	config.cqe_comp = cqe_comp;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
 		tunnel_en = ((dv_attr.tunnel_offloads_caps &
@@ -760,6 +784,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	DRV_LOG(WARNING,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
+	config.tunnel_en = tunnel_en;
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
 	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
@@ -771,326 +796,287 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
+	config.mpls_en = mpls_en;
 	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	if (err) {
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	{
-		char name[RTE_ETH_NAME_MAX_LEN];
-		struct ibv_port_attr port_attr;
-		struct ether_addr mac;
-		struct mlx5_dev_config config = {
-			.cqe_comp = cqe_comp,
-			.mps = mps,
-			.tunnel_en = tunnel_en,
-			.mpls_en = mpls_en,
-			.tx_vec_en = 1,
-			.rx_vec_en = 1,
-			.mpw_hdr_dseg = 0,
-			.txq_inline = MLX5_ARG_UNSET,
-			.txqs_inline = MLX5_ARG_UNSET,
-			.inline_max_packet_sz = MLX5_ARG_UNSET,
-			.vf_nl_en = 1,
-			.swp = !!swp,
-			.mprq = {
-				.enabled = 0, /* Disabled by default. */
-				.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
-							mprq_min_stride_num_n),
-				.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
-				.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
-			},
-		};
-
-		rte_strlcpy(name, dpdk_dev->name, sizeof(name));
-		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-			eth_dev = rte_eth_dev_attach_secondary(name);
-			if (eth_dev == NULL) {
-				DRV_LOG(ERR, "can not attach rte ethdev");
-				rte_errno = ENOMEM;
-				err = rte_errno;
-				goto error;
-			}
-			eth_dev->device = dpdk_dev;
-			eth_dev->dev_ops = &mlx5_dev_sec_ops;
-			err = mlx5_uar_init_secondary(eth_dev);
-			if (err) {
-				err = rte_errno;
-				goto error;
-			}
-			/* Receive command fd from primary process */
-			err = mlx5_socket_connect(eth_dev);
-			if (err < 0) {
-				err = rte_errno;
-				goto error;
-			}
-			/* Remap UAR for Tx queues. */
-			err = mlx5_tx_uar_remap(eth_dev, err);
-			if (err) {
-				err = rte_errno;
-				goto error;
-			}
-			/*
-			 * Ethdev pointer is still required as input since
-			 * the primary device is not accessible from the
-			 * secondary process.
-			 */
-			eth_dev->rx_pkt_burst =
-				mlx5_select_rx_function(eth_dev);
-			eth_dev->tx_pkt_burst =
-				mlx5_select_tx_function(eth_dev);
-			claim_zero(mlx5_glue->close_device(ctx));
-			return eth_dev;
-		}
-		/* Check port status. */
-		err = mlx5_glue->query_port(ctx, 1, &port_attr);
-		if (err) {
-			DRV_LOG(ERR, "port query failed: %s", strerror(err));
-			goto error;
-		}
-		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-			DRV_LOG(ERR, "port is not configured in Ethernet mode");
-			err = EINVAL;
-			goto error;
-		}
-		if (port_attr.state != IBV_PORT_ACTIVE)
-			DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
-				mlx5_glue->port_state_str(port_attr.state),
-				port_attr.state);
-		/* Allocate protection domain. */
-		pd = mlx5_glue->alloc_pd(ctx);
-		if (pd == NULL) {
-			DRV_LOG(ERR, "PD allocation failure");
-			err = ENOMEM;
-			goto error;
-		}
-		/* from rte_ethdev.c */
-		priv = rte_zmalloc("ethdev private structure",
-				   sizeof(*priv),
-				   RTE_CACHE_LINE_SIZE);
-		if (priv == NULL) {
-			DRV_LOG(ERR, "priv allocation failure");
-			err = ENOMEM;
-			goto error;
-		}
-		priv->ctx = ctx;
-		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
-			sizeof(priv->ibdev_path));
-		priv->device_attr = attr;
-		priv->pd = pd;
-		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, dpdk_dev->devargs);
-		if (err) {
-			err = rte_errno;
-			DRV_LOG(ERR, "failed to process device arguments: %s",
-				strerror(rte_errno));
-			goto error;
-		}
-		config.hw_csum = !!(attr.device_cap_flags_ex &
-				    IBV_DEVICE_RAW_IP_CSUM);
-		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
-			(config.hw_csum ? "" : "not "));
-#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!attr.max_counter_sets;
-		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
-		DRV_LOG(DEBUG,
-			"counter type = %d, num of cs = %ld, attributes = %d",
-			cs_desc.counter_type, cs_desc.num_of_cs,
-			cs_desc.attributes);
-#endif
-		config.ind_table_max_size =
-			attr.rss_caps.max_rwq_indirection_table_size;
-		/* Remove this check once DPDK supports larger/variable
-		 * indirection tables. */
-		if (config.ind_table_max_size >
-				(unsigned int)ETH_RSS_RETA_SIZE_512)
-			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
-		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
-			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(attr.raw_packet_caps &
-					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
-		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
-			(config.hw_vlan_strip ? "" : "not "));
-
-		config.hw_fcs_strip = !!(attr.raw_packet_caps &
-					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
-		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
-			(config.hw_fcs_strip ? "" : "not "));
-
-#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!attr.rx_pad_end_addr_align;
-#endif
-		DRV_LOG(DEBUG,
-			"hardware Rx end alignment padding is %ssupported",
-			(config.hw_padding ? "" : "not "));
-		config.vf = vf;
-		config.tso = (attr.tso_caps.max_tso > 0 &&
-			      (attr.tso_caps.supported_qpts &
-			       (1 << IBV_QPT_RAW_PACKET)));
-		if (config.tso)
-			config.tso_max_payload_sz = attr.tso_caps.max_tso;
-		if (config.mps && !mps) {
-			DRV_LOG(ERR,
-				"multi-packet send not supported on this device"
-				" (" MLX5_TXQ_MPW_EN ")");
-			err = ENOTSUP;
-			goto error;
-		}
-		DRV_LOG(INFO, "%s MPS is %s",
-			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
-			config.mps != MLX5_MPW_DISABLED ? "enabled" :
-			"disabled");
-		if (config.cqe_comp && !cqe_comp) {
-			DRV_LOG(WARNING, "Rx CQE compression isn't supported");
-			config.cqe_comp = 0;
-		}
-		if (config.mprq.enabled && mprq) {
-			if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
-			    config.mprq.stride_num_n < mprq_min_stride_num_n) {
-				config.mprq.stride_num_n =
-					RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
-						mprq_min_stride_num_n);
-				DRV_LOG(WARNING,
-					"the number of strides"
-					" for Multi-Packet RQ is out of range,"
-					" setting default value (%u)",
-					1 << config.mprq.stride_num_n);
-			}
-			config.mprq.min_stride_size_n = mprq_min_stride_size_n;
-			config.mprq.max_stride_size_n = mprq_max_stride_size_n;
-		} else if (config.mprq.enabled && !mprq) {
-			DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
-			config.mprq.enabled = 0;
-		}
-		eth_dev = rte_eth_dev_allocate(name);
+	rte_strlcpy(name, dpdk_dev->name, sizeof(name));
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
-			DRV_LOG(ERR, "can not allocate rte ethdev");
-			err = ENOMEM;
+			DRV_LOG(ERR, "can not attach rte ethdev");
+			rte_errno = ENOMEM;
+			err = rte_errno;
 			goto error;
 		}
-		eth_dev->data->dev_private = priv;
-		priv->dev_data = eth_dev->data;
-		eth_dev->data->mac_addrs = priv->mac;
 		eth_dev->device = dpdk_dev;
-		eth_dev->device->driver = &mlx5_driver.driver;
-		err = mlx5_uar_init_primary(eth_dev);
+		eth_dev->dev_ops = &mlx5_dev_sec_ops;
+		err = mlx5_uar_init_secondary(eth_dev);
 		if (err) {
 			err = rte_errno;
 			goto error;
 		}
-		/* Configure the first MAC address by default. */
-		if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
-			DRV_LOG(ERR,
-				"port %u cannot get MAC address, is mlx5_en"
-				" loaded? (errno: %s)",
-				eth_dev->data->port_id, strerror(rte_errno));
-			err = ENODEV;
-			goto error;
-		}
-		DRV_LOG(INFO,
-			"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
-			eth_dev->data->port_id,
-			mac.addr_bytes[0], mac.addr_bytes[1],
-			mac.addr_bytes[2], mac.addr_bytes[3],
-			mac.addr_bytes[4], mac.addr_bytes[5]);
-#ifndef NDEBUG
-		{
-			char ifname[IF_NAMESIZE];
-
-			if (mlx5_get_ifname(eth_dev, &ifname) == 0)
-				DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
-					eth_dev->data->port_id, ifname);
-			else
-				DRV_LOG(DEBUG, "port %u ifname is unknown",
-					eth_dev->data->port_id);
-		}
-#endif
-		/* Get actual MTU if possible. */
-		err = mlx5_get_mtu(eth_dev, &priv->mtu);
-		if (err) {
+		/* Receive command fd from primary process */
+		err = mlx5_socket_connect(eth_dev);
+		if (err < 0) {
 			err = rte_errno;
 			goto error;
 		}
-		DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
-			priv->mtu);
-		/*
-		 * Initialize burst functions to prevent crashes before link-up.
-		 */
-		eth_dev->rx_pkt_burst = removed_rx_burst;
-		eth_dev->tx_pkt_burst = removed_tx_burst;
-		eth_dev->dev_ops = &mlx5_dev_ops;
-		/* Register MAC address. */
-		claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
-		priv->nl_socket = -1;
-		priv->nl_sn = 0;
-		if (vf && config.vf_nl_en) {
-			priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
-			if (priv->nl_socket < 0)
-				priv->nl_socket = -1;
-			mlx5_nl_mac_addr_sync(eth_dev);
-		}
-		TAILQ_INIT(&priv->flows);
-		TAILQ_INIT(&priv->ctrl_flows);
-		/* Hint libmlx5 to use PMD allocator for data plane resources */
-		struct mlx5dv_ctx_allocators alctr = {
-			.alloc = &mlx5_alloc_verbs_buf,
-			.free = &mlx5_free_verbs_buf,
-			.data = priv,
-		};
-		mlx5_glue->dv_set_context_attr(ctx,
-					       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
-					       (void *)((uintptr_t)&alctr));
-		/* Bring Ethernet device up. */
-		DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
-			eth_dev->data->port_id);
-		mlx5_set_link_up(eth_dev);
-		/*
-		 * Even though the interrupt handler is not installed yet,
-		 * interrupts will still trigger on the asyn_fd from
-		 * Verbs context returned by ibv_open_device().
-		 */
-		mlx5_link_update(eth_dev, 0);
-		/* Store device configuration on private structure. */
-		priv->config = config;
-		/* Create drop queue. */
-		err = mlx5_flow_create_drop_queue(eth_dev);
+		/* Remap UAR for Tx queues. */
+		err = mlx5_tx_uar_remap(eth_dev, err);
 		if (err) {
-			DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
-				eth_dev->data->port_id, strerror(rte_errno));
 			err = rte_errno;
 			goto error;
 		}
-		/* Supported Verbs flow priority number detection. */
-		if (verb_priorities == 0)
-			verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
-		if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
-			DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
-				eth_dev->data->port_id, verb_priorities);
-			err = ENOTSUP;
-			goto error;
-		}
-		priv->config.max_verbs_prio = verb_priorities;
 		/*
-		 * Once the device is added to the list of memory event
-		 * callback, its global MR cache table cannot be expanded
-		 * on the fly because of deadlock. If it overflows, lookup
-		 * should be done by searching MR list linearly, which is slow.
+		 * Ethdev pointer is still required as input since
+		 * the primary device is not accessible from the
+		 * secondary process.
 		 */
-		err = mlx5_mr_btree_init(&priv->mr.cache,
-					 MLX5_MR_BTREE_CACHE_N * 2,
-					 eth_dev->device->numa_node);
-		if (err) {
-			err = rte_errno;
-			goto error;
-		}
-		/* Add device to memory callback list. */
-		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-		LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
-				 priv, mem_event_cb);
-		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
-		rte_eth_dev_probing_finish(eth_dev);
+		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
+		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
+		claim_zero(mlx5_glue->close_device(ctx));
 		return eth_dev;
 	}
+	/* Check port status. */
+	err = mlx5_glue->query_port(ctx, 1, &port_attr);
+	if (err) {
+		DRV_LOG(ERR, "port query failed: %s", strerror(err));
+		goto error;
+	}
+	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		DRV_LOG(ERR, "port is not configured in Ethernet mode");
+		err = EINVAL;
+		goto error;
+	}
+	if (port_attr.state != IBV_PORT_ACTIVE)
+		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+			mlx5_glue->port_state_str(port_attr.state),
+			port_attr.state);
+	/* Allocate protection domain. */
+	pd = mlx5_glue->alloc_pd(ctx);
+	if (pd == NULL) {
+		DRV_LOG(ERR, "PD allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv = rte_zmalloc("ethdev private structure",
+			   sizeof(*priv),
+			   RTE_CACHE_LINE_SIZE);
+	if (priv == NULL) {
+		DRV_LOG(ERR, "priv allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv->ctx = ctx;
+	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
+		sizeof(priv->ibdev_path));
+	priv->device_attr = attr;
+	priv->pd = pd;
+	priv->mtu = ETHER_MTU;
+	err = mlx5_args(&config, dpdk_dev->devargs);
+	if (err) {
+		err = rte_errno;
+		DRV_LOG(ERR, "failed to process device arguments: %s",
+			strerror(rte_errno));
+		goto error;
+	}
+	config.hw_csum = !!(attr.device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM);
+	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
+		(config.hw_csum ? "" : "not "));
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+	config.flow_counter_en = !!attr.max_counter_sets;
+	mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
+	DRV_LOG(DEBUG, "counter type = %d, num of cs = %ld, attributes = %d",
+		cs_desc.counter_type, cs_desc.num_of_cs,
+		cs_desc.attributes);
+#endif
+	config.ind_table_max_size =
+		attr.rss_caps.max_rwq_indirection_table_size;
+	/*
+	 * Remove this check once DPDK supports larger/variable
+	 * indirection tables.
+	 */
+	if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
+		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
+	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
+		config.ind_table_max_size);
+	config.hw_vlan_strip = !!(attr.raw_packet_caps &
+				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
+	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
+		(config.hw_vlan_strip ? "" : "not "));
+	config.hw_fcs_strip = !!(attr.raw_packet_caps &
+				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
+	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
+		(config.hw_fcs_strip ? "" : "not "));
+#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
+	config.hw_padding = !!attr.rx_pad_end_addr_align;
+#endif
+	DRV_LOG(DEBUG, "hardware Rx end alignment padding is %ssupported",
+		(config.hw_padding ? "" : "not "));
+	config.tso = (attr.tso_caps.max_tso > 0 &&
+		      (attr.tso_caps.supported_qpts &
+		       (1 << IBV_QPT_RAW_PACKET)));
+	if (config.tso)
+		config.tso_max_payload_sz = attr.tso_caps.max_tso;
+	if (config.mps && !mps) {
+		DRV_LOG(ERR,
+			"multi-packet send not supported on this device"
+			" (" MLX5_TXQ_MPW_EN ")");
+		err = ENOTSUP;
+		goto error;
+	}
+	DRV_LOG(INFO, "%sMPS is %s",
+		config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
+		config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
+	if (config.cqe_comp && !cqe_comp) {
+		DRV_LOG(WARNING, "Rx CQE compression isn't supported");
+		config.cqe_comp = 0;
+	}
+	if (config.mprq.enabled && mprq) {
+		if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
+		    config.mprq.stride_num_n < mprq_min_stride_num_n) {
+			config.mprq.stride_num_n =
+				RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+					mprq_min_stride_num_n);
+			DRV_LOG(WARNING,
+				"the number of strides"
+				" for Multi-Packet RQ is out of range,"
+				" setting default value (%u)",
+				1 << config.mprq.stride_num_n);
+		}
+		config.mprq.min_stride_size_n = mprq_min_stride_size_n;
+		config.mprq.max_stride_size_n = mprq_max_stride_size_n;
+	} else if (config.mprq.enabled && !mprq) {
+		DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
+		config.mprq.enabled = 0;
+	}
+	eth_dev = rte_eth_dev_allocate(name);
+	if (eth_dev == NULL) {
+		DRV_LOG(ERR, "can not allocate rte ethdev");
+		err = ENOMEM;
+		goto error;
+	}
+	eth_dev->data->dev_private = priv;
+	priv->dev_data = eth_dev->data;
+	eth_dev->data->mac_addrs = priv->mac;
+	eth_dev->device = dpdk_dev;
+	eth_dev->device->driver = &mlx5_driver.driver;
+	err = mlx5_uar_init_primary(eth_dev);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	/* Configure the first MAC address by default. */
+	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
+		DRV_LOG(ERR,
+			"port %u cannot get MAC address, is mlx5_en"
+			" loaded? (errno: %s)",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = ENODEV;
+		goto error;
+	}
+	DRV_LOG(INFO,
+		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
+		eth_dev->data->port_id,
+		mac.addr_bytes[0], mac.addr_bytes[1],
+		mac.addr_bytes[2], mac.addr_bytes[3],
+		mac.addr_bytes[4], mac.addr_bytes[5]);
+#ifndef NDEBUG
+	{
+		char ifname[IF_NAMESIZE];
+
+		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
+			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
+				eth_dev->data->port_id, ifname);
+		else
+			DRV_LOG(DEBUG, "port %u ifname is unknown",
+				eth_dev->data->port_id);
+	}
+#endif
+	/* Get actual MTU if possible. */
+	err = mlx5_get_mtu(eth_dev, &priv->mtu);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
+		priv->mtu);
+	/* Initialize burst functions to prevent crashes before link-up. */
+	eth_dev->rx_pkt_burst = removed_rx_burst;
+	eth_dev->tx_pkt_burst = removed_tx_burst;
+	eth_dev->dev_ops = &mlx5_dev_ops;
+	/* Register MAC address. */
+	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
+	priv->nl_socket = -1;
+	priv->nl_sn = 0;
+	if (vf && config.vf_nl_en) {
+		priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
+		if (priv->nl_socket < 0)
+			priv->nl_socket = -1;
+		mlx5_nl_mac_addr_sync(eth_dev);
+	}
+	TAILQ_INIT(&priv->flows);
+	TAILQ_INIT(&priv->ctrl_flows);
+	/* Hint libmlx5 to use PMD allocator for data plane resources */
+	struct mlx5dv_ctx_allocators alctr = {
+		.alloc = &mlx5_alloc_verbs_buf,
+		.free = &mlx5_free_verbs_buf,
+		.data = priv,
+	};
+	mlx5_glue->dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+				       (void *)((uintptr_t)&alctr));
+	/* Bring Ethernet device up. */
+	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
+		eth_dev->data->port_id);
+	mlx5_set_link_up(eth_dev);
+	/*
+	 * Even though the interrupt handler is not installed yet,
+	 * interrupts will still trigger on the asyn_fd from
+	 * Verbs context returned by ibv_open_device().
+	 */
+	mlx5_link_update(eth_dev, 0);
+	/* Store device configuration on private structure. */
+	priv->config = config;
+	/* Create drop queue. */
+	err = mlx5_flow_create_drop_queue(eth_dev);
+	if (err) {
+		DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = rte_errno;
+		goto error;
+	}
+	/* Supported Verbs flow priority number detection. */
+	if (verb_priorities == 0)
+		verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
+	if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
+		DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
+			eth_dev->data->port_id, verb_priorities);
+		err = ENOTSUP;
+		goto error;
+	}
+	priv->config.max_verbs_prio = verb_priorities;
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 */
+	err = mlx5_mr_btree_init(&priv->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 eth_dev->device->numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	/* Add device to memory callback list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
+			 priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+	return eth_dev;
 error:
 	if (priv)
 		rte_free(priv);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 06/10] net/mlx5: add port representor awareness
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
                       ` (4 preceding siblings ...)
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 05/10] net/mlx5: re-indent generic probing function Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 07/10] net/mlx5: probe all port representors Adrien Mazarguil
                       ` (4 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Xueming Li

The current PCI probing method is not aware of Verbs port representors,
which appear as standard Verbs devices bound to the same PCI address and
cannot be distinguished.

Problem is that more often than not, the wrong Verbs device is used,
resulting in unexpected traffic.

This patch makes the driver discard representors to only use the master
device. If unable to identify it (e.g. kernel drivers not recent enough),
either:

- There is only one matching device which isn't identified as a
  representor, in that case use it.
- Otherwise log an error and do not probe the device.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
Cc: Xueming Li <xuemingl@mellanox.com>
--
v3 changes:

- Replaced all heuristics (including mlx5_cmp_ibv_name()) with Netlink
  queries to associate IB devices with network interfaces.
- Reworded commit log.

v2 changes:

- Fixed digit detection in mlx5_cmp_ibv_name() so that "foo1" and "foo10"
  are compared on the integer conversion of "1" against "10" instead of ""
  and "0".
---
 drivers/net/mlx5/Makefile  |  30 ++++
 drivers/net/mlx5/mlx5.c    | 109 +++++++++++++--
 drivers/net/mlx5/mlx5.h    |  16 ++-
 drivers/net/mlx5/mlx5_nl.c | 297 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 428 insertions(+), 24 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 955861a41..745752e23 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -152,6 +152,36 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		infiniband/verbs.h \
 		enum IBV_FLOW_SPEC_ACTION_COUNT \
 		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_CMD_GET \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_CMD_GET \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_CMD_PORT_GET \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_CMD_PORT_GET \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_ATTR_DEV_INDEX \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_ATTR_DEV_INDEX \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_ATTR_DEV_NAME \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_ATTR_DEV_NAME \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_ATTR_PORT_INDEX \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_ATTR_PORT_INDEX \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_ATTR_NDEV_INDEX \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx5_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1054bf6d0..d06ba9886 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -13,6 +13,7 @@
 #include <errno.h>
 #include <net/if.h>
 #include <sys/mman.h>
+#include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 
 /* Verbs header. */
@@ -274,8 +275,10 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 		mlx5_socket_uninit(dev);
 	if (priv->config.vf)
 		mlx5_nl_mac_addr_flush(dev);
-	if (priv->nl_socket >= 0)
-		close(priv->nl_socket);
+	if (priv->nl_socket_route >= 0)
+		close(priv->nl_socket_route);
+	if (priv->nl_socket_rdma >= 0)
+		close(priv->nl_socket_rdma);
 	ret = mlx5_hrxq_ibv_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -876,6 +879,10 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->device_attr = attr;
 	priv->pd = pd;
 	priv->mtu = ETHER_MTU;
+	/* Some internal functions rely on Netlink sockets, open them now. */
+	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
+	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK, NETLINK_ROUTE);
+	priv->nl_sn = 0;
 	err = mlx5_args(&config, dpdk_dev->devargs);
 	if (err) {
 		err = rte_errno;
@@ -1010,14 +1017,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	eth_dev->dev_ops = &mlx5_dev_ops;
 	/* Register MAC address. */
 	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
-	priv->nl_socket = -1;
-	priv->nl_sn = 0;
-	if (vf && config.vf_nl_en) {
-		priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
-		if (priv->nl_socket < 0)
-			priv->nl_socket = -1;
+	if (vf && config.vf_nl_en)
 		mlx5_nl_mac_addr_sync(eth_dev);
-	}
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1078,8 +1079,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	return eth_dev;
 error:
-	if (priv)
+	if (priv) {
+		if (priv->nl_socket_route >= 0)
+			close(priv->nl_socket_route);
+		if (priv->nl_socket_rdma >= 0)
+			close(priv->nl_socket_rdma);
 		rte_free(priv);
+	}
 	if (pd)
 		claim_zero(mlx5_glue->dealloc_pd(pd));
 	if (eth_dev)
@@ -1110,6 +1116,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **ibv_list;
 	struct rte_eth_dev *eth_dev = NULL;
+	unsigned int n = 0;
 	int vf;
 	int ret;
 
@@ -1121,6 +1128,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
+
+	struct ibv_device *ibv_match[ret + 1];
+
 	while (ret-- > 0) {
 		struct rte_pci_addr pci_addr;
 
@@ -1132,10 +1142,81 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		    pci_dev->addr.devid != pci_addr.devid ||
 		    pci_dev->addr.function != pci_addr.function)
 			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
 			ibv_list[ret]->name);
+		ibv_match[n++] = ibv_list[ret];
+	}
+	ibv_match[n] = NULL;
+
+	unsigned int ifindex[n];
+	struct mlx5_switch_info info[n];
+	int nl_route = n ? mlx5_nl_init(0, NETLINK_ROUTE) : -1;
+	int nl_rdma = n ? mlx5_nl_init(0, NETLINK_RDMA) : -1;
+	unsigned int i;
+
+	/*
+	 * The existence of several matching entries (n > 1) means port
+	 * representors have been instantiated. No existing Verbs call nor
+	 * /sys entries can tell them apart, this can only be done through
+	 * Netlink calls assuming kernel drivers are recent enough to
+	 * support them.
+	 *
+	 * In the event of identification failure through Netlink, either:
+	 *
+	 * 1. No device matches (n == 0), complain and bail out.
+	 * 2. A single IB device matches (n == 1) and is not a representor,
+	 *    assume no switch support.
+	 * 3. Otherwise no safe assumptions can be made; complain louder and
+	 *    bail out.
+	 */
+	for (i = 0; i != n; ++i) {
+		if (nl_rdma < 0)
+			ifindex[i] = 0;
+		else
+			ifindex[i] = mlx5_nl_ifindex(nl_rdma,
+						     ibv_match[i]->name);
+		if (nl_route < 0 ||
+		    !ifindex[i] ||
+		    mlx5_nl_switch_info(nl_route, ifindex[i], &info[i])) {
+			ifindex[i] = 0;
+			memset(&info[i], 0, sizeof(info[i]));
+			continue;
+		}
+	}
+	if (nl_rdma >= 0)
+		close(nl_rdma);
+	if (nl_route >= 0)
+		close(nl_route);
+	/* Look for master device. */
+	for (i = 0; i != n; ++i) {
+		if (!info[i].master)
+			continue;
+		/* Make it the first entry. */
+		if (i == 0)
+			break;
+		ibv_match[n] = ibv_match[0];
+		ibv_match[0] = ibv_match[i];
+		ibv_match[n] = NULL;
 		break;
 	}
+	if (n && i == n) {
+		if (n == 1 && !info[0].representor) {
+			/* Case #2. */
+			DRV_LOG(INFO, "no switch support detected");
+		} else if (n == 1) {
+			/* Case #3. */
+			DRV_LOG(ERR,
+				"device looks like a port representor, this is"
+				" not supported yet");
+			n = 0;
+		} else {
+			/* Case #3. */
+			DRV_LOG(ERR,
+				"unable to tell which of the matching devices"
+				" is the master (lack of kernel support?)");
+			n = 0;
+		}
+	}
 	switch (pci_dev->id.device_id) {
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
@@ -1146,10 +1227,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	default:
 		vf = 0;
 	}
-	if (ret >= 0)
-		eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	if (n)
+		eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
 	mlx5_glue->free_device_list(ibv_list);
-	if (!ret) {
+	if (!n) {
 		DRV_LOG(WARNING,
 			"no Verbs device matches PCI device " PCI_PRI_FMT ","
 			" are kernel drivers loaded?",
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index f55ff4a21..704046270 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -53,6 +53,14 @@ enum {
 	PCI_DEVICE_ID_MELLANOX_CONNECTX5BF = 0xa2d2,
 };
 
+/** Switch information returned by mlx5_nl_switch_info(). */
+struct mlx5_switch_info {
+	uint32_t master:1; /**< Master device. */
+	uint32_t representor:1; /**< Representor device. */
+	int32_t port_name; /**< Representor port name. */
+	uint64_t switch_id; /**< Switch identifier. */
+};
+
 LIST_HEAD(mlx5_dev_list, priv);
 
 /* Shared memory between primary and secondary processes. */
@@ -195,7 +203,8 @@ struct priv {
 	struct mlx5_dev_config config; /* Device configuration. */
 	struct mlx5_verbs_alloc_ctx verbs_alloc_ctx;
 	/* Context for Verbs allocator. */
-	int nl_socket; /* Netlink socket. */
+	int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
+	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
 	uint32_t nl_sn; /* Netlink message sequence number. */
 };
 
@@ -342,7 +351,7 @@ int mlx5_socket_connect(struct rte_eth_dev *priv);
 
 /* mlx5_nl.c */
 
-int mlx5_nl_init(uint32_t nlgroups);
+int mlx5_nl_init(uint32_t nlgroups, int protocol);
 int mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
 			 uint32_t index);
 int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
@@ -351,5 +360,8 @@ void mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev);
 void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev);
 int mlx5_nl_promisc(struct rte_eth_dev *dev, int enable);
 int mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable);
+unsigned int mlx5_nl_ifindex(int nl, const char *name);
+int mlx5_nl_switch_info(int nl, unsigned int ifindex,
+			struct mlx5_switch_info *info);
 
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
index dca85835a..58ef2f4f0 100644
--- a/drivers/net/mlx5/mlx5_nl.c
+++ b/drivers/net/mlx5/mlx5_nl.c
@@ -3,10 +3,21 @@
  * Copyright 2018 Mellanox Technologies, Ltd
  */
 
+#include <errno.h>
+#include <linux/if_link.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <rdma/rdma_netlink.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
 #include <unistd.h>
 
+#include <rte_errno.h>
+
 #include "mlx5.h"
 #include "mlx5_utils.h"
 
@@ -27,6 +38,29 @@
 	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
 #endif
 
+/*
+ * The following definitions are normally found in rdma/rdma_netlink.h,
+ * however they are so recent that most systems do not expose them yet.
+ */
+#ifndef HAVE_RDMA_NLDEV_CMD_GET
+#define RDMA_NLDEV_CMD_GET 1
+#endif
+#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
+#define RDMA_NLDEV_CMD_PORT_GET 5
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
+#define RDMA_NLDEV_ATTR_DEV_INDEX 1
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
+#define RDMA_NLDEV_ATTR_DEV_NAME 2
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
+#define RDMA_NLDEV_ATTR_PORT_INDEX 3
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
+#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
+#endif
+
 /* Add/remove MAC address through Netlink */
 struct mlx5_nl_mac_addr {
 	struct ether_addr (*mac)[];
@@ -34,18 +68,27 @@ struct mlx5_nl_mac_addr {
 	int mac_n; /**< Number of addresses in the array. */
 };
 
+/** Data structure used by mlx5_nl_ifindex_cb(). */
+struct mlx5_nl_ifindex_data {
+	const char *name; /**< IB device name (in). */
+	uint32_t ibindex; /**< IB device index (out). */
+	uint32_t ifindex; /**< Network interface index (out). */
+};
+
 /**
  * Opens a Netlink socket.
  *
  * @param nl_groups
  *   Netlink group value (e.g. RTMGRP_LINK).
+ * @param protocol
+ *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
  *
  * @return
  *   A file descriptor on success, a negative errno value otherwise and
  *   rte_errno is set.
  */
 int
-mlx5_nl_init(uint32_t nl_groups)
+mlx5_nl_init(uint32_t nl_groups, int protocol)
 {
 	int fd;
 	int sndbuf_size = MLX5_SEND_BUF_SIZE;
@@ -56,7 +99,7 @@ mlx5_nl_init(uint32_t nl_groups)
 	};
 	int ret;
 
-	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
 	if (fd == -1) {
 		rte_errno = errno;
 		return -rte_errno;
@@ -334,9 +377,9 @@ mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[],
 	int ret;
 	uint32_t sn = priv->nl_sn++;
 
-	if (priv->nl_socket == -1)
+	if (priv->nl_socket_route == -1)
 		return 0;
-	fd = priv->nl_socket;
+	fd = priv->nl_socket_route;
 	ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
 			      sizeof(struct ifinfomsg));
 	if (ret < 0)
@@ -398,9 +441,9 @@ mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
 	int ret;
 	uint32_t sn = priv->nl_sn++;
 
-	if (priv->nl_socket == -1)
+	if (priv->nl_socket_route == -1)
 		return 0;
-	fd = priv->nl_socket;
+	fd = priv->nl_socket_route;
 	memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN);
 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
 		RTA_ALIGN(req.rta.rta_len);
@@ -569,9 +612,9 @@ mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
 	int ret;
 
 	assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
-	if (priv->nl_socket < 0)
+	if (priv->nl_socket_route < 0)
 		return 0;
-	fd = priv->nl_socket;
+	fd = priv->nl_socket_route;
 	ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
 	if (ret < 0)
 		return ret;
@@ -625,3 +668,241 @@ mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
 			strerror(rte_errno));
 	return ret;
 }
+
+/**
+ * Process network interface information from Netlink message.
+ *
+ * @param nh
+ *   Pointer to Netlink message header.
+ * @param arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_ifindex_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct mlx5_nl_ifindex_data *data = arg;
+	size_t off = NLMSG_HDRLEN;
+	uint32_t ibindex = 0;
+	uint32_t ifindex = 0;
+	int found = 0;
+
+	if (nh->nlmsg_type !=
+	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
+	    nh->nlmsg_type !=
+	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
+		goto error;
+	while (off < nh->nlmsg_len) {
+		struct nlattr *na = (void *)((uintptr_t)nh + off);
+		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
+
+		if (na->nla_len > nh->nlmsg_len - off)
+			goto error;
+		switch (na->nla_type) {
+		case RDMA_NLDEV_ATTR_DEV_INDEX:
+			ibindex = *(uint32_t *)payload;
+			break;
+		case RDMA_NLDEV_ATTR_DEV_NAME:
+			if (!strcmp(payload, data->name))
+				found = 1;
+			break;
+		case RDMA_NLDEV_ATTR_NDEV_INDEX:
+			ifindex = *(uint32_t *)payload;
+			break;
+		default:
+			break;
+		}
+		off += NLA_ALIGN(na->nla_len);
+	}
+	if (found) {
+		data->ibindex = ibindex;
+		data->ifindex = ifindex;
+	}
+	return 0;
+error:
+	rte_errno = EINVAL;
+	return -rte_errno;
+}
+
+/**
+ * Get index of network interface associated with some IB device.
+ *
+ * This is the only somewhat safe method to avoid resorting to heuristics
+ * when faced with port representors. Unfortunately it requires at least
+ * Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ *
+ * @return
+ *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
+ *   is set.
+ */
+unsigned int
+mlx5_nl_ifindex(int nl, const char *name)
+{
+	static const uint32_t pindex = 1;
+	uint32_t seq = random();
+	struct mlx5_nl_ifindex_data data = {
+		.name = name,
+		.ibindex = 0, /* Determined during first pass. */
+		.ifindex = 0, /* Determined during second pass. */
+	};
+	union {
+		struct nlmsghdr nh;
+		uint8_t buf[NLMSG_HDRLEN +
+			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
+			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(0),
+			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+						       RDMA_NLDEV_CMD_GET),
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+		},
+	};
+	struct nlattr *na;
+	int ret;
+
+	ret = mlx5_nl_send(nl, &req.nh, seq);
+	if (ret < 0)
+		return 0;
+	ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
+	if (ret < 0)
+		return 0;
+	if (!data.ibindex)
+		goto error;
+	++seq;
+	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					     RDMA_NLDEV_CMD_PORT_GET);
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
+	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
+	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
+	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
+	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
+	       &data.ibindex, sizeof(data.ibindex));
+	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
+	na->nla_len = NLA_HDRLEN + sizeof(pindex);
+	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
+	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
+	       &pindex, sizeof(pindex));
+	ret = mlx5_nl_send(nl, &req.nh, seq);
+	if (ret < 0)
+		return 0;
+	ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
+	if (ret < 0)
+		return 0;
+	if (!data.ifindex)
+		goto error;
+	return data.ifindex;
+error:
+	rte_errno = ENODEV;
+	return 0;
+}
+
+/**
+ * Process switch information from Netlink message.
+ *
+ * @param nh
+ *   Pointer to Netlink message header.
+ * @param arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct mlx5_switch_info info = {
+		.master = 0,
+		.representor = 0,
+		.port_name = 0,
+		.switch_id = 0,
+	};
+	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	bool port_name_set = false;
+	bool switch_id_set = false;
+
+	if (nh->nlmsg_type != RTM_NEWLINK)
+		goto error;
+	while (off < nh->nlmsg_len) {
+		struct rtattr *ra = (void *)((uintptr_t)nh + off);
+		void *payload = RTA_DATA(ra);
+		char *end;
+		unsigned int i;
+
+		if (ra->rta_len > nh->nlmsg_len - off)
+			goto error;
+		switch (ra->rta_type) {
+		case IFLA_PHYS_PORT_NAME:
+			errno = 0;
+			info.port_name = strtol(payload, &end, 0);
+			if (errno ||
+			    (size_t)(end - (char *)payload) != strlen(payload))
+				goto error;
+			port_name_set = true;
+			break;
+		case IFLA_PHYS_SWITCH_ID:
+			info.switch_id = 0;
+			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
+				info.switch_id <<= 8;
+				info.switch_id |= ((uint8_t *)payload)[i];
+			}
+			switch_id_set = true;
+			break;
+		}
+		off += RTA_ALIGN(ra->rta_len);
+	}
+	info.master = switch_id_set && !port_name_set;
+	info.representor = switch_id_set && port_name_set;
+	memcpy(arg, &info, sizeof(info));
+	return 0;
+error:
+	rte_errno = EINVAL;
+	return -rte_errno;
+}
+
+/**
+ * Get switch information associated with network interface.
+ *
+ * @param nl
+ *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
+ * @param ifindex
+ *   Network interface index.
+ * @param[out] info
+ *   Switch information object, populated in case of success.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
+{
+	uint32_t seq = random();
+	struct {
+		struct nlmsghdr nh;
+		struct ifinfomsg info;
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(req.info)),
+			.nlmsg_type = RTM_GETLINK,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+		},
+		.info = {
+			.ifi_family = AF_UNSPEC,
+			.ifi_index = ifindex,
+		},
+	};
+	int ret;
+
+	ret = mlx5_nl_send(nl, &req.nh, seq);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
+	return ret;
+}
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 07/10] net/mlx5: probe all port representors
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
                       ` (5 preceding siblings ...)
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 06/10] net/mlx5: add port representor awareness Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 08/10] net/mlx5: probe port representors in natural order Adrien Mazarguil
                       ` (3 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Nelio Laranjeiro, Xueming Li

Probe existing port representors in addition to their master device and
associate them automatically.

To avoid collision between Ethernet devices, they are named as follows:

- "{DBDF}" for master/switch devices.
- "{DBDF}_representor_{rep}" with "rep" starting from 0 for port
  representors.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
--
v3 changes:

- Nelio introduced mlx5_dev_to_port_id() to prevent the master device from
  releasing a domain ID while representors are still bound. It is now
  released by the last device closed.
- Reverted to original naming convention as requested by Xueming and
  Shahaf; "net_" prefix and "_0" suffix were dropped.
- mlx5_dev_spawn() (previously mlx5_dev_spawn_one()) now decides on its own
  whether underlying device is a representor.
- Devices can now be probed in any order and not necessarily all at once;
  representors can exist without a master device.
- mlx5_pci_probe() iterates on the list of devices directly instead of
  relying on an intermediate function (previously mlx5_dev_spawn()).
- mlx5_get_ifname() was rewritten to rely on mlx5_nl_ifindex() when faced
  with a representor.
- Since it is not necessarily present, master device is now dynamically
  retrieved in mlx5_dev_infos_get().

v2 changes:

- Added representor information to dev_infos_get(). DPDK port ID of master
  device is now stored in the private structure to retrieve it
  conveniently.
- Master device is assigned dummy representor ID value -1 to better
  distinguish from the the first actual representor reported by
  dev_infos_get() as those are indexed from 0.
- Added RTE_ETH_DEV_REPRESENTOR device flag.
---
 drivers/net/mlx5/mlx5.c        | 109 +++++++++++++++++++++--------
 drivers/net/mlx5/mlx5.h        |  12 +++-
 drivers/net/mlx5/mlx5_ethdev.c | 135 ++++++++++++++++++++++++++++++++----
 drivers/net/mlx5/mlx5_mac.c    |   2 +-
 drivers/net/mlx5/mlx5_stats.c  |   6 +-
 5 files changed, 215 insertions(+), 49 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d06ba9886..52bfc5c63 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -307,6 +307,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	if (ret)
 		DRV_LOG(WARNING, "port %u some flows still remain",
 			dev->data->port_id);
+	i = mlx5_dev_to_port_id(dev->device, NULL, 0);
+	if (i == 1 && priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 	memset(priv, 0, sizeof(*priv));
 }
 
@@ -647,6 +650,8 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
  *   Verbs device.
  * @param vf
  *   If nonzero, enable VF-specific features.
+ * @param[in] switch_info
+ *   Switch properties of Ethernet device.
  *
  * @return
  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
@@ -655,7 +660,8 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 static struct rte_eth_dev *
 mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	       struct ibv_device *ibv_dev,
-	       int vf)
+	       int vf,
+	       const struct mlx5_switch_info *switch_info)
 {
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
@@ -697,6 +703,9 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 #endif
 	struct ether_addr mac;
 	char name[RTE_ETH_NAME_MAX_LEN];
+	uint16_t port_id;
+	int own_domain_id = 0;
+	unsigned int i;
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
@@ -805,7 +814,12 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	rte_strlcpy(name, dpdk_dev->name, sizeof(name));
+	if (!switch_info->representor)
+		rte_strlcpy(name, dpdk_dev->name, sizeof(name));
+	else
+		snprintf(name, sizeof(name), "%s_representor_%u",
+			 dpdk_dev->name, switch_info->port_name);
+	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
@@ -874,6 +888,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		goto error;
 	}
 	priv->ctx = ctx;
+	strncpy(priv->ibdev_name, priv->ctx->device->name,
+		sizeof(priv->ibdev_name));
 	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 		sizeof(priv->ibdev_path));
 	priv->device_attr = attr;
@@ -883,6 +899,32 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
 	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK, NETLINK_ROUTE);
 	priv->nl_sn = 0;
+	priv->representor = !!switch_info->representor;
+	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+	priv->representor_id =
+		switch_info->representor ? switch_info->port_name : -1;
+	/*
+	 * Look for sibling devices in order to reuse their switch domain,
+	 * otherwise allocate one.
+	 */
+	i = mlx5_dev_to_port_id(dpdk_dev, &port_id, 1);
+	if (i > 0) {
+		const struct priv *opriv =
+			rte_eth_devices[port_id].data->dev_private;
+
+		if (opriv)
+			priv->domain_id = opriv->domain_id;
+	}
+	if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
+		err = rte_eth_switch_domain_alloc(&priv->domain_id);
+		if (err) {
+			err = rte_errno;
+			DRV_LOG(ERR, "unable to allocate switch domain: %s",
+				strerror(rte_errno));
+			goto error;
+		}
+		own_domain_id = 1;
+	}
 	err = mlx5_args(&config, dpdk_dev->devargs);
 	if (err) {
 		err = rte_errno;
@@ -966,6 +1008,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		err = ENOMEM;
 		goto error;
 	}
+	if (priv->representor)
+		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
 	eth_dev->data->dev_private = priv;
 	priv->dev_data = eth_dev->data;
 	eth_dev->data->mac_addrs = priv->mac;
@@ -1084,6 +1128,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			close(priv->nl_socket_route);
 		if (priv->nl_socket_rdma >= 0)
 			close(priv->nl_socket_rdma);
+		if (own_domain_id)
+			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
 	}
 	if (pd)
@@ -1100,7 +1146,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 /**
  * DPDK callback to register a PCI device.
  *
- * This function spawns an Ethernet device out of a given PCI device.
+ * This function spawns Ethernet devices out of a given PCI device.
  *
  * @param[in] pci_drv
  *   PCI driver structure (mlx5_driver).
@@ -1115,7 +1161,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	       struct rte_pci_device *pci_dev)
 {
 	struct ibv_device **ibv_list;
-	struct rte_eth_dev *eth_dev = NULL;
 	unsigned int n = 0;
 	int vf;
 	int ret;
@@ -1150,9 +1195,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 
 	unsigned int ifindex[n];
 	struct mlx5_switch_info info[n];
+	struct rte_eth_dev *eth_list[n];
 	int nl_route = n ? mlx5_nl_init(0, NETLINK_ROUTE) : -1;
 	int nl_rdma = n ? mlx5_nl_init(0, NETLINK_RDMA) : -1;
 	unsigned int i;
+	unsigned int u;
 
 	/*
 	 * The existence of several matching entries (n > 1) means port
@@ -1187,28 +1234,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		close(nl_rdma);
 	if (nl_route >= 0)
 		close(nl_route);
-	/* Look for master device. */
-	for (i = 0; i != n; ++i) {
-		if (!info[i].master)
-			continue;
-		/* Make it the first entry. */
-		if (i == 0)
-			break;
-		ibv_match[n] = ibv_match[0];
-		ibv_match[0] = ibv_match[i];
-		ibv_match[n] = NULL;
-		break;
-	}
-	if (n && i == n) {
-		if (n == 1 && !info[0].representor) {
+	/* Count unidentified devices. */
+	for (u = 0, i = 0; i != n; ++i)
+		if (!info[i].master && !info[i].representor)
+			++u;
+	if (u) {
+		if (n == 1 && u == 1) {
 			/* Case #2. */
 			DRV_LOG(INFO, "no switch support detected");
-		} else if (n == 1) {
-			/* Case #3. */
-			DRV_LOG(ERR,
-				"device looks like a port representor, this is"
-				" not supported yet");
-			n = 0;
 		} else {
 			/* Case #3. */
 			DRV_LOG(ERR,
@@ -1227,8 +1260,19 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	default:
 		vf = 0;
 	}
-	if (n)
-		eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
+	for (i = 0; i != n; ++i) {
+		uint32_t restore;
+
+		eth_list[i] = mlx5_dev_spawn(&pci_dev->device, ibv_match[i],
+					     vf, &info[i]);
+		if (!eth_list[i])
+			break;
+		restore = eth_list[i]->data->dev_flags;
+		rte_eth_copy_pci_info(eth_list[i], pci_dev);
+		/* Restore non-PCI flags cleared by the above call. */
+		eth_list[i]->data->dev_flags |= restore;
+		rte_eth_dev_probing_finish(eth_list[i]);
+	}
 	mlx5_glue->free_device_list(ibv_list);
 	if (!n) {
 		DRV_LOG(WARNING,
@@ -1238,7 +1282,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			pci_dev->addr.devid, pci_dev->addr.function);
 		rte_errno = ENOENT;
 		ret = -rte_errno;
-	} else if (!eth_dev) {
+	} else if (i != n) {
 		DRV_LOG(ERR,
 			"probe of PCI device " PCI_PRI_FMT " aborted after"
 			" encountering an error: %s",
@@ -1246,9 +1290,16 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			pci_dev->addr.devid, pci_dev->addr.function,
 			strerror(rte_errno));
 		ret = -rte_errno;
+		/* Roll back. */
+		while (i--) {
+			mlx5_dev_close(eth_list[i]);
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				rte_free(eth_list[i]->data->dev_private);
+			claim_zero(rte_eth_dev_release_port(eth_list[i]));
+		}
+		/* Restore original error. */
+		rte_errno = -ret;
 	} else {
-		rte_eth_copy_pci_info(eth_dev, pci_dev);
-		rte_eth_dev_probing_finish(eth_dev);
 		ret = 0;
 	}
 	return ret;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 704046270..cc01310e0 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -159,6 +159,7 @@ struct priv {
 	struct ibv_context *ctx; /* Verbs context. */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
 	struct ibv_pd *pd; /* Protection Domain. */
+	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */
 	BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES);
@@ -168,6 +169,9 @@ struct priv {
 	/* Device properties. */
 	uint16_t mtu; /* Configured MTU. */
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
+	unsigned int representor:1; /* Device is a port representor. */
+	uint16_t domain_id; /* Switch domain identifier. */
+	int32_t representor_id; /* Port representor identifier. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
@@ -217,9 +221,12 @@ int mlx5_getenv_int(const char *);
 
 /* mlx5_ethdev.c */
 
+int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+			   char (*ifname)[IF_NAMESIZE]);
 int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);
 int mlx5_ifindex(const struct rte_eth_dev *dev);
-int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
+int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	       int master);
 int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);
 int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
 		   unsigned int flags);
@@ -244,6 +251,9 @@ int mlx5_set_link_up(struct rte_eth_dev *dev);
 int mlx5_is_removed(struct rte_eth_dev *dev);
 eth_tx_burst_t mlx5_select_tx_function(struct rte_eth_dev *dev);
 eth_rx_burst_t mlx5_select_rx_function(struct rte_eth_dev *dev);
+unsigned int mlx5_dev_to_port_id(const struct rte_device *dev,
+				 uint16_t *port_list,
+				 unsigned int port_list_n);
 
 /* mlx5_mac.c */
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 819f5baad..390e602c1 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -27,6 +27,7 @@
 #include <time.h>
 
 #include <rte_atomic.h>
+#include <rte_common.h>
 #include <rte_ethdev_driver.h>
 #include <rte_bus_pci.h>
 #include <rte_mbuf.h>
@@ -93,7 +94,7 @@ struct ethtool_link_settings {
 #endif
 
 /**
- * Get interface name from private structure.
+ * Get master interface name from private structure.
  *
  * @param[in] dev
  *   Pointer to Ethernet device.
@@ -104,7 +105,8 @@ struct ethtool_link_settings {
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+		       char (*ifname)[IF_NAMESIZE])
 {
 	struct priv *priv = dev->data->dev_private;
 	DIR *dir;
@@ -179,6 +181,39 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 }
 
 /**
+ * Get interface name from private structure.
+ *
+ * This is a port representor-aware version of mlx5_get_master_ifname().
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] ifname
+ *   Interface name output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+{
+	struct priv *priv = dev->data->dev_private;
+	unsigned int ifindex =
+		priv->nl_socket_rdma >= 0 ?
+		mlx5_nl_ifindex(priv->nl_socket_rdma, priv->ibdev_name) : 0;
+
+	if (!ifindex) {
+		if (!priv->representor)
+			return mlx5_get_master_ifname(dev, ifname);
+		rte_errno = ENXIO;
+		return -rte_errno;
+	}
+	if (if_indextoname(ifindex, &(*ifname)[0]))
+		return 0;
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
  * Get the interface index from device name.
  *
  * @param[in] dev
@@ -214,12 +249,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
  *   Request number to pass to ioctl().
  * @param[out] ifr
  *   Interface request structure output buffer.
+ * @param master
+ *   When device is a port representor, perform request on master device
+ *   instead.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	   int master)
 {
 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
 	int ret = 0;
@@ -228,7 +267,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
 		rte_errno = errno;
 		return -rte_errno;
 	}
-	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
+	if (master)
+		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
+	else
+		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
 	if (ret)
 		goto error;
 	ret = ioctl(sock, req, ifr);
@@ -258,7 +300,7 @@ int
 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
 
 	if (ret)
 		return ret;
@@ -282,7 +324,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 {
 	struct ifreq request = { .ifr_mtu = mtu, };
 
-	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
+	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
 }
 
 /**
@@ -302,13 +344,13 @@ int
 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
 
 	if (ret)
 		return ret;
 	request.ifr_flags &= keep;
 	request.ifr_flags |= flags & ~keep;
-	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
+	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
 }
 
 /**
@@ -477,6 +519,32 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	info->speed_capa = priv->link_speed_capa;
 	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
 	mlx5_set_default_params(dev, info);
+	info->switch_info.name = dev->data->name;
+	info->switch_info.domain_id = priv->domain_id;
+	info->switch_info.port_id = priv->representor_id;
+	if (priv->representor) {
+		uint16_t port_id[mlx5_dev_to_port_id(dev->device, NULL, 0)];
+		unsigned int i =
+			RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id,
+						    RTE_DIM(port_id)),
+				RTE_DIM(port_id));
+
+		while (i--) {
+			struct priv *opriv =
+				rte_eth_devices[port_id[i]].data->dev_private;
+
+			if (!opriv ||
+			    opriv->representor ||
+			    opriv->domain_id != priv->domain_id)
+				continue;
+			/*
+			 * Override switch name with that of the master
+			 * device.
+			 */
+			info->switch_info.name = opriv->dev_data->name;
+			break;
+		}
+	}
 }
 
 /**
@@ -540,7 +608,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	int link_speed = 0;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -550,7 +618,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&edata;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
@@ -611,7 +679,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	uint64_t sc;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -621,7 +689,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&gcmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -638,7 +706,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 
 	*ecmd = gcmd;
 	ifr.ifr_data = (void *)ecmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -801,7 +869,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	int ret;
 
 	ifr.ifr_data = (void *)&ethpause;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
@@ -854,7 +922,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 		ethpause.tx_pause = 1;
 	else
 		ethpause.tx_pause = 0;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
@@ -1193,3 +1261,40 @@ mlx5_is_removed(struct rte_eth_dev *dev)
 		return 1;
 	return 0;
 }
+
+/**
+ * Get port ID list of mlx5 instances sharing a common device.
+ *
+ * @param[in] dev
+ *   Device to look for.
+ * @param[out] port_list
+ *   Result buffer for collected port IDs.
+ * @param port_list_n
+ *   Maximum number of entries in result buffer. If 0, @p port_list can be
+ *   NULL.
+ *
+ * @return
+ *   Number of matching instances regardless of the @p port_list_n
+ *   parameter, 0 if none were found.
+ */
+unsigned int
+mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list,
+		    unsigned int port_list_n)
+{
+	uint16_t id;
+	unsigned int n = 0;
+
+	RTE_ETH_FOREACH_DEV(id) {
+		struct rte_eth_dev *ldev = &rte_eth_devices[id];
+
+		if (!ldev->device ||
+		    !ldev->device->driver ||
+		    strcmp(ldev->device->driver->name, MLX5_DRIVER_NAME) ||
+		    ldev->device != dev)
+			continue;
+		if (n < port_list_n)
+			port_list[n] = id;
+		n++;
+	}
+	return n;
+}
diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
index 672a47619..12ee37f55 100644
--- a/drivers/net/mlx5/mlx5_mac.c
+++ b/drivers/net/mlx5/mlx5_mac.c
@@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
 	struct ifreq request;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
+	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
 	if (ret)
 		return ret;
 	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c
index 875dd1027..91f3d474a 100644
--- a/drivers/net/mlx5/mlx5_stats.c
+++ b/drivers/net/mlx5/mlx5_stats.c
@@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
 	et_stats->cmd = ETHTOOL_GSTATS;
 	et_stats->n_stats = xstats_ctrl->stats_n;
 	ifr.ifr_data = (caddr_t)et_stats;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u unable to read statistic values from device",
@@ -194,7 +194,7 @@ mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
 
 	drvinfo.cmd = ETHTOOL_GDRVINFO;
 	ifr.ifr_data = (caddr_t)&drvinfo;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to query number of statistics",
 			dev->data->port_id);
@@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
 	strings->string_set = ETH_SS_STATS;
 	strings->len = dev_stats_n;
 	ifr.ifr_data = (caddr_t)strings;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to get statistic names",
 			dev->data->port_id);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 08/10] net/mlx5: probe port representors in natural order
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
                       ` (6 preceding siblings ...)
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 07/10] net/mlx5: probe all port representors Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 09/10] net/mlx5: add parameter for port representors Adrien Mazarguil
                       ` (2 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Port representors are probed in whatever unspecified order
ibv_get_device_list() returns them.

This is counterintuitive to users since DPDK port IDs assignment almost
never follows the same sequence as representor IDs. Additionally, the
master device does not necessarily inherit the lowest DPDK port ID.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v3 changes:

- This patch was not present in prior revisions.
---
 drivers/net/mlx5/mlx5.c | 95 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 74 insertions(+), 21 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 52bfc5c63..05dd3bbe7 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1143,6 +1143,52 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	return NULL;
 }
 
+/** Data associated with devices to spawn. */
+struct mlx5_dev_spawn_data {
+	unsigned int ifindex; /**< Network interface index. */
+	struct mlx5_switch_info info; /**< Switch information. */
+	struct ibv_device *ibv_dev; /**< Associated IB device. */
+	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+};
+
+/**
+ * Comparison callback to sort device data.
+ *
+ * This is meant to be used with qsort().
+ *
+ * @param a[in]
+ *   Pointer to pointer to first data object.
+ * @param b[in]
+ *   Pointer to pointer to second data object.
+ *
+ * @return
+ *   0 if both objects are equal, less than 0 if the first argument is less
+ *   than the second, greater than 0 otherwise.
+ */
+static int
+mlx5_dev_spawn_data_cmp(const void *a, const void *b)
+{
+	const struct mlx5_switch_info *si_a =
+		&((const struct mlx5_dev_spawn_data *)a)->info;
+	const struct mlx5_switch_info *si_b =
+		&((const struct mlx5_dev_spawn_data *)b)->info;
+	int ret;
+
+	/* Master device first. */
+	ret = si_b->master - si_a->master;
+	if (ret)
+		return ret;
+	/* Then representor devices. */
+	ret = si_b->representor - si_a->representor;
+	if (ret)
+		return ret;
+	/* Unidentified devices come last in no specific order. */
+	if (!si_a->representor)
+		return 0;
+	/* Order representors by name. */
+	return si_a->port_name - si_b->port_name;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -1193,9 +1239,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 	ibv_match[n] = NULL;
 
-	unsigned int ifindex[n];
-	struct mlx5_switch_info info[n];
-	struct rte_eth_dev *eth_list[n];
+	struct mlx5_dev_spawn_data list[n];
 	int nl_route = n ? mlx5_nl_init(0, NETLINK_ROUTE) : -1;
 	int nl_rdma = n ? mlx5_nl_init(0, NETLINK_RDMA) : -1;
 	unsigned int i;
@@ -1217,16 +1261,19 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	 *    bail out.
 	 */
 	for (i = 0; i != n; ++i) {
+		list[i].ibv_dev = ibv_match[i];
+		list[i].eth_dev = NULL;
 		if (nl_rdma < 0)
-			ifindex[i] = 0;
+			list[i].ifindex = 0;
 		else
-			ifindex[i] = mlx5_nl_ifindex(nl_rdma,
-						     ibv_match[i]->name);
+			list[i].ifindex = mlx5_nl_ifindex
+				(nl_rdma, list[i].ibv_dev->name);
 		if (nl_route < 0 ||
-		    !ifindex[i] ||
-		    mlx5_nl_switch_info(nl_route, ifindex[i], &info[i])) {
-			ifindex[i] = 0;
-			memset(&info[i], 0, sizeof(info[i]));
+		    !list[i].ifindex ||
+		    mlx5_nl_switch_info(nl_route, list[i].ifindex,
+					&list[i].info)) {
+			list[i].ifindex = 0;
+			memset(&list[i].info, 0, sizeof(list[i].info));
 			continue;
 		}
 	}
@@ -1236,7 +1283,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		close(nl_route);
 	/* Count unidentified devices. */
 	for (u = 0, i = 0; i != n; ++i)
-		if (!info[i].master && !info[i].representor)
+		if (!list[i].info.master && !list[i].info.representor)
 			++u;
 	if (u) {
 		if (n == 1 && u == 1) {
@@ -1250,6 +1297,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			n = 0;
 		}
 	}
+	/*
+	 * Sort list to probe devices in natural order for users convenience
+	 * (i.e. master first, then representors from lowest to highest ID).
+	 */
+	if (n)
+		qsort(list, n, sizeof(*list), mlx5_dev_spawn_data_cmp);
 	switch (pci_dev->id.device_id) {
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
@@ -1263,15 +1316,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	for (i = 0; i != n; ++i) {
 		uint32_t restore;
 
-		eth_list[i] = mlx5_dev_spawn(&pci_dev->device, ibv_match[i],
-					     vf, &info[i]);
-		if (!eth_list[i])
+		list[i].eth_dev = mlx5_dev_spawn
+			(&pci_dev->device, list[i].ibv_dev, vf, &list[i].info);
+		if (!list[i].eth_dev)
 			break;
-		restore = eth_list[i]->data->dev_flags;
-		rte_eth_copy_pci_info(eth_list[i], pci_dev);
+		restore = list[i].eth_dev->data->dev_flags;
+		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
 		/* Restore non-PCI flags cleared by the above call. */
-		eth_list[i]->data->dev_flags |= restore;
-		rte_eth_dev_probing_finish(eth_list[i]);
+		list[i].eth_dev->data->dev_flags |= restore;
+		rte_eth_dev_probing_finish(list[i].eth_dev);
 	}
 	mlx5_glue->free_device_list(ibv_list);
 	if (!n) {
@@ -1292,10 +1345,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		ret = -rte_errno;
 		/* Roll back. */
 		while (i--) {
-			mlx5_dev_close(eth_list[i]);
+			mlx5_dev_close(list[i].eth_dev);
 			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
-				rte_free(eth_list[i]->data->dev_private);
-			claim_zero(rte_eth_dev_release_port(eth_list[i]));
+				rte_free(list[i].eth_dev->data->dev_private);
+			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
 		}
 		/* Restore original error. */
 		rte_errno = -ret;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 09/10] net/mlx5: add parameter for port representors
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
                       ` (7 preceding siblings ...)
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 08/10] net/mlx5: probe port representors in natural order Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 10/10] net/mlx5: support negative identifiers " Adrien Mazarguil
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Prior to this patch, all port representors detected on a given device were
probed and Ethernet devices instantiated for each of them.

This patch adds support for the standard "representor" parameter, which
implies that port representors are not probed by default anymore, except
for the list provided through device arguments.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
--
v3 changes:

- Adapted representor detection to the reworked mlx5_dev_spawn().

v2 changes:

- Added error message for when rte_eth_devargs_parse() fails.
---
 doc/guides/nics/mlx5.rst                | 12 ++++++++
 doc/guides/prog_guide/poll_mode_drv.rst |  2 ++
 drivers/net/mlx5/mlx5.c                 | 41 ++++++++++++++++++++++++++--
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 7dd9c1c5e..0d0d21727 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -392,6 +392,18 @@ Run-time configuration
 
   Disabled by default.
 
+- ``representor`` parameter [list]
+
+  This parameter can be used to instantiate DPDK Ethernet devices from
+  existing port (or VF) representors configured on the device.
+
+  It is a standard parameter whose format is described in
+  :ref:`ethernet_device_standard_device_arguments`.
+
+  For instance, to probe port representors 0 through 2::
+
+    representor=[0-2]
+
 Firmware configuration
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/prog_guide/poll_mode_drv.rst b/doc/guides/prog_guide/poll_mode_drv.rst
index af82352a0..58d49ba0f 100644
--- a/doc/guides/prog_guide/poll_mode_drv.rst
+++ b/doc/guides/prog_guide/poll_mode_drv.rst
@@ -365,6 +365,8 @@ Ethernet Device API
 
 The Ethernet device API exported by the Ethernet PMDs is described in the *DPDK API Reference*.
 
+.. _ethernet_device_standard_device_arguments:
+
 Ethernet Device Standard Device Arguments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 05dd3bbe7..7e757274a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -92,6 +92,9 @@
 /* Activate Netlink support in VF mode. */
 #define MLX5_VF_NL_EN "vf_nl_en"
 
+/* Select port representors to instantiate. */
+#define MLX5_REPRESENTOR "representor"
+
 #ifndef HAVE_IBV_MLX5_MOD_MPW
 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
@@ -426,6 +429,9 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	struct mlx5_dev_config *config = opaque;
 	unsigned long tmp;
 
+	/* No-op, port representors are processed in mlx5_dev_spawn(). */
+	if (!strcmp(MLX5_REPRESENTOR, key))
+		return 0;
 	errno = 0;
 	tmp = strtoul(val, NULL, 0);
 	if (errno) {
@@ -498,6 +504,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
 		MLX5_VF_NL_EN,
+		MLX5_REPRESENTOR,
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
@@ -655,7 +662,9 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
  *
  * @return
  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
- *   is set.
+ *   is set. The following error is defined:
+ *
+ *   EBUSY: device is not supposed to be spawned.
  */
 static struct rte_eth_dev *
 mlx5_dev_spawn(struct rte_device *dpdk_dev,
@@ -707,6 +716,26 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	int own_domain_id = 0;
 	unsigned int i;
 
+	/* Determine if this port representor is supposed to be spawned. */
+	if (switch_info->representor && dpdk_dev->devargs) {
+		struct rte_eth_devargs eth_da;
+
+		err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
+		if (err) {
+			rte_errno = -err;
+			DRV_LOG(ERR, "failed to process device arguments: %s",
+				strerror(rte_errno));
+			return NULL;
+		}
+		for (i = 0; i < eth_da.nb_representor_ports; ++i)
+			if (eth_da.representor_ports[i] ==
+			    (uint16_t)switch_info->port_name)
+				break;
+		if (i == eth_da.nb_representor_ports) {
+			rte_errno = EBUSY;
+			return NULL;
+		}
+	}
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
 	errno = 0;
@@ -1318,8 +1347,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 
 		list[i].eth_dev = mlx5_dev_spawn
 			(&pci_dev->device, list[i].ibv_dev, vf, &list[i].info);
-		if (!list[i].eth_dev)
-			break;
+		if (!list[i].eth_dev) {
+			if (rte_errno != EBUSY)
+				break;
+			/* Device is disabled, ignore it. */
+			continue;
+		}
 		restore = list[i].eth_dev->data->dev_flags;
 		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
 		/* Restore non-PCI flags cleared by the above call. */
@@ -1345,6 +1378,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		ret = -rte_errno;
 		/* Roll back. */
 		while (i--) {
+			if (!list[i].eth_dev)
+				continue;
 			mlx5_dev_close(list[i].eth_dev);
 			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 				rte_free(list[i].eth_dev->data->dev_private);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v3 10/10] net/mlx5: support negative identifiers for port representors
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
                       ` (8 preceding siblings ...)
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 09/10] net/mlx5: add parameter for port representors Adrien Mazarguil
@ 2018-07-04 17:27     ` Adrien Mazarguil
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-04 17:27 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This patch brings support for BlueField representors.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
--
v3 changes:

- This patch was not present in prior revisions.
---
 drivers/net/mlx5/mlx5.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7e757274a..d2031c633 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1305,6 +1305,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			memset(&list[i].info, 0, sizeof(list[i].info));
 			continue;
 		}
+		/*
+		 * Port representors not associated with any VFs (e.g. on
+		 * BlueField devices) report -1 as a port identifier.
+		 * Quietly set it to zero since DPDK only supports positive
+		 * values.
+		 */
+		if (list[i].info.representor && list[i].info.port_name == -1)
+			list[i].info.port_name = 0;
 	}
 	if (nl_rdma >= 0)
 		close(nl_rdma);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support
  2018-07-04 17:27   ` [dpdk-dev] [PATCH v3 00/10] net/mlx5: add port representor support Adrien Mazarguil
                       ` (9 preceding siblings ...)
  2018-07-04 17:27     ` [dpdk-dev] [PATCH v3 10/10] net/mlx5: support negative identifiers " Adrien Mazarguil
@ 2018-07-05  8:45     ` Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
                         ` (10 more replies)
  10 siblings, 11 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This series adds support for port (VF) representors to the mlx5 PMD, which
can be instantiated using the standard "representor" device parameter.

Note the PMD only probes existing representors which exist as Verbs devices;
their creation is part of the host system configuration.

v4 changes:

- Fixed domain ID release that did not work, see relevant patch.
- Rebased series.

v3 changes:

- Added the following patches:
  - net/mlx5: drop useless support for several Verbs ports
  - net/mlx5: probe port representors in natural order
  - net/mlx5: support negative identifiers for port representors
- See individual patches for details.
- Rebased series.

v2 changes:

- See individual patches for details.
- Rebased series.

Adrien Mazarguil (10):
  net/mlx5: rename confusing object in probe code
  net/mlx5: remove redundant objects in probe code
  net/mlx5: drop useless support for several Verbs ports
  net/mlx5: split PCI from generic probing code
  net/mlx5: re-indent generic probing function
  net/mlx5: add port representor awareness
  net/mlx5: probe all port representors
  net/mlx5: probe port representors in natural order
  net/mlx5: add parameter for port representors
  net/mlx5: support negative identifiers for port representors

 doc/guides/nics/mlx5.rst                |   12 +
 doc/guides/prog_guide/poll_mode_drv.rst |    2 +
 drivers/net/mlx5/Makefile               |   30 +
 drivers/net/mlx5/mlx5.c                 | 1108 ++++++++++++++++----------
 drivers/net/mlx5/mlx5.h                 |   29 +-
 drivers/net/mlx5/mlx5_ethdev.c          |  135 +++-
 drivers/net/mlx5/mlx5_mac.c             |    2 +-
 drivers/net/mlx5/mlx5_nl.c              |  297 ++++++-
 drivers/net/mlx5/mlx5_stats.c           |    6 +-
 drivers/net/mlx5/mlx5_txq.c             |    2 +-
 10 files changed, 1149 insertions(+), 474 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 01/10] net/mlx5: rename confusing object in probe code
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 02/10] net/mlx5: remove redundant objects " Adrien Mazarguil
                         ` (9 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

There are several attribute objects in this function:

- IB device attributes (struct ibv_device_attr_ex device_attr).
- Direct Verbs attributes (struct mlx5dv_context attrs_out).
- Port attributes (struct ibv_port_attr).
- IB device attributes again (struct ibv_device_attr_ex device_attr_ex).

"attrs_out" is both odd and initialized using a nonstandard syntax. Rename
it "dv_attr" for consistency.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
--
v2 changes:

- Fixed ctx -> attr_ctx in mlx5_pci_probe().
---
 drivers/net/mlx5/mlx5.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d081bdd05..22cbce8d5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -654,6 +654,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
 	struct ibv_context *attr_ctx = NULL;
 	struct ibv_device_attr_ex device_attr;
@@ -670,7 +671,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
 	int i;
-	struct mlx5dv_context attrs_out = {0};
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
@@ -736,21 +736,21 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	ibv_dev = list[i];
 	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
 	/*
 	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
 	 * as all ConnectX-5 devices.
 	 */
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
-	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
-		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
+	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
 			mps = MLX5_MPW_ENHANCED;
 		} else {
@@ -762,14 +762,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		mps = MLX5_MPW_DISABLED;
 	}
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
-		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
+		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
-			attrs_out.striding_rq_caps;
+			dv_attr.striding_rq_caps;
 
 		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
 			mprq_caps.min_single_stride_log_num_of_bytes);
@@ -794,15 +794,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
-	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
+	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
-		tunnel_en = ((attrs_out.tunnel_offloads_caps &
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
+		tunnel_en = ((dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
-			     (attrs_out.tunnel_offloads_caps &
+			     (dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
 	}
 	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
@@ -812,9 +812,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
-	mpls_en = ((attrs_out.tunnel_offloads_caps &
+	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
-		   (attrs_out.tunnel_offloads_caps &
+		   (dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
 	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
 		mpls_en ? "" : "not ");
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 02/10] net/mlx5: remove redundant objects in probe code
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 03/10] net/mlx5: drop useless support for several Verbs ports Adrien Mazarguil
                         ` (8 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This patch gets rid of redundant calls to open the device and query its
attributes in order to simplify the code.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
--
v2 changes:

- Minor indent fix on existing code.
---
 drivers/net/mlx5/mlx5.c | 64 +++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 22cbce8d5..4e7f29f5b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -654,10 +654,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct ibv_context *ctx = NULL;
+	struct ibv_device_attr_ex attr;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
-	struct ibv_context *attr_ctx = NULL;
-	struct ibv_device_attr_ex device_attr;
 	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
@@ -714,12 +714,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
 		      (pci_dev->id.device_id ==
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		attr_ctx = mlx5_glue->open_device(list[i]);
+		ctx = mlx5_glue->open_device(list[i]);
 		rte_errno = errno;
 		err = rte_errno;
 		break;
 	}
-	if (attr_ctx == NULL) {
+	if (ctx == NULL) {
 		switch (err) {
 		case 0:
 			DRV_LOG(ERR,
@@ -748,7 +748,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
+	mlx5_glue->dv_query_device(ctx, &dv_attr);
 	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
 		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
@@ -822,23 +822,20 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
+	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	if (err) {
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected",
-		device_attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
 		char name[RTE_ETH_NAME_MAX_LEN];
 		int len;
 		uint32_t port = i + 1; /* ports are indexed from one */
-		struct ibv_context *ctx = NULL;
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
 		struct rte_eth_dev *eth_dev = NULL;
-		struct ibv_device_attr_ex device_attr_ex;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -865,7 +862,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
 			 pci_dev->addr.domain, pci_dev->addr.bus,
 			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (device_attr.orig_attr.phys_port_cnt > 1)
+		if (attr.orig_attr.phys_port_cnt > 1)
 			snprintf(name + len, sizeof(name), " port %u", i);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
@@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			continue;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		ctx = mlx5_glue->open_device(ibv_dev);
+		if (!ctx)
+			ctx = mlx5_glue->open_device(ibv_dev);
 		if (ctx == NULL) {
 			err = ENODEV;
 			goto port_error;
@@ -949,7 +947,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = device_attr;
+		priv->device_attr = attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
@@ -960,17 +958,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				strerror(rte_errno));
 			goto port_error;
 		}
-		err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
-		if (err) {
-			DRV_LOG(ERR, "ibv_query_device_ex() failed");
-			goto port_error;
-		}
-		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
+		config.hw_csum = !!(attr.device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!(device_attr.max_counter_sets);
+		config.flow_counter_en = !!attr.max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -978,7 +971,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
+			attr.rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -986,29 +979,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
+		config.hw_padding = !!attr.rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
-			      (device_attr_ex.tso_caps.supported_qpts &
-			      (1 << IBV_QPT_RAW_PACKET)));
+		config.tso = (attr.tso_caps.max_tso > 0 &&
+			      (attr.tso_caps.supported_qpts &
+			       (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz =
-					device_attr_ex.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr.tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1170,14 +1162,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
+		/*
+		 * Each eth_dev instance is assigned its own Verbs context,
+		 * since this one is consumed, let the next iteration open
+		 * another.
+		 */
+		ctx = NULL;
 		continue;
 port_error:
 		if (priv)
 			rte_free(priv);
 		if (pd)
 			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (ctx)
-			claim_zero(mlx5_glue->close_device(ctx));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
 		break;
@@ -1189,8 +1185,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	 * way to enumerate the registered ethdevs to free the previous ones.
 	 */
 error:
-	if (attr_ctx)
-		claim_zero(mlx5_glue->close_device(attr_ctx));
+	if (ctx)
+		claim_zero(mlx5_glue->close_device(ctx));
 	if (list)
 		mlx5_glue->free_device_list(list);
 	if (err) {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 03/10] net/mlx5: drop useless support for several Verbs ports
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 02/10] net/mlx5: remove redundant objects " Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 04/10] net/mlx5: split PCI from generic probing code Adrien Mazarguil
                         ` (7 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Xueming Li

Unlike mlx4 from which this capability was inherited, mlx5 devices expose
exactly one Verbs port per PCI bus address. Each physical port gets
assigned its own bus address with a single Verbs port.

While harmless, this code requires an extra loop that would get in the way
of subsequent refactoring.

No functional impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
Cc: Xueming Li <xuemingl@mellanox.com>
--
v3 changes:

This patch was not present in prior revisions. As discussed [1], it was
added after finally deciding to remove this support.

[1] https://mails.dpdk.org/archives/dev/2018-June/105661.html
---
 drivers/net/mlx5/mlx5.c        | 96 +++++++++++++------------------------
 drivers/net/mlx5/mlx5.h        |  1 -
 drivers/net/mlx5/mlx5_ethdev.c |  2 +-
 drivers/net/mlx5/mlx5_txq.c    |  2 +-
 4 files changed, 34 insertions(+), 67 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 4e7f29f5b..717d8b268 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -652,11 +652,13 @@ static int
 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	       struct rte_pci_device *pci_dev)
 {
-	struct ibv_device **list = NULL;
-	struct ibv_device *ibv_dev;
+	struct ibv_device **list;
 	struct ibv_context *ctx = NULL;
 	struct ibv_device_attr_ex attr;
+	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct rte_eth_dev *eth_dev = NULL;
+	struct priv *priv = NULL;
 	int err = 0;
 	unsigned int vf = 0;
 	unsigned int mps;
@@ -719,6 +721,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		err = rte_errno;
 		break;
 	}
+	mlx5_glue->free_device_list(list);
 	if (ctx == NULL) {
 		switch (err) {
 		case 0:
@@ -733,7 +736,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		}
 		goto error;
 	}
-	ibv_dev = list[i];
 	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
@@ -827,15 +829,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+	{
 		char name[RTE_ETH_NAME_MAX_LEN];
-		int len;
-		uint32_t port = i + 1; /* ports are indexed from one */
 		struct ibv_port_attr port_attr;
-		struct ibv_pd *pd = NULL;
-		struct priv *priv = NULL;
-		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -859,11 +855,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
+		snprintf(name, sizeof(name), PCI_PRI_FMT,
 			 pci_dev->addr.domain, pci_dev->addr.bus,
 			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (attr.orig_attr.phys_port_cnt > 1)
-			snprintf(name + len, sizeof(name), " port %u", i);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -901,31 +895,22 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
 			rte_eth_dev_probing_finish(eth_dev);
-			continue;
-		}
-		DRV_LOG(DEBUG, "using port %u", port);
-		if (!ctx)
-			ctx = mlx5_glue->open_device(ibv_dev);
-		if (ctx == NULL) {
-			err = ENODEV;
-			goto port_error;
+			claim_zero(mlx5_glue->close_device(ctx));
+			return 0;
 		}
 		/* Check port status. */
-		err = mlx5_glue->query_port(ctx, port, &port_attr);
+		err = mlx5_glue->query_port(ctx, 1, &port_attr);
 		if (err) {
 			DRV_LOG(ERR, "port query failed: %s", strerror(err));
-			goto port_error;
+			goto error;
 		}
 		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-			DRV_LOG(ERR,
-				"port %d is not configured in Ethernet mode",
-				port);
+			DRV_LOG(ERR, "port is not configured in Ethernet mode");
 			err = EINVAL;
-			goto port_error;
+			goto error;
 		}
 		if (port_attr.state != IBV_PORT_ACTIVE)
-			DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
-				port,
+			DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
 				mlx5_glue->port_state_str(port_attr.state),
 				port_attr.state);
 		/* Allocate protection domain. */
@@ -933,7 +918,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (pd == NULL) {
 			DRV_LOG(ERR, "PD allocation failure");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		/* from rte_ethdev.c */
 		priv = rte_zmalloc("ethdev private structure",
@@ -942,13 +927,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (priv == NULL) {
 			DRV_LOG(ERR, "priv allocation failure");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
 		priv->device_attr = attr;
-		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
 		err = mlx5_args(&config, pci_dev->device.devargs);
@@ -956,7 +940,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
 				strerror(rte_errno));
-			goto port_error;
+			goto error;
 		}
 		config.hw_csum = !!(attr.device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
@@ -1006,7 +990,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				"multi-packet send not supported on this device"
 				" (" MLX5_TXQ_MPW_EN ")");
 			err = ENOTSUP;
-			goto port_error;
+			goto error;
 		}
 		DRV_LOG(INFO, "%s MPS is %s",
 			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
@@ -1038,7 +1022,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (eth_dev == NULL) {
 			DRV_LOG(ERR, "can not allocate rte ethdev");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
@@ -1049,7 +1033,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
 			err = rte_errno;
-			goto port_error;
+			goto error;
 		}
 		/* Configure the first MAC address by default. */
 		if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
@@ -1058,7 +1042,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				" loaded? (errno: %s)",
 				eth_dev->data->port_id, strerror(rte_errno));
 			err = ENODEV;
-			goto port_error;
+			goto error;
 		}
 		DRV_LOG(INFO,
 			"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
@@ -1082,7 +1066,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		err = mlx5_get_mtu(eth_dev, &priv->mtu);
 		if (err) {
 			err = rte_errno;
-			goto port_error;
+			goto error;
 		}
 		DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
 			priv->mtu);
@@ -1131,7 +1115,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
 				eth_dev->data->port_id, strerror(rte_errno));
 			err = rte_errno;
-			goto port_error;
+			goto error;
 		}
 		/* Supported Verbs flow priority number detection. */
 		if (verb_priorities == 0)
@@ -1140,7 +1124,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
 				eth_dev->data->port_id, verb_priorities);
 			err = ENOTSUP;
-			goto port_error;
+			goto error;
 		}
 		priv->config.max_verbs_prio = verb_priorities;
 		/*
@@ -1154,7 +1138,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 					 eth_dev->device->numa_node);
 		if (err) {
 			err = rte_errno;
-			goto port_error;
+			goto error;
 		}
 		/* Add device to memory callback list. */
 		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
@@ -1162,33 +1146,17 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
-		/*
-		 * Each eth_dev instance is assigned its own Verbs context,
-		 * since this one is consumed, let the next iteration open
-		 * another.
-		 */
-		ctx = NULL;
-		continue;
-port_error:
-		if (priv)
-			rte_free(priv);
-		if (pd)
-			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
-			rte_eth_dev_release_port(eth_dev);
-		break;
+		return 0;
 	}
-	/*
-	 * XXX if something went wrong in the loop above, there is a resource
-	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
-	 * long as the dpdk does not provide a way to deallocate a ethdev and a
-	 * way to enumerate the registered ethdevs to free the previous ones.
-	 */
 error:
+	if (priv)
+		rte_free(priv);
+	if (pd)
+		claim_zero(mlx5_glue->dealloc_pd(pd));
+	if (eth_dev)
+		rte_eth_dev_release_port(eth_dev);
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
-	if (list)
-		mlx5_glue->free_device_list(list);
 	if (err) {
 		rte_errno = err;
 		return -rte_errno;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 997b04a33..f55ff4a21 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -159,7 +159,6 @@ struct priv {
 	unsigned int vlan_filter_n; /* Number of configured VLAN filters. */
 	/* Device properties. */
 	uint16_t mtu; /* Configured MTU. */
-	uint8_t port; /* Physical port number. */
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index ebe5cb6e3..819f5baad 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -166,7 +166,7 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 		if (dev_port == dev_port_prev)
 			goto try_dev_id;
 		dev_port_prev = dev_port;
-		if (dev_port == (priv->port - 1u))
+		if (dev_port == 0)
 			strlcpy(match, name, sizeof(match));
 	}
 	closedir(dir);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 669b91319..5057561ae 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -434,7 +434,7 @@ mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 		/* Move the QP to this state. */
 		.qp_state = IBV_QPS_INIT,
 		/* Primary port number. */
-		.port_num = priv->port
+		.port_num = 1,
 	};
 	ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod,
 				   (IBV_QP_STATE | IBV_QP_PORT));
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 04/10] net/mlx5: split PCI from generic probing code
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
                         ` (2 preceding siblings ...)
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 03/10] net/mlx5: drop useless support for several Verbs ports Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 05/10] net/mlx5: re-indent generic probing function Adrien Mazarguil
                         ` (6 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

All the generic probing code needs is an IB device. While this device is
currently supplied by a PCI lookup, other methods will be added soon.

This patch divides the original function, which has become huge over time,
as follows:

1. PCI-specific (mlx5_pci_probe()).
2. Verbs device (mlx5_dev_spawn()).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
--
v3 changes:

- Moved VF device check within mlx5_pci_probe() after identifying the
  device instead of before that.
- Merged mlx5_dev_spawn_one() with mlx5_dev_spawn() since there is no need
  anymore for an intermediate function to iterate over Verbs ports.

v2 changes:

- Fixed device naming. A port suffix is now appended only if several IB
  ports happen to be detected.
- Added separate message to distinguish missing kernel drivers from other
  initialization errors, as it was confusing.
---
 drivers/net/mlx5/mlx5.c | 195 ++++++++++++++++++++++++-------------------
 1 file changed, 109 insertions(+), 86 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 717d8b268..8916d4684 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -36,6 +36,7 @@
 #include <rte_kvargs.h>
 #include <rte_rwlock.h>
 #include <rte_spinlock.h>
+#include <rte_string_fns.h>
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -635,32 +636,31 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 }
 
 /**
- * DPDK callback to register a PCI device.
- *
- * This function creates an Ethernet device for each port of a given
- * PCI device.
+ * Spawn an Ethernet device from Verbs information.
  *
- * @param[in] pci_drv
- *   PCI driver structure (mlx5_driver).
- * @param[in] pci_dev
- *   PCI device information.
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
  *
  * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
+ *   A valid Ethernet device object on success, NULL otherwise and rte_errno
+ *   is set.
  */
-static int
-mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
-	       struct rte_pci_device *pci_dev)
+static struct rte_eth_dev *
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+	       struct ibv_device *ibv_dev,
+	       int vf)
 {
-	struct ibv_device **list;
-	struct ibv_context *ctx = NULL;
+	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
 	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	struct rte_eth_dev *eth_dev = NULL;
 	struct priv *priv = NULL;
 	int err = 0;
-	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
 	unsigned int tunnel_en = 0;
@@ -672,71 +672,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_max_stride_size_n = 0;
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
-	int i;
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
-	assert(pci_drv == &mlx5_driver);
-	list = mlx5_glue->get_device_list(&i);
-	if (list == NULL) {
-		assert(errno);
-		err = errno;
-		if (errno == ENOSYS)
-			DRV_LOG(ERR,
-				"cannot list devices, is ib_uverbs loaded?");
-		goto error;
-	}
-	assert(i >= 0);
-	/*
-	 * For each listed device, check related sysfs entry against
-	 * the provided PCI ID.
-	 */
-	while (i != 0) {
-		struct rte_pci_addr pci_addr;
-
-		--i;
-		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
-		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
-			continue;
-		if ((pci_dev->addr.domain != pci_addr.domain) ||
-		    (pci_dev->addr.bus != pci_addr.bus) ||
-		    (pci_dev->addr.devid != pci_addr.devid) ||
-		    (pci_dev->addr.function != pci_addr.function))
-			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
-			list[i]->name);
-		vf = ((pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		ctx = mlx5_glue->open_device(list[i]);
-		rte_errno = errno;
-		err = rte_errno;
-		break;
-	}
-	mlx5_glue->free_device_list(list);
-	if (ctx == NULL) {
-		switch (err) {
-		case 0:
-			DRV_LOG(ERR,
-				"cannot access device, is mlx5_ib loaded?");
-			err = ENODEV;
-			break;
-		case EINVAL:
-			DRV_LOG(ERR,
-				"cannot use device, are drivers up to date?");
-			break;
-		}
-		goto error;
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		return NULL;
 	}
-	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -855,9 +802,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		snprintf(name, sizeof(name), PCI_PRI_FMT,
-			 pci_dev->addr.domain, pci_dev->addr.bus,
-			 pci_dev->addr.devid, pci_dev->addr.function);
+		rte_strlcpy(name, dpdk_dev->name, sizeof(name));
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -866,7 +811,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				err = rte_errno;
 				goto error;
 			}
-			eth_dev->device = &pci_dev->device;
+			eth_dev->device = dpdk_dev;
 			eth_dev->dev_ops = &mlx5_dev_sec_ops;
 			err = mlx5_uar_init_secondary(eth_dev);
 			if (err) {
@@ -894,9 +839,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				mlx5_select_rx_function(eth_dev);
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
-			rte_eth_dev_probing_finish(eth_dev);
 			claim_zero(mlx5_glue->close_device(ctx));
-			return 0;
+			return eth_dev;
 		}
 		/* Check port status. */
 		err = mlx5_glue->query_port(ctx, 1, &port_attr);
@@ -935,7 +879,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->device_attr = attr;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, pci_dev->device.devargs);
+		err = mlx5_args(&config, dpdk_dev->devargs);
 		if (err) {
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
@@ -1027,8 +971,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
 		eth_dev->data->mac_addrs = priv->mac;
-		eth_dev->device = &pci_dev->device;
-		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		eth_dev->device = dpdk_dev;
 		eth_dev->device->driver = &mlx5_driver.driver;
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
@@ -1146,7 +1089,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
-		return 0;
+		return eth_dev;
 	}
 error:
 	if (priv)
@@ -1157,11 +1100,91 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		rte_eth_dev_release_port(eth_dev);
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
-	if (err) {
-		rte_errno = err;
+	assert(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function spawns an Ethernet device out of a given PCI device.
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+	       struct rte_pci_device *pci_dev)
+{
+	struct ibv_device **ibv_list;
+	struct rte_eth_dev *eth_dev = NULL;
+	int vf;
+	int ret;
+
+	assert(pci_drv == &mlx5_driver);
+	errno = 0;
+	ibv_list = mlx5_glue->get_device_list(&ret);
+	if (!ibv_list) {
+		rte_errno = errno ? errno : ENOSYS;
+		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
-	return 0;
+	while (ret-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
+			continue;
+		if (pci_dev->addr.domain != pci_addr.domain ||
+		    pci_dev->addr.bus != pci_addr.bus ||
+		    pci_dev->addr.devid != pci_addr.devid ||
+		    pci_dev->addr.function != pci_addr.function)
+			continue;
+		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+			ibv_list[ret]->name);
+		break;
+	}
+	switch (pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+		vf = 1;
+		break;
+	default:
+		vf = 0;
+	}
+	if (ret >= 0)
+		eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	mlx5_glue->free_device_list(ibv_list);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"no Verbs device matches PCI device " PCI_PRI_FMT ","
+			" are kernel drivers loaded?",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+		rte_errno = ENOENT;
+		ret = -rte_errno;
+	} else if (!eth_dev) {
+		DRV_LOG(ERR,
+			"probe of PCI device " PCI_PRI_FMT " aborted after"
+			" encountering an error: %s",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function,
+			strerror(rte_errno));
+		ret = -rte_errno;
+	} else {
+		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		rte_eth_dev_probing_finish(eth_dev);
+		ret = 0;
+	}
+	return ret;
 }
 
 static const struct rte_pci_id mlx5_pci_id_map[] = {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 05/10] net/mlx5: re-indent generic probing function
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
                         ` (3 preceding siblings ...)
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 04/10] net/mlx5: split PCI from generic probing code Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 06/10] net/mlx5: add port representor awareness Adrien Mazarguil
                         ` (5 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Since commit "net/mlx5: drop useless support for several Verbs ports"
removed an inner loop, mlx5_dev_spawn() is left with an unnecessary indent
level.

This patch eliminates a block, moves its local variables to function scope,
and re-indents its contents (diff best viewed with --ignore-all-space).

No functional impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming(Steven) Li <xuemingl@mellanox.com>
--
v3 changes:

- Reworded commit log since original patch was modified. This patch is also
  much shorter as a consequence.
---
 drivers/net/mlx5/mlx5.c | 578 +++++++++++++++++++++----------------------
 1 file changed, 282 insertions(+), 296 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 8916d4684..1054bf6d0 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -656,8 +656,25 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 {
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
+	struct ibv_port_attr port_attr;
 	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct mlx5_dev_config config = {
+		.vf = !!vf,
+		.tx_vec_en = 1,
+		.rx_vec_en = 1,
+		.mpw_hdr_dseg = 0,
+		.txq_inline = MLX5_ARG_UNSET,
+		.txqs_inline = MLX5_ARG_UNSET,
+		.inline_max_packet_sz = MLX5_ARG_UNSET,
+		.vf_nl_en = 1,
+		.mprq = {
+			.enabled = 0,
+			.stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
+			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
+			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
+		},
+	};
 	struct rte_eth_dev *eth_dev = NULL;
 	struct priv *priv = NULL;
 	int err = 0;
@@ -675,6 +692,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
+	struct ether_addr mac;
+	char name[RTE_ETH_NAME_MAX_LEN];
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
@@ -710,11 +729,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		DRV_LOG(DEBUG, "MPW isn't supported");
 		mps = MLX5_MPW_DISABLED;
 	}
+	config.mps = mps;
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
 		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
+	config.swp = !!swp;
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
@@ -740,6 +761,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			mprq_caps.min_single_wqe_log_num_of_strides;
 		mprq_max_stride_num_n =
 			mprq_caps.max_single_wqe_log_num_of_strides;
+		config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+						   mprq_min_stride_num_n);
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
@@ -747,6 +770,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
+	config.cqe_comp = cqe_comp;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
 		tunnel_en = ((dv_attr.tunnel_offloads_caps &
@@ -760,6 +784,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	DRV_LOG(WARNING,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
+	config.tunnel_en = tunnel_en;
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
 	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
@@ -771,326 +796,287 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
+	config.mpls_en = mpls_en;
 	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	if (err) {
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	{
-		char name[RTE_ETH_NAME_MAX_LEN];
-		struct ibv_port_attr port_attr;
-		struct ether_addr mac;
-		struct mlx5_dev_config config = {
-			.cqe_comp = cqe_comp,
-			.mps = mps,
-			.tunnel_en = tunnel_en,
-			.mpls_en = mpls_en,
-			.tx_vec_en = 1,
-			.rx_vec_en = 1,
-			.mpw_hdr_dseg = 0,
-			.txq_inline = MLX5_ARG_UNSET,
-			.txqs_inline = MLX5_ARG_UNSET,
-			.inline_max_packet_sz = MLX5_ARG_UNSET,
-			.vf_nl_en = 1,
-			.swp = !!swp,
-			.mprq = {
-				.enabled = 0, /* Disabled by default. */
-				.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
-							mprq_min_stride_num_n),
-				.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
-				.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
-			},
-		};
-
-		rte_strlcpy(name, dpdk_dev->name, sizeof(name));
-		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-			eth_dev = rte_eth_dev_attach_secondary(name);
-			if (eth_dev == NULL) {
-				DRV_LOG(ERR, "can not attach rte ethdev");
-				rte_errno = ENOMEM;
-				err = rte_errno;
-				goto error;
-			}
-			eth_dev->device = dpdk_dev;
-			eth_dev->dev_ops = &mlx5_dev_sec_ops;
-			err = mlx5_uar_init_secondary(eth_dev);
-			if (err) {
-				err = rte_errno;
-				goto error;
-			}
-			/* Receive command fd from primary process */
-			err = mlx5_socket_connect(eth_dev);
-			if (err < 0) {
-				err = rte_errno;
-				goto error;
-			}
-			/* Remap UAR for Tx queues. */
-			err = mlx5_tx_uar_remap(eth_dev, err);
-			if (err) {
-				err = rte_errno;
-				goto error;
-			}
-			/*
-			 * Ethdev pointer is still required as input since
-			 * the primary device is not accessible from the
-			 * secondary process.
-			 */
-			eth_dev->rx_pkt_burst =
-				mlx5_select_rx_function(eth_dev);
-			eth_dev->tx_pkt_burst =
-				mlx5_select_tx_function(eth_dev);
-			claim_zero(mlx5_glue->close_device(ctx));
-			return eth_dev;
-		}
-		/* Check port status. */
-		err = mlx5_glue->query_port(ctx, 1, &port_attr);
-		if (err) {
-			DRV_LOG(ERR, "port query failed: %s", strerror(err));
-			goto error;
-		}
-		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-			DRV_LOG(ERR, "port is not configured in Ethernet mode");
-			err = EINVAL;
-			goto error;
-		}
-		if (port_attr.state != IBV_PORT_ACTIVE)
-			DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
-				mlx5_glue->port_state_str(port_attr.state),
-				port_attr.state);
-		/* Allocate protection domain. */
-		pd = mlx5_glue->alloc_pd(ctx);
-		if (pd == NULL) {
-			DRV_LOG(ERR, "PD allocation failure");
-			err = ENOMEM;
-			goto error;
-		}
-		/* from rte_ethdev.c */
-		priv = rte_zmalloc("ethdev private structure",
-				   sizeof(*priv),
-				   RTE_CACHE_LINE_SIZE);
-		if (priv == NULL) {
-			DRV_LOG(ERR, "priv allocation failure");
-			err = ENOMEM;
-			goto error;
-		}
-		priv->ctx = ctx;
-		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
-			sizeof(priv->ibdev_path));
-		priv->device_attr = attr;
-		priv->pd = pd;
-		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, dpdk_dev->devargs);
-		if (err) {
-			err = rte_errno;
-			DRV_LOG(ERR, "failed to process device arguments: %s",
-				strerror(rte_errno));
-			goto error;
-		}
-		config.hw_csum = !!(attr.device_cap_flags_ex &
-				    IBV_DEVICE_RAW_IP_CSUM);
-		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
-			(config.hw_csum ? "" : "not "));
-#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!attr.max_counter_sets;
-		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
-		DRV_LOG(DEBUG,
-			"counter type = %d, num of cs = %ld, attributes = %d",
-			cs_desc.counter_type, cs_desc.num_of_cs,
-			cs_desc.attributes);
-#endif
-		config.ind_table_max_size =
-			attr.rss_caps.max_rwq_indirection_table_size;
-		/* Remove this check once DPDK supports larger/variable
-		 * indirection tables. */
-		if (config.ind_table_max_size >
-				(unsigned int)ETH_RSS_RETA_SIZE_512)
-			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
-		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
-			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(attr.raw_packet_caps &
-					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
-		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
-			(config.hw_vlan_strip ? "" : "not "));
-
-		config.hw_fcs_strip = !!(attr.raw_packet_caps &
-					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
-		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
-			(config.hw_fcs_strip ? "" : "not "));
-
-#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!attr.rx_pad_end_addr_align;
-#endif
-		DRV_LOG(DEBUG,
-			"hardware Rx end alignment padding is %ssupported",
-			(config.hw_padding ? "" : "not "));
-		config.vf = vf;
-		config.tso = (attr.tso_caps.max_tso > 0 &&
-			      (attr.tso_caps.supported_qpts &
-			       (1 << IBV_QPT_RAW_PACKET)));
-		if (config.tso)
-			config.tso_max_payload_sz = attr.tso_caps.max_tso;
-		if (config.mps && !mps) {
-			DRV_LOG(ERR,
-				"multi-packet send not supported on this device"
-				" (" MLX5_TXQ_MPW_EN ")");
-			err = ENOTSUP;
-			goto error;
-		}
-		DRV_LOG(INFO, "%s MPS is %s",
-			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
-			config.mps != MLX5_MPW_DISABLED ? "enabled" :
-			"disabled");
-		if (config.cqe_comp && !cqe_comp) {
-			DRV_LOG(WARNING, "Rx CQE compression isn't supported");
-			config.cqe_comp = 0;
-		}
-		if (config.mprq.enabled && mprq) {
-			if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
-			    config.mprq.stride_num_n < mprq_min_stride_num_n) {
-				config.mprq.stride_num_n =
-					RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
-						mprq_min_stride_num_n);
-				DRV_LOG(WARNING,
-					"the number of strides"
-					" for Multi-Packet RQ is out of range,"
-					" setting default value (%u)",
-					1 << config.mprq.stride_num_n);
-			}
-			config.mprq.min_stride_size_n = mprq_min_stride_size_n;
-			config.mprq.max_stride_size_n = mprq_max_stride_size_n;
-		} else if (config.mprq.enabled && !mprq) {
-			DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
-			config.mprq.enabled = 0;
-		}
-		eth_dev = rte_eth_dev_allocate(name);
+	rte_strlcpy(name, dpdk_dev->name, sizeof(name));
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
-			DRV_LOG(ERR, "can not allocate rte ethdev");
-			err = ENOMEM;
+			DRV_LOG(ERR, "can not attach rte ethdev");
+			rte_errno = ENOMEM;
+			err = rte_errno;
 			goto error;
 		}
-		eth_dev->data->dev_private = priv;
-		priv->dev_data = eth_dev->data;
-		eth_dev->data->mac_addrs = priv->mac;
 		eth_dev->device = dpdk_dev;
-		eth_dev->device->driver = &mlx5_driver.driver;
-		err = mlx5_uar_init_primary(eth_dev);
+		eth_dev->dev_ops = &mlx5_dev_sec_ops;
+		err = mlx5_uar_init_secondary(eth_dev);
 		if (err) {
 			err = rte_errno;
 			goto error;
 		}
-		/* Configure the first MAC address by default. */
-		if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
-			DRV_LOG(ERR,
-				"port %u cannot get MAC address, is mlx5_en"
-				" loaded? (errno: %s)",
-				eth_dev->data->port_id, strerror(rte_errno));
-			err = ENODEV;
-			goto error;
-		}
-		DRV_LOG(INFO,
-			"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
-			eth_dev->data->port_id,
-			mac.addr_bytes[0], mac.addr_bytes[1],
-			mac.addr_bytes[2], mac.addr_bytes[3],
-			mac.addr_bytes[4], mac.addr_bytes[5]);
-#ifndef NDEBUG
-		{
-			char ifname[IF_NAMESIZE];
-
-			if (mlx5_get_ifname(eth_dev, &ifname) == 0)
-				DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
-					eth_dev->data->port_id, ifname);
-			else
-				DRV_LOG(DEBUG, "port %u ifname is unknown",
-					eth_dev->data->port_id);
-		}
-#endif
-		/* Get actual MTU if possible. */
-		err = mlx5_get_mtu(eth_dev, &priv->mtu);
-		if (err) {
+		/* Receive command fd from primary process */
+		err = mlx5_socket_connect(eth_dev);
+		if (err < 0) {
 			err = rte_errno;
 			goto error;
 		}
-		DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
-			priv->mtu);
-		/*
-		 * Initialize burst functions to prevent crashes before link-up.
-		 */
-		eth_dev->rx_pkt_burst = removed_rx_burst;
-		eth_dev->tx_pkt_burst = removed_tx_burst;
-		eth_dev->dev_ops = &mlx5_dev_ops;
-		/* Register MAC address. */
-		claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
-		priv->nl_socket = -1;
-		priv->nl_sn = 0;
-		if (vf && config.vf_nl_en) {
-			priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
-			if (priv->nl_socket < 0)
-				priv->nl_socket = -1;
-			mlx5_nl_mac_addr_sync(eth_dev);
-		}
-		TAILQ_INIT(&priv->flows);
-		TAILQ_INIT(&priv->ctrl_flows);
-		/* Hint libmlx5 to use PMD allocator for data plane resources */
-		struct mlx5dv_ctx_allocators alctr = {
-			.alloc = &mlx5_alloc_verbs_buf,
-			.free = &mlx5_free_verbs_buf,
-			.data = priv,
-		};
-		mlx5_glue->dv_set_context_attr(ctx,
-					       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
-					       (void *)((uintptr_t)&alctr));
-		/* Bring Ethernet device up. */
-		DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
-			eth_dev->data->port_id);
-		mlx5_set_link_up(eth_dev);
-		/*
-		 * Even though the interrupt handler is not installed yet,
-		 * interrupts will still trigger on the asyn_fd from
-		 * Verbs context returned by ibv_open_device().
-		 */
-		mlx5_link_update(eth_dev, 0);
-		/* Store device configuration on private structure. */
-		priv->config = config;
-		/* Create drop queue. */
-		err = mlx5_flow_create_drop_queue(eth_dev);
+		/* Remap UAR for Tx queues. */
+		err = mlx5_tx_uar_remap(eth_dev, err);
 		if (err) {
-			DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
-				eth_dev->data->port_id, strerror(rte_errno));
 			err = rte_errno;
 			goto error;
 		}
-		/* Supported Verbs flow priority number detection. */
-		if (verb_priorities == 0)
-			verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
-		if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
-			DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
-				eth_dev->data->port_id, verb_priorities);
-			err = ENOTSUP;
-			goto error;
-		}
-		priv->config.max_verbs_prio = verb_priorities;
 		/*
-		 * Once the device is added to the list of memory event
-		 * callback, its global MR cache table cannot be expanded
-		 * on the fly because of deadlock. If it overflows, lookup
-		 * should be done by searching MR list linearly, which is slow.
+		 * Ethdev pointer is still required as input since
+		 * the primary device is not accessible from the
+		 * secondary process.
 		 */
-		err = mlx5_mr_btree_init(&priv->mr.cache,
-					 MLX5_MR_BTREE_CACHE_N * 2,
-					 eth_dev->device->numa_node);
-		if (err) {
-			err = rte_errno;
-			goto error;
-		}
-		/* Add device to memory callback list. */
-		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-		LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
-				 priv, mem_event_cb);
-		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
-		rte_eth_dev_probing_finish(eth_dev);
+		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
+		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
+		claim_zero(mlx5_glue->close_device(ctx));
 		return eth_dev;
 	}
+	/* Check port status. */
+	err = mlx5_glue->query_port(ctx, 1, &port_attr);
+	if (err) {
+		DRV_LOG(ERR, "port query failed: %s", strerror(err));
+		goto error;
+	}
+	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		DRV_LOG(ERR, "port is not configured in Ethernet mode");
+		err = EINVAL;
+		goto error;
+	}
+	if (port_attr.state != IBV_PORT_ACTIVE)
+		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+			mlx5_glue->port_state_str(port_attr.state),
+			port_attr.state);
+	/* Allocate protection domain. */
+	pd = mlx5_glue->alloc_pd(ctx);
+	if (pd == NULL) {
+		DRV_LOG(ERR, "PD allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv = rte_zmalloc("ethdev private structure",
+			   sizeof(*priv),
+			   RTE_CACHE_LINE_SIZE);
+	if (priv == NULL) {
+		DRV_LOG(ERR, "priv allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv->ctx = ctx;
+	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
+		sizeof(priv->ibdev_path));
+	priv->device_attr = attr;
+	priv->pd = pd;
+	priv->mtu = ETHER_MTU;
+	err = mlx5_args(&config, dpdk_dev->devargs);
+	if (err) {
+		err = rte_errno;
+		DRV_LOG(ERR, "failed to process device arguments: %s",
+			strerror(rte_errno));
+		goto error;
+	}
+	config.hw_csum = !!(attr.device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM);
+	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
+		(config.hw_csum ? "" : "not "));
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+	config.flow_counter_en = !!attr.max_counter_sets;
+	mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
+	DRV_LOG(DEBUG, "counter type = %d, num of cs = %ld, attributes = %d",
+		cs_desc.counter_type, cs_desc.num_of_cs,
+		cs_desc.attributes);
+#endif
+	config.ind_table_max_size =
+		attr.rss_caps.max_rwq_indirection_table_size;
+	/*
+	 * Remove this check once DPDK supports larger/variable
+	 * indirection tables.
+	 */
+	if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
+		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
+	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
+		config.ind_table_max_size);
+	config.hw_vlan_strip = !!(attr.raw_packet_caps &
+				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
+	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
+		(config.hw_vlan_strip ? "" : "not "));
+	config.hw_fcs_strip = !!(attr.raw_packet_caps &
+				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
+	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
+		(config.hw_fcs_strip ? "" : "not "));
+#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
+	config.hw_padding = !!attr.rx_pad_end_addr_align;
+#endif
+	DRV_LOG(DEBUG, "hardware Rx end alignment padding is %ssupported",
+		(config.hw_padding ? "" : "not "));
+	config.tso = (attr.tso_caps.max_tso > 0 &&
+		      (attr.tso_caps.supported_qpts &
+		       (1 << IBV_QPT_RAW_PACKET)));
+	if (config.tso)
+		config.tso_max_payload_sz = attr.tso_caps.max_tso;
+	if (config.mps && !mps) {
+		DRV_LOG(ERR,
+			"multi-packet send not supported on this device"
+			" (" MLX5_TXQ_MPW_EN ")");
+		err = ENOTSUP;
+		goto error;
+	}
+	DRV_LOG(INFO, "%sMPS is %s",
+		config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
+		config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
+	if (config.cqe_comp && !cqe_comp) {
+		DRV_LOG(WARNING, "Rx CQE compression isn't supported");
+		config.cqe_comp = 0;
+	}
+	if (config.mprq.enabled && mprq) {
+		if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
+		    config.mprq.stride_num_n < mprq_min_stride_num_n) {
+			config.mprq.stride_num_n =
+				RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+					mprq_min_stride_num_n);
+			DRV_LOG(WARNING,
+				"the number of strides"
+				" for Multi-Packet RQ is out of range,"
+				" setting default value (%u)",
+				1 << config.mprq.stride_num_n);
+		}
+		config.mprq.min_stride_size_n = mprq_min_stride_size_n;
+		config.mprq.max_stride_size_n = mprq_max_stride_size_n;
+	} else if (config.mprq.enabled && !mprq) {
+		DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
+		config.mprq.enabled = 0;
+	}
+	eth_dev = rte_eth_dev_allocate(name);
+	if (eth_dev == NULL) {
+		DRV_LOG(ERR, "can not allocate rte ethdev");
+		err = ENOMEM;
+		goto error;
+	}
+	eth_dev->data->dev_private = priv;
+	priv->dev_data = eth_dev->data;
+	eth_dev->data->mac_addrs = priv->mac;
+	eth_dev->device = dpdk_dev;
+	eth_dev->device->driver = &mlx5_driver.driver;
+	err = mlx5_uar_init_primary(eth_dev);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	/* Configure the first MAC address by default. */
+	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
+		DRV_LOG(ERR,
+			"port %u cannot get MAC address, is mlx5_en"
+			" loaded? (errno: %s)",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = ENODEV;
+		goto error;
+	}
+	DRV_LOG(INFO,
+		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
+		eth_dev->data->port_id,
+		mac.addr_bytes[0], mac.addr_bytes[1],
+		mac.addr_bytes[2], mac.addr_bytes[3],
+		mac.addr_bytes[4], mac.addr_bytes[5]);
+#ifndef NDEBUG
+	{
+		char ifname[IF_NAMESIZE];
+
+		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
+			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
+				eth_dev->data->port_id, ifname);
+		else
+			DRV_LOG(DEBUG, "port %u ifname is unknown",
+				eth_dev->data->port_id);
+	}
+#endif
+	/* Get actual MTU if possible. */
+	err = mlx5_get_mtu(eth_dev, &priv->mtu);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
+		priv->mtu);
+	/* Initialize burst functions to prevent crashes before link-up. */
+	eth_dev->rx_pkt_burst = removed_rx_burst;
+	eth_dev->tx_pkt_burst = removed_tx_burst;
+	eth_dev->dev_ops = &mlx5_dev_ops;
+	/* Register MAC address. */
+	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
+	priv->nl_socket = -1;
+	priv->nl_sn = 0;
+	if (vf && config.vf_nl_en) {
+		priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
+		if (priv->nl_socket < 0)
+			priv->nl_socket = -1;
+		mlx5_nl_mac_addr_sync(eth_dev);
+	}
+	TAILQ_INIT(&priv->flows);
+	TAILQ_INIT(&priv->ctrl_flows);
+	/* Hint libmlx5 to use PMD allocator for data plane resources */
+	struct mlx5dv_ctx_allocators alctr = {
+		.alloc = &mlx5_alloc_verbs_buf,
+		.free = &mlx5_free_verbs_buf,
+		.data = priv,
+	};
+	mlx5_glue->dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+				       (void *)((uintptr_t)&alctr));
+	/* Bring Ethernet device up. */
+	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
+		eth_dev->data->port_id);
+	mlx5_set_link_up(eth_dev);
+	/*
+	 * Even though the interrupt handler is not installed yet,
+	 * interrupts will still trigger on the asyn_fd from
+	 * Verbs context returned by ibv_open_device().
+	 */
+	mlx5_link_update(eth_dev, 0);
+	/* Store device configuration on private structure. */
+	priv->config = config;
+	/* Create drop queue. */
+	err = mlx5_flow_create_drop_queue(eth_dev);
+	if (err) {
+		DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = rte_errno;
+		goto error;
+	}
+	/* Supported Verbs flow priority number detection. */
+	if (verb_priorities == 0)
+		verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
+	if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
+		DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
+			eth_dev->data->port_id, verb_priorities);
+		err = ENOTSUP;
+		goto error;
+	}
+	priv->config.max_verbs_prio = verb_priorities;
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 */
+	err = mlx5_mr_btree_init(&priv->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 eth_dev->device->numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	/* Add device to memory callback list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
+			 priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+	return eth_dev;
 error:
 	if (priv)
 		rte_free(priv);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 06/10] net/mlx5: add port representor awareness
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
                         ` (4 preceding siblings ...)
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 05/10] net/mlx5: re-indent generic probing function Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors Adrien Mazarguil
                         ` (4 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Xueming Li

The current PCI probing method is not aware of Verbs port representors,
which appear as standard Verbs devices bound to the same PCI address and
cannot be distinguished.

Problem is that more often than not, the wrong Verbs device is used,
resulting in unexpected traffic.

This patch makes the driver discard representors to only use the master
device. If unable to identify it (e.g. kernel drivers not recent enough),
either:

- There is only one matching device which isn't identified as a
  representor, in that case use it.
- Otherwise log an error and do not probe the device.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
Cc: Xueming Li <xuemingl@mellanox.com>
--
v3 changes:

- Replaced all heuristics (including mlx5_cmp_ibv_name()) with Netlink
  queries to associate IB devices with network interfaces.
- Reworded commit log.

v2 changes:

- Fixed digit detection in mlx5_cmp_ibv_name() so that "foo1" and "foo10"
  are compared on the integer conversion of "1" against "10" instead of ""
  and "0".
---
 drivers/net/mlx5/Makefile  |  30 ++++
 drivers/net/mlx5/mlx5.c    | 109 +++++++++++++--
 drivers/net/mlx5/mlx5.h    |  16 ++-
 drivers/net/mlx5/mlx5_nl.c | 297 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 428 insertions(+), 24 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 955861a41..745752e23 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -152,6 +152,36 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		infiniband/verbs.h \
 		enum IBV_FLOW_SPEC_ACTION_COUNT \
 		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_CMD_GET \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_CMD_GET \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_CMD_PORT_GET \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_CMD_PORT_GET \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_ATTR_DEV_INDEX \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_ATTR_DEV_INDEX \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_ATTR_DEV_NAME \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_ATTR_DEV_NAME \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_ATTR_PORT_INDEX \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_ATTR_PORT_INDEX \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX \
+		linux/rdma_netlink.h \
+		enum RDMA_NLDEV_ATTR_NDEV_INDEX \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx5_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1054bf6d0..d06ba9886 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -13,6 +13,7 @@
 #include <errno.h>
 #include <net/if.h>
 #include <sys/mman.h>
+#include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 
 /* Verbs header. */
@@ -274,8 +275,10 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 		mlx5_socket_uninit(dev);
 	if (priv->config.vf)
 		mlx5_nl_mac_addr_flush(dev);
-	if (priv->nl_socket >= 0)
-		close(priv->nl_socket);
+	if (priv->nl_socket_route >= 0)
+		close(priv->nl_socket_route);
+	if (priv->nl_socket_rdma >= 0)
+		close(priv->nl_socket_rdma);
 	ret = mlx5_hrxq_ibv_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -876,6 +879,10 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->device_attr = attr;
 	priv->pd = pd;
 	priv->mtu = ETHER_MTU;
+	/* Some internal functions rely on Netlink sockets, open them now. */
+	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
+	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK, NETLINK_ROUTE);
+	priv->nl_sn = 0;
 	err = mlx5_args(&config, dpdk_dev->devargs);
 	if (err) {
 		err = rte_errno;
@@ -1010,14 +1017,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	eth_dev->dev_ops = &mlx5_dev_ops;
 	/* Register MAC address. */
 	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
-	priv->nl_socket = -1;
-	priv->nl_sn = 0;
-	if (vf && config.vf_nl_en) {
-		priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
-		if (priv->nl_socket < 0)
-			priv->nl_socket = -1;
+	if (vf && config.vf_nl_en)
 		mlx5_nl_mac_addr_sync(eth_dev);
-	}
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1078,8 +1079,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	return eth_dev;
 error:
-	if (priv)
+	if (priv) {
+		if (priv->nl_socket_route >= 0)
+			close(priv->nl_socket_route);
+		if (priv->nl_socket_rdma >= 0)
+			close(priv->nl_socket_rdma);
 		rte_free(priv);
+	}
 	if (pd)
 		claim_zero(mlx5_glue->dealloc_pd(pd));
 	if (eth_dev)
@@ -1110,6 +1116,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **ibv_list;
 	struct rte_eth_dev *eth_dev = NULL;
+	unsigned int n = 0;
 	int vf;
 	int ret;
 
@@ -1121,6 +1128,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
+
+	struct ibv_device *ibv_match[ret + 1];
+
 	while (ret-- > 0) {
 		struct rte_pci_addr pci_addr;
 
@@ -1132,10 +1142,81 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		    pci_dev->addr.devid != pci_addr.devid ||
 		    pci_dev->addr.function != pci_addr.function)
 			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
 			ibv_list[ret]->name);
+		ibv_match[n++] = ibv_list[ret];
+	}
+	ibv_match[n] = NULL;
+
+	unsigned int ifindex[n];
+	struct mlx5_switch_info info[n];
+	int nl_route = n ? mlx5_nl_init(0, NETLINK_ROUTE) : -1;
+	int nl_rdma = n ? mlx5_nl_init(0, NETLINK_RDMA) : -1;
+	unsigned int i;
+
+	/*
+	 * The existence of several matching entries (n > 1) means port
+	 * representors have been instantiated. No existing Verbs call nor
+	 * /sys entries can tell them apart, this can only be done through
+	 * Netlink calls assuming kernel drivers are recent enough to
+	 * support them.
+	 *
+	 * In the event of identification failure through Netlink, either:
+	 *
+	 * 1. No device matches (n == 0), complain and bail out.
+	 * 2. A single IB device matches (n == 1) and is not a representor,
+	 *    assume no switch support.
+	 * 3. Otherwise no safe assumptions can be made; complain louder and
+	 *    bail out.
+	 */
+	for (i = 0; i != n; ++i) {
+		if (nl_rdma < 0)
+			ifindex[i] = 0;
+		else
+			ifindex[i] = mlx5_nl_ifindex(nl_rdma,
+						     ibv_match[i]->name);
+		if (nl_route < 0 ||
+		    !ifindex[i] ||
+		    mlx5_nl_switch_info(nl_route, ifindex[i], &info[i])) {
+			ifindex[i] = 0;
+			memset(&info[i], 0, sizeof(info[i]));
+			continue;
+		}
+	}
+	if (nl_rdma >= 0)
+		close(nl_rdma);
+	if (nl_route >= 0)
+		close(nl_route);
+	/* Look for master device. */
+	for (i = 0; i != n; ++i) {
+		if (!info[i].master)
+			continue;
+		/* Make it the first entry. */
+		if (i == 0)
+			break;
+		ibv_match[n] = ibv_match[0];
+		ibv_match[0] = ibv_match[i];
+		ibv_match[n] = NULL;
 		break;
 	}
+	if (n && i == n) {
+		if (n == 1 && !info[0].representor) {
+			/* Case #2. */
+			DRV_LOG(INFO, "no switch support detected");
+		} else if (n == 1) {
+			/* Case #3. */
+			DRV_LOG(ERR,
+				"device looks like a port representor, this is"
+				" not supported yet");
+			n = 0;
+		} else {
+			/* Case #3. */
+			DRV_LOG(ERR,
+				"unable to tell which of the matching devices"
+				" is the master (lack of kernel support?)");
+			n = 0;
+		}
+	}
 	switch (pci_dev->id.device_id) {
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
@@ -1146,10 +1227,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	default:
 		vf = 0;
 	}
-	if (ret >= 0)
-		eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	if (n)
+		eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
 	mlx5_glue->free_device_list(ibv_list);
-	if (!ret) {
+	if (!n) {
 		DRV_LOG(WARNING,
 			"no Verbs device matches PCI device " PCI_PRI_FMT ","
 			" are kernel drivers loaded?",
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index f55ff4a21..704046270 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -53,6 +53,14 @@ enum {
 	PCI_DEVICE_ID_MELLANOX_CONNECTX5BF = 0xa2d2,
 };
 
+/** Switch information returned by mlx5_nl_switch_info(). */
+struct mlx5_switch_info {
+	uint32_t master:1; /**< Master device. */
+	uint32_t representor:1; /**< Representor device. */
+	int32_t port_name; /**< Representor port name. */
+	uint64_t switch_id; /**< Switch identifier. */
+};
+
 LIST_HEAD(mlx5_dev_list, priv);
 
 /* Shared memory between primary and secondary processes. */
@@ -195,7 +203,8 @@ struct priv {
 	struct mlx5_dev_config config; /* Device configuration. */
 	struct mlx5_verbs_alloc_ctx verbs_alloc_ctx;
 	/* Context for Verbs allocator. */
-	int nl_socket; /* Netlink socket. */
+	int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
+	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
 	uint32_t nl_sn; /* Netlink message sequence number. */
 };
 
@@ -342,7 +351,7 @@ int mlx5_socket_connect(struct rte_eth_dev *priv);
 
 /* mlx5_nl.c */
 
-int mlx5_nl_init(uint32_t nlgroups);
+int mlx5_nl_init(uint32_t nlgroups, int protocol);
 int mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
 			 uint32_t index);
 int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
@@ -351,5 +360,8 @@ void mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev);
 void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev);
 int mlx5_nl_promisc(struct rte_eth_dev *dev, int enable);
 int mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable);
+unsigned int mlx5_nl_ifindex(int nl, const char *name);
+int mlx5_nl_switch_info(int nl, unsigned int ifindex,
+			struct mlx5_switch_info *info);
 
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
index dca85835a..58ef2f4f0 100644
--- a/drivers/net/mlx5/mlx5_nl.c
+++ b/drivers/net/mlx5/mlx5_nl.c
@@ -3,10 +3,21 @@
  * Copyright 2018 Mellanox Technologies, Ltd
  */
 
+#include <errno.h>
+#include <linux/if_link.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <rdma/rdma_netlink.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
 #include <unistd.h>
 
+#include <rte_errno.h>
+
 #include "mlx5.h"
 #include "mlx5_utils.h"
 
@@ -27,6 +38,29 @@
 	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
 #endif
 
+/*
+ * The following definitions are normally found in rdma/rdma_netlink.h,
+ * however they are so recent that most systems do not expose them yet.
+ */
+#ifndef HAVE_RDMA_NLDEV_CMD_GET
+#define RDMA_NLDEV_CMD_GET 1
+#endif
+#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
+#define RDMA_NLDEV_CMD_PORT_GET 5
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
+#define RDMA_NLDEV_ATTR_DEV_INDEX 1
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
+#define RDMA_NLDEV_ATTR_DEV_NAME 2
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
+#define RDMA_NLDEV_ATTR_PORT_INDEX 3
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
+#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
+#endif
+
 /* Add/remove MAC address through Netlink */
 struct mlx5_nl_mac_addr {
 	struct ether_addr (*mac)[];
@@ -34,18 +68,27 @@ struct mlx5_nl_mac_addr {
 	int mac_n; /**< Number of addresses in the array. */
 };
 
+/** Data structure used by mlx5_nl_ifindex_cb(). */
+struct mlx5_nl_ifindex_data {
+	const char *name; /**< IB device name (in). */
+	uint32_t ibindex; /**< IB device index (out). */
+	uint32_t ifindex; /**< Network interface index (out). */
+};
+
 /**
  * Opens a Netlink socket.
  *
  * @param nl_groups
  *   Netlink group value (e.g. RTMGRP_LINK).
+ * @param protocol
+ *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
  *
  * @return
  *   A file descriptor on success, a negative errno value otherwise and
  *   rte_errno is set.
  */
 int
-mlx5_nl_init(uint32_t nl_groups)
+mlx5_nl_init(uint32_t nl_groups, int protocol)
 {
 	int fd;
 	int sndbuf_size = MLX5_SEND_BUF_SIZE;
@@ -56,7 +99,7 @@ mlx5_nl_init(uint32_t nl_groups)
 	};
 	int ret;
 
-	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
 	if (fd == -1) {
 		rte_errno = errno;
 		return -rte_errno;
@@ -334,9 +377,9 @@ mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[],
 	int ret;
 	uint32_t sn = priv->nl_sn++;
 
-	if (priv->nl_socket == -1)
+	if (priv->nl_socket_route == -1)
 		return 0;
-	fd = priv->nl_socket;
+	fd = priv->nl_socket_route;
 	ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
 			      sizeof(struct ifinfomsg));
 	if (ret < 0)
@@ -398,9 +441,9 @@ mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
 	int ret;
 	uint32_t sn = priv->nl_sn++;
 
-	if (priv->nl_socket == -1)
+	if (priv->nl_socket_route == -1)
 		return 0;
-	fd = priv->nl_socket;
+	fd = priv->nl_socket_route;
 	memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN);
 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
 		RTA_ALIGN(req.rta.rta_len);
@@ -569,9 +612,9 @@ mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
 	int ret;
 
 	assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
-	if (priv->nl_socket < 0)
+	if (priv->nl_socket_route < 0)
 		return 0;
-	fd = priv->nl_socket;
+	fd = priv->nl_socket_route;
 	ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
 	if (ret < 0)
 		return ret;
@@ -625,3 +668,241 @@ mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
 			strerror(rte_errno));
 	return ret;
 }
+
+/**
+ * Process network interface information from Netlink message.
+ *
+ * @param nh
+ *   Pointer to Netlink message header.
+ * @param arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_ifindex_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct mlx5_nl_ifindex_data *data = arg;
+	size_t off = NLMSG_HDRLEN;
+	uint32_t ibindex = 0;
+	uint32_t ifindex = 0;
+	int found = 0;
+
+	if (nh->nlmsg_type !=
+	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
+	    nh->nlmsg_type !=
+	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
+		goto error;
+	while (off < nh->nlmsg_len) {
+		struct nlattr *na = (void *)((uintptr_t)nh + off);
+		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
+
+		if (na->nla_len > nh->nlmsg_len - off)
+			goto error;
+		switch (na->nla_type) {
+		case RDMA_NLDEV_ATTR_DEV_INDEX:
+			ibindex = *(uint32_t *)payload;
+			break;
+		case RDMA_NLDEV_ATTR_DEV_NAME:
+			if (!strcmp(payload, data->name))
+				found = 1;
+			break;
+		case RDMA_NLDEV_ATTR_NDEV_INDEX:
+			ifindex = *(uint32_t *)payload;
+			break;
+		default:
+			break;
+		}
+		off += NLA_ALIGN(na->nla_len);
+	}
+	if (found) {
+		data->ibindex = ibindex;
+		data->ifindex = ifindex;
+	}
+	return 0;
+error:
+	rte_errno = EINVAL;
+	return -rte_errno;
+}
+
+/**
+ * Get index of network interface associated with some IB device.
+ *
+ * This is the only somewhat safe method to avoid resorting to heuristics
+ * when faced with port representors. Unfortunately it requires at least
+ * Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ *
+ * @return
+ *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
+ *   is set.
+ */
+unsigned int
+mlx5_nl_ifindex(int nl, const char *name)
+{
+	static const uint32_t pindex = 1;
+	uint32_t seq = random();
+	struct mlx5_nl_ifindex_data data = {
+		.name = name,
+		.ibindex = 0, /* Determined during first pass. */
+		.ifindex = 0, /* Determined during second pass. */
+	};
+	union {
+		struct nlmsghdr nh;
+		uint8_t buf[NLMSG_HDRLEN +
+			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
+			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(0),
+			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+						       RDMA_NLDEV_CMD_GET),
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+		},
+	};
+	struct nlattr *na;
+	int ret;
+
+	ret = mlx5_nl_send(nl, &req.nh, seq);
+	if (ret < 0)
+		return 0;
+	ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
+	if (ret < 0)
+		return 0;
+	if (!data.ibindex)
+		goto error;
+	++seq;
+	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					     RDMA_NLDEV_CMD_PORT_GET);
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
+	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
+	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
+	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
+	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
+	       &data.ibindex, sizeof(data.ibindex));
+	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
+	na->nla_len = NLA_HDRLEN + sizeof(pindex);
+	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
+	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
+	       &pindex, sizeof(pindex));
+	ret = mlx5_nl_send(nl, &req.nh, seq);
+	if (ret < 0)
+		return 0;
+	ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
+	if (ret < 0)
+		return 0;
+	if (!data.ifindex)
+		goto error;
+	return data.ifindex;
+error:
+	rte_errno = ENODEV;
+	return 0;
+}
+
+/**
+ * Process switch information from Netlink message.
+ *
+ * @param nh
+ *   Pointer to Netlink message header.
+ * @param arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct mlx5_switch_info info = {
+		.master = 0,
+		.representor = 0,
+		.port_name = 0,
+		.switch_id = 0,
+	};
+	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	bool port_name_set = false;
+	bool switch_id_set = false;
+
+	if (nh->nlmsg_type != RTM_NEWLINK)
+		goto error;
+	while (off < nh->nlmsg_len) {
+		struct rtattr *ra = (void *)((uintptr_t)nh + off);
+		void *payload = RTA_DATA(ra);
+		char *end;
+		unsigned int i;
+
+		if (ra->rta_len > nh->nlmsg_len - off)
+			goto error;
+		switch (ra->rta_type) {
+		case IFLA_PHYS_PORT_NAME:
+			errno = 0;
+			info.port_name = strtol(payload, &end, 0);
+			if (errno ||
+			    (size_t)(end - (char *)payload) != strlen(payload))
+				goto error;
+			port_name_set = true;
+			break;
+		case IFLA_PHYS_SWITCH_ID:
+			info.switch_id = 0;
+			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
+				info.switch_id <<= 8;
+				info.switch_id |= ((uint8_t *)payload)[i];
+			}
+			switch_id_set = true;
+			break;
+		}
+		off += RTA_ALIGN(ra->rta_len);
+	}
+	info.master = switch_id_set && !port_name_set;
+	info.representor = switch_id_set && port_name_set;
+	memcpy(arg, &info, sizeof(info));
+	return 0;
+error:
+	rte_errno = EINVAL;
+	return -rte_errno;
+}
+
+/**
+ * Get switch information associated with network interface.
+ *
+ * @param nl
+ *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
+ * @param ifindex
+ *   Network interface index.
+ * @param[out] info
+ *   Switch information object, populated in case of success.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
+{
+	uint32_t seq = random();
+	struct {
+		struct nlmsghdr nh;
+		struct ifinfomsg info;
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(req.info)),
+			.nlmsg_type = RTM_GETLINK,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+		},
+		.info = {
+			.ifi_family = AF_UNSPEC,
+			.ifi_index = ifindex,
+		},
+	};
+	int ret;
+
+	ret = mlx5_nl_send(nl, &req.nh, seq);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
+	return ret;
+}
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
                         ` (5 preceding siblings ...)
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 06/10] net/mlx5: add port representor awareness Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-09 11:57         ` Shahaf Shuler
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 08/10] net/mlx5: probe port representors in natural order Adrien Mazarguil
                         ` (3 subsequent siblings)
  10 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Nelio Laranjeiro, Xueming Li

Probe existing port representors in addition to their master device and
associate them automatically.

To avoid collision between Ethernet devices, they are named as follows:

- "{DBDF}" for master/switch devices.
- "{DBDF}_representor_{rep}" with "rep" starting from 0 for port
  representors.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
--
v4 changes:

- Fixed domain ID release once the last port using it is closed. Closed
  devices are not necessarily detached, their presence is not a good
  indicator. Code was modified to check if they still use their domain IDs
  before deciding to release it.

v3 changes:

- Nelio introduced mlx5_dev_to_port_id() to prevent the master device from
  releasing a domain ID while representors are still bound. It is now
  released by the last device closed.
- Reverted to original naming convention as requested by Xueming and
  Shahaf; "net_" prefix and "_0" suffix were dropped.
- mlx5_dev_spawn() (previously mlx5_dev_spawn_one()) now decides on its own
  whether underlying device is a representor.
- Devices can now be probed in any order and not necessarily all at once;
  representors can exist without a master device.
- mlx5_pci_probe() iterates on the list of devices directly instead of
  relying on an intermediate function (previously mlx5_dev_spawn()).
- mlx5_get_ifname() was rewritten to rely on mlx5_nl_ifindex() when faced
  with a representor.
- Since it is not necessarily present, master device is now dynamically
  retrieved in mlx5_dev_infos_get().

v2 changes:

- Added representor information to dev_infos_get(). DPDK port ID of master
  device is now stored in the private structure to retrieve it
  conveniently.
- Master device is assigned dummy representor ID value -1 to better
  distinguish from the the first actual representor reported by
  dev_infos_get() as those are indexed from 0.
- Added RTE_ETH_DEV_REPRESENTOR device flag.
---
 drivers/net/mlx5/mlx5.c        | 134 ++++++++++++++++++++++++++++--------
 drivers/net/mlx5/mlx5.h        |  12 +++-
 drivers/net/mlx5/mlx5_ethdev.c | 133 +++++++++++++++++++++++++++++++----
 drivers/net/mlx5/mlx5_mac.c    |   2 +-
 drivers/net/mlx5/mlx5_stats.c  |   6 +-
 5 files changed, 238 insertions(+), 49 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d06ba9886..c02afbb82 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -307,7 +307,27 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	if (ret)
 		DRV_LOG(WARNING, "port %u some flows still remain",
 			dev->data->port_id);
+	if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
+		unsigned int c = 0;
+		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
+		uint16_t port_id[i];
+
+		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i);
+		while (i--) {
+			struct priv *opriv =
+				rte_eth_devices[port_id[i]].data->dev_private;
+
+			if (!opriv ||
+			    opriv->domain_id != priv->domain_id ||
+			    &rte_eth_devices[port_id[i]] == dev)
+				continue;
+			++c;
+		}
+		if (!c)
+			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
+	}
 	memset(priv, 0, sizeof(*priv));
+	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
 }
 
 const struct eth_dev_ops mlx5_dev_ops = {
@@ -647,6 +667,8 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
  *   Verbs device.
  * @param vf
  *   If nonzero, enable VF-specific features.
+ * @param[in] switch_info
+ *   Switch properties of Ethernet device.
  *
  * @return
  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
@@ -655,7 +677,8 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 static struct rte_eth_dev *
 mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	       struct ibv_device *ibv_dev,
-	       int vf)
+	       int vf,
+	       const struct mlx5_switch_info *switch_info)
 {
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
@@ -697,6 +720,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 #endif
 	struct ether_addr mac;
 	char name[RTE_ETH_NAME_MAX_LEN];
+	int own_domain_id = 0;
+	unsigned int i;
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
@@ -805,7 +830,12 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	rte_strlcpy(name, dpdk_dev->name, sizeof(name));
+	if (!switch_info->representor)
+		rte_strlcpy(name, dpdk_dev->name, sizeof(name));
+	else
+		snprintf(name, sizeof(name), "%s_representor_%u",
+			 dpdk_dev->name, switch_info->port_name);
+	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
@@ -874,6 +904,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		goto error;
 	}
 	priv->ctx = ctx;
+	strncpy(priv->ibdev_name, priv->ctx->device->name,
+		sizeof(priv->ibdev_name));
 	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 		sizeof(priv->ibdev_path));
 	priv->device_attr = attr;
@@ -883,6 +915,41 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
 	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK, NETLINK_ROUTE);
 	priv->nl_sn = 0;
+	priv->representor = !!switch_info->representor;
+	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+	priv->representor_id =
+		switch_info->representor ? switch_info->port_name : -1;
+	/*
+	 * Look for sibling devices in order to reuse their switch domain
+	 * if any, otherwise allocate one.
+	 */
+	i = mlx5_dev_to_port_id(dpdk_dev, NULL, 0);
+	if (i > 0) {
+		uint16_t port_id[i];
+
+		i = RTE_MIN(mlx5_dev_to_port_id(dpdk_dev, port_id, i), i);
+		while (i--) {
+			const struct priv *opriv =
+				rte_eth_devices[port_id[i]].data->dev_private;
+
+			if (!opriv ||
+			    opriv->domain_id ==
+			    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+				continue;
+			priv->domain_id = opriv->domain_id;
+			break;
+		}
+	}
+	if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
+		err = rte_eth_switch_domain_alloc(&priv->domain_id);
+		if (err) {
+			err = rte_errno;
+			DRV_LOG(ERR, "unable to allocate switch domain: %s",
+				strerror(rte_errno));
+			goto error;
+		}
+		own_domain_id = 1;
+	}
 	err = mlx5_args(&config, dpdk_dev->devargs);
 	if (err) {
 		err = rte_errno;
@@ -966,6 +1033,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		err = ENOMEM;
 		goto error;
 	}
+	if (priv->representor)
+		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
 	eth_dev->data->dev_private = priv;
 	priv->dev_data = eth_dev->data;
 	eth_dev->data->mac_addrs = priv->mac;
@@ -1084,6 +1153,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			close(priv->nl_socket_route);
 		if (priv->nl_socket_rdma >= 0)
 			close(priv->nl_socket_rdma);
+		if (own_domain_id)
+			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
 	}
 	if (pd)
@@ -1100,7 +1171,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 /**
  * DPDK callback to register a PCI device.
  *
- * This function spawns an Ethernet device out of a given PCI device.
+ * This function spawns Ethernet devices out of a given PCI device.
  *
  * @param[in] pci_drv
  *   PCI driver structure (mlx5_driver).
@@ -1115,7 +1186,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	       struct rte_pci_device *pci_dev)
 {
 	struct ibv_device **ibv_list;
-	struct rte_eth_dev *eth_dev = NULL;
 	unsigned int n = 0;
 	int vf;
 	int ret;
@@ -1150,9 +1220,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 
 	unsigned int ifindex[n];
 	struct mlx5_switch_info info[n];
+	struct rte_eth_dev *eth_list[n];
 	int nl_route = n ? mlx5_nl_init(0, NETLINK_ROUTE) : -1;
 	int nl_rdma = n ? mlx5_nl_init(0, NETLINK_RDMA) : -1;
 	unsigned int i;
+	unsigned int u;
 
 	/*
 	 * The existence of several matching entries (n > 1) means port
@@ -1187,28 +1259,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		close(nl_rdma);
 	if (nl_route >= 0)
 		close(nl_route);
-	/* Look for master device. */
-	for (i = 0; i != n; ++i) {
-		if (!info[i].master)
-			continue;
-		/* Make it the first entry. */
-		if (i == 0)
-			break;
-		ibv_match[n] = ibv_match[0];
-		ibv_match[0] = ibv_match[i];
-		ibv_match[n] = NULL;
-		break;
-	}
-	if (n && i == n) {
-		if (n == 1 && !info[0].representor) {
+	/* Count unidentified devices. */
+	for (u = 0, i = 0; i != n; ++i)
+		if (!info[i].master && !info[i].representor)
+			++u;
+	if (u) {
+		if (n == 1 && u == 1) {
 			/* Case #2. */
 			DRV_LOG(INFO, "no switch support detected");
-		} else if (n == 1) {
-			/* Case #3. */
-			DRV_LOG(ERR,
-				"device looks like a port representor, this is"
-				" not supported yet");
-			n = 0;
 		} else {
 			/* Case #3. */
 			DRV_LOG(ERR,
@@ -1227,8 +1285,19 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	default:
 		vf = 0;
 	}
-	if (n)
-		eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
+	for (i = 0; i != n; ++i) {
+		uint32_t restore;
+
+		eth_list[i] = mlx5_dev_spawn(&pci_dev->device, ibv_match[i],
+					     vf, &info[i]);
+		if (!eth_list[i])
+			break;
+		restore = eth_list[i]->data->dev_flags;
+		rte_eth_copy_pci_info(eth_list[i], pci_dev);
+		/* Restore non-PCI flags cleared by the above call. */
+		eth_list[i]->data->dev_flags |= restore;
+		rte_eth_dev_probing_finish(eth_list[i]);
+	}
 	mlx5_glue->free_device_list(ibv_list);
 	if (!n) {
 		DRV_LOG(WARNING,
@@ -1238,7 +1307,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			pci_dev->addr.devid, pci_dev->addr.function);
 		rte_errno = ENOENT;
 		ret = -rte_errno;
-	} else if (!eth_dev) {
+	} else if (i != n) {
 		DRV_LOG(ERR,
 			"probe of PCI device " PCI_PRI_FMT " aborted after"
 			" encountering an error: %s",
@@ -1246,9 +1315,16 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			pci_dev->addr.devid, pci_dev->addr.function,
 			strerror(rte_errno));
 		ret = -rte_errno;
+		/* Roll back. */
+		while (i--) {
+			mlx5_dev_close(eth_list[i]);
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				rte_free(eth_list[i]->data->dev_private);
+			claim_zero(rte_eth_dev_release_port(eth_list[i]));
+		}
+		/* Restore original error. */
+		rte_errno = -ret;
 	} else {
-		rte_eth_copy_pci_info(eth_dev, pci_dev);
-		rte_eth_dev_probing_finish(eth_dev);
 		ret = 0;
 	}
 	return ret;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 704046270..cc01310e0 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -159,6 +159,7 @@ struct priv {
 	struct ibv_context *ctx; /* Verbs context. */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
 	struct ibv_pd *pd; /* Protection Domain. */
+	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */
 	BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES);
@@ -168,6 +169,9 @@ struct priv {
 	/* Device properties. */
 	uint16_t mtu; /* Configured MTU. */
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
+	unsigned int representor:1; /* Device is a port representor. */
+	uint16_t domain_id; /* Switch domain identifier. */
+	int32_t representor_id; /* Port representor identifier. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
@@ -217,9 +221,12 @@ int mlx5_getenv_int(const char *);
 
 /* mlx5_ethdev.c */
 
+int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+			   char (*ifname)[IF_NAMESIZE]);
 int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);
 int mlx5_ifindex(const struct rte_eth_dev *dev);
-int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
+int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	       int master);
 int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);
 int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
 		   unsigned int flags);
@@ -244,6 +251,9 @@ int mlx5_set_link_up(struct rte_eth_dev *dev);
 int mlx5_is_removed(struct rte_eth_dev *dev);
 eth_tx_burst_t mlx5_select_tx_function(struct rte_eth_dev *dev);
 eth_rx_burst_t mlx5_select_rx_function(struct rte_eth_dev *dev);
+unsigned int mlx5_dev_to_port_id(const struct rte_device *dev,
+				 uint16_t *port_list,
+				 unsigned int port_list_n);
 
 /* mlx5_mac.c */
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 819f5baad..05f66f7b6 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -27,6 +27,7 @@
 #include <time.h>
 
 #include <rte_atomic.h>
+#include <rte_common.h>
 #include <rte_ethdev_driver.h>
 #include <rte_bus_pci.h>
 #include <rte_mbuf.h>
@@ -93,7 +94,7 @@ struct ethtool_link_settings {
 #endif
 
 /**
- * Get interface name from private structure.
+ * Get master interface name from private structure.
  *
  * @param[in] dev
  *   Pointer to Ethernet device.
@@ -104,7 +105,8 @@ struct ethtool_link_settings {
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+		       char (*ifname)[IF_NAMESIZE])
 {
 	struct priv *priv = dev->data->dev_private;
 	DIR *dir;
@@ -179,6 +181,39 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 }
 
 /**
+ * Get interface name from private structure.
+ *
+ * This is a port representor-aware version of mlx5_get_master_ifname().
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] ifname
+ *   Interface name output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+{
+	struct priv *priv = dev->data->dev_private;
+	unsigned int ifindex =
+		priv->nl_socket_rdma >= 0 ?
+		mlx5_nl_ifindex(priv->nl_socket_rdma, priv->ibdev_name) : 0;
+
+	if (!ifindex) {
+		if (!priv->representor)
+			return mlx5_get_master_ifname(dev, ifname);
+		rte_errno = ENXIO;
+		return -rte_errno;
+	}
+	if (if_indextoname(ifindex, &(*ifname)[0]))
+		return 0;
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
  * Get the interface index from device name.
  *
  * @param[in] dev
@@ -214,12 +249,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
  *   Request number to pass to ioctl().
  * @param[out] ifr
  *   Interface request structure output buffer.
+ * @param master
+ *   When device is a port representor, perform request on master device
+ *   instead.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	   int master)
 {
 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
 	int ret = 0;
@@ -228,7 +267,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
 		rte_errno = errno;
 		return -rte_errno;
 	}
-	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
+	if (master)
+		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
+	else
+		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
 	if (ret)
 		goto error;
 	ret = ioctl(sock, req, ifr);
@@ -258,7 +300,7 @@ int
 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
 
 	if (ret)
 		return ret;
@@ -282,7 +324,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 {
 	struct ifreq request = { .ifr_mtu = mtu, };
 
-	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
+	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
 }
 
 /**
@@ -302,13 +344,13 @@ int
 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
 
 	if (ret)
 		return ret;
 	request.ifr_flags &= keep;
 	request.ifr_flags |= flags & ~keep;
-	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
+	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
 }
 
 /**
@@ -477,6 +519,30 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	info->speed_capa = priv->link_speed_capa;
 	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
 	mlx5_set_default_params(dev, info);
+	info->switch_info.name = dev->data->name;
+	info->switch_info.domain_id = priv->domain_id;
+	info->switch_info.port_id = priv->representor_id;
+	if (priv->representor) {
+		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
+		uint16_t port_id[i];
+
+		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i);
+		while (i--) {
+			struct priv *opriv =
+				rte_eth_devices[port_id[i]].data->dev_private;
+
+			if (!opriv ||
+			    opriv->representor ||
+			    opriv->domain_id != priv->domain_id)
+				continue;
+			/*
+			 * Override switch name with that of the master
+			 * device.
+			 */
+			info->switch_info.name = opriv->dev_data->name;
+			break;
+		}
+	}
 }
 
 /**
@@ -540,7 +606,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	int link_speed = 0;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -550,7 +616,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&edata;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
@@ -611,7 +677,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	uint64_t sc;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -621,7 +687,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&gcmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -638,7 +704,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 
 	*ecmd = gcmd;
 	ifr.ifr_data = (void *)ecmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -801,7 +867,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	int ret;
 
 	ifr.ifr_data = (void *)&ethpause;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
@@ -854,7 +920,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 		ethpause.tx_pause = 1;
 	else
 		ethpause.tx_pause = 0;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
@@ -1193,3 +1259,40 @@ mlx5_is_removed(struct rte_eth_dev *dev)
 		return 1;
 	return 0;
 }
+
+/**
+ * Get port ID list of mlx5 instances sharing a common device.
+ *
+ * @param[in] dev
+ *   Device to look for.
+ * @param[out] port_list
+ *   Result buffer for collected port IDs.
+ * @param port_list_n
+ *   Maximum number of entries in result buffer. If 0, @p port_list can be
+ *   NULL.
+ *
+ * @return
+ *   Number of matching instances regardless of the @p port_list_n
+ *   parameter, 0 if none were found.
+ */
+unsigned int
+mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list,
+		    unsigned int port_list_n)
+{
+	uint16_t id;
+	unsigned int n = 0;
+
+	RTE_ETH_FOREACH_DEV(id) {
+		struct rte_eth_dev *ldev = &rte_eth_devices[id];
+
+		if (!ldev->device ||
+		    !ldev->device->driver ||
+		    strcmp(ldev->device->driver->name, MLX5_DRIVER_NAME) ||
+		    ldev->device != dev)
+			continue;
+		if (n < port_list_n)
+			port_list[n] = id;
+		n++;
+	}
+	return n;
+}
diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
index 672a47619..12ee37f55 100644
--- a/drivers/net/mlx5/mlx5_mac.c
+++ b/drivers/net/mlx5/mlx5_mac.c
@@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
 	struct ifreq request;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
+	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
 	if (ret)
 		return ret;
 	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c
index 875dd1027..91f3d474a 100644
--- a/drivers/net/mlx5/mlx5_stats.c
+++ b/drivers/net/mlx5/mlx5_stats.c
@@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
 	et_stats->cmd = ETHTOOL_GSTATS;
 	et_stats->n_stats = xstats_ctrl->stats_n;
 	ifr.ifr_data = (caddr_t)et_stats;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u unable to read statistic values from device",
@@ -194,7 +194,7 @@ mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
 
 	drvinfo.cmd = ETHTOOL_GDRVINFO;
 	ifr.ifr_data = (caddr_t)&drvinfo;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to query number of statistics",
 			dev->data->port_id);
@@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
 	strings->string_set = ETH_SS_STATS;
 	strings->len = dev_stats_n;
 	ifr.ifr_data = (caddr_t)strings;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to get statistic names",
 			dev->data->port_id);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 08/10] net/mlx5: probe port representors in natural order
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
                         ` (6 preceding siblings ...)
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors Adrien Mazarguil
                         ` (2 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Port representors are probed in whatever unspecified order
ibv_get_device_list() returns them.

This is counterintuitive to users since DPDK port IDs assignment almost
never follows the same sequence as representor IDs. Additionally, the
master device does not necessarily inherit the lowest DPDK port ID.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v3 changes:

- This patch was not present in prior revisions.
---
 drivers/net/mlx5/mlx5.c | 95 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 74 insertions(+), 21 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index c02afbb82..6592480bf 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1168,6 +1168,52 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	return NULL;
 }
 
+/** Data associated with devices to spawn. */
+struct mlx5_dev_spawn_data {
+	unsigned int ifindex; /**< Network interface index. */
+	struct mlx5_switch_info info; /**< Switch information. */
+	struct ibv_device *ibv_dev; /**< Associated IB device. */
+	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+};
+
+/**
+ * Comparison callback to sort device data.
+ *
+ * This is meant to be used with qsort().
+ *
+ * @param a[in]
+ *   Pointer to pointer to first data object.
+ * @param b[in]
+ *   Pointer to pointer to second data object.
+ *
+ * @return
+ *   0 if both objects are equal, less than 0 if the first argument is less
+ *   than the second, greater than 0 otherwise.
+ */
+static int
+mlx5_dev_spawn_data_cmp(const void *a, const void *b)
+{
+	const struct mlx5_switch_info *si_a =
+		&((const struct mlx5_dev_spawn_data *)a)->info;
+	const struct mlx5_switch_info *si_b =
+		&((const struct mlx5_dev_spawn_data *)b)->info;
+	int ret;
+
+	/* Master device first. */
+	ret = si_b->master - si_a->master;
+	if (ret)
+		return ret;
+	/* Then representor devices. */
+	ret = si_b->representor - si_a->representor;
+	if (ret)
+		return ret;
+	/* Unidentified devices come last in no specific order. */
+	if (!si_a->representor)
+		return 0;
+	/* Order representors by name. */
+	return si_a->port_name - si_b->port_name;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -1218,9 +1264,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 	ibv_match[n] = NULL;
 
-	unsigned int ifindex[n];
-	struct mlx5_switch_info info[n];
-	struct rte_eth_dev *eth_list[n];
+	struct mlx5_dev_spawn_data list[n];
 	int nl_route = n ? mlx5_nl_init(0, NETLINK_ROUTE) : -1;
 	int nl_rdma = n ? mlx5_nl_init(0, NETLINK_RDMA) : -1;
 	unsigned int i;
@@ -1242,16 +1286,19 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	 *    bail out.
 	 */
 	for (i = 0; i != n; ++i) {
+		list[i].ibv_dev = ibv_match[i];
+		list[i].eth_dev = NULL;
 		if (nl_rdma < 0)
-			ifindex[i] = 0;
+			list[i].ifindex = 0;
 		else
-			ifindex[i] = mlx5_nl_ifindex(nl_rdma,
-						     ibv_match[i]->name);
+			list[i].ifindex = mlx5_nl_ifindex
+				(nl_rdma, list[i].ibv_dev->name);
 		if (nl_route < 0 ||
-		    !ifindex[i] ||
-		    mlx5_nl_switch_info(nl_route, ifindex[i], &info[i])) {
-			ifindex[i] = 0;
-			memset(&info[i], 0, sizeof(info[i]));
+		    !list[i].ifindex ||
+		    mlx5_nl_switch_info(nl_route, list[i].ifindex,
+					&list[i].info)) {
+			list[i].ifindex = 0;
+			memset(&list[i].info, 0, sizeof(list[i].info));
 			continue;
 		}
 	}
@@ -1261,7 +1308,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		close(nl_route);
 	/* Count unidentified devices. */
 	for (u = 0, i = 0; i != n; ++i)
-		if (!info[i].master && !info[i].representor)
+		if (!list[i].info.master && !list[i].info.representor)
 			++u;
 	if (u) {
 		if (n == 1 && u == 1) {
@@ -1275,6 +1322,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			n = 0;
 		}
 	}
+	/*
+	 * Sort list to probe devices in natural order for users convenience
+	 * (i.e. master first, then representors from lowest to highest ID).
+	 */
+	if (n)
+		qsort(list, n, sizeof(*list), mlx5_dev_spawn_data_cmp);
 	switch (pci_dev->id.device_id) {
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
@@ -1288,15 +1341,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	for (i = 0; i != n; ++i) {
 		uint32_t restore;
 
-		eth_list[i] = mlx5_dev_spawn(&pci_dev->device, ibv_match[i],
-					     vf, &info[i]);
-		if (!eth_list[i])
+		list[i].eth_dev = mlx5_dev_spawn
+			(&pci_dev->device, list[i].ibv_dev, vf, &list[i].info);
+		if (!list[i].eth_dev)
 			break;
-		restore = eth_list[i]->data->dev_flags;
-		rte_eth_copy_pci_info(eth_list[i], pci_dev);
+		restore = list[i].eth_dev->data->dev_flags;
+		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
 		/* Restore non-PCI flags cleared by the above call. */
-		eth_list[i]->data->dev_flags |= restore;
-		rte_eth_dev_probing_finish(eth_list[i]);
+		list[i].eth_dev->data->dev_flags |= restore;
+		rte_eth_dev_probing_finish(list[i].eth_dev);
 	}
 	mlx5_glue->free_device_list(ibv_list);
 	if (!n) {
@@ -1317,10 +1370,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		ret = -rte_errno;
 		/* Roll back. */
 		while (i--) {
-			mlx5_dev_close(eth_list[i]);
+			mlx5_dev_close(list[i].eth_dev);
 			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
-				rte_free(eth_list[i]->data->dev_private);
-			claim_zero(rte_eth_dev_release_port(eth_list[i]));
+				rte_free(list[i].eth_dev->data->dev_private);
+			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
 		}
 		/* Restore original error. */
 		rte_errno = -ret;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
                         ` (7 preceding siblings ...)
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 08/10] net/mlx5: probe port representors in natural order Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-09 11:57         ` Shahaf Shuler
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 10/10] net/mlx5: support negative identifiers " Adrien Mazarguil
  2018-07-10 16:04       ` [dpdk-dev] [PATCH v5 00/10] net/mlx5: add port representor support Adrien Mazarguil
  10 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

Prior to this patch, all port representors detected on a given device were
probed and Ethernet devices instantiated for each of them.

This patch adds support for the standard "representor" parameter, which
implies that port representors are not probed by default anymore, except
for the list provided through device arguments.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
--
v3 changes:

- Adapted representor detection to the reworked mlx5_dev_spawn().

v2 changes:

- Added error message for when rte_eth_devargs_parse() fails.
---
 doc/guides/nics/mlx5.rst                | 12 ++++++++
 doc/guides/prog_guide/poll_mode_drv.rst |  2 ++
 drivers/net/mlx5/mlx5.c                 | 41 ++++++++++++++++++++++++++--
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 7dd9c1c5e..0d0d21727 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -392,6 +392,18 @@ Run-time configuration
 
   Disabled by default.
 
+- ``representor`` parameter [list]
+
+  This parameter can be used to instantiate DPDK Ethernet devices from
+  existing port (or VF) representors configured on the device.
+
+  It is a standard parameter whose format is described in
+  :ref:`ethernet_device_standard_device_arguments`.
+
+  For instance, to probe port representors 0 through 2::
+
+    representor=[0-2]
+
 Firmware configuration
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/prog_guide/poll_mode_drv.rst b/doc/guides/prog_guide/poll_mode_drv.rst
index 4b69f6cbe..b2cf48354 100644
--- a/doc/guides/prog_guide/poll_mode_drv.rst
+++ b/doc/guides/prog_guide/poll_mode_drv.rst
@@ -360,6 +360,8 @@ Ethernet Device API
 
 The Ethernet device API exported by the Ethernet PMDs is described in the *DPDK API Reference*.
 
+.. _ethernet_device_standard_device_arguments:
+
 Ethernet Device Standard Device Arguments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 6592480bf..12a77afa8 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -92,6 +92,9 @@
 /* Activate Netlink support in VF mode. */
 #define MLX5_VF_NL_EN "vf_nl_en"
 
+/* Select port representors to instantiate. */
+#define MLX5_REPRESENTOR "representor"
+
 #ifndef HAVE_IBV_MLX5_MOD_MPW
 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
@@ -443,6 +446,9 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	struct mlx5_dev_config *config = opaque;
 	unsigned long tmp;
 
+	/* No-op, port representors are processed in mlx5_dev_spawn(). */
+	if (!strcmp(MLX5_REPRESENTOR, key))
+		return 0;
 	errno = 0;
 	tmp = strtoul(val, NULL, 0);
 	if (errno) {
@@ -515,6 +521,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
 		MLX5_VF_NL_EN,
+		MLX5_REPRESENTOR,
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
@@ -672,7 +679,9 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
  *
  * @return
  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
- *   is set.
+ *   is set. The following error is defined:
+ *
+ *   EBUSY: device is not supposed to be spawned.
  */
 static struct rte_eth_dev *
 mlx5_dev_spawn(struct rte_device *dpdk_dev,
@@ -723,6 +732,26 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	int own_domain_id = 0;
 	unsigned int i;
 
+	/* Determine if this port representor is supposed to be spawned. */
+	if (switch_info->representor && dpdk_dev->devargs) {
+		struct rte_eth_devargs eth_da;
+
+		err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
+		if (err) {
+			rte_errno = -err;
+			DRV_LOG(ERR, "failed to process device arguments: %s",
+				strerror(rte_errno));
+			return NULL;
+		}
+		for (i = 0; i < eth_da.nb_representor_ports; ++i)
+			if (eth_da.representor_ports[i] ==
+			    (uint16_t)switch_info->port_name)
+				break;
+		if (i == eth_da.nb_representor_ports) {
+			rte_errno = EBUSY;
+			return NULL;
+		}
+	}
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
 	errno = 0;
@@ -1343,8 +1372,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 
 		list[i].eth_dev = mlx5_dev_spawn
 			(&pci_dev->device, list[i].ibv_dev, vf, &list[i].info);
-		if (!list[i].eth_dev)
-			break;
+		if (!list[i].eth_dev) {
+			if (rte_errno != EBUSY)
+				break;
+			/* Device is disabled, ignore it. */
+			continue;
+		}
 		restore = list[i].eth_dev->data->dev_flags;
 		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
 		/* Restore non-PCI flags cleared by the above call. */
@@ -1370,6 +1403,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		ret = -rte_errno;
 		/* Roll back. */
 		while (i--) {
+			if (!list[i].eth_dev)
+				continue;
 			mlx5_dev_close(list[i].eth_dev);
 			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 				rte_free(list[i].eth_dev->data->dev_private);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v4 10/10] net/mlx5: support negative identifiers for port representors
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
                         ` (8 preceding siblings ...)
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors Adrien Mazarguil
@ 2018-07-05  8:45       ` Adrien Mazarguil
  2018-07-09 11:58         ` Shahaf Shuler
  2018-07-10 16:04       ` [dpdk-dev] [PATCH v5 00/10] net/mlx5: add port representor support Adrien Mazarguil
  10 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-05  8:45 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This patch brings support for BlueField representors.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
--
v3 changes:

- This patch was not present in prior revisions.
---
 drivers/net/mlx5/mlx5.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 12a77afa8..df7f39844 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1330,6 +1330,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			memset(&list[i].info, 0, sizeof(list[i].info));
 			continue;
 		}
+		/*
+		 * Port representors not associated with any VFs (e.g. on
+		 * BlueField devices) report -1 as a port identifier.
+		 * Quietly set it to zero since DPDK only supports positive
+		 * values.
+		 */
+		if (list[i].info.representor && list[i].info.port_name == -1)
+			list[i].info.port_name = 0;
 	}
 	if (nl_rdma >= 0)
 		close(nl_rdma);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors Adrien Mazarguil
@ 2018-07-09 11:57         ` Shahaf Shuler
  2018-07-10  9:37           ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-07-09 11:57 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Nélio Laranjeiro, Xueming(Steven) Li

Hi Adrien,


Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> Subject: [PATCH v4 07/10] net/mlx5: probe all port representors
> 
> Probe existing port representors in addition to their master device and
> associate them automatically.
> 
> To avoid collision between Ethernet devices, they are named as follows:
> 
> - "{DBDF}" for master/switch devices.
> - "{DBDF}_representor_{rep}" with "rep" starting from 0 for port
>   representors.
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> Cc: Xueming Li <xuemingl@mellanox.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>
> --
> v4 changes:
> 
> - Fixed domain ID release once the last port using it is closed. Closed
>   devices are not necessarily detached, their presence is not a good
>   indicator. Code was modified to check if they still use their domain IDs
>   before deciding to release it.
> 
> v3 changes:
> 
> - Nelio introduced mlx5_dev_to_port_id() to prevent the master device
> from
>   releasing a domain ID while representors are still bound. It is now
>   released by the last device closed.
> - Reverted to original naming convention as requested by Xueming and
>   Shahaf; "net_" prefix and "_0" suffix were dropped.
> - mlx5_dev_spawn() (previously mlx5_dev_spawn_one()) now decides on
> its own
>   whether underlying device is a representor.
> - Devices can now be probed in any order and not necessarily all at once;
>   representors can exist without a master device.
> - mlx5_pci_probe() iterates on the list of devices directly instead of
>   relying on an intermediate function (previously mlx5_dev_spawn()).
> - mlx5_get_ifname() was rewritten to rely on mlx5_nl_ifindex() when faced
>   with a representor.
> - Since it is not necessarily present, master device is now dynamically
>   retrieved in mlx5_dev_infos_get().
> 
> v2 changes:
> 
> - Added representor information to dev_infos_get(). DPDK port ID of master
>   device is now stored in the private structure to retrieve it
>   conveniently.
> - Master device is assigned dummy representor ID value -1 to better
>   distinguish from the the first actual representor reported by
>   dev_infos_get() as those are indexed from 0.
> - Added RTE_ETH_DEV_REPRESENTOR device flag.
> ---
>  drivers/net/mlx5/mlx5.c        | 134 ++++++++++++++++++++++++++++-------
> -
>  drivers/net/mlx5/mlx5.h        |  12 +++-
>  drivers/net/mlx5/mlx5_ethdev.c | 133
> +++++++++++++++++++++++++++++++----
>  drivers/net/mlx5/mlx5_mac.c    |   2 +-
>  drivers/net/mlx5/mlx5_stats.c  |   6 +-
>  5 files changed, 238 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> d06ba9886..c02afbb82 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -307,7 +307,27 @@ mlx5_dev_close(struct rte_eth_dev *dev)
>  	if (ret)
>  		DRV_LOG(WARNING, "port %u some flows still remain",
>  			dev->data->port_id);
> +	if (priv->domain_id !=
> RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
> +		unsigned int c = 0;
> +		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
> +		uint16_t port_id[i];
> +
> +		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i),
> i);
> +		while (i--) {
> +			struct priv *opriv =
> +				rte_eth_devices[port_id[i]].data-
> >dev_private;
> +
> +			if (!opriv ||
> +			    opriv->domain_id != priv->domain_id ||
> +			    &rte_eth_devices[port_id[i]] == dev)
> +				continue;
> +			++c;
> +		}
> +		if (!c)
> +			claim_zero(rte_eth_switch_domain_free(priv-
> >domain_id));
> +	}
>  	memset(priv, 0, sizeof(*priv));
> +	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
>  }
> 
>  const struct eth_dev_ops mlx5_dev_ops = { @@ -647,6 +667,8 @@
> mlx5_uar_init_secondary(struct rte_eth_dev *dev)
>   *   Verbs device.
>   * @param vf
>   *   If nonzero, enable VF-specific features.
> + * @param[in] switch_info
> + *   Switch properties of Ethernet device.
>   *
>   * @return
>   *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> @@ -655,7 +677,8 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
> static struct rte_eth_dev *  mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  	       struct ibv_device *ibv_dev,
> -	       int vf)
> +	       int vf,
> +	       const struct mlx5_switch_info *switch_info)
>  {
>  	struct ibv_context *ctx;
>  	struct ibv_device_attr_ex attr;
> @@ -697,6 +720,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> #endif
>  	struct ether_addr mac;
>  	char name[RTE_ETH_NAME_MAX_LEN];
> +	int own_domain_id = 0;
> +	unsigned int i;
> 
>  	/* Prepare shared data between primary and secondary process. */
>  	mlx5_prepare_shared_data();
> @@ -805,7 +830,12 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		DEBUG("ibv_query_device_ex() failed");
>  		goto error;
>  	}
> -	rte_strlcpy(name, dpdk_dev->name, sizeof(name));
> +	if (!switch_info->representor)
> +		rte_strlcpy(name, dpdk_dev->name, sizeof(name));
> +	else
> +		snprintf(name, sizeof(name), "%s_representor_%u",
> +			 dpdk_dev->name, switch_info->port_name);
> +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
>  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  		eth_dev = rte_eth_dev_attach_secondary(name);
>  		if (eth_dev == NULL) {
> @@ -874,6 +904,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		goto error;
>  	}
>  	priv->ctx = ctx;
> +	strncpy(priv->ibdev_name, priv->ctx->device->name,
> +		sizeof(priv->ibdev_name));
>  	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>  		sizeof(priv->ibdev_path));
>  	priv->device_attr = attr;
> @@ -883,6 +915,41 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
>  	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK,
> NETLINK_ROUTE);
>  	priv->nl_sn = 0;
> +	priv->representor = !!switch_info->representor;
> +	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> +	priv->representor_id =
> +		switch_info->representor ? switch_info->port_name : -1;
> +	/*
> +	 * Look for sibling devices in order to reuse their switch domain
> +	 * if any, otherwise allocate one.
> +	 */
> +	i = mlx5_dev_to_port_id(dpdk_dev, NULL, 0);
> +	if (i > 0) {
> +		uint16_t port_id[i];
> +
> +		i = RTE_MIN(mlx5_dev_to_port_id(dpdk_dev, port_id, i), i);
> +		while (i--) {
> +			const struct priv *opriv =
> +				rte_eth_devices[port_id[i]].data-
> >dev_private;
> +
> +			if (!opriv ||
> +			    opriv->domain_id ==
> +			    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> +				continue;
> +			priv->domain_id = opriv->domain_id;

It looks like for the second port it will use the domain_id of the first port. Is that what you intent? 

Note - I couldn't test it due to compilation errors:

/.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5_nl.c: In function 'mlx5_nl_switch_info_cb':
/.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5_nl.c:843:8: error: 'IFLA_PHYS_PORT_NAME' undecl
ared (first use in this function)
   case IFLA_PHYS_PORT_NAME:
        ^
/.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5_nl.c:843:8: note: each undeclared identifier is
 reported only once for each function it appears in
/.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5_nl.c:851:8: error: 'IFLA_PHYS_SWITCH_ID' undecl
ared (first use in this function)
   case IFLA_PHYS_SWITCH_ID:
        ^

My system info:
NAME="Red Hat Enterprise Linux Server"
VERSION="7.3 (Maipo)"
ID="rhel"
ID_LIKE="fedora"
VERSION_ID="7.3"
PRETTY_NAME="Red Hat Enterprise Linux Server 7.3 (Maipo)"
ANSI_COLOR="0;31"
CPE_NAME="cpe:/o:redhat:enterprise_linux:7.3:GA:server"
HOME_URL="https://www.redhat.com/"
BUG_REPORT_URL="https://bugzilla.redhat.com/"

REDHAT_BUGZILLA_PRODUCT="Red Hat Enterprise Linux 7"
REDHAT_BUGZILLA_PRODUCT_VERSION=7.3
REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux"
REDHAT_SUPPORT_PRODUCT_VERSION="7.3"
Red Hat Enterprise Linux Server release 7.3 (Maipo)
Red Hat Enterprise Linux Server release 7.3 (Maipo)


> +			break;
> +		}
> +	}
> +	if (priv->domain_id ==
> RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
> +		err = rte_eth_switch_domain_alloc(&priv->domain_id);
> +		if (err) {
> +			err = rte_errno;
> +			DRV_LOG(ERR, "unable to allocate switch domain:
> %s",
> +				strerror(rte_errno));
> +			goto error;
> +		}
> +		own_domain_id = 1;
> +	}
>  	err = mlx5_args(&config, dpdk_dev->devargs);
>  	if (err) {
>  		err = rte_errno;
> @@ -966,6 +1033,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		err = ENOMEM;
>  		goto error;
>  	}
> +	if (priv->representor)
> +		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
>  	eth_dev->data->dev_private = priv;
>  	priv->dev_data = eth_dev->data;
>  	eth_dev->data->mac_addrs = priv->mac;
> @@ -1084,6 +1153,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  			close(priv->nl_socket_route);
>  		if (priv->nl_socket_rdma >= 0)
>  			close(priv->nl_socket_rdma);
> +		if (own_domain_id)
> +			claim_zero(rte_eth_switch_domain_free(priv-
> >domain_id));
>  		rte_free(priv);
>  	}
>  	if (pd)
> @@ -1100,7 +1171,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  /**
>   * DPDK callback to register a PCI device.
>   *
> - * This function spawns an Ethernet device out of a given PCI device.
> + * This function spawns Ethernet devices out of a given PCI device.
>   *
>   * @param[in] pci_drv
>   *   PCI driver structure (mlx5_driver).
> @@ -1115,7 +1186,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  	       struct rte_pci_device *pci_dev)  {
>  	struct ibv_device **ibv_list;
> -	struct rte_eth_dev *eth_dev = NULL;
>  	unsigned int n = 0;
>  	int vf;
>  	int ret;
> @@ -1150,9 +1220,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
> 
>  	unsigned int ifindex[n];
>  	struct mlx5_switch_info info[n];
> +	struct rte_eth_dev *eth_list[n];
>  	int nl_route = n ? mlx5_nl_init(0, NETLINK_ROUTE) : -1;
>  	int nl_rdma = n ? mlx5_nl_init(0, NETLINK_RDMA) : -1;
>  	unsigned int i;
> +	unsigned int u;
> 
>  	/*
>  	 * The existence of several matching entries (n > 1) means port @@ -
> 1187,28 +1259,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  		close(nl_rdma);
>  	if (nl_route >= 0)
>  		close(nl_route);
> -	/* Look for master device. */
> -	for (i = 0; i != n; ++i) {
> -		if (!info[i].master)
> -			continue;
> -		/* Make it the first entry. */
> -		if (i == 0)
> -			break;
> -		ibv_match[n] = ibv_match[0];
> -		ibv_match[0] = ibv_match[i];
> -		ibv_match[n] = NULL;
> -		break;
> -	}
> -	if (n && i == n) {
> -		if (n == 1 && !info[0].representor) {
> +	/* Count unidentified devices. */
> +	for (u = 0, i = 0; i != n; ++i)
> +		if (!info[i].master && !info[i].representor)
> +			++u;
> +	if (u) {
> +		if (n == 1 && u == 1) {
>  			/* Case #2. */
>  			DRV_LOG(INFO, "no switch support detected");
> -		} else if (n == 1) {
> -			/* Case #3. */
> -			DRV_LOG(ERR,
> -				"device looks like a port representor, this is"
> -				" not supported yet");
> -			n = 0;
>  		} else {
>  			/* Case #3. */
>  			DRV_LOG(ERR,
> @@ -1227,8 +1285,19 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  	default:
>  		vf = 0;
>  	}
> -	if (n)
> -		eth_dev = mlx5_dev_spawn(&pci_dev->device,
> ibv_match[0], vf);
> +	for (i = 0; i != n; ++i) {
> +		uint32_t restore;
> +
> +		eth_list[i] = mlx5_dev_spawn(&pci_dev->device,
> ibv_match[i],
> +					     vf, &info[i]);
> +		if (!eth_list[i])
> +			break;
> +		restore = eth_list[i]->data->dev_flags;
> +		rte_eth_copy_pci_info(eth_list[i], pci_dev);
> +		/* Restore non-PCI flags cleared by the above call. */
> +		eth_list[i]->data->dev_flags |= restore;
> +		rte_eth_dev_probing_finish(eth_list[i]);
> +	}
>  	mlx5_glue->free_device_list(ibv_list);
>  	if (!n) {
>  		DRV_LOG(WARNING,
> @@ -1238,7 +1307,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  			pci_dev->addr.devid, pci_dev->addr.function);
>  		rte_errno = ENOENT;
>  		ret = -rte_errno;
> -	} else if (!eth_dev) {
> +	} else if (i != n) {
>  		DRV_LOG(ERR,
>  			"probe of PCI device " PCI_PRI_FMT " aborted after"
>  			" encountering an error: %s",
> @@ -1246,9 +1315,16 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  			pci_dev->addr.devid, pci_dev->addr.function,
>  			strerror(rte_errno));
>  		ret = -rte_errno;
> +		/* Roll back. */
> +		while (i--) {
> +			mlx5_dev_close(eth_list[i]);
> +			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +				rte_free(eth_list[i]->data->dev_private);
> +			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> +		}
> +		/* Restore original error. */
> +		rte_errno = -ret;
>  	} else {
> -		rte_eth_copy_pci_info(eth_dev, pci_dev);
> -		rte_eth_dev_probing_finish(eth_dev);
>  		ret = 0;
>  	}
>  	return ret;
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> 704046270..cc01310e0 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -159,6 +159,7 @@ struct priv {
>  	struct ibv_context *ctx; /* Verbs context. */
>  	struct ibv_device_attr_ex device_attr; /* Device properties. */
>  	struct ibv_pd *pd; /* Protection Domain. */
> +	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */


Why we need a dedicated entry for the ibdev_name? it is already part of priv->ctx->device->name. 

>  	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for
> secondary */
>  	struct ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC
> addresses. */
>  	BITFIELD_DECLARE(mac_own, uint64_t,
> MLX5_MAX_MAC_ADDRESSES); @@ -168,6 +169,9 @@ struct priv {
>  	/* Device properties. */
>  	uint16_t mtu; /* Configured MTU. */
>  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> +	unsigned int representor:1; /* Device is a port representor. */
> +	uint16_t domain_id; /* Switch domain identifier. */
> +	int32_t representor_id; /* Port representor identifier. */
>  	/* RX/TX queues. */
>  	unsigned int rxqs_n; /* RX queues array size. */
>  	unsigned int txqs_n; /* TX queues array size. */ @@ -217,9 +221,12
> @@ int mlx5_getenv_int(const char *);
> 
>  /* mlx5_ethdev.c */
> 
> +int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> +			   char (*ifname)[IF_NAMESIZE]);
>  int mlx5_get_ifname(const struct rte_eth_dev *dev, char
> (*ifname)[IF_NAMESIZE]);  int mlx5_ifindex(const struct rte_eth_dev *dev);
> -int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
> +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> +	       int master);
>  int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);  int
> mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
>  		   unsigned int flags);
> @@ -244,6 +251,9 @@ int mlx5_set_link_up(struct rte_eth_dev *dev);  int
> mlx5_is_removed(struct rte_eth_dev *dev);  eth_tx_burst_t
> mlx5_select_tx_function(struct rte_eth_dev *dev);  eth_rx_burst_t
> mlx5_select_rx_function(struct rte_eth_dev *dev);
> +unsigned int mlx5_dev_to_port_id(const struct rte_device *dev,
> +				 uint16_t *port_list,
> +				 unsigned int port_list_n);
> 
>  /* mlx5_mac.c */
> 
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> b/drivers/net/mlx5/mlx5_ethdev.c index 819f5baad..05f66f7b6 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -27,6 +27,7 @@
>  #include <time.h>
> 
>  #include <rte_atomic.h>
> +#include <rte_common.h>
>  #include <rte_ethdev_driver.h>
>  #include <rte_bus_pci.h>
>  #include <rte_mbuf.h>
> @@ -93,7 +94,7 @@ struct ethtool_link_settings {  #endif
> 
>  /**
> - * Get interface name from private structure.
> + * Get master interface name from private structure.
>   *
>   * @param[in] dev
>   *   Pointer to Ethernet device.
> @@ -104,7 +105,8 @@ struct ethtool_link_settings {
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_get_ifname(const struct rte_eth_dev *dev, char
> (*ifname)[IF_NAMESIZE])
> +mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> +		       char (*ifname)[IF_NAMESIZE])
>  {
>  	struct priv *priv = dev->data->dev_private;
>  	DIR *dir;
> @@ -179,6 +181,39 @@ mlx5_get_ifname(const struct rte_eth_dev *dev,
> char (*ifname)[IF_NAMESIZE])  }
> 
>  /**
> + * Get interface name from private structure.
> + *
> + * This is a port representor-aware version of mlx5_get_master_ifname().
> + *
> + * @param[in] dev
> + *   Pointer to Ethernet device.
> + * @param[out] ifname
> + *   Interface name output buffer.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +int
> +mlx5_get_ifname(const struct rte_eth_dev *dev, char
> +(*ifname)[IF_NAMESIZE]) {
> +	struct priv *priv = dev->data->dev_private;
> +	unsigned int ifindex =
> +		priv->nl_socket_rdma >= 0 ?
> +		mlx5_nl_ifindex(priv->nl_socket_rdma, priv->ibdev_name) :
> 0;
> +
> +	if (!ifindex) {
> +		if (!priv->representor)
> +			return mlx5_get_master_ifname(dev, ifname);
> +		rte_errno = ENXIO;
> +		return -rte_errno;
> +	}
> +	if (if_indextoname(ifindex, &(*ifname)[0]))
> +		return 0;
> +	rte_errno = errno;
> +	return -rte_errno;
> +}
> +
> +/**
>   * Get the interface index from device name.
>   *
>   * @param[in] dev
> @@ -214,12 +249,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
>   *   Request number to pass to ioctl().
>   * @param[out] ifr
>   *   Interface request structure output buffer.
> + * @param master
> + *   When device is a port representor, perform request on master device
> + *   instead.
>   *
>   * @return
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
> +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> +	   int master)
>  {
>  	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
>  	int ret = 0;
> @@ -228,7 +267,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req,
> struct ifreq *ifr)
>  		rte_errno = errno;
>  		return -rte_errno;
>  	}
> -	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
> +	if (master)
> +		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
> +	else
> +		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
>  	if (ret)
>  		goto error;
>  	ret = ioctl(sock, req, ifr);
> @@ -258,7 +300,7 @@ int
>  mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)  {
>  	struct ifreq request;
> -	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
> +	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
> 
>  	if (ret)
>  		return ret;
> @@ -282,7 +324,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t
> mtu)  {
>  	struct ifreq request = { .ifr_mtu = mtu, };
> 
> -	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
> +	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
>  }
> 
>  /**
> @@ -302,13 +344,13 @@ int
>  mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int
> flags)  {
>  	struct ifreq request;
> -	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
> +	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
> 
>  	if (ret)
>  		return ret;
>  	request.ifr_flags &= keep;
>  	request.ifr_flags |= flags & ~keep;
> -	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
> +	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
>  }
> 
>  /**
> @@ -477,6 +519,30 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev,
> struct rte_eth_dev_info *info)
>  	info->speed_capa = priv->link_speed_capa;
>  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
>  	mlx5_set_default_params(dev, info);
> +	info->switch_info.name = dev->data->name;
> +	info->switch_info.domain_id = priv->domain_id;
> +	info->switch_info.port_id = priv->representor_id;
> +	if (priv->representor) {
> +		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
> +		uint16_t port_id[i];
> +
> +		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i),
> i);
> +		while (i--) {
> +			struct priv *opriv =
> +				rte_eth_devices[port_id[i]].data-
> >dev_private;
> +
> +			if (!opriv ||
> +			    opriv->representor ||
> +			    opriv->domain_id != priv->domain_id)
> +				continue;
> +			/*
> +			 * Override switch name with that of the master
> +			 * device.
> +			 */
> +			info->switch_info.name = opriv->dev_data->name;
> +			break;

According to this logic it means once the master device is closed, all the representors are no longer belong to the same switch (switch name of each is different) which is not correct.
According to your notes it is possible to close master w/o closing the representor. 

Why not just storing the master switch name when probing the representor and to use it as is on the dev_info? 

> +		}
> +	}
>  }
> 
>  /**
> @@ -540,7 +606,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev
> *dev,
>  	int link_speed = 0;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed:
> %s",
>  			dev->data->port_id, strerror(rte_errno)); @@ -550,7
> +616,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
>  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
>  				(ifr.ifr_flags & IFF_RUNNING));
>  	ifr.ifr_data = (void *)&edata;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed:
> %s", @@ -611,7 +677,7 @@ mlx5_link_update_unlocked_gs(struct
> rte_eth_dev *dev,
>  	uint64_t sc;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed:
> %s",
>  			dev->data->port_id, strerror(rte_errno)); @@ -621,7
> +687,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
>  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
>  				(ifr.ifr_flags & IFF_RUNNING));
>  	ifr.ifr_data = (void *)&gcmd;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(DEBUG,
>  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GLINKSETTINGS)"
> @@ -638,7 +704,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev
> *dev,
> 
>  	*ecmd = gcmd;
>  	ifr.ifr_data = (void *)ecmd;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(DEBUG,
>  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GLINKSETTINGS)"
> @@ -801,7 +867,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev,
> struct rte_eth_fc_conf *fc_conf)
>  	int ret;
> 
>  	ifr.ifr_data = (void *)&ethpause;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GPAUSEPARAM) failed:"
> @@ -854,7 +920,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev,
> struct rte_eth_fc_conf *fc_conf)
>  		ethpause.tx_pause = 1;
>  	else
>  		ethpause.tx_pause = 0;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_SPAUSEPARAM)"
> @@ -1193,3 +1259,40 @@ mlx5_is_removed(struct rte_eth_dev *dev)
>  		return 1;
>  	return 0;
>  }
> +
> +/**
> + * Get port ID list of mlx5 instances sharing a common device.
> + *
> + * @param[in] dev
> + *   Device to look for.
> + * @param[out] port_list
> + *   Result buffer for collected port IDs.
> + * @param port_list_n
> + *   Maximum number of entries in result buffer. If 0, @p port_list can be
> + *   NULL.
> + *
> + * @return
> + *   Number of matching instances regardless of the @p port_list_n
> + *   parameter, 0 if none were found.
> + */
> +unsigned int
> +mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list,
> +		    unsigned int port_list_n)
> +{
> +	uint16_t id;
> +	unsigned int n = 0;
> +
> +	RTE_ETH_FOREACH_DEV(id) {
> +		struct rte_eth_dev *ldev = &rte_eth_devices[id];
> +
> +		if (!ldev->device ||
> +		    !ldev->device->driver ||
> +		    strcmp(ldev->device->driver->name,
> MLX5_DRIVER_NAME) ||
> +		    ldev->device != dev)
> +			continue;
> +		if (n < port_list_n)
> +			port_list[n] = id;
> +		n++;
> +	}
> +	return n;
> +}
> diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
> index 672a47619..12ee37f55 100644
> --- a/drivers/net/mlx5/mlx5_mac.c
> +++ b/drivers/net/mlx5/mlx5_mac.c
> @@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t
> (*mac)[ETHER_ADDR_LEN])
>  	struct ifreq request;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
> +	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
>  	if (ret)
>  		return ret;
>  	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); diff -
> -git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index
> 875dd1027..91f3d474a 100644
> --- a/drivers/net/mlx5/mlx5_stats.c
> +++ b/drivers/net/mlx5/mlx5_stats.c
> @@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev,
> uint64_t *stats)
>  	et_stats->cmd = ETHTOOL_GSTATS;
>  	et_stats->n_stats = xstats_ctrl->stats_n;
>  	ifr.ifr_data = (caddr_t)et_stats;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u unable to read statistic values from device",
> @@ -194,7 +194,7 @@ mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev)
> {
> 
>  	drvinfo.cmd = ETHTOOL_GDRVINFO;
>  	ifr.ifr_data = (caddr_t)&drvinfo;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u unable to query number of
> statistics",
>  			dev->data->port_id);
> @@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
>  	strings->string_set = ETH_SS_STATS;
>  	strings->len = dev_stats_n;
>  	ifr.ifr_data = (caddr_t)strings;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u unable to get statistic
> names",
>  			dev->data->port_id);
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors Adrien Mazarguil
@ 2018-07-09 11:57         ` Shahaf Shuler
  2018-07-10  9:37           ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-07-09 11:57 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> Subject: [PATCH v4 09/10] net/mlx5: add parameter for port representors
> 
> Prior to this patch, all port representors detected on a given device were
> probed and Ethernet devices instantiated for each of them.
> 
> This patch adds support for the standard "representor" parameter, which
> implies that port representors are not probed by default anymore, except
> for the list provided through device arguments.
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> --
> v3 changes:
> 
> - Adapted representor detection to the reworked mlx5_dev_spawn().
> 
> v2 changes:
> 
> - Added error message for when rte_eth_devargs_parse() fails.
> ---
>  doc/guides/nics/mlx5.rst                | 12 ++++++++
>  doc/guides/prog_guide/poll_mode_drv.rst |  2 ++
>  drivers/net/mlx5/mlx5.c                 | 41 ++++++++++++++++++++++++++--
>  3 files changed, 52 insertions(+), 3 deletions(-)
> 
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index
> 7dd9c1c5e..0d0d21727 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -392,6 +392,18 @@ Run-time configuration
> 
>    Disabled by default.
> 
> +- ``representor`` parameter [list]
> +
> +  This parameter can be used to instantiate DPDK Ethernet devices from
> + existing port (or VF) representors configured on the device.
> +
> +  It is a standard parameter whose format is described in
> + :ref:`ethernet_device_standard_device_arguments`.
> +
> +  For instance, to probe port representors 0 through 2::
> +
> +    representor=[0-2]
> +
>  Firmware configuration
>  ~~~~~~~~~~~~~~~~~~~~~~
> 
> diff --git a/doc/guides/prog_guide/poll_mode_drv.rst
> b/doc/guides/prog_guide/poll_mode_drv.rst
> index 4b69f6cbe..b2cf48354 100644
> --- a/doc/guides/prog_guide/poll_mode_drv.rst
> +++ b/doc/guides/prog_guide/poll_mode_drv.rst
> @@ -360,6 +360,8 @@ Ethernet Device API
> 
>  The Ethernet device API exported by the Ethernet PMDs is described in the
> *DPDK API Reference*.
> 
> +.. _ethernet_device_standard_device_arguments:
> +
>  Ethernet Device Standard Device Arguments
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 6592480bf..12a77afa8 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -92,6 +92,9 @@
>  /* Activate Netlink support in VF mode. */  #define MLX5_VF_NL_EN
> "vf_nl_en"
> 
> +/* Select port representors to instantiate. */ #define MLX5_REPRESENTOR
> +"representor"
> +
>  #ifndef HAVE_IBV_MLX5_MOD_MPW
>  #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)  #define
> MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) @@ -443,6 +446,9
> @@ mlx5_args_check(const char *key, const char *val, void *opaque)
>  	struct mlx5_dev_config *config = opaque;
>  	unsigned long tmp;
> 
> +	/* No-op, port representors are processed in mlx5_dev_spawn(). */
> +	if (!strcmp(MLX5_REPRESENTOR, key))
> +		return 0;
>  	errno = 0;
>  	tmp = strtoul(val, NULL, 0);
>  	if (errno) {
> @@ -515,6 +521,7 @@ mlx5_args(struct mlx5_dev_config *config, struct
> rte_devargs *devargs)
>  		MLX5_RX_VEC_EN,
>  		MLX5_L3_VXLAN_EN,
>  		MLX5_VF_NL_EN,
> +		MLX5_REPRESENTOR,
>  		NULL,
>  	};
>  	struct rte_kvargs *kvlist;
> @@ -672,7 +679,9 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
>   *
>   * @return
>   *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> - *   is set.
> + *   is set. The following error is defined:
> + *
> + *   EBUSY: device is not supposed to be spawned.
>   */
>  static struct rte_eth_dev *
>  mlx5_dev_spawn(struct rte_device *dpdk_dev, @@ -723,6 +732,26 @@
> mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  	int own_domain_id = 0;
>  	unsigned int i;
> 
> +	/* Determine if this port representor is supposed to be spawned. */
> +	if (switch_info->representor && dpdk_dev->devargs) {
> +		struct rte_eth_devargs eth_da;
> +
> +		err = rte_eth_devargs_parse(dpdk_dev->devargs->args,
> &eth_da);
> +		if (err) {
> +			rte_errno = -err;
> +			DRV_LOG(ERR, "failed to process device arguments:
> %s",
> +				strerror(rte_errno));
> +			return NULL;
> +		}
> +		for (i = 0; i < eth_da.nb_representor_ports; ++i)
> +			if (eth_da.representor_ports[i] ==
> +			    (uint16_t)switch_info->port_name)
> +				break;
> +		if (i == eth_da.nb_representor_ports) {
> +			rte_errno = EBUSY;

Why EBUSY is the correct errno? Will another attempts to probe the device can be successful? 

> +			return NULL;
> +		}
> +	}
>  	/* Prepare shared data between primary and secondary process. */
>  	mlx5_prepare_shared_data();
>  	errno = 0;
> @@ -1343,8 +1372,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
> 
>  		list[i].eth_dev = mlx5_dev_spawn
>  			(&pci_dev->device, list[i].ibv_dev, vf, &list[i].info);
> -		if (!list[i].eth_dev)
> -			break;
> +		if (!list[i].eth_dev) {
> +			if (rte_errno != EBUSY)
> +				break;
> +			/* Device is disabled, ignore it. */
> +			continue;
> +		}
>  		restore = list[i].eth_dev->data->dev_flags;
>  		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
>  		/* Restore non-PCI flags cleared by the above call. */ @@ -
> 1370,6 +1403,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  		ret = -rte_errno;
>  		/* Roll back. */
>  		while (i--) {
> +			if (!list[i].eth_dev)
> +				continue;
>  			mlx5_dev_close(list[i].eth_dev);
>  			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
>  				rte_free(list[i].eth_dev->data-
> >dev_private);
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 10/10] net/mlx5: support negative identifiers for port representors
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 10/10] net/mlx5: support negative identifiers " Adrien Mazarguil
@ 2018-07-09 11:58         ` Shahaf Shuler
  2018-07-10  9:37           ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-07-09 11:58 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

Adrien, thank for this patch.

Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> Subject: [PATCH v4 10/10] net/mlx5: support negative identifiers for port
> representors
> 
> This patch brings support for BlueField representors.
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>
> --
> v3 changes:
> 
> - This patch was not present in prior revisions.
> ---
>  drivers/net/mlx5/mlx5.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 12a77afa8..df7f39844 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -1330,6 +1330,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  			memset(&list[i].info, 0, sizeof(list[i].info));
>  			continue;
>  		}
> +		/*
> +		 * Port representors not associated with any VFs (e.g. on
> +		 * BlueField devices) report -1 as a port identifier.
> +		 * Quietly set it to zero since DPDK only supports positive
> +		 * values.
> +		 */

I am waiting for the final answer from the BlueField team about the way they are going to enum the BlueField representors. 
In case it will be the same as x86 I think we can drop this patch, otherwise use it, agree?

> +		if (list[i].info.representor && list[i].info.port_name == -1)
> +			list[i].info.port_name = 0;
>  	}
>  	if (nl_rdma >= 0)
>  		close(nl_rdma);
> --
> 2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors
  2018-07-09 11:57         ` Shahaf Shuler
@ 2018-07-10  9:37           ` Adrien Mazarguil
  2018-07-10 10:13             ` Shahaf Shuler
  0 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10  9:37 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Nélio Laranjeiro, Xueming(Steven) Li

On Mon, Jul 09, 2018 at 11:57:29AM +0000, Shahaf Shuler wrote:
> Hi Adrien,
> 
> 
> Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > Subject: [PATCH v4 07/10] net/mlx5: probe all port representors
> > 
> > Probe existing port representors in addition to their master device and
> > associate them automatically.
> > 
> > To avoid collision between Ethernet devices, they are named as follows:
> > 
> > - "{DBDF}" for master/switch devices.
> > - "{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> >   representors.
> > 
> > (Patch based on prior work from Yuanhan Liu)
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> > Cc: Xueming Li <xuemingl@mellanox.com>
> > Cc: Shahaf Shuler <shahafs@mellanox.com>
> > --
> > v4 changes:
> > 
> > - Fixed domain ID release once the last port using it is closed. Closed
> >   devices are not necessarily detached, their presence is not a good
> >   indicator. Code was modified to check if they still use their domain IDs
> >   before deciding to release it.
<snip>
> > @@ -883,6 +915,41 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> >  	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
> >  	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK,
> > NETLINK_ROUTE);
> >  	priv->nl_sn = 0;
> > +	priv->representor = !!switch_info->representor;
> > +	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> > +	priv->representor_id =
> > +		switch_info->representor ? switch_info->port_name : -1;
> > +	/*
> > +	 * Look for sibling devices in order to reuse their switch domain
> > +	 * if any, otherwise allocate one.
> > +	 */
> > +	i = mlx5_dev_to_port_id(dpdk_dev, NULL, 0);
> > +	if (i > 0) {
> > +		uint16_t port_id[i];
> > +
> > +		i = RTE_MIN(mlx5_dev_to_port_id(dpdk_dev, port_id, i), i);
> > +		while (i--) {
> > +			const struct priv *opriv =
> > +				rte_eth_devices[port_id[i]].data-
> > >dev_private;
> > +
> > +			if (!opriv ||
> > +			    opriv->domain_id ==
> > +			    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > +				continue;
> > +			priv->domain_id = opriv->domain_id;
> 
> It looks like for the second port it will use the domain_id of the first port. Is that what you intent? 

Yes, it's on purpose. Master and representors of a given device must share
the same domain ID to let applications know they can create flow rules to
forward traffic between them all.

> Note - I couldn't test it due to compilation errors:
> 
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5_nl.c: In function 'mlx5_nl_switch_info_cb':
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5_nl.c:843:8: error: 'IFLA_PHYS_PORT_NAME' undecl
> ared (first use in this function)
>    case IFLA_PHYS_PORT_NAME:
>         ^
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5_nl.c:843:8: note: each undeclared identifier is
>  reported only once for each function it appears in
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5_nl.c:851:8: error: 'IFLA_PHYS_SWITCH_ID' undecl
> ared (first use in this function)
>    case IFLA_PHYS_SWITCH_ID:
>         ^
> 
> My system info:
> NAME="Red Hat Enterprise Linux Server"
> VERSION="7.3 (Maipo)"
> ID="rhel"
> ID_LIKE="fedora"
> VERSION_ID="7.3"
> PRETTY_NAME="Red Hat Enterprise Linux Server 7.3 (Maipo)"
> ANSI_COLOR="0;31"
> CPE_NAME="cpe:/o:redhat:enterprise_linux:7.3:GA:server"
> HOME_URL="https://www.redhat.com/"
> BUG_REPORT_URL="https://bugzilla.redhat.com/"
> 
> REDHAT_BUGZILLA_PRODUCT="Red Hat Enterprise Linux 7"
> REDHAT_BUGZILLA_PRODUCT_VERSION=7.3
> REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux"
> REDHAT_SUPPORT_PRODUCT_VERSION="7.3"
> Red Hat Enterprise Linux Server release 7.3 (Maipo)
> Red Hat Enterprise Linux Server release 7.3 (Maipo)

OK, I'll redefine in v5 in case they are missing on the host system.

<snip>
> > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > 704046270..cc01310e0 100644
> > --- a/drivers/net/mlx5/mlx5.h
> > +++ b/drivers/net/mlx5/mlx5.h
> > @@ -159,6 +159,7 @@ struct priv {
> >  	struct ibv_context *ctx; /* Verbs context. */
> >  	struct ibv_device_attr_ex device_attr; /* Device properties. */
> >  	struct ibv_pd *pd; /* Protection Domain. */
> > +	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> 
> 
> Why we need a dedicated entry for the ibdev_name? it is already part of priv->ctx->device->name. 

Heh, same reason as the next line below, don't forget those damn secondaries
which can't dereference local pointers from the primary process :)

> >  	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for
> > secondary */
<snip>
> > struct rte_eth_dev_info *info)
> >  	info->speed_capa = priv->link_speed_capa;
> >  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
> >  	mlx5_set_default_params(dev, info);
> > +	info->switch_info.name = dev->data->name;
> > +	info->switch_info.domain_id = priv->domain_id;
> > +	info->switch_info.port_id = priv->representor_id;
> > +	if (priv->representor) {
> > +		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
> > +		uint16_t port_id[i];
> > +
> > +		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i),
> > i);
> > +		while (i--) {
> > +			struct priv *opriv =
> > +				rte_eth_devices[port_id[i]].data-
> > >dev_private;
> > +
> > +			if (!opriv ||
> > +			    opriv->representor ||
> > +			    opriv->domain_id != priv->domain_id)
> > +				continue;
> > +			/*
> > +			 * Override switch name with that of the master
> > +			 * device.
> > +			 */
> > +			info->switch_info.name = opriv->dev_data->name;
> > +			break;
> 
> According to this logic it means once the master device is closed, all the representors are no longer belong to the same switch (switch name of each is different) which is not correct.

They still share the same domain ID, which is what actually matters. The
switch name is only provided to let applications identify the master
(control) device in case it's needed.

> According to your notes it is possible to close master w/o closing the representor. 

This allows devices to be probed in any order on a needed basis, not all at
once. It's done on purpose to pave the way for hotplug support.

> Why not just storing the master switch name when probing the representor and to use it as is on the dev_info? 

The switch name *must* be that of the master device. If the master is not
probed, there can't be a switch name. However there's no real provision for
this in the API, so I chose the most acceptable unique name, which is the
name of the local device. Would you prefer an empty name instead?

Thing is, on mlx5 flow rules can be created directly between representors
without involving the master device. An empty switch name may be misleading
in this respect.

What do you suggest?

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors
  2018-07-09 11:57         ` Shahaf Shuler
@ 2018-07-10  9:37           ` Adrien Mazarguil
  2018-07-10 10:16             ` Shahaf Shuler
  0 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10  9:37 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

On Mon, Jul 09, 2018 at 11:57:37AM +0000, Shahaf Shuler wrote:
> Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > Subject: [PATCH v4 09/10] net/mlx5: add parameter for port representors
> > 
> > Prior to this patch, all port representors detected on a given device were
> > probed and Ethernet devices instantiated for each of them.
> > 
> > This patch adds support for the standard "representor" parameter, which
> > implies that port representors are not probed by default anymore, except
> > for the list provided through device arguments.
> > 
> > (Patch based on prior work from Yuanhan Liu)
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> > --
> > v3 changes:
> > 
> > - Adapted representor detection to the reworked mlx5_dev_spawn().
<snip>
> > @@ -672,7 +679,9 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
> >   *
> >   * @return
> >   *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> > - *   is set.
> > + *   is set. The following error is defined:
> > + *
> > + *   EBUSY: device is not supposed to be spawned.
> >   */
> >  static struct rte_eth_dev *
> >  mlx5_dev_spawn(struct rte_device *dpdk_dev, @@ -723,6 +732,26 @@
> > mlx5_dev_spawn(struct rte_device *dpdk_dev,
> >  	int own_domain_id = 0;
> >  	unsigned int i;
> > 
> > +	/* Determine if this port representor is supposed to be spawned. */
> > +	if (switch_info->representor && dpdk_dev->devargs) {
> > +		struct rte_eth_devargs eth_da;
> > +
> > +		err = rte_eth_devargs_parse(dpdk_dev->devargs->args,
> > &eth_da);
> > +		if (err) {
> > +			rte_errno = -err;
> > +			DRV_LOG(ERR, "failed to process device arguments:
> > %s",
> > +				strerror(rte_errno));
> > +			return NULL;
> > +		}
> > +		for (i = 0; i < eth_da.nb_representor_ports; ++i)
> > +			if (eth_da.representor_ports[i] ==
> > +			    (uint16_t)switch_info->port_name)
> > +				break;
> > +		if (i == eth_da.nb_representor_ports) {
> > +			rte_errno = EBUSY;
> 
> Why EBUSY is the correct errno? Will another attempts to probe the device can be successful? 

That's the definition of EAGAIN :)

I thought EBUSY in the sense of "don't disturb" would be appropriate. This
value was also chosen because it is not likely to be returned by any
intermediate function calls. I've defined EBUSY along with the return value
of this function for clarity (see above). Any suggestion?

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 10/10] net/mlx5: support negative identifiers for port representors
  2018-07-09 11:58         ` Shahaf Shuler
@ 2018-07-10  9:37           ` Adrien Mazarguil
  0 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10  9:37 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

On Mon, Jul 09, 2018 at 11:58:05AM +0000, Shahaf Shuler wrote:
> Adrien, thank for this patch.
> 
> Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > Subject: [PATCH v4 10/10] net/mlx5: support negative identifiers for port
> > representors
> > 
> > This patch brings support for BlueField representors.
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Shahaf Shuler <shahafs@mellanox.com>
> > --
> > v3 changes:
> > 
> > - This patch was not present in prior revisions.
> > ---
> >  drivers/net/mlx5/mlx5.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> > 
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > 12a77afa8..df7f39844 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -1330,6 +1330,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> > __rte_unused,
> >  			memset(&list[i].info, 0, sizeof(list[i].info));
> >  			continue;
> >  		}
> > +		/*
> > +		 * Port representors not associated with any VFs (e.g. on
> > +		 * BlueField devices) report -1 as a port identifier.
> > +		 * Quietly set it to zero since DPDK only supports positive
> > +		 * values.
> > +		 */
> 
> I am waiting for the final answer from the BlueField team about the way they are going to enum the BlueField representors. 
> In case it will be the same as x86 I think we can drop this patch, otherwise use it, agree?

No problem.

Note this patch is also based on the assumption that there's only one such
device, but I couldn't verify it.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors
  2018-07-10  9:37           ` Adrien Mazarguil
@ 2018-07-10 10:13             ` Shahaf Shuler
  2018-07-10 10:58               ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-07-10 10:13 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Nélio Laranjeiro, Xueming(Steven) Li

Tuesday, July 10, 2018 12:37 PM, Adrien Mazarguil:
> Subject: Re: [PATCH v4 07/10] net/mlx5: probe all port representors
> 
> On Mon, Jul 09, 2018 at 11:57:29AM +0000, Shahaf Shuler wrote:
> > Hi Adrien,
> >
> >
> > Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > > Subject: [PATCH v4 07/10] net/mlx5: probe all port representors
> > >
> > > Probe existing port representors in addition to their master device
> > > and associate them automatically.
> > >
> > > To avoid collision between Ethernet devices, they are named as follows:
> > >
> > > - "{DBDF}" for master/switch devices.
> > > - "{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> > >   representors.
> > >
> > > (Patch based on prior work from Yuanhan Liu)
> > >
> > > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > > Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> > > Cc: Xueming Li <xuemingl@mellanox.com>
> > > Cc: Shahaf Shuler <shahafs@mellanox.com>
> > > --
> > > v4 changes:
> > >
> > > - Fixed domain ID release once the last port using it is closed. Closed
> > >   devices are not necessarily detached, their presence is not a good
> > >   indicator. Code was modified to check if they still use their domain IDs
> > >   before deciding to release it.
> <snip>
> > > @@ -883,6 +915,41 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > >  	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
> > >  	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK,
> > > NETLINK_ROUTE);
> > >  	priv->nl_sn = 0;
> > > +	priv->representor = !!switch_info->representor;
> > > +	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> > > +	priv->representor_id =
> > > +		switch_info->representor ? switch_info->port_name : -1;
> > > +	/*
> > > +	 * Look for sibling devices in order to reuse their switch domain
> > > +	 * if any, otherwise allocate one.
> > > +	 */
> > > +	i = mlx5_dev_to_port_id(dpdk_dev, NULL, 0);
> > > +	if (i > 0) {
> > > +		uint16_t port_id[i];
> > > +
> > > +		i = RTE_MIN(mlx5_dev_to_port_id(dpdk_dev, port_id, i), i);
> > > +		while (i--) {
> > > +			const struct priv *opriv =
> > > +				rte_eth_devices[port_id[i]].data-
> > > >dev_private;
> > > +
> > > +			if (!opriv ||
> > > +			    opriv->domain_id ==
> > > +			    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > > +				continue;
> > > +			priv->domain_id = opriv->domain_id;
> >
> > It looks like for the second port it will use the domain_id of the first port. Is
> that what you intent?
> 
> Yes, it's on purpose. Master and representors of a given device must share
> the same domain ID to let applications know they can create flow rules to
> forward traffic between them all.

But this is not the case in Mellanox devices. On Mellanox devices each PF along w/ its representors has a separate eswitch, and traffic cannot be routed between the switches using flow rules.
For example if we have PF0 along w/ its representor REP0_0 and PF1 along w/ its representor REP1_0 . PF0 and REP0_0 will belong to switch X and PF1 and REP1_0 will belong to switch domain Y. it is also being reflected on the phys_switch_id.

We should have switch domain per PF. 

> 
> > Note - I couldn't test it due to compilation errors:
> >
> >
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5
> _nl.c: In function 'mlx5_nl_switch_info_cb':
> >
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5
> _
> > nl.c:843:8: error: 'IFLA_PHYS_PORT_NAME' undecl ared (first use in this
> function)
> >    case IFLA_PHYS_PORT_NAME:
> >         ^
> >
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5
> _
> > nl.c:843:8: note: each undeclared identifier is  reported only once
> > for each function it appears in
> >
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5
> _
> > nl.c:851:8: error: 'IFLA_PHYS_SWITCH_ID' undecl ared (first use in this
> function)
> >    case IFLA_PHYS_SWITCH_ID:
> >         ^
> >
> > My system info:
> > NAME="Red Hat Enterprise Linux Server"
> > VERSION="7.3 (Maipo)"
> > ID="rhel"
> > ID_LIKE="fedora"
> > VERSION_ID="7.3"
> > PRETTY_NAME="Red Hat Enterprise Linux Server 7.3 (Maipo)"
> > ANSI_COLOR="0;31"
> > CPE_NAME="cpe:/o:redhat:enterprise_linux:7.3:GA:server"
> >
> HOME_URL="https://emea01.safelinks.protection.outlook.com/?url=https%
> 3A%2F%2Fwww.redhat.com%2F&amp;data=02%7C01%7Cshahafs%40mellan
> ox.com%7C661e7b51087b460817c008d5e648bf1e%7Ca652971c7d2e4d9ba6a4
> d149256f461b%7C0%7C0%7C636668122474445351&amp;sdata=Lg8arhiYLvH5L
> 2hef8DVhS8A3fVJ%2B5IZkLIHmqCd%2FmY%3D&amp;reserved=0"
> >
> BUG_REPORT_URL="https://emea01.safelinks.protection.outlook.com/?url=
> https%3A%2F%2Fbugzilla.redhat.com%2F&amp;data=02%7C01%7Cshahafs%
> 40mellanox.com%7C661e7b51087b460817c008d5e648bf1e%7Ca652971c7d2e
> 4d9ba6a4d149256f461b%7C0%7C0%7C636668122474445351&amp;sdata=3Do
> RKjxovM8tOgKLssC1mq2wwfhjpVUZSExXV4ywBEQ%3D&amp;reserved=0"
> >
> > REDHAT_BUGZILLA_PRODUCT="Red Hat Enterprise Linux 7"
> > REDHAT_BUGZILLA_PRODUCT_VERSION=7.3
> > REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux"
> > REDHAT_SUPPORT_PRODUCT_VERSION="7.3"
> > Red Hat Enterprise Linux Server release 7.3 (Maipo) Red Hat Enterprise
> > Linux Server release 7.3 (Maipo)
> 
> OK, I'll redefine in v5 in case they are missing on the host system.
> 
> <snip>
> > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > > 704046270..cc01310e0 100644
> > > --- a/drivers/net/mlx5/mlx5.h
> > > +++ b/drivers/net/mlx5/mlx5.h
> > > @@ -159,6 +159,7 @@ struct priv {
> > >  	struct ibv_context *ctx; /* Verbs context. */
> > >  	struct ibv_device_attr_ex device_attr; /* Device properties. */
> > >  	struct ibv_pd *pd; /* Protection Domain. */
> > > +	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> >
> >
> > Why we need a dedicated entry for the ibdev_name? it is already part of
> priv->ctx->device->name.
> 
> Heh, same reason as the next line below, don't forget those damn
> secondaries which can't dereference local pointers from the primary process
> :)

Right 😊. 

> 
> > >  	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for
> > > secondary */
> <snip>
> > > struct rte_eth_dev_info *info)
> > >  	info->speed_capa = priv->link_speed_capa;
> > >  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
> > >  	mlx5_set_default_params(dev, info);
> > > +	info->switch_info.name = dev->data->name;
> > > +	info->switch_info.domain_id = priv->domain_id;
> > > +	info->switch_info.port_id = priv->representor_id;
> > > +	if (priv->representor) {
> > > +		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
> > > +		uint16_t port_id[i];
> > > +
> > > +		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i),
> > > i);
> > > +		while (i--) {
> > > +			struct priv *opriv =
> > > +				rte_eth_devices[port_id[i]].data-
> > > >dev_private;
> > > +
> > > +			if (!opriv ||
> > > +			    opriv->representor ||
> > > +			    opriv->domain_id != priv->domain_id)
> > > +				continue;
> > > +			/*
> > > +			 * Override switch name with that of the master
> > > +			 * device.
> > > +			 */
> > > +			info->switch_info.name = opriv->dev_data->name;
> > > +			break;
> >
> > According to this logic it means once the master device is closed, all the
> representors are no longer belong to the same switch (switch name of each
> is different) which is not correct.
> 
> They still share the same domain ID, which is what actually matters. The
> switch name is only provided to let applications identify the master
> (control) device in case it's needed.
> 
> > According to your notes it is possible to close master w/o closing the
> representor.
> 
> This allows devices to be probed in any order on a needed basis, not all at
> once. It's done on purpose to pave the way for hotplug support.
> 
> > Why not just storing the master switch name when probing the
> representor and to use it as is on the dev_info?
> 
> The switch name *must* be that of the master device. If the master is not
> probed, there can't be a switch name. However there's no real provision for
> this in the API, so I chose the most acceptable unique name, which is the
> name of the local device. Would you prefer an empty name instead?

The current approach is OK. 
I was just suggesting to skip the loop iteration by saving the switch name on the private structure. 

> 
> Thing is, on mlx5 flow rules can be created directly between representors
> without involving the master device. An empty switch name may be
> misleading in this respect.
> 
> What do you suggest?
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors
  2018-07-10  9:37           ` Adrien Mazarguil
@ 2018-07-10 10:16             ` Shahaf Shuler
  2018-07-10 10:58               ` Adrien Mazarguil
  0 siblings, 1 reply; 100+ messages in thread
From: Shahaf Shuler @ 2018-07-10 10:16 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

Tuesday, July 10, 2018 12:37 PM, Adrien Mazarguil:
> Subject: Re: [PATCH v4 09/10] net/mlx5: add parameter for port
> representors
> 
> On Mon, Jul 09, 2018 at 11:57:37AM +0000, Shahaf Shuler wrote:
> > Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > > Subject: [PATCH v4 09/10] net/mlx5: add parameter for port
> > > representors
> > >
> > > Prior to this patch, all port representors detected on a given
> > > device were probed and Ethernet devices instantiated for each of them.
> > >
> > > This patch adds support for the standard "representor" parameter,
> > > which implies that port representors are not probed by default
> > > anymore, except for the list provided through device arguments.
> > >
> > > (Patch based on prior work from Yuanhan Liu)
> > >
> > > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> > > --
> > > v3 changes:
> > >
> > > - Adapted representor detection to the reworked mlx5_dev_spawn().
> <snip>
> > > @@ -672,7 +679,9 @@ mlx5_uar_init_secondary(struct rte_eth_dev
> *dev)
> > >   *
> > >   * @return
> > >   *   A valid Ethernet device object on success, NULL otherwise and
> rte_errno
> > > - *   is set.
> > > + *   is set. The following error is defined:
> > > + *
> > > + *   EBUSY: device is not supposed to be spawned.
> > >   */
> > >  static struct rte_eth_dev *
> > >  mlx5_dev_spawn(struct rte_device *dpdk_dev, @@ -723,6 +732,26 @@
> > > mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > >  	int own_domain_id = 0;
> > >  	unsigned int i;
> > >
> > > +	/* Determine if this port representor is supposed to be spawned. */
> > > +	if (switch_info->representor && dpdk_dev->devargs) {
> > > +		struct rte_eth_devargs eth_da;
> > > +
> > > +		err = rte_eth_devargs_parse(dpdk_dev->devargs->args,
> > > &eth_da);
> > > +		if (err) {
> > > +			rte_errno = -err;
> > > +			DRV_LOG(ERR, "failed to process device arguments:
> > > %s",
> > > +				strerror(rte_errno));
> > > +			return NULL;
> > > +		}
> > > +		for (i = 0; i < eth_da.nb_representor_ports; ++i)
> > > +			if (eth_da.representor_ports[i] ==
> > > +			    (uint16_t)switch_info->port_name)
> > > +				break;
> > > +		if (i == eth_da.nb_representor_ports) {
> > > +			rte_errno = EBUSY;
> >
> > Why EBUSY is the correct errno? Will another attempts to probe the device
> can be successful?
> 
> That's the definition of EAGAIN :)
> 
> I thought EBUSY in the sense of "don't disturb" would be appropriate. This
> value was also chosen because it is not likely to be returned by any
> intermediate function calls. I've defined EBUSY along with the return value of
> this function for clarity (see above). Any suggestion?

How about ENODEV ?

> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors
  2018-07-10 10:16             ` Shahaf Shuler
@ 2018-07-10 10:58               ` Adrien Mazarguil
  2018-07-10 11:15                 ` Shahaf Shuler
  0 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10 10:58 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

On Tue, Jul 10, 2018 at 10:16:03AM +0000, Shahaf Shuler wrote:
> Tuesday, July 10, 2018 12:37 PM, Adrien Mazarguil:
> > Subject: Re: [PATCH v4 09/10] net/mlx5: add parameter for port
> > representors
> > 
> > On Mon, Jul 09, 2018 at 11:57:37AM +0000, Shahaf Shuler wrote:
> > > Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > > > Subject: [PATCH v4 09/10] net/mlx5: add parameter for port
> > > > representors
> > > >
> > > > Prior to this patch, all port representors detected on a given
> > > > device were probed and Ethernet devices instantiated for each of them.
> > > >
> > > > This patch adds support for the standard "representor" parameter,
> > > > which implies that port representors are not probed by default
> > > > anymore, except for the list provided through device arguments.
> > > >
> > > > (Patch based on prior work from Yuanhan Liu)
> > > >
> > > > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > > Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> > > > --
> > > > v3 changes:
> > > >
> > > > - Adapted representor detection to the reworked mlx5_dev_spawn().
> > <snip>
> > > > @@ -672,7 +679,9 @@ mlx5_uar_init_secondary(struct rte_eth_dev
> > *dev)
> > > >   *
> > > >   * @return
> > > >   *   A valid Ethernet device object on success, NULL otherwise and
> > rte_errno
> > > > - *   is set.
> > > > + *   is set. The following error is defined:
> > > > + *
> > > > + *   EBUSY: device is not supposed to be spawned.
> > > >   */
> > > >  static struct rte_eth_dev *
> > > >  mlx5_dev_spawn(struct rte_device *dpdk_dev, @@ -723,6 +732,26 @@
> > > > mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > > >  	int own_domain_id = 0;
> > > >  	unsigned int i;
> > > >
> > > > +	/* Determine if this port representor is supposed to be spawned. */
> > > > +	if (switch_info->representor && dpdk_dev->devargs) {
> > > > +		struct rte_eth_devargs eth_da;
> > > > +
> > > > +		err = rte_eth_devargs_parse(dpdk_dev->devargs->args,
> > > > &eth_da);
> > > > +		if (err) {
> > > > +			rte_errno = -err;
> > > > +			DRV_LOG(ERR, "failed to process device arguments:
> > > > %s",
> > > > +				strerror(rte_errno));
> > > > +			return NULL;
> > > > +		}
> > > > +		for (i = 0; i < eth_da.nb_representor_ports; ++i)
> > > > +			if (eth_da.representor_ports[i] ==
> > > > +			    (uint16_t)switch_info->port_name)
> > > > +				break;
> > > > +		if (i == eth_da.nb_representor_ports) {
> > > > +			rte_errno = EBUSY;
> > >
> > > Why EBUSY is the correct errno? Will another attempts to probe the device
> > can be successful?
> > 
> > That's the definition of EAGAIN :)
> > 
> > I thought EBUSY in the sense of "don't disturb" would be appropriate. This
> > value was also chosen because it is not likely to be returned by any
> > intermediate function calls. I've defined EBUSY along with the return value of
> > this function for clarity (see above). Any suggestion?
> 
> How about ENODEV ?

Already used by many internal functions, typically returned if the
associated netdevice doesn't exist (e.g. sent to another netns; a fatal
error when probing representors).

We need a unique error code that says "OK, no problem, just not this one".

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors
  2018-07-10 10:13             ` Shahaf Shuler
@ 2018-07-10 10:58               ` Adrien Mazarguil
  2018-07-10 11:17                 ` Shahaf Shuler
  0 siblings, 1 reply; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10 10:58 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Nélio Laranjeiro, Xueming(Steven) Li

On Tue, Jul 10, 2018 at 10:13:25AM +0000, Shahaf Shuler wrote:
> Tuesday, July 10, 2018 12:37 PM, Adrien Mazarguil:
> > Subject: Re: [PATCH v4 07/10] net/mlx5: probe all port representors
> > 
> > On Mon, Jul 09, 2018 at 11:57:29AM +0000, Shahaf Shuler wrote:
> > > Hi Adrien,
> > >
> > >
> > > Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > > > Subject: [PATCH v4 07/10] net/mlx5: probe all port representors
> > > >
> > > > Probe existing port representors in addition to their master device
> > > > and associate them automatically.
> > > >
> > > > To avoid collision between Ethernet devices, they are named as follows:
> > > >
> > > > - "{DBDF}" for master/switch devices.
> > > > - "{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> > > >   representors.
> > > >
> > > > (Patch based on prior work from Yuanhan Liu)
> > > >
> > > > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > > Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > > > Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> > > > Cc: Xueming Li <xuemingl@mellanox.com>
> > > > Cc: Shahaf Shuler <shahafs@mellanox.com>
> > > > --
> > > > v4 changes:
> > > >
> > > > - Fixed domain ID release once the last port using it is closed. Closed
> > > >   devices are not necessarily detached, their presence is not a good
> > > >   indicator. Code was modified to check if they still use their domain IDs
> > > >   before deciding to release it.
> > <snip>
> > > > @@ -883,6 +915,41 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > > >  	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
> > > >  	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK,
> > > > NETLINK_ROUTE);
> > > >  	priv->nl_sn = 0;
> > > > +	priv->representor = !!switch_info->representor;
> > > > +	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> > > > +	priv->representor_id =
> > > > +		switch_info->representor ? switch_info->port_name : -1;
> > > > +	/*
> > > > +	 * Look for sibling devices in order to reuse their switch domain
> > > > +	 * if any, otherwise allocate one.
> > > > +	 */
> > > > +	i = mlx5_dev_to_port_id(dpdk_dev, NULL, 0);
> > > > +	if (i > 0) {
> > > > +		uint16_t port_id[i];
> > > > +
> > > > +		i = RTE_MIN(mlx5_dev_to_port_id(dpdk_dev, port_id, i), i);
> > > > +		while (i--) {
> > > > +			const struct priv *opriv =
> > > > +				rte_eth_devices[port_id[i]].data-
> > > > >dev_private;
> > > > +
> > > > +			if (!opriv ||
> > > > +			    opriv->domain_id ==
> > > > +			    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > > > +				continue;
> > > > +			priv->domain_id = opriv->domain_id;
> > >
> > > It looks like for the second port it will use the domain_id of the first port. Is
> > that what you intent?
> > 
> > Yes, it's on purpose. Master and representors of a given device must share
> > the same domain ID to let applications know they can create flow rules to
> > forward traffic between them all.
> 
> But this is not the case in Mellanox devices. On Mellanox devices each PF along w/ its representors has a separate eswitch, and traffic cannot be routed between the switches using flow rules.
> For example if we have PF0 along w/ its representor REP0_0 and PF1 along w/ its representor REP1_0 . PF0 and REP0_0 will belong to switch X and PF1 and REP1_0 will belong to switch domain Y. it is also being reflected on the phys_switch_id.
> 
> We should have switch domain per PF. 

Looks like I didn't understand your previous comment. I confirm there is no
such issue, one domain ID is allocated per PF/representors group, which are
identified by a common PCI bus address. It's fine because on mlx5, each
physical port exposes its own address, I assumed there was no need to
additionally compare phys_switch_id. Can this happen?

> > > Note - I couldn't test it due to compilation errors:
> > >
> > >
> > /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5
> > _nl.c: In function 'mlx5_nl_switch_info_cb':
> > >
> > /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5
> > _
> > > nl.c:843:8: error: 'IFLA_PHYS_PORT_NAME' undecl ared (first use in this
> > function)
> > >    case IFLA_PHYS_PORT_NAME:
> > >         ^
> > >
> > /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5
> > _
> > > nl.c:843:8: note: each undeclared identifier is  reported only once
> > > for each function it appears in
> > >
> > /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx5
> > _
> > > nl.c:851:8: error: 'IFLA_PHYS_SWITCH_ID' undecl ared (first use in this
> > function)
> > >    case IFLA_PHYS_SWITCH_ID:
> > >         ^
> > >
> > > My system info:
> > > NAME="Red Hat Enterprise Linux Server"
> > > VERSION="7.3 (Maipo)"
> > > ID="rhel"
> > > ID_LIKE="fedora"
> > > VERSION_ID="7.3"
> > > PRETTY_NAME="Red Hat Enterprise Linux Server 7.3 (Maipo)"
> > > ANSI_COLOR="0;31"
> > > CPE_NAME="cpe:/o:redhat:enterprise_linux:7.3:GA:server"
> > >
> > HOME_URL="https://emea01.safelinks.protection.outlook.com/?url=https%
> > 3A%2F%2Fwww.redhat.com%2F&amp;data=02%7C01%7Cshahafs%40mellan
> > ox.com%7C661e7b51087b460817c008d5e648bf1e%7Ca652971c7d2e4d9ba6a4
> > d149256f461b%7C0%7C0%7C636668122474445351&amp;sdata=Lg8arhiYLvH5L
> > 2hef8DVhS8A3fVJ%2B5IZkLIHmqCd%2FmY%3D&amp;reserved=0"
> > >
> > BUG_REPORT_URL="https://emea01.safelinks.protection.outlook.com/?url=
> > https%3A%2F%2Fbugzilla.redhat.com%2F&amp;data=02%7C01%7Cshahafs%
> > 40mellanox.com%7C661e7b51087b460817c008d5e648bf1e%7Ca652971c7d2e
> > 4d9ba6a4d149256f461b%7C0%7C0%7C636668122474445351&amp;sdata=3Do
> > RKjxovM8tOgKLssC1mq2wwfhjpVUZSExXV4ywBEQ%3D&amp;reserved=0"
> > >
> > > REDHAT_BUGZILLA_PRODUCT="Red Hat Enterprise Linux 7"
> > > REDHAT_BUGZILLA_PRODUCT_VERSION=7.3
> > > REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux"
> > > REDHAT_SUPPORT_PRODUCT_VERSION="7.3"
> > > Red Hat Enterprise Linux Server release 7.3 (Maipo) Red Hat Enterprise
> > > Linux Server release 7.3 (Maipo)
> > 
> > OK, I'll redefine in v5 in case they are missing on the host system.
> > 
> > <snip>
> > > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > > > 704046270..cc01310e0 100644
> > > > --- a/drivers/net/mlx5/mlx5.h
> > > > +++ b/drivers/net/mlx5/mlx5.h
> > > > @@ -159,6 +159,7 @@ struct priv {
> > > >  	struct ibv_context *ctx; /* Verbs context. */
> > > >  	struct ibv_device_attr_ex device_attr; /* Device properties. */
> > > >  	struct ibv_pd *pd; /* Protection Domain. */
> > > > +	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> > >
> > >
> > > Why we need a dedicated entry for the ibdev_name? it is already part of
> > priv->ctx->device->name.
> > 
> > Heh, same reason as the next line below, don't forget those damn
> > secondaries which can't dereference local pointers from the primary process
> > :)
> 
> Right 😊. 
> 
> > 
> > > >  	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for
> > > > secondary */
> > <snip>
> > > > struct rte_eth_dev_info *info)
> > > >  	info->speed_capa = priv->link_speed_capa;
> > > >  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
> > > >  	mlx5_set_default_params(dev, info);
> > > > +	info->switch_info.name = dev->data->name;
> > > > +	info->switch_info.domain_id = priv->domain_id;
> > > > +	info->switch_info.port_id = priv->representor_id;
> > > > +	if (priv->representor) {
> > > > +		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
> > > > +		uint16_t port_id[i];
> > > > +
> > > > +		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i),
> > > > i);
> > > > +		while (i--) {
> > > > +			struct priv *opriv =
> > > > +				rte_eth_devices[port_id[i]].data-
> > > > >dev_private;
> > > > +
> > > > +			if (!opriv ||
> > > > +			    opriv->representor ||
> > > > +			    opriv->domain_id != priv->domain_id)
> > > > +				continue;
> > > > +			/*
> > > > +			 * Override switch name with that of the master
> > > > +			 * device.
> > > > +			 */
> > > > +			info->switch_info.name = opriv->dev_data->name;
> > > > +			break;
> > >
> > > According to this logic it means once the master device is closed, all the
> > representors are no longer belong to the same switch (switch name of each
> > is different) which is not correct.
> > 
> > They still share the same domain ID, which is what actually matters. The
> > switch name is only provided to let applications identify the master
> > (control) device in case it's needed.
> > 
> > > According to your notes it is possible to close master w/o closing the
> > representor.
> > 
> > This allows devices to be probed in any order on a needed basis, not all at
> > once. It's done on purpose to pave the way for hotplug support.
> > 
> > > Why not just storing the master switch name when probing the
> > representor and to use it as is on the dev_info?
> > 
> > The switch name *must* be that of the master device. If the master is not
> > probed, there can't be a switch name. However there's no real provision for
> > this in the API, so I chose the most acceptable unique name, which is the
> > name of the local device. Would you prefer an empty name instead?
> 
> The current approach is OK. 
> I was just suggesting to skip the loop iteration by saving the switch name on the private structure. 

This is unsafe, if the master device is never probed or somehow replaced by
a different device with no relationship, this information could be wrong.

Keep in mind these ethdev names are just identifiers. The only requirement
is that they must be unique, however anything can be written in there. If
some name is not taken, another device can use it.

> > Thing is, on mlx5 flow rules can be created directly between representors
> > without involving the master device. An empty switch name may be
> > misleading in this respect.
> > 
> > What do you suggest?

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 09/10] net/mlx5: add parameter for port representors
  2018-07-10 10:58               ` Adrien Mazarguil
@ 2018-07-10 11:15                 ` Shahaf Shuler
  0 siblings, 0 replies; 100+ messages in thread
From: Shahaf Shuler @ 2018-07-10 11:15 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

Tuesday, July 10, 2018 1:58 PM, Adrien Mazarguil:
> Subject: Re: [PATCH v4 09/10] net/mlx5: add parameter for port
> representors
> 
> On Tue, Jul 10, 2018 at 10:16:03AM +0000, Shahaf Shuler wrote:
> > Tuesday, July 10, 2018 12:37 PM, Adrien Mazarguil:
> > > Subject: Re: [PATCH v4 09/10] net/mlx5: add parameter for port
> > > representors
> > >
> > > On Mon, Jul 09, 2018 at 11:57:37AM +0000, Shahaf Shuler wrote:
> > > > Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > > > > Subject: [PATCH v4 09/10] net/mlx5: add parameter for port
> > > > > representors
> > > > >
> > > > > Prior to this patch, all port representors detected on a given
> > > > > device were probed and Ethernet devices instantiated for each of
> them.
> > > > >
> > > > > This patch adds support for the standard "representor"
> > > > > parameter, which implies that port representors are not probed
> > > > > by default anymore, except for the list provided through device
> arguments.
> > > > >
> > > > > (Patch based on prior work from Yuanhan Liu)
> > > > >
> > > > > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > > > Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> > > > > --
> > > > > v3 changes:
> > > > >
> > > > > - Adapted representor detection to the reworked
> mlx5_dev_spawn().
> > > <snip>
> > > > > @@ -672,7 +679,9 @@ mlx5_uar_init_secondary(struct rte_eth_dev
> > > *dev)
> > > > >   *
> > > > >   * @return
> > > > >   *   A valid Ethernet device object on success, NULL otherwise and
> > > rte_errno
> > > > > - *   is set.
> > > > > + *   is set. The following error is defined:
> > > > > + *
> > > > > + *   EBUSY: device is not supposed to be spawned.
> > > > >   */
> > > > >  static struct rte_eth_dev *
> > > > >  mlx5_dev_spawn(struct rte_device *dpdk_dev, @@ -723,6 +732,26
> > > > > @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > > > >  	int own_domain_id = 0;
> > > > >  	unsigned int i;
> > > > >
> > > > > +	/* Determine if this port representor is supposed to be
> spawned. */
> > > > > +	if (switch_info->representor && dpdk_dev->devargs) {
> > > > > +		struct rte_eth_devargs eth_da;
> > > > > +
> > > > > +		err = rte_eth_devargs_parse(dpdk_dev->devargs-
> >args,
> > > > > &eth_da);
> > > > > +		if (err) {
> > > > > +			rte_errno = -err;
> > > > > +			DRV_LOG(ERR, "failed to process device
> arguments:
> > > > > %s",
> > > > > +				strerror(rte_errno));
> > > > > +			return NULL;
> > > > > +		}
> > > > > +		for (i = 0; i < eth_da.nb_representor_ports; ++i)
> > > > > +			if (eth_da.representor_ports[i] ==
> > > > > +			    (uint16_t)switch_info->port_name)
> > > > > +				break;
> > > > > +		if (i == eth_da.nb_representor_ports) {
> > > > > +			rte_errno = EBUSY;
> > > >
> > > > Why EBUSY is the correct errno? Will another attempts to probe the
> > > > device
> > > can be successful?
> > >
> > > That's the definition of EAGAIN :)
> > >
> > > I thought EBUSY in the sense of "don't disturb" would be
> > > appropriate. This value was also chosen because it is not likely to
> > > be returned by any intermediate function calls. I've defined EBUSY
> > > along with the return value of this function for clarity (see above). Any
> suggestion?
> >
> > How about ENODEV ?
> 
> Already used by many internal functions, typically returned if the associated
> netdevice doesn't exist (e.g. sent to another netns; a fatal error when
> probing representors).
> 
> We need a unique error code that says "OK, no problem, just not this one".

OK, we can keep the EBUSY. 

> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [dpdk-dev] [PATCH v4 07/10] net/mlx5: probe all port representors
  2018-07-10 10:58               ` Adrien Mazarguil
@ 2018-07-10 11:17                 ` Shahaf Shuler
  0 siblings, 0 replies; 100+ messages in thread
From: Shahaf Shuler @ 2018-07-10 11:17 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Nélio Laranjeiro, Xueming(Steven) Li

Tuesday, July 10, 2018 1:59 PM, Adrien Mazarguil:
> Subject: Re: [PATCH v4 07/10] net/mlx5: probe all port representors
> 
> On Tue, Jul 10, 2018 at 10:13:25AM +0000, Shahaf Shuler wrote:
> > Tuesday, July 10, 2018 12:37 PM, Adrien Mazarguil:
> > > Subject: Re: [PATCH v4 07/10] net/mlx5: probe all port representors
> > >
> > > On Mon, Jul 09, 2018 at 11:57:29AM +0000, Shahaf Shuler wrote:
> > > > Hi Adrien,
> > > >
> > > >
> > > > Thursday, July 5, 2018 11:46 AM, Adrien Mazarguil:
> > > > > Subject: [PATCH v4 07/10] net/mlx5: probe all port representors
> > > > >
> > > > > Probe existing port representors in addition to their master
> > > > > device and associate them automatically.
> > > > >
> > > > > To avoid collision between Ethernet devices, they are named as
> follows:
> > > > >
> > > > > - "{DBDF}" for master/switch devices.
> > > > > - "{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> > > > >   representors.
> > > > >
> > > > > (Patch based on prior work from Yuanhan Liu)
> > > > >
> > > > > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > > > Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > > > > Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> > > > > Cc: Xueming Li <xuemingl@mellanox.com>
> > > > > Cc: Shahaf Shuler <shahafs@mellanox.com>
> > > > > --
> > > > > v4 changes:
> > > > >
> > > > > - Fixed domain ID release once the last port using it is closed. Closed
> > > > >   devices are not necessarily detached, their presence is not a good
> > > > >   indicator. Code was modified to check if they still use their domain
> IDs
> > > > >   before deciding to release it.
> > > <snip>
> > > > > @@ -883,6 +915,41 @@ mlx5_dev_spawn(struct rte_device
> *dpdk_dev,
> > > > >  	priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
> > > > >  	priv->nl_socket_route =	mlx5_nl_init(RTMGRP_LINK,
> > > > > NETLINK_ROUTE);
> > > > >  	priv->nl_sn = 0;
> > > > > +	priv->representor = !!switch_info->representor;
> > > > > +	priv->domain_id =
> RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> > > > > +	priv->representor_id =
> > > > > +		switch_info->representor ? switch_info->port_name
> : -1;
> > > > > +	/*
> > > > > +	 * Look for sibling devices in order to reuse their switch
> domain
> > > > > +	 * if any, otherwise allocate one.
> > > > > +	 */
> > > > > +	i = mlx5_dev_to_port_id(dpdk_dev, NULL, 0);
> > > > > +	if (i > 0) {
> > > > > +		uint16_t port_id[i];
> > > > > +
> > > > > +		i = RTE_MIN(mlx5_dev_to_port_id(dpdk_dev,
> port_id, i), i);
> > > > > +		while (i--) {
> > > > > +			const struct priv *opriv =
> > > > > +				rte_eth_devices[port_id[i]].data-
> > > > > >dev_private;
> > > > > +
> > > > > +			if (!opriv ||
> > > > > +			    opriv->domain_id ==
> > > > > +
> RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > > > > +				continue;
> > > > > +			priv->domain_id = opriv->domain_id;
> > > >
> > > > It looks like for the second port it will use the domain_id of the
> > > > first port. Is
> > > that what you intent?
> > >
> > > Yes, it's on purpose. Master and representors of a given device must
> > > share the same domain ID to let applications know they can create
> > > flow rules to forward traffic between them all.
> >
> > But this is not the case in Mellanox devices. On Mellanox devices each PF
> along w/ its representors has a separate eswitch, and traffic cannot be
> routed between the switches using flow rules.
> > For example if we have PF0 along w/ its representor REP0_0 and PF1 along
> w/ its representor REP1_0 . PF0 and REP0_0 will belong to switch X and PF1
> and REP1_0 will belong to switch domain Y. it is also being reflected on the
> phys_switch_id.
> >
> > We should have switch domain per PF.
> 
> Looks like I didn't understand your previous comment. I confirm there is no
> such issue, one domain ID is allocated per PF/representors group, which are
> identified by a common PCI bus address. It's fine because on mlx5, each
> physical port exposes its own address, I assumed there was no need to
> additionally compare phys_switch_id. Can this happen?

OK great. It is OK, the PF has only a single switch domain on which all its representors are connected. 

> 
> > > > Note - I couldn't test it due to compilation errors:
> > > >
> > > >
> > >
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx
> > > 5
> > > _nl.c: In function 'mlx5_nl_switch_info_cb':
> > > >
> > >
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx
> > > 5
> > > _
> > > > nl.c:843:8: error: 'IFLA_PHYS_PORT_NAME' undecl ared (first use in
> > > > this
> > > function)
> > > >    case IFLA_PHYS_PORT_NAME:
> > > >         ^
> > > >
> > >
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx
> > > 5
> > > _
> > > > nl.c:843:8: note: each undeclared identifier is  reported only
> > > > once for each function it appears in
> > > >
> > >
> /.autodirect/swgwork/shahafs/workspace/dpdk.org/drivers/net/mlx5/mlx
> > > 5
> > > _
> > > > nl.c:851:8: error: 'IFLA_PHYS_SWITCH_ID' undecl ared (first use in
> > > > this
> > > function)
> > > >    case IFLA_PHYS_SWITCH_ID:
> > > >         ^
> > > >
> > > > My system info:
> > > > NAME="Red Hat Enterprise Linux Server"
> > > > VERSION="7.3 (Maipo)"
> > > > ID="rhel"
> > > > ID_LIKE="fedora"
> > > > VERSION_ID="7.3"
> > > > PRETTY_NAME="Red Hat Enterprise Linux Server 7.3 (Maipo)"
> > > > ANSI_COLOR="0;31"
> > > > CPE_NAME="cpe:/o:redhat:enterprise_linux:7.3:GA:server"
> > > >
> > >
> HOME_URL="https://emea01.safelinks.protection.outlook.com/?url=https
> > > %
> 3A%2F%2Fwww.redhat.com%2F&amp;data=02%7C01%7Cshahafs%40mellan
> > >
> ox.com%7C661e7b51087b460817c008d5e648bf1e%7Ca652971c7d2e4d9ba6a4
> > >
> d149256f461b%7C0%7C0%7C636668122474445351&amp;sdata=Lg8arhiYLvH5L
> > > 2hef8DVhS8A3fVJ%2B5IZkLIHmqCd%2FmY%3D&amp;reserved=0"
> > > >
> > >
> BUG_REPORT_URL="https://emea01.safelinks.protection.outlook.com/?url
> > > =
> https%3A%2F%2Fbugzilla.redhat.com%2F&amp;data=02%7C01%7Cshahafs%
> > >
> 40mellanox.com%7C661e7b51087b460817c008d5e648bf1e%7Ca652971c7d2e
> > >
> 4d9ba6a4d149256f461b%7C0%7C0%7C636668122474445351&amp;sdata=3Do
> > >
> RKjxovM8tOgKLssC1mq2wwfhjpVUZSExXV4ywBEQ%3D&amp;reserved=0"
> > > >
> > > > REDHAT_BUGZILLA_PRODUCT="Red Hat Enterprise Linux 7"
> > > > REDHAT_BUGZILLA_PRODUCT_VERSION=7.3
> > > > REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux"
> > > > REDHAT_SUPPORT_PRODUCT_VERSION="7.3"
> > > > Red Hat Enterprise Linux Server release 7.3 (Maipo) Red Hat
> > > > Enterprise Linux Server release 7.3 (Maipo)
> > >
> > > OK, I'll redefine in v5 in case they are missing on the host system.
> > >
> > > <snip>
> > > > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> > > > > index
> > > > > 704046270..cc01310e0 100644
> > > > > --- a/drivers/net/mlx5/mlx5.h
> > > > > +++ b/drivers/net/mlx5/mlx5.h
> > > > > @@ -159,6 +159,7 @@ struct priv {
> > > > >  	struct ibv_context *ctx; /* Verbs context. */
> > > > >  	struct ibv_device_attr_ex device_attr; /* Device properties.
> */
> > > > >  	struct ibv_pd *pd; /* Protection Domain. */
> > > > > +	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device
> name. */
> > > >
> > > >
> > > > Why we need a dedicated entry for the ibdev_name? it is already
> > > > part of
> > > priv->ctx->device->name.
> > >
> > > Heh, same reason as the next line below, don't forget those damn
> > > secondaries which can't dereference local pointers from the primary
> > > process
> > > :)
> >
> > Right 😊.
> >
> > >
> > > > >  	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path
> for
> > > > > secondary */
> > > <snip>
> > > > > struct rte_eth_dev_info *info)
> > > > >  	info->speed_capa = priv->link_speed_capa;
> > > > >  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
> > > > >  	mlx5_set_default_params(dev, info);
> > > > > +	info->switch_info.name = dev->data->name;
> > > > > +	info->switch_info.domain_id = priv->domain_id;
> > > > > +	info->switch_info.port_id = priv->representor_id;
> > > > > +	if (priv->representor) {
> > > > > +		unsigned int i = mlx5_dev_to_port_id(dev->device,
> NULL, 0);
> > > > > +		uint16_t port_id[i];
> > > > > +
> > > > > +		i = RTE_MIN(mlx5_dev_to_port_id(dev->device,
> port_id, i),
> > > > > i);
> > > > > +		while (i--) {
> > > > > +			struct priv *opriv =
> > > > > +				rte_eth_devices[port_id[i]].data-
> > > > > >dev_private;
> > > > > +
> > > > > +			if (!opriv ||
> > > > > +			    opriv->representor ||
> > > > > +			    opriv->domain_id != priv->domain_id)
> > > > > +				continue;
> > > > > +			/*
> > > > > +			 * Override switch name with that of the
> master
> > > > > +			 * device.
> > > > > +			 */
> > > > > +			info->switch_info.name = opriv->dev_data-
> >name;
> > > > > +			break;
> > > >
> > > > According to this logic it means once the master device is closed,
> > > > all the
> > > representors are no longer belong to the same switch (switch name of
> > > each is different) which is not correct.
> > >
> > > They still share the same domain ID, which is what actually matters.
> > > The switch name is only provided to let applications identify the
> > > master
> > > (control) device in case it's needed.
> > >
> > > > According to your notes it is possible to close master w/o closing
> > > > the
> > > representor.
> > >
> > > This allows devices to be probed in any order on a needed basis, not
> > > all at once. It's done on purpose to pave the way for hotplug support.
> > >
> > > > Why not just storing the master switch name when probing the
> > > representor and to use it as is on the dev_info?
> > >
> > > The switch name *must* be that of the master device. If the master
> > > is not probed, there can't be a switch name. However there's no real
> > > provision for this in the API, so I chose the most acceptable unique
> > > name, which is the name of the local device. Would you prefer an empty
> name instead?
> >
> > The current approach is OK.
> > I was just suggesting to skip the loop iteration by saving the switch name on
> the private structure.
> 
> This is unsafe, if the master device is never probed or somehow replaced by
> a different device with no relationship, this information could be wrong.
> 
> Keep in mind these ethdev names are just identifiers. The only requirement
> is that they must be unique, however anything can be written in there. If
> some name is not taken, another device can use it.
> 
> > > Thing is, on mlx5 flow rules can be created directly between
> > > representors without involving the master device. An empty switch
> > > name may be misleading in this respect.
> > >
> > > What do you suggest?
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v5 00/10] net/mlx5: add port representor support
  2018-07-05  8:45     ` [dpdk-dev] [PATCH v4 00/10] net/mlx5: add port representor support Adrien Mazarguil
                         ` (9 preceding siblings ...)
  2018-07-05  8:45       ` [dpdk-dev] [PATCH v4 10/10] net/mlx5: support negative identifiers " Adrien Mazarguil
@ 2018-07-10 16:04       ` Adrien Mazarguil
  2018-07-10 16:04         ` [dpdk-dev] [PATCH v5 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
                           ` (10 more replies)
  10 siblings, 11 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10 16:04 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This series adds support for port (VF) representors to the mlx5 PMD, which
can be instantiated using the standard "representor" device parameter.

Note the PMD only probes existing representors which exist as Verbs devices;
their creation is part of the host system configuration.

v5 changes:

- Fixed and added missing HAVE_* definitions to Makefile for systems that do
  not expose them. Series now compiles fine down to RHEL 7.2 inclusive.

v4 changes:

- Fixed domain ID release that did not work, see relevant patch.
- Rebased series.

v3 changes:

- Added the following patches:
  - net/mlx5: drop useless support for several Verbs ports
  - net/mlx5: probe port representors in natural order
  - net/mlx5: support negative identifiers for port representors
- See individual patches for details.
- Rebased series.

v2 changes:

- See individual patches for details.
- Rebased series.

Adrien Mazarguil (10):
  net/mlx5: rename confusing object in probe code
  net/mlx5: remove redundant objects in probe code
  net/mlx5: drop useless support for several Verbs ports
  net/mlx5: split PCI from generic probing code
  net/mlx5: re-indent generic probing function
  net/mlx5: add port representor awareness
  net/mlx5: probe all port representors
  net/mlx5: probe port representors in natural order
  net/mlx5: add parameter for port representors
  net/mlx5: support negative identifiers for port representors

 doc/guides/nics/mlx5.rst                |   12 +
 doc/guides/prog_guide/poll_mode_drv.rst |    2 +
 drivers/net/mlx5/Makefile               |   45 ++
 drivers/net/mlx5/mlx5.c                 | 1108 ++++++++++++++++----------
 drivers/net/mlx5/mlx5.h                 |   29 +-
 drivers/net/mlx5/mlx5_ethdev.c          |  135 +++-
 drivers/net/mlx5/mlx5_mac.c             |    2 +-
 drivers/net/mlx5/mlx5_nl.c              |  308 ++++++-
 drivers/net/mlx5/mlx5_stats.c           |    6 +-
 drivers/net/mlx5/mlx5_txq.c             |    2 +-
 10 files changed, 1175 insertions(+), 474 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v5 01/10] net/mlx5: rename confusing object in probe code
  2018-07-10 16:04       ` [dpdk-dev] [PATCH v5 00/10] net/mlx5: add port representor support Adrien Mazarguil
@ 2018-07-10 16:04         ` Adrien Mazarguil
  2018-07-10 16:04         ` [dpdk-dev] [PATCH v5 02/10] net/mlx5: remove redundant objects " Adrien Mazarguil
                           ` (9 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10 16:04 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

There are several attribute objects in this function:

- IB device attributes (struct ibv_device_attr_ex device_attr).
- Direct Verbs attributes (struct mlx5dv_context attrs_out).
- Port attributes (struct ibv_port_attr).
- IB device attributes again (struct ibv_device_attr_ex device_attr_ex).

"attrs_out" is both odd and initialized using a nonstandard syntax. Rename
it "dv_attr" for consistency.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
--
v2 changes:

- Fixed ctx -> attr_ctx in mlx5_pci_probe().
---
 drivers/net/mlx5/mlx5.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d081bdd05..22cbce8d5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -654,6 +654,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
 	struct ibv_context *attr_ctx = NULL;
 	struct ibv_device_attr_ex device_attr;
@@ -670,7 +671,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
 	int i;
-	struct mlx5dv_context attrs_out = {0};
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
@@ -736,21 +736,21 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	ibv_dev = list[i];
 	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
 	/*
 	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
 	 * as all ConnectX-5 devices.
 	 */
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
-	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
-		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
+	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
 			mps = MLX5_MPW_ENHANCED;
 		} else {
@@ -762,14 +762,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		mps = MLX5_MPW_DISABLED;
 	}
 #ifdef HAVE_IBV_MLX5_MOD_SWP
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
-		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
+		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
 	DRV_LOG(DEBUG, "SWP support: %u", swp);
 #endif
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
 		struct mlx5dv_striding_rq_caps mprq_caps =
-			attrs_out.striding_rq_caps;
+			dv_attr.striding_rq_caps;
 
 		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
 			mprq_caps.min_single_stride_log_num_of_bytes);
@@ -794,15 +794,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 #endif
 	if (RTE_CACHE_LINE_SIZE == 128 &&
-	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
+	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
-	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
-		tunnel_en = ((attrs_out.tunnel_offloads_caps &
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
+		tunnel_en = ((dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
-			     (attrs_out.tunnel_offloads_caps &
+			     (dv_attr.tunnel_offloads_caps &
 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
 	}
 	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
@@ -812,9 +812,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		"tunnel offloading disabled due to old OFED/rdma-core version");
 #endif
 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
-	mpls_en = ((attrs_out.tunnel_offloads_caps &
+	mpls_en = ((dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
-		   (attrs_out.tunnel_offloads_caps &
+		   (dv_attr.tunnel_offloads_caps &
 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
 	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
 		mpls_en ? "" : "not ");
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v5 02/10] net/mlx5: remove redundant objects in probe code
  2018-07-10 16:04       ` [dpdk-dev] [PATCH v5 00/10] net/mlx5: add port representor support Adrien Mazarguil
  2018-07-10 16:04         ` [dpdk-dev] [PATCH v5 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
@ 2018-07-10 16:04         ` Adrien Mazarguil
  2018-07-10 16:04         ` [dpdk-dev] [PATCH v5 03/10] net/mlx5: drop useless support for several Verbs ports Adrien Mazarguil
                           ` (8 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10 16:04 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev

This patch gets rid of redundant calls to open the device and query its
attributes in order to simplify the code.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
--
v2 changes:

- Minor indent fix on existing code.
---
 drivers/net/mlx5/mlx5.c | 64 +++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 22cbce8d5..4e7f29f5b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -654,10 +654,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
 	struct ibv_device **list = NULL;
 	struct ibv_device *ibv_dev;
+	struct ibv_context *ctx = NULL;
+	struct ibv_device_attr_ex attr;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	int err = 0;
-	struct ibv_context *attr_ctx = NULL;
-	struct ibv_device_attr_ex device_attr;
 	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
@@ -714,12 +714,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
 		      (pci_dev->id.device_id ==
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		attr_ctx = mlx5_glue->open_device(list[i]);
+		ctx = mlx5_glue->open_device(list[i]);
 		rte_errno = errno;
 		err = rte_errno;
 		break;
 	}
-	if (attr_ctx == NULL) {
+	if (ctx == NULL) {
 		switch (err) {
 		case 0:
 			DRV_LOG(ERR,
@@ -748,7 +748,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(attr_ctx, &dv_attr);
+	mlx5_glue->dv_query_device(ctx, &dv_attr);
 	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
 		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
@@ -822,23 +822,20 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
+	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	if (err) {
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected",
-		device_attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
 		char name[RTE_ETH_NAME_MAX_LEN];
 		int len;
 		uint32_t port = i + 1; /* ports are indexed from one */
-		struct ibv_context *ctx = NULL;
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
 		struct rte_eth_dev *eth_dev = NULL;
-		struct ibv_device_attr_ex device_attr_ex;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -865,7 +862,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
 			 pci_dev->addr.domain, pci_dev->addr.bus,
 			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (device_attr.orig_attr.phys_port_cnt > 1)
+		if (attr.orig_attr.phys_port_cnt > 1)
 			snprintf(name + len, sizeof(name), " port %u", i);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
@@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			continue;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		ctx = mlx5_glue->open_device(ibv_dev);
+		if (!ctx)
+			ctx = mlx5_glue->open_device(ibv_dev);
 		if (ctx == NULL) {
 			err = ENODEV;
 			goto port_error;
@@ -949,7 +947,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = device_attr;
+		priv->device_attr = attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
@@ -960,17 +958,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				strerror(rte_errno));
 			goto port_error;
 		}
-		err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
-		if (err) {
-			DRV_LOG(ERR, "ibv_query_device_ex() failed");
-			goto port_error;
-		}
-		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
+		config.hw_csum = !!(attr.device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!(device_attr.max_counter_sets);
+		config.flow_counter_en = !!attr.max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -978,7 +971,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
+			attr.rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -986,29 +979,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr.raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
+		config.hw_padding = !!attr.rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
-			      (device_attr_ex.tso_caps.supported_qpts &
-			      (1 << IBV_QPT_RAW_PACKET)));
+		config.tso = (attr.tso_caps.max_tso > 0 &&
+			      (attr.tso_caps.supported_qpts &
+			       (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz =
-					device_attr_ex.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr.tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1170,14 +1162,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
+		/*
+		 * Each eth_dev instance is assigned its own Verbs context,
+		 * since this one is consumed, let the next iteration open
+		 * another.
+		 */
+		ctx = NULL;
 		continue;
 port_error:
 		if (priv)
 			rte_free(priv);
 		if (pd)
 			claim_zero(mlx5_glue->dealloc_pd(pd));
-		if (ctx)
-			claim_zero(mlx5_glue->close_device(ctx));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
 		break;
@@ -1189,8 +1185,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	 * way to enumerate the registered ethdevs to free the previous ones.
 	 */
 error:
-	if (attr_ctx)
-		claim_zero(mlx5_glue->close_device(attr_ctx));
+	if (ctx)
+		claim_zero(mlx5_glue->close_device(ctx));
 	if (list)
 		mlx5_glue->free_device_list(list);
 	if (err) {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [dpdk-dev] [PATCH v5 03/10] net/mlx5: drop useless support for several Verbs ports
  2018-07-10 16:04       ` [dpdk-dev] [PATCH v5 00/10] net/mlx5: add port representor support Adrien Mazarguil
  2018-07-10 16:04         ` [dpdk-dev] [PATCH v5 01/10] net/mlx5: rename confusing object in probe code Adrien Mazarguil
  2018-07-10 16:04         ` [dpdk-dev] [PATCH v5 02/10] net/mlx5: remove redundant objects " Adrien Mazarguil
@ 2018-07-10 16:04         ` Adrien Mazarguil
  2018-07-10 16:04         ` [dpdk-dev] [PATCH v5 04/10] net/mlx5: split PCI from generic probing code Adrien Mazarguil
                           ` (7 subsequent siblings)
  10 siblings, 0 replies; 100+ messages in thread
From: Adrien Mazarguil @ 2018-07-10 16:04 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Xueming Li

Unlike mlx4 from which this capability was inherited, mlx5 devices expose
exactly one Verbs port per PCI bus address. Each physical port gets
assigned its own bus address with a single Verbs port.

While harmless, this code requires an extra loop that would get in the way
of subsequent refactoring.

No functional impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Shahaf Shuler <shahafs@mellanox.com>
Cc: Xueming Li <xuemingl@mellanox.com>
--
v3 changes:

This patch was not present in prior revisions. As discussed [1], it was
added after finally deciding to remove this support.

[1] https://mails.dpdk.org/archives/dev/2018-June/105661.html
---
 drivers/net/mlx5/mlx5.c        | 96 +++++++++++++------------------------
 drivers/net/mlx5/mlx5.h        |  1 -
 drivers/net/mlx5/mlx5_ethdev.c |  2 +-
 drivers/net/mlx5/mlx5_txq.c    |  2 +-
 4 files changed, 34 insertions(+), 67 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 4e7f29f5b..717d8b268 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -652,11 +652,13 @@ static int
 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	       struct rte_pci_device *pci_dev)
 {
-	struct ibv_device **list = NULL;
-	struct ibv_device *ibv_dev;
+	struct ibv_device **list;
 	struct ibv_context *ctx = NULL;
 	struct ibv_device_attr_ex attr;
+	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct rte_eth_dev *eth_dev = NULL;
+	struct priv *priv = NULL;
 	int err = 0;
 	unsigned int vf = 0;
 	unsigned int mps;
@@ -719,6 +721,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		err = rte_errno;
 		break;
 	}
+	mlx5_glue->free_device_list(list);
 	if (ctx == NULL) {
 		switch (err) {
 		case 0:
@@ -733,7 +736,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		}
 		goto error;
 	}
-	ibv_dev = list[i];
 	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
@@ -827,15 +829,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		DEBUG("ibv_query_device_ex() failed");
 		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+	{
 		char name[RTE_ETH_NAME_MAX_LEN];
-		int len;
-		uint32_t port = i + 1; /* ports are indexed from one */
 		struct ibv_port_attr port_attr;
-		struct ibv_pd *pd = NULL;
-		struct priv *priv = NULL;
-		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -859,11 +855,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
+		snprintf(name, sizeof(name), PCI_PRI_FMT,
 			 pci_dev->addr.domain, pci_dev->addr.bus,
 			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (attr.orig_attr.phys_port_cnt > 1)
-			snprintf(name + len, sizeof(name), " port %u", i);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -901,31 +895,22 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
 			rte_eth_dev_probing_finish(eth_dev);
-			continue;
-		}
-		DRV_LOG(DEBUG, "using port %u", port);
-		if (!ctx)
-			ctx = mlx5_glue->open_device(ibv_dev);
-		if (ctx == NULL) {
-			err = ENODEV;
-			goto port_error;
+			claim_zero(mlx5_glue->close_device(ctx));
+			return 0;
 		}
 		/* Check port status. */
-		err = mlx5_glue->query_port(ctx, port, &port_attr);
+		err = mlx5_glue->query_port(ctx, 1, &port_attr);
 		if (err) {
 			DRV_LOG(ERR, "port query failed: %s", strerror(err));
-			goto port_error;
+			goto error;
 		}
 		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-			DRV_LOG(ERR,
-				"port %d is not configured in Ethernet mode",
-				port);
+			DRV_LOG(ERR, "port is not configured in Ethernet mode");
 			err = EINVAL;
-			goto port_error;
+			goto error;
 		}
 		if (port_attr.state != IBV_PORT_ACTIVE)
-			DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
-				port,
+			DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
 				mlx5_glue->port_state_str(port_attr.state),
 				port_attr.state);
 		/* Allocate protection domain. */
@@ -933,7 +918,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (pd == NULL) {
 			DRV_LOG(ERR, "PD allocation failure");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		/* from rte_ethdev.c */
 		priv = rte_zmalloc("ethdev private structure",
@@ -942,13 +927,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (priv == NULL) {
 			DRV_LOG(ERR, "priv allocation failure");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
 		priv->device_attr = attr;
-		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
 		err = mlx5_args(&config, pci_dev->device.devargs);
@@ -956,7 +940,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
 				strerror(rte_errno));
-			goto port_error;
+			goto error;
 		}
 		config.hw_csum = !!(attr.device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
@@ -1006,7 +990,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				"multi-packet send not supported on this device"
 				" (" MLX5_TXQ_MPW_EN ")");
 			err = ENOTSUP;
-			goto port_error;
+			goto error;
 		}
 		DRV_LOG(INFO, "%s MPS is %s",
 			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
@@ -1038,7 +1022,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		if (eth_dev == NULL) {
 			DRV_LOG(ERR, "can not allocate rte ethdev");
 			err = ENOMEM;
-			goto port_error;
+			goto error;
 		}
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
@@ -1049,7 +1033,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
 			err = rte_errno;
-			goto port_error;