DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors
@ 2021-09-27  8:32 Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
                   ` (10 more replies)
  0 siblings, 11 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl

This patch set supports the number of representors of a PF being more than 255.
CX6 and the current OFED driver support a maximum of 512 SFs; CX5 supports a maximum of 255 SFs.

Xueming Li (8):
  common/mlx5: add netlink API to get RDMA port state
  net/mlx5: use netlink when IB port greater than 255
  net/mlx5: improve Verbs flow priority discover for scalable
  net/mlx5: check DevX to support more Verb ports
  net/mlx5: support flow item port of switch manager
  net/mlx5: supports flow item of normal Tx queue
  net/mlx5: fix internal root table flow priroity
  net/mlx5: enable DevX Tx queue creation

 drivers/common/mlx5/linux/meson.build |   2 +
 drivers/common/mlx5/linux/mlx5_nl.c   | 116 +++++++++++++++++++-------
 drivers/common/mlx5/linux/mlx5_nl.h   |   3 +
 drivers/common/mlx5/version.map       |   1 +
 drivers/net/mlx5/linux/mlx5_os.c      | 114 ++++++++-----------------
 drivers/net/mlx5/mlx5.h               |   2 +
 drivers/net/mlx5/mlx5_devx.c          |  10 +--
 drivers/net/mlx5/mlx5_devx.h          |   2 +
 drivers/net/mlx5/mlx5_flow.c          |  81 +++++++++++++++++-
 drivers/net/mlx5/mlx5_flow.h          |   7 +-
 drivers/net/mlx5/mlx5_flow_dv.c       |  44 ++++++++--
 drivers/net/mlx5/mlx5_flow_verbs.c    |   8 ++
 drivers/net/mlx5/mlx5_trigger.c       |  11 ++-
 13 files changed, 276 insertions(+), 125 deletions(-)

-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH 1/8] common/mlx5: add netlink API to get RDMA port state
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
@ 2021-09-27  8:32 ` Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Matan Azrad, Viacheslav Ovsiienko, Ray Kinsella

Introduce a netlink API to get the RDMA port state.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/common/mlx5/linux/meson.build |   2 +
 drivers/common/mlx5/linux/mlx5_nl.c   | 116 +++++++++++++++++++-------
 drivers/common/mlx5/linux/mlx5_nl.h   |   3 +
 drivers/common/mlx5/version.map       |   1 +
 4 files changed, 94 insertions(+), 28 deletions(-)

diff --git a/drivers/common/mlx5/linux/meson.build b/drivers/common/mlx5/linux/meson.build
index cbea58f557d..2dcd27b7786 100644
--- a/drivers/common/mlx5/linux/meson.build
+++ b/drivers/common/mlx5/linux/meson.build
@@ -175,6 +175,8 @@ has_sym_args = [
             'RDMA_NLDEV_ATTR_DEV_NAME' ],
         [ 'HAVE_RDMA_NLDEV_ATTR_PORT_INDEX', 'rdma/rdma_netlink.h',
             'RDMA_NLDEV_ATTR_PORT_INDEX' ],
+        [ 'HAVE_RDMA_NLDEV_ATTR_PORT_STATE', 'rdma/rdma_netlink.h',
+            'RDMA_NLDEV_ATTR_PORT_STATE' ],
         [ 'HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX', 'rdma/rdma_netlink.h',
             'RDMA_NLDEV_ATTR_NDEV_INDEX' ],
         [ 'HAVE_MLX5_DR_FLOW_DUMP', 'infiniband/mlx5dv.h',
diff --git a/drivers/common/mlx5/linux/mlx5_nl.c b/drivers/common/mlx5/linux/mlx5_nl.c
index 9120a697fd5..3e775e58b14 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.c
+++ b/drivers/common/mlx5/linux/mlx5_nl.c
@@ -78,6 +78,9 @@
 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
 #endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
+#define RDMA_NLDEV_ATTR_PORT_STATE 12
+#endif
 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
 #endif
@@ -160,14 +163,16 @@ struct mlx5_nl_mac_addr {
 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
+#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)
 
 /** Data structure used by mlx5_nl_cmdget_cb(). */
-struct mlx5_nl_ifindex_data {
+struct mlx5_nl_port_info {
 	const char *name; /**< IB device name (in). */
 	uint32_t flags; /**< found attribute flags (out). */
 	uint32_t ibindex; /**< IB device index (out). */
 	uint32_t ifindex; /**< Network interface index (out). */
 	uint32_t portnum; /**< IB device max port number (out). */
+	uint16_t state; /**< IB device port state (out). */
 };
 
 uint32_t atomic_sn;
@@ -966,8 +971,8 @@ mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
 static int
 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 {
-	struct mlx5_nl_ifindex_data *data = arg;
-	struct mlx5_nl_ifindex_data local = {
+	struct mlx5_nl_port_info *data = arg;
+	struct mlx5_nl_port_info local = {
 		.flags = 0,
 	};
 	size_t off = NLMSG_HDRLEN;
@@ -1000,6 +1005,10 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 			local.portnum = *(uint32_t *)payload;
 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
 			break;
+		case RDMA_NLDEV_ATTR_PORT_STATE:
+			local.state = *(uint8_t *)payload;
+			local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
+			break;
 		default:
 			break;
 		}
@@ -1016,6 +1025,7 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 		data->ibindex = local.ibindex;
 		data->ifindex = local.ifindex;
 		data->portnum = local.portnum;
+		data->state = local.state;
 	}
 	return 0;
 error:
@@ -1024,7 +1034,7 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 }
 
 /**
- * Get index of network interface associated with some IB device.
+ * Get port info of network interface associated with some IB device.
  *
  * This is the only somewhat safe method to avoid resorting to heuristics
  * when faced with port representors. Unfortunately it requires at least
@@ -1036,23 +1046,19 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
  *   IB device name.
  * @param[in] pindex
  *   IB device port index, starting from 1
+ * @param[out] data
+ *   Pointer to port info.
  * @return
- *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
- *   is set.
+ *   A nonzero value on success, 0 otherwise and rte_errno is set.
  */
-unsigned int
-mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+static int
+mlx5_nl_port_info(int nl, const char *name, uint32_t pindex,
+		  struct mlx5_nl_port_info *data)
 {
-	struct mlx5_nl_ifindex_data data = {
-		.name = name,
-		.flags = 0,
-		.ibindex = 0, /* Determined during first pass. */
-		.ifindex = 0, /* Determined during second pass. */
-	};
 	union {
 		struct nlmsghdr nh;
 		uint8_t buf[NLMSG_HDRLEN +
-			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
+			    NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
 	} req = {
 		.nh = {
@@ -1066,26 +1072,27 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 	uint32_t sn = MLX5_NL_SN_GENERATE;
 	int ret;
 
+	data->name = name;
 	ret = mlx5_nl_send(nl, &req.nh, sn);
 	if (ret < 0)
 		return 0;
-	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
 	if (ret < 0)
 		return 0;
-	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
+	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
+	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
 		goto error;
-	data.flags = 0;
+	data->flags = 0;
 	sn = MLX5_NL_SN_GENERATE;
 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
 					     RDMA_NLDEV_CMD_PORT_GET);
 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
-	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
+	na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
-	       &data.ibindex, sizeof(data.ibindex));
+	       &data->ibindex, sizeof(data->ibindex));
 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
@@ -1094,20 +1101,73 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 	ret = mlx5_nl_send(nl, &req.nh, sn);
 	if (ret < 0)
 		return 0;
-	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
 	if (ret < 0)
 		return 0;
-	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
-	    !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
-	    !data.ifindex)
+	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
+	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
+	    !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
+	    !data->ifindex)
 		goto error;
-	return data.ifindex;
+	return 1;
 error:
 	rte_errno = ENODEV;
 	return 0;
 }
 
+/**
+ * Get index of network interface associated with some IB device.
+ *
+ * This is the only somewhat safe method to avoid resorting to heuristics
+ * when faced with port representors. Unfortunately it requires at least
+ * Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
+ *   is set.
+ */
+unsigned int
+mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+{
+	struct mlx5_nl_port_info data = { .ifindex = 0 };
+
+	if (mlx5_nl_port_info(nl, name, pindex, &data) == 0)
+		return 0;
+	return data.ifindex;
+}
+
+/**
+ * Get IB device port state.
+ *
+ * This is the only somewhat safe method to get info for port number >= 255.
+ * Unfortunately it requires at least Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   Port state (ibv_port_state) on success, 0 otherwise
+ *   and rte_errno is set.
+ */
+enum ibv_port_state
+mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
+{
+	struct mlx5_nl_port_info data = { .state = 0 };
+
+	if (mlx5_nl_port_info(nl, name, pindex, &data) == 0)
+		return (enum ibv_port_state)0;
+	return (enum ibv_port_state)data.state;
+}
+
 /**
  * Get the number of physical ports of given IB device.
  *
@@ -1123,7 +1183,7 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 unsigned int
 mlx5_nl_portnum(int nl, const char *name)
 {
-	struct mlx5_nl_ifindex_data data = {
+	struct mlx5_nl_port_info data = {
 		.flags = 0,
 		.name = name,
 		.ifindex = 0,
diff --git a/drivers/common/mlx5/linux/mlx5_nl.h b/drivers/common/mlx5/linux/mlx5_nl.h
index 15129ffdc88..809639947a6 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.h
+++ b/drivers/common/mlx5/linux/mlx5_nl.h
@@ -54,6 +54,9 @@ unsigned int mlx5_nl_portnum(int nl, const char *name);
 __rte_internal
 unsigned int mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex);
 __rte_internal
+enum ibv_port_state mlx5_nl_port_state(int nl, const char *name,
+				       uint32_t pindex);
+__rte_internal
 int mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
 			       struct rte_ether_addr *mac, int vf_index);
 __rte_internal
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index e5cb6b70604..d23634eef8a 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -131,6 +131,7 @@ INTERNAL {
 	mlx5_nl_mac_addr_flush; # WINDOWS_NO_EXPORT
 	mlx5_nl_mac_addr_remove; # WINDOWS_NO_EXPORT
 	mlx5_nl_mac_addr_sync; # WINDOWS_NO_EXPORT
+	mlx5_nl_port_state;
 	mlx5_nl_portnum; # WINDOWS_NO_EXPORT
 	mlx5_nl_promisc; # WINDOWS_NO_EXPORT
 	mlx5_nl_switch_info; # WINDOWS_NO_EXPORT
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH 2/8] net/mlx5: use netlink when IB port greater than 255
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
@ 2021-09-27  8:32 ` Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Matan Azrad, Viacheslav Ovsiienko

The IB spec doesn't allow more than 255 ports on a single HCA; a port
number of 256 was cast to the u8 value 0, which is invalid for
ibv_query_port().

This patch invokes the netlink API to query the port state when the port
number is greater than 255.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 39 ++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 470b16cb9ad..79ab789df43 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -956,7 +956,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 {
 	const struct mlx5_switch_info *switch_info = &spawn->info;
 	struct mlx5_dev_ctx_shared *sh = NULL;
-	struct ibv_port_attr port_attr;
+	struct ibv_port_attr port_attr = { .state = IBV_PORT_NOP };
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	struct rte_eth_dev *eth_dev = NULL;
 	struct mlx5_priv *priv = NULL;
@@ -976,6 +976,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	int own_domain_id = 0;
 	uint16_t port_id;
 	struct mlx5_port_info vport_info = { .query_flags = 0 };
+	int nl_rdma = -1;
 	int i;
 
 	/* Determine if this port representor is supposed to be spawned. */
@@ -1170,19 +1171,29 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		" old OFED/rdma-core version or firmware configuration");
 #endif
 	config->mpls_en = mpls_en;
+	nl_rdma = mlx5_nl_init(NETLINK_RDMA);
 	/* Check port status. */
-	err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr);
-	if (err) {
-		DRV_LOG(ERR, "port query failed: %s", strerror(err));
-		goto error;
-	}
-	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-		DRV_LOG(ERR, "port is not configured in Ethernet mode");
-		err = EINVAL;
-		goto error;
+	if (spawn->phys_port <= UINT8_MAX) {
+		/* Legacy Verbs api only support u8 port number. */
+		err = mlx5_glue->query_port(sh->ctx, spawn->phys_port,
+					    &port_attr);
+		if (err) {
+			DRV_LOG(ERR, "port query failed: %s", strerror(err));
+			goto error;
+		}
+		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+			DRV_LOG(ERR, "port is not configured in Ethernet mode");
+			err = EINVAL;
+			goto error;
+		}
+	} else if (nl_rdma >= 0) {
+		/* IB doesn't allow more than 255 ports, must be Ethernet. */
+		port_attr.state = mlx5_nl_port_state(nl_rdma,
+			((struct ibv_device *)spawn->phys_dev)->name,
+			spawn->phys_port);
 	}
 	if (port_attr.state != IBV_PORT_ACTIVE)
-		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+		DRV_LOG(INFO, "port is not active: \"%s\" (%d)",
 			mlx5_glue->port_state_str(port_attr.state),
 			port_attr.state);
 	/* Allocate private eth device data. */
@@ -1199,7 +1210,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->pci_dev = spawn->pci_dev;
 	priv->mtu = RTE_ETHER_MTU;
 	/* Some internal functions rely on Netlink sockets, open them now. */
-	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
+	priv->nl_socket_rdma = nl_rdma;
 	priv->nl_socket_route =	mlx5_nl_init(NETLINK_ROUTE);
 	priv->representor = !!switch_info->representor;
 	priv->master = !!switch_info->master;
@@ -1910,8 +1921,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			mlx5_os_free_shared_dr(priv);
 		if (priv->nl_socket_route >= 0)
 			close(priv->nl_socket_route);
-		if (priv->nl_socket_rdma >= 0)
-			close(priv->nl_socket_rdma);
 		if (priv->vmwa_context)
 			mlx5_vlan_vmwa_exit(priv->vmwa_context);
 		if (eth_dev && priv->drop_queue.hrxq)
@@ -1935,6 +1944,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	if (sh)
 		mlx5_free_shared_dev_ctx(sh);
+	if (nl_rdma >= 0)
+		close(nl_rdma);
 	MLX5_ASSERT(err > 0);
 	rte_errno = err;
 	return NULL;
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH 3/8] net/mlx5: improve Verbs flow priority discover for scalable
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
@ 2021-09-27  8:32 ` Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 4/8] net/mlx5: check DevX to support more Verb ports Xueming Li
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Matan Azrad, Viacheslav Ovsiienko

To detect the number of Verbs flow priorities, the PMD tries to create
Verbs flows with different priorities, but Verbs is not designed to
support ports larger than 255.

When DevX is supported by the kernel driver, 16 Verbs flow priorities
must be supported, so there is no need to create Verbs flows.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow_verbs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index b93fd4d2c96..93bffa4a0b9 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -83,6 +83,11 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
 	int i;
 	int priority = 0;
 
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+	/* If DevX supported, driver must support 16 verbs flow priorities. */
+	priority = RTE_DIM(priority_map_5);
+	goto out;
+#endif
 	if (!drop->qp) {
 		rte_errno = ENOTSUP;
 		return -rte_errno;
@@ -109,6 +114,9 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
 			dev->data->port_id, priority);
 		return -rte_errno;
 	}
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+out:
+#endif
 	DRV_LOG(INFO, "port %u supported flow priorities:"
 		" 0-%d for ingress or egress root table,"
 		" 0-%d for non-root table or transfer root table.",
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH 4/8] net/mlx5: check DevX to support more Verb ports
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (2 preceding siblings ...)
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
@ 2021-09-27  8:32 ` Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 5/8] net/mlx5: support flow item port of switch manager Xueming Li
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Matan Azrad, Viacheslav Ovsiienko

Verbs API doesn't support device port number larger than 255 by design.

To support more VF or SubFunction port representors, force a DevX API
check when the max port number is larger than 255.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 79ab789df43..e9256ad5245 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -1344,9 +1344,16 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 #endif
 	if (spawn->max_port > UINT8_MAX) {
 		/* Verbs can't support ports larger than 255 by design. */
-		DRV_LOG(ERR, "can't support IB ports > UINT8_MAX");
-		err = EINVAL;
-		goto error;
+#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
+		if (!config->dv_flow_en || !config->dv_esw_en) {
+			DRV_LOG(INFO, "must enable DV and ESW when IB ports > 255");
+#else
+		{
+			DRV_LOG(ERR, "DevX does not provide UAR offset, can't support IB ports > UINT8_MAX");
+#endif
+			err = EINVAL;
+			goto error;
+		}
 	}
 	config->ind_table_max_size =
 		sh->device_attr.max_rwq_indirection_table_size;
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH 5/8] net/mlx5: support flow item port of switch manager
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (3 preceding siblings ...)
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 4/8] net/mlx5: check DevX to support more Verb ports Xueming Li
@ 2021-09-27  8:32 ` Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 6/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Matan Azrad, Viacheslav Ovsiienko

When sending a packet from a representor, the vport ID in the transport
domain is the E-Switch manager vport ID, since the representor shares
the resources of the E-Switch manager. To match a packet sent by a
representor, the pattern has to be
vport_id==<esw-mgr-id> && txq==<sqn>

On BlueField, the E-Switch manager vport ID is 0xfffe; it is 0 on other
NICs.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow.h    |  3 +++
 drivers/net/mlx5/mlx5_flow_dv.c | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 76ad53f2a1e..861e18fb3b1 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -18,6 +18,9 @@
 
 #include "mlx5.h"
 
+/* E-Switch Manager port, used for rte_flow_item_port_id. */
+#define MLX5_PORT_ESW_MGR UINT32_MAX
+
 /* Private rte flow items. */
 enum mlx5_rte_flow_item_type {
 	MLX5_RTE_FLOW_ITEM_TYPE_END = INT_MIN,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 3f6f5dcfbad..d4242a4aa8d 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -15,6 +15,7 @@
 #include <rte_flow_driver.h>
 #include <rte_malloc.h>
 #include <rte_cycles.h>
+#include <rte_bus_pci.h>
 #include <rte_ip.h>
 #include <rte_gre.h>
 #include <rte_vxlan.h>
@@ -92,6 +93,23 @@ static int
 flow_dv_jump_tbl_resource_release(struct rte_eth_dev *dev,
 				  uint32_t rix_jump);
 
+static int16_t
+flow_dv_get_esw_manager_vport_id(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (priv->pci_dev == NULL)
+		return 0;
+	switch (priv->pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX7BF:
+		return (int16_t)0xfffe;
+	default:
+		return 0;
+	}
+}
+
 /**
  * Initialize flow attributes structure according to flow items' types.
  *
@@ -2224,6 +2242,8 @@ flow_dv_validate_item_port_id(struct rte_eth_dev *dev,
 		return ret;
 	if (!spec)
 		return 0;
+	if (spec->id == MLX5_PORT_ESW_MGR)
+		return 0;
 	esw_priv = mlx5_port_to_eswitch_info(spec->id, false);
 	if (!esw_priv)
 		return rte_flow_error_set(error, rte_errno,
@@ -9691,6 +9711,11 @@ flow_dv_translate_item_port_id(struct rte_eth_dev *dev, void *matcher,
 	struct mlx5_priv *priv;
 	uint16_t mask, id;
 
+	if (pid_v && pid_v->id == MLX5_PORT_ESW_MGR) {
+		flow_dv_translate_item_source_vport(matcher, key,
+			flow_dv_get_esw_manager_vport_id(dev), 0xffff);
+		return 0;
+	}
 	mask = pid_m ? pid_m->id : 0xffff;
 	id = pid_v ? pid_v->id : dev->data->port_id;
 	priv = mlx5_port_to_eswitch_info(id, item == NULL);
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH 6/8] net/mlx5: supports flow item of normal Tx queue
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (4 preceding siblings ...)
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 5/8] net/mlx5: support flow item port of switch manager Xueming Li
@ 2021-09-27  8:32 ` Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 7/8] net/mlx5: fix internal root table flow priroity Xueming Li
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Matan Azrad, Viacheslav Ovsiienko

Extends txq flow pattern to support both hairpin and regular txq.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow_dv.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index d4242a4aa8d..e388e2d5e10 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -10916,22 +10916,22 @@ flow_dv_translate_item_tx_queue(struct rte_eth_dev *dev,
 	void *misc_v =
 		MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
 	struct mlx5_txq_ctrl *txq;
-	uint32_t queue;
-
+	uint32_t queue, mask;
 
 	queue_m = (const void *)item->mask;
-	if (!queue_m)
-		return;
 	queue_v = (const void *)item->spec;
 	if (!queue_v)
 		return;
 	txq = mlx5_txq_get(dev, queue_v->queue);
 	if (!txq)
 		return;
-	queue = txq->obj->sq->id;
-	MLX5_SET(fte_match_set_misc, misc_m, source_sqn, queue_m->queue);
-	MLX5_SET(fte_match_set_misc, misc_v, source_sqn,
-		 queue & queue_m->queue);
+	if (txq->type == MLX5_TXQ_TYPE_HAIRPIN)
+		queue = txq->obj->sq->id;
+	else
+		queue = txq->obj->sq_obj.sq->id;
+	mask = queue_m == NULL ? UINT32_MAX : queue_m->queue;
+	MLX5_SET(fte_match_set_misc, misc_m, source_sqn, mask);
+	MLX5_SET(fte_match_set_misc, misc_v, source_sqn, queue & mask);
 	mlx5_txq_release(dev, queue_v->queue);
 }
 
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH 7/8] net/mlx5: fix internal root table flow priroity
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (5 preceding siblings ...)
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 6/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
@ 2021-09-27  8:32 ` Xueming Li
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 8/8] net/mlx5: enable DevX Tx queue creation Xueming Li
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Matan Azrad, Viacheslav Ovsiienko, Dong Zhou

When creating an internal transfer flow on the root table with the
lowest priority, the flow was created with a u32 priority. That was
wrong since the flow is created in the kernel and the max priority
supported is 16.

This patch fixes this by adding an internal flow check.

Fixes: 5f8ae44dd454 ("net/mlx5: enlarge maximal flow priority")

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow.c    | 7 +++++--
 drivers/net/mlx5/mlx5_flow.h    | 4 ++--
 drivers/net/mlx5/mlx5_flow_dv.c | 3 ++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index c914a7120cc..8dc79340f2d 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -980,13 +980,15 @@ mlx5_get_lowest_priority(struct rte_eth_dev *dev,
  *   Pointer to device flow rule attributes.
  * @param[in] subpriority
  *   The priority based on the items.
+ * @param[in] external
+ *   Flow is user flow.
  * @return
  *   The matcher priority of the flow.
  */
 uint16_t
 mlx5_get_matcher_priority(struct rte_eth_dev *dev,
 			  const struct rte_flow_attr *attr,
-			  uint32_t subpriority)
+			  uint32_t subpriority, bool external)
 {
 	uint16_t priority = (uint16_t)attr->priority;
 	struct mlx5_priv *priv = dev->data->dev_private;
@@ -997,7 +999,8 @@ mlx5_get_matcher_priority(struct rte_eth_dev *dev,
 		return mlx5_os_flow_adjust_priority(dev, priority, subpriority);
 	}
 	if (attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR)
-		priority = MLX5_NON_ROOT_FLOW_MAX_PRIO;
+		priority = external ?
+			   MLX5_NON_ROOT_FLOW_MAX_PRIO : priv->config.flow_prio;
 	return priority * 3 + subpriority;
 }
 
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 861e18fb3b1..1e31d25f319 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -1434,8 +1434,8 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 uint32_t mlx5_get_lowest_priority(struct rte_eth_dev *dev,
 					const struct rte_flow_attr *attr);
 uint16_t mlx5_get_matcher_priority(struct rte_eth_dev *dev,
-				     const struct rte_flow_attr *attr,
-				     uint32_t subpriority);
+				   const struct rte_flow_attr *attr,
+				   uint32_t subpriority, bool external);
 int mlx5_flow_get_reg_id(struct rte_eth_dev *dev,
 				     enum mlx5_feature_name feature,
 				     uint32_t id,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index e388e2d5e10..3744f3e5917 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -13633,7 +13633,8 @@ flow_dv_translate(struct rte_eth_dev *dev,
 	matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf,
 				    matcher.mask.size);
 	matcher.priority = mlx5_get_matcher_priority(dev, attr,
-					matcher.priority);
+						     matcher.priority,
+						     dev_flow->external);
 	/**
 	 * When creating meter drop flow in drop table, using original
 	 * 5-tuple match, the matcher priority should be lower than
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH 8/8] net/mlx5: enable DevX Tx queue creation
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (6 preceding siblings ...)
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 7/8] net/mlx5: fix internal root table flow priroity Xueming Li
@ 2021-09-27  8:32 ` Xueming Li
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-09-27  8:32 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Matan Azrad, Viacheslav Ovsiienko

The Verbs API has a design limitation: it cannot support port numbers
larger than 255. To support more representors on a single Verbs device,
the DevX API must be enabled.

DevX SQ was disabled since all representors need an FDB default miss
flow to redirect packets sent from the CPU to the peer port (SF, VF or
HPF).

The kernel creates the representor default miss flow automatically for a
Verbs QP. For a DevX SQ, the PMD must create it manually.

The default miss root flow matches the E-Switch manager vport and sqn.
Since the root table flow is created in the kernel, the vport redirect
action is not supported there, so split the default miss flow into:
1. a per-E-Switch FDB root flow that matches the ESW manager vport ID
   and jumps to group 1.
2. a per-SQ FDB flow in group 1 that matches the ESW manager vport ID
   and sqn, and redirects the packet to the peer vport.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 62 +-------------------------
 drivers/net/mlx5/mlx5.h          |  2 +
 drivers/net/mlx5/mlx5_devx.c     | 10 ++---
 drivers/net/mlx5/mlx5_devx.h     |  2 +
 drivers/net/mlx5/mlx5_flow.c     | 74 ++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_trigger.c  | 11 ++++-
 6 files changed, 94 insertions(+), 67 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index e9256ad5245..bcf040a8524 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -697,56 +697,6 @@ mlx5_init_once(void)
 	return ret;
 }
 
-/**
- * Create the Tx queue DevX/Verbs object.
- *
- * @param dev
- *   Pointer to Ethernet device.
- * @param idx
- *   Queue index in DPDK Tx queue array.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
-	struct mlx5_txq_ctrl *txq_ctrl =
-			container_of(txq_data, struct mlx5_txq_ctrl, txq);
-
-	if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
-		return mlx5_txq_devx_obj_new(dev, idx);
-#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
-	if (!priv->config.dv_esw_en)
-		return mlx5_txq_devx_obj_new(dev, idx);
-#endif
-	return mlx5_txq_ibv_obj_new(dev, idx);
-}
-
-/**
- * Release an Tx DevX/verbs queue object.
- *
- * @param txq_obj
- *   DevX/Verbs Tx queue object.
- */
-static void
-mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
-{
-	if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
-		mlx5_txq_devx_obj_release(txq_obj);
-		return;
-	}
-#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
-	if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
-		mlx5_txq_devx_obj_release(txq_obj);
-		return;
-	}
-#endif
-	mlx5_txq_ibv_obj_release(txq_obj);
-}
-
 /**
  * DV flow counter mode detect and config.
  *
@@ -1812,16 +1762,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 						ibv_obj_ops.drop_action_create;
 		priv->obj_ops.drop_action_destroy =
 						ibv_obj_ops.drop_action_destroy;
-#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
-		priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
-#else
-		if (config->dv_esw_en)
-			priv->obj_ops.txq_obj_modify =
-						ibv_obj_ops.txq_obj_modify;
-#endif
-		/* Use specific wrappers for Tx object. */
-		priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
-		priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
 		mlx5_queue_counter_id_prepare(eth_dev);
 		priv->obj_ops.lb_dummy_queue_create =
 					mlx5_rxq_ibv_obj_dummy_lb_create;
@@ -1832,7 +1772,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	if (config->tx_pp &&
 	    (priv->config.dv_esw_en ||
-	     priv->obj_ops.txq_obj_new != mlx5_os_txq_obj_new)) {
+	     priv->obj_ops.txq_obj_new != mlx5_txq_devx_obj_new)) {
 		/*
 		 * HAVE_MLX5DV_DEVX_UAR_OFFSET is required to support
 		 * packet pacing and already checked above.
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e02714e2319..63737a1dafe 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1698,6 +1698,8 @@ int mlx5_ctrl_flow(struct rte_eth_dev *dev,
 		   struct rte_flow_item_eth *eth_mask);
 int mlx5_flow_lacp_miss(struct rte_eth_dev *dev);
 struct rte_flow *mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev);
+uint32_t mlx5_flow_create_devx_sq_miss_flow(struct rte_eth_dev *dev,
+					    uint32_t txq);
 void mlx5_flow_async_pool_query_handle(struct mlx5_dev_ctx_shared *sh,
 				       uint64_t async_id, int status);
 void mlx5_set_query_alarm(struct mlx5_dev_ctx_shared *sh);
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index a1db53577a2..a49602cb957 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -102,9 +102,9 @@ mlx5_devx_modify_rq(struct mlx5_rxq_obj *rxq_obj, uint8_t type)
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-static int
-mlx5_devx_modify_sq(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
-		    uint8_t dev_port)
+int
+mlx5_txq_devx_modify(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
+		     uint8_t dev_port)
 {
 	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
 	int ret;
@@ -1118,7 +1118,7 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
 	*txq_data->qp_db = 0;
 	txq_data->qp_num_8s = txq_obj->sq_obj.sq->id << 8;
 	/* Change Send Queue state to Ready-to-Send. */
-	ret = mlx5_devx_modify_sq(txq_obj, MLX5_TXQ_MOD_RST2RDY, 0);
+	ret = mlx5_txq_devx_modify(txq_obj, MLX5_TXQ_MOD_RST2RDY, 0);
 	if (ret) {
 		rte_errno = errno;
 		DRV_LOG(ERR,
@@ -1187,7 +1187,7 @@ struct mlx5_obj_ops devx_obj_ops = {
 	.drop_action_create = mlx5_devx_drop_action_create,
 	.drop_action_destroy = mlx5_devx_drop_action_destroy,
 	.txq_obj_new = mlx5_txq_devx_obj_new,
-	.txq_obj_modify = mlx5_devx_modify_sq,
+	.txq_obj_modify = mlx5_txq_devx_modify,
 	.txq_obj_release = mlx5_txq_devx_obj_release,
 	.lb_dummy_queue_create = NULL,
 	.lb_dummy_queue_release = NULL,
diff --git a/drivers/net/mlx5/mlx5_devx.h b/drivers/net/mlx5/mlx5_devx.h
index bc8a8d6b73c..a95207a6b9a 100644
--- a/drivers/net/mlx5/mlx5_devx.h
+++ b/drivers/net/mlx5/mlx5_devx.h
@@ -8,6 +8,8 @@
 #include "mlx5.h"
 
 int mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_txq_devx_modify(struct mlx5_txq_obj *obj,
+			 enum mlx5_txq_modify_type type, uint8_t dev_port);
 void mlx5_txq_devx_obj_release(struct mlx5_txq_obj *txq_obj);
 
 extern struct mlx5_obj_ops devx_obj_ops;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 8dc79340f2d..71933e03772 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -6571,6 +6571,80 @@ mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev)
 						   actions, false, &error);
 }
 
+/**
+ * Create a dedicated flow rule on e-switch table 1, matches ESW manager
+ * and sq number, directs all packets to peer vport.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param txq
+ *   Txq index.
+ *
+ * @return
+ *   Flow ID on success, 0 otherwise and rte_errno is set.
+ */
+uint32_t
+mlx5_flow_create_devx_sq_miss_flow(struct rte_eth_dev *dev, uint32_t txq)
+{
+	struct rte_flow_attr attr = {
+		.group = 0,
+		.priority = MLX5_FLOW_LOWEST_PRIO_INDICATOR,
+		.ingress = 1,
+		.egress = 0,
+		.transfer = 1,
+	};
+	struct rte_flow_item_port_id port_spec = {
+		.id = MLX5_PORT_ESW_MGR,
+	};
+	struct mlx5_rte_flow_item_tx_queue txq_spec = {
+		.queue = txq,
+	};
+	struct rte_flow_item pattern[] = {
+		{
+			.type = RTE_FLOW_ITEM_TYPE_PORT_ID,
+			.spec = &port_spec,
+		},
+		{
+			.type = (enum rte_flow_item_type)
+				MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE,
+			.spec = &txq_spec,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+	};
+	struct rte_flow_action_jump jump = {
+		.group = 1,
+	};
+	struct rte_flow_action_port_id port = {
+		.id = dev->data->port_id,
+	};
+	struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_JUMP,
+			.conf = &jump,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	struct rte_flow_error error;
+
+	/*
+	 * Creates group 0, highest priority jump flow.
+	 * Matches txq to bypass kernel packets.
+	 */
+	if (flow_list_create(dev, MLX5_FLOW_TYPE_CTL, &attr, pattern, actions,
+			     false, &error) == 0)
+		return 0;
+	/* Create group 1, lowest priority redirect flow for txq. */
+	attr.group = 1;
+	actions[0].conf = &port;
+	actions[0].type = RTE_FLOW_ACTION_TYPE_PORT_ID;
+	return flow_list_create(dev, MLX5_FLOW_TYPE_CTL, &attr, pattern,
+				actions, false, &error);
+}
+
 /**
  * Validate a flow supported by the NIC.
  *
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 54173bfacb2..42d8bb31128 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -1255,9 +1255,18 @@ mlx5_traffic_enable(struct rte_eth_dev *dev)
 				goto error;
 			}
 		}
+		if ((priv->representor || priv->master) &&
+		    priv->config.dv_esw_en) {
+			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
+				DRV_LOG(ERR,
+					"Port %u Tx queue %u SQ create representor devx default miss rule failed.",
+					dev->data->port_id, i);
+				goto error;
+			}
+		}
 		mlx5_txq_release(dev, i);
 	}
-	if (priv->config.dv_esw_en && !priv->config.vf && !priv->config.sf) {
+	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
 		if (mlx5_flow_create_esw_table_zero_flow(dev))
 			priv->fdb_def_rule = 1;
 		else
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (7 preceding siblings ...)
  2021-09-27  8:32 ` [dpdk-dev] [PATCH 8/8] net/mlx5: enable DevX Tx queue creation Xueming Li
@ 2021-10-16  8:07 ` Xueming Li
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
                     ` (7 more replies)
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
  10 siblings, 8 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit

This patch set allows the number of representors of a PF to exceed 255.
CX6 with the current OFED driver supports a maximum of 512 SFs; CX5
supports at most 255 SFs.

v2:
 - fixed FDB root table flow priority
 - add error check to Netlink port state API
 - commit log update and other minor fixes

Xueming Li (8):
  common/mlx5: add netlink API to get RDMA port state
  net/mlx5: use netlink when IB port greater than 255
  net/mlx5: improve Verbs flow priority discover for scalable
  net/mlx5: support E-Switch manager egress traffic match
  net/mlx5: supports flow item of normal Tx queue
  net/mlx5: fix internal root table flow priroity
  net/mlx5: enable DevX Tx queue creation
  net/mlx5: check DevX to support more Verbs ports

 drivers/common/mlx5/linux/meson.build |   2 +
 drivers/common/mlx5/linux/mlx5_nl.c   | 136 +++++++++++++++++++-------
 drivers/common/mlx5/linux/mlx5_nl.h   |   2 +
 drivers/common/mlx5/version.map       |   1 +
 drivers/net/mlx5/linux/mlx5_os.c      | 119 +++++++---------------
 drivers/net/mlx5/mlx5.h               |   2 +
 drivers/net/mlx5/mlx5_devx.c          |  10 +-
 drivers/net/mlx5/mlx5_devx.h          |   2 +
 drivers/net/mlx5/mlx5_flow.c          |  81 ++++++++++++++-
 drivers/net/mlx5/mlx5_flow.h          |   7 +-
 drivers/net/mlx5/mlx5_flow_dv.c       |  44 +++++++--
 drivers/net/mlx5/mlx5_flow_verbs.c    |   8 ++
 drivers/net/mlx5/mlx5_trigger.c       |  11 ++-
 13 files changed, 291 insertions(+), 134 deletions(-)

-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 1/8] common/mlx5: add netlink API to get RDMA port state
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
@ 2021-10-16  8:07   ` Xueming Li
  2021-10-19  8:23     ` Slava Ovsiienko
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
                     ` (6 subsequent siblings)
  7 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev
  Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad, Ray Kinsella

Introduce netlink API to get rdma port state.

Port state is retrieved based on the RDMA device name and port index.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/common/mlx5/linux/meson.build |   2 +
 drivers/common/mlx5/linux/mlx5_nl.c   | 136 +++++++++++++++++++-------
 drivers/common/mlx5/linux/mlx5_nl.h   |   2 +
 drivers/common/mlx5/version.map       |   1 +
 4 files changed, 106 insertions(+), 35 deletions(-)

diff --git a/drivers/common/mlx5/linux/meson.build b/drivers/common/mlx5/linux/meson.build
index cbea58f557d..2dcd27b7786 100644
--- a/drivers/common/mlx5/linux/meson.build
+++ b/drivers/common/mlx5/linux/meson.build
@@ -175,6 +175,8 @@ has_sym_args = [
             'RDMA_NLDEV_ATTR_DEV_NAME' ],
         [ 'HAVE_RDMA_NLDEV_ATTR_PORT_INDEX', 'rdma/rdma_netlink.h',
             'RDMA_NLDEV_ATTR_PORT_INDEX' ],
+        [ 'HAVE_RDMA_NLDEV_ATTR_PORT_STATE', 'rdma/rdma_netlink.h',
+            'RDMA_NLDEV_ATTR_PORT_STATE' ],
         [ 'HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX', 'rdma/rdma_netlink.h',
             'RDMA_NLDEV_ATTR_NDEV_INDEX' ],
         [ 'HAVE_MLX5_DR_FLOW_DUMP', 'infiniband/mlx5dv.h',
diff --git a/drivers/common/mlx5/linux/mlx5_nl.c b/drivers/common/mlx5/linux/mlx5_nl.c
index 9120a697fd5..4b762850941 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.c
+++ b/drivers/common/mlx5/linux/mlx5_nl.c
@@ -78,6 +78,9 @@
 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
 #endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
+#define RDMA_NLDEV_ATTR_PORT_STATE 12
+#endif
 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
 #endif
@@ -160,14 +163,16 @@ struct mlx5_nl_mac_addr {
 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
+#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)
 
 /** Data structure used by mlx5_nl_cmdget_cb(). */
-struct mlx5_nl_ifindex_data {
+struct mlx5_nl_port_info {
 	const char *name; /**< IB device name (in). */
 	uint32_t flags; /**< found attribute flags (out). */
 	uint32_t ibindex; /**< IB device index (out). */
 	uint32_t ifindex; /**< Network interface index (out). */
 	uint32_t portnum; /**< IB device max port number (out). */
+	uint16_t state; /**< IB device port state (out). */
 };
 
 uint32_t atomic_sn;
@@ -966,8 +971,8 @@ mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
 static int
 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 {
-	struct mlx5_nl_ifindex_data *data = arg;
-	struct mlx5_nl_ifindex_data local = {
+	struct mlx5_nl_port_info *data = arg;
+	struct mlx5_nl_port_info local = {
 		.flags = 0,
 	};
 	size_t off = NLMSG_HDRLEN;
@@ -1000,6 +1005,10 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 			local.portnum = *(uint32_t *)payload;
 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
 			break;
+		case RDMA_NLDEV_ATTR_PORT_STATE:
+			local.state = *(uint8_t *)payload;
+			local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
+			break;
 		default:
 			break;
 		}
@@ -1016,6 +1025,7 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 		data->ibindex = local.ibindex;
 		data->ifindex = local.ifindex;
 		data->portnum = local.portnum;
+		data->state = local.state;
 	}
 	return 0;
 error:
@@ -1024,7 +1034,7 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 }
 
 /**
- * Get index of network interface associated with some IB device.
+ * Get port info of network interface associated with some IB device.
  *
  * This is the only somewhat safe method to avoid resorting to heuristics
  * when faced with port representors. Unfortunately it requires at least
@@ -1032,27 +1042,20 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
  *
  * @param nl
  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
- * @param[in] name
- *   IB device name.
  * @param[in] pindex
  *   IB device port index, starting from 1
+ * @param[out] data
+ *   Pointer to port info.
  * @return
- *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
- *   is set.
+ *   0 on success, negative on error and rte_errno is set.
  */
-unsigned int
-mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+static int
+mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
 {
-	struct mlx5_nl_ifindex_data data = {
-		.name = name,
-		.flags = 0,
-		.ibindex = 0, /* Determined during first pass. */
-		.ifindex = 0, /* Determined during second pass. */
-	};
 	union {
 		struct nlmsghdr nh;
 		uint8_t buf[NLMSG_HDRLEN +
-			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
+			    NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
 	} req = {
 		.nh = {
@@ -1068,24 +1071,24 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 
 	ret = mlx5_nl_send(nl, &req.nh, sn);
 	if (ret < 0)
-		return 0;
-	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+		return ret;
+	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
 	if (ret < 0)
-		return 0;
-	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
+		return ret;
+	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
+	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
 		goto error;
-	data.flags = 0;
+	data->flags = 0;
 	sn = MLX5_NL_SN_GENERATE;
 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
 					     RDMA_NLDEV_CMD_PORT_GET);
 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
-	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
+	na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
-	       &data.ibindex, sizeof(data.ibindex));
+	       &data->ibindex, sizeof(data->ibindex));
 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
@@ -1093,19 +1096,82 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 	       &pindex, sizeof(pindex));
 	ret = mlx5_nl_send(nl, &req.nh, sn);
 	if (ret < 0)
-		return 0;
-	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+		return ret;
+	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
 	if (ret < 0)
-		return 0;
-	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
-	    !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
-	    !data.ifindex)
+		return ret;
+	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
+	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
+	    !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
+	    !data->ifindex)
 		goto error;
-	return data.ifindex;
+	return 1;
 error:
 	rte_errno = ENODEV;
-	return 0;
+	return -rte_errno;
+}
+
+/**
+ * Get index of network interface associated with some IB device.
+ *
+ * This is the only somewhat safe method to avoid resorting to heuristics
+ * when faced with port representors. Unfortunately it requires at least
+ * Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
+ *   is set.
+ */
+unsigned int
+mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+{
+	struct mlx5_nl_port_info data = {
+			.ifindex = 0,
+			.name = name,
+	};
+
+	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
+		return 0;
+	return data.ifindex;
+}
+
+/**
+ * Get IB device port state.
+ *
+ * This is the only somewhat safe method to get info for port number >= 255.
+ * Unfortunately it requires at least Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   Port state (ibv_port_state) on success, negative on error
+ *   and rte_errno is set.
+ */
+int
+mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
+{
+	struct mlx5_nl_port_info data = {
+			.state = 0,
+			.name = name,
+	};
+
+	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
+		return -rte_errno;
+	if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
+		rte_errno = ENOTSUP;
+		return -rte_errno;
+	}
+	return (int)data.state;
 }
 
 /**
@@ -1123,7 +1189,7 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 unsigned int
 mlx5_nl_portnum(int nl, const char *name)
 {
-	struct mlx5_nl_ifindex_data data = {
+	struct mlx5_nl_port_info data = {
 		.flags = 0,
 		.name = name,
 		.ifindex = 0,
diff --git a/drivers/common/mlx5/linux/mlx5_nl.h b/drivers/common/mlx5/linux/mlx5_nl.h
index 15129ffdc88..396f8f3f20a 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.h
+++ b/drivers/common/mlx5/linux/mlx5_nl.h
@@ -54,6 +54,8 @@ unsigned int mlx5_nl_portnum(int nl, const char *name);
 __rte_internal
 unsigned int mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex);
 __rte_internal
+int mlx5_nl_port_state(int nl, const char *name, uint32_t pindex);
+__rte_internal
 int mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
 			       struct rte_ether_addr *mac, int vf_index);
 __rte_internal
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index d3c5040aac8..2a2c7e51ba5 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -134,6 +134,7 @@ INTERNAL {
 	mlx5_nl_mac_addr_flush; # WINDOWS_NO_EXPORT
 	mlx5_nl_mac_addr_remove; # WINDOWS_NO_EXPORT
 	mlx5_nl_mac_addr_sync; # WINDOWS_NO_EXPORT
+	mlx5_nl_port_state; # WINDOWS_NO_EXPORT
 	mlx5_nl_portnum; # WINDOWS_NO_EXPORT
 	mlx5_nl_promisc; # WINDOWS_NO_EXPORT
 	mlx5_nl_switch_info; # WINDOWS_NO_EXPORT
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 2/8] net/mlx5: use netlink when IB port greater than 255
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
@ 2021-10-16  8:07   ` Xueming Li
  2021-10-19  8:24     ` Slava Ovsiienko
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
                     ` (5 subsequent siblings)
  7 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

The IB spec doesn't allow more than 255 ports on a single HCA; a port
number of 256 was cast to the u8 value 0, which is invalid for
ibv_query_port().

This patch invokes the Netlink API to query the port state when the port
number is greater than 255.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 46 ++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 3746057673d..f283a3779cc 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -956,7 +956,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 {
 	const struct mlx5_switch_info *switch_info = &spawn->info;
 	struct mlx5_dev_ctx_shared *sh = NULL;
-	struct ibv_port_attr port_attr;
+	struct ibv_port_attr port_attr = { .state = IBV_PORT_NOP };
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	struct rte_eth_dev *eth_dev = NULL;
 	struct mlx5_priv *priv = NULL;
@@ -976,6 +976,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	int own_domain_id = 0;
 	uint16_t port_id;
 	struct mlx5_port_info vport_info = { .query_flags = 0 };
+	int nl_rdma = -1;
 	int i;
 
 	/* Determine if this port representor is supposed to be spawned. */
@@ -1170,19 +1171,36 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		" old OFED/rdma-core version or firmware configuration");
 #endif
 	config->mpls_en = mpls_en;
+	nl_rdma = mlx5_nl_init(NETLINK_RDMA);
 	/* Check port status. */
-	err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr);
-	if (err) {
-		DRV_LOG(ERR, "port query failed: %s", strerror(err));
-		goto error;
-	}
-	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-		DRV_LOG(ERR, "port is not configured in Ethernet mode");
-		err = EINVAL;
-		goto error;
+	if (spawn->phys_port <= UINT8_MAX) {
+		/* Legacy Verbs api only support u8 port number. */
+		err = mlx5_glue->query_port(sh->ctx, spawn->phys_port,
+					    &port_attr);
+		if (err) {
+			DRV_LOG(ERR, "port query failed: %s", strerror(err));
+			goto error;
+		}
+		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+			DRV_LOG(ERR, "port is not configured in Ethernet mode");
+			err = EINVAL;
+			goto error;
+		}
+	} else if (nl_rdma >= 0) {
+		/* IB doesn't allow more than 255 ports, must be Ethernet. */
+		err = mlx5_nl_port_state(nl_rdma,
+			((struct ibv_device *)spawn->phys_dev)->name,
+			spawn->phys_port);
+		if (err < 0) {
+			DRV_LOG(INFO, "Failed to get netlink port state: %s",
+				strerror(rte_errno));
+			err = -rte_errno;
+			goto error;
+		}
+		port_attr.state = (enum ibv_port_state)err;
 	}
 	if (port_attr.state != IBV_PORT_ACTIVE)
-		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+		DRV_LOG(INFO, "port is not active: \"%s\" (%d)",
 			mlx5_glue->port_state_str(port_attr.state),
 			port_attr.state);
 	/* Allocate private eth device data. */
@@ -1199,7 +1217,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->pci_dev = spawn->pci_dev;
 	priv->mtu = RTE_ETHER_MTU;
 	/* Some internal functions rely on Netlink sockets, open them now. */
-	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
+	priv->nl_socket_rdma = nl_rdma;
 	priv->nl_socket_route =	mlx5_nl_init(NETLINK_ROUTE);
 	priv->representor = !!switch_info->representor;
 	priv->master = !!switch_info->master;
@@ -1910,8 +1928,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			mlx5_os_free_shared_dr(priv);
 		if (priv->nl_socket_route >= 0)
 			close(priv->nl_socket_route);
-		if (priv->nl_socket_rdma >= 0)
-			close(priv->nl_socket_rdma);
 		if (priv->vmwa_context)
 			mlx5_vlan_vmwa_exit(priv->vmwa_context);
 		if (eth_dev && priv->drop_queue.hrxq)
@@ -1935,6 +1951,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	if (sh)
 		mlx5_free_shared_dev_ctx(sh);
+	if (nl_rdma >= 0)
+		close(nl_rdma);
 	MLX5_ASSERT(err > 0);
 	rte_errno = err;
 	return NULL;
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 3/8] net/mlx5: improve Verbs flow priority discover for scalable
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
@ 2021-10-16  8:07   ` Xueming Li
  2021-10-19  8:26     ` Slava Ovsiienko
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
                     ` (4 subsequent siblings)
  7 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

To detect the number of Verbs flow priorities, the PMD tries to create
Verbs flows with different priorities. However, Verbs is not designed to
support port numbers larger than 255.

When DevX is supported by the kernel driver, 16 Verbs priorities must be
supported, so there is no need to create Verbs flows.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow_verbs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index b93fd4d2c96..178eabed163 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -83,6 +83,11 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
 	int i;
 	int priority = 0;
 
+#if defined(HAVE_MLX5DV_DR_DEVX_PORT) || defined(HAVE_MLX5DV_DR_DEVX_PORT_V35)
+	/* If DevX supported, driver must support 16 verbs flow priorities. */
+	priority = RTE_DIM(priority_map_5);
+	goto out;
+#endif
 	if (!drop->qp) {
 		rte_errno = ENOTSUP;
 		return -rte_errno;
@@ -109,6 +114,9 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
 			dev->data->port_id, priority);
 		return -rte_errno;
 	}
+#if defined(HAVE_MLX5DV_DR_DEVX_PORT) || defined(HAVE_MLX5DV_DR_DEVX_PORT_V35)
+out:
+#endif
 	DRV_LOG(INFO, "port %u supported flow priorities:"
 		" 0-%d for ingress or egress root table,"
 		" 0-%d for non-root table or transfer root table.",
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 4/8] net/mlx5: support E-Switch manager egress traffic match
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (2 preceding siblings ...)
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
@ 2021-10-16  8:07   ` Xueming Li
  2021-10-19  8:26     ` Slava Ovsiienko
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
                     ` (3 subsequent siblings)
  7 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

For an egress packet on a representor, the vport ID in the transport
domain is the E-Switch manager vport ID, since a representor shares the
resources of the E-Switch manager. The E-Switch manager vport ID and the
Tx queue internal device index are used to match representor egress
packets.

This patch adds flow item port ID match on E-Switch manager.

E-Switch manager vport ID is 0xfffe on BlueField, 0 otherwise.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow.h    |  3 +++
 drivers/net/mlx5/mlx5_flow_dv.c | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 5c68d4f7d74..c25af8d9864 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -18,6 +18,9 @@
 
 #include "mlx5.h"
 
+/* E-Switch Manager port, used for rte_flow_item_port_id. */
+#define MLX5_PORT_ESW_MGR UINT32_MAX
+
 /* Private rte flow items. */
 enum mlx5_rte_flow_item_type {
 	MLX5_RTE_FLOW_ITEM_TYPE_END = INT_MIN,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index c6370cd1d68..f06ce54f7e7 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -15,6 +15,7 @@
 #include <rte_flow_driver.h>
 #include <rte_malloc.h>
 #include <rte_cycles.h>
+#include <rte_bus_pci.h>
 #include <rte_ip.h>
 #include <rte_gre.h>
 #include <rte_vxlan.h>
@@ -92,6 +93,23 @@ static int
 flow_dv_jump_tbl_resource_release(struct rte_eth_dev *dev,
 				  uint32_t rix_jump);
 
+static int16_t
+flow_dv_get_esw_manager_vport_id(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (priv->pci_dev == NULL)
+		return 0;
+	switch (priv->pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX7BF:
+		return (int16_t)0xfffe;
+	default:
+		return 0;
+	}
+}
+
 /**
  * Initialize flow attributes structure according to flow items' types.
  *
@@ -2224,6 +2242,8 @@ flow_dv_validate_item_port_id(struct rte_eth_dev *dev,
 		return ret;
 	if (!spec)
 		return 0;
+	if (spec->id == MLX5_PORT_ESW_MGR)
+		return 0;
 	esw_priv = mlx5_port_to_eswitch_info(spec->id, false);
 	if (!esw_priv)
 		return rte_flow_error_set(error, rte_errno,
@@ -9685,6 +9705,11 @@ flow_dv_translate_item_port_id(struct rte_eth_dev *dev, void *matcher,
 	struct mlx5_priv *priv;
 	uint16_t mask, id;
 
+	if (pid_v && pid_v->id == MLX5_PORT_ESW_MGR) {
+		flow_dv_translate_item_source_vport(matcher, key,
+			flow_dv_get_esw_manager_vport_id(dev), 0xffff);
+		return 0;
+	}
 	mask = pid_m ? pid_m->id : 0xffff;
 	id = pid_v ? pid_v->id : dev->data->port_id;
 	priv = mlx5_port_to_eswitch_info(id, item == NULL);
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 5/8] net/mlx5: supports flow item of normal Tx queue
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (3 preceding siblings ...)
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
@ 2021-10-16  8:07   ` Xueming Li
  2021-10-19  8:27     ` Slava Ovsiienko
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
                     ` (2 subsequent siblings)
  7 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

Extends txq flow pattern to support both hairpin and regular txq.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow_dv.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index f06ce54f7e7..4a17ca64a2e 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -10910,22 +10910,22 @@ flow_dv_translate_item_tx_queue(struct rte_eth_dev *dev,
 	void *misc_v =
 		MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
 	struct mlx5_txq_ctrl *txq;
-	uint32_t queue;
-
+	uint32_t queue, mask;
 
 	queue_m = (const void *)item->mask;
-	if (!queue_m)
-		return;
 	queue_v = (const void *)item->spec;
 	if (!queue_v)
 		return;
 	txq = mlx5_txq_get(dev, queue_v->queue);
 	if (!txq)
 		return;
-	queue = txq->obj->sq->id;
-	MLX5_SET(fte_match_set_misc, misc_m, source_sqn, queue_m->queue);
-	MLX5_SET(fte_match_set_misc, misc_v, source_sqn,
-		 queue & queue_m->queue);
+	if (txq->type == MLX5_TXQ_TYPE_HAIRPIN)
+		queue = txq->obj->sq->id;
+	else
+		queue = txq->obj->sq_obj.sq->id;
+	mask = queue_m == NULL ? UINT32_MAX : queue_m->queue;
+	MLX5_SET(fte_match_set_misc, misc_m, source_sqn, mask);
+	MLX5_SET(fte_match_set_misc, misc_v, source_sqn, queue & mask);
 	mlx5_txq_release(dev, queue_v->queue);
 }
 
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 6/8] net/mlx5: fix internal root table flow priroity
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (4 preceding siblings ...)
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
@ 2021-10-16  8:07   ` Xueming Li
  2021-10-19  8:28     ` Slava Ovsiienko
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
  7 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad, Dong Zhou

When creating an internal transfer flow on the root table with the lowest
priority, the flow was created with max UINT32_MAX priority. It is wrong
since the flow is created in the kernel and the max priority supported is 16.

This patch fixes this by adding internal flow check.

Fixes: 5f8ae44dd454 ("net/mlx5: enlarge maximal flow priority")

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow.c    | 7 ++++++-
 drivers/net/mlx5/mlx5_flow.h    | 4 ++--
 drivers/net/mlx5/mlx5_flow_dv.c | 3 ++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index c914a7120cc..b5232cd46ae 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -980,13 +980,15 @@ mlx5_get_lowest_priority(struct rte_eth_dev *dev,
  *   Pointer to device flow rule attributes.
  * @param[in] subpriority
  *   The priority based on the items.
+ * @param[in] external
+ *   Flow is user flow.
  * @return
  *   The matcher priority of the flow.
  */
 uint16_t
 mlx5_get_matcher_priority(struct rte_eth_dev *dev,
 			  const struct rte_flow_attr *attr,
-			  uint32_t subpriority)
+			  uint32_t subpriority, bool external)
 {
 	uint16_t priority = (uint16_t)attr->priority;
 	struct mlx5_priv *priv = dev->data->dev_private;
@@ -995,6 +997,9 @@ mlx5_get_matcher_priority(struct rte_eth_dev *dev,
 		if (attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR)
 			priority = priv->config.flow_prio - 1;
 		return mlx5_os_flow_adjust_priority(dev, priority, subpriority);
+	} else if (!external && attr->transfer && attr->group == 0 &&
+		   attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR) {
+		return (priv->config.flow_prio - 1) * 3;
 	}
 	if (attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR)
 		priority = MLX5_NON_ROOT_FLOW_MAX_PRIO;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index c25af8d9864..f1a83d537d0 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -1431,8 +1431,8 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 uint32_t mlx5_get_lowest_priority(struct rte_eth_dev *dev,
 					const struct rte_flow_attr *attr);
 uint16_t mlx5_get_matcher_priority(struct rte_eth_dev *dev,
-				     const struct rte_flow_attr *attr,
-				     uint32_t subpriority);
+				   const struct rte_flow_attr *attr,
+				   uint32_t subpriority, bool external);
 int mlx5_flow_get_reg_id(struct rte_eth_dev *dev,
 				     enum mlx5_feature_name feature,
 				     uint32_t id,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 4a17ca64a2e..ffc1fc8a05c 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -13646,7 +13646,8 @@ flow_dv_translate(struct rte_eth_dev *dev,
 	matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf,
 				    matcher.mask.size);
 	matcher.priority = mlx5_get_matcher_priority(dev, attr,
-					matcher.priority);
+						     matcher.priority,
+						     dev_flow->external);
 	/**
 	 * When creating meter drop flow in drop table, using original
 	 * 5-tuple match, the matcher priority should be lower than
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 7/8] net/mlx5: enable DevX Tx queue creation
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (5 preceding siblings ...)
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
@ 2021-10-16  8:07   ` Xueming Li
  2021-10-19  8:29     ` Slava Ovsiienko
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
  7 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

Verbs API does not support an Infiniband device port number larger than 255
by design. To support more representors on a single Infiniband device, the
DevX API should be engaged.

While creating Send Queue (SQ) object with Verbs API, the PMD assigned
IB device port attribute and kernel created the default miss flows in
FDB domain, to redirect egress traffic from the queue being created to
representor appropriate peer (wire, HPF, VF or SF).

With DevX API there is no IB-device port attribute (it is merely kernel
one, DevX operates in PRM terms) and PMD must create default miss flows
in FDB explicitly. PMD did not provide this and using DevX API for
E-Switch configurations was disabled.

The default miss FDB flow matches E-Switch manager vport (to make sure
the source is some representor) and SQn (Send Queue number - device
internal queue index). The root flow table is managed by kernel/firmware
and does not support the vport redirect action, so we have to split the
default miss flow into two:

- flow with lowest priority in the root table that matches E-Switch
manager vport ID and jump to group 1.
- flow in group 1 that matches E-Switch manager vport ID and SQn and
forwards packet to peer vport

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 62 +-------------------------
 drivers/net/mlx5/mlx5.h          |  2 +
 drivers/net/mlx5/mlx5_devx.c     | 10 ++---
 drivers/net/mlx5/mlx5_devx.h     |  2 +
 drivers/net/mlx5/mlx5_flow.c     | 74 ++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_trigger.c  | 11 ++++-
 6 files changed, 94 insertions(+), 67 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index f283a3779cc..93ee9318ebc 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -697,56 +697,6 @@ mlx5_init_once(void)
 	return ret;
 }
 
-/**
- * Create the Tx queue DevX/Verbs object.
- *
- * @param dev
- *   Pointer to Ethernet device.
- * @param idx
- *   Queue index in DPDK Tx queue array.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
-	struct mlx5_txq_ctrl *txq_ctrl =
-			container_of(txq_data, struct mlx5_txq_ctrl, txq);
-
-	if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
-		return mlx5_txq_devx_obj_new(dev, idx);
-#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
-	if (!priv->config.dv_esw_en)
-		return mlx5_txq_devx_obj_new(dev, idx);
-#endif
-	return mlx5_txq_ibv_obj_new(dev, idx);
-}
-
-/**
- * Release an Tx DevX/verbs queue object.
- *
- * @param txq_obj
- *   DevX/Verbs Tx queue object.
- */
-static void
-mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
-{
-	if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
-		mlx5_txq_devx_obj_release(txq_obj);
-		return;
-	}
-#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
-	if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
-		mlx5_txq_devx_obj_release(txq_obj);
-		return;
-	}
-#endif
-	mlx5_txq_ibv_obj_release(txq_obj);
-}
-
 /**
  * DV flow counter mode detect and config.
  *
@@ -1812,16 +1762,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 						ibv_obj_ops.drop_action_create;
 		priv->obj_ops.drop_action_destroy =
 						ibv_obj_ops.drop_action_destroy;
-#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
-		priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
-#else
-		if (config->dv_esw_en)
-			priv->obj_ops.txq_obj_modify =
-						ibv_obj_ops.txq_obj_modify;
-#endif
-		/* Use specific wrappers for Tx object. */
-		priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
-		priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
 		mlx5_queue_counter_id_prepare(eth_dev);
 		priv->obj_ops.lb_dummy_queue_create =
 					mlx5_rxq_ibv_obj_dummy_lb_create;
@@ -1832,7 +1772,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	if (config->tx_pp &&
 	    (priv->config.dv_esw_en ||
-	     priv->obj_ops.txq_obj_new != mlx5_os_txq_obj_new)) {
+	     priv->obj_ops.txq_obj_new != mlx5_txq_devx_obj_new)) {
 		/*
 		 * HAVE_MLX5DV_DEVX_UAR_OFFSET is required to support
 		 * packet pacing and already checked above.
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3581414b789..570f827375a 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1699,6 +1699,8 @@ int mlx5_ctrl_flow(struct rte_eth_dev *dev,
 		   struct rte_flow_item_eth *eth_mask);
 int mlx5_flow_lacp_miss(struct rte_eth_dev *dev);
 struct rte_flow *mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev);
+uint32_t mlx5_flow_create_devx_sq_miss_flow(struct rte_eth_dev *dev,
+					    uint32_t txq);
 void mlx5_flow_async_pool_query_handle(struct mlx5_dev_ctx_shared *sh,
 				       uint64_t async_id, int status);
 void mlx5_set_query_alarm(struct mlx5_dev_ctx_shared *sh);
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index a1db53577a2..a49602cb957 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -102,9 +102,9 @@ mlx5_devx_modify_rq(struct mlx5_rxq_obj *rxq_obj, uint8_t type)
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-static int
-mlx5_devx_modify_sq(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
-		    uint8_t dev_port)
+int
+mlx5_txq_devx_modify(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
+		     uint8_t dev_port)
 {
 	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
 	int ret;
@@ -1118,7 +1118,7 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
 	*txq_data->qp_db = 0;
 	txq_data->qp_num_8s = txq_obj->sq_obj.sq->id << 8;
 	/* Change Send Queue state to Ready-to-Send. */
-	ret = mlx5_devx_modify_sq(txq_obj, MLX5_TXQ_MOD_RST2RDY, 0);
+	ret = mlx5_txq_devx_modify(txq_obj, MLX5_TXQ_MOD_RST2RDY, 0);
 	if (ret) {
 		rte_errno = errno;
 		DRV_LOG(ERR,
@@ -1187,7 +1187,7 @@ struct mlx5_obj_ops devx_obj_ops = {
 	.drop_action_create = mlx5_devx_drop_action_create,
 	.drop_action_destroy = mlx5_devx_drop_action_destroy,
 	.txq_obj_new = mlx5_txq_devx_obj_new,
-	.txq_obj_modify = mlx5_devx_modify_sq,
+	.txq_obj_modify = mlx5_txq_devx_modify,
 	.txq_obj_release = mlx5_txq_devx_obj_release,
 	.lb_dummy_queue_create = NULL,
 	.lb_dummy_queue_release = NULL,
diff --git a/drivers/net/mlx5/mlx5_devx.h b/drivers/net/mlx5/mlx5_devx.h
index bc8a8d6b73c..a95207a6b9a 100644
--- a/drivers/net/mlx5/mlx5_devx.h
+++ b/drivers/net/mlx5/mlx5_devx.h
@@ -8,6 +8,8 @@
 #include "mlx5.h"
 
 int mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_txq_devx_modify(struct mlx5_txq_obj *obj,
+			 enum mlx5_txq_modify_type type, uint8_t dev_port);
 void mlx5_txq_devx_obj_release(struct mlx5_txq_obj *txq_obj);
 
 extern struct mlx5_obj_ops devx_obj_ops;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index b5232cd46ae..1528f8c6b51 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -6573,6 +6573,80 @@ mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev)
 						   actions, false, &error);
 }
 
+/**
+ * Create a dedicated flow rule on e-switch table 1, matches ESW manager
+ * and sq number, directs all packets to peer vport.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param txq
+ *   Txq index.
+ *
+ * @return
+ *   Flow ID on success, 0 otherwise and rte_errno is set.
+ */
+uint32_t
+mlx5_flow_create_devx_sq_miss_flow(struct rte_eth_dev *dev, uint32_t txq)
+{
+	struct rte_flow_attr attr = {
+		.group = 0,
+		.priority = MLX5_FLOW_LOWEST_PRIO_INDICATOR,
+		.ingress = 1,
+		.egress = 0,
+		.transfer = 1,
+	};
+	struct rte_flow_item_port_id port_spec = {
+		.id = MLX5_PORT_ESW_MGR,
+	};
+	struct mlx5_rte_flow_item_tx_queue txq_spec = {
+		.queue = txq,
+	};
+	struct rte_flow_item pattern[] = {
+		{
+			.type = RTE_FLOW_ITEM_TYPE_PORT_ID,
+			.spec = &port_spec,
+		},
+		{
+			.type = (enum rte_flow_item_type)
+				MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE,
+			.spec = &txq_spec,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+	};
+	struct rte_flow_action_jump jump = {
+		.group = 1,
+	};
+	struct rte_flow_action_port_id port = {
+		.id = dev->data->port_id,
+	};
+	struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_JUMP,
+			.conf = &jump,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	struct rte_flow_error error;
+
+	/*
+	 * Creates group 0, highest priority jump flow.
+	 * Matches txq to bypass kernel packets.
+	 */
+	if (flow_list_create(dev, MLX5_FLOW_TYPE_CTL, &attr, pattern, actions,
+			     false, &error) == 0)
+		return 0;
+	/* Create group 1, lowest priority redirect flow for txq. */
+	attr.group = 1;
+	actions[0].conf = &port;
+	actions[0].type = RTE_FLOW_ACTION_TYPE_PORT_ID;
+	return flow_list_create(dev, MLX5_FLOW_TYPE_CTL, &attr, pattern,
+				actions, false, &error);
+}
+
 /**
  * Validate a flow supported by the NIC.
  *
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 54173bfacb2..42d8bb31128 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -1255,9 +1255,18 @@ mlx5_traffic_enable(struct rte_eth_dev *dev)
 				goto error;
 			}
 		}
+		if ((priv->representor || priv->master) &&
+		    priv->config.dv_esw_en) {
+			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
+				DRV_LOG(ERR,
+					"Port %u Tx queue %u SQ create representor devx default miss rule failed.",
+					dev->data->port_id, i);
+				goto error;
+			}
+		}
 		mlx5_txq_release(dev, i);
 	}
-	if (priv->config.dv_esw_en && !priv->config.vf && !priv->config.sf) {
+	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
 		if (mlx5_flow_create_esw_table_zero_flow(dev))
 			priv->fdb_def_rule = 1;
 		else
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v2 8/8] net/mlx5: check DevX to support more Verbs ports
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (6 preceding siblings ...)
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
@ 2021-10-16  8:07   ` Xueming Li
  2021-10-19  8:30     ` Slava Ovsiienko
  7 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-16  8:07 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

Verbs API doesn't support device port number larger than 255 by design.

To support more VF or SubFunction port representors, force a DevX API
check when the maximum number of Verbs device link ports is larger than 255.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 93ee9318ebc..39a9722d869 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -1299,12 +1299,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		config->dv_flow_en = 0;
 	}
 #endif
-	if (spawn->max_port > UINT8_MAX) {
-		/* Verbs can't support ports larger than 255 by design. */
-		DRV_LOG(ERR, "can't support IB ports > UINT8_MAX");
-		err = EINVAL;
-		goto error;
-	}
 	config->ind_table_max_size =
 		sh->device_attr.max_rwq_indirection_table_size;
 	/*
@@ -1767,6 +1761,11 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 					mlx5_rxq_ibv_obj_dummy_lb_create;
 		priv->obj_ops.lb_dummy_queue_release =
 					mlx5_rxq_ibv_obj_dummy_lb_release;
+	} else if (spawn->max_port > UINT8_MAX) {
+		/* Verbs can't support ports larger than 255 by design. */
+		DRV_LOG(ERR, "must enable DV and ESW when RDMA link ports > 255");
+		err = ENOTSUP;
+		goto error;
 	} else {
 		priv->obj_ops = ibv_obj_ops;
 	}
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/8] common/mlx5: add netlink API to get RDMA port state
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
@ 2021-10-19  8:23     ` Slava Ovsiienko
  0 siblings, 0 replies; 47+ messages in thread
From: Slava Ovsiienko @ 2021-10-19  8:23 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev; +Cc: Lior Margalit, Matan Azrad, Ray Kinsella

> -----Original Message-----
> From: Xueming(Steven) Li <xuemingl@nvidia.com>
> Sent: Saturday, October 16, 2021 11:07
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>; Matan
> Azrad <matan@nvidia.com>; Ray Kinsella <mdr@ashroe.eu>
> Subject: [PATCH v2 1/8] common/mlx5: add netlink API to get RDMA port
> state
> 
> Introduce netlink API to get rdma port state.
> 
> Port state is restrieved based on RDMA device name and port index.
> 
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/8] net/mlx5: use netlink when IB port greater than 255
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
@ 2021-10-19  8:24     ` Slava Ovsiienko
  0 siblings, 0 replies; 47+ messages in thread
From: Slava Ovsiienko @ 2021-10-19  8:24 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev; +Cc: Lior Margalit, Matan Azrad

> -----Original Message-----
> From: Xueming(Steven) Li <xuemingl@nvidia.com>
> Sent: Saturday, October 16, 2021 11:07
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>; Matan
> Azrad <matan@nvidia.com>
> Subject: [PATCH v2 2/8] net/mlx5: use netlink when IB port greater than 255
> 
> The IB spec doesn't allow more than 255 ports on a single HCA; a port number of
> 256 was cast to the u8 value 0, which is invalid for ibv_query_port().
> 
> This patch invokes Netlink api to query port state when port number greater
> than 255.
> 
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v2 3/8] net/mlx5: improve Verbs flow priority discover for scalable
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
@ 2021-10-19  8:26     ` Slava Ovsiienko
  0 siblings, 0 replies; 47+ messages in thread
From: Slava Ovsiienko @ 2021-10-19  8:26 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev; +Cc: Lior Margalit, Matan Azrad

> -----Original Message-----
> From: Xueming(Steven) Li <xuemingl@nvidia.com>
> Sent: Saturday, October 16, 2021 11:07
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>; Matan
> Azrad <matan@nvidia.com>
> Subject: [PATCH v2 3/8] net/mlx5: improve Verbs flow priority discover for
> scalable
> 
> To detect the number of Verbs flow priorities, the PMD tries to create Verbs
> flows with different priorities, but Verbs is not designed to support ports
> larger than 255.
> 
> When DevX is supported by the kernel driver, 16 Verbs priorities must be
> supported, so there is no need to create Verbs flows.
> 
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v2 4/8] net/mlx5: support E-Switch manager egress traffic match
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
@ 2021-10-19  8:26     ` Slava Ovsiienko
  0 siblings, 0 replies; 47+ messages in thread
From: Slava Ovsiienko @ 2021-10-19  8:26 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev; +Cc: Lior Margalit, Matan Azrad

> -----Original Message-----
> From: Xueming(Steven) Li <xuemingl@nvidia.com>
> Sent: Saturday, October 16, 2021 11:07
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>; Matan
> Azrad <matan@nvidia.com>
> Subject: [PATCH v2 4/8] net/mlx5: support E-Switch manager egress traffic
> match
> 
> For egress packet on representor, the vport ID in transport domain is E-Switch
> manager vport ID since representor shares resources of E-Switch manager. E-
> Switch manager vport ID and Tx queue internal device index are used to
> match representor egress packet.
> 
> This patch adds flow item port ID match on E-Switch manager.
> 
> E-Switch manager vport ID is 0xfffe on BlueField, 0 otherwise.
> 
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v2 5/8] net/mlx5: supports flow item of normal Tx queue
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
@ 2021-10-19  8:27     ` Slava Ovsiienko
  0 siblings, 0 replies; 47+ messages in thread
From: Slava Ovsiienko @ 2021-10-19  8:27 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev; +Cc: Lior Margalit, Matan Azrad

> -----Original Message-----
> From: Xueming(Steven) Li <xuemingl@nvidia.com>
> Sent: Saturday, October 16, 2021 11:07
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>; Matan
> Azrad <matan@nvidia.com>
> Subject: [PATCH v2 5/8] net/mlx5: supports flow item of normal Tx queue
> 
> Extends txq flow pattern to support both hairpin and regular txq.
> 
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/8] net/mlx5: fix internal root table flow priroity
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
@ 2021-10-19  8:28     ` Slava Ovsiienko
  0 siblings, 0 replies; 47+ messages in thread
From: Slava Ovsiienko @ 2021-10-19  8:28 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev; +Cc: Lior Margalit, Matan Azrad, Bill Zhou

> -----Original Message-----
> From: Xueming(Steven) Li <xuemingl@nvidia.com>
> Sent: Saturday, October 16, 2021 11:08
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>; Matan
> Azrad <matan@nvidia.com>; Bill Zhou <dongzhou@nvidia.com>
> Subject: [PATCH v2 6/8] net/mlx5: fix internal root table flow priroity
> 
> When creating an internal transfer flow on the root table with the lowest
> priority, the flow was created with max UINT32_MAX priority. It is wrong since
> the flow is created in the kernel and the max priority supported is 16.
> 
> This patch fixes this by adding internal flow check.
> 
> Fixes: 5f8ae44dd454 ("net/mlx5: enlarge maximal flow priority")
> 
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v2 7/8] net/mlx5: enable DevX Tx queue creation
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
@ 2021-10-19  8:29     ` Slava Ovsiienko
  0 siblings, 0 replies; 47+ messages in thread
From: Slava Ovsiienko @ 2021-10-19  8:29 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev; +Cc: Lior Margalit, Matan Azrad

> -----Original Message-----
> From: Xueming(Steven) Li <xuemingl@nvidia.com>
> Sent: Saturday, October 16, 2021 11:08
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>; Matan
> Azrad <matan@nvidia.com>
> Subject: [PATCH v2 7/8] net/mlx5: enable DevX Tx queue creation
> 
> Verbs API does not support an Infiniband device port number larger than 255
> by design. To support more representors on a single Infiniband device, the
> DevX API should be engaged.
> 
> While creating Send Queue (SQ) object with Verbs API, the PMD assigned IB
> device port attribute and kernel created the default miss flows in FDB domain,
> to redirect egress traffic from the queue being created to representor
> appropriate peer (wire, HPF, VF or SF).
> 
> With DevX API there is no IB-device port attribute (it is merely kernel one,
> DevX operates in PRM terms) and PMD must create default miss flows in FDB
> explicitly. PMD did not provide this and using DevX API for E-Switch
> configurations was disabled.
> 
> The default miss FDB flow matches E-Switch manager vport (to make sure the
> source is some representor) and SQn (Send Queue number - device internal
> queue index). The root flow table managed by kernel/firmware and it does
> not support vport redirect action, we have to split the default miss flow into
> two ones:
> 
> - flow with lowest priority in the root table that matches E-Switch manager
> vport ID and jump to group 1.
> - flow in group 1 that matches E-Switch manager vport ID and SQn and
> forwards packet to peer vport
> 
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v2 8/8] net/mlx5: check DevX to support more Verbs ports
  2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
@ 2021-10-19  8:30     ` Slava Ovsiienko
  0 siblings, 0 replies; 47+ messages in thread
From: Slava Ovsiienko @ 2021-10-19  8:30 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev; +Cc: Lior Margalit, Matan Azrad

> -----Original Message-----
> From: Xueming(Steven) Li <xuemingl@nvidia.com>
> Sent: Saturday, October 16, 2021 11:08
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>; Matan
> Azrad <matan@nvidia.com>
> Subject: [PATCH v2 8/8] net/mlx5: check DevX to support more Verbs ports
> 
> Verbs API doesn't support device port number larger than 255 by design.
> 
> To support more VF or SubFunction port representors, forces DevX API check
> when max Verbs device link ports larger than 255.
> 
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (8 preceding siblings ...)
  2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
@ 2021-10-19 10:34 ` Xueming Li
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
                     ` (8 more replies)
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
  10 siblings, 9 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:34 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit

This patch set allows the representor number of a PF to be more than 255.
CX6 and the current OFED driver support a maximum of 512 SFs. CX5 supports a max of 255 SFs.

v2:
 - fixed FDB root table flow priority
 - add error check to Netlink port state API
 - commit log update and other minor fixes
v3:
 - fix compilation issue

Xueming Li (8):
  common/mlx5: add netlink API to get RDMA port state
  net/mlx5: use netlink when IB port greater than 255
  net/mlx5: improve Verbs flow priority discover for scalable
  net/mlx5: support E-Switch manager egress traffic match
  net/mlx5: supports flow item of normal Tx queue
  net/mlx5: fix internal root table flow priroity
  net/mlx5: enable DevX Tx queue creation
  net/mlx5: check DevX to support more Verbs ports

 drivers/common/mlx5/linux/meson.build |   2 +
 drivers/common/mlx5/linux/mlx5_nl.c   | 136 +++++++++++++++++++-------
 drivers/common/mlx5/linux/mlx5_nl.h   |   2 +
 drivers/common/mlx5/version.map       |   1 +
 drivers/net/mlx5/linux/mlx5_os.c      | 119 +++++++---------------
 drivers/net/mlx5/mlx5.h               |   2 +
 drivers/net/mlx5/mlx5_devx.c          |  10 +-
 drivers/net/mlx5/mlx5_devx.h          |   2 +
 drivers/net/mlx5/mlx5_flow.c          |  81 ++++++++++++++-
 drivers/net/mlx5/mlx5_flow.h          |   7 +-
 drivers/net/mlx5/mlx5_flow_dv.c       |  44 +++++++--
 drivers/net/mlx5/mlx5_flow_verbs.c    |   8 ++
 drivers/net/mlx5/mlx5_trigger.c       |  11 ++-
 13 files changed, 291 insertions(+), 134 deletions(-)

-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 1/8] common/mlx5: add netlink API to get RDMA port state
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
@ 2021-10-19 10:34   ` Xueming Li
  2021-10-21 13:34     ` Ferruh Yigit
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
                     ` (7 subsequent siblings)
  8 siblings, 1 reply; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:34 UTC (permalink / raw)
  To: dev
  Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad, Ray Kinsella

Introduce a netlink API to get the RDMA port state.

The port state is retrieved based on the RDMA device name and port index.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/common/mlx5/linux/meson.build |   2 +
 drivers/common/mlx5/linux/mlx5_nl.c   | 136 +++++++++++++++++++-------
 drivers/common/mlx5/linux/mlx5_nl.h   |   2 +
 drivers/common/mlx5/version.map       |   1 +
 4 files changed, 106 insertions(+), 35 deletions(-)

diff --git a/drivers/common/mlx5/linux/meson.build b/drivers/common/mlx5/linux/meson.build
index cbea58f557d..2dcd27b7786 100644
--- a/drivers/common/mlx5/linux/meson.build
+++ b/drivers/common/mlx5/linux/meson.build
@@ -175,6 +175,8 @@ has_sym_args = [
             'RDMA_NLDEV_ATTR_DEV_NAME' ],
         [ 'HAVE_RDMA_NLDEV_ATTR_PORT_INDEX', 'rdma/rdma_netlink.h',
             'RDMA_NLDEV_ATTR_PORT_INDEX' ],
+        [ 'HAVE_RDMA_NLDEV_ATTR_PORT_STATE', 'rdma/rdma_netlink.h',
+            'RDMA_NLDEV_ATTR_PORT_STATE' ],
         [ 'HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX', 'rdma/rdma_netlink.h',
             'RDMA_NLDEV_ATTR_NDEV_INDEX' ],
         [ 'HAVE_MLX5_DR_FLOW_DUMP', 'infiniband/mlx5dv.h',
diff --git a/drivers/common/mlx5/linux/mlx5_nl.c b/drivers/common/mlx5/linux/mlx5_nl.c
index 9120a697fd5..4b762850941 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.c
+++ b/drivers/common/mlx5/linux/mlx5_nl.c
@@ -78,6 +78,9 @@
 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
 #endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
+#define RDMA_NLDEV_ATTR_PORT_STATE 12
+#endif
 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
 #endif
@@ -160,14 +163,16 @@ struct mlx5_nl_mac_addr {
 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
+#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)
 
 /** Data structure used by mlx5_nl_cmdget_cb(). */
-struct mlx5_nl_ifindex_data {
+struct mlx5_nl_port_info {
 	const char *name; /**< IB device name (in). */
 	uint32_t flags; /**< found attribute flags (out). */
 	uint32_t ibindex; /**< IB device index (out). */
 	uint32_t ifindex; /**< Network interface index (out). */
 	uint32_t portnum; /**< IB device max port number (out). */
+	uint16_t state; /**< IB device port state (out). */
 };
 
 uint32_t atomic_sn;
@@ -966,8 +971,8 @@ mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
 static int
 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 {
-	struct mlx5_nl_ifindex_data *data = arg;
-	struct mlx5_nl_ifindex_data local = {
+	struct mlx5_nl_port_info *data = arg;
+	struct mlx5_nl_port_info local = {
 		.flags = 0,
 	};
 	size_t off = NLMSG_HDRLEN;
@@ -1000,6 +1005,10 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 			local.portnum = *(uint32_t *)payload;
 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
 			break;
+		case RDMA_NLDEV_ATTR_PORT_STATE:
+			local.state = *(uint8_t *)payload;
+			local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
+			break;
 		default:
 			break;
 		}
@@ -1016,6 +1025,7 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 		data->ibindex = local.ibindex;
 		data->ifindex = local.ifindex;
 		data->portnum = local.portnum;
+		data->state = local.state;
 	}
 	return 0;
 error:
@@ -1024,7 +1034,7 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 }
 
 /**
- * Get index of network interface associated with some IB device.
+ * Get port info of network interface associated with some IB device.
  *
  * This is the only somewhat safe method to avoid resorting to heuristics
  * when faced with port representors. Unfortunately it requires at least
@@ -1032,27 +1042,20 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
  *
  * @param nl
  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
- * @param[in] name
- *   IB device name.
  * @param[in] pindex
  *   IB device port index, starting from 1
+ * @param[out] data
+ *   Pointer to port info.
  * @return
- *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
- *   is set.
+ *   0 on success, negative on error and rte_errno is set.
  */
-unsigned int
-mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+static int
+mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
 {
-	struct mlx5_nl_ifindex_data data = {
-		.name = name,
-		.flags = 0,
-		.ibindex = 0, /* Determined during first pass. */
-		.ifindex = 0, /* Determined during second pass. */
-	};
 	union {
 		struct nlmsghdr nh;
 		uint8_t buf[NLMSG_HDRLEN +
-			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
+			    NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
 	} req = {
 		.nh = {
@@ -1068,24 +1071,24 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 
 	ret = mlx5_nl_send(nl, &req.nh, sn);
 	if (ret < 0)
-		return 0;
-	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+		return ret;
+	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
 	if (ret < 0)
-		return 0;
-	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
+		return ret;
+	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
+	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
 		goto error;
-	data.flags = 0;
+	data->flags = 0;
 	sn = MLX5_NL_SN_GENERATE;
 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
 					     RDMA_NLDEV_CMD_PORT_GET);
 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
-	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
+	na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
-	       &data.ibindex, sizeof(data.ibindex));
+	       &data->ibindex, sizeof(data->ibindex));
 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
@@ -1093,19 +1096,82 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 	       &pindex, sizeof(pindex));
 	ret = mlx5_nl_send(nl, &req.nh, sn);
 	if (ret < 0)
-		return 0;
-	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+		return ret;
+	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
 	if (ret < 0)
-		return 0;
-	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
-	    !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
-	    !data.ifindex)
+		return ret;
+	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
+	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
+	    !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
+	    !data->ifindex)
 		goto error;
-	return data.ifindex;
+	return 1;
 error:
 	rte_errno = ENODEV;
-	return 0;
+	return -rte_errno;
+}
+
+/**
+ * Get index of network interface associated with some IB device.
+ *
+ * This is the only somewhat safe method to avoid resorting to heuristics
+ * when faced with port representors. Unfortunately it requires at least
+ * Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
+ *   is set.
+ */
+unsigned int
+mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+{
+	struct mlx5_nl_port_info data = {
+			.ifindex = 0,
+			.name = name,
+	};
+
+	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
+		return 0;
+	return data.ifindex;
+}
+
+/**
+ * Get IB device port state.
+ *
+ * This is the only somewhat safe method to get info for port number >= 255.
+ * Unfortunately it requires at least Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   Port state (ibv_port_state) on success, negative on error
+ *   and rte_errno is set.
+ */
+int
+mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
+{
+	struct mlx5_nl_port_info data = {
+			.state = 0,
+			.name = name,
+	};
+
+	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
+		return -rte_errno;
+	if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
+		rte_errno = ENOTSUP;
+		return -rte_errno;
+	}
+	return (int)data.state;
 }
 
 /**
@@ -1123,7 +1189,7 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 unsigned int
 mlx5_nl_portnum(int nl, const char *name)
 {
-	struct mlx5_nl_ifindex_data data = {
+	struct mlx5_nl_port_info data = {
 		.flags = 0,
 		.name = name,
 		.ifindex = 0,
diff --git a/drivers/common/mlx5/linux/mlx5_nl.h b/drivers/common/mlx5/linux/mlx5_nl.h
index 15129ffdc88..396f8f3f20a 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.h
+++ b/drivers/common/mlx5/linux/mlx5_nl.h
@@ -54,6 +54,8 @@ unsigned int mlx5_nl_portnum(int nl, const char *name);
 __rte_internal
 unsigned int mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex);
 __rte_internal
+int mlx5_nl_port_state(int nl, const char *name, uint32_t pindex);
+__rte_internal
 int mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
 			       struct rte_ether_addr *mac, int vf_index);
 __rte_internal
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index d3c5040aac8..2a2c7e51ba5 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -134,6 +134,7 @@ INTERNAL {
 	mlx5_nl_mac_addr_flush; # WINDOWS_NO_EXPORT
 	mlx5_nl_mac_addr_remove; # WINDOWS_NO_EXPORT
 	mlx5_nl_mac_addr_sync; # WINDOWS_NO_EXPORT
+	mlx5_nl_port_state; # WINDOWS_NO_EXPORT
 	mlx5_nl_portnum; # WINDOWS_NO_EXPORT
 	mlx5_nl_promisc; # WINDOWS_NO_EXPORT
 	mlx5_nl_switch_info; # WINDOWS_NO_EXPORT
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 2/8] net/mlx5: use netlink when IB port greater than 255
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
@ 2021-10-19 10:34   ` Xueming Li
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:34 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

The IB spec doesn't allow more than 255 ports on a single HCA; a port
number of 256 was cast to the u8 value 0, which is invalid for
ibv_query_port().

This patch invokes the Netlink API to query the port state when the port
number is greater than 255.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 46 ++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 3746057673d..f283a3779cc 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -956,7 +956,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 {
 	const struct mlx5_switch_info *switch_info = &spawn->info;
 	struct mlx5_dev_ctx_shared *sh = NULL;
-	struct ibv_port_attr port_attr;
+	struct ibv_port_attr port_attr = { .state = IBV_PORT_NOP };
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	struct rte_eth_dev *eth_dev = NULL;
 	struct mlx5_priv *priv = NULL;
@@ -976,6 +976,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	int own_domain_id = 0;
 	uint16_t port_id;
 	struct mlx5_port_info vport_info = { .query_flags = 0 };
+	int nl_rdma = -1;
 	int i;
 
 	/* Determine if this port representor is supposed to be spawned. */
@@ -1170,19 +1171,36 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		" old OFED/rdma-core version or firmware configuration");
 #endif
 	config->mpls_en = mpls_en;
+	nl_rdma = mlx5_nl_init(NETLINK_RDMA);
 	/* Check port status. */
-	err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr);
-	if (err) {
-		DRV_LOG(ERR, "port query failed: %s", strerror(err));
-		goto error;
-	}
-	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-		DRV_LOG(ERR, "port is not configured in Ethernet mode");
-		err = EINVAL;
-		goto error;
+	if (spawn->phys_port <= UINT8_MAX) {
+		/* Legacy Verbs api only support u8 port number. */
+		err = mlx5_glue->query_port(sh->ctx, spawn->phys_port,
+					    &port_attr);
+		if (err) {
+			DRV_LOG(ERR, "port query failed: %s", strerror(err));
+			goto error;
+		}
+		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+			DRV_LOG(ERR, "port is not configured in Ethernet mode");
+			err = EINVAL;
+			goto error;
+		}
+	} else if (nl_rdma >= 0) {
+		/* IB doesn't allow more than 255 ports, must be Ethernet. */
+		err = mlx5_nl_port_state(nl_rdma,
+			((struct ibv_device *)spawn->phys_dev)->name,
+			spawn->phys_port);
+		if (err < 0) {
+			DRV_LOG(INFO, "Failed to get netlink port state: %s",
+				strerror(rte_errno));
+			err = -rte_errno;
+			goto error;
+		}
+		port_attr.state = (enum ibv_port_state)err;
 	}
 	if (port_attr.state != IBV_PORT_ACTIVE)
-		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+		DRV_LOG(INFO, "port is not active: \"%s\" (%d)",
 			mlx5_glue->port_state_str(port_attr.state),
 			port_attr.state);
 	/* Allocate private eth device data. */
@@ -1199,7 +1217,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->pci_dev = spawn->pci_dev;
 	priv->mtu = RTE_ETHER_MTU;
 	/* Some internal functions rely on Netlink sockets, open them now. */
-	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
+	priv->nl_socket_rdma = nl_rdma;
 	priv->nl_socket_route =	mlx5_nl_init(NETLINK_ROUTE);
 	priv->representor = !!switch_info->representor;
 	priv->master = !!switch_info->master;
@@ -1910,8 +1928,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			mlx5_os_free_shared_dr(priv);
 		if (priv->nl_socket_route >= 0)
 			close(priv->nl_socket_route);
-		if (priv->nl_socket_rdma >= 0)
-			close(priv->nl_socket_rdma);
 		if (priv->vmwa_context)
 			mlx5_vlan_vmwa_exit(priv->vmwa_context);
 		if (eth_dev && priv->drop_queue.hrxq)
@@ -1935,6 +1951,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	if (sh)
 		mlx5_free_shared_dev_ctx(sh);
+	if (nl_rdma >= 0)
+		close(nl_rdma);
 	MLX5_ASSERT(err > 0);
 	rte_errno = err;
 	return NULL;
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 3/8] net/mlx5: improve Verbs flow priority discover for scalable
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
@ 2021-10-19 10:34   ` Xueming Li
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:34 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

To detect the number of Verbs flow priorities, the PMD tries to create
Verbs flows at different priorities, while Verbs is not designed to
support port numbers larger than 255.

When DevX is supported by the kernel driver, 16 Verbs priorities must be
supported, so there is no need to create Verbs flows.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow_verbs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index b93fd4d2c96..f265e176940 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -83,6 +83,11 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
 	int i;
 	int priority = 0;
 
+#if defined(HAVE_MLX5DV_DR_DEVX_PORT) || defined(HAVE_MLX5DV_DR_DEVX_PORT_V35)
+	/* If DevX supported, driver must support 16 verbs flow priorities. */
+	priority = 16;
+	goto out;
+#endif
 	if (!drop->qp) {
 		rte_errno = ENOTSUP;
 		return -rte_errno;
@@ -109,6 +114,9 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
 			dev->data->port_id, priority);
 		return -rte_errno;
 	}
+#if defined(HAVE_MLX5DV_DR_DEVX_PORT) || defined(HAVE_MLX5DV_DR_DEVX_PORT_V35)
+out:
+#endif
 	DRV_LOG(INFO, "port %u supported flow priorities:"
 		" 0-%d for ingress or egress root table,"
 		" 0-%d for non-root table or transfer root table.",
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 4/8] net/mlx5: support E-Switch manager egress traffic match
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (2 preceding siblings ...)
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
@ 2021-10-19 10:34   ` Xueming Li
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:34 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

For egress packet on representor, the vport ID in transport domain
is E-Switch manager vport ID since representor shares resources of
E-Switch manager. E-Switch manager vport ID and Tx queue internal device
index are used to match representor egress packet.

This patch adds flow item port ID match on E-Switch manager.

E-Switch manager vport ID is 0xfffe on BlueField, 0 otherwise.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow.h    |  3 +++
 drivers/net/mlx5/mlx5_flow_dv.c | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 5c68d4f7d74..c25af8d9864 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -18,6 +18,9 @@
 
 #include "mlx5.h"
 
+/* E-Switch Manager port, used for rte_flow_item_port_id. */
+#define MLX5_PORT_ESW_MGR UINT32_MAX
+
 /* Private rte flow items. */
 enum mlx5_rte_flow_item_type {
 	MLX5_RTE_FLOW_ITEM_TYPE_END = INT_MIN,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index c6370cd1d68..f06ce54f7e7 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -15,6 +15,7 @@
 #include <rte_flow_driver.h>
 #include <rte_malloc.h>
 #include <rte_cycles.h>
+#include <rte_bus_pci.h>
 #include <rte_ip.h>
 #include <rte_gre.h>
 #include <rte_vxlan.h>
@@ -92,6 +93,23 @@ static int
 flow_dv_jump_tbl_resource_release(struct rte_eth_dev *dev,
 				  uint32_t rix_jump);
 
+static int16_t
+flow_dv_get_esw_manager_vport_id(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (priv->pci_dev == NULL)
+		return 0;
+	switch (priv->pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX7BF:
+		return (int16_t)0xfffe;
+	default:
+		return 0;
+	}
+}
+
 /**
  * Initialize flow attributes structure according to flow items' types.
  *
@@ -2224,6 +2242,8 @@ flow_dv_validate_item_port_id(struct rte_eth_dev *dev,
 		return ret;
 	if (!spec)
 		return 0;
+	if (spec->id == MLX5_PORT_ESW_MGR)
+		return 0;
 	esw_priv = mlx5_port_to_eswitch_info(spec->id, false);
 	if (!esw_priv)
 		return rte_flow_error_set(error, rte_errno,
@@ -9685,6 +9705,11 @@ flow_dv_translate_item_port_id(struct rte_eth_dev *dev, void *matcher,
 	struct mlx5_priv *priv;
 	uint16_t mask, id;
 
+	if (pid_v && pid_v->id == MLX5_PORT_ESW_MGR) {
+		flow_dv_translate_item_source_vport(matcher, key,
+			flow_dv_get_esw_manager_vport_id(dev), 0xffff);
+		return 0;
+	}
 	mask = pid_m ? pid_m->id : 0xffff;
 	id = pid_v ? pid_v->id : dev->data->port_id;
 	priv = mlx5_port_to_eswitch_info(id, item == NULL);
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 5/8] net/mlx5: supports flow item of normal Tx queue
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (3 preceding siblings ...)
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
@ 2021-10-19 10:34   ` Xueming Li
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:34 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

Extends txq flow pattern to support both hairpin and regular txq.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow_dv.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index f06ce54f7e7..4a17ca64a2e 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -10910,22 +10910,22 @@ flow_dv_translate_item_tx_queue(struct rte_eth_dev *dev,
 	void *misc_v =
 		MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
 	struct mlx5_txq_ctrl *txq;
-	uint32_t queue;
-
+	uint32_t queue, mask;
 
 	queue_m = (const void *)item->mask;
-	if (!queue_m)
-		return;
 	queue_v = (const void *)item->spec;
 	if (!queue_v)
 		return;
 	txq = mlx5_txq_get(dev, queue_v->queue);
 	if (!txq)
 		return;
-	queue = txq->obj->sq->id;
-	MLX5_SET(fte_match_set_misc, misc_m, source_sqn, queue_m->queue);
-	MLX5_SET(fte_match_set_misc, misc_v, source_sqn,
-		 queue & queue_m->queue);
+	if (txq->type == MLX5_TXQ_TYPE_HAIRPIN)
+		queue = txq->obj->sq->id;
+	else
+		queue = txq->obj->sq_obj.sq->id;
+	mask = queue_m == NULL ? UINT32_MAX : queue_m->queue;
+	MLX5_SET(fte_match_set_misc, misc_m, source_sqn, mask);
+	MLX5_SET(fte_match_set_misc, misc_v, source_sqn, queue & mask);
 	mlx5_txq_release(dev, queue_v->queue);
 }
 
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 6/8] net/mlx5: fix internal root table flow priroity
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (4 preceding siblings ...)
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
@ 2021-10-19 10:34   ` Xueming Li
  2021-10-19 10:35   ` [dpdk-dev] [PATCH v3 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:34 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad, Dong Zhou

When creating an internal transfer flow on the root table with the
lowest priority, the flow was created with the maximum priority
UINT32_MAX. This is wrong since the flow is created in the kernel, and
the maximum priority supported there is 16.

This patch fixes this by adding an internal flow check.

Fixes: 5f8ae44dd454 ("net/mlx5: enlarge maximal flow priority")

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow.c    | 7 ++++++-
 drivers/net/mlx5/mlx5_flow.h    | 4 ++--
 drivers/net/mlx5/mlx5_flow_dv.c | 3 ++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index c914a7120cc..b5232cd46ae 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -980,13 +980,15 @@ mlx5_get_lowest_priority(struct rte_eth_dev *dev,
  *   Pointer to device flow rule attributes.
  * @param[in] subpriority
  *   The priority based on the items.
+ * @param[in] external
+ *   Flow is user flow.
  * @return
  *   The matcher priority of the flow.
  */
 uint16_t
 mlx5_get_matcher_priority(struct rte_eth_dev *dev,
 			  const struct rte_flow_attr *attr,
-			  uint32_t subpriority)
+			  uint32_t subpriority, bool external)
 {
 	uint16_t priority = (uint16_t)attr->priority;
 	struct mlx5_priv *priv = dev->data->dev_private;
@@ -995,6 +997,9 @@ mlx5_get_matcher_priority(struct rte_eth_dev *dev,
 		if (attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR)
 			priority = priv->config.flow_prio - 1;
 		return mlx5_os_flow_adjust_priority(dev, priority, subpriority);
+	} else if (!external && attr->transfer && attr->group == 0 &&
+		   attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR) {
+		return (priv->config.flow_prio - 1) * 3;
 	}
 	if (attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR)
 		priority = MLX5_NON_ROOT_FLOW_MAX_PRIO;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index c25af8d9864..f1a83d537d0 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -1431,8 +1431,8 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 uint32_t mlx5_get_lowest_priority(struct rte_eth_dev *dev,
 					const struct rte_flow_attr *attr);
 uint16_t mlx5_get_matcher_priority(struct rte_eth_dev *dev,
-				     const struct rte_flow_attr *attr,
-				     uint32_t subpriority);
+				   const struct rte_flow_attr *attr,
+				   uint32_t subpriority, bool external);
 int mlx5_flow_get_reg_id(struct rte_eth_dev *dev,
 				     enum mlx5_feature_name feature,
 				     uint32_t id,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 4a17ca64a2e..ffc1fc8a05c 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -13646,7 +13646,8 @@ flow_dv_translate(struct rte_eth_dev *dev,
 	matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf,
 				    matcher.mask.size);
 	matcher.priority = mlx5_get_matcher_priority(dev, attr,
-					matcher.priority);
+						     matcher.priority,
+						     dev_flow->external);
 	/**
 	 * When creating meter drop flow in drop table, using original
 	 * 5-tuple match, the matcher priority should be lower than
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 7/8] net/mlx5: enable DevX Tx queue creation
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (5 preceding siblings ...)
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
@ 2021-10-19 10:35   ` Xueming Li
  2021-10-19 10:35   ` [dpdk-dev] [PATCH v3 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
  2021-10-20 13:40   ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Raslan Darawsheh
  8 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:35 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

The Verbs API does not support an InfiniBand device port number larger
than 255 by design. To support more representors on a single InfiniBand
device, the DevX API should be engaged.

While creating Send Queue (SQ) object with Verbs API, the PMD assigned
IB device port attribute and kernel created the default miss flows in
FDB domain, to redirect egress traffic from the queue being created to
representor appropriate peer (wire, HPF, VF or SF).

With DevX API there is no IB-device port attribute (it is merely kernel
one, DevX operates in PRM terms) and PMD must create default miss flows
in FDB explicitly. PMD did not provide this and using DevX API for
E-Switch configurations was disabled.

The default miss FDB flow matches E-Switch manager vport (to make sure
the source is some representor) and SQn (Send Queue number - device
internal queue index). The root flow table managed by kernel/firmware
and it does not support vport redirect action, we have to split the
default miss flow into two ones:

- flow with lowest priority in the root table that matches E-Switch
manager vport ID and jump to group 1.
- flow in group 1 that matches E-Switch manager vport ID and SQn and
forwards packet to peer vport

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 62 +-------------------------
 drivers/net/mlx5/mlx5.h          |  2 +
 drivers/net/mlx5/mlx5_devx.c     | 10 ++---
 drivers/net/mlx5/mlx5_devx.h     |  2 +
 drivers/net/mlx5/mlx5_flow.c     | 74 ++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_trigger.c  | 11 ++++-
 6 files changed, 94 insertions(+), 67 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index f283a3779cc..93ee9318ebc 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -697,56 +697,6 @@ mlx5_init_once(void)
 	return ret;
 }
 
-/**
- * Create the Tx queue DevX/Verbs object.
- *
- * @param dev
- *   Pointer to Ethernet device.
- * @param idx
- *   Queue index in DPDK Tx queue array.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
-	struct mlx5_txq_ctrl *txq_ctrl =
-			container_of(txq_data, struct mlx5_txq_ctrl, txq);
-
-	if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
-		return mlx5_txq_devx_obj_new(dev, idx);
-#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
-	if (!priv->config.dv_esw_en)
-		return mlx5_txq_devx_obj_new(dev, idx);
-#endif
-	return mlx5_txq_ibv_obj_new(dev, idx);
-}
-
-/**
- * Release an Tx DevX/verbs queue object.
- *
- * @param txq_obj
- *   DevX/Verbs Tx queue object.
- */
-static void
-mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
-{
-	if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
-		mlx5_txq_devx_obj_release(txq_obj);
-		return;
-	}
-#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
-	if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
-		mlx5_txq_devx_obj_release(txq_obj);
-		return;
-	}
-#endif
-	mlx5_txq_ibv_obj_release(txq_obj);
-}
-
 /**
  * DV flow counter mode detect and config.
  *
@@ -1812,16 +1762,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 						ibv_obj_ops.drop_action_create;
 		priv->obj_ops.drop_action_destroy =
 						ibv_obj_ops.drop_action_destroy;
-#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
-		priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
-#else
-		if (config->dv_esw_en)
-			priv->obj_ops.txq_obj_modify =
-						ibv_obj_ops.txq_obj_modify;
-#endif
-		/* Use specific wrappers for Tx object. */
-		priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
-		priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
 		mlx5_queue_counter_id_prepare(eth_dev);
 		priv->obj_ops.lb_dummy_queue_create =
 					mlx5_rxq_ibv_obj_dummy_lb_create;
@@ -1832,7 +1772,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	if (config->tx_pp &&
 	    (priv->config.dv_esw_en ||
-	     priv->obj_ops.txq_obj_new != mlx5_os_txq_obj_new)) {
+	     priv->obj_ops.txq_obj_new != mlx5_txq_devx_obj_new)) {
 		/*
 		 * HAVE_MLX5DV_DEVX_UAR_OFFSET is required to support
 		 * packet pacing and already checked above.
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3581414b789..570f827375a 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1699,6 +1699,8 @@ int mlx5_ctrl_flow(struct rte_eth_dev *dev,
 		   struct rte_flow_item_eth *eth_mask);
 int mlx5_flow_lacp_miss(struct rte_eth_dev *dev);
 struct rte_flow *mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev);
+uint32_t mlx5_flow_create_devx_sq_miss_flow(struct rte_eth_dev *dev,
+					    uint32_t txq);
 void mlx5_flow_async_pool_query_handle(struct mlx5_dev_ctx_shared *sh,
 				       uint64_t async_id, int status);
 void mlx5_set_query_alarm(struct mlx5_dev_ctx_shared *sh);
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index a1db53577a2..a49602cb957 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -102,9 +102,9 @@ mlx5_devx_modify_rq(struct mlx5_rxq_obj *rxq_obj, uint8_t type)
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-static int
-mlx5_devx_modify_sq(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
-		    uint8_t dev_port)
+int
+mlx5_txq_devx_modify(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
+		     uint8_t dev_port)
 {
 	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
 	int ret;
@@ -1118,7 +1118,7 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
 	*txq_data->qp_db = 0;
 	txq_data->qp_num_8s = txq_obj->sq_obj.sq->id << 8;
 	/* Change Send Queue state to Ready-to-Send. */
-	ret = mlx5_devx_modify_sq(txq_obj, MLX5_TXQ_MOD_RST2RDY, 0);
+	ret = mlx5_txq_devx_modify(txq_obj, MLX5_TXQ_MOD_RST2RDY, 0);
 	if (ret) {
 		rte_errno = errno;
 		DRV_LOG(ERR,
@@ -1187,7 +1187,7 @@ struct mlx5_obj_ops devx_obj_ops = {
 	.drop_action_create = mlx5_devx_drop_action_create,
 	.drop_action_destroy = mlx5_devx_drop_action_destroy,
 	.txq_obj_new = mlx5_txq_devx_obj_new,
-	.txq_obj_modify = mlx5_devx_modify_sq,
+	.txq_obj_modify = mlx5_txq_devx_modify,
 	.txq_obj_release = mlx5_txq_devx_obj_release,
 	.lb_dummy_queue_create = NULL,
 	.lb_dummy_queue_release = NULL,
diff --git a/drivers/net/mlx5/mlx5_devx.h b/drivers/net/mlx5/mlx5_devx.h
index bc8a8d6b73c..a95207a6b9a 100644
--- a/drivers/net/mlx5/mlx5_devx.h
+++ b/drivers/net/mlx5/mlx5_devx.h
@@ -8,6 +8,8 @@
 #include "mlx5.h"
 
 int mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_txq_devx_modify(struct mlx5_txq_obj *obj,
+			 enum mlx5_txq_modify_type type, uint8_t dev_port);
 void mlx5_txq_devx_obj_release(struct mlx5_txq_obj *txq_obj);
 
 extern struct mlx5_obj_ops devx_obj_ops;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index b5232cd46ae..1528f8c6b51 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -6573,6 +6573,80 @@ mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev)
 						   actions, false, &error);
 }
 
+/**
+ * Create a dedicated flow rule on e-switch table 1, matches ESW manager
+ * and sq number, directs all packets to peer vport.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param txq
+ *   Txq index.
+ *
+ * @return
+ *   Flow ID on success, 0 otherwise and rte_errno is set.
+ */
+uint32_t
+mlx5_flow_create_devx_sq_miss_flow(struct rte_eth_dev *dev, uint32_t txq)
+{
+	struct rte_flow_attr attr = {
+		.group = 0,
+		.priority = MLX5_FLOW_LOWEST_PRIO_INDICATOR,
+		.ingress = 1,
+		.egress = 0,
+		.transfer = 1,
+	};
+	struct rte_flow_item_port_id port_spec = {
+		.id = MLX5_PORT_ESW_MGR,
+	};
+	struct mlx5_rte_flow_item_tx_queue txq_spec = {
+		.queue = txq,
+	};
+	struct rte_flow_item pattern[] = {
+		{
+			.type = RTE_FLOW_ITEM_TYPE_PORT_ID,
+			.spec = &port_spec,
+		},
+		{
+			.type = (enum rte_flow_item_type)
+				MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE,
+			.spec = &txq_spec,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+	};
+	struct rte_flow_action_jump jump = {
+		.group = 1,
+	};
+	struct rte_flow_action_port_id port = {
+		.id = dev->data->port_id,
+	};
+	struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_JUMP,
+			.conf = &jump,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	struct rte_flow_error error;
+
+	/*
+	 * Creates group 0, highest priority jump flow.
+	 * Matches txq to bypass kernel packets.
+	 */
+	if (flow_list_create(dev, MLX5_FLOW_TYPE_CTL, &attr, pattern, actions,
+			     false, &error) == 0)
+		return 0;
+	/* Create group 1, lowest priority redirect flow for txq. */
+	attr.group = 1;
+	actions[0].conf = &port;
+	actions[0].type = RTE_FLOW_ACTION_TYPE_PORT_ID;
+	return flow_list_create(dev, MLX5_FLOW_TYPE_CTL, &attr, pattern,
+				actions, false, &error);
+}
+
 /**
  * Validate a flow supported by the NIC.
  *
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 54173bfacb2..42d8bb31128 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -1255,9 +1255,18 @@ mlx5_traffic_enable(struct rte_eth_dev *dev)
 				goto error;
 			}
 		}
+		if ((priv->representor || priv->master) &&
+		    priv->config.dv_esw_en) {
+			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
+				DRV_LOG(ERR,
+					"Port %u Tx queue %u SQ create representor devx default miss rule failed.",
+					dev->data->port_id, i);
+				goto error;
+			}
+		}
 		mlx5_txq_release(dev, i);
 	}
-	if (priv->config.dv_esw_en && !priv->config.vf && !priv->config.sf) {
+	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
 		if (mlx5_flow_create_esw_table_zero_flow(dev))
 			priv->fdb_def_rule = 1;
 		else
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v3 8/8] net/mlx5: check DevX to support more Verbs ports
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (6 preceding siblings ...)
  2021-10-19 10:35   ` [dpdk-dev] [PATCH v3 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
@ 2021-10-19 10:35   ` Xueming Li
  2021-10-20 13:40   ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Raslan Darawsheh
  8 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-19 10:35 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

Verbs API doesn't support device port numbers larger than 255 by design.

To support more VF or SubFunction port representors, force a DevX API
check when the maximum number of Verbs device link ports is larger than 255.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 93ee9318ebc..39a9722d869 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -1299,12 +1299,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		config->dv_flow_en = 0;
 	}
 #endif
-	if (spawn->max_port > UINT8_MAX) {
-		/* Verbs can't support ports larger than 255 by design. */
-		DRV_LOG(ERR, "can't support IB ports > UINT8_MAX");
-		err = EINVAL;
-		goto error;
-	}
 	config->ind_table_max_size =
 		sh->device_attr.max_rwq_indirection_table_size;
 	/*
@@ -1767,6 +1761,11 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 					mlx5_rxq_ibv_obj_dummy_lb_create;
 		priv->obj_ops.lb_dummy_queue_release =
 					mlx5_rxq_ibv_obj_dummy_lb_release;
+	} else if (spawn->max_port > UINT8_MAX) {
+		/* Verbs can't support ports larger than 255 by design. */
+		DRV_LOG(ERR, "must enable DV and ESW when RDMA link ports > 255");
+		err = ENOTSUP;
+		goto error;
 	} else {
 		priv->obj_ops = ibv_obj_ops;
 	}
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
                     ` (7 preceding siblings ...)
  2021-10-19 10:35   ` [dpdk-dev] [PATCH v3 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
@ 2021-10-20 13:40   ` Raslan Darawsheh
  2021-10-20 16:00     ` Xueming(Steven) Li
  8 siblings, 1 reply; 47+ messages in thread
From: Raslan Darawsheh @ 2021-10-20 13:40 UTC (permalink / raw)
  To: Xueming(Steven) Li, dev
  Cc: Xueming(Steven) Li, Slava Ovsiienko, Lior Margalit

Hi,

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Xueming Li
> Sent: Tuesday, October 19, 2021 1:35 PM
> To: dev@dpdk.org
> Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>
> Subject: [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255
> representors
> 
> This patch set supports representor number of a PF to be more than 255.
> CX6 and current OFED driver supports maximum 512 SFs. CX5 supports max 255
> SFs.
> 
> v2:
>  - fixed FDB root table flow priority
>  - add error check to Netlink port state API
>  - commit log update and other minor fixes
> v3:
>  - fix compilation issue
> 
> Xueming Li (8):
>   common/mlx5: add netlink API to get RDMA port state
>   net/mlx5: use netlink when IB port greater than 255
>   net/mlx5: improve Verbs flow priority discover for scalable
>   net/mlx5: support E-Switch manager egress traffic match
>   net/mlx5: supports flow item of normal Tx queue
> >   net/mlx5: fix internal root table flow priority
>   net/mlx5: enable DevX Tx queue creation
>   net/mlx5: check DevX to support more Verbs ports
> 
>  drivers/common/mlx5/linux/meson.build |   2 +
>  drivers/common/mlx5/linux/mlx5_nl.c   | 136 +++++++++++++++++++-------
>  drivers/common/mlx5/linux/mlx5_nl.h   |   2 +
>  drivers/common/mlx5/version.map       |   1 +
>  drivers/net/mlx5/linux/mlx5_os.c      | 119 +++++++---------------
>  drivers/net/mlx5/mlx5.h               |   2 +
>  drivers/net/mlx5/mlx5_devx.c          |  10 +-
>  drivers/net/mlx5/mlx5_devx.h          |   2 +
>  drivers/net/mlx5/mlx5_flow.c          |  81 ++++++++++++++-
>  drivers/net/mlx5/mlx5_flow.h          |   7 +-
>  drivers/net/mlx5/mlx5_flow_dv.c       |  44 +++++++--
>  drivers/net/mlx5/mlx5_flow_verbs.c    |   8 ++
>  drivers/net/mlx5/mlx5_trigger.c       |  11 ++-
>  13 files changed, 291 insertions(+), 134 deletions(-)
> 
> --
> 2.33.0

Series applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors
  2021-10-20 13:40   ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Raslan Darawsheh
@ 2021-10-20 16:00     ` Xueming(Steven) Li
  0 siblings, 0 replies; 47+ messages in thread
From: Xueming(Steven) Li @ 2021-10-20 16:00 UTC (permalink / raw)
  To: Raslan Darawsheh, dev; +Cc: Lior Margalit, Slava Ovsiienko

On Wed, 2021-10-20 at 13:40 +0000, Raslan Darawsheh wrote:
> Hi,
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Xueming Li
> > Sent: Tuesday, October 19, 2021 1:35 PM
> > To: dev@dpdk.org
> > Cc: Xueming(Steven) Li <xuemingl@nvidia.com>; Slava Ovsiienko
> > <viacheslavo@nvidia.com>; Lior Margalit <lmargalit@nvidia.com>
> > Subject: [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255
> > representors
> > 
> > This patch set supports representor number of a PF to be more than 255.
> > > CX6 and current OFED driver supports maximum 512 SFs. CX5 supports max 255
> > SFs.
> > 
> > v2:
> >  - fixed FDB root table flow priority
> >  - add error check to Netlink port state API
> >  - commit log update and other minor fixes
> > v3:
> >  - fix compilation issue
> > 
> > Xueming Li (8):
> >   common/mlx5: add netlink API to get RDMA port state
> >   net/mlx5: use netlink when IB port greater than 255
> >   net/mlx5: improve Verbs flow priority discover for scalable
> >   net/mlx5: support E-Switch manager egress traffic match
> >   net/mlx5: supports flow item of normal Tx queue
> > >   net/mlx5: fix internal root table flow priority
> >   net/mlx5: enable DevX Tx queue creation
> >   net/mlx5: check DevX to support more Verbs ports
> > 
> >  drivers/common/mlx5/linux/meson.build |   2 +
> >  drivers/common/mlx5/linux/mlx5_nl.c   | 136 +++++++++++++++++++-------
> >  drivers/common/mlx5/linux/mlx5_nl.h   |   2 +
> >  drivers/common/mlx5/version.map       |   1 +
> >  drivers/net/mlx5/linux/mlx5_os.c      | 119 +++++++---------------
> >  drivers/net/mlx5/mlx5.h               |   2 +
> >  drivers/net/mlx5/mlx5_devx.c          |  10 +-
> >  drivers/net/mlx5/mlx5_devx.h          |   2 +
> >  drivers/net/mlx5/mlx5_flow.c          |  81 ++++++++++++++-
> >  drivers/net/mlx5/mlx5_flow.h          |   7 +-
> >  drivers/net/mlx5/mlx5_flow_dv.c       |  44 +++++++--
> >  drivers/net/mlx5/mlx5_flow_verbs.c    |   8 ++
> >  drivers/net/mlx5/mlx5_trigger.c       |  11 ++-
> >  13 files changed, 291 insertions(+), 134 deletions(-)
> > 
> > --
> > 2.33.0
> 
> Series applied to next-net-mlx,
> 
> Kindest regards,
> Raslan Darawsheh

Thanks!

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/8] common/mlx5: add netlink API to get RDMA port state
  2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
@ 2021-10-21 13:34     ` Ferruh Yigit
  0 siblings, 0 replies; 47+ messages in thread
From: Ferruh Yigit @ 2021-10-21 13:34 UTC (permalink / raw)
  To: Xueming Li, dev
  Cc: Viacheslav Ovsiienko, Lior Margalit, Matan Azrad, Ray Kinsella

On 10/19/2021 11:34 AM, Xueming Li wrote:
> Introduce netlink API to get rdma port state.
> 
> Port state is retrieved based on RDMA device name and port index.
> 
> Signed-off-by: Xueming Li<xuemingl@nvidia.com>
> Acked-by: Viacheslav Ovsiienko<viacheslavo@nvidia.com>

Is there a kernel driver version dependency for used netlink message?

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 0/8] net/mlx5: support more than 255 representors
  2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
                   ` (9 preceding siblings ...)
  2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
@ 2021-10-22  9:11 ` Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
                     ` (7 more replies)
  10 siblings, 8 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit

This patch set supports representor number of a PF to be more than 255.
CX6 and current OFED driver supports maximum 512 SFs. CX5 supports max 255 SFs.

v2:
 - fixed FDB root table flow priority
 - add error check to Netlink port state API
 - commit log update and other minor fixes
v3:
 - fix compilation issue
v4:
 - fix rebase issue
 - rebase on latest upstream

Xueming Li (8):
  common/mlx5: add netlink API to get RDMA port state
  net/mlx5: use netlink when IB port greater than 255
  net/mlx5: improve Verbs flow priority discover for scalable
  net/mlx5: support E-Switch manager egress traffic match
  net/mlx5: supports flow item of normal Tx queue
  net/mlx5: fix internal root table flow priority
  net/mlx5: enable DevX Tx queue creation
  net/mlx5: check DevX to support more Verbs ports

 drivers/common/mlx5/linux/meson.build |   2 +
 drivers/common/mlx5/linux/mlx5_nl.c   | 136 +++++++++++++++++++-------
 drivers/common/mlx5/linux/mlx5_nl.h   |   2 +
 drivers/common/mlx5/version.map       |   1 +
 drivers/net/mlx5/linux/mlx5_os.c      | 119 +++++++---------------
 drivers/net/mlx5/mlx5.h               |   2 +
 drivers/net/mlx5/mlx5_devx.c          |  10 +-
 drivers/net/mlx5/mlx5_devx.h          |   2 +
 drivers/net/mlx5/mlx5_flow.c          |  81 ++++++++++++++-
 drivers/net/mlx5/mlx5_flow.h          |   7 +-
 drivers/net/mlx5/mlx5_flow_dv.c       |  44 +++++++--
 drivers/net/mlx5/mlx5_flow_verbs.c    |   8 ++
 drivers/net/mlx5/mlx5_trigger.c       |  11 ++-
 13 files changed, 290 insertions(+), 135 deletions(-)

-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 1/8] common/mlx5: add netlink API to get RDMA port state
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
@ 2021-10-22  9:11   ` Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev
  Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad, Ray Kinsella

Introduce netlink API to get rdma port state.

Port state is retrieved based on RDMA device name and port index.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/common/mlx5/linux/meson.build |   2 +
 drivers/common/mlx5/linux/mlx5_nl.c   | 136 +++++++++++++++++++-------
 drivers/common/mlx5/linux/mlx5_nl.h   |   2 +
 drivers/common/mlx5/version.map       |   1 +
 4 files changed, 106 insertions(+), 35 deletions(-)

diff --git a/drivers/common/mlx5/linux/meson.build b/drivers/common/mlx5/linux/meson.build
index cbea58f557d..2dcd27b7786 100644
--- a/drivers/common/mlx5/linux/meson.build
+++ b/drivers/common/mlx5/linux/meson.build
@@ -175,6 +175,8 @@ has_sym_args = [
             'RDMA_NLDEV_ATTR_DEV_NAME' ],
         [ 'HAVE_RDMA_NLDEV_ATTR_PORT_INDEX', 'rdma/rdma_netlink.h',
             'RDMA_NLDEV_ATTR_PORT_INDEX' ],
+        [ 'HAVE_RDMA_NLDEV_ATTR_PORT_STATE', 'rdma/rdma_netlink.h',
+            'RDMA_NLDEV_ATTR_PORT_STATE' ],
         [ 'HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX', 'rdma/rdma_netlink.h',
             'RDMA_NLDEV_ATTR_NDEV_INDEX' ],
         [ 'HAVE_MLX5_DR_FLOW_DUMP', 'infiniband/mlx5dv.h',
diff --git a/drivers/common/mlx5/linux/mlx5_nl.c b/drivers/common/mlx5/linux/mlx5_nl.c
index 530d491b660..fd4c2d26253 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.c
+++ b/drivers/common/mlx5/linux/mlx5_nl.c
@@ -78,6 +78,9 @@
 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
 #endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
+#define RDMA_NLDEV_ATTR_PORT_STATE 12
+#endif
 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
 #endif
@@ -160,14 +163,16 @@ struct mlx5_nl_mac_addr {
 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
+#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)
 
 /** Data structure used by mlx5_nl_cmdget_cb(). */
-struct mlx5_nl_ifindex_data {
+struct mlx5_nl_port_info {
 	const char *name; /**< IB device name (in). */
 	uint32_t flags; /**< found attribute flags (out). */
 	uint32_t ibindex; /**< IB device index (out). */
 	uint32_t ifindex; /**< Network interface index (out). */
 	uint32_t portnum; /**< IB device max port number (out). */
+	uint16_t state; /**< IB device port state (out). */
 };
 
 uint32_t atomic_sn;
@@ -966,8 +971,8 @@ mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
 static int
 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 {
-	struct mlx5_nl_ifindex_data *data = arg;
-	struct mlx5_nl_ifindex_data local = {
+	struct mlx5_nl_port_info *data = arg;
+	struct mlx5_nl_port_info local = {
 		.flags = 0,
 	};
 	size_t off = NLMSG_HDRLEN;
@@ -1000,6 +1005,10 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 			local.portnum = *(uint32_t *)payload;
 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
 			break;
+		case RDMA_NLDEV_ATTR_PORT_STATE:
+			local.state = *(uint8_t *)payload;
+			local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
+			break;
 		default:
 			break;
 		}
@@ -1016,6 +1025,7 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 		data->ibindex = local.ibindex;
 		data->ifindex = local.ifindex;
 		data->portnum = local.portnum;
+		data->state = local.state;
 	}
 	return 0;
 error:
@@ -1024,7 +1034,7 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 }
 
 /**
- * Get index of network interface associated with some IB device.
+ * Get port info of network interface associated with some IB device.
  *
  * This is the only somewhat safe method to avoid resorting to heuristics
  * when faced with port representors. Unfortunately it requires at least
@@ -1032,27 +1042,20 @@ mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
  *
  * @param nl
  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
- * @param[in] name
- *   IB device name.
  * @param[in] pindex
  *   IB device port index, starting from 1
+ * @param[out] data
+ *   Pointer to port info.
  * @return
- *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
- *   is set.
+ *   0 on success, negative on error and rte_errno is set.
  */
-unsigned int
-mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+static int
+mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
 {
-	struct mlx5_nl_ifindex_data data = {
-		.name = name,
-		.flags = 0,
-		.ibindex = 0, /* Determined during first pass. */
-		.ifindex = 0, /* Determined during second pass. */
-	};
 	union {
 		struct nlmsghdr nh;
 		uint8_t buf[NLMSG_HDRLEN +
-			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
+			    NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
 	} req = {
 		.nh = {
@@ -1068,24 +1071,24 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 
 	ret = mlx5_nl_send(nl, &req.nh, sn);
 	if (ret < 0)
-		return 0;
-	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+		return ret;
+	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
 	if (ret < 0)
-		return 0;
-	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
+		return ret;
+	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
+	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
 		goto error;
-	data.flags = 0;
+	data->flags = 0;
 	sn = MLX5_NL_SN_GENERATE;
 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
 					     RDMA_NLDEV_CMD_PORT_GET);
 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
-	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
+	na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
-	       &data.ibindex, sizeof(data.ibindex));
+	       &data->ibindex, sizeof(data->ibindex));
 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
@@ -1093,19 +1096,82 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 	       &pindex, sizeof(pindex));
 	ret = mlx5_nl_send(nl, &req.nh, sn);
 	if (ret < 0)
-		return 0;
-	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+		return ret;
+	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
 	if (ret < 0)
-		return 0;
-	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
-	    !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
-	    !data.ifindex)
+		return ret;
+	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
+	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
+	    !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
+	    !data->ifindex)
 		goto error;
-	return data.ifindex;
+	return 1;
 error:
 	rte_errno = ENODEV;
-	return 0;
+	return -rte_errno;
+}
+
+/**
+ * Get index of network interface associated with some IB device.
+ *
+ * This is the only somewhat safe method to avoid resorting to heuristics
+ * when faced with port representors. Unfortunately it requires at least
+ * Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
+ *   is set.
+ */
+unsigned int
+mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+{
+	struct mlx5_nl_port_info data = {
+			.ifindex = 0,
+			.name = name,
+	};
+
+	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
+		return 0;
+	return data.ifindex;
+}
+
+/**
+ * Get IB device port state.
+ *
+ * This is the only somewhat safe method to get info for port number >= 255.
+ * Unfortunately it requires at least Linux 4.17.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   Port state (ibv_port_state) on success, negative on error
+ *   and rte_errno is set.
+ */
+int
+mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
+{
+	struct mlx5_nl_port_info data = {
+			.state = 0,
+			.name = name,
+	};
+
+	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
+		return -rte_errno;
+	if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
+		rte_errno = ENOTSUP;
+		return -rte_errno;
+	}
+	return (int)data.state;
 }
 
 /**
@@ -1123,7 +1189,7 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 unsigned int
 mlx5_nl_portnum(int nl, const char *name)
 {
-	struct mlx5_nl_ifindex_data data = {
+	struct mlx5_nl_port_info data = {
 		.flags = 0,
 		.name = name,
 		.ifindex = 0,
diff --git a/drivers/common/mlx5/linux/mlx5_nl.h b/drivers/common/mlx5/linux/mlx5_nl.h
index 202849f52ad..2063c0deeb9 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.h
+++ b/drivers/common/mlx5/linux/mlx5_nl.h
@@ -54,6 +54,8 @@ unsigned int mlx5_nl_portnum(int nl, const char *name);
 __rte_internal
 unsigned int mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex);
 __rte_internal
+int mlx5_nl_port_state(int nl, const char *name, uint32_t pindex);
+__rte_internal
 int mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
 			       struct rte_ether_addr *mac, int vf_index);
 __rte_internal
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index 1167fcd3236..7c95172fe87 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -120,6 +120,7 @@ INTERNAL {
 	mlx5_nl_mac_addr_flush; # WINDOWS_NO_EXPORT
 	mlx5_nl_mac_addr_remove; # WINDOWS_NO_EXPORT
 	mlx5_nl_mac_addr_sync; # WINDOWS_NO_EXPORT
+	mlx5_nl_port_state; # WINDOWS_NO_EXPORT
 	mlx5_nl_portnum; # WINDOWS_NO_EXPORT
 	mlx5_nl_promisc; # WINDOWS_NO_EXPORT
 	mlx5_nl_switch_info; # WINDOWS_NO_EXPORT
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 2/8] net/mlx5: use netlink when IB port greater than 255
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
@ 2021-10-22  9:11   ` Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
                     ` (5 subsequent siblings)
  7 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

The IB spec doesn't allow more than 255 ports on a single HCA; a port number
of 256 was cast to the u8 value 0, which is invalid for ibv_query_port().

This patch invokes Netlink api to query port state when port number
greater than 255.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 46 +++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 54e0ba9f3a9..101ef943f42 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -905,7 +905,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 {
 	const struct mlx5_switch_info *switch_info = &spawn->info;
 	struct mlx5_dev_ctx_shared *sh = NULL;
-	struct ibv_port_attr port_attr;
+	struct ibv_port_attr port_attr = { .state = IBV_PORT_NOP };
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	struct rte_eth_dev *eth_dev = NULL;
 	struct mlx5_priv *priv = NULL;
@@ -924,6 +924,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	int own_domain_id = 0;
 	uint16_t port_id;
 	struct mlx5_port_info vport_info = { .query_flags = 0 };
+	int nl_rdma = -1;
 	int i;
 
 	/* Determine if this port representor is supposed to be spawned. */
@@ -1121,20 +1122,35 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		" old OFED/rdma-core version or firmware configuration");
 #endif
 	config->mpls_en = mpls_en;
+	nl_rdma = mlx5_nl_init(NETLINK_RDMA);
 	/* Check port status. */
-	err = mlx5_glue->query_port(sh->cdev->ctx, spawn->phys_port,
-				    &port_attr);
-	if (err) {
-		DRV_LOG(ERR, "port query failed: %s", strerror(err));
-		goto error;
-	}
-	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
-		DRV_LOG(ERR, "port is not configured in Ethernet mode");
-		err = EINVAL;
-		goto error;
+	if (spawn->phys_port <= UINT8_MAX) {
+		/* Legacy Verbs api only support u8 port number. */
+		err = mlx5_glue->query_port(sh->cdev->ctx, spawn->phys_port,
+					    &port_attr);
+		if (err) {
+			DRV_LOG(ERR, "port query failed: %s", strerror(err));
+			goto error;
+		}
+		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+			DRV_LOG(ERR, "port is not configured in Ethernet mode");
+			err = EINVAL;
+			goto error;
+		}
+	} else if (nl_rdma >= 0) {
+		/* IB doesn't allow more than 255 ports, must be Ethernet. */
+		err = mlx5_nl_port_state(nl_rdma, spawn->phys_dev_name,
+					 spawn->phys_port);
+		if (err < 0) {
+			DRV_LOG(INFO, "Failed to get netlink port state: %s",
+				strerror(rte_errno));
+			err = -rte_errno;
+			goto error;
+		}
+		port_attr.state = (enum ibv_port_state)err;
 	}
 	if (port_attr.state != IBV_PORT_ACTIVE)
-		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+		DRV_LOG(INFO, "port is not active: \"%s\" (%d)",
 			mlx5_glue->port_state_str(port_attr.state),
 			port_attr.state);
 	/* Allocate private eth device data. */
@@ -1151,7 +1167,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->pci_dev = spawn->pci_dev;
 	priv->mtu = RTE_ETHER_MTU;
 	/* Some internal functions rely on Netlink sockets, open them now. */
-	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
+	priv->nl_socket_rdma = nl_rdma;
 	priv->nl_socket_route =	mlx5_nl_init(NETLINK_ROUTE);
 	priv->representor = !!switch_info->representor;
 	priv->master = !!switch_info->master;
@@ -1844,8 +1860,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			mlx5_os_free_shared_dr(priv);
 		if (priv->nl_socket_route >= 0)
 			close(priv->nl_socket_route);
-		if (priv->nl_socket_rdma >= 0)
-			close(priv->nl_socket_rdma);
 		if (priv->vmwa_context)
 			mlx5_vlan_vmwa_exit(priv->vmwa_context);
 		if (eth_dev && priv->drop_queue.hrxq)
@@ -1869,6 +1883,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	if (sh)
 		mlx5_free_shared_dev_ctx(sh);
+	if (nl_rdma >= 0)
+		close(nl_rdma);
 	MLX5_ASSERT(err > 0);
 	rte_errno = err;
 	return NULL;
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 3/8] net/mlx5: improve Verbs flow priority discover for scalable
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
@ 2021-10-22  9:11   ` Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

To detect the number of Verbs flow priorities, the PMD tries to create
Verbs flows at different priorities. However, Verbs is not designed to
support ports larger than 255.

When DevX is supported by the kernel driver, 16 Verbs priorities must be
supported, so there is no need to create the probing Verbs flows.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow_verbs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 60029f71178..3f5aaa885fb 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -83,6 +83,11 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
 	int i;
 	int priority = 0;
 
+#if defined(HAVE_MLX5DV_DR_DEVX_PORT) || defined(HAVE_MLX5DV_DR_DEVX_PORT_V35)
+	/* If DevX supported, driver must support 16 verbs flow priorities. */
+	priority = RTE_DIM(priority_map_5);
+	goto out;
+#endif
 	if (!drop->qp) {
 		rte_errno = ENOTSUP;
 		return -rte_errno;
@@ -109,6 +114,9 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
 			dev->data->port_id, priority);
 		return -rte_errno;
 	}
+#if defined(HAVE_MLX5DV_DR_DEVX_PORT) || defined(HAVE_MLX5DV_DR_DEVX_PORT_V35)
+out:
+#endif
 	DRV_LOG(INFO, "port %u supported flow priorities:"
 		" 0-%d for ingress or egress root table,"
 		" 0-%d for non-root table or transfer root table.",
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 4/8] net/mlx5: support E-Switch manager egress traffic match
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
                     ` (2 preceding siblings ...)
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
@ 2021-10-22  9:11   ` Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

For egress packets on a representor, the vport ID in the transport
domain is the E-Switch manager vport ID, since the representor shares
the resources of the E-Switch manager. The E-Switch manager vport ID and
the Tx queue internal device index are used to match representor egress
packets.

This patch adds flow item port ID match on E-Switch manager.

E-Switch manager vport ID is 0xfffe on BlueField, 0 otherwise.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow.h    |  3 +++
 drivers/net/mlx5/mlx5_flow_dv.c | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 5c68d4f7d74..c25af8d9864 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -18,6 +18,9 @@
 
 #include "mlx5.h"
 
+/* E-Switch Manager port, used for rte_flow_item_port_id. */
+#define MLX5_PORT_ESW_MGR UINT32_MAX
+
 /* Private rte flow items. */
 enum mlx5_rte_flow_item_type {
 	MLX5_RTE_FLOW_ITEM_TYPE_END = INT_MIN,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index f2fde912947..5b964153860 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -15,6 +15,7 @@
 #include <rte_flow_driver.h>
 #include <rte_malloc.h>
 #include <rte_cycles.h>
+#include <rte_bus_pci.h>
 #include <rte_ip.h>
 #include <rte_gre.h>
 #include <rte_vxlan.h>
@@ -92,6 +93,23 @@ static int
 flow_dv_jump_tbl_resource_release(struct rte_eth_dev *dev,
 				  uint32_t rix_jump);
 
+static int16_t
+flow_dv_get_esw_manager_vport_id(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (priv->pci_dev == NULL)
+		return 0;
+	switch (priv->pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX7BF:
+		return (int16_t)0xfffe;
+	default:
+		return 0;
+	}
+}
+
 /**
  * Initialize flow attributes structure according to flow items' types.
  *
@@ -2184,6 +2202,8 @@ flow_dv_validate_item_port_id(struct rte_eth_dev *dev,
 		return ret;
 	if (!spec)
 		return 0;
+	if (spec->id == MLX5_PORT_ESW_MGR)
+		return 0;
 	esw_priv = mlx5_port_to_eswitch_info(spec->id, false);
 	if (!esw_priv)
 		return rte_flow_error_set(error, rte_errno,
@@ -9575,6 +9595,11 @@ flow_dv_translate_item_port_id(struct rte_eth_dev *dev, void *matcher,
 	struct mlx5_priv *priv;
 	uint16_t mask, id;
 
+	if (pid_v && pid_v->id == MLX5_PORT_ESW_MGR) {
+		flow_dv_translate_item_source_vport(matcher, key,
+			flow_dv_get_esw_manager_vport_id(dev), 0xffff);
+		return 0;
+	}
 	mask = pid_m ? pid_m->id : 0xffff;
 	id = pid_v ? pid_v->id : dev->data->port_id;
 	priv = mlx5_port_to_eswitch_info(id, item == NULL);
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 5/8] net/mlx5: supports flow item of normal Tx queue
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
                     ` (3 preceding siblings ...)
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
@ 2021-10-22  9:11   ` Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

Extends txq flow pattern to support both hairpin and regular txq.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow_dv.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 5b964153860..e505cdbb0f7 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -10818,22 +10818,22 @@ flow_dv_translate_item_tx_queue(struct rte_eth_dev *dev,
 	void *misc_v =
 		MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
 	struct mlx5_txq_ctrl *txq;
-	uint32_t queue;
-
+	uint32_t queue, mask;
 
 	queue_m = (const void *)item->mask;
-	if (!queue_m)
-		return;
 	queue_v = (const void *)item->spec;
 	if (!queue_v)
 		return;
 	txq = mlx5_txq_get(dev, queue_v->queue);
 	if (!txq)
 		return;
-	queue = txq->obj->sq->id;
-	MLX5_SET(fte_match_set_misc, misc_m, source_sqn, queue_m->queue);
-	MLX5_SET(fte_match_set_misc, misc_v, source_sqn,
-		 queue & queue_m->queue);
+	if (txq->type == MLX5_TXQ_TYPE_HAIRPIN)
+		queue = txq->obj->sq->id;
+	else
+		queue = txq->obj->sq_obj.sq->id;
+	mask = queue_m == NULL ? UINT32_MAX : queue_m->queue;
+	MLX5_SET(fte_match_set_misc, misc_m, source_sqn, mask);
+	MLX5_SET(fte_match_set_misc, misc_v, source_sqn, queue & mask);
 	mlx5_txq_release(dev, queue_v->queue);
 }
 
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 6/8] net/mlx5: fix internal root table flow priroity
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
                     ` (4 preceding siblings ...)
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
@ 2021-10-22  9:11   ` Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
  7 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev
  Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, stable,
	Matan Azrad, Dong Zhou

When creating an internal transfer flow on the root table with the
lowest priority, the flow was created with the maximum UINT32_MAX
priority. That is wrong since the flow is created in the kernel, and the
maximum priority supported there is 16.

This patch fixes this by adding a check for internal flows.

Fixes: 5f8ae44dd454 ("net/mlx5: enlarge maximal flow priority")
Cc: stable@dpdk.org

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_flow.c    | 7 ++++++-
 drivers/net/mlx5/mlx5_flow.h    | 4 ++--
 drivers/net/mlx5/mlx5_flow_dv.c | 3 ++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index ffcc031bff3..4abeae8ce2d 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -1003,13 +1003,15 @@ mlx5_get_lowest_priority(struct rte_eth_dev *dev,
  *   Pointer to device flow rule attributes.
  * @param[in] subpriority
  *   The priority based on the items.
+ * @param[in] external
+ *   Flow is user flow.
  * @return
  *   The matcher priority of the flow.
  */
 uint16_t
 mlx5_get_matcher_priority(struct rte_eth_dev *dev,
 			  const struct rte_flow_attr *attr,
-			  uint32_t subpriority)
+			  uint32_t subpriority, bool external)
 {
 	uint16_t priority = (uint16_t)attr->priority;
 	struct mlx5_priv *priv = dev->data->dev_private;
@@ -1018,6 +1020,9 @@ mlx5_get_matcher_priority(struct rte_eth_dev *dev,
 		if (attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR)
 			priority = priv->config.flow_prio - 1;
 		return mlx5_os_flow_adjust_priority(dev, priority, subpriority);
+	} else if (!external && attr->transfer && attr->group == 0 &&
+		   attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR) {
+		return (priv->config.flow_prio - 1) * 3;
 	}
 	if (attr->priority == MLX5_FLOW_LOWEST_PRIO_INDICATOR)
 		priority = MLX5_NON_ROOT_FLOW_MAX_PRIO;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index c25af8d9864..f1a83d537d0 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -1431,8 +1431,8 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 uint32_t mlx5_get_lowest_priority(struct rte_eth_dev *dev,
 					const struct rte_flow_attr *attr);
 uint16_t mlx5_get_matcher_priority(struct rte_eth_dev *dev,
-				     const struct rte_flow_attr *attr,
-				     uint32_t subpriority);
+				   const struct rte_flow_attr *attr,
+				   uint32_t subpriority, bool external);
 int mlx5_flow_get_reg_id(struct rte_eth_dev *dev,
 				     enum mlx5_feature_name feature,
 				     uint32_t id,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index e505cdbb0f7..6413b45d8d3 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -13556,7 +13556,8 @@ flow_dv_translate(struct rte_eth_dev *dev,
 	matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf,
 				    matcher.mask.size);
 	matcher.priority = mlx5_get_matcher_priority(dev, attr,
-					matcher.priority);
+						     matcher.priority,
+						     dev_flow->external);
 	/**
 	 * When creating meter drop flow in drop table, using original
 	 * 5-tuple match, the matcher priority should be lower than
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 7/8] net/mlx5: enable DevX Tx queue creation
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
                     ` (5 preceding siblings ...)
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
@ 2021-10-22  9:11   ` Xueming Li
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
  7 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

The Verbs API does not support an InfiniBand device port number larger
than 255 by design. To support more representors on a single InfiniBand
device, the DevX API should be engaged.

While creating Send Queue (SQ) object with Verbs API, the PMD assigned
IB device port attribute and kernel created the default miss flows in
FDB domain, to redirect egress traffic from the queue being created to
representor appropriate peer (wire, HPF, VF or SF).

With DevX API there is no IB-device port attribute (it is merely kernel
one, DevX operates in PRM terms) and PMD must create default miss flows
in FDB explicitly. PMD did not provide this and using DevX API for
E-Switch configurations was disabled.

The default miss FDB flow matches E-Switch manager vport (to make sure
the source is some representor) and SQn (Send Queue number - device
internal queue index). The root flow table is managed by kernel/firmware
and does not support the vport redirect action, so we have to split the
default miss flow into two:

- flow with lowest priority in the root table that matches E-Switch
manager vport ID and jump to group 1.
- flow in group 1 that matches E-Switch manager vport ID and SQn and
forwards packet to peer vport

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 62 +-------------------------
 drivers/net/mlx5/mlx5.h          |  2 +
 drivers/net/mlx5/mlx5_devx.c     | 10 ++---
 drivers/net/mlx5/mlx5_devx.h     |  2 +
 drivers/net/mlx5/mlx5_flow.c     | 74 ++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_trigger.c  | 11 ++++-
 6 files changed, 94 insertions(+), 67 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 101ef943f42..2db842cb983 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -646,56 +646,6 @@ mlx5_init_once(void)
 	return ret;
 }
 
-/**
- * Create the Tx queue DevX/Verbs object.
- *
- * @param dev
- *   Pointer to Ethernet device.
- * @param idx
- *   Queue index in DPDK Tx queue array.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
-	struct mlx5_txq_ctrl *txq_ctrl =
-			container_of(txq_data, struct mlx5_txq_ctrl, txq);
-
-	if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
-		return mlx5_txq_devx_obj_new(dev, idx);
-#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
-	if (!priv->config.dv_esw_en)
-		return mlx5_txq_devx_obj_new(dev, idx);
-#endif
-	return mlx5_txq_ibv_obj_new(dev, idx);
-}
-
-/**
- * Release an Tx DevX/verbs queue object.
- *
- * @param txq_obj
- *   DevX/Verbs Tx queue object.
- */
-static void
-mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
-{
-	if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
-		mlx5_txq_devx_obj_release(txq_obj);
-		return;
-	}
-#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
-	if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
-		mlx5_txq_devx_obj_release(txq_obj);
-		return;
-	}
-#endif
-	mlx5_txq_ibv_obj_release(txq_obj);
-}
-
 /**
  * DV flow counter mode detect and config.
  *
@@ -1744,16 +1694,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 						ibv_obj_ops.drop_action_create;
 		priv->obj_ops.drop_action_destroy =
 						ibv_obj_ops.drop_action_destroy;
-#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
-		priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
-#else
-		if (config->dv_esw_en)
-			priv->obj_ops.txq_obj_modify =
-						ibv_obj_ops.txq_obj_modify;
-#endif
-		/* Use specific wrappers for Tx object. */
-		priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
-		priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
 		mlx5_queue_counter_id_prepare(eth_dev);
 		priv->obj_ops.lb_dummy_queue_create =
 					mlx5_rxq_ibv_obj_dummy_lb_create;
@@ -1764,7 +1704,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	if (config->tx_pp &&
 	    (priv->config.dv_esw_en ||
-	     priv->obj_ops.txq_obj_new != mlx5_os_txq_obj_new)) {
+	     priv->obj_ops.txq_obj_new != mlx5_txq_devx_obj_new)) {
 		/*
 		 * HAVE_MLX5DV_DEVX_UAR_OFFSET is required to support
 		 * packet pacing and already checked above.
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 6f5a78b2493..adef86d3ae0 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1664,6 +1664,8 @@ int mlx5_ctrl_flow(struct rte_eth_dev *dev,
 		   struct rte_flow_item_eth *eth_mask);
 int mlx5_flow_lacp_miss(struct rte_eth_dev *dev);
 struct rte_flow *mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev);
+uint32_t mlx5_flow_create_devx_sq_miss_flow(struct rte_eth_dev *dev,
+					    uint32_t txq);
 void mlx5_flow_async_pool_query_handle(struct mlx5_dev_ctx_shared *sh,
 				       uint64_t async_id, int status);
 void mlx5_set_query_alarm(struct mlx5_dev_ctx_shared *sh);
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 6b6b9c77ae4..9050a32eb1c 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -102,9 +102,9 @@ mlx5_devx_modify_rq(struct mlx5_rxq_obj *rxq_obj, uint8_t type)
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-static int
-mlx5_devx_modify_sq(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
-		    uint8_t dev_port)
+int
+mlx5_txq_devx_modify(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
+		     uint8_t dev_port)
 {
 	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
 	int ret;
@@ -1121,7 +1121,7 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
 	*txq_data->qp_db = 0;
 	txq_data->qp_num_8s = txq_obj->sq_obj.sq->id << 8;
 	/* Change Send Queue state to Ready-to-Send. */
-	ret = mlx5_devx_modify_sq(txq_obj, MLX5_TXQ_MOD_RST2RDY, 0);
+	ret = mlx5_txq_devx_modify(txq_obj, MLX5_TXQ_MOD_RST2RDY, 0);
 	if (ret) {
 		rte_errno = errno;
 		DRV_LOG(ERR,
@@ -1190,7 +1190,7 @@ struct mlx5_obj_ops devx_obj_ops = {
 	.drop_action_create = mlx5_devx_drop_action_create,
 	.drop_action_destroy = mlx5_devx_drop_action_destroy,
 	.txq_obj_new = mlx5_txq_devx_obj_new,
-	.txq_obj_modify = mlx5_devx_modify_sq,
+	.txq_obj_modify = mlx5_txq_devx_modify,
 	.txq_obj_release = mlx5_txq_devx_obj_release,
 	.lb_dummy_queue_create = NULL,
 	.lb_dummy_queue_release = NULL,
diff --git a/drivers/net/mlx5/mlx5_devx.h b/drivers/net/mlx5/mlx5_devx.h
index bc8a8d6b73c..a95207a6b9a 100644
--- a/drivers/net/mlx5/mlx5_devx.h
+++ b/drivers/net/mlx5/mlx5_devx.h
@@ -8,6 +8,8 @@
 #include "mlx5.h"
 
 int mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_txq_devx_modify(struct mlx5_txq_obj *obj,
+			 enum mlx5_txq_modify_type type, uint8_t dev_port);
 void mlx5_txq_devx_obj_release(struct mlx5_txq_obj *txq_obj);
 
 extern struct mlx5_obj_ops devx_obj_ops;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 4abeae8ce2d..1d493f12075 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -6596,6 +6596,80 @@ mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev)
 						   actions, false, &error);
 }
 
+/**
+ * Create a dedicated flow rule on e-switch table 1, matches ESW manager
+ * and sq number, directs all packets to peer vport.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param txq
+ *   Txq index.
+ *
+ * @return
+ *   Flow ID on success, 0 otherwise and rte_errno is set.
+ */
+uint32_t
+mlx5_flow_create_devx_sq_miss_flow(struct rte_eth_dev *dev, uint32_t txq)
+{
+	struct rte_flow_attr attr = {
+		.group = 0,
+		.priority = MLX5_FLOW_LOWEST_PRIO_INDICATOR,
+		.ingress = 1,
+		.egress = 0,
+		.transfer = 1,
+	};
+	struct rte_flow_item_port_id port_spec = {
+		.id = MLX5_PORT_ESW_MGR,
+	};
+	struct mlx5_rte_flow_item_tx_queue txq_spec = {
+		.queue = txq,
+	};
+	struct rte_flow_item pattern[] = {
+		{
+			.type = RTE_FLOW_ITEM_TYPE_PORT_ID,
+			.spec = &port_spec,
+		},
+		{
+			.type = (enum rte_flow_item_type)
+				MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE,
+			.spec = &txq_spec,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+	};
+	struct rte_flow_action_jump jump = {
+		.group = 1,
+	};
+	struct rte_flow_action_port_id port = {
+		.id = dev->data->port_id,
+	};
+	struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_JUMP,
+			.conf = &jump,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	struct rte_flow_error error;
+
+	/*
+	 * Creates group 0, highest priority jump flow.
+	 * Matches txq to bypass kernel packets.
+	 */
+	if (flow_list_create(dev, MLX5_FLOW_TYPE_CTL, &attr, pattern, actions,
+			     false, &error) == 0)
+		return 0;
+	/* Create group 1, lowest priority redirect flow for txq. */
+	attr.group = 1;
+	actions[0].conf = &port;
+	actions[0].type = RTE_FLOW_ACTION_TYPE_PORT_ID;
+	return flow_list_create(dev, MLX5_FLOW_TYPE_CTL, &attr, pattern,
+				actions, false, &error);
+}
+
 /**
  * Validate a flow supported by the NIC.
  *
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 54c28934372..ca43bd51aab 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -1308,9 +1308,18 @@ mlx5_traffic_enable(struct rte_eth_dev *dev)
 				goto error;
 			}
 		}
+		if ((priv->representor || priv->master) &&
+		    priv->config.dv_esw_en) {
+			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
+				DRV_LOG(ERR,
+					"Port %u Tx queue %u SQ create representor devx default miss rule failed.",
+					dev->data->port_id, i);
+				goto error;
+			}
+		}
 		mlx5_txq_release(dev, i);
 	}
-	if (priv->config.dv_esw_en && !priv->config.vf && !priv->config.sf) {
+	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
 		if (mlx5_flow_create_esw_table_zero_flow(dev))
 			priv->fdb_def_rule = 1;
 		else
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [dpdk-dev] [PATCH v4 8/8] net/mlx5: check DevX to support more Verbs ports
  2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
                     ` (6 preceding siblings ...)
  2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
@ 2021-10-22  9:11   ` Xueming Li
  7 siblings, 0 replies; 47+ messages in thread
From: Xueming Li @ 2021-10-22  9:11 UTC (permalink / raw)
  To: dev; +Cc: xuemingl, Viacheslav Ovsiienko, Lior Margalit, Matan Azrad

Verbs API doesn't support device port number larger than 255 by design.

To support more VF or SubFunction port representors, force a DevX API
check when the maximum number of Verbs device link ports exceeds 255.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 2db842cb983..17192c7fd55 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -1243,12 +1243,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		config->dv_flow_en = 0;
 	}
 #endif
-	if (spawn->max_port > UINT8_MAX) {
-		/* Verbs can't support ports larger than 255 by design. */
-		DRV_LOG(ERR, "can't support IB ports > UINT8_MAX");
-		err = EINVAL;
-		goto error;
-	}
 	config->ind_table_max_size =
 		sh->device_attr.max_rwq_indirection_table_size;
 	/*
@@ -1699,6 +1693,11 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 					mlx5_rxq_ibv_obj_dummy_lb_create;
 		priv->obj_ops.lb_dummy_queue_release =
 					mlx5_rxq_ibv_obj_dummy_lb_release;
+	} else if (spawn->max_port > UINT8_MAX) {
+		/* Verbs can't support ports larger than 255 by design. */
+		DRV_LOG(ERR, "must enable DV and ESW when RDMA link ports > 255");
+		err = ENOTSUP;
+		goto error;
 	} else {
 		priv->obj_ops = ibv_obj_ops;
 	}
-- 
2.33.0


^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2021-10-22  9:13 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-27  8:32 [dpdk-dev] [PATCH 0/8] net/mlx5: support more than 255 representors Xueming Li
2021-09-27  8:32 ` [dpdk-dev] [PATCH 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
2021-09-27  8:32 ` [dpdk-dev] [PATCH 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
2021-09-27  8:32 ` [dpdk-dev] [PATCH 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
2021-09-27  8:32 ` [dpdk-dev] [PATCH 4/8] net/mlx5: check DevX to support more Verb ports Xueming Li
2021-09-27  8:32 ` [dpdk-dev] [PATCH 5/8] net/mlx5: support flow item port of switch manager Xueming Li
2021-09-27  8:32 ` [dpdk-dev] [PATCH 6/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
2021-09-27  8:32 ` [dpdk-dev] [PATCH 7/8] net/mlx5: fix internal root table flow priroity Xueming Li
2021-09-27  8:32 ` [dpdk-dev] [PATCH 8/8] net/mlx5: enable DevX Tx queue creation Xueming Li
2021-10-16  8:07 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: support more than 255 representors Xueming Li
2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
2021-10-19  8:23     ` Slava Ovsiienko
2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
2021-10-19  8:24     ` Slava Ovsiienko
2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
2021-10-19  8:26     ` Slava Ovsiienko
2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
2021-10-19  8:26     ` Slava Ovsiienko
2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
2021-10-19  8:27     ` Slava Ovsiienko
2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
2021-10-19  8:28     ` Slava Ovsiienko
2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
2021-10-19  8:29     ` Slava Ovsiienko
2021-10-16  8:07   ` [dpdk-dev] [PATCH v2 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
2021-10-19  8:30     ` Slava Ovsiienko
2021-10-19 10:34 ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Xueming Li
2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
2021-10-21 13:34     ` Ferruh Yigit
2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
2021-10-19 10:34   ` [dpdk-dev] [PATCH v3 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
2021-10-19 10:35   ` [dpdk-dev] [PATCH v3 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
2021-10-19 10:35   ` [dpdk-dev] [PATCH v3 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li
2021-10-20 13:40   ` [dpdk-dev] [PATCH v3 0/8] net/mlx5: support more than 255 representors Raslan Darawsheh
2021-10-20 16:00     ` Xueming(Steven) Li
2021-10-22  9:11 ` [dpdk-dev] [PATCH v4 " Xueming Li
2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 1/8] common/mlx5: add netlink API to get RDMA port state Xueming Li
2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 2/8] net/mlx5: use netlink when IB port greater than 255 Xueming Li
2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 3/8] net/mlx5: improve Verbs flow priority discover for scalable Xueming Li
2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 4/8] net/mlx5: support E-Switch manager egress traffic match Xueming Li
2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 5/8] net/mlx5: supports flow item of normal Tx queue Xueming Li
2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 6/8] net/mlx5: fix internal root table flow priroity Xueming Li
2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 7/8] net/mlx5: enable DevX Tx queue creation Xueming Li
2021-10-22  9:11   ` [dpdk-dev] [PATCH v4 8/8] net/mlx5: check DevX to support more Verbs ports Xueming Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).