* [dpdk-dev] [PATCH] net/mlx5: fix ESXi VLAN in virtual machine
@ 2019-07-15 13:45 Viacheslav Ovsiienko
  2019-07-29 15:14 ` Matan Azrad
  2019-07-29 15:26 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
  0 siblings, 2 replies; 7+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-15 13:45 UTC (permalink / raw)
  To: dev; +Cc: yskoh
On ESXi setups with SR-IOV and E-Switch enabled there is a problem
receiving VLAN traffic on VF interfaces. The NIC driver in the ESXi
hypervisor does not set up the E-Switch vport settings correctly,
so VLAN traffic targeted to a VF is dropped.
The patch provides a temporary workaround: when a rule containing
a VLAN pattern is installed for a VF, a VLAN network interface is
created over the VF, as the following command would do:
  ip link add link vf.if name mlx5.wa.1.100 type vlan id 100
The PMD in DPDK maintains the database of created VLAN interfaces
for each existing VF and requested VLAN tags. When all of the RTE
Flows using the given VLAN tag are removed the created VLAN interface
with this VLAN tag is deleted.
The name of the created VLAN interface follows the format:
  evmlx.d1.d2, where d1 is the VF interface ifindex and d2 is the VLAN tag
Implementation limitations:
- masks in rules are not supported; a rule must specify the VLAN tag
  exactly, no wildcards (which are implemented by masks) are allowed
- the virtual environment is detected via the rte_hypervisor_get() call;
  currently it checks the RTE_CPUFLAG_HYPERVISOR flag on the x86
  platform. On other architectures the workaround is always
  applied to flows over a PCI VF
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c            |   6 +
 drivers/net/mlx5/mlx5.h            |  30 ++++
 drivers/net/mlx5/mlx5_flow.c       |  22 +++
 drivers/net/mlx5/mlx5_flow.h       |   5 +
 drivers/net/mlx5/mlx5_flow_dv.c    |  33 ++++-
 drivers/net/mlx5/mlx5_flow_verbs.c |  25 +++-
 drivers/net/mlx5/mlx5_nl.c         | 279 +++++++++++++++++++++++++++++++++++++
 7 files changed, 396 insertions(+), 4 deletions(-)
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d93f92d..8549167 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -690,6 +690,8 @@ struct mlx5_dev_spawn_data {
 		close(priv->nl_socket_route);
 	if (priv->nl_socket_rdma >= 0)
 		close(priv->nl_socket_rdma);
+	if (priv->esxi_context)
+		mlx5_vlan_esxi_exit(priv->esxi_context);
 	if (priv->sh) {
 		/*
 		 * Free the shared context in last turn, because the cleanup
@@ -1546,6 +1548,8 @@ struct mlx5_dev_spawn_data {
 #endif
 	/* Store device configuration on private structure. */
 	priv->config = config;
+	/* Create context for virtual machine VLAN workaround. */
+	priv->esxi_context = mlx5_vlan_esxi_init(eth_dev, spawn->ifindex);
 	if (config.dv_flow_en) {
 		err = mlx5_alloc_shared_dr(priv);
 		if (err)
@@ -1572,6 +1576,8 @@ struct mlx5_dev_spawn_data {
 			close(priv->nl_socket_route);
 		if (priv->nl_socket_rdma >= 0)
 			close(priv->nl_socket_rdma);
+		if (priv->esxi_context)
+			mlx5_vlan_esxi_exit(priv->esxi_context);
 		if (own_domain_id)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 5af3f41..87afa7a 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -231,6 +231,27 @@ enum mlx5_verbs_alloc_type {
 	MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
 };
 
+/*
+ * VLAN netdev for ESXi VLAN workaround.
+ *
+ * One entry exists per VLAN tag, see the vlan_dev[] array in
+ * struct mlx5_vlan_esxi_context.
+ */
+struct mlx5_vlan_dev {
+	uint32_t refcnt; /**< Acquire calls not yet matched by a release. */
+	uint32_t ifindex; /**< Own interface index, 0 - no device. */
+};
+
+/* Structure for VF ESXi VLAN workaround. */
+struct mlx5_vf_vlan {
+	uint32_t tag:12; /**< Low 12 bits of the VLAN TCI of the flow. */
+	uint32_t created:1; /**< Set while a VLAN device reference is held. */
+};
+
+/* Array of VLAN devices created on the base of VF */
+struct mlx5_vlan_esxi_context {
+	int nl_socket; /**< Netlink socket (NETLINK_ROUTE). */
+	uint32_t nl_sn; /**< Netlink message sequence number. */
+	uint32_t vf_ifindex; /**< Interface index VLAN devices are based on. */
+	struct rte_eth_dev *dev; /**< Ethernet device passed at creation. */
+	struct mlx5_vlan_dev vlan_dev[4096]; /**< Indexed by VLAN tag. */
+};
+
 /**
  * Verbs allocator needs a context to know in the callback which kind of
  * resources it is allocating.
@@ -386,6 +407,7 @@ struct mlx5_priv {
 	int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
 	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
 	uint32_t nl_sn; /* Netlink message sequence number. */
+	struct mlx5_vlan_esxi_context *esxi_context; /* ESXi VLAN context. */
 #ifndef RTE_ARCH_64
 	rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
 	rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
@@ -582,6 +604,14 @@ int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
 int mlx5_nl_switch_info(int nl, unsigned int ifindex,
 			struct mlx5_switch_info *info);
 
+struct mlx5_vlan_esxi_context *mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
+						   uint32_t ifindex);
+void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *ctx);
+void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vf_vlan);
+void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vf_vlan);
+
 /* mlx5_devx_cmds.c */
 
 int mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 4ba34db..42743d2 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -1200,6 +1200,8 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
  *   Item specification.
  * @param[in] item_flags
  *   Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ *   Ethernet device flow is being created on.
  * @param[out] error
  *   Pointer to error structure.
  *
@@ -1209,6 +1211,7 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 int
 mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
 			     uint64_t item_flags,
+			     struct rte_eth_dev *dev,
 			     struct rte_flow_error *error)
 {
 	const struct rte_flow_item_vlan *spec = item->spec;
@@ -1243,6 +1246,25 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 					error);
 	if (ret)
 		return ret;
+	if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
+		struct mlx5_priv *priv = dev->data->dev_private;
+
+		if (priv->esxi_context) {
+			/*
+			 * Non-NULL context means we have a virtual machine
+			 * and SR-IOV enabled, we have to create VLAN interface
+			 * to make hypervisor (ESXi) to setup E-Switch vport
+			 * context correctly. We avoid creating the multiple
+			 * VLAN interfaces, so we cannot support VLAN tag mask.
+			 */
+			return rte_flow_error_set(error, EINVAL,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "VLAN tag mask is not"
+						  " supported in virtual"
+						  " environment");
+		}
+	}
 	if (spec) {
 		vlan_tag = spec->tci;
 		vlan_tag &= mask->tci;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 72b339e..ac20572 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -318,6 +318,8 @@ struct mlx5_flow_dv {
 	/**< Pointer to the jump action resource. */
 	struct mlx5_flow_dv_port_id_action_resource *port_id_action;
 	/**< Pointer to port ID action resource. */
+	struct mlx5_vf_vlan vf_vlan;
+	/**< Structure for VF ESXi VLAN workaround. */
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 	void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS];
 	/**< Action list. */
@@ -343,6 +345,8 @@ struct mlx5_flow_verbs {
 	struct ibv_flow *flow; /**< Verbs flow pointer. */
 	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
 	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
+	struct mlx5_vf_vlan vf_vlan;
+	/**< Structure for VF ESXi VLAN workaround. */
 };
 
 /** Device flow structure. */
@@ -507,6 +511,7 @@ int mlx5_flow_validate_item_udp(const struct rte_flow_item *item,
 				struct rte_flow_error *error);
 int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
 				 uint64_t item_flags,
+				 struct rte_eth_dev *dev,
 				 struct rte_flow_error *error);
 int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
 				  uint64_t item_flags,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 3fa624b..63183b5 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -2363,7 +2363,7 @@ struct field_modify_info modify_tcp[] = {
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			ret = mlx5_flow_validate_item_vlan(items, item_flags,
-							   error);
+							   dev, error);
 			if (ret < 0)
 				return ret;
 			last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
@@ -2914,6 +2914,8 @@ struct field_modify_info modify_tcp[] = {
 /**
  * Add VLAN item to matcher and to the value.
  *
+ * @param[in, out] dev_flow
+ *   Flow descriptor.
  * @param[in, out] matcher
  *   Flow matcher.
  * @param[in, out] key
@@ -2924,7 +2926,8 @@ struct field_modify_info modify_tcp[] = {
  *   Item is inner pattern.
  */
 static void
-flow_dv_translate_item_vlan(void *matcher, void *key,
+flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow,
+			    void *matcher, void *key,
 			    const struct rte_flow_item *item,
 			    int inner)
 {
@@ -2951,6 +2954,12 @@ struct field_modify_info modify_tcp[] = {
 		headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
 					 outer_headers);
 		headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+		/*
+		 * This is workaround, masks are not supported,
+		 * and pre-validated.
+		 */
+		dev_flow->dv.vf_vlan.tag =
+			rte_be_to_cpu_16(vlan_v->tci) & 0x0fff;
 	}
 	tci_m = rte_be_to_cpu_16(vlan_m->tci);
 	tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci);
@@ -4443,7 +4452,8 @@ struct field_modify_info modify_tcp[] = {
 					     MLX5_FLOW_LAYER_OUTER_L2;
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
-			flow_dv_translate_item_vlan(match_mask, match_value,
+			flow_dv_translate_item_vlan(dev_flow,
+						    match_mask, match_value,
 						    items, tunnel);
 			matcher.priority = MLX5_PRIORITY_MAP_L2;
 			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -4658,6 +4668,17 @@ struct field_modify_info modify_tcp[] = {
 					   "hardware refuses to create flow");
 			goto error;
 		}
+		if (priv->esxi_context &&
+		    dev_flow->dv.vf_vlan.tag &&
+		    !dev_flow->dv.vf_vlan.created) {
+			/*
+			 * The rule contains the VLAN pattern.
+			 * For VF we are going to create VLAN
+			 * interface to make ESXi set correct
+			 * e-Switch vport context.
+			 */
+			mlx5_vlan_esxi_acquire(dev, &dev_flow->dv.vf_vlan);
+		}
 	}
 	return 0;
 error:
@@ -4671,6 +4692,9 @@ struct field_modify_info modify_tcp[] = {
 				mlx5_hrxq_release(dev, dv->hrxq);
 			dv->hrxq = NULL;
 		}
+		if (dev_flow->dv.vf_vlan.tag &&
+		    dev_flow->dv.vf_vlan.created)
+			mlx5_vlan_esxi_release(dev, &dev_flow->dv.vf_vlan);
 	}
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
@@ -4871,6 +4895,9 @@ struct field_modify_info modify_tcp[] = {
 				mlx5_hrxq_release(dev, dv->hrxq);
 			dv->hrxq = NULL;
 		}
+		if (dev_flow->dv.vf_vlan.tag &&
+		    dev_flow->dv.vf_vlan.created)
+			mlx5_vlan_esxi_release(dev, &dev_flow->dv.vf_vlan);
 	}
 }
 
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 2f4c80c..5909488 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -386,6 +386,9 @@
 		flow_verbs_spec_add(&dev_flow->verbs, &eth, size);
 	else
 		flow_verbs_item_vlan_update(dev_flow->verbs.attr, &eth);
+	if (!tunnel)
+		dev_flow->verbs.vf_vlan.tag =
+			rte_be_to_cpu_16(spec->tci) & 0x0fff;
 }
 
 /**
@@ -1049,7 +1052,7 @@
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			ret = mlx5_flow_validate_item_vlan(items, item_flags,
-							   error);
+							   dev, error);
 			if (ret < 0)
 				return ret;
 			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -1587,6 +1590,10 @@
 				mlx5_hrxq_release(dev, verbs->hrxq);
 			verbs->hrxq = NULL;
 		}
+		if (dev_flow->verbs.vf_vlan.tag &&
+		    dev_flow->verbs.vf_vlan.created) {
+			mlx5_vlan_esxi_release(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 }
 
@@ -1634,6 +1641,7 @@
 flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
 		 struct rte_flow_error *error)
 {
+	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_verbs *verbs;
 	struct mlx5_flow *dev_flow;
 	int err;
@@ -1683,6 +1691,17 @@
 					   "hardware refuses to create flow");
 			goto error;
 		}
+		if (priv->esxi_context &&
+		    dev_flow->verbs.vf_vlan.tag &&
+		    !dev_flow->verbs.vf_vlan.created) {
+			/*
+			 * The rule contains the VLAN pattern.
+			 * For VF we are going to create VLAN
+			 * interface to make ESXi set correct
+			 * e-Switch vport context.
+			 */
+			mlx5_vlan_esxi_acquire(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 	return 0;
 error:
@@ -1696,6 +1715,10 @@
 				mlx5_hrxq_release(dev, verbs->hrxq);
 			verbs->hrxq = NULL;
 		}
+		if (dev_flow->verbs.vf_vlan.tag &&
+		    dev_flow->verbs.vf_vlan.created) {
+			mlx5_vlan_esxi_release(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
index 5773fa7..8516442 100644
--- a/drivers/net/mlx5/mlx5_nl.c
+++ b/drivers/net/mlx5/mlx5_nl.c
@@ -12,11 +12,14 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdalign.h>
 #include <string.h>
 #include <sys/socket.h>
 #include <unistd.h>
 
 #include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_hypervisor.h>
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -28,6 +31,8 @@
 /* Receive buffer size for the Netlink socket */
 #define MLX5_RECV_BUF_SIZE 32768
 
+/** Parameters of VLAN devices created by driver. */
+#define MLX5_ESXI_VLAN_DEVICE_PFX "evmlx"
 /*
  * Define NDA_RTA as defined in iproute2 sources.
  *
@@ -987,3 +992,277 @@ struct mlx5_nl_ifindex_data {
 	}
 	return ret;
 }
+
+/*
+ * Delete VLAN network device by ifindex.
+ *
+ * Sends an RTM_DELLINK request (with NLM_F_ACK) over the context Netlink
+ * socket and then calls mlx5_nl_recv() for the reply. A failure is only
+ * logged. Nothing is sent when ifindex is zero.
+ *
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init().
+ * @param[in] ifindex
+ *   Interface index of network device to delete.
+ */
+static void
+mlx5_vlan_esxi_delete(struct mlx5_vlan_esxi_context *esxi,
+		      uint32_t ifindex)
+{
+	int ret;
+	struct {
+		struct nlmsghdr nh;
+		struct ifinfomsg info;
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+			.nlmsg_type = RTM_DELLINK,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+		},
+		.info = {
+			.ifi_family = AF_UNSPEC,
+			.ifi_index = ifindex,
+		},
+	};
+
+	if (ifindex) {
+		/* Advance the sequence number, skipping zero on wrap. */
+		++esxi->nl_sn;
+		if (!esxi->nl_sn)
+			++esxi->nl_sn;
+		ret = mlx5_nl_send(esxi->nl_socket, &req.nh, esxi->nl_sn);
+		/* Read the reply only if the request was sent. */
+		if (ret >= 0)
+			ret = mlx5_nl_recv(esxi->nl_socket,
+					   esxi->nl_sn,
+					   NULL, NULL);
+		if (ret < 0)
+			DRV_LOG(WARNING, "netlink: error deleting"
+					 " VLAN ESXi ifindex %u, %d",
+					 ifindex, ret);
+	}
+}
+
+/* Set of subroutines to build Netlink message. */
+
+/*
+ * Get the position following the current content of a Netlink message,
+ * i.e. the message start plus its length rounded up by NLMSG_ALIGN().
+ *
+ * @param[in] nlh
+ *   Netlink message header.
+ *
+ * @return
+ *   Pointer to the place where the next attribute is to be stored.
+ */
+static struct nlattr *
+nl_msg_tail(struct nlmsghdr *nlh)
+{
+	uint8_t *base = (uint8_t *)nlh;
+	uint32_t used = NLMSG_ALIGN(nlh->nlmsg_len);
+
+	return (struct nlattr *)(base + used);
+}
+
+/*
+ * Append an attribute to the tail of a Netlink message.
+ *
+ * The caller must guarantee the message buffer has room for the attribute
+ * header plus the payload rounded up to the Netlink alignment.
+ *
+ * @param[in, out] nlh
+ *   Netlink message header, nlmsg_len is advanced past the new attribute.
+ * @param[in] type
+ *   Attribute type.
+ * @param[in] data
+ *   Attribute payload, not read when alen is zero.
+ * @param[in] alen
+ *   Payload length in bytes.
+ */
+static void
+nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
+{
+	struct nlattr *nla = nl_msg_tail(nlh);
+
+	nla->nla_type = type;
+	/*
+	 * The attribute length covers the header and the payload only.
+	 * The alignment padding belongs to the message length, not to
+	 * the attribute length.
+	 */
+	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen;
+	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) +
+			 NLMSG_ALIGN(nla->nla_len);
+
+	if (alen)
+		memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
+}
+
+/*
+ * Open a nested (container) attribute at the tail of a Netlink message.
+ *
+ * @param[in, out] nlh
+ *   Netlink message header.
+ * @param[in] type
+ *   Type of the container attribute.
+ *
+ * @return
+ *   Pointer to the container attribute, to be passed to
+ *   nl_attr_nest_end() once the nested attributes are appended.
+ */
+static struct nlattr *
+nl_attr_nest_start(struct nlmsghdr *nlh, int type)
+{
+	/* Remember where the container lands before it is appended. */
+	struct nlattr *container = nl_msg_tail(nlh);
+
+	nl_attr_put(nlh, type, NULL, 0);
+	return container;
+}
+
+/*
+ * Close a nested attribute: store the distance from the container
+ * start to the current message tail as the container length.
+ *
+ * @param[in] nlh
+ *   Netlink message header.
+ * @param[in, out] nest
+ *   Container attribute returned by nl_attr_nest_start().
+ */
+static void
+nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
+{
+	uint8_t *end = (uint8_t *)nl_msg_tail(nlh);
+	uint8_t *begin = (uint8_t *)nest;
+
+	nest->nla_len = end - begin;
+}
+
+/*
+ * Create network VLAN device with specified VLAN tag.
+ *
+ * Builds an RTM_NEWLINK request naming the device
+ * "<MLX5_ESXI_VLAN_DEVICE_PFX>.<ifindex>.<tag>", sends it over the context
+ * Netlink socket and then resolves the interface index by name.
+ *
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init().
+ * @param[in] ifindex
+ *   Base network interface index.
+ * @param[in] tag
+ *   VLAN tag for VLAN network device to create.
+ *
+ * @return
+ *   Interface index of the VLAN device, 0 if the index cannot be obtained.
+ */
+static uint32_t
+mlx5_vlan_esxi_create(struct mlx5_vlan_esxi_context *esxi,
+		      uint32_t ifindex,
+		      uint16_t tag)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	char name[sizeof(MLX5_ESXI_VLAN_DEVICE_PFX) + 32];
+
+	/* Room for both headers, the attribute headers and the payloads. */
+	alignas(RTE_CACHE_LINE_SIZE)
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(name)) +
+		    NLMSG_ALIGN(sizeof("vlan")) +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
+	struct nlattr *na_info;
+	struct nlattr *na_vlan;
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	/* Advance the sequence number, skipping zero on wrap. */
+	++esxi->nl_sn;
+	if (!esxi->nl_sn)
+		++esxi->nl_sn;
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = RTM_NEWLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+			   NLM_F_EXCL | NLM_F_ACK;
+	/* The ifinfomsg header directly follows the Netlink header. */
+	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct ifinfomsg);
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_type = 0;
+	ifm->ifi_index = 0;
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = 0xffffffff;
+	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
+	ret = snprintf(name, sizeof(name), "%s.%u.%u",
+		       MLX5_ESXI_VLAN_DEVICE_PFX, ifindex, tag);
+	/* The name is stored together with its terminating zero. */
+	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
+	/* IFLA_LINKINFO { IFLA_INFO_KIND, IFLA_INFO_DATA { IFLA_VLAN_ID } } */
+	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
+	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
+	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
+	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
+	nl_attr_nest_end(nlh, na_vlan);
+	nl_attr_nest_end(nlh, na_info);
+	assert(sizeof(buf) >= nlh->nlmsg_len);
+	ret = mlx5_nl_send(esxi->nl_socket, nlh, esxi->nl_sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(esxi->nl_socket, esxi->nl_sn, NULL, NULL);
+	if (ret < 0) {
+		/* Not fatal, the index lookup below is attempted anyway. */
+		DRV_LOG(WARNING,
+			"netlink: VLAN %s create failure (%d)",
+			name, ret);
+	}
+	/* Try to get ifindex of created or pre-existing device. */
+	ret = if_nametoindex(name);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"VLAN %s failed to get index (%d)",
+			name, errno);
+		return 0;
+	}
+	return ret;
+}
+
+/**
+ * Release VLAN network device, created for ESXi workaround.
+ *
+ * Drops one reference on the VLAN device matching vlan->tag and clears
+ * vlan->created. The network device is deleted when the last reference
+ * is gone. Does nothing if the reference is not held or there is no
+ * workaround context.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in, out] vlan
+ *   Object representing the network device to release.
+ */
+void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
+	struct mlx5_vlan_dev *vlan_dev;
+
+	assert(vlan->created);
+	assert(priv->esxi_context);
+	if (!vlan->created || !esxi)
+		return;
+	/* The context is accessed only after it is known to be non-NULL. */
+	vlan_dev = &esxi->vlan_dev[vlan->tag];
+	vlan->created = 0;
+	assert(vlan_dev->refcnt);
+	if (--vlan_dev->refcnt == 0 && vlan_dev->ifindex) {
+		mlx5_vlan_esxi_delete(esxi, vlan_dev->ifindex);
+		vlan_dev->ifindex = 0;
+	}
+}
+
+/**
+ * Acquire VLAN interface with specified tag for ESXi workaround.
+ *
+ * The VLAN network device is created on the first reference for the tag.
+ * On success the reference counter is incremented and vlan->created is
+ * set; if the device is not available vlan->created stays zero. Does
+ * nothing if the reference is already held or there is no workaround
+ * context.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in, out] vlan
+ *   Object representing the network device to acquire.
+ */
+void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
+	struct mlx5_vlan_dev *vlan_dev;
+
+	assert(!vlan->created);
+	assert(priv->esxi_context);
+	if (vlan->created || !esxi)
+		return;
+	/* The context is accessed only after it is known to be non-NULL. */
+	vlan_dev = &esxi->vlan_dev[vlan->tag];
+	if (vlan_dev->refcnt == 0) {
+		assert(!vlan_dev->ifindex);
+		vlan_dev->ifindex = mlx5_vlan_esxi_create(esxi,
+							  esxi->vf_ifindex,
+							  vlan->tag);
+	}
+	/* A zero ifindex means the device is not available. */
+	if (vlan_dev->ifindex) {
+		vlan_dev->refcnt++;
+		vlan->created = 1;
+	}
+}
+
+/*
+ * Create per ethernet device VLAN ESXi workaround context
+ *
+ * The device configuration is read from the private data of dev, so it
+ * must be stored there before this call.
+ *
+ * @param[in] dev
+ *   Ethernet device object the context is created for.
+ * @param[in] ifindex
+ *   Interface index stored as the base for the VLAN devices.
+ *
+ * @return
+ *   Pointer to the created context, NULL if the device is not a VF,
+ *   no hypervisor is reported, or the allocation or the Netlink socket
+ *   creation failed.
+ */
+struct mlx5_vlan_esxi_context *
+mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
+		    uint32_t ifindex)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	struct mlx5_vlan_esxi_context *esxi;
+
+	/* Do not engage workaround over PF. */
+	if (!config->vf)
+		return NULL;
+	/* Check whether there is virtual environment */
+	if (rte_hypervisor_get() == RTE_HYPERVISOR_NONE)
+		return NULL;
+	/* Zeroed allocation: all the VLAN device entries start unused. */
+	esxi = rte_zmalloc(__func__, sizeof(*esxi), sizeof(uint32_t));
+	if (!esxi) {
+		DRV_LOG(WARNING,
+			"Can not allocate memory"
+			" for ESXi VLAN context");
+		return NULL;
+	}
+	esxi->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
+	if (esxi->nl_socket < 0) {
+		DRV_LOG(WARNING,
+			"Can not create Netlink socket"
+			" for ESXi VLAN context");
+		rte_free(esxi);
+		return NULL;
+	}
+	esxi->nl_sn = random();
+	esxi->vf_ifindex = ifindex;
+	esxi->dev = dev;
+	/*
+	 * NOTE(review): no cleanup of pre-existing VLAN devices
+	 * is performed here.
+	 */
+	return esxi;
+}
+
+/*
+ * Destroy per ethernet device VLAN ESXi workaround context
+ *
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init(), must not be NULL.
+ */
+void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *esxi)
+{
+	struct mlx5_vlan_dev *vd = esxi->vlan_dev;
+	struct mlx5_vlan_dev *const end = vd + RTE_DIM(esxi->vlan_dev);
+
+	/* Remove every VLAN device still recorded in the context. */
+	for (; vd != end; ++vd) {
+		if (!vd->ifindex)
+			continue;
+		mlx5_vlan_esxi_delete(esxi, vd->ifindex);
+	}
+	if (esxi->nl_socket >= 0)
+		close(esxi->nl_socket);
+	rte_free(esxi);
+}
-- 
1.8.3.1
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [dpdk-dev] [PATCH] net/mlx5: fix ESXi VLAN in virtual machine
  2019-07-15 13:45 [dpdk-dev] [PATCH] net/mlx5: fix ESXi VLAN in virtual machine Viacheslav Ovsiienko
@ 2019-07-29 15:14 ` Matan Azrad
  2019-07-29 15:26 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
  1 sibling, 0 replies; 7+ messages in thread
From: Matan Azrad @ 2019-07-29 15:14 UTC (permalink / raw)
  To: Slava Ovsiienko, dev; +Cc: Yongseok Koh
From: Viacheslav Ovsiienko
> On ESXi setups when we have SR-IOV and E-Switch enabled there is the
> problem to receive VLAN traffic on VF interfaces. The NIC driver in ESXi
> hypervisor does not setup E-Switch vport setting correctly and VLAN traffic
> targeted to VF is dropped.
> 
> The patch provides the temporary workaround - if the rule containing the
> VLAN pattern is being installed for VF the VLAN network interface over VF is
> created, like the command does:
> 
>   ip link add link vf.if name mlx5.wa.1.100 type vlan id 100
> 
> The PMD in DPDK maintains the database of created VLAN interfaces for
> each existing VF and requested VLAN tags. When all of the RTE Flows using
> the given VLAN tag are removed the created VLAN interface with this VLAN
> tag is deleted.
> 
> The name of created VLAN interface follows the format:
> 
>   evmlx.d1.d2, where d1 is VF interface ifindex, d2 - VLAN ifindex
> 
> Implementation limitations:
> 
> - mask in rules is ignored, rule must specify VLAN tags exactly,
>   no wildcards (which are implemented by the masks) are allowed
> 
> - virtual environment is detected via rte_hypervisor() call,
>   currently it checks the RTE_CPUFLAG_HYPERVISOR flag for x86
>   platform. For other architectures workaround always
>   applied for the Flow over PCI VF
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
After rebase, 
Acked-by: Matan Azrad <matan@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c            |   6 +
>  drivers/net/mlx5/mlx5.h            |  30 ++++
>  drivers/net/mlx5/mlx5_flow.c       |  22 +++
>  drivers/net/mlx5/mlx5_flow.h       |   5 +
>  drivers/net/mlx5/mlx5_flow_dv.c    |  33 ++++-
>  drivers/net/mlx5/mlx5_flow_verbs.c |  25 +++-
>  drivers/net/mlx5/mlx5_nl.c         | 279
> +++++++++++++++++++++++++++++++++++++
>  7 files changed, 396 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> d93f92d..8549167 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -690,6 +690,8 @@ struct mlx5_dev_spawn_data {
>  		close(priv->nl_socket_route);
>  	if (priv->nl_socket_rdma >= 0)
>  		close(priv->nl_socket_rdma);
> +	if (priv->esxi_context)
> +		mlx5_vlan_esxi_exit(priv->esxi_context);
>  	if (priv->sh) {
>  		/*
>  		 * Free the shared context in last turn, because the cleanup
> @@ -1546,6 +1548,8 @@ struct mlx5_dev_spawn_data {  #endif
>  	/* Store device configuration on private structure. */
>  	priv->config = config;
> +	/* Create context for virtual machine VLAN workaround. */
> +	priv->esxi_context = mlx5_vlan_esxi_init(eth_dev, spawn->ifindex);
>  	if (config.dv_flow_en) {
>  		err = mlx5_alloc_shared_dr(priv);
>  		if (err)
> @@ -1572,6 +1576,8 @@ struct mlx5_dev_spawn_data {
>  			close(priv->nl_socket_route);
>  		if (priv->nl_socket_rdma >= 0)
>  			close(priv->nl_socket_rdma);
> +		if (priv->esxi_context)
> +			mlx5_vlan_esxi_exit(priv->esxi_context);
>  		if (own_domain_id)
>  			claim_zero(rte_eth_switch_domain_free(priv-
> >domain_id));
>  		rte_free(priv);
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> 5af3f41..87afa7a 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -231,6 +231,27 @@ enum mlx5_verbs_alloc_type {
>  	MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
>  };
> 
> +/* VLAN netdev for ESXi VLAN workaround. */ struct mlx5_vlan_dev {
> +	uint32_t refcnt;
> +	uint32_t ifindex; /**< Own interface index. */ };
> +
> +/* Structure for VF ESXi VLAN workaround. */ struct mlx5_vf_vlan {
> +	uint32_t tag:12;
> +	uint32_t created:1;
> +};
> +
> +/* Array of VLAN devices created on the base of VF */ struct
> +mlx5_vlan_esxi_context {
> +	int nl_socket;
> +	uint32_t nl_sn;
> +	uint32_t vf_ifindex;
> +	struct rte_eth_dev *dev;
> +	struct mlx5_vlan_dev vlan_dev[4096];
> +};
> +
>  /**
>   * Verbs allocator needs a context to know in the callback which kind of
>   * resources it is allocating.
> @@ -386,6 +407,7 @@ struct mlx5_priv {
>  	int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
>  	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
>  	uint32_t nl_sn; /* Netlink message sequence number. */
> +	struct mlx5_vlan_esxi_context *esxi_context; /* ESXi VLAN context.
> */
>  #ifndef RTE_ARCH_64
>  	rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
>  	rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX]; @@ -582,6
> +604,14 @@ int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct
> rte_ether_addr *mac,  int mlx5_nl_switch_info(int nl, unsigned int ifindex,
>  			struct mlx5_switch_info *info);
> 
> +struct mlx5_vlan_esxi_context *mlx5_vlan_esxi_init(struct rte_eth_dev
> *dev,
> +						   uint32_t ifindex);
> +void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *ctx); void
> +mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vf_vlan);
> +void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vf_vlan);
> +
>  /* mlx5_devx_cmds.c */
> 
>  int mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx, diff --git
> a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index
> 4ba34db..42743d2 100644
> --- a/drivers/net/mlx5/mlx5_flow.c
> +++ b/drivers/net/mlx5/mlx5_flow.c
> @@ -1200,6 +1200,8 @@ uint32_t mlx5_flow_adjust_priority(struct
> rte_eth_dev *dev, int32_t priority,
>   *   Item specification.
>   * @param[in] item_flags
>   *   Bit-fields that holds the items detected until now.
> + * @param[in] dev
> + *   Ethernet device flow is being created on.
>   * @param[out] error
>   *   Pointer to error structure.
>   *
> @@ -1209,6 +1211,7 @@ uint32_t mlx5_flow_adjust_priority(struct
> rte_eth_dev *dev, int32_t priority,  int  mlx5_flow_validate_item_vlan(const
> struct rte_flow_item *item,
>  			     uint64_t item_flags,
> +			     struct rte_eth_dev *dev,
>  			     struct rte_flow_error *error)
>  {
>  	const struct rte_flow_item_vlan *spec = item->spec; @@ -1243,6
> +1246,25 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev,
> int32_t priority,
>  					error);
>  	if (ret)
>  		return ret;
> +	if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
> +		struct mlx5_priv *priv = dev->data->dev_private;
> +
> +		if (priv->esxi_context) {
> +			/*
> +			 * Non-NULL context means we have a virtual
> machine
> +			 * and SR-IOV enabled, we have to create VLAN
> interface
> +			 * to make hypervisor (ESXi) to setup E-Switch vport
> +			 * context correctly. We avoid creating the multiple
> +			 * VLAN interfaces, so we cannot support VLAN tag
> mask.
> +			 */
> +			return rte_flow_error_set(error, EINVAL,
> +
> RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "VLAN tag mask is not"
> +						  " supported in virtual"
> +						  " environment");
> +		}
> +	}
>  	if (spec) {
>  		vlan_tag = spec->tci;
>  		vlan_tag &= mask->tci;
> diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
> index 72b339e..ac20572 100644
> --- a/drivers/net/mlx5/mlx5_flow.h
> +++ b/drivers/net/mlx5/mlx5_flow.h
> @@ -318,6 +318,8 @@ struct mlx5_flow_dv {
>  	/**< Pointer to the jump action resource. */
>  	struct mlx5_flow_dv_port_id_action_resource *port_id_action;
>  	/**< Pointer to port ID action resource. */
> +	struct mlx5_vf_vlan vf_vlan;
> +	/**< Structure for VF ESXi VLAN workaround. */
>  #ifdef HAVE_IBV_FLOW_DV_SUPPORT
>  	void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS];
>  	/**< Action list. */
> @@ -343,6 +345,8 @@ struct mlx5_flow_verbs {
>  	struct ibv_flow *flow; /**< Verbs flow pointer. */
>  	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
>  	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
> +	struct mlx5_vf_vlan vf_vlan;
> +	/**< Structure for VF ESXi VLAN workaround. */
>  };
> 
>  /** Device flow structure. */
> @@ -507,6 +511,7 @@ int mlx5_flow_validate_item_udp(const struct
> rte_flow_item *item,
>  				struct rte_flow_error *error);
>  int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
>  				 uint64_t item_flags,
> +				 struct rte_eth_dev *dev,
>  				 struct rte_flow_error *error);
>  int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
>  				  uint64_t item_flags,
> diff --git a/drivers/net/mlx5/mlx5_flow_dv.c
> b/drivers/net/mlx5/mlx5_flow_dv.c index 3fa624b..63183b5 100644
> --- a/drivers/net/mlx5/mlx5_flow_dv.c
> +++ b/drivers/net/mlx5/mlx5_flow_dv.c
> @@ -2363,7 +2363,7 @@ struct field_modify_info modify_tcp[] = {
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_VLAN:
>  			ret = mlx5_flow_validate_item_vlan(items,
> item_flags,
> -							   error);
> +							   dev, error);
>  			if (ret < 0)
>  				return ret;
>  			last_item = tunnel ?
> MLX5_FLOW_LAYER_INNER_VLAN :
> @@ -2914,6 +2914,8 @@ struct field_modify_info modify_tcp[] = {
>  /**
>   * Add VLAN item to matcher and to the value.
>   *
> + * @param[in, out] dev_flow
> + *   Flow descriptor.
>   * @param[in, out] matcher
>   *   Flow matcher.
>   * @param[in, out] key
> @@ -2924,7 +2926,8 @@ struct field_modify_info modify_tcp[] = {
>   *   Item is inner pattern.
>   */
>  static void
> -flow_dv_translate_item_vlan(void *matcher, void *key,
> +flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow,
> +			    void *matcher, void *key,
>  			    const struct rte_flow_item *item,
>  			    int inner)
>  {
> @@ -2951,6 +2954,12 @@ struct field_modify_info modify_tcp[] = {
>  		headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
>  					 outer_headers);
>  		headers_v = MLX5_ADDR_OF(fte_match_param, key,
> outer_headers);
> +		/*
> +		 * This is workaround, masks are not supported,
> +		 * and pre-validated.
> +		 */
> +		dev_flow->dv.vf_vlan.tag =
> +			rte_be_to_cpu_16(vlan_v->tci) & 0x0fff;
>  	}
>  	tci_m = rte_be_to_cpu_16(vlan_m->tci);
>  	tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci); @@ -4443,7
> +4452,8 @@ struct field_modify_info modify_tcp[] = {
>  					     MLX5_FLOW_LAYER_OUTER_L2;
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_VLAN:
> -			flow_dv_translate_item_vlan(match_mask,
> match_value,
> +			flow_dv_translate_item_vlan(dev_flow,
> +						    match_mask, match_value,
>  						    items, tunnel);
>  			matcher.priority = MLX5_PRIORITY_MAP_L2;
>  			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2
> | @@ -4658,6 +4668,17 @@ struct field_modify_info modify_tcp[] = {
>  					   "hardware refuses to create flow");
>  			goto error;
>  		}
> +		if (priv->esxi_context &&
> +		    dev_flow->dv.vf_vlan.tag &&
> +		    !dev_flow->dv.vf_vlan.created) {
> +			/*
> +			 * The rule contains the VLAN pattern.
> +			 * For VF we are going to create VLAN
> +			 * interface to make ESXi set correct
> +			 * e-Switch vport context.
> +			 */
> +			mlx5_vlan_esxi_acquire(dev, &dev_flow-
> >dv.vf_vlan);
> +		}
>  	}
>  	return 0;
>  error:
> @@ -4671,6 +4692,9 @@ struct field_modify_info modify_tcp[] = {
>  				mlx5_hrxq_release(dev, dv->hrxq);
>  			dv->hrxq = NULL;
>  		}
> +		if (dev_flow->dv.vf_vlan.tag &&
> +		    dev_flow->dv.vf_vlan.created)
> +			mlx5_vlan_esxi_release(dev, &dev_flow-
> >dv.vf_vlan);
>  	}
>  	rte_errno = err; /* Restore rte_errno. */
>  	return -rte_errno;
> @@ -4871,6 +4895,9 @@ struct field_modify_info modify_tcp[] = {
>  				mlx5_hrxq_release(dev, dv->hrxq);
>  			dv->hrxq = NULL;
>  		}
> +		if (dev_flow->dv.vf_vlan.tag &&
> +		    dev_flow->dv.vf_vlan.created)
> +			mlx5_vlan_esxi_release(dev, &dev_flow-
> >dv.vf_vlan);
>  	}
>  }
> 
> diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c
> b/drivers/net/mlx5/mlx5_flow_verbs.c
> index 2f4c80c..5909488 100644
> --- a/drivers/net/mlx5/mlx5_flow_verbs.c
> +++ b/drivers/net/mlx5/mlx5_flow_verbs.c
> @@ -386,6 +386,9 @@
>  		flow_verbs_spec_add(&dev_flow->verbs, &eth, size);
>  	else
>  		flow_verbs_item_vlan_update(dev_flow->verbs.attr, &eth);
> +	if (!tunnel)
> +		dev_flow->verbs.vf_vlan.tag =
> +			rte_be_to_cpu_16(spec->tci) & 0x0fff;
>  }
> 
>  /**
> @@ -1049,7 +1052,7 @@
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_VLAN:
>  			ret = mlx5_flow_validate_item_vlan(items,
> item_flags,
> -							   error);
> +							   dev, error);
>  			if (ret < 0)
>  				return ret;
>  			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
> @@ -1587,6 +1590,10 @@
>  				mlx5_hrxq_release(dev, verbs->hrxq);
>  			verbs->hrxq = NULL;
>  		}
> +		if (dev_flow->verbs.vf_vlan.tag &&
> +		    dev_flow->verbs.vf_vlan.created) {
> +			mlx5_vlan_esxi_release(dev, &dev_flow-
> >verbs.vf_vlan);
> +		}
>  	}
>  }
> 
> @@ -1634,6 +1641,7 @@
>  flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
>  		 struct rte_flow_error *error)
>  {
> +	struct mlx5_priv *priv = dev->data->dev_private;
>  	struct mlx5_flow_verbs *verbs;
>  	struct mlx5_flow *dev_flow;
>  	int err;
> @@ -1683,6 +1691,17 @@
>  					   "hardware refuses to create flow");
>  			goto error;
>  		}
> +		if (priv->esxi_context &&
> +		    dev_flow->verbs.vf_vlan.tag &&
> +		    !dev_flow->verbs.vf_vlan.created) {
> +			/*
> +			 * The rule contains the VLAN pattern.
> +			 * For VF we are going to create VLAN
> +			 * interface to make ESXi set correct
> +			 * e-Switch vport context.
> +			 */
> +			mlx5_vlan_esxi_acquire(dev, &dev_flow-
> >verbs.vf_vlan);
> +		}
>  	}
>  	return 0;
>  error:
> @@ -1696,6 +1715,10 @@
>  				mlx5_hrxq_release(dev, verbs->hrxq);
>  			verbs->hrxq = NULL;
>  		}
> +		if (dev_flow->verbs.vf_vlan.tag &&
> +		    dev_flow->verbs.vf_vlan.created) {
> +			mlx5_vlan_esxi_release(dev, &dev_flow-
> >verbs.vf_vlan);
> +		}
>  	}
>  	rte_errno = err; /* Restore rte_errno. */
>  	return -rte_errno;
> diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c index
> 5773fa7..8516442 100644
> --- a/drivers/net/mlx5/mlx5_nl.c
> +++ b/drivers/net/mlx5/mlx5_nl.c
> @@ -12,11 +12,14 @@
>  #include <stdbool.h>
>  #include <stdint.h>
>  #include <stdlib.h>
> +#include <stdalign.h>
>  #include <string.h>
>  #include <sys/socket.h>
>  #include <unistd.h>
> 
>  #include <rte_errno.h>
> +#include <rte_malloc.h>
> +#include <rte_hypervisor.h>
> 
>  #include "mlx5.h"
>  #include "mlx5_utils.h"
> @@ -28,6 +31,8 @@
>  /* Receive buffer size for the Netlink socket */  #define
> MLX5_RECV_BUF_SIZE 32768
> 
> +/** Parameters of VLAN devices created by driver. */ #define
> +MLX5_ESXI_VLAN_DEVICE_PFX "evmlx"
>  /*
>   * Define NDA_RTA as defined in iproute2 sources.
>   *
> @@ -987,3 +992,277 @@ struct mlx5_nl_ifindex_data {
>  	}
>  	return ret;
>  }
> +
> +/*
> + * Delete VLAN network device by ifindex.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_vlan_esxi_init().
> + * @param[in] ifindex
> + *   Interface index of network device to delete.
> + */
> +static void
> +mlx5_vlan_esxi_delete(struct mlx5_vlan_esxi_context *esxi,
> +		      uint32_t ifindex)
> +{
> +	int ret;
> +	struct {
> +		struct nlmsghdr nh;
> +		struct ifinfomsg info;
> +	} req = {
> +		.nh = {
> +			.nlmsg_len = NLMSG_LENGTH(sizeof(struct
> ifinfomsg)),
> +			.nlmsg_type = RTM_DELLINK,
> +			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
> +		},
> +		.info = {
> +			.ifi_family = AF_UNSPEC,
> +			.ifi_index = ifindex,
> +		},
> +	};
> +
> +	if (ifindex) {
> +		++esxi->nl_sn;
> +		if (!esxi->nl_sn)
> +			++esxi->nl_sn;
> +		ret = mlx5_nl_send(esxi->nl_socket, &req.nh, esxi->nl_sn);
> +		if (ret >= 0)
> +			ret = mlx5_nl_recv(esxi->nl_socket,
> +					   esxi->nl_sn,
> +					   NULL, NULL);
> +		if (ret < 0)
> +			DRV_LOG(WARNING, "netlink: error deleting"
> +					 " VLAN ESXi ifindex %u, %d",
> +					 ifindex, ret);
> +	}
> +}
> +
> +/* Set of subroutines to build Netlink message. */ static struct nlattr
> +* nl_msg_tail(struct nlmsghdr *nlh) {
> +	return (struct nlattr *)
> +		(((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len)); }
> +
> +static void
> +nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
> +{
> +	struct nlattr *nla = nl_msg_tail(nlh);
> +
> +	nla->nla_type = type;
> +	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
> +	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
> +
> +	if (alen)
> +		memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen); }
> +
> +static struct nlattr *
> +nl_attr_nest_start(struct nlmsghdr *nlh, int type) {
> +	struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
> +
> +	nl_attr_put(nlh, type, NULL, 0);
> +	return nest;
> +}
> +
> +static void
> +nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest) {
> +	nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest; }
> +
> +/*
> + * Create network VLAN device with specified VLAN tag.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_vlan_esxi_init().
> + * @param[in] ifindex
> + *   Base network interface index.
> + * @param[in] tag
> + *   VLAN tag for VLAN network device to create.
> + */
> +static uint32_t
> +mlx5_vlan_esxi_create(struct mlx5_vlan_esxi_context *esxi,
> +		      uint32_t ifindex,
> +		      uint16_t tag)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifinfomsg *ifm;
> +	char name[sizeof(MLX5_ESXI_VLAN_DEVICE_PFX) + 32];
> +
> +	alignas(RTE_CACHE_LINE_SIZE)
> +	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
> +		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
> +		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
> +		    NLMSG_ALIGN(sizeof(uint32_t)) +
> +		    NLMSG_ALIGN(sizeof(name)) +
> +		    NLMSG_ALIGN(sizeof("vlan")) +
> +		    NLMSG_ALIGN(sizeof(uint32_t)) +
> +		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
> +	struct nlattr *na_info;
> +	struct nlattr *na_vlan;
> +	int ret;
> +
> +	memset(buf, 0, sizeof(buf));
> +	++esxi->nl_sn;
> +	if (!esxi->nl_sn)
> +		++esxi->nl_sn;
> +	nlh = (struct nlmsghdr *)buf;
> +	nlh->nlmsg_len = sizeof(struct nlmsghdr);
> +	nlh->nlmsg_type = RTM_NEWLINK;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
> +			   NLM_F_EXCL | NLM_F_ACK;
> +	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
> +	nlh->nlmsg_len += sizeof(struct ifinfomsg);
> +	ifm->ifi_family = AF_UNSPEC;
> +	ifm->ifi_type = 0;
> +	ifm->ifi_index = 0;
> +	ifm->ifi_flags = IFF_UP;
> +	ifm->ifi_change = 0xffffffff;
> +	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
> +	ret = snprintf(name, sizeof(name), "%s.%u.%u",
> +		       MLX5_ESXI_VLAN_DEVICE_PFX, ifindex, tag);
> +	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
> +	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
> +	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
> +	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
> +	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
> +	nl_attr_nest_end(nlh, na_vlan);
> +	nl_attr_nest_end(nlh, na_info);
> +	assert(sizeof(buf) >= nlh->nlmsg_len);
> +	ret = mlx5_nl_send(esxi->nl_socket, nlh, esxi->nl_sn);
> +	if (ret >= 0)
> +		ret = mlx5_nl_recv(esxi->nl_socket, esxi->nl_sn, NULL,
> NULL);
> +	if (ret < 0) {
> +		DRV_LOG(WARNING,
> +			"netlink: VLAN %s create failure (%d)",
> +			name, ret);
> +	}
> +	// Try to get ifindex of created or pre-existing device.
> +	ret = if_nametoindex(name);
> +	if (!ret) {
> +		DRV_LOG(WARNING,
> +			"VLAN %s failed to get index (%d)",
> +			name, errno);
> +		return 0;
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Release VLAN network device, created for ESXi workaround.
> + *
> + * @param[in] dev
> + *   Ethernet device object, Netlink context provider.
> + * @param[in] vlan
> + *   Object representing the network device to release.
> + */
> +void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vlan)
> +{
> +	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
> +	struct mlx5_vlan_dev *vlan_dev = &esxi->vlan_dev[0];
> +
> +	assert(vlan->created);
> +	assert(priv->esxi_context);
> +	if (!vlan->created || !esxi)
> +		return;
> +	vlan->created = 0;
> +	assert(vlan_dev[vlan->tag].refcnt);
> +	if (--vlan_dev[vlan->tag].refcnt == 0 &&
> +	    vlan_dev[vlan->tag].ifindex) {
> +		mlx5_vlan_esxi_delete(esxi, vlan_dev[vlan->tag].ifindex);
> +		vlan_dev[vlan->tag].ifindex = 0;
> +	}
> +}
> +
> +/**
> + * Acquire VLAN interface with specified tag for ESXi workaround.
> + *
> + * @param[in] dev
> + *   Ethernet device object, Netlink context provider.
> + * @param[in] vlan
> + *   Object representing the network device to acquire.
> + */
> +void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vlan)
> +{
> +	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
> +	struct mlx5_vlan_dev *vlan_dev = &esxi->vlan_dev[0];
> +
> +	assert(!vlan->created);
> +	assert(priv->esxi_context);
> +	if (vlan->created || !esxi)
> +		return;
> +	if (vlan_dev[vlan->tag].refcnt == 0) {
> +		assert(!vlan_dev[vlan->tag].ifindex);
> +		vlan_dev[vlan->tag].ifindex =
> +			mlx5_vlan_esxi_create(esxi,
> +					      esxi->vf_ifindex,
> +					      vlan->tag);
> +	}
> +	if (vlan_dev[vlan->tag].ifindex) {
> +		vlan_dev[vlan->tag].refcnt++;
> +		vlan->created = 1;
> +	}
> +}
> +
> +/*
> + * Create per ethernet device VLAN ESXi workaround context  */ struct
> +mlx5_vlan_esxi_context * mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
> +		    uint32_t ifindex)
> +{
> +	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_dev_config *config = &priv->config;
> +	struct mlx5_vlan_esxi_context *esxi;
> +
> +	/* Do not engage workaround over PF. */
> +	if (!config->vf)
> +		return NULL;
> +	/* Check whether there is virtual environment */
> +	if (rte_hypervisor_get() == RTE_HYPERVISOR_NONE)
> +		return NULL;
> +	esxi = rte_zmalloc(__func__, sizeof(*esxi), sizeof(uint32_t));
> +	if (!esxi) {
> +		DRV_LOG(WARNING,
> +			"Can not allocate memory"
> +			" for ESXi VLAN context");
> +		return NULL;
> +	}
> +	esxi->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
> +	if (esxi->nl_socket < 0) {
> +		DRV_LOG(WARNING,
> +			"Can not create Netlink socket"
> +			" for ESXi VLAN context");
> +		rte_free(esxi);
> +		return NULL;
> +	}
> +	esxi->nl_sn = random();
> +	esxi->vf_ifindex = ifindex;
> +	esxi->dev = dev;
> +	/* Cleanup for existing VLAN devices. */
> +	return esxi;
> +}
> +
> +/*
> + * Destroy per ethernet device VLAN ESXi workaround context  */ void
> +mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *esxi) {
> +	unsigned int i;
> +
> +	/* Delete all remaining VLAN devices. */
> +	for (i = 0; i < RTE_DIM(esxi->vlan_dev); i++) {
> +		if (esxi->vlan_dev[i].ifindex)
> +			mlx5_vlan_esxi_delete(esxi, esxi-
> >vlan_dev[i].ifindex);
> +	}
> +	if (esxi->nl_socket >= 0)
> +		close(esxi->nl_socket);
> +	rte_free(esxi);
> +}
> --
> 1.8.3.1
^ permalink raw reply	[flat|nested] 7+ messages in thread
* [dpdk-dev] [PATCH v2] net/mlx5: fix ESXi VLAN in virtual machine
  2019-07-15 13:45 [dpdk-dev] [PATCH] net/mlx5: fix ESXi VLAN in virtual machine Viacheslav Ovsiienko
  2019-07-29 15:14 ` Matan Azrad
@ 2019-07-29 15:26 ` Viacheslav Ovsiienko
  2019-07-30  5:05   ` Shahaf Shuler
  2019-07-30  9:20   ` [dpdk-dev] [PATCH v3] net/mlx5: add workaround for " Viacheslav Ovsiienko
  1 sibling, 2 replies; 7+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-29 15:26 UTC (permalink / raw)
  To: dev; +Cc: yskoh, shahafs
On ESXi setups with SR-IOV and E-Switch enabled, VLAN traffic is not
received on VF interfaces. The NIC driver in the ESXi hypervisor does
not set up the E-Switch vport settings correctly, so VLAN traffic
targeted to a VF is dropped.
The patch provides a temporary workaround: when a rule containing a
VLAN pattern is installed for a VF, a VLAN network interface is created
over the VF, as the following command would do:
  ip link add link vf.if name evmlx.1.100 type vlan id 100
The PMD in DPDK maintains the database of created VLAN interfaces
for each existing VF and requested VLAN tags. When all of the RTE
Flows using the given VLAN tag are removed the created VLAN interface
with this VLAN tag is deleted.
The name of a created VLAN interface follows the format:
  evmlx.d1.d2, where d1 is the VF interface ifindex and d2 is the VLAN tag
Implementation limitations:
- VLAN tag masks are not supported: a rule must specify the VLAN tag
  exactly, no wildcards (which are implemented by masks) are allowed
- the virtual environment is detected via the rte_hypervisor_get() call;
  currently it checks the RTE_CPUFLAG_HYPERVISOR flag on the x86
  platform. On other architectures the workaround is always
  applied to flows over a PCI VF
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>
---
v2: - rebase
v1: - http://patches.dpdk.org/patch/56450/
 drivers/net/mlx5/mlx5.c            |   6 +
 drivers/net/mlx5/mlx5.h            |  30 ++++
 drivers/net/mlx5/mlx5_flow.c       |  22 +++
 drivers/net/mlx5/mlx5_flow.h       |   5 +
 drivers/net/mlx5/mlx5_flow_dv.c    |  33 ++++-
 drivers/net/mlx5/mlx5_flow_verbs.c |  25 +++-
 drivers/net/mlx5/mlx5_nl.c         | 279 +++++++++++++++++++++++++++++++++++++
 7 files changed, 396 insertions(+), 4 deletions(-)
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index a490bf2..a5bb956 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -843,6 +843,8 @@ struct mlx5_dev_spawn_data {
 		close(priv->nl_socket_route);
 	if (priv->nl_socket_rdma >= 0)
 		close(priv->nl_socket_rdma);
+	if (priv->esxi_context)
+		mlx5_vlan_esxi_exit(priv->esxi_context);
 	if (priv->sh) {
 		/*
 		 * Free the shared context in last turn, because the cleanup
@@ -1989,6 +1991,8 @@ struct mlx5_dev_spawn_data {
 	mlx5_set_min_inline(spawn, &config);
 	/* Store device configuration on private structure. */
 	priv->config = config;
+	/* Create context for virtual machine VLAN workaround. */
+	priv->esxi_context = mlx5_vlan_esxi_init(eth_dev, spawn->ifindex);
 	if (config.dv_flow_en) {
 		err = mlx5_alloc_shared_dr(priv);
 		if (err)
@@ -2015,6 +2019,8 @@ struct mlx5_dev_spawn_data {
 			close(priv->nl_socket_route);
 		if (priv->nl_socket_rdma >= 0)
 			close(priv->nl_socket_rdma);
+		if (priv->esxi_context)
+			mlx5_vlan_esxi_exit(priv->esxi_context);
 		if (own_domain_id)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e812374..646fa45 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -355,6 +355,27 @@ enum mlx5_verbs_alloc_type {
 	MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
 };
 
+/* VLAN netdev for ESXi VLAN workaround. */
+struct mlx5_vlan_dev {
+	uint32_t refcnt;
+	uint32_t ifindex; /**< Own interface index. */
+};
+
+/* Structure for VF ESXi VLAN workaround. */
+struct mlx5_vf_vlan {
+	uint32_t tag:12;
+	uint32_t created:1;
+};
+
+/* Array of VLAN devices created on the base of VF */
+struct mlx5_vlan_esxi_context {
+	int nl_socket;
+	uint32_t nl_sn;
+	uint32_t vf_ifindex;
+	struct rte_eth_dev *dev;
+	struct mlx5_vlan_dev vlan_dev[4096];
+};
+
 /**
  * Verbs allocator needs a context to know in the callback which kind of
  * resources it is allocating.
@@ -631,6 +652,7 @@ struct mlx5_priv {
 	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
 	uint32_t nl_sn; /* Netlink message sequence number. */
 	LIST_HEAD(dbrpage, mlx5_devx_dbr_page) dbrpgs; /* Door-bell pages. */
+	struct mlx5_vlan_esxi_context *esxi_context; /* ESXi VLAN context. */
 #ifndef RTE_ARCH_64
 	rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
 	rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
@@ -830,6 +852,14 @@ int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
 int mlx5_nl_switch_info(int nl, unsigned int ifindex,
 			struct mlx5_switch_info *info);
 
+struct mlx5_vlan_esxi_context *mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
+						   uint32_t ifindex);
+void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *ctx);
+void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vf_vlan);
+void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vf_vlan);
+
 /* mlx5_devx_cmds.c */
 
 struct mlx5_devx_obj *mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 3d2d5fc..9ad845c 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -1204,6 +1204,8 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
  *   Item specification.
  * @param[in] item_flags
  *   Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ *   Ethernet device flow is being created on.
  * @param[out] error
  *   Pointer to error structure.
  *
@@ -1213,6 +1215,7 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 int
 mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
 			     uint64_t item_flags,
+			     struct rte_eth_dev *dev,
 			     struct rte_flow_error *error)
 {
 	const struct rte_flow_item_vlan *spec = item->spec;
@@ -1247,6 +1250,25 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 					error);
 	if (ret)
 		return ret;
+	if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
+		struct mlx5_priv *priv = dev->data->dev_private;
+
+		if (priv->esxi_context) {
+			/*
+			 * Non-NULL context means we have a virtual machine
+			 * and SR-IOV enabled, we have to create VLAN interface
+			 * to make hypervisor (ESXi) to setup E-Switch vport
+			 * context correctly. We avoid creating the multiple
+			 * VLAN interfaces, so we cannot support VLAN tag mask.
+			 */
+			return rte_flow_error_set(error, EINVAL,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "VLAN tag mask is not"
+						  " supported in virtual"
+						  " environment");
+		}
+	}
 	if (spec) {
 		vlan_tag = spec->tci;
 		vlan_tag &= mask->tci;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 24da74b..2f0195f 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -330,6 +330,8 @@ struct mlx5_flow_dv {
 	/**< Pointer to the jump action resource. */
 	struct mlx5_flow_dv_port_id_action_resource *port_id_action;
 	/**< Pointer to port ID action resource. */
+	struct mlx5_vf_vlan vf_vlan;
+	/**< Structure for VF ESXi VLAN workaround. */
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 	void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS];
 	/**< Action list. */
@@ -355,6 +357,8 @@ struct mlx5_flow_verbs {
 	struct ibv_flow *flow; /**< Verbs flow pointer. */
 	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
 	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
+	struct mlx5_vf_vlan vf_vlan;
+	/**< Structure for VF ESXi VLAN workaround. */
 };
 
 /** Device flow structure. */
@@ -505,6 +509,7 @@ int mlx5_flow_validate_item_udp(const struct rte_flow_item *item,
 				struct rte_flow_error *error);
 int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
 				 uint64_t item_flags,
+				 struct rte_eth_dev *dev,
 				 struct rte_flow_error *error);
 int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
 				  uint64_t item_flags,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 59ef716..805985e 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -2892,7 +2892,7 @@ struct field_modify_info modify_tcp[] = {
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			ret = mlx5_flow_validate_item_vlan(items, item_flags,
-							   error);
+							   dev, error);
 			if (ret < 0)
 				return ret;
 			last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
@@ -3450,6 +3450,8 @@ struct field_modify_info modify_tcp[] = {
 /**
  * Add VLAN item to matcher and to the value.
  *
+ * @param[in, out] dev_flow
+ *   Flow descriptor.
  * @param[in, out] matcher
  *   Flow matcher.
  * @param[in, out] key
@@ -3460,7 +3462,8 @@ struct field_modify_info modify_tcp[] = {
  *   Item is inner pattern.
  */
 static void
-flow_dv_translate_item_vlan(void *matcher, void *key,
+flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow,
+			    void *matcher, void *key,
 			    const struct rte_flow_item *item,
 			    int inner)
 {
@@ -3487,6 +3490,12 @@ struct field_modify_info modify_tcp[] = {
 		headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
 					 outer_headers);
 		headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+		/*
+		 * This is workaround, masks are not supported,
+		 * and pre-validated.
+		 */
+		dev_flow->dv.vf_vlan.tag =
+			rte_be_to_cpu_16(vlan_v->tci) & 0x0fff;
 	}
 	tci_m = rte_be_to_cpu_16(vlan_m->tci);
 	tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci);
@@ -4995,7 +5004,8 @@ struct field_modify_info modify_tcp[] = {
 					     MLX5_FLOW_LAYER_OUTER_L2;
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
-			flow_dv_translate_item_vlan(match_mask, match_value,
+			flow_dv_translate_item_vlan(dev_flow,
+						    match_mask, match_value,
 						    items, tunnel);
 			matcher.priority = MLX5_PRIORITY_MAP_L2;
 			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -5211,6 +5221,17 @@ struct field_modify_info modify_tcp[] = {
 					   "hardware refuses to create flow");
 			goto error;
 		}
+		if (priv->esxi_context &&
+		    dev_flow->dv.vf_vlan.tag &&
+		    !dev_flow->dv.vf_vlan.created) {
+			/*
+			 * The rule contains the VLAN pattern.
+			 * For VF we are going to create VLAN
+			 * interface to make ESXi set correct
+			 * e-Switch vport context.
+			 */
+			mlx5_vlan_esxi_acquire(dev, &dev_flow->dv.vf_vlan);
+		}
 	}
 	return 0;
 error:
@@ -5224,6 +5245,9 @@ struct field_modify_info modify_tcp[] = {
 				mlx5_hrxq_release(dev, dv->hrxq);
 			dv->hrxq = NULL;
 		}
+		if (dev_flow->dv.vf_vlan.tag &&
+		    dev_flow->dv.vf_vlan.created)
+			mlx5_vlan_esxi_release(dev, &dev_flow->dv.vf_vlan);
 	}
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
@@ -5424,6 +5448,9 @@ struct field_modify_info modify_tcp[] = {
 				mlx5_hrxq_release(dev, dv->hrxq);
 			dv->hrxq = NULL;
 		}
+		if (dev_flow->dv.vf_vlan.tag &&
+		    dev_flow->dv.vf_vlan.created)
+			mlx5_vlan_esxi_release(dev, &dev_flow->dv.vf_vlan);
 	}
 }
 
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index fd6f2d5..00422df 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -391,6 +391,9 @@
 		flow_verbs_spec_add(&dev_flow->verbs, &eth, size);
 	else
 		flow_verbs_item_vlan_update(dev_flow->verbs.attr, &eth);
+	if (!tunnel)
+		dev_flow->verbs.vf_vlan.tag =
+			rte_be_to_cpu_16(spec->tci) & 0x0fff;
 }
 
 /**
@@ -1054,7 +1057,7 @@
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			ret = mlx5_flow_validate_item_vlan(items, item_flags,
-							   error);
+							   dev, error);
 			if (ret < 0)
 				return ret;
 			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -1592,6 +1595,10 @@
 				mlx5_hrxq_release(dev, verbs->hrxq);
 			verbs->hrxq = NULL;
 		}
+		if (dev_flow->verbs.vf_vlan.tag &&
+		    dev_flow->verbs.vf_vlan.created) {
+			mlx5_vlan_esxi_release(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 }
 
@@ -1639,6 +1646,7 @@
 flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
 		 struct rte_flow_error *error)
 {
+	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_verbs *verbs;
 	struct mlx5_flow *dev_flow;
 	int err;
@@ -1688,6 +1696,17 @@
 					   "hardware refuses to create flow");
 			goto error;
 		}
+		if (priv->esxi_context &&
+		    dev_flow->verbs.vf_vlan.tag &&
+		    !dev_flow->verbs.vf_vlan.created) {
+			/*
+			 * The rule contains the VLAN pattern.
+			 * For VF we are going to create VLAN
+			 * interface to make ESXi set correct
+			 * e-Switch vport context.
+			 */
+			mlx5_vlan_esxi_acquire(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 	return 0;
 error:
@@ -1701,6 +1720,10 @@
 				mlx5_hrxq_release(dev, verbs->hrxq);
 			verbs->hrxq = NULL;
 		}
+		if (dev_flow->verbs.vf_vlan.tag &&
+		    dev_flow->verbs.vf_vlan.created) {
+			mlx5_vlan_esxi_release(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
index 5773fa7..8516442 100644
--- a/drivers/net/mlx5/mlx5_nl.c
+++ b/drivers/net/mlx5/mlx5_nl.c
@@ -12,11 +12,14 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdalign.h>
 #include <string.h>
 #include <sys/socket.h>
 #include <unistd.h>
 
 #include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_hypervisor.h>
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -28,6 +31,8 @@
 /* Receive buffer size for the Netlink socket */
 #define MLX5_RECV_BUF_SIZE 32768
 
+/** Parameters of VLAN devices created by driver. */
+#define MLX5_ESXI_VLAN_DEVICE_PFX "evmlx"
 /*
  * Define NDA_RTA as defined in iproute2 sources.
  *
@@ -987,3 +992,277 @@ struct mlx5_nl_ifindex_data {
 	}
 	return ret;
 }
+
+/*
+ * Delete VLAN network device by ifindex.
+ *
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init().
+ * @param[in] ifindex
+ *   Interface index of network device to delete.
+ */
+static void
+mlx5_vlan_esxi_delete(struct mlx5_vlan_esxi_context *esxi,
+		      uint32_t ifindex)
+{
+	int ret;
+	struct {
+		struct nlmsghdr nh;
+		struct ifinfomsg info;
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+			.nlmsg_type = RTM_DELLINK,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+		},
+		.info = {
+			.ifi_family = AF_UNSPEC,
+			.ifi_index = ifindex,
+		},
+	};
+
+	if (ifindex) {
+		++esxi->nl_sn;
+		if (!esxi->nl_sn)
+			++esxi->nl_sn;
+		ret = mlx5_nl_send(esxi->nl_socket, &req.nh, esxi->nl_sn);
+		if (ret >= 0)
+			ret = mlx5_nl_recv(esxi->nl_socket,
+					   esxi->nl_sn,
+					   NULL, NULL);
+		if (ret < 0)
+			DRV_LOG(WARNING, "netlink: error deleting"
+					 " VLAN ESXi ifindex %u, %d",
+					 ifindex, ret);
+	}
+}
+
+/* Set of subroutines to build Netlink message. */
+static struct nlattr *
+nl_msg_tail(struct nlmsghdr *nlh)
+{
+	return (struct nlattr *)
+		(((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
+}
+
+static void
+nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
+{
+	struct nlattr *nla = nl_msg_tail(nlh);
+
+	nla->nla_type = type;
+	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
+	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
+
+	if (alen)
+		memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
+}
+
+static struct nlattr *
+nl_attr_nest_start(struct nlmsghdr *nlh, int type)
+{
+	struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
+
+	nl_attr_put(nlh, type, NULL, 0);
+	return nest;
+}
+
+static void
+nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
+{
+	nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
+}
+
+/*
+ * Create network VLAN device with specified VLAN tag.
+ *
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init().
+ * @param[in] ifindex
+ *   Base network interface index.
+ * @param[in] tag
+ *   VLAN tag for VLAN network device to create.
+ */
+static uint32_t
+mlx5_vlan_esxi_create(struct mlx5_vlan_esxi_context *esxi,
+		      uint32_t ifindex,
+		      uint16_t tag)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	char name[sizeof(MLX5_ESXI_VLAN_DEVICE_PFX) + 32];
+
+	alignas(RTE_CACHE_LINE_SIZE)
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(name)) +
+		    NLMSG_ALIGN(sizeof("vlan")) +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
+	struct nlattr *na_info;
+	struct nlattr *na_vlan;
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	++esxi->nl_sn;
+	if (!esxi->nl_sn)
+		++esxi->nl_sn;
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = RTM_NEWLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+			   NLM_F_EXCL | NLM_F_ACK;
+	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct ifinfomsg);
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_type = 0;
+	ifm->ifi_index = 0;
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = 0xffffffff;
+	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
+	ret = snprintf(name, sizeof(name), "%s.%u.%u",
+		       MLX5_ESXI_VLAN_DEVICE_PFX, ifindex, tag);
+	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
+	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
+	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
+	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
+	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
+	nl_attr_nest_end(nlh, na_vlan);
+	nl_attr_nest_end(nlh, na_info);
+	assert(sizeof(buf) >= nlh->nlmsg_len);
+	ret = mlx5_nl_send(esxi->nl_socket, nlh, esxi->nl_sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(esxi->nl_socket, esxi->nl_sn, NULL, NULL);
+	if (ret < 0) {
+		DRV_LOG(WARNING,
+			"netlink: VLAN %s create failure (%d)",
+			name, ret);
+	}
+	// Try to get ifindex of created or pre-existing device.
+	ret = if_nametoindex(name);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"VLAN %s failed to get index (%d)",
+			name, errno);
+		return 0;
+	}
+	return ret;
+}
+
+/*
+ * Release VLAN network device, created for ESXi workaround.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ *   Object representing the network device to release.
+ */
+void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
+	struct mlx5_vlan_dev *vlan_dev = &esxi->vlan_dev[0];
+
+	assert(vlan->created);
+	assert(priv->esxi_context);
+	if (!vlan->created || !esxi)
+		return;
+	vlan->created = 0;
+	assert(vlan_dev[vlan->tag].refcnt);
+	if (--vlan_dev[vlan->tag].refcnt == 0 &&
+	    vlan_dev[vlan->tag].ifindex) {
+		mlx5_vlan_esxi_delete(esxi, vlan_dev[vlan->tag].ifindex);
+		vlan_dev[vlan->tag].ifindex = 0;
+	}
+}
+
+/**
+ * Acquire VLAN interface with specified tag for ESXi workaround.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ *   Object representing the network device to acquire.
+ */
+void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
+	struct mlx5_vlan_dev *vlan_dev = &esxi->vlan_dev[0];
+
+	assert(!vlan->created);
+	assert(priv->esxi_context);
+	if (vlan->created || !esxi)
+		return;
+	if (vlan_dev[vlan->tag].refcnt == 0) {
+		assert(!vlan_dev[vlan->tag].ifindex);
+		vlan_dev[vlan->tag].ifindex =
+			mlx5_vlan_esxi_create(esxi,
+					      esxi->vf_ifindex,
+					      vlan->tag);
+	}
+	if (vlan_dev[vlan->tag].ifindex) {
+		vlan_dev[vlan->tag].refcnt++;
+		vlan->created = 1;
+	}
+}
+
+/*
+ * Create per ethernet device VLAN ESXi workaround context
+ */
+struct mlx5_vlan_esxi_context *
+mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
+		    uint32_t ifindex)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	struct mlx5_vlan_esxi_context *esxi;
+
+	/* Do not engage workaround over PF. */
+	if (!config->vf)
+		return NULL;
+	/* Check whether there is virtual environment */
+	if (rte_hypervisor_get() == RTE_HYPERVISOR_NONE)
+		return NULL;
+	esxi = rte_zmalloc(__func__, sizeof(*esxi), sizeof(uint32_t));
+	if (!esxi) {
+		DRV_LOG(WARNING,
+			"Can not allocate memory"
+			" for ESXi VLAN context");
+		return NULL;
+	}
+	esxi->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
+	if (esxi->nl_socket < 0) {
+		DRV_LOG(WARNING,
+			"Can not create Netlink socket"
+			" for ESXi VLAN context");
+		rte_free(esxi);
+		return NULL;
+	}
+	esxi->nl_sn = random();
+	esxi->vf_ifindex = ifindex;
+	esxi->dev = dev;
+	/* Cleanup for existing VLAN devices. */
+	return esxi;
+}
+
+/*
+ * Destroy per ethernet device VLAN ESXi workaround context
+ */
+void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *esxi)
+{
+	unsigned int i;
+
+	/* Delete all remaining VLAN devices. */
+	for (i = 0; i < RTE_DIM(esxi->vlan_dev); i++) {
+		if (esxi->vlan_dev[i].ifindex)
+			mlx5_vlan_esxi_delete(esxi, esxi->vlan_dev[i].ifindex);
+	}
+	if (esxi->nl_socket >= 0)
+		close(esxi->nl_socket);
+	rte_free(esxi);
+}
-- 
1.8.3.1
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [dpdk-dev] [PATCH v2] net/mlx5: fix ESXi VLAN in virtual machine
  2019-07-29 15:26 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
@ 2019-07-30  5:05   ` Shahaf Shuler
  2019-07-30  9:20   ` [dpdk-dev] [PATCH v3] net/mlx5: add workaround for " Viacheslav Ovsiienko
  1 sibling, 0 replies; 7+ messages in thread
From: Shahaf Shuler @ 2019-07-30  5:05 UTC (permalink / raw)
  To: Slava Ovsiienko, dev; +Cc: Yongseok Koh
Hi Slava,
Monday, July 29, 2019 6:27 PM, Viacheslav Ovsiienko:
> Subject: [dpdk-dev] [PATCH v2] net/mlx5: fix ESXi VLAN in virtual machine
> 
> On ESXi setups when we have SR-IOV and E-Switch enabled there is the
> problem to receive VLAN traffic on VF interfaces. The NIC driver in ESXi
> hypervisor does not setup E-Switch vport setting correctly and VLAN traffic
> targeted to VF is dropped.
> 
> The patch provides the temporary workaround - if the rule containing the
> VLAN pattern is being installed for VF the VLAN network interface over VF is
> created, like the command does:
> 
>   ip link add link vf.if name mlx5.wa.1.100 type vlan id 100
> 
> The PMD in DPDK maintains the database of created VLAN interfaces for
> each existing VF and requested VLAN tags. When all of the RTE Flows using
> the given VLAN tag are removed the created VLAN interface with this VLAN
> tag is deleted.
> 
> The name of created VLAN interface follows the format:
> 
>   evmlx.d1.d2, where d1 is VF interface ifindex, d2 - VLAN ifindex
> 
> Implementation limitations:
> 
> - mask in rules is ignored, rule must specify VLAN tags exactly,
>   no wildcards (which are implemented by the masks) are allowed
> 
> - virtual environment is detected via rte_hypervisor() call,
>   currently it checks the RTE_CPUFLAG_HYPERVISOR flag for x86
>   platform. For other architectures workaround always
>   applied for the Flow over PCI VF
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> Acked-by: Matan Azrad <matan@mellanox.com>
> ---
> v2: - rebase
> v1: -
> https://eur03.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatch
> es.dpdk.org%2Fpatch%2F56450%2F&data=02%7C01%7Cshahafs%40mel
> lanox.com%7C3f213439528840d072f308d714393442%7Ca652971c7d2e4d9ba6
> a4d149256f461b%7C0%7C0%7C637000108243298850&sdata=sXgknK92ce
> Xr7QXrdzQU6iUHzFDZZufEGU8Butqst6I%3D&reserved=0
> 
>  drivers/net/mlx5/mlx5.c            |   6 +
>  drivers/net/mlx5/mlx5.h            |  30 ++++
>  drivers/net/mlx5/mlx5_flow.c       |  22 +++
>  drivers/net/mlx5/mlx5_flow.h       |   5 +
>  drivers/net/mlx5/mlx5_flow_dv.c    |  33 ++++-
>  drivers/net/mlx5/mlx5_flow_verbs.c |  25 +++-
>  drivers/net/mlx5/mlx5_nl.c         | 279
> +++++++++++++++++++++++++++++++++++++
>  7 files changed, 396 insertions(+), 4 deletions(-)
> 
[...]
> +
> +/**
> + * Acquire VLAN interface with specified tag for ESXi workaround.
> + *
> + * @param[in] dev
> + *   Ethernet device object, Netlink context provider.
> + * @param[in] vlan
> + *   Object representing the network device to acquire.
> + */
> +void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vlan)
> +{
> +	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
> +	struct mlx5_vlan_dev *vlan_dev = &esxi->vlan_dev[0];
> +
> +	assert(!vlan->created);
> +	assert(priv->esxi_context);
> +	if (vlan->created || !esxi)
> +		return;
> +	if (vlan_dev[vlan->tag].refcnt == 0) {
> +		assert(!vlan_dev[vlan->tag].ifindex);
> +		vlan_dev[vlan->tag].ifindex =
> +			mlx5_vlan_esxi_create(esxi,
> +					      esxi->vf_ifindex,
> +					      vlan->tag);
> +	}
> +	if (vlan_dev[vlan->tag].ifindex) {
> +		vlan_dev[vlan->tag].refcnt++;
> +		vlan->created = 1;
> +	}
> +}
> +
> +/*
> + * Create per ethernet device VLAN ESXi workaround context  */ struct
> +mlx5_vlan_esxi_context * mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
I would rather avoid the word "esxi" all over this patch.
You cannot know whether other hypervisors do it as well. Better to make it generic, like mlx5_vlan_context or whatever other name you prefer.
> +		    uint32_t ifindex)
> +{
> +	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_dev_config *config = &priv->config;
> +	struct mlx5_vlan_esxi_context *esxi;
> +
> +	/* Do not engage workaround over PF. */
> +	if (!config->vf)
> +		return NULL;
I think the check should be only for VF; the hypervisor check below can be dropped,
because even if the VF is probed on the hypervisor and uses the legacy switch, VLAN traffic will still not reach the target VF.
> +	/* Check whether there is virtual environment */
> +	if (rte_hypervisor_get() == RTE_HYPERVISOR_NONE)
> +		return NULL;
^ permalink raw reply	[flat|nested] 7+ messages in thread
* [dpdk-dev] [PATCH v3] net/mlx5: add workaround for VLAN in virtual machine
  2019-07-29 15:26 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
  2019-07-30  5:05   ` Shahaf Shuler
@ 2019-07-30  9:20   ` Viacheslav Ovsiienko
  2019-07-31  6:14     ` Shahaf Shuler
  2019-07-31  7:39     ` Raslan Darawsheh
  1 sibling, 2 replies; 7+ messages in thread
From: Viacheslav Ovsiienko @ 2019-07-30  9:20 UTC (permalink / raw)
  To: dev; +Cc: yskoh, shahafs
On some virtual setups (particularly on ESXi) with SR-IOV and
E-Switch enabled, VLAN traffic cannot be received on VF interfaces.
The NIC driver in the ESXi hypervisor does not set up the E-Switch
vport settings correctly, so VLAN traffic targeted to the VF is dropped.
The patch provides a temporary workaround: if a rule containing
a VLAN pattern is installed for a VF, a VLAN network interface is
created over the VF, as the following command would do:
  ip link add link vf.if name evmlx.1.100 type vlan id 100
The PMD in DPDK maintains the database of created VLAN interfaces
for each existing VF and requested VLAN tags. When all of the RTE
Flows using the given VLAN tag are removed the created VLAN interface
with this VLAN tag is deleted.
The name of the created VLAN interface follows the format:
  evmlx.d1.d2, where d1 is the VF interface ifindex and d2 is the VLAN tag
Implementation limitations:
- VLAN tag masks are not supported: a rule must specify the VLAN tag
  exactly (TCI mask 0x0fff), no wildcards (which are implemented by
  the masks) are allowed
- the virtual environment is detected via the rte_hypervisor_get()
  call and the type of hypervisor is checked. Currently the workaround
  is engaged for ESXi and for unrecognized hypervisors (which is
  always the case on platforms other than x86, meaning the workaround
  is always applied to flows over a PCI VF there). There is no
  confirmed data that other hypervisors (HyperV, Qemu) need this
  workaround, so we try to limit the list of configurations to which
  the workaround is applied.
  
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>
---
v3: - address the comments, workaround engaged on ESXi and
      unrecognized hypervisors only. 
v2: - http://patches.dpdk.org/patch/57257/ 
    - rebase
v1: - http://patches.dpdk.org/patch/56450/
---
 drivers/net/mlx5/mlx5.c            |   6 +
 drivers/net/mlx5/mlx5.h            |  33 +++++
 drivers/net/mlx5/mlx5_flow.c       |  22 +++
 drivers/net/mlx5/mlx5_flow.h       |   5 +
 drivers/net/mlx5/mlx5_flow_dv.c    |  33 ++++-
 drivers/net/mlx5/mlx5_flow_verbs.c |  25 +++-
 drivers/net/mlx5/mlx5_nl.c         | 294 +++++++++++++++++++++++++++++++++++++
 7 files changed, 414 insertions(+), 4 deletions(-)
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3a345c7..f5bc31f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -843,6 +843,8 @@ struct mlx5_dev_spawn_data {
 		close(priv->nl_socket_route);
 	if (priv->nl_socket_rdma >= 0)
 		close(priv->nl_socket_rdma);
+	if (priv->vmwa_context)
+		mlx5_vlan_vmwa_exit(priv->vmwa_context);
 	if (priv->sh) {
 		/*
 		 * Free the shared context in last turn, because the cleanup
@@ -1990,6 +1992,8 @@ struct mlx5_dev_spawn_data {
 	mlx5_set_min_inline(spawn, &config);
 	/* Store device configuration on private structure. */
 	priv->config = config;
+	/* Create context for virtual machine VLAN workaround. */
+	priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
 	if (config.dv_flow_en) {
 		err = mlx5_alloc_shared_dr(priv);
 		if (err)
@@ -2016,6 +2020,8 @@ struct mlx5_dev_spawn_data {
 			close(priv->nl_socket_route);
 		if (priv->nl_socket_rdma >= 0)
 			close(priv->nl_socket_rdma);
+		if (priv->vmwa_context)
+			mlx5_vlan_vmwa_exit(priv->vmwa_context);
 		if (own_domain_id)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e812374..caf2491 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -355,6 +355,30 @@ enum mlx5_verbs_alloc_type {
 	MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
 };
 
+/* VLAN netdev for VLAN workaround. */
+struct mlx5_vlan_dev {
+	uint32_t refcnt;
+	uint32_t ifindex; /**< Own interface index. */
+};
+
+/* Structure for VF VLAN workaround. */
+struct mlx5_vf_vlan {
+	uint32_t tag:12;
+	uint32_t created:1;
+};
+
+/*
+ * Array of VLAN devices created on the base of VF
+ * used for workaround in virtual environments.
+ */
+struct mlx5_vlan_vmwa_context {
+	int nl_socket;
+	uint32_t nl_sn;
+	uint32_t vf_ifindex;
+	struct rte_eth_dev *dev;
+	struct mlx5_vlan_dev vlan_dev[4096];
+};
+
 /**
  * Verbs allocator needs a context to know in the callback which kind of
  * resources it is allocating.
@@ -631,6 +655,7 @@ struct mlx5_priv {
 	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
 	uint32_t nl_sn; /* Netlink message sequence number. */
 	LIST_HEAD(dbrpage, mlx5_devx_dbr_page) dbrpgs; /* Door-bell pages. */
+	struct mlx5_vlan_vmwa_context *vmwa_context; /* VLAN WA context. */
 #ifndef RTE_ARCH_64
 	rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
 	rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
@@ -830,6 +855,14 @@ int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
 int mlx5_nl_switch_info(int nl, unsigned int ifindex,
 			struct mlx5_switch_info *info);
 
+struct mlx5_vlan_vmwa_context *mlx5_vlan_vmwa_init(struct rte_eth_dev *dev,
+						   uint32_t ifindex);
+void mlx5_vlan_vmwa_exit(struct mlx5_vlan_vmwa_context *ctx);
+void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vf_vlan);
+void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vf_vlan);
+
 /* mlx5_devx_cmds.c */
 
 struct mlx5_devx_obj *mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 3d2d5fc..f40fee5 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -1204,6 +1204,8 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
  *   Item specification.
  * @param[in] item_flags
  *   Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ *   Ethernet device flow is being created on.
  * @param[out] error
  *   Pointer to error structure.
  *
@@ -1213,6 +1215,7 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 int
 mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
 			     uint64_t item_flags,
+			     struct rte_eth_dev *dev,
 			     struct rte_flow_error *error)
 {
 	const struct rte_flow_item_vlan *spec = item->spec;
@@ -1247,6 +1250,25 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 					error);
 	if (ret)
 		return ret;
+	if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
+		struct mlx5_priv *priv = dev->data->dev_private;
+
+		if (priv->vmwa_context) {
+			/*
+			 * A non-NULL context means we run in a virtual machine
+			 * with SR-IOV enabled, so a VLAN interface has to be
+			 * created to make the hypervisor set up the E-Switch
+			 * vport context correctly. We avoid creating multiple
+			 * VLAN interfaces, so VLAN tag masks are not supported.
+			 */
+			return rte_flow_error_set(error, EINVAL,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "VLAN tag mask is not"
+						  " supported in virtual"
+						  " environment");
+		}
+	}
 	if (spec) {
 		vlan_tag = spec->tci;
 		vlan_tag &= mask->tci;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 24da74b..822ff36 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -330,6 +330,8 @@ struct mlx5_flow_dv {
 	/**< Pointer to the jump action resource. */
 	struct mlx5_flow_dv_port_id_action_resource *port_id_action;
 	/**< Pointer to port ID action resource. */
+	struct mlx5_vf_vlan vf_vlan;
+	/**< Structure for VF VLAN workaround. */
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 	void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS];
 	/**< Action list. */
@@ -355,6 +357,8 @@ struct mlx5_flow_verbs {
 	struct ibv_flow *flow; /**< Verbs flow pointer. */
 	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
 	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
+	struct mlx5_vf_vlan vf_vlan;
+	/**< Structure for VF VLAN workaround. */
 };
 
 /** Device flow structure. */
@@ -505,6 +509,7 @@ int mlx5_flow_validate_item_udp(const struct rte_flow_item *item,
 				struct rte_flow_error *error);
 int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
 				 uint64_t item_flags,
+				 struct rte_eth_dev *dev,
 				 struct rte_flow_error *error);
 int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
 				  uint64_t item_flags,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 59ef716..9c0a261 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -2892,7 +2892,7 @@ struct field_modify_info modify_tcp[] = {
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			ret = mlx5_flow_validate_item_vlan(items, item_flags,
-							   error);
+							   dev, error);
 			if (ret < 0)
 				return ret;
 			last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
@@ -3450,6 +3450,8 @@ struct field_modify_info modify_tcp[] = {
 /**
  * Add VLAN item to matcher and to the value.
  *
+ * @param[in, out] dev_flow
+ *   Flow descriptor.
  * @param[in, out] matcher
  *   Flow matcher.
  * @param[in, out] key
@@ -3460,7 +3462,8 @@ struct field_modify_info modify_tcp[] = {
  *   Item is inner pattern.
  */
 static void
-flow_dv_translate_item_vlan(void *matcher, void *key,
+flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow,
+			    void *matcher, void *key,
 			    const struct rte_flow_item *item,
 			    int inner)
 {
@@ -3487,6 +3490,12 @@ struct field_modify_info modify_tcp[] = {
 		headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
 					 outer_headers);
 		headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+		/*
+		 * This is workaround, masks are not supported,
+		 * and pre-validated.
+		 */
+		dev_flow->dv.vf_vlan.tag =
+			rte_be_to_cpu_16(vlan_v->tci) & 0x0fff;
 	}
 	tci_m = rte_be_to_cpu_16(vlan_m->tci);
 	tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci);
@@ -4995,7 +5004,8 @@ struct field_modify_info modify_tcp[] = {
 					     MLX5_FLOW_LAYER_OUTER_L2;
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
-			flow_dv_translate_item_vlan(match_mask, match_value,
+			flow_dv_translate_item_vlan(dev_flow,
+						    match_mask, match_value,
 						    items, tunnel);
 			matcher.priority = MLX5_PRIORITY_MAP_L2;
 			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -5211,6 +5221,17 @@ struct field_modify_info modify_tcp[] = {
 					   "hardware refuses to create flow");
 			goto error;
 		}
+		if (priv->vmwa_context &&
+		    dev_flow->dv.vf_vlan.tag &&
+		    !dev_flow->dv.vf_vlan.created) {
+			/*
+			 * The rule contains the VLAN pattern.
+			 * For VF we are going to create VLAN
+			 * interface to make hypervisor set correct
+			 * e-Switch vport context.
+			 */
+			mlx5_vlan_vmwa_acquire(dev, &dev_flow->dv.vf_vlan);
+		}
 	}
 	return 0;
 error:
@@ -5224,6 +5245,9 @@ struct field_modify_info modify_tcp[] = {
 				mlx5_hrxq_release(dev, dv->hrxq);
 			dv->hrxq = NULL;
 		}
+		if (dev_flow->dv.vf_vlan.tag &&
+		    dev_flow->dv.vf_vlan.created)
+			mlx5_vlan_vmwa_release(dev, &dev_flow->dv.vf_vlan);
 	}
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
@@ -5424,6 +5448,9 @@ struct field_modify_info modify_tcp[] = {
 				mlx5_hrxq_release(dev, dv->hrxq);
 			dv->hrxq = NULL;
 		}
+		if (dev_flow->dv.vf_vlan.tag &&
+		    dev_flow->dv.vf_vlan.created)
+			mlx5_vlan_vmwa_release(dev, &dev_flow->dv.vf_vlan);
 	}
 }
 
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index fd6f2d5..c5b28e3 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -391,6 +391,9 @@
 		flow_verbs_spec_add(&dev_flow->verbs, &eth, size);
 	else
 		flow_verbs_item_vlan_update(dev_flow->verbs.attr, &eth);
+	if (!tunnel)
+		dev_flow->verbs.vf_vlan.tag =
+			rte_be_to_cpu_16(spec->tci) & 0x0fff;
 }
 
 /**
@@ -1054,7 +1057,7 @@
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			ret = mlx5_flow_validate_item_vlan(items, item_flags,
-							   error);
+							   dev, error);
 			if (ret < 0)
 				return ret;
 			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -1592,6 +1595,10 @@
 				mlx5_hrxq_release(dev, verbs->hrxq);
 			verbs->hrxq = NULL;
 		}
+		if (dev_flow->verbs.vf_vlan.tag &&
+		    dev_flow->verbs.vf_vlan.created) {
+			mlx5_vlan_vmwa_release(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 }
 
@@ -1639,6 +1646,7 @@
 flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
 		 struct rte_flow_error *error)
 {
+	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_verbs *verbs;
 	struct mlx5_flow *dev_flow;
 	int err;
@@ -1688,6 +1696,17 @@
 					   "hardware refuses to create flow");
 			goto error;
 		}
+		if (priv->vmwa_context &&
+		    dev_flow->verbs.vf_vlan.tag &&
+		    !dev_flow->verbs.vf_vlan.created) {
+			/*
+			 * The rule contains the VLAN pattern.
+			 * For VF we are going to create VLAN
+			 * interface to make hypervisor set correct
+			 * e-Switch vport context.
+			 */
+			mlx5_vlan_vmwa_acquire(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 	return 0;
 error:
@@ -1701,6 +1720,10 @@
 				mlx5_hrxq_release(dev, verbs->hrxq);
 			verbs->hrxq = NULL;
 		}
+		if (dev_flow->verbs.vf_vlan.tag &&
+		    dev_flow->verbs.vf_vlan.created) {
+			mlx5_vlan_vmwa_release(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
index 5773fa7..f0f57de 100644
--- a/drivers/net/mlx5/mlx5_nl.c
+++ b/drivers/net/mlx5/mlx5_nl.c
@@ -12,11 +12,14 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdalign.h>
 #include <string.h>
 #include <sys/socket.h>
 #include <unistd.h>
 
 #include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_hypervisor.h>
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -28,6 +31,8 @@
 /* Receive buffer size for the Netlink socket */
 #define MLX5_RECV_BUF_SIZE 32768
 
+/** Parameters of VLAN devices created by driver. */
+#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
 /*
  * Define NDA_RTA as defined in iproute2 sources.
  *
@@ -987,3 +992,292 @@ struct mlx5_nl_ifindex_data {
 	}
 	return ret;
 }
+
+/*
+ * Delete VLAN network device by ifindex.
+ *
+ * @param[in] vmwa
+ *   Context object initialized by mlx5_vlan_vmwa_init().
+ * @param[in] ifindex
+ *   Interface index of network device to delete.
+ */
+static void
+mlx5_vlan_vmwa_delete(struct mlx5_vlan_vmwa_context *vmwa,
+		      uint32_t ifindex)
+{
+	int ret;
+	struct {
+		struct nlmsghdr nh;
+		struct ifinfomsg info;
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+			.nlmsg_type = RTM_DELLINK,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+		},
+		.info = {
+			.ifi_family = AF_UNSPEC,
+			.ifi_index = ifindex,
+		},
+	};
+
+	if (ifindex) {
+		++vmwa->nl_sn;
+		if (!vmwa->nl_sn)
+			++vmwa->nl_sn;
+		ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, vmwa->nl_sn);
+		if (ret >= 0)
+			ret = mlx5_nl_recv(vmwa->nl_socket,
+					   vmwa->nl_sn,
+					   NULL, NULL);
+		if (ret < 0)
+			DRV_LOG(WARNING, "netlink: error deleting"
+					 " VLAN WA ifindex %u, %d",
+					 ifindex, ret);
+	}
+}
+
+/* Set of subroutines to build Netlink message. */
+static struct nlattr *
+nl_msg_tail(struct nlmsghdr *nlh)
+{
+	return (struct nlattr *)
+		(((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
+}
+
+static void
+nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
+{
+	struct nlattr *nla = nl_msg_tail(nlh);
+
+	nla->nla_type = type;
+	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
+	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
+
+	if (alen)
+		memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
+}
+
+static struct nlattr *
+nl_attr_nest_start(struct nlmsghdr *nlh, int type)
+{
+	struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
+
+	nl_attr_put(nlh, type, NULL, 0);
+	return nest;
+}
+
+static void
+nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
+{
+	nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
+}
+
+/*
+ * Create network VLAN device with specified VLAN tag.
+ *
+ * @param[in] vmwa
+ *   Context object initialized by mlx5_vlan_vmwa_init().
+ * @param[in] ifindex
+ *   Base network interface index.
+ * @param[in] tag
+ *   VLAN tag for VLAN network device to create.
+ */
+static uint32_t
+mlx5_vlan_vmwa_create(struct mlx5_vlan_vmwa_context *vmwa,
+		      uint32_t ifindex,
+		      uint16_t tag)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
+
+	alignas(RTE_CACHE_LINE_SIZE)
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(name)) +
+		    NLMSG_ALIGN(sizeof("vlan")) +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
+	struct nlattr *na_info;
+	struct nlattr *na_vlan;
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	++vmwa->nl_sn;
+	if (!vmwa->nl_sn)
+		++vmwa->nl_sn;
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = RTM_NEWLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+			   NLM_F_EXCL | NLM_F_ACK;
+	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct ifinfomsg);
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_type = 0;
+	ifm->ifi_index = 0;
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = 0xffffffff;
+	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
+	ret = snprintf(name, sizeof(name), "%s.%u.%u",
+		       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
+	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
+	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
+	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
+	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
+	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
+	nl_attr_nest_end(nlh, na_vlan);
+	nl_attr_nest_end(nlh, na_info);
+	assert(sizeof(buf) >= nlh->nlmsg_len);
+	ret = mlx5_nl_send(vmwa->nl_socket, nlh, vmwa->nl_sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(vmwa->nl_socket, vmwa->nl_sn, NULL, NULL);
+	if (ret < 0) {
+		DRV_LOG(WARNING,
+			"netlink: VLAN %s create failure (%d)",
+			name, ret);
+	}
+	/* Try to get ifindex of created or pre-existing device. */
+	ret = if_nametoindex(name);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"VLAN %s failed to get index (%d)",
+			name, errno);
+		return 0;
+	}
+	return ret;
+}
+
+/*
+ * Release VLAN network device, created for VM workaround.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ *   Object representing the network device to release.
+ */
+void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
+	struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
+
+	assert(vlan->created);
+	assert(priv->vmwa_context);
+	if (!vlan->created || !vmwa)
+		return;
+	vlan->created = 0;
+	assert(vlan_dev[vlan->tag].refcnt);
+	if (--vlan_dev[vlan->tag].refcnt == 0 &&
+	    vlan_dev[vlan->tag].ifindex) {
+		mlx5_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex);
+		vlan_dev[vlan->tag].ifindex = 0;
+	}
+}
+
+/**
+ * Acquire VLAN interface with specified tag for VM workaround.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ *   Object representing the network device to acquire.
+ */
+void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
+	struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
+
+	assert(!vlan->created);
+	assert(priv->vmwa_context);
+	if (vlan->created || !vmwa)
+		return;
+	if (vlan_dev[vlan->tag].refcnt == 0) {
+		assert(!vlan_dev[vlan->tag].ifindex);
+		vlan_dev[vlan->tag].ifindex =
+			mlx5_vlan_vmwa_create(vmwa,
+					      vmwa->vf_ifindex,
+					      vlan->tag);
+	}
+	if (vlan_dev[vlan->tag].ifindex) {
+		vlan_dev[vlan->tag].refcnt++;
+		vlan->created = 1;
+	}
+}
+
+/*
+ * Create per ethernet device VLAN VM workaround context
+ */
+struct mlx5_vlan_vmwa_context *
+mlx5_vlan_vmwa_init(struct rte_eth_dev *dev,
+		    uint32_t ifindex)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	struct mlx5_vlan_vmwa_context *vmwa;
+	enum rte_hypervisor hv_type;
+
+	/* Do not engage workaround over PF. */
+	if (!config->vf)
+		return NULL;
+	/* Check whether this is one of the desired virtual environments. */
+	hv_type = rte_hypervisor_get();
+	switch (hv_type) {
+	case RTE_HYPERVISOR_UNKNOWN:
+	case RTE_HYPERVISOR_VMWARE:
+		/*
+		 * The "white list" of configurations
+		 * to engage the workaround.
+		 */
+		break;
+	default:
+		/*
+		 * The configuration is not found in the "white list".
+		 * We should not engage the VLAN workaround.
+		 */
+		return NULL;
+	}
+	vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t));
+	if (!vmwa) {
+		DRV_LOG(WARNING,
+			"Can not allocate memory"
+			" for VLAN workaround context");
+		return NULL;
+	}
+	vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
+	if (vmwa->nl_socket < 0) {
+		DRV_LOG(WARNING,
+			"Can not create Netlink socket"
+			" for VLAN workaround context");
+		rte_free(vmwa);
+		return NULL;
+	}
+	vmwa->nl_sn = random();
+	vmwa->vf_ifindex = ifindex;
+	vmwa->dev = dev;
+	/* NOTE: cleanup of pre-existing VLAN devices is not performed here. */
+	return vmwa;
+}
+
+/*
+ * Destroy per ethernet device VLAN VM workaround context
+ */
+void mlx5_vlan_vmwa_exit(struct mlx5_vlan_vmwa_context *vmwa)
+{
+	unsigned int i;
+
+	/* Delete all remaining VLAN devices. */
+	for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) {
+		if (vmwa->vlan_dev[i].ifindex)
+			mlx5_vlan_vmwa_delete(vmwa, vmwa->vlan_dev[i].ifindex);
+	}
+	if (vmwa->nl_socket >= 0)
+		close(vmwa->nl_socket);
+	rte_free(vmwa);
+}
-- 
1.8.3.1
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [dpdk-dev] [PATCH v3] net/mlx5: add workaround for VLAN in virtual machine
  2019-07-30  9:20   ` [dpdk-dev] [PATCH v3] net/mlx5: add workaround for " Viacheslav Ovsiienko
@ 2019-07-31  6:14     ` Shahaf Shuler
  2019-07-31  7:39     ` Raslan Darawsheh
  1 sibling, 0 replies; 7+ messages in thread
From: Shahaf Shuler @ 2019-07-31  6:14 UTC (permalink / raw)
  To: Slava Ovsiienko, dev; +Cc: Yongseok Koh
Tuesday, July 30, 2019 12:20 PM, Viacheslav Ovsiienko:
> Subject: [dpdk-dev] [PATCH v3] net/mlx5: add workaround for VLAN in
> virtual machine
> 
> On some virtual setups (particularly on ESXi) when we have SR-IOV and E-
> Switch enabled there is the problem to receive VLAN traffic on VF interfaces.
> The NIC driver in ESXi hypervisor does not setup E-Switch vport setting
> correctly and VLAN traffic targeted to VF is dropped.
> 
> The patch provides the temporary workaround - if the rule containing the
> VLAN pattern is being installed for VF the VLAN network interface over VF is
> created, like the command does:
> 
>   ip link add link vf.if name mlx5.wa.1.100 type vlan id 100
> 
> The PMD in DPDK maintains the database of created VLAN interfaces for
> each existing VF and requested VLAN tags. When all of the RTE Flows using
> the given VLAN tag are removed the created VLAN interface with this VLAN
> tag is deleted.
> 
> The name of created VLAN interface follows the format:
> 
>   evmlx.d1.d2, where d1 is VF interface ifindex, d2 - VLAN ifindex
> 
> Implementation limitations:
> 
> - mask in rules is ignored, rule must specify VLAN tags exactly,
>   no wildcards (which are implemented by the masks) are allowed
> 
> - virtual environment is detected via rte_hypervisor() call,
>   and the type of hypervisor is checked. Currently we engage
>   the workaround for ESXi and unrecognized hypervisors (which
>   always happen on platforms other than x86 - it means workaround
>   applied for the Flow over PCI VF). There are no confirmed data
>   the other hypervisors (HyperV, Qemu) need this workaround,
>   we are trying to reduce the list of configurations on those
>   workaround should be applied.
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> Acked-by: Matan Azrad <matan@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [dpdk-dev] [PATCH v3] net/mlx5: add workaround for VLAN in virtual machine
  2019-07-30  9:20   ` [dpdk-dev] [PATCH v3] net/mlx5: add workaround for " Viacheslav Ovsiienko
  2019-07-31  6:14     ` Shahaf Shuler
@ 2019-07-31  7:39     ` Raslan Darawsheh
  1 sibling, 0 replies; 7+ messages in thread
From: Raslan Darawsheh @ 2019-07-31  7:39 UTC (permalink / raw)
  To: Slava Ovsiienko, dev; +Cc: Yongseok Koh, Shahaf Shuler
Hi,
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Viacheslav Ovsiienko
> Sent: Tuesday, July 30, 2019 12:20 PM
> To: dev@dpdk.org
> Cc: Yongseok Koh <yskoh@mellanox.com>; Shahaf Shuler
> <shahafs@mellanox.com>
> Subject: [dpdk-dev] [PATCH v3] net/mlx5: add workaround for VLAN in
> virtual machine
> 
> On some virtual setups (particularly on ESXi), when SR-IOV and
> E-Switch are enabled, there is a problem receiving VLAN traffic on VF
> interfaces. The NIC driver in the ESXi hypervisor does not set up the
> E-Switch vport settings correctly, and VLAN traffic targeted to a VF
> is dropped.
> 
> The patch provides a temporary workaround: if a rule
> containing a VLAN pattern is being installed for a VF, a VLAN
> network interface is created over the VF, as the following command
> would do:
> 
>   ip link add link vf.if name mlx5.wa.1.100 type vlan id 100
> 
> The PMD in DPDK maintains the database of created VLAN interfaces
> for each existing VF and requested VLAN tags. When all of the RTE
> Flows using the given VLAN tag are removed the created VLAN interface
> with this VLAN tag is deleted.
> 
> The name of created VLAN interface follows the format:
> 
>   evmlx.d1.d2, where d1 is VF interface ifindex, d2 - VLAN ifindex
> 
> Implementation limitations:
> 
> - mask in rules is ignored, rule must specify VLAN tags exactly,
>   no wildcards (which are implemented by the masks) are allowed
> 
> - the virtual environment is detected via the rte_hypervisor() call,
>   and the type of hypervisor is checked. Currently we engage
>   the workaround for ESXi and for unrecognized hypervisors (the
>   latter is always the case on platforms other than x86, which means
>   the workaround is applied there for any Flow over a PCI VF). There
>   is no confirmed data that the other hypervisors (HyperV, Qemu) need
>   this workaround, so we are trying to reduce the list of
>   configurations to which the workaround should be applied.
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> Acked-by: Matan Azrad <matan@mellanox.com>
> 
> ---
> v3: - address the comments, workaround engaged on ESXi and
>       unrecognized hypervisors only.
> v2: -
> http://patches.dpdk.org/patch/57257/
>     - rebase
> v1: -
> http://patches.dpdk.org/patch/56450/
> ---
>  drivers/net/mlx5/mlx5.c            |   6 +
>  drivers/net/mlx5/mlx5.h            |  33 +++++
>  drivers/net/mlx5/mlx5_flow.c       |  22 +++
>  drivers/net/mlx5/mlx5_flow.h       |   5 +
>  drivers/net/mlx5/mlx5_flow_dv.c    |  33 ++++-
>  drivers/net/mlx5/mlx5_flow_verbs.c |  25 +++-
>  drivers/net/mlx5/mlx5_nl.c         | 294
> +++++++++++++++++++++++++++++++++++++
>  7 files changed, 414 insertions(+), 4 deletions(-)
> 
Patch applied to next-net-mlx,
Kindest regards,
Raslan Darawsheh
^ permalink raw reply	[flat|nested] 7+ messages in thread
end of thread, other threads:[~2019-07-31  7:39 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-15 13:45 [dpdk-dev] [PATCH] net/mlx5: fix ESXi VLAN in virtual machine Viacheslav Ovsiienko
2019-07-29 15:14 ` Matan Azrad
2019-07-29 15:26 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
2019-07-30  5:05   ` Shahaf Shuler
2019-07-30  9:20   ` [dpdk-dev] [PATCH v3] net/mlx5: add workaround for " Viacheslav Ovsiienko
2019-07-31  6:14     ` Shahaf Shuler
2019-07-31  7:39     ` Raslan Darawsheh
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).