DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/4] net/tap: support flow API
@ 2017-03-03 10:45 Pascal Mazon
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header Pascal Mazon
                   ` (5 more replies)
  0 siblings, 6 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-03 10:45 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

This series adds support for the flow API in the tap PMD.

It enables filtering specific packets incoming on the tap netdevice, to
process only desired ones. Under the hood, it uses kernel TC (traffic
control), which takes place very early in the stack, and supports most
common pattern items and actions defined in the flow API.

This series applies on top of:

  [PATCH 0/6] net/tap: add additional management ops

Pascal Mazon (4):
  net/tap: move private elements to external header
  net/tap: add preliminary support for rte_flow
  net/tap: add netlink back-end for flow API
  net/tap: add basic flow API patterns and actions

 doc/guides/nics/features/tap.ini |    1 +
 drivers/net/tap/Makefile         |   39 ++
 drivers/net/tap/rte_eth_tap.c    |   73 +--
 drivers/net/tap/tap.h            |   76 +++
 drivers/net/tap/tap_flow.c       | 1050 ++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |   58 +++
 drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
 drivers/net/tap/tap_netlink.h    |   69 +++
 drivers/net/tap/tap_tcmsgs.c     |  378 ++++++++++++++
 drivers/net/tap/tap_tcmsgs.h     |   63 +++
 10 files changed, 2140 insertions(+), 34 deletions(-)
 create mode 100644 drivers/net/tap/tap.h
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header
  2017-03-03 10:45 [dpdk-dev] [PATCH 0/4] net/tap: support flow API Pascal Mazon
@ 2017-03-03 10:45 ` Pascal Mazon
  2017-03-03 15:38   ` Wiles, Keith
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-03 10:45 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

In the next patch, access to struct pmd_internals will be necessary in
tap_flow.c to store the flows.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |  1 +
 drivers/net/tap/rte_eth_tap.c | 34 ++------------------
 drivers/net/tap/tap.h         | 73 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 32 deletions(-)
 create mode 100644 drivers/net/tap/tap.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index e18f30c56f52..bdbe69e62a4e 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
 LIBABIVER := 1
 
 CFLAGS += -O3
+CFLAGS += -I$(SRCDIR)
 CFLAGS += $(WERROR_FLAGS)
 
 #
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 3fd057225ab3..fa57d645f3b1 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -51,6 +51,8 @@
 #include <linux/if_ether.h>
 #include <fcntl.h>
 
+#include <tap.h>
+
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
 #define DEFAULT_TAP_NAME        "dtap"
@@ -83,38 +85,6 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
-struct pkt_stats {
-	uint64_t opackets;		/* Number of output packets */
-	uint64_t ipackets;		/* Number of input packets */
-	uint64_t obytes;		/* Number of bytes on output */
-	uint64_t ibytes;		/* Number of bytes on input */
-	uint64_t errs;			/* Number of error packets */
-};
-
-struct rx_queue {
-	struct rte_mempool *mp;		/* Mempool for RX packets */
-	uint16_t in_port;		/* Port ID */
-	int fd;
-
-	struct pkt_stats stats;		/* Stats for this RX queue */
-};
-
-struct tx_queue {
-	int fd;
-	struct pkt_stats stats;		/* Stats for this TX queue */
-};
-
-struct pmd_internals {
-	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
-	uint16_t nb_queues;		/* Number of queues supported */
-	struct ether_addr eth_addr;	/* Mac address of the device port */
-
-	int if_index;			/* IF_INDEX for the port */
-
-	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
-	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
-};
-
 /* Tun/Tap allocation routine
  *
  * name is the number of the interface to use, unless NULL to take the host
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
new file mode 100644
index 000000000000..88f62b895feb
--- /dev/null
+++ b/drivers/net/tap/tap.h
@@ -0,0 +1,73 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_H_
+#define _TAP_H_
+
+#include <inttypes.h>
+
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+
+#define RTE_PMD_TAP_MAX_QUEUES 16
+
+struct pkt_stats {
+	uint64_t opackets; /* Number of output packets */
+	uint64_t ipackets; /* Number of input packets */
+	uint64_t obytes; /* Number of bytes on output */
+	uint64_t ibytes; /* Number of bytes on input */
+	uint64_t errs; /* Number of error packets */
+};
+
+struct rx_queue {
+	struct rte_mempool *mp; /* Mempool for RX packets */
+	uint16_t in_port; /* Port ID */
+	int fd;
+	struct pkt_stats stats; /* Stats for this RX queue */
+};
+
+struct tx_queue {
+	int fd;
+	struct pkt_stats stats; /* Stats for this TX queue */
+};
+
+struct pmd_internals {
+	char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device name */
+	uint16_t nb_queues; /* Number of queues supported */
+	struct ether_addr eth_addr; /* Mac address of the device port */
+	int if_index; /* IF_INDEX for the port */
+	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
+	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
+};
+
+#endif /* _TAP_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH 2/4] net/tap: add preliminary support for rte_flow
  2017-03-03 10:45 [dpdk-dev] [PATCH 0/4] net/tap: support flow API Pascal Mazon
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-03 10:45 ` Pascal Mazon
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-03 10:45 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

The flow API provides the ability to classify packets received by a tap
netdevice.

This patch only implements skeleton functions for flow API support; no
patterns are supported yet.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/features/tap.ini |   1 +
 drivers/net/tap/Makefile         |   1 +
 drivers/net/tap/rte_eth_tap.c    |   6 ++
 drivers/net/tap/tap.h            |   2 +
 drivers/net/tap/tap_flow.c       | 185 +++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |  46 ++++++++++
 6 files changed, 241 insertions(+)
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h

diff --git a/doc/guides/nics/features/tap.ini b/doc/guides/nics/features/tap.ini
index a51712dce066..9d73f61cca3b 100644
--- a/doc/guides/nics/features/tap.ini
+++ b/doc/guides/nics/features/tap.ini
@@ -9,6 +9,7 @@ Jumbo frame          = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Basic stats          = Y
+Flow API             = Y
 MTU update           = Y
 Multicast MAC filter = Y
 Speed capabilities   = Y
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index bdbe69e62a4e..386b8b0594d3 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -47,6 +47,7 @@ CFLAGS += $(WERROR_FLAGS)
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index fa57d645f3b1..e80de0c29377 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -52,6 +52,7 @@
 #include <fcntl.h>
 
 #include <tap.h>
+#include <tap_flow.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -788,6 +789,7 @@ static const struct eth_dev_ops ops = {
 	.stats_get              = tap_stats_get,
 	.stats_reset            = tap_stats_reset,
 	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
+	.filter_ctrl            = tap_dev_filter_ctrl,
 };
 
 static int
@@ -850,6 +852,8 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	LIST_INIT(&pmd->flows);
+
 	return 0;
 
 error_exit:
@@ -962,6 +966,8 @@ rte_pmd_tap_remove(const char *name)
 	if (!eth_dev)
 		return 0;
 
+	tap_flow_flush(eth_dev, NULL);
+
 	internals = eth_dev->data->dev_private;
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
index 88f62b895feb..323f5705a324 100644
--- a/drivers/net/tap/tap.h
+++ b/drivers/net/tap/tap.h
@@ -34,6 +34,7 @@
 #ifndef _TAP_H_
 #define _TAP_H_
 
+#include <sys/queue.h>
 #include <inttypes.h>
 
 #include <rte_ethdev.h>
@@ -66,6 +67,7 @@ struct pmd_internals {
 	uint16_t nb_queues; /* Number of queues supported */
 	struct ether_addr eth_addr; /* Mac address of the device port */
 	int if_index; /* IF_INDEX for the port */
+	LIST_HEAD(tap_flows, rte_flow) flows; /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
 };
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
new file mode 100644
index 000000000000..de41c127c920
--- /dev/null
+++ b/drivers/net/tap/tap_flow.c
@@ -0,0 +1,185 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+
+#include <rte_malloc.h>
+#include <tap_flow.h>
+#include <tap.h>
+
+struct rte_flow {
+	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+};
+
+static int
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error);
+
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error);
+
+static int
+tap_flow_destroy(struct rte_eth_dev *dev,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error);
+
+static const struct rte_flow_ops tap_flow_ops = {
+	.validate = tap_flow_validate,
+	.create = tap_flow_create,
+	.destroy = tap_flow_destroy,
+	.flush = tap_flow_flush,
+};
+
+/**
+ * Validate a flow.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
+		  const struct rte_flow_attr *attr __rte_unused,
+		  const struct rte_flow_item items[] __rte_unused,
+		  const struct rte_flow_action actions[] __rte_unused,
+		  struct rte_flow_error *error)
+{
+	return -rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL, "not implemented yet");
+}
+
+/**
+ * Create a flow.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow = NULL;
+
+	if (tap_flow_validate(dev, attr, items, actions, error))
+		return NULL;
+	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+	if (!flow) {
+		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "cannot allocate memory for rte_flow");
+		return NULL;
+	}
+	LIST_INSERT_HEAD(&pmd->flows, flow, next);
+	return flow;
+}
+
+/**
+ * Destroy a flow.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error __rte_unused)
+{
+	LIST_REMOVE(flow, next);
+	rte_free(flow);
+	return 0;
+}
+
+/**
+ * Destroy all flows.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow;
+
+	while (!LIST_EMPTY(&pmd->flows)) {
+		flow = LIST_FIRST(&pmd->flows);
+		if (tap_flow_destroy(dev, flow, error) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+		    enum rte_filter_type filter_type,
+		    enum rte_filter_op filter_op,
+		    void *arg)
+{
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			return -EINVAL;
+		*(const void **)arg = &tap_flow_ops;
+		return 0;
+	default:
+		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported",
+			(void *)dev, filter_type);
+	}
+	return -EINVAL;
+}
+
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
new file mode 100644
index 000000000000..377a9f7b758a
--- /dev/null
+++ b/drivers/net/tap/tap_flow.h
@@ -0,0 +1,46 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_FLOW_H_
+#define _TAP_FLOW_H_
+
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+
+int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+			enum rte_filter_type filter_type,
+			enum rte_filter_op filter_op,
+			void *arg);
+int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
+
+#endif /* _TAP_FLOW_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH 3/4] net/tap: add netlink back-end for flow API
  2017-03-03 10:45 [dpdk-dev] [PATCH 0/4] net/tap: support flow API Pascal Mazon
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header Pascal Mazon
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
@ 2017-03-03 10:45 ` Pascal Mazon
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-03 10:45 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

The functions in tap_netlink.c are generic helpers for any netlink messaging.
The functions in tap_tcmsgs.c deal specifically with TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +
 drivers/net/tap/tap_netlink.c | 367 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_netlink.h |  69 ++++++++
 drivers/net/tap/tap_tcmsgs.c  | 378 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_tcmsgs.h  |  63 +++++++
 5 files changed, 879 insertions(+)
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 386b8b0594d3..4ae2ca6cfbab 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/tap_netlink.c b/drivers/net/tap/tap_netlink.c
new file mode 100644
index 000000000000..10f00d1931c6
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <tap_netlink.h>
+#include <rte_random.h>
+
+/* Must be quite large to support dumping a huge list of QDISC or filters. */
+#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
+#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */
+#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */
+
+struct nested_tail {
+	struct rtattr *tail;
+	struct nested_tail *prev;
+};
+
+/**
+ * Initialize a netlink socket for communicating with the kernel.
+ *
+ * @return
+ *   netlink socket file descriptor on success, -1 otherwise.
+ */
+int
+nl_init(void)
+{
+	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
+	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
+
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (fd < 0) {
+		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
+		return -1;
+	}
+	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
+		return -1;
+	}
+	return fd;
+}
+
+/**
+ * Clean up a netlink socket once all communication with the kernel is finished.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+int
+nl_final(int nlsk_fd)
+{
+	if (close(nlsk_fd)) {
+		RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
+			strerror(errno), errno);
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * Send a message to the kernel on the netlink socket.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] nh
+ *   The netlink message to send to the kernel.
+ *
+ * @return
+ *   the number of sent bytes on success, -1 otherwise.
+ */
+int
+nl_send(int nlsk_fd, struct nlmsghdr *nh)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa = {
+		.nl_family = AF_NETLINK,
+	};
+	struct iovec iov = {
+		.iov_base = nh,
+		.iov_len = nh->nlmsg_len,
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int send_bytes;
+
+	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+	nh->nlmsg_seq = (uint32_t)rte_rand();
+	send_bytes = sendmsg(nlsk_fd, &msg, 0);
+	if (send_bytes < 0) {
+		RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
+			strerror(errno), errno);
+		return -1;
+	}
+	return send_bytes;
+}
+
+/**
+ * Check that the kernel sends an appropriate ACK in response to an nl_send().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+int
+nl_recv_ack(int nlsk_fd)
+{
+	return nl_recv(nlsk_fd, NULL, NULL);
+}
+
+/**
+ * Receive a message from the kernel on the netlink socket, following an
+ * nl_send().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] cb
+ *   The callback function to call for each netlink message received.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int recv_bytes = 0, done = 0, multipart = 0, error = 0;
+
+read:
+	recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+	if (recv_bytes < 0)
+		return -1;
+	for (nh = (struct nlmsghdr *)buf;
+	     NLMSG_OK(nh, (unsigned int)recv_bytes);
+	     nh = NLMSG_NEXT(nh, recv_bytes)) {
+		/*
+		 * Multi-part messages and their following DONE message have the
+		 * NLM_F_MULTI flag set. Make note, in order to read the DONE
+		 * message afterwards.
+		 */
+		if (nh->nlmsg_flags & NLM_F_MULTI)
+			multipart = 1;
+		if (nh->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+			if (err_data->error == 0)
+				RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n",
+					__func__);
+			else {
+				RTE_LOG(DEBUG, PMD,
+					"%s() error message recvd\n", __func__);
+				error = 1;
+			}
+		}
+		/* The end of multipart message. */
+		if (nh->nlmsg_type == NLMSG_DONE)
+			/* No need to call the callback for a DONE message. */
+			done = 1;
+		else if (cb)
+			if (cb(nh, arg) < 0)
+				error = 1;
+	}
+	if (multipart && !done)
+		goto read;
+	if (error)
+		return -1;
+	return 0;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to which the attribute is appended.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data_len
+ *   The length of the data to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta;
+
+	rta = (struct rtattr *)NLMSG_TAIL(nh);
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to which the attribute is appended.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(uint8_t), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to which the attribute is appended.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(uint16_t), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to which the attribute is appended.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(uint32_t), &data);
+}
+
+/**
+ * Start a nested netlink attribute.
+ * It must be followed later by a call to nlattr_nested_finish().
+ *
+ * @param[in, out] msg
+ *   The netlink message whose nested_tails metadata is updated.
+ * @param[in] type
+ *   The nested attribute type to append.
+ *
+ * @return
+ *   -1 if adding a nested netlink attribute failed, 0 otherwise.
+ */
+int
+nlattr_nested_start(struct nlmsg *msg, uint16_t type)
+{
+	struct nested_tail *tail;
+
+	tail = rte_zmalloc(NULL, sizeof(struct nested_tail), 0);
+	if (!tail) {
+		RTE_LOG(ERR, PMD,
+			"Couldn't allocate memory for nested netlink"
+			" attribute\n");
+		return -1;
+	}
+
+	tail->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
+
+	nlattr_add(&msg->nh, type, 0, NULL);
+
+	tail->prev = msg->nested_tails;
+
+	msg->nested_tails = tail;
+
+	return 0;
+}
+
+/**
+ * End a nested netlink attribute.
+ * It follows a call to nlattr_nested_start().
+ * In effect, it updates the nested attribute length to cover every byte
+ * from the start of the nested attribute up to the current message tail.
+ *
+ * @param[in, out] msg
+ *   The netlink message whose nested_tails metadata is updated.
+ */
+void
+nlattr_nested_finish(struct nlmsg *msg)
+{
+	struct nested_tail *tail = msg->nested_tails;
+
+	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;
+
+	if (tail->prev)
+		msg->nested_tails = tail->prev;
+
+	rte_free(tail);
+}
diff --git a/drivers/net/tap/tap_netlink.h b/drivers/net/tap/tap_netlink.h
new file mode 100644
index 000000000000..52ba8c030dcc
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_NETLINK_H_
+#define _TAP_NETLINK_H_
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <stdio.h>
+
+#include <rte_log.h>
+
+/* Size in bytes of the buffer that follows the TC header in struct nlmsg. */
+#define NLMSG_BUF 512
+
+/*
+ * A netlink message carrying a TC header.
+ * nh.nlmsg_len initially covers the TC header only (see tc_init_msg()).
+ * NOTE(review): buf is presumably where nlattr_add() stores the attributes
+ * appended after the TC header -- confirm against nlattr_add().
+ */
+struct nlmsg {
+	struct nlmsghdr nh; /* Netlink message header */
+	struct tcmsg t; /* TC header, first part of the message payload */
+	char buf[NLMSG_BUF]; /* Room following the TC header */
+	/* Most recently opened nested attribute, see nlattr_nested_start() */
+	struct nested_tail *nested_tails;
+};
+
+/* Address located NLMSG_ALIGN(nlmsg_len) bytes after the start of nlh. */
+#define NLMSG_TAIL(nlh) (void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len))
+
+int nl_init(void);
+int nl_final(int nlsk_fd);
+int nl_send(int nlsk_fd, struct nlmsghdr *nh);
+int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg);
+int nl_recv_ack(int nlsk_fd);
+void nlattr_add(struct nlmsghdr *nh, unsigned short type,
+		unsigned int data_len, const void *data);
+void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data);
+void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data);
+void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data);
+int nlattr_nested_start(struct nlmsg *msg, uint16_t type);
+void nlattr_nested_finish(struct nlmsg *msg);
+
+#endif /* _TAP_NETLINK_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.c b/drivers/net/tap/tap_tcmsgs.c
new file mode 100644
index 000000000000..9a146d165b08
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.c
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <net/if.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <tap_tcmsgs.h>
+
+/* Handle and parent that, together with an ifindex, identify a QDISC. */
+struct qdisc {
+	uint32_t handle;
+	uint32_t parent;
+};
+
+/* Arguments handed by qdisc_iterate() to the per-message callback. */
+struct list_args {
+	int nlsk_fd; /* Netlink socket the listing was requested on */
+	uint16_t ifindex; /* Netdevice the caller is interested in */
+	void *custom_arg; /* Callback-specific argument, may be NULL */
+};
+
+/* Callback-specific argument used by qdisc_exist_cb(). */
+struct qdisc_custom_arg {
+	uint32_t handle; /* Handle of the QDISC looked for */
+	uint32_t parent; /* Parent of the QDISC looked for */
+	uint8_t exists; /* Incremented for each matching QDISC found */
+};
+
+/**
+ * Prepare the netlink and TC headers of a message.
+ *
+ * Only the message length, type and flags, plus the TC family and ifindex,
+ * are written; every other field of the message is left untouched.
+ *
+ * @param[in, out] msg
+ *   The netlink message to fill.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the rule will be applied.
+ * @param[in] type
+ *   The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.).
+ * @param[in] flags
+ *   Netlink flags for this message. When 0, NLM_F_REQUEST | NLM_F_ACK is
+ *   used instead.
+ */
+void
+tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags)
+{
+	struct nlmsghdr *hdr = &msg->nh;
+	struct tcmsg *tc = &msg->t;
+
+	/* The payload starts out with the TC header only, no attribute yet. */
+	hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	hdr->nlmsg_type = type;
+	hdr->nlmsg_flags = flags ? flags : (NLM_F_REQUEST | NLM_F_ACK);
+	tc->tcm_family = AF_UNSPEC;
+	tc->tcm_ifindex = ifindex;
+}
+
+/**
+ * Delete a specific QDISC identified by its iface, and its handle and parent.
+ *
+ * When no netlink socket is provided (nlsk_fd == 0), a temporary one is
+ * created for the request and released before returning, whether the
+ * request succeeded or not.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication, or 0 to use
+ *   a temporary socket.
+ * @param[in] ifindex
+ *   The netdevice ifindex on which the deletion will happen.
+ * @param[in] qinfo
+ *   Additional info to identify the QDISC (handle and parent).
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+static int
+qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo)
+{
+	struct nlmsg msg;
+	int fd = nlsk_fd;
+	int ret = -1;
+
+	tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0);
+	msg.t.tcm_handle = qinfo->handle;
+	msg.t.tcm_parent = qinfo->parent;
+	/* if no netlink socket is provided, create one */
+	if (!nlsk_fd) {
+		fd = nl_init();
+		if (fd < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not delete QDISC: null netlink socket\n");
+			return -1;
+		}
+	}
+	if (nl_send(fd, &msg.nh) >= 0 && nl_recv_ack(fd) >= 0)
+		ret = 0;
+	/*
+	 * Release the temporary socket on every path so that a failed
+	 * send/ack does not leak it. On success, the result of nl_final() is
+	 * the return value, as before.
+	 */
+	if (!nlsk_fd) {
+		int err = nl_final(fd);
+
+		if (ret == 0)
+			ret = err;
+	}
+	return ret;
+}
+
+/**
+ * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle.
+ *
+ * The QDISC is added as root of the netdevice. The request fails if the
+ * QDISC already exists (NLM_F_EXCL).
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	struct tc_multiq_qopt opt;
+	struct nlmsg msg;
+
+	/*
+	 * The options are sent to the kernel as the TCA_OPTIONS payload:
+	 * never hand over uninitialized stack memory.
+	 */
+	memset(&opt, 0, sizeof(opt));
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	msg.t.tcm_parent = TC_H_ROOT;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq");
+	nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Request the creation of the ingress QDISC, using the default ffff: handle.
+ *
+ * The request fails if the QDISC already exists (NLM_F_EXCL).
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the QDISC will be added.
+ *
+ * @return
+ *   -1 if the request could not be sent or was not acknowledged, and 0
+ *   otherwise.
+ */
+int
+qdisc_add_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	const uint16_t flags =
+		NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC, flags);
+	msg.t.tcm_parent = TC_H_INGRESS;
+	msg.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("ingress"), "ingress");
+	if (nl_send(nlsk_fd, &msg.nh) < 0 || nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Callback function checking whether a listed QDISC is the one looked for.
+ * When the iface, handle and parent all match, "exists" is incremented in
+ * the custom arg.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in, out] arg
+ *   Custom arguments for the callback (a struct list_args whose custom_arg
+ *   points to a struct qdisc_custom_arg).
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_exist_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *largs = arg;
+	struct qdisc_custom_arg *wanted = largs->custom_arg;
+	struct tcmsg *tc = NLMSG_DATA(nh);
+
+	if (largs->ifindex == (unsigned int)tc->tcm_ifindex &&
+	    tc->tcm_handle == wanted->handle &&
+	    tc->tcm_parent == wanted->parent)
+		wanted->exists++;
+	return 0;
+}
+
+/**
+ * Callback function deleting the QDISC described by a netlink message, as
+ * long as it belongs to the requested iface.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in] arg
+ *   Custom arguments for the callback (a struct list_args).
+ *
+ * @return
+ *   0 when the QDISC belongs to another iface, the result of qdisc_del()
+ *   otherwise.
+ */
+static int
+qdisc_del_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *largs = arg;
+	struct tcmsg *tc = NLMSG_DATA(nh);
+	struct qdisc target;
+
+	/* QDISCs of other ifaces are left alone. */
+	if (largs->ifindex != (unsigned int)tc->tcm_ifindex)
+		return 0;
+	target.handle = tc->tcm_handle;
+	target.parent = tc->tcm_parent;
+	/*
+	 * Passing 0 as nlsk_fd makes qdisc_del() use a socket of its own, so
+	 * the list iteration going on over the current socket is not
+	 * disturbed.
+	 */
+	return qdisc_del(0, largs->ifindex, &target);
+}
+
+/**
+ * Request a dump of the QDISCs and run callback() on each answer.
+ *
+ * The callback receives a struct list_args holding nlsk_fd, ifindex and the
+ * caller-provided arg (as custom_arg).
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ * @param[in] callback
+ *   The function to call for each QDISC.
+ * @param[in, out] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   -1 if either sending the netlink message failed, or if receiving the answer
+ *   failed, or finally if the callback returned a negative value for that
+ *   answer.
+ *   0 is returned otherwise.
+ */
+static int
+qdisc_iterate(int nlsk_fd, uint16_t ifindex,
+	      int (*callback)(struct nlmsghdr *, void *), void *arg)
+{
+	struct list_args largs;
+	struct nlmsg msg;
+
+	largs.nlsk_fd = nlsk_fd;
+	largs.ifindex = ifindex;
+	largs.custom_arg = arg;
+	tc_init_msg(&msg, ifindex, RTM_GETQDISC, NLM_F_REQUEST | NLM_F_DUMP);
+	if (nl_send(nlsk_fd, &msg.nh) < 0 ||
+	    nl_recv(nlsk_fd, callback, &largs) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Tell whether a QDISC with the given handle and parent is already present
+ * on the netdevice.
+ *
+ * A failure to list the QDISCs is not reported: it simply results in 0.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex to check QDISC existence for.
+ * @param[in] handle
+ *   The handle of the QDISC to look for.
+ * @param[in] parent
+ *   The parent of the QDISC to look for.
+ *
+ * @return
+ *   1 if the qdisc exists, 0 otherwise.
+ */
+int
+qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle, uint32_t parent)
+{
+	struct qdisc_custom_arg lookup;
+
+	lookup.handle = handle;
+	lookup.parent = parent;
+	lookup.exists = 0;
+	qdisc_iterate(nlsk_fd, ifindex, qdisc_exist_cb, &lookup);
+	return lookup.exists ? 1 : 0;
+}
+
+/**
+ * Delete all QDISCs for a given netdevice.
+ *
+ * The QDISCs are listed with qdisc_iterate(), and qdisc_del_cb() is run on
+ * each of them.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ *
+ * @return
+ *   -1 if qdisc_iterate() failed, 0 otherwise.
+ */
+int
+qdisc_flush(int nlsk_fd, uint16_t ifindex)
+{
+	return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL);
+}
+
+/**
+ * Make sure the multiqueue QDISC is present: add it unless it is found to
+ * exist already.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Already there: nothing to do. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0), TC_H_ROOT))
+		return 0;
+	if (qdisc_add_multiq(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add multiq qdisc\n");
+	return -1;
+}
+
+/**
+ * Make sure the ingress QDISC is present: add it unless it is found to
+ * exist already.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the ingress QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	/* Already there: nothing to do. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS))
+		return 0;
+	if (qdisc_add_ingress(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add ingress qdisc\n");
+	return -1;
+}
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
new file mode 100644
index 000000000000..a571a56d6964
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -0,0 +1,63 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle,
+		 uint32_t parent);
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH 4/4] net/tap: add basic flow API patterns and actions
  2017-03-03 10:45 [dpdk-dev] [PATCH 0/4] net/tap: support flow API Pascal Mazon
                   ` (2 preceding siblings ...)
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-03 10:45 ` Pascal Mazon
  2017-03-03 15:47   ` Wiles, Keith
  2017-03-03 15:54 ` [dpdk-dev] [PATCH 0/4] net/tap: support flow API Wiles, Keith
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
  5 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-03 10:45 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Supported flow rules are now mapped to TC rules on the tap netdevice.
The netlink message used for creating the TC rule is stored in struct
rte_flow. That way, by simply changing some metadata in it, we can request
the rule's deletion without further parsing.

Supported items:
- eth: src and dst (with variable masks), and eth_type (0xffff mask).
- vlan: vid, pcp, tpid, but not dei.
- ipv4/6: src and dst (with variable masks), and ip_proto (0xff mask).
- udp/tcp: src and dst port (0xffff mask).

Supported actions:
- DROP
- QUEUE
- PASSTHRU

It is generally not possible to provide a "last" item. However, if the
"last" item, once masked, is identical to the masked spec, then it is
supported.

Only IPv4/6 and MAC addresses can use a variable mask. All other
items need a full mask (exact match).

Support for VLAN requires kernel headers >= 4.9, checked using
auto-config.sh.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |  35 ++
 drivers/net/tap/rte_eth_tap.c |  33 +-
 drivers/net/tap/tap.h         |   1 +
 drivers/net/tap/tap_flow.c    | 891 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/tap/tap_flow.h    |  12 +
 5 files changed, 957 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 4ae2ca6cfbab..10a97312945d 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -41,6 +41,7 @@ LIBABIVER := 1
 
 CFLAGS += -O3
 CFLAGS += -I$(SRCDIR)
+CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 
 #
@@ -57,5 +58,39 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mbuf
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mempool
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_ether
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_kvargs
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_hash
 
 include $(RTE_SDK)/mk/rte.lib.mk
+
+# Generate and clean-up tap_autoconf.h.
+
+export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS
+export AUTO_CONFIG_CFLAGS = -Wno-error
+
+ifndef V
+AUTOCONF_OUTPUT := >/dev/null
+endif
+
+tap_autoconf.h.new: FORCE
+
+tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
+	$Q $(RM) -f -- '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_VLAN_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_VLAN_PRIO \
+		$(AUTOCONF_OUTPUT)
+
+# Create tap_autoconf.h or update it in case it differs from the new one.
+
+tap_autoconf.h: tap_autoconf.h.new
+	$Q [ -f '$@' ] && \
+		cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \
+		mv '$<' '$@'
+
+$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h
+
+clean_tap: FORCE
+	$Q rm -f -- tap_autoconf.h tap_autoconf.h.new
+
+clean: clean_tap
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index e80de0c29377..aa605f74e375 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -46,13 +46,14 @@
 #include <unistd.h>
 #include <poll.h>
 #include <arpa/inet.h>
-#include <linux/if.h>
+#include <net/if.h>
 #include <linux/if_tun.h>
 #include <linux/if_ether.h>
 #include <fcntl.h>
 
 #include <tap.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -594,9 +595,30 @@ tap_setup_queue(struct rte_eth_dev *dev,
 	rx->fd = fd;
 	tx->fd = fd;
 
-	if (qid == 0)
+	if (qid == 0) {
 		tap_mac_set(dev, &pmd->eth_addr);
 
+		pmd->if_index = if_nametoindex(pmd->name);
+		if (!pmd->if_index) {
+			RTE_LOG(ERR, PMD,
+				"Could not find ifindex for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+	}
+
 	return fd;
 }
 
@@ -853,6 +875,11 @@ eth_dev_tap_create(const char *name, char *tap_name)
 	}
 
 	LIST_INIT(&pmd->flows);
+	/*
+	 * If no netlink socket can be created, then it will fail when
+	 * creating/destroying flow rules.
+	 */
+	pmd->nlsk_fd = nl_init();
 
 	return 0;
 
@@ -969,6 +996,8 @@ rte_pmd_tap_remove(const char *name)
 	tap_flow_flush(eth_dev, NULL);
 
 	internals = eth_dev->data->dev_private;
+	if (internals->nlsk_fd)
+		nl_final(internals->nlsk_fd);
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
index 323f5705a324..96d9025412b3 100644
--- a/drivers/net/tap/tap.h
+++ b/drivers/net/tap/tap.h
@@ -64,6 +64,7 @@ struct tx_queue {
 
 struct pmd_internals {
 	char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device name */
+	int nlsk_fd; /* Netlink socket fd */
 	uint16_t nb_queues; /* Number of queues supported */
 	struct ether_addr eth_addr; /* Mac address of the device port */
 	int if_index; /* IF_INDEX for the port */
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index de41c127c920..bf989bd5be6c 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -33,14 +33,36 @@
 
 #include <sys/queue.h>
 
+#include <rte_byteorder.h>
+#include <rte_jhash.h>
 #include <rte_malloc.h>
+#include <tap_autoconf.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 #include <tap.h>
 
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+	struct nlmsg msg; /* Netlink message the flow items are converted into */
 };
 
+/*
+ * State shared by the item conversion functions while a pattern is
+ * processed, so that each layer can be checked against the previous ones.
+ */
+struct convert_data {
+	uint16_t eth_type; /* Ether type recorded by a previous item, 0 if none */
+	uint16_t ip_proto; /* IP protocol recorded by the IPv4/IPv6 item */
+#ifdef HAVE_TC_VLAN_ID
+	uint8_t vlan; /* Set to 1 once a VLAN item has been processed */
+#endif
+	/* Flow whose netlink message gets filled; NULL to only run the checks */
+	struct rte_flow *flow;
+};
+
+static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
+#ifdef HAVE_TC_VLAN_ID
+static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
+#endif
+static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
 static int
 tap_flow_validate(struct rte_eth_dev *dev,
 		  const struct rte_flow_attr *attr,
@@ -67,6 +89,761 @@ static const struct rte_flow_ops tap_flow_ops = {
 	.flush = tap_flow_flush,
 };
 
+/*
+ * Static initializer for items: expands to an anonymous constant array
+ * holding the given item types, followed by RTE_FLOW_ITEM_TYPE_END.
+ */
+#define ITEMS(...) \
+	(const enum rte_flow_item_type []){ \
+		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
+	}
+
+/*
+ * Structure to generate a simple graph of layers supported by the NIC.
+ * One entry describes one item type: what can be matched on it, how it is
+ * converted, and which item types may come next.
+ */
+struct tap_flow_items {
+	/* Bit-mask corresponding to what is supported for this item. */
+	const void *mask;
+	const unsigned int mask_sz; /* Bit-mask size in bytes. */
+	/*
+	 * Bit-mask corresponding to the default mask, if none is provided
+	 * along with the item.
+	 */
+	const void *default_mask;
+	/**
+	 * Conversion function from rte_flow to netlink attributes.
+	 *
+	 * @param item
+	 *   rte_flow item to convert.
+	 * @param data
+	 *   Internal structure to store the conversion.
+	 *
+	 * @return
+	 *   0 on success, negative value otherwise.
+	 */
+	int (*convert)(const struct rte_flow_item *item, void *data);
+	/**
+	 * List of possible following items, as built by ITEMS() (terminated
+	 * by RTE_FLOW_ITEM_TYPE_END).
+	 */
+	const enum rte_flow_item_type *const items;
+};
+
+/*
+ * Graph of supported items and associated actions.
+ * The table is indexed by item type. Each entry lists the item types that
+ * may follow, the fields that can be matched (mask), the mask applied when
+ * the item comes without one (default_mask) and the conversion function.
+ */
+static const struct tap_flow_items tap_flow_items[] = {
+	/* NOTE(review): presumably the starting point of a pattern -- confirm. */
+	[RTE_FLOW_ITEM_TYPE_END] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+	},
+	[RTE_FLOW_ITEM_TYPE_ETH] = {
+		.items = ITEMS(
+#ifdef HAVE_TC_VLAN_ID
+			RTE_FLOW_ITEM_TYPE_VLAN,
+#endif
+			RTE_FLOW_ITEM_TYPE_IPV4,
+			RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_eth){
+			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.type = -1,
+		},
+		.mask_sz = sizeof(struct rte_flow_item_eth),
+		.default_mask = &rte_flow_item_eth_mask,
+		.convert = tap_flow_create_eth,
+	},
+#ifdef HAVE_TC_VLAN_ID
+	[RTE_FLOW_ITEM_TYPE_VLAN] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
+			       RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_vlan){
+			.tpid = -1,
+			/* DEI matching is not supported */
+			/*
+			 * The constant depends on the host byte order so that
+			 * the in-memory bytes are 0xef 0xff on every host.
+			 */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+			.tci = 0xffef,
+#else
+			.tci = 0xefff,
+#endif
+		},
+		.mask_sz = sizeof(struct rte_flow_item_vlan),
+		.default_mask = &rte_flow_item_vlan_mask,
+		.convert = tap_flow_create_vlan,
+	},
+#endif
+	[RTE_FLOW_ITEM_TYPE_IPV4] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv4){
+			.hdr = {
+				.src_addr = -1,
+				.dst_addr = -1,
+				.next_proto_id = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv4),
+		.default_mask = &rte_flow_item_ipv4_mask,
+		.convert = tap_flow_create_ipv4,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV6] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv6){
+			.hdr = {
+				.src_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.dst_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.proto = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv6),
+		.default_mask = &rte_flow_item_ipv6_mask,
+		.convert = tap_flow_create_ipv6,
+	},
+	/* UDP and TCP list no following item. */
+	[RTE_FLOW_ITEM_TYPE_UDP] = {
+		.mask = &(const struct rte_flow_item_udp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_udp),
+		.default_mask = &rte_flow_item_udp_mask,
+		.convert = tap_flow_create_udp,
+	},
+	[RTE_FLOW_ITEM_TYPE_TCP] = {
+		.mask = &(const struct rte_flow_item_tcp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_tcp),
+		.default_mask = &rte_flow_item_tcp_mask,
+		.convert = tap_flow_create_tcp,
+	},
+};
+
+/**
+ * Make as much checks as possible on an Ethernet item, and if a flow is
+ * provided, fill it appropriately with Ethernet info.
+ *
+ * An address is matched on as soon as its mask is not all-zero, for the
+ * destination as well as for the source. This way a rule matching on an
+ * all-zero address is honoured instead of being silently ignored.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Additional data structure to tell next layers we've been here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_eth(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
+	/* TC does not support eth_type masking. Only accept if exact match. */
+	if (mask->type && mask->type != 0xffff)
+		return -1;
+	if (!spec)
+		return 0;
+	/* store eth_type for consistency if ipv4/6 pattern item comes next */
+	if (spec->type & mask->type)
+		info->eth_type = spec->type;
+	/* Without a flow, only the checks above are wanted. */
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (spec->type & mask->type)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
+					    (spec->type & mask->type));
+	/* The mask, not the spec, tells whether the address is matched on. */
+	if (!is_zero_ether_addr(&mask->dst)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
+			   &spec->dst.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
+			   &mask->dst.addr_bytes);
+	}
+	if (!is_zero_ether_addr(&mask->src)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
+			   &spec->src.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
+			   &mask->src.addr_bytes);
+	}
+	return 0;
+}
+
+#ifdef HAVE_TC_VLAN_ID
+/**
+ * Make as much checks as possible on a VLAN item, and if a flow is provided,
+ * fill it appropriately with VLAN info.
+ *
+ * The TCI mask is applied while spec and mask are still in the same
+ * (network) byte order, and the result is then converted to host order to
+ * extract the priority (3 bits) and the VLAN ID (12 bits).
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Additional data structure to tell next layers we've been here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_vlan *spec = item->spec;
+	const struct rte_flow_item_vlan *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
+	/* TC does not support tpid masking. Only accept if exact match. */
+	if (mask->tpid && mask->tpid != 0xffff)
+		return -1;
+	/* Double-tagging not supported. */
+	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+		return -1;
+	info->vlan = 1;
+	/* Without a flow, only the checks above are wanted. */
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
+#define VLAN_PRIO(tci) ((tci) >> 13)
+#define VLAN_ID(tci) ((tci) & 0xfff)
+	if (!spec)
+		return 0;
+	if (spec->tci) {
+		/* Mask in network order first, then convert to host order. */
+		uint16_t tci = ntohs(spec->tci & mask->tci);
+		uint16_t prio = VLAN_PRIO(tci);
+		/* A VLAN ID is 12 bits wide: it does not fit in a uint8_t. */
+		uint16_t vid = VLAN_ID(tci);
+
+		if (prio)
+			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
+		if (vid)
+			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
+	}
+	return 0;
+}
+#endif
+
+/**
+ * Check an IPv4 item as far as possible and, when a flow is provided,
+ * append the matching IPv4 attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Additional data structure to tell next layers we've been here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
+{
+	const struct rte_flow_item_ipv4 *ip_spec = item->spec;
+	const struct rte_flow_item_ipv4 *ip_mask = item->mask;
+	struct convert_data *conv = data;
+	struct rte_flow *flow = conv->flow;
+	struct nlmsg *msg;
+
+	/* Fall back to the default mask when the item comes without one. */
+	if (ip_mask == NULL)
+		ip_mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
+	/* An ether type recorded by a previous item has to be IPv4. */
+	if (conv->eth_type != 0 && conv->eth_type != htons(ETH_P_IP))
+		return -1;
+	/* Remember the IP protocol for a following udp/tcp pattern item. */
+	if (ip_spec != NULL)
+		conv->ip_proto = ip_spec->hdr.next_proto_id;
+	/* Without a flow, only the checks above are wanted. */
+	if (flow == NULL)
+		return 0;
+	msg = &flow->msg;
+	if (conv->eth_type == 0)
+		conv->eth_type = htons(ETH_P_IP);
+	/* Once a VLAN item has been seen, tcm_info is left as it is. */
+#ifdef HAVE_TC_VLAN_ID
+	if (!conv->vlan)
+#endif
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP));
+	if (ip_spec == NULL)
+		return 0;
+	if (ip_spec->hdr.dst_addr != 0) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
+			     ip_spec->hdr.dst_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
+			     ip_mask->hdr.dst_addr);
+	}
+	if (ip_spec->hdr.src_addr != 0) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
+			     ip_spec->hdr.src_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+			     ip_mask->hdr.src_addr);
+	}
+	if (ip_spec->hdr.next_proto_id != 0)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
+			    ip_spec->hdr.next_proto_id);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on an IPv6 item, and if a flow is provided,
+ * fill it appropriately with IPv6 info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state shared between layers (struct convert_data): eth_type and
+ *   ip_proto are recorded in it for the benefit of the following items.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *state = data;
+	const struct rte_flow_item_ipv6 *ipv6 = item->spec;
+	const struct rte_flow_item_ipv6 *ipv6_mask = item->mask;
+	const uint16_t ether_type = htons(ETH_P_IPV6);
+	const uint8_t unset_addr[16] = { 0 };
+	struct nlmsg *nl;
+
+	/* no mask in the item: fall back on the default one */
+	if (ipv6_mask == NULL)
+		ipv6_mask =
+			tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
+	/* an ethertype recorded by an earlier item must be IPv6 */
+	if (state->eth_type != 0 && state->eth_type != ether_type)
+		return -1;
+	/* keep ip_proto around so a following udp/tcp item can check it */
+	if (ipv6 != NULL)
+		state->ip_proto = ipv6->hdr.proto;
+	/* without a flow there is no netlink message to fill */
+	if (state->flow == NULL)
+		return 0;
+	nl = &state->flow->msg;
+	if (state->eth_type == 0)
+		state->eth_type = ether_type;
+#ifdef HAVE_TC_VLAN_ID
+	if (!state->vlan)
+#endif
+		nl->t.tcm_info = TC_H_MAKE(nl->t.tcm_info, ether_type);
+	if (ipv6 != NULL) {
+		/* an all-zero address in the spec is not matched on */
+		if (memcmp(ipv6->hdr.dst_addr, unset_addr, 16) != 0) {
+			nlattr_add(&nl->nh, TCA_FLOWER_KEY_IPV6_DST,
+				   sizeof(ipv6->hdr.dst_addr),
+				   &ipv6->hdr.dst_addr);
+			nlattr_add(&nl->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+				   sizeof(ipv6_mask->hdr.dst_addr),
+				   &ipv6_mask->hdr.dst_addr);
+		}
+		if (memcmp(ipv6->hdr.src_addr, unset_addr, 16) != 0) {
+			nlattr_add(&nl->nh, TCA_FLOWER_KEY_IPV6_SRC,
+				   sizeof(ipv6->hdr.src_addr),
+				   &ipv6->hdr.src_addr);
+			nlattr_add(&nl->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+				   sizeof(ipv6_mask->hdr.src_addr),
+				   &ipv6_mask->hdr.src_addr);
+		}
+		if (ipv6->hdr.proto != 0)
+			nlattr_add8(&nl->nh, TCA_FLOWER_KEY_IP_PROTO,
+				    ipv6->hdr.proto);
+	}
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a UDP item, and if a flow is provided,
+ * fill it appropriately with UDP info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state shared between layers (struct convert_data); its ip_proto
+ *   field, when set by a previous item, must be UDP.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_udp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *state = data;
+	const struct rte_flow_item_udp *udp = item->spec;
+	const struct rte_flow_item_udp *udp_mask = item->mask;
+	struct nlmsg *nl;
+
+	/* no mask in the item: fall back on the default one */
+	if (udp_mask == NULL)
+		udp_mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
+	/* an ip_proto recorded by the previous layer must be UDP */
+	if (state->ip_proto != 0 && state->ip_proto != IPPROTO_UDP)
+		return -1;
+	/* without a flow there is no netlink message to fill */
+	if (state->flow == NULL)
+		return 0;
+	nl = &state->flow->msg;
+	nlattr_add8(&nl->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
+	if (udp == NULL)
+		return 0;
+	/* a port is only matched when the mask keeps every bit set in it */
+	if (udp->hdr.dst_port != 0 &&
+	    (udp->hdr.dst_port & ~udp_mask->hdr.dst_port) == 0)
+		nlattr_add16(&nl->nh, TCA_FLOWER_KEY_UDP_DST,
+			     udp->hdr.dst_port);
+	if (udp->hdr.src_port != 0 &&
+	    (udp->hdr.src_port & ~udp_mask->hdr.src_port) == 0)
+		nlattr_add16(&nl->nh, TCA_FLOWER_KEY_UDP_SRC,
+			     udp->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a TCP item, and if a flow is provided,
+ * fill it appropriately with TCP info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state shared between layers (struct convert_data); its ip_proto
+ *   field, when set by a previous item, must be TCP.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *state = data;
+	const struct rte_flow_item_tcp *tcp = item->spec;
+	const struct rte_flow_item_tcp *tcp_mask = item->mask;
+	struct nlmsg *nl;
+
+	/* no mask in the item: fall back on the default one */
+	if (tcp_mask == NULL)
+		tcp_mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
+	/* an ip_proto recorded by the previous layer must be TCP */
+	if (state->ip_proto != 0 && state->ip_proto != IPPROTO_TCP)
+		return -1;
+	/* without a flow there is no netlink message to fill */
+	if (state->flow == NULL)
+		return 0;
+	nl = &state->flow->msg;
+	nlattr_add8(&nl->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
+	if (tcp == NULL)
+		return 0;
+	/* a port is only matched when the mask keeps every bit set in it */
+	if (tcp->hdr.dst_port != 0 &&
+	    (tcp->hdr.dst_port & ~tcp_mask->hdr.dst_port) == 0)
+		nlattr_add16(&nl->nh, TCA_FLOWER_KEY_TCP_DST,
+			     tcp->hdr.dst_port);
+	if (tcp->hdr.src_port != 0 &&
+	    (tcp->hdr.src_port & ~tcp_mask->hdr.src_port) == 0)
+		nlattr_add16(&nl->nh, TCA_FLOWER_KEY_TCP_SRC,
+			     tcp->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Tell whether a byte string only uses bits present in a reference bit-mask.
+ *
+ * @param[in] bytes
+ *   Byte string to check.
+ * @param[in] supported_mask
+ *   Bit-mask of the bits allowed in \bytes.
+ * @param size
+ *   Number of bytes to check.
+ *
+ * @return
+ *   1 if no bit of \bytes falls outside \supported_mask, 0 otherwise.
+ */
+static int
+tap_flow_bits_supported(const uint8_t *bytes,
+			const uint8_t *supported_mask,
+			unsigned int size)
+{
+	unsigned int i;
+
+	for (i = 0; i != size; ++i)
+		if (bytes[i] & ~supported_mask[i])
+			return 0;
+	return 1;
+}
+
+/**
+ * Check support for a given item.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param size
+ *   Bit-Mask size in bytes.
+ * @param[in] supported_mask
+ *   Bit-mask covering supported fields to compare with spec, last and mask in
+ *   \item.
+ * @param[in] default_mask
+ *   Bit-mask default mask if none is provided in \item.
+ *
+ * @return
+ *   0 on success, a non-zero value otherwise.
+ */
+static int
+tap_flow_item_validate(const struct rte_flow_item *item,
+		       unsigned int size,
+		       const uint8_t *supported_mask,
+		       const uint8_t *default_mask)
+{
+	const uint8_t *effective_mask = default_mask;
+
+	/* An empty layer is allowed, as long as all fields are NULL */
+	if (item->spec == NULL)
+		return (item->mask != NULL || item->last != NULL) ? -1 : 0;
+	if (item->mask != NULL) {
+		/* Is the item mask compatible with what the NIC supports? */
+		if (!tap_flow_bits_supported(item->mask, supported_mask, size))
+			return -1;
+		effective_mask = item->mask;
+	} else {
+		/*
+		 * Without a mask, the spec, the default mask and the last
+		 * (if any) must each fit in what the NIC supports.
+		 */
+		if (!tap_flow_bits_supported(item->spec, supported_mask, size))
+			return -1;
+		if (!tap_flow_bits_supported(default_mask, supported_mask,
+					     size))
+			return -1;
+		if (item->last != NULL &&
+		    !tap_flow_bits_supported(item->last, supported_mask, size))
+			return -1;
+	}
+	/**
+	 * Once masked, Are item spec and item last equal?
+	 * TC does not support range so anything else is invalid.
+	 */
+	if (item->last != NULL) {
+		const uint8_t *raw_spec = item->spec;
+		const uint8_t *raw_last = item->last;
+		uint8_t masked_spec[size];
+		uint8_t masked_last[size];
+		unsigned int i;
+
+		for (i = 0; i != size; ++i) {
+			masked_spec[i] = raw_spec[i] & effective_mask[i];
+			masked_last[i] = raw_last[i] & effective_mask[i];
+		}
+		return memcmp(masked_spec, masked_last, size);
+	}
+	return 0;
+}
+
+/**
+ * Append a "gact" TC action (used for DROP/PASSTHRU) to the netlink message
+ * of the provided flow.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] action
+ *   Appropriate action to be set in the TCA_GACT_PARMS structure.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+add_action_gact(struct rte_flow *flow, int action)
+{
+	struct tc_gact parms = {
+		.action = action
+	};
+	struct nlmsg *nl = &flow->msg;
+	const size_t act_index = 1;
+
+	/* open the action list, then the slot of its first action */
+	if (nlattr_nested_start(nl, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nl, act_index) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
+	if (nlattr_nested_start(nl, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_GACT_PARMS, sizeof(parms), &parms);
+	/* close TCA_ACT_OPTIONS, the action slot, then TCA_FLOWER_ACT */
+	nlattr_nested_finish(nl);
+	nlattr_nested_finish(nl);
+	nlattr_nested_finish(nl);
+	return 0;
+}
+
+/**
+ * Append a "skbedit" TC action (used for QUEUE) to the netlink message of the
+ * provided flow.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] queue
+ *   Queue id to use.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+{
+	struct tc_skbedit parms = {
+		.action = TC_ACT_PIPE
+	};
+	struct nlmsg *nl = &flow->msg;
+	const size_t act_index = 1;
+
+	/* open the action list, then the slot of its first action */
+	if (nlattr_nested_start(nl, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nl, act_index) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
+	if (nlattr_nested_start(nl, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_SKBEDIT_PARMS, sizeof(parms), &parms);
+	nlattr_add16(&nl->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
+	/* close TCA_ACT_OPTIONS, the action slot, then TCA_FLOWER_ACT */
+	nlattr_nested_finish(nl);
+	nlattr_nested_finish(nl);
+	nlattr_nested_finish(nl);
+	return 0;
+}
+
+/**
+ * Validate a flow supported by TC.
+ * If flow param is not NULL, then also fill the netlink message inside.
+ *
+ * The pattern is walked item by item: each non-VOID item must be listed as a
+ * possible successor of the previously accepted entry of tap_flow_items[]
+ * (starting from its first entry) and must pass tap_flow_item_validate().
+ * The per-item convert callbacks are only invoked when a flow is provided.
+ * At most one non-VOID action is accepted.
+ *
+ * When a flow is provided, the TC priority is built from attr->group shifted
+ * by GROUP_SHIFT, OR-ed with attr->priority + PRIORITY_OFFSET, and passed as
+ * the major part to TC_H_MAKE() for tcm_info.
+ *
+ * @param pmd
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] flow
+ *   Flow structure to update.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_process(struct pmd_internals *pmd,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow *flow)
+{
+	const struct tap_flow_items *cur_item = tap_flow_items;
+	struct convert_data data = {
+		.eth_type = 0,
+		.ip_proto = 0,
+		.flow = flow,
+	};
+	int action = 0; /* Only one action authorized for now */
+
+	if (attr->group > MAX_GROUP) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			NULL, "group value too big: cannot exceed 15");
+		return -rte_errno;
+	}
+	if (attr->priority > MAX_PRIORITY) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			NULL, "priority value too big");
+		return -rte_errno;
+	} else if (flow) {
+		uint16_t group = attr->group << GROUP_SHIFT;
+		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
+		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
+						 flow->msg.t.tcm_info);
+	}
+	if (!attr->ingress) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR,
+				   NULL, "direction should be ingress");
+		return -rte_errno;
+	}
+	/* rte_flow ingress is actually egress as seen in the kernel */
+	if (attr->ingress && flow)
+		flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	if (flow) {
+		/*
+		 * use flower filter type
+		 * NOTE(review): a failure to open the TCA_OPTIONS nest is
+		 * reported below as "item not supported" -- confirm intent.
+		 */
+		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
+		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
+			goto exit_item_not_supported;
+	}
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		const struct tap_flow_items *token = NULL;
+		unsigned int i;
+		int err = 0;
+
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		for (i = 0;
+		     cur_item->items &&
+		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
+		     ++i) {
+			if (cur_item->items[i] == items->type) {
+				token = &tap_flow_items[items->type];
+				break;
+			}
+		}
+		if (!token)
+			goto exit_item_not_supported;
+		cur_item = token; /* the next item is checked against it */
+		err = tap_flow_item_validate(
+			items, cur_item->mask_sz,
+			(const uint8_t *)cur_item->mask,
+			(const uint8_t *)cur_item->default_mask);
+		if (err)
+			goto exit_item_not_supported;
+		if (flow && cur_item->convert) {
+			err = cur_item->convert(items, &data);
+			if (err)
+				goto exit_item_not_supported;
+		}
+	}
+	if (flow) {
+#ifdef HAVE_TC_VLAN_ID
+		if (data.vlan) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     htons(ETH_P_8021Q));
+			nlattr_add16(&flow->msg.nh,
+				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+				     data.eth_type ?
+				     data.eth_type : htons(ETH_P_ALL));
+		} else
+#endif
+		if (data.eth_type)
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     data.eth_type);
+	}
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
+		int err = 0;
+
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
+			continue;
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_SHOT);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_UNSPEC);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
+			const struct rte_flow_action_queue *queue =
+				(const struct rte_flow_action_queue *)
+				actions->conf;
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (!queue || (queue->index >= pmd->nb_queues))
+				goto exit_action_not_supported;
+			if (flow)
+				err = add_action_skbedit(flow, queue->index);
+		} else {
+			goto exit_action_not_supported;
+		}
+		if (err)
+			goto exit_action_not_supported;
+	}
+	if (flow)
+		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
+	return 0;
+exit_item_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+			   items, "item not supported");
+	return -rte_errno;
+exit_action_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "action not supported");
+	return -rte_errno;
+}
+
+
+
 /**
  * Validate a flow.
  *
@@ -74,15 +851,54 @@ static const struct rte_flow_ops tap_flow_ops = {
  * @see rte_flow_ops
  */
 static int
-tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
-		  const struct rte_flow_attr *attr __rte_unused,
-		  const struct rte_flow_item items[] __rte_unused,
-		  const struct rte_flow_action actions[] __rte_unused,
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
 		  struct rte_flow_error *error)
 {
-	return -rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "not implemented yet");
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	/* a NULL flow asks for the checks only: no netlink message is built */
+	return priv_flow_process(pmd, attr, items, actions, error, NULL);
+}
+
+/**
+ * Set a unique handle in a flow.
+ *
+ * The kernel supports TC rules with equal priority, as long as they use the
+ * same matching fields (e.g.: dst mac and ipv4) with different values (and
+ * full mask to ensure no collision is possible).
+ * In those rules, the handle (uint32_t) is the part that would identify
+ * specifically each rule.
+ *
+ * When a pointer fits in the 32-bit handle, the handle is simply the flow's
+ * address. Otherwise, we rely on a jhash of the flow's address (not of its
+ * contents) to find a (sufficiently) unique handle.
+ *
+ * @param[in, out] flow
+ *   The flow that needs its handle set.
+ */
+static void
+tap_flow_set_handle(struct rte_flow *flow)
+{
+	uint32_t handle = 0;
+
+	/*
+	 * sizeof() is a compile-time constant, so only one of the two
+	 * branches is kept; no preprocessor test on the pointer size is
+	 * needed. Going through uintptr_t avoids casting a pointer to an
+	 * integer of a different size.
+	 */
+	if (sizeof(flow) <= sizeof(handle))
+		handle = (uint32_t)(uintptr_t)flow;
+	else
+		handle = rte_jhash(&flow, sizeof(flow), 1);
+	/* must be at least 1 to avoid letting the kernel choose one for us */
+	if (!handle)
+		handle = 1;
+	flow->msg.t.tcm_handle = handle;
 }
 
 /**
@@ -100,17 +916,46 @@ tap_flow_create(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rte_flow *flow = NULL;
+	struct nlmsg *msg = NULL;
+	int err;
 
-	if (tap_flow_validate(dev, attr, items, actions, error))
-		return NULL;
+	if (!pmd->if_index) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL,
+				   "can't create rule, ifindex not found");
+		goto fail;
+	}
 	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "cannot allocate memory for rte_flow");
-		return NULL;
+		goto fail;
+	}
+	msg = &flow->msg;
+	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+	tap_flow_set_handle(flow);
+	if (priv_flow_process(pmd, attr, items, actions, error, flow))
+		goto fail;
+	err = nl_send(pmd->nlsk_fd, &msg->nh);
+	if (err < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto fail;
+	}
+	err = nl_recv_ack(pmd->nlsk_fd);
+	if (err < 0) {
+		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "overlapping rules");
+		goto fail;
 	}
 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
 	return flow;
+fail:
+	if (flow)
+		rte_free(flow);
+	return NULL;
 }
 
 /**
@@ -120,13 +965,33 @@ tap_flow_create(struct rte_eth_dev *dev,
  * @see rte_flow_ops
  */
 static int
-tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+tap_flow_destroy(struct rte_eth_dev *dev,
 		 struct rte_flow *flow,
-		 struct rte_flow_error *error __rte_unused)
+		 struct rte_flow_error *error)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+	int ret = 0;
+
 	LIST_REMOVE(flow, next);
+	/*
+	 * Reuse the netlink message stored in the flow, only switching its
+	 * flags and type so that it now requests the filter deletion.
+	 */
+	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
+
+	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
+	if (ret < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto end;
+	}
+	ret = nl_recv_ack(pmd->nlsk_fd);
+	if (ret < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL,
+				   "couldn't receive kernel ack to our request");
+		goto end;
+	}
+end:
+	/*
+	 * The flow has already been unlinked from the list above and is freed
+	 * here whether or not the kernel request succeeded; ret carries the
+	 * last nl_send()/nl_recv_ack() result back to the caller.
+	 */
 	rte_free(flow);
-	return 0;
+	return ret;
 }
 
 /**
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index 377a9f7b758a..a05e945df523 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -37,6 +37,18 @@
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 
+/**
+ * In TC, priority 0 means we require the kernel to allocate one for us.
+ * In rte_flow, however, we want the priority 0 to be the most important one.
+ * Use an offset to have the most important priority being 1 in TC.
+ */
+#define PRIORITY_OFFSET 1
+#define PRIORITY_MASK (0xfff)
+#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
+#define GROUP_MASK (0xf)
+#define GROUP_SHIFT 12
+#define MAX_GROUP GROUP_MASK
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-03 15:38   ` Wiles, Keith
  2017-03-06 14:18     ` Pascal Mazon
  0 siblings, 1 reply; 57+ messages in thread
From: Wiles, Keith @ 2017-03-03 15:38 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 3, 2017, at 4:45 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> In the next patch, access to struct pmd_internals will be necessary in
> tap_flow.c to store the flows.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>
> ---
> drivers/net/tap/Makefile      |  1 +
> drivers/net/tap/rte_eth_tap.c | 34 ++------------------
> drivers/net/tap/tap.h         | 73 +++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 76 insertions(+), 32 deletions(-)
> create mode 100644 drivers/net/tap/tap.h
> 
> diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
> index e18f30c56f52..bdbe69e62a4e 100644
> --- a/drivers/net/tap/Makefile
> +++ b/drivers/net/tap/Makefile
> @@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
> LIBABIVER := 1
> 
> CFLAGS += -O3
> +CFLAGS += -I$(SRCDIR)
> CFLAGS += $(WERROR_FLAGS)
> 
> #
> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
> index 3fd057225ab3..fa57d645f3b1 100644
> --- a/drivers/net/tap/rte_eth_tap.c
> +++ b/drivers/net/tap/rte_eth_tap.c
> @@ -51,6 +51,8 @@
> #include <linux/if_ether.h>
> #include <fcntl.h>
> 
> +#include <tap.h>
> +
> /* Linux based path to the TUN device */
> #define TUN_TAP_DEV_PATH        "/dev/net/tun"
> #define DEFAULT_TAP_NAME        "dtap"
> @@ -83,38 +85,6 @@ static struct rte_eth_link pmd_link = {
> 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
> };
> 
> -struct pkt_stats {
> -	uint64_t opackets;		/* Number of output packets */
> -	uint64_t ipackets;		/* Number of input packets */
> -	uint64_t obytes;		/* Number of bytes on output */
> -	uint64_t ibytes;		/* Number of bytes on input */
> -	uint64_t errs;			/* Number of error packets */
> -};
> -
> -struct rx_queue {
> -	struct rte_mempool *mp;		/* Mempool for RX packets */
> -	uint16_t in_port;		/* Port ID */
> -	int fd;
> -
> -	struct pkt_stats stats;		/* Stats for this RX queue */
> -};
> -
> -struct tx_queue {
> -	int fd;
> -	struct pkt_stats stats;		/* Stats for this TX queue */
> -};
> -
> -struct pmd_internals {
> -	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
> -	uint16_t nb_queues;		/* Number of queues supported */
> -	struct ether_addr eth_addr;	/* Mac address of the device port */
> -
> -	int if_index;			/* IF_INDEX for the port */
> -
> -	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
> -	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
> -};
> -
> /* Tun/Tap allocation routine
>  *
>  * name is the number of the interface to use, unless NULL to take the host
> diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
> new file mode 100644
> index 000000000000..88f62b895feb
> --- /dev/null
> +++ b/drivers/net/tap/tap.h
> @@ -0,0 +1,73 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright 2017 6WIND S.A.
> + *   Copyright 2017 Mellanox.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of 6WIND S.A. nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _TAP_H_
> +#define _TAP_H_
> +
> +#include <inttypes.h>
> +
> +#include <rte_ethdev.h>
> +#include <rte_ether.h>
> +
> +#define RTE_PMD_TAP_MAX_QUEUES 16
> +
> +struct pkt_stats {
> +	uint64_t opackets; /* Number of output packets */
> +	uint64_t ipackets; /* Number of input packets */
> +	uint64_t obytes; /* Number of bytes on output */
> +	uint64_t ibytes; /* Number of bytes on input */
> +	uint64_t errs; /* Number of error packets */
> +};
> +
> +struct rx_queue {
> +	struct rte_mempool *mp; /* Mempool for RX packets */
> +	uint16_t in_port; /* Port ID */
> +	int fd;
> +	struct pkt_stats stats; /* Stats for this RX queue */
> +};
> +
> +struct tx_queue {
> +	int fd;
> +	struct pkt_stats stats; /* Stats for this TX queue */
> +};
> +
> +struct pmd_internals {
> +	char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device name */
> +	uint16_t nb_queues; /* Number of queues supported */
> +	struct ether_addr eth_addr; /* Mac address of the device port */
> +	int if_index; /* IF_INDEX for the port */
> +	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
> +	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
> +};

I guess I am going to be a bit picky here on the formatting. In moving the code from the .c file to the .h file, you compressed a lot of the white space out, and now I think it is very hard to read. Can you add back some of the white space for readability?

> +
> +#endif /* _TAP_H_ */
> -- 
> 2.8.0.rc0
> 

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH 4/4] net/tap: add basic flow API patterns and actions
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
@ 2017-03-03 15:47   ` Wiles, Keith
  2017-03-06 14:22     ` Pascal Mazon
  0 siblings, 1 reply; 57+ messages in thread
From: Wiles, Keith @ 2017-03-03 15:47 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 3, 2017, at 4:45 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> Supported flow rules are now mapped to TC rules on the tap netdevice.
> The netlink message used for creating the TC rule is stored in struct
> rte_flow. That way, by simply changing a metadata in it, we can require
> for the rule deletion without further parsing.
> 
> Supported items:
> - eth: src and dst (with variable masks), and eth_type (0xffff mask).
> - vlan: vid, pcp, tpid, but not eid.
> - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
> - udp/tcp: src and dst port (0xffff) mask.
> 
> Supported actions:
> - DROP
> - QUEUE
> - PASSTHRU
> 
> It is generally not possible to provide a "last" item. However, if the
> "last" item, once masked, is identical to the masked spec, then it is
> supported.
> 
> Only IPv4/6 and MAC addresses can use a variable mask. All other
> items need a full mask (exact match).
> 
> Support for VLAN requires kernel headers >= 4.9, checked using
> auto-config.sh.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>
> ---
> /**
> @@ -120,13 +965,33 @@ tap_flow_create(struct rte_eth_dev *dev,
>  * @see rte_flow_ops
>  */
> static int
> -tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
> +tap_flow_destroy(struct rte_eth_dev *dev,
> 		 struct rte_flow *flow,
> -		 struct rte_flow_error *error __rte_unused)
> +		 struct rte_flow_error *error)
> {
> +	struct pmd_internals *pmd = dev->data->dev_private;
> +	int ret = 0;
> +
> 	LIST_REMOVE(flow, next);
> +	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
> +	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
> +
> +	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
> +	if (ret < 0) {
> +		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
> +				   NULL, "couldn't send request to kernel");
> +		goto end;
> +	}
> +	ret = nl_recv_ack(pmd->nlsk_fd);
> +	if (ret < 0) {
> +		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
> +				   NULL,
> +				   "couldn't receive kernel ack to our request");
> +		goto end;

This goto is not required.

> +	}
> +end:
> 	rte_free(flow);
> -	return 0;
> +	return ret;
> }
> 
> /**
> diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
> index 377a9f7b758a..a05e945df523 100644
> --- a/drivers/net/tap/tap_flow.h
> +++ b/drivers/net/tap/tap_flow.h
> @@ -37,6 +37,18 @@
> #include <rte_flow.h>
> #include <rte_flow_driver.h>
> 
> +/**
> + * In TC, priority 0 means we require the kernel to allocate one for us.
> + * In rte_flow, however, we want the priority 0 to be the most important one.
> + * Use an offset to have the most important priority being 1 in TC.
> + */
> +#define PRIORITY_OFFSET 1
> +#define PRIORITY_MASK (0xfff)
> +#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
> +#define GROUP_MASK (0xf)
> +#define GROUP_SHIFT 12
> +#define MAX_GROUP GROUP_MASK
> +
> int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
> 			enum rte_filter_type filter_type,
> 			enum rte_filter_op filter_op,
> -- 
> 2.8.0.rc0
> 

Regards,
Keith


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH 0/4] net/tap: support flow API
  2017-03-03 10:45 [dpdk-dev] [PATCH 0/4] net/tap: support flow API Pascal Mazon
                   ` (3 preceding siblings ...)
  2017-03-03 10:45 ` [dpdk-dev] [PATCH 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
@ 2017-03-03 15:54 ` Wiles, Keith
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
  5 siblings, 0 replies; 57+ messages in thread
From: Wiles, Keith @ 2017-03-03 15:54 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 3, 2017, at 4:45 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> This series add support for the flow API in tap PMD.
> 
> It enables filtering specific packets incoming on the tap netdevice, to
> process only desired ones. Under the hood, it uses kernel TC (traffic
> control), which takes place very early in the stack, and supports most
> common pattern items and actions defined in the flow API.
> 
> This series applies on top of:
> 
>  [PATCH 0/6] net/tap: add additional management ops
> 
> Pascal Mazon (4):
>  net/tap: move private elements to external header
>  net/tap: add preliminary support for rte_flow
>  net/tap: add netlink back-end for flow API
>  net/tap: add basic flow API patterns and actions
> 
> doc/guides/nics/features/tap.ini |    1 +
> drivers/net/tap/Makefile         |   39 ++
> drivers/net/tap/rte_eth_tap.c    |   73 +--
> drivers/net/tap/tap.h            |   76 +++
> drivers/net/tap/tap_flow.c       | 1050 ++++++++++++++++++++++++++++++++++++++
> drivers/net/tap/tap_flow.h       |   58 +++
> drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
> drivers/net/tap/tap_netlink.h    |   69 +++
> drivers/net/tap/tap_tcmsgs.c     |  378 ++++++++++++++
> drivers/net/tap/tap_tcmsgs.h     |   63 +++
> 10 files changed, 2140 insertions(+), 34 deletions(-)
> create mode 100644 drivers/net/tap/tap.h
> create mode 100644 drivers/net/tap/tap_flow.c
> create mode 100644 drivers/net/tap/tap_flow.h
> create mode 100644 drivers/net/tap/tap_netlink.c
> create mode 100644 drivers/net/tap/tap_netlink.h
> create mode 100644 drivers/net/tap/tap_tcmsgs.c
> create mode 100644 drivers/net/tap/tap_tcmsgs.h

A lot of great changes to the TAP PMD here, but I did not see the tap.rst file getting updated to include the new remote option and other items you see fit to add to the docs.


> 
> -- 
> 2.8.0.rc0
> 

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header
  2017-03-03 15:38   ` Wiles, Keith
@ 2017-03-06 14:18     ` Pascal Mazon
  2017-03-06 14:51       ` Wiles, Keith
  0 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-06 14:18 UTC (permalink / raw)
  To: Wiles, Keith; +Cc: dev

On Fri, 3 Mar 2017 15:38:11 +0000
"Wiles, Keith" <keith.wiles@intel.com> wrote:

> 
> > On Mar 3, 2017, at 4:45 AM, Pascal Mazon <pascal.mazon@6wind.com>
> > wrote:
> > 
> > In the next patch, access to struct pmd_internals will be necessary
> > in tap_flow.c to store the flows.
> > 
> > Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> > Acked-by: Olga Shern <olgas@mellanox.com>
> > ---
> > drivers/net/tap/Makefile      |  1 +
> > drivers/net/tap/rte_eth_tap.c | 34 ++------------------
> > drivers/net/tap/tap.h         | 73
> > +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76
> > insertions(+), 32 deletions(-) create mode 100644
> > drivers/net/tap/tap.h
> > 
> > diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
> > index e18f30c56f52..bdbe69e62a4e 100644
> > --- a/drivers/net/tap/Makefile
> > +++ b/drivers/net/tap/Makefile
> > @@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
> > LIBABIVER := 1
> > 
> > CFLAGS += -O3
> > +CFLAGS += -I$(SRCDIR)
> > CFLAGS += $(WERROR_FLAGS)
> > 
> > #
> > diff --git a/drivers/net/tap/rte_eth_tap.c
> > b/drivers/net/tap/rte_eth_tap.c index 3fd057225ab3..fa57d645f3b1
> > 100644 --- a/drivers/net/tap/rte_eth_tap.c
> > +++ b/drivers/net/tap/rte_eth_tap.c
> > @@ -51,6 +51,8 @@
> > #include <linux/if_ether.h>
> > #include <fcntl.h>
> > 
> > +#include <tap.h>
> > +
> > /* Linux based path to the TUN device */
> > #define TUN_TAP_DEV_PATH        "/dev/net/tun"
> > #define DEFAULT_TAP_NAME        "dtap"
> > @@ -83,38 +85,6 @@ static struct rte_eth_link pmd_link = {
> > 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
> > };
> > 
> > -struct pkt_stats {
> > -	uint64_t opackets;		/* Number of output
> > packets */
> > -	uint64_t ipackets;		/* Number of input
> > packets */
> > -	uint64_t obytes;		/* Number of bytes on
> > output */
> > -	uint64_t ibytes;		/* Number of bytes on
> > input */
> > -	uint64_t errs;			/* Number of error
> > packets */ -};
> > -
> > -struct rx_queue {
> > -	struct rte_mempool *mp;		/* Mempool for RX
> > packets */
> > -	uint16_t in_port;		/* Port ID */
> > -	int fd;
> > -
> > -	struct pkt_stats stats;		/* Stats for this
> > RX queue */ -};
> > -
> > -struct tx_queue {
> > -	int fd;
> > -	struct pkt_stats stats;		/* Stats for this
> > TX queue */ -};
> > -
> > -struct pmd_internals {
> > -	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap
> > device name */
> > -	uint16_t nb_queues;		/* Number of queues
> > supported */
> > -	struct ether_addr eth_addr;	/* Mac address of the
> > device port */ -
> > -	int if_index;			/* IF_INDEX for the
> > port */ -
> > -	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/*
> > List of RX queues */
> > -	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/*
> > List of TX queues */ -};
> > -
> > /* Tun/Tap allocation routine
> >  *
> >  * name is the number of the interface to use, unless NULL to take
> > the host diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
> > new file mode 100644
> > index 000000000000..88f62b895feb
> > --- /dev/null
> > +++ b/drivers/net/tap/tap.h
> > @@ -0,0 +1,73 @@
> > +/*-
> > + *   BSD LICENSE
> > + *
> > + *   Copyright 2017 6WIND S.A.
> > + *   Copyright 2017 Mellanox.
> > + *
> > + *   Redistribution and use in source and binary forms, with or
> > without
> > + *   modification, are permitted provided that the following
> > conditions
> > + *   are met:
> > + *
> > + *     * Redistributions of source code must retain the above
> > copyright
> > + *       notice, this list of conditions and the following
> > disclaimer.
> > + *     * Redistributions in binary form must reproduce the above
> > copyright
> > + *       notice, this list of conditions and the following
> > disclaimer in
> > + *       the documentation and/or other materials provided with the
> > + *       distribution.
> > + *     * Neither the name of 6WIND S.A. nor the names of its
> > + *       contributors may be used to endorse or promote products
> > derived
> > + *       from this software without specific prior written
> > permission.
> > + *
> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> > CONTRIBUTORS
> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> > NOT
> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> > FITNESS FOR
> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> > COPYRIGHT
> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> > INCIDENTAL,
> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> > NOT
> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> > OF USE,
> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> > AND ON ANY
> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
> > OR TORT
> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> > THE USE
> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> > DAMAGE.
> > + */
> > +
> > +#ifndef _TAP_H_
> > +#define _TAP_H_
> > +
> > +#include <inttypes.h>
> > +
> > +#include <rte_ethdev.h>
> > +#include <rte_ether.h>
> > +
> > +#define RTE_PMD_TAP_MAX_QUEUES 16
> > +
> > +struct pkt_stats {
> > +	uint64_t opackets; /* Number of output packets */
> > +	uint64_t ipackets; /* Number of input packets */
> > +	uint64_t obytes; /* Number of bytes on output */
> > +	uint64_t ibytes; /* Number of bytes on input */
> > +	uint64_t errs; /* Number of error packets */
> > +};
> > +
> > +struct rx_queue {
> > +	struct rte_mempool *mp; /* Mempool for RX packets */
> > +	uint16_t in_port; /* Port ID */
> > +	int fd;
> > +	struct pkt_stats stats; /* Stats for this RX queue */
> > +};
> > +
> > +struct tx_queue {
> > +	int fd;
> > +	struct pkt_stats stats; /* Stats for this TX queue */
> > +};
> > +
> > +struct pmd_internals {
> > +	char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device
> > name */
> > +	uint16_t nb_queues; /* Number of queues supported */
> > +	struct ether_addr eth_addr; /* Mac address of the device
> > port */
> > +	int if_index; /* IF_INDEX for the port */
> > +	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX
> > queues */
> > +	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX
> > queues */ +};
> 
> I guess I am going to be a bit picky here on the formatting. Moving
> the code from .c to .h you compress a lot of white space out and now
> I think it is very hard to read. Can you add back some of the white
> space for readability.
> 

Do you mean whitespace between the ";" and the comment, or do you mean
blank lines? Would you rather have it look like what's described here?
  http://dpdk.org/doc/guides/contributing/coding_style.html#structure-declarations

Regards,
Pascal

> > +
> > +#endif /* _TAP_H_ */
> > -- 
> > 2.8.0.rc0
> > 
> 
> Regards,
> Keith
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH 4/4] net/tap: add basic flow API patterns and actions
  2017-03-03 15:47   ` Wiles, Keith
@ 2017-03-06 14:22     ` Pascal Mazon
  0 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-06 14:22 UTC (permalink / raw)
  To: Wiles, Keith; +Cc: dev

On Fri, 3 Mar 2017 15:47:58 +0000
"Wiles, Keith" <keith.wiles@intel.com> wrote:

> 
> > On Mar 3, 2017, at 4:45 AM, Pascal Mazon <pascal.mazon@6wind.com>
> > wrote:
> > 
> > Supported flow rules are now mapped to TC rules on the tap
> > netdevice. The netlink message used for creating the TC rule is
> > stored in struct rte_flow. That way, by simply changing a metadata
> > field in it, we can request the rule's deletion without further parsing.
> > 
> > Supported items:
> > - eth: src and dst (with variable masks), and eth_type (0xffff
> > mask).
> > - vlan: vid, pcp, tpid, but not eid.
> > - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff
> > mask).
> > - udp/tcp: src and dst port (0xffff) mask.
> > 
> > Supported actions:
> > - DROP
> > - QUEUE
> > - PASSTHRU
> > 
> > It is generally not possible to provide a "last" item. However, if
> > the "last" item, once masked, is identical to the masked spec, then
> > it is supported.
> > 
> > Only IPv4/6 and MAC addresses can use a variable mask. All other
> > items need a full mask (exact match).
> > 
> > Support for VLAN requires kernel headers >= 4.9, checked using
> > auto-config.sh.
> > 
> > Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> > Acked-by: Olga Shern <olgas@mellanox.com>
> > ---
> 
> …
> 
> > /**
> > @@ -120,13 +965,33 @@ tap_flow_create(struct rte_eth_dev *dev,
> >  * @see rte_flow_ops
> >  */
> > static int
> > -tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
> > +tap_flow_destroy(struct rte_eth_dev *dev,
> > 		 struct rte_flow *flow,
> > -		 struct rte_flow_error *error __rte_unused)
> > +		 struct rte_flow_error *error)
> > {
> > +	struct pmd_internals *pmd = dev->data->dev_private;
> > +	int ret = 0;
> > +
> > 	LIST_REMOVE(flow, next);
> > +	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
> > +	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
> > +
> > +	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
> > +	if (ret < 0) {
> > +		rte_flow_error_set(error, ENOTSUP,
> > RTE_FLOW_ERROR_TYPE_HANDLE,
> > +				   NULL, "couldn't send request to
> > kernel");
> > +		goto end;
> > +	}
> > +	ret = nl_recv_ack(pmd->nlsk_fd);
> > +	if (ret < 0) {
> > +		rte_flow_error_set(error, ENOTSUP,
> > RTE_FLOW_ERROR_TYPE_HANDLE,
> > +				   NULL,
> > +				   "couldn't receive kernel ack to
> > our request");
> > +		goto end;
> 
> This goto is not required.
> 

Indeed!
I'll fix that in v2.

Regards,
Pascal

> > +	}
> > +end:
> > 	rte_free(flow);
> > -	return 0;
> > +	return ret;
> > }
> > 
> > /**
> > diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
> > index 377a9f7b758a..a05e945df523 100644
> > --- a/drivers/net/tap/tap_flow.h
> > +++ b/drivers/net/tap/tap_flow.h
> > @@ -37,6 +37,18 @@
> > #include <rte_flow.h>
> > #include <rte_flow_driver.h>
> > 
> > +/**
> > + * In TC, priority 0 means we require the kernel to allocate one
> > for us.
> > + * In rte_flow, however, we want the priority 0 to be the most
> > important one.
> > + * Use an offset to have the most important priority being 1 in TC.
> > + */
> > +#define PRIORITY_OFFSET 1
> > +#define PRIORITY_MASK (0xfff)
> > +#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
> > +#define GROUP_MASK (0xf)
> > +#define GROUP_SHIFT 12
> > +#define MAX_GROUP GROUP_MASK
> > +
> > int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
> > 			enum rte_filter_type filter_type,
> > 			enum rte_filter_op filter_op,
> > -- 
> > 2.8.0.rc0
> > 
> 
> Regards,
> Keith
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header
  2017-03-06 14:18     ` Pascal Mazon
@ 2017-03-06 14:51       ` Wiles, Keith
  0 siblings, 0 replies; 57+ messages in thread
From: Wiles, Keith @ 2017-03-06 14:51 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 6, 2017, at 8:18 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> On Fri, 3 Mar 2017 15:38:11 +0000
> "Wiles, Keith" <keith.wiles@intel.com> wrote:
> 
>> 
>>> On Mar 3, 2017, at 4:45 AM, Pascal Mazon <pascal.mazon@6wind.com>
>>> wrote:
>>> 
>>> In the next patch, access to struct pmd_internals will be necessary
>>> in tap_flow.c to store the flows.
>>> 
>>> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
>>> Acked-by: Olga Shern <olgas@mellanox.com>
>>> ---
>>> drivers/net/tap/Makefile      |  1 +
>>> drivers/net/tap/rte_eth_tap.c | 34 ++------------------
>>> drivers/net/tap/tap.h         | 73
>>> +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76
>>> insertions(+), 32 deletions(-) create mode 100644
>>> drivers/net/tap/tap.h
>>> 
>>> diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
>>> index e18f30c56f52..bdbe69e62a4e 100644
>>> --- a/drivers/net/tap/Makefile
>>> +++ b/drivers/net/tap/Makefile
>>> @@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
>>> LIBABIVER := 1
>>> 
>>> CFLAGS += -O3
>>> +CFLAGS += -I$(SRCDIR)
>>> CFLAGS += $(WERROR_FLAGS)
>>> 
>>> #
>>> diff --git a/drivers/net/tap/rte_eth_tap.c
>>> b/drivers/net/tap/rte_eth_tap.c index 3fd057225ab3..fa57d645f3b1
>>> 100644 --- a/drivers/net/tap/rte_eth_tap.c
>>> +++ b/drivers/net/tap/rte_eth_tap.c
>>> @@ -51,6 +51,8 @@
>>> #include <linux/if_ether.h>
>>> #include <fcntl.h>
>>> 
>>> +#include <tap.h>
>>> +
>>> /* Linux based path to the TUN device */
>>> #define TUN_TAP_DEV_PATH        "/dev/net/tun"
>>> #define DEFAULT_TAP_NAME        "dtap"
>>> @@ -83,38 +85,6 @@ static struct rte_eth_link pmd_link = {
>>> 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
>>> };
>>> 
>>> -struct pkt_stats {
>>> -	uint64_t opackets;		/* Number of output
>>> packets */
>>> -	uint64_t ipackets;		/* Number of input
>>> packets */
>>> -	uint64_t obytes;		/* Number of bytes on
>>> output */
>>> -	uint64_t ibytes;		/* Number of bytes on
>>> input */
>>> -	uint64_t errs;			/* Number of error
>>> packets */ -};
>>> -
>>> -struct rx_queue {
>>> -	struct rte_mempool *mp;		/* Mempool for RX
>>> packets */
>>> -	uint16_t in_port;		/* Port ID */
>>> -	int fd;
>>> -
>>> -	struct pkt_stats stats;		/* Stats for this
>>> RX queue */ -};
>>> -
>>> -struct tx_queue {
>>> -	int fd;
>>> -	struct pkt_stats stats;		/* Stats for this
>>> TX queue */ -};
>>> -
>>> -struct pmd_internals {
>>> -	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap
>>> device name */
>>> -	uint16_t nb_queues;		/* Number of queues
>>> supported */
>>> -	struct ether_addr eth_addr;	/* Mac address of the
>>> device port */ -
>>> -	int if_index;			/* IF_INDEX for the
>>> port */ -
>>> -	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/*
>>> List of RX queues */
>>> -	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/*
>>> List of TX queues */ -};
>>> -
>>> /* Tun/Tap allocation routine
>>> *
>>> * name is the number of the interface to use, unless NULL to take
>>> the host diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
>>> new file mode 100644
>>> index 000000000000..88f62b895feb
>>> --- /dev/null
>>> +++ b/drivers/net/tap/tap.h
>>> @@ -0,0 +1,73 @@
>>> +/*-
>>> + *   BSD LICENSE
>>> + *
>>> + *   Copyright 2017 6WIND S.A.
>>> + *   Copyright 2017 Mellanox.
>>> + *
>>> + *   Redistribution and use in source and binary forms, with or
>>> without
>>> + *   modification, are permitted provided that the following
>>> conditions
>>> + *   are met:
>>> + *
>>> + *     * Redistributions of source code must retain the above
>>> copyright
>>> + *       notice, this list of conditions and the following
>>> disclaimer.
>>> + *     * Redistributions in binary form must reproduce the above
>>> copyright
>>> + *       notice, this list of conditions and the following
>>> disclaimer in
>>> + *       the documentation and/or other materials provided with the
>>> + *       distribution.
>>> + *     * Neither the name of 6WIND S.A. nor the names of its
>>> + *       contributors may be used to endorse or promote products
>>> derived
>>> + *       from this software without specific prior written
>>> permission.
>>> + *
>>> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
>>> CONTRIBUTORS
>>> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
>>> NOT
>>> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
>>> FITNESS FOR
>>> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
>>> COPYRIGHT
>>> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
>>> INCIDENTAL,
>>> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
>>> NOT
>>> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
>>> OF USE,
>>> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
>>> AND ON ANY
>>> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
>>> OR TORT
>>> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
>>> THE USE
>>> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
>>> DAMAGE.
>>> + */
>>> +
>>> +#ifndef _TAP_H_
>>> +#define _TAP_H_
>>> +
>>> +#include <inttypes.h>
>>> +
>>> +#include <rte_ethdev.h>
>>> +#include <rte_ether.h>
>>> +
>>> +#define RTE_PMD_TAP_MAX_QUEUES 16
>>> +
>>> +struct pkt_stats {
>>> +	uint64_t opackets; /* Number of output packets */
>>> +	uint64_t ipackets; /* Number of input packets */
>>> +	uint64_t obytes; /* Number of bytes on output */
>>> +	uint64_t ibytes; /* Number of bytes on input */
>>> +	uint64_t errs; /* Number of error packets */
>>> +};
>>> +
>>> +struct rx_queue {
>>> +	struct rte_mempool *mp; /* Mempool for RX packets */
>>> +	uint16_t in_port; /* Port ID */
>>> +	int fd;
>>> +	struct pkt_stats stats; /* Stats for this RX queue */
>>> +};
>>> +
>>> +struct tx_queue {
>>> +	int fd;
>>> +	struct pkt_stats stats; /* Stats for this TX queue */
>>> +};
>>> +
>>> +struct pmd_internals {
>>> +	char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device
>>> name */
>>> +	uint16_t nb_queues; /* Number of queues supported */
>>> +	struct ether_addr eth_addr; /* Mac address of the device
>>> port */
>>> +	int if_index; /* IF_INDEX for the port */
>>> +	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX
>>> queues */
>>> +	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX
>>> queues */ +};
>> 
>> I guess I am going to be a bit picky here on the formatting. Moving
>> the code from .c to .h you compress a lot of white space out and now
>> I think it is very hard to read. Can you add back some of the white
>> space for readability.
>> 
> 
> Do you mean whitespaces between ";" and the comment, or do you mean line
> jumps? Would you rather have it look like what's described there?
>  http://dpdk.org/doc/guides/contributing/coding_style.html#structure-declarations

The spaces between the ; and the comment are what I wanted here. Extra blank lines to break up the structure are optional, but good as well. You do not need to align the variables in the same columns; having the comments spaced out and aligned would be nice.

To me, white space is your friend when writing code, and the compiler normally does not care :-)
> 
> Regards,
> Pascal
> 
>>> +
>>> +#endif /* _TAP_H_ */
>>> -- 
>>> 2.8.0.rc0
>>> 
>> 
>> Regards,
>> Keith
>> 
> 

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v2 0/4] net/tap: support flow API
  2017-03-03 10:45 [dpdk-dev] [PATCH 0/4] net/tap: support flow API Pascal Mazon
                   ` (4 preceding siblings ...)
  2017-03-03 15:54 ` [dpdk-dev] [PATCH 0/4] net/tap: support flow API Wiles, Keith
@ 2017-03-06 17:05 ` Pascal Mazon
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 1/4] net/tap: move private elements to external header Pascal Mazon
                     ` (8 more replies)
  5 siblings, 9 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-06 17:05 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

This series adds support for the flow API in the tap PMD.

It enables filtering specific packets incoming on the tap netdevice, to
process only desired ones. Under the hood, it uses kernel TC (traffic
control), which takes place very early in the stack, and supports most
common pattern items and actions defined in the flow API.

This series applies on top of:

  [PATCH 0/6] net/tap: add additional management ops

v2 changes:
  - support compilation on kernels < 4.2 (where flower support appeared)
  - set whitespaces in tap.h
  - remove unnecessary goto

Pascal Mazon (4):
  net/tap: move private elements to external header
  net/tap: add preliminary support for rte_flow
  net/tap: add netlink back-end for flow API
  net/tap: add basic flow API patterns and actions

 doc/guides/nics/features/tap.ini |    1 +
 drivers/net/tap/Makefile         |   44 ++
 drivers/net/tap/rte_eth_tap.c    |   94 ++--
 drivers/net/tap/tap.h            |   77 +++
 drivers/net/tap/tap_flow.c       | 1084 ++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |   58 ++
 drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
 drivers/net/tap/tap_netlink.h    |   69 +++
 drivers/net/tap/tap_tcmsgs.c     |  378 +++++++++++++
 drivers/net/tap/tap_tcmsgs.h     |   63 +++
 10 files changed, 2202 insertions(+), 33 deletions(-)
 create mode 100644 drivers/net/tap/tap.h
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v2 1/4] net/tap: move private elements to external header
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
@ 2017-03-06 17:05   ` Pascal Mazon
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-06 17:05 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

In the next patch, access to struct pmd_internals will be necessary in
tap_flow.c to store the flows.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |  1 +
 drivers/net/tap/rte_eth_tap.c | 34 ++------------------
 drivers/net/tap/tap.h         | 73 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 32 deletions(-)
 create mode 100644 drivers/net/tap/tap.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index e18f30c56f52..bdbe69e62a4e 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
 LIBABIVER := 1
 
 CFLAGS += -O3
+CFLAGS += -I$(SRCDIR)
 CFLAGS += $(WERROR_FLAGS)
 
 #
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 21c667c82b3f..2cc77317f03e 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -51,6 +51,8 @@
 #include <linux/if_ether.h>
 #include <fcntl.h>
 
+#include <tap.h>
+
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
 #define DEFAULT_TAP_NAME        "dtap"
@@ -83,38 +85,6 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
-struct pkt_stats {
-	uint64_t opackets;		/* Number of output packets */
-	uint64_t ipackets;		/* Number of input packets */
-	uint64_t obytes;		/* Number of bytes on output */
-	uint64_t ibytes;		/* Number of bytes on input */
-	uint64_t errs;			/* Number of error packets */
-};
-
-struct rx_queue {
-	struct rte_mempool *mp;		/* Mempool for RX packets */
-	uint16_t in_port;		/* Port ID */
-	int fd;
-
-	struct pkt_stats stats;		/* Stats for this RX queue */
-};
-
-struct tx_queue {
-	int fd;
-	struct pkt_stats stats;		/* Stats for this TX queue */
-};
-
-struct pmd_internals {
-	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
-	uint16_t nb_queues;		/* Number of queues supported */
-	struct ether_addr eth_addr;	/* Mac address of the device port */
-
-	int if_index;			/* IF_INDEX for the port */
-
-	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
-	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
-};
-
 /* Tun/Tap allocation routine
  *
  * name is the number of the interface to use, unless NULL to take the host
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
new file mode 100644
index 000000000000..abd1795b2a43
--- /dev/null
+++ b/drivers/net/tap/tap.h
@@ -0,0 +1,73 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_H_
+#define _TAP_H_
+
+#include <inttypes.h>
+
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+
+#define RTE_PMD_TAP_MAX_QUEUES 16
+
+struct pkt_stats {
+	uint64_t opackets; /* Number of output packets */
+	uint64_t ipackets; /* Number of input packets */
+	uint64_t obytes; /* Number of bytes on output */
+	uint64_t ibytes; /* Number of bytes on input */
+	uint64_t errs; /* Number of error packets */
+};
+
+struct rx_queue {
+	struct rte_mempool *mp; /* Mempool for RX packets */
+	uint16_t in_port; /* Port ID */
+	int fd;
+	struct pkt_stats stats; /* Stats for this RX queue */
+};
+
+struct tx_queue {
+	int fd;
+	struct pkt_stats stats; /* Stats for this TX queue */
+};
+
+struct pmd_internals {
+	char name[RTE_ETH_NAME_MAX_LEN];  /* Internal Tap device name */
+	uint16_t nb_queues;               /* Number of queues supported */
+	struct ether_addr eth_addr;       /* Mac address of the device port */
+	int if_index;                     /* IF_INDEX for the port */
+	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
+	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
+};
+
+#endif /* _TAP_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v2 2/4] net/tap: add preliminary support for rte_flow
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-06 17:05   ` Pascal Mazon
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-06 17:05 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

The flow API provides the ability to classify packets received by a tap
netdevice.

This patch only implements skeleton functions for flow API support, no
patterns are supported yet.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/features/tap.ini |   1 +
 drivers/net/tap/Makefile         |   1 +
 drivers/net/tap/rte_eth_tap.c    |   6 ++
 drivers/net/tap/tap.h            |   2 +
 drivers/net/tap/tap_flow.c       | 185 +++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |  46 ++++++++++
 6 files changed, 241 insertions(+)
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h

diff --git a/doc/guides/nics/features/tap.ini b/doc/guides/nics/features/tap.ini
index a51712dce066..9d73f61cca3b 100644
--- a/doc/guides/nics/features/tap.ini
+++ b/doc/guides/nics/features/tap.ini
@@ -9,6 +9,7 @@ Jumbo frame          = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Basic stats          = Y
+Flow API             = Y
 MTU update           = Y
 Multicast MAC filter = Y
 Speed capabilities   = Y
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index bdbe69e62a4e..386b8b0594d3 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -47,6 +47,7 @@ CFLAGS += $(WERROR_FLAGS)
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 2cc77317f03e..f891def81af5 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -52,6 +52,7 @@
 #include <fcntl.h>
 
 #include <tap.h>
+#include <tap_flow.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -788,6 +789,7 @@ static const struct eth_dev_ops ops = {
 	.stats_get              = tap_stats_get,
 	.stats_reset            = tap_stats_reset,
 	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
+	.filter_ctrl            = tap_dev_filter_ctrl,
 };
 
 static int
@@ -851,6 +853,8 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	LIST_INIT(&pmd->flows);
+
 	return 0;
 
 error_exit:
@@ -963,6 +967,8 @@ rte_pmd_tap_remove(const char *name)
 	if (!eth_dev)
 		return 0;
 
+	tap_flow_flush(eth_dev, NULL);
+
 	internals = eth_dev->data->dev_private;
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
index abd1795b2a43..60c0c58c7c1a 100644
--- a/drivers/net/tap/tap.h
+++ b/drivers/net/tap/tap.h
@@ -34,6 +34,7 @@
 #ifndef _TAP_H_
 #define _TAP_H_
 
+#include <sys/queue.h>
 #include <inttypes.h>
 
 #include <rte_ethdev.h>
@@ -66,6 +67,7 @@ struct pmd_internals {
 	uint16_t nb_queues;               /* Number of queues supported */
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
+	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
 };
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
new file mode 100644
index 000000000000..de41c127c920
--- /dev/null
+++ b/drivers/net/tap/tap_flow.c
@@ -0,0 +1,185 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+
+#include <rte_malloc.h>
+#include <tap_flow.h>
+#include <tap.h>
+
+/*
+ * Flow rule handle. At this stage it carries no rule data, only the linkage
+ * used to chain it into the pmd_internals "flows" list.
+ */
+struct rte_flow {
+	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+};
+
+static int
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error);
+
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error);
+
+static int
+tap_flow_destroy(struct rte_eth_dev *dev,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error);
+
+/* Flow API callbacks; a pointer to this table is returned to the caller by
+ * tap_dev_filter_ctrl() for RTE_ETH_FILTER_GENERIC / RTE_ETH_FILTER_GET.
+ */
+static const struct rte_flow_ops tap_flow_ops = {
+	.validate = tap_flow_validate,
+	.create = tap_flow_create,
+	.destroy = tap_flow_destroy,
+	.flush = tap_flow_flush,
+};
+
+/**
+ * Validate a flow rule.
+ *
+ * Placeholder implementation: no rule is accepted yet. The error structure is
+ * filled through rte_flow_error_set() with ENOTSUP and the negated result of
+ * that call is returned.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
+		  const struct rte_flow_attr *attr __rte_unused,
+		  const struct rte_flow_item items[] __rte_unused,
+		  const struct rte_flow_action actions[] __rte_unused,
+		  struct rte_flow_error *error)
+{
+	int code;
+
+	code = rte_flow_error_set(error, ENOTSUP,
+				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				  NULL, "not implemented yet");
+	return -code;
+}
+
+/**
+ * Create a flow rule.
+ *
+ * The rule is first checked with tap_flow_validate(); when accepted, a handle
+ * is allocated and pushed at the head of the device's flow list.
+ *
+ * @return
+ *   The new flow handle, or NULL when validation or allocation fails (the
+ *   allocation failure is reported through @p error with ENOMEM).
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow;
+
+	if (tap_flow_validate(dev, attr, items, actions, error) != 0)
+		return NULL;
+	flow = rte_malloc(__func__, sizeof(*flow), 0);
+	if (flow != NULL) {
+		LIST_INSERT_HEAD(&pmd->flows, flow, next);
+		return flow;
+	}
+	rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+			   NULL, "cannot allocate memory for rte_flow");
+	return NULL;
+}
+
+/**
+ * Destroy a flow.
+ *
+ * Unlinks the handle from the list it is chained in and frees it.
+ *
+ * @return
+ *   Always 0.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error __rte_unused)
+{
+	/* Unlink before freeing so the list never references freed memory. */
+	LIST_REMOVE(flow, next);
+	rte_free(flow);
+	return 0;
+}
+
+/**
+ * Destroy all flows attached to a device.
+ *
+ * Repeatedly destroys the head of the device's flow list until the list is
+ * empty or a destruction fails.
+ *
+ * @return
+ *   0 when the list has been emptied, -1 if destroying a flow failed.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *head;
+
+	for (head = LIST_FIRST(&pmd->flows);
+	     head != NULL;
+	     head = LIST_FIRST(&pmd->flows)) {
+		if (tap_flow_destroy(dev, head, error) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * Only the generic (rte_flow) filter type is handled, and only for the GET
+ * operation, which hands back the table of flow callbacks.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure (only used for logging).
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure. For RTE_ETH_FILTER_GENERIC it is
+ *   written as a pointer to the rte_flow_ops table.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+		    enum rte_filter_type filter_type,
+		    enum rte_filter_op filter_op,
+		    void *arg)
+{
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			return -EINVAL;
+		*(const void **)arg = &tap_flow_ops;
+		return 0;
+	default:
+		/* Terminate the log line like every other RTE_LOG() here. */
+		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
+			(void *)dev, filter_type);
+	}
+	return -EINVAL;
+}
+
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
new file mode 100644
index 000000000000..377a9f7b758a
--- /dev/null
+++ b/drivers/net/tap/tap_flow.h
@@ -0,0 +1,46 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_FLOW_H_
+#define _TAP_FLOW_H_
+
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+
+int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+			enum rte_filter_type filter_type,
+			enum rte_filter_op filter_op,
+			void *arg);
+int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
+
+#endif /* _TAP_FLOW_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v2 3/4] net/tap: add netlink back-end for flow API
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 1/4] net/tap: move private elements to external header Pascal Mazon
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
@ 2017-03-06 17:05   ` Pascal Mazon
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-06 17:05 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

Inside tap_netlink.c, functions are generic for any netlink messaging.
Inside tap_tcmsgs.c, functions are specific to deal with TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +
 drivers/net/tap/tap_netlink.c | 367 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_netlink.h |  69 ++++++++
 drivers/net/tap/tap_tcmsgs.c  | 378 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_tcmsgs.h  |  63 +++++++
 5 files changed, 879 insertions(+)
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 386b8b0594d3..4ae2ca6cfbab 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/tap_netlink.c b/drivers/net/tap/tap_netlink.c
new file mode 100644
index 000000000000..10f00d1931c6
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <tap_netlink.h>
+#include <rte_random.h>
+
+/* Must be quite large to support dumping a huge list of QDISC or filters. */
+#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
+#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */
+#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */
+
+/*
+ * One entry of the stack of nested attributes being built: "tail" points at
+ * the nested attribute header inside the message, "prev" at the entry that
+ * was on top of the stack when this one was pushed.
+ */
+struct nested_tail {
+	struct rtattr *tail;
+	struct nested_tail *prev;
+};
+
+/**
+ * Initialize a netlink socket for communicating with the kernel.
+ *
+ * The socket is a close-on-exec NETLINK_ROUTE raw socket whose send and
+ * receive buffer sizes are set to SNDBUF_SIZE and RCVBUF_SIZE, and which is
+ * bound with an AF_NETLINK address.
+ *
+ * @return
+ *   netlink socket file descriptor on success, -1 otherwise. On failure the
+ *   descriptor is closed before returning.
+ */
+int
+nl_init(void)
+{
+	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
+	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
+
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (fd < 0) {
+		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
+		goto error;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
+		goto error;
+	}
+	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
+		goto error;
+	}
+	return fd;
+
+error:
+	/* Do not leak the descriptor when configuration or bind fails. */
+	close(fd);
+	return -1;
+}
+
+/**
+ * Close a netlink socket once all communication with the kernel is finished.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 if close() failed (the reason is logged).
+ */
+int
+nl_final(int nlsk_fd)
+{
+	if (close(nlsk_fd) == 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Send a message to the kernel on the netlink socket.
+ *
+ * The message header is stamped with pid 0 and a random sequence number
+ * before being sent.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in, out] nh
+ *   The netlink message to send to the kernel; its nlmsg_pid and nlmsg_seq
+ *   fields are overwritten.
+ *
+ * @return
+ *   the number of sent bytes on success, -1 otherwise.
+ */
+int
+nl_send(int nlsk_fd, struct nlmsghdr *nh)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
+	struct iovec iov;
+	struct msghdr msg;
+	int sent;
+
+	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+	nh->nlmsg_seq = (uint32_t)rte_rand();
+	iov = (struct iovec){
+		.iov_base = nh,
+		.iov_len = nh->nlmsg_len,
+	};
+	msg = (struct msghdr){
+		.msg_name = &kernel,
+		.msg_namelen = sizeof(kernel),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	sent = sendmsg(nlsk_fd, &msg, 0);
+	if (sent >= 0)
+		return sent;
+	RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Check that the kernel sends an appropriate ACK in response to an nl_send().
+ *
+ * Thin wrapper around nl_recv() with no callback.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 otherwise (same as nl_recv()).
+ */
+int
+nl_recv_ack(int nlsk_fd)
+{
+	return nl_recv(nlsk_fd, NULL, NULL);
+}
+
+/**
+ * Receive a message from the kernel on the netlink socket, following an
+ * nl_send().
+ *
+ * Every message found in the received buffer, except NLMSG_DONE, is passed to
+ * the callback when one is provided; this includes NLMSG_ERROR messages.
+ * When a multipart message is seen, reading continues until its NLMSG_DONE
+ * message has been received.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] cb
+ *   The callback function to call for each netlink message received.
+ *   May be NULL.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, -1 if recvmsg() failed, if the kernel reported a non-zero
+ *   error, or if the callback returned a negative value.
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int recv_bytes = 0, done = 0, multipart = 0, error = 0;
+
+read:
+	recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+	if (recv_bytes < 0)
+		return -1;
+	/* NLMSG_NEXT() decrements recv_bytes as it steps through the buffer. */
+	for (nh = (struct nlmsghdr *)buf;
+	     NLMSG_OK(nh, (unsigned int)recv_bytes);
+	     nh = NLMSG_NEXT(nh, recv_bytes)) {
+		/*
+		 * Multi-part messages and their following DONE message have the
+		 * NLM_F_MULTI flag set. Make note, in order to read the DONE
+		 * message afterwards.
+		 */
+		if (nh->nlmsg_flags & NLM_F_MULTI)
+			multipart = 1;
+		if (nh->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+			/* An error code of 0 is the kernel's ACK. */
+			if (err_data->error == 0)
+				RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n",
+					__func__);
+			else {
+				RTE_LOG(DEBUG, PMD,
+					"%s() error message recvd\n", __func__);
+				error = 1;
+			}
+		}
+		/* The end of multipart message. */
+		if (nh->nlmsg_type == NLMSG_DONE)
+			/* No need to call the callback for a DONE message. */
+			done = 1;
+		else if (cb)
+			if (cb(nh, arg) < 0)
+				error = 1;
+	}
+	/* Keep reading until the DONE message of a multipart reply shows up. */
+	if (multipart && !done)
+		goto read;
+	if (error)
+		return -1;
+	return 0;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * The attribute is written at the current (aligned) end of the message and
+ * the message length is grown accordingly. No bound check is done here: the
+ * caller must make sure the buffer behind @p nh is large enough.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data_len
+ *   The length of the data to append. May be 0 for an attribute without
+ *   payload (e.g. the header of a nested attribute).
+ * @param[in] data
+ *   The data to append. May be NULL only when data_len is 0.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta;
+
+	rta = (struct rtattr *)NLMSG_TAIL(nh);
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	/* memcpy() must not be handed a NULL source, even to copy 0 bytes. */
+	if (data_len > 0)
+		memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 8-bit value to append.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(data), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 16-bit value to append, copied as is (no byte-order conversion).
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(data), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 32-bit value to append, copied as is (no byte-order conversion).
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(data), &data);
+}
+
+/**
+ * Open a nested netlink attribute.
+ * Each call must be matched later by a call to nlattr_nested_finish().
+ *
+ * An empty attribute of the given type is appended to the message and its
+ * position is pushed on the message's nested_tails stack, so that its length
+ * can be fixed up once the nested content has been added.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ * @param[in] type
+ *   The nested attribute type to append.
+ *
+ * @return
+ *   -1 if adding a nested netlink attribute failed, 0 otherwise.
+ */
+int
+nlattr_nested_start(struct nlmsg *msg, uint16_t type)
+{
+	struct nested_tail *entry = rte_zmalloc(NULL, sizeof(*entry), 0);
+
+	if (entry == NULL) {
+		RTE_LOG(ERR, PMD,
+			"Couldn't allocate memory for nested netlink"
+			" attribute\n");
+		return -1;
+	}
+	/* Record where the nested header lands before appending it. */
+	entry->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
+	entry->prev = msg->nested_tails;
+	msg->nested_tails = entry;
+	nlattr_add(&msg->nh, type, 0, NULL);
+	return 0;
+}
+
+/**
+ * End a nested netlink attribute.
+ * It follows a call to nlattr_nested_start().
+ * In effect, it will modify the nested attribute length to include every byte
+ * from the nested attribute start, up to here, then pop and free the matching
+ * nested_tails entry.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ */
+void
+nlattr_nested_finish(struct nlmsg *msg)
+{
+	struct nested_tail *tail = msg->nested_tails;
+
+	/* Nothing to close if no nested attribute is currently open. */
+	if (!tail)
+		return;
+
+	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;
+
+	/*
+	 * Always pop the entry, including the outermost one (prev == NULL),
+	 * so that nested_tails never points at the memory freed below.
+	 */
+	msg->nested_tails = tail->prev;
+
+	rte_free(tail);
+}
diff --git a/drivers/net/tap/tap_netlink.h b/drivers/net/tap/tap_netlink.h
new file mode 100644
index 000000000000..52ba8c030dcc
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_NETLINK_H_
+#define _TAP_NETLINK_H_
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <stdio.h>
+
+#include <rte_log.h>
+
+/* Size in bytes of the attribute area that follows the TC header. */
+#define NLMSG_BUF 512
+
+/*
+ * A TC netlink request: netlink header, TC header, then room for the
+ * attributes appended by the nlattr_*() helpers. nested_tails is the stack of
+ * nested attributes currently open (see nlattr_nested_start()).
+ */
+struct nlmsg {
+	struct nlmsghdr nh;
+	struct tcmsg t;
+	char buf[NLMSG_BUF];
+	struct nested_tail *nested_tails;
+};
+
+/* Address just past the current (aligned) end of netlink message nlh. */
+#define NLMSG_TAIL(nlh) (void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len))
+
+int nl_init(void);
+int nl_final(int nlsk_fd);
+int nl_send(int nlsk_fd, struct nlmsghdr *nh);
+int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg);
+int nl_recv_ack(int nlsk_fd);
+void nlattr_add(struct nlmsghdr *nh, unsigned short type,
+		unsigned int data_len, const void *data);
+void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data);
+void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data);
+void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data);
+int nlattr_nested_start(struct nlmsg *msg, uint16_t type);
+void nlattr_nested_finish(struct nlmsg *msg);
+
+#endif /* _TAP_NETLINK_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.c b/drivers/net/tap/tap_tcmsgs.c
new file mode 100644
index 000000000000..9a146d165b08
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.c
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <net/if.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <tap_tcmsgs.h>
+
+/* Handle/parent pair identifying the QDISC to delete in qdisc_del(). */
+struct qdisc {
+	uint32_t handle;
+	uint32_t parent;
+};
+
+/* Context handed by qdisc_iterate() to the nl_recv() callbacks. */
+struct list_args {
+	int nlsk_fd;      /* netlink socket used for the dump request */
+	uint16_t ifindex; /* netdevice the callbacks should restrict to */
+	void *custom_arg; /* callback-specific data, may be NULL */
+};
+
+/* Lookup key and match counter used by qdisc_exist_cb(). */
+struct qdisc_custom_arg {
+	uint32_t handle;
+	uint32_t parent;
+	uint8_t exists; /* incremented for each matching QDISC */
+};
+
+/**
+ * Initialize a netlink message with a TC header.
+ *
+ * Only the netlink length, type and flags, and the TC family and ifindex are
+ * written; every other field of the message is left untouched.
+ *
+ * @param[in, out] msg
+ *   The netlink message to fill.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the rule will be applied.
+ * @param[in] type
+ *   The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.).
+ * @param[in] flags
+ *   Netlink flags for this msg. When 0, the default
+ *   NLM_F_REQUEST | NLM_F_ACK is used instead.
+ */
+void
+tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags)
+{
+	struct nlmsghdr *hdr = &msg->nh;
+	struct tcmsg *tc = &msg->t;
+
+	hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	hdr->nlmsg_type = type;
+	hdr->nlmsg_flags = flags ? flags : (NLM_F_REQUEST | NLM_F_ACK);
+	tc->tcm_family = AF_UNSPEC;
+	tc->tcm_ifindex = ifindex;
+}
+
+/**
+ * Delete a specific QDISC identified by its iface, and its handle and parent.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication. When 0, a
+ *   temporary socket is created for this request and closed before returning.
+ * @param[in] ifindex
+ *   The netdevice ifindex on which the deletion will happen.
+ * @param[in] qinfo
+ *   Additional info to identify the QDISC (handle and parent).
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+static int
+qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo)
+{
+	struct nlmsg msg;
+	int fd = nlsk_fd;
+	int ret = -1;
+
+	tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0);
+	msg.t.tcm_handle = qinfo->handle;
+	msg.t.tcm_parent = qinfo->parent;
+	/* if no netlink socket is provided, create one */
+	if (!nlsk_fd) {
+		fd = nl_init();
+		if (fd < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not delete QDISC: null netlink socket\n");
+			return -1;
+		}
+	}
+	if (nl_send(fd, &msg.nh) >= 0 && nl_recv_ack(fd) >= 0)
+		ret = 0;
+	/*
+	 * A socket created here must be closed on every path, including when
+	 * sending the request or receiving the ACK failed.
+	 */
+	if (!nlsk_fd && nl_final(fd) < 0)
+		ret = -1;
+	return ret;
+}
+
+/**
+ * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle.
+ *
+ * The QDISC is created at the root of the netdevice; the request fails if it
+ * already exists (NLM_F_EXCL).
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Zeroed: its content is copied verbatim into the request. */
+	struct tc_multiq_qopt opt = {0};
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	msg.t.tcm_parent = TC_H_ROOT;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq");
+	nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Add the ingress QDISC with default ffff: handle.
+ *
+ * The request fails if the QDISC already exists (NLM_F_EXCL).
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the QDISC will be added.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	struct nlmsg req;
+
+	tc_init_msg(&req, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	req.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	req.t.tcm_parent = TC_H_INGRESS;
+	nlattr_add(&req.nh, TCA_KIND, sizeof("ingress"), "ingress");
+	if (nl_send(nlsk_fd, &req.nh) < 0 || nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Callback function to check for QDISC existence.
+ * "exists" in the custom arg is incremented when the message describes a
+ * QDISC on the requested iface with the requested handle and parent.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in, out] arg
+ *   Custom arguments for the callback (a struct list_args whose custom_arg
+ *   is a struct qdisc_custom_arg).
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_exist_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *ctx = (struct list_args *)arg;
+	struct qdisc_custom_arg *wanted = ctx->custom_arg;
+	struct tcmsg *t = NLMSG_DATA(nh);
+
+	if (ctx->ifindex == (unsigned int)t->tcm_ifindex &&
+	    t->tcm_handle == wanted->handle &&
+	    t->tcm_parent == wanted->parent)
+		wanted->exists++;
+	return 0;
+}
+
+/**
+ * Callback function to delete a QDISC.
+ *
+ * Messages about other netdevices than the requested one are ignored.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in] arg
+ *   Custom arguments for the callback (a struct list_args).
+ *
+ * @return
+ *   0 when the message is ignored, the result of qdisc_del() otherwise.
+ */
+static int
+qdisc_del_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *ctx = arg;
+	struct tcmsg *t = NLMSG_DATA(nh);
+	struct qdisc target;
+
+	/* filter out other ifaces' qdiscs */
+	if (ctx->ifindex != (unsigned int)t->tcm_ifindex)
+		return 0;
+	target.handle = t->tcm_handle;
+	target.parent = t->tcm_parent;
+	/*
+	 * Use another nlsk_fd (0) to avoid tampering with the current list
+	 * iteration.
+	 */
+	return qdisc_del(0, ctx->ifindex, &target);
+}
+
+/**
+ * Iterate over all QDISC, and call the callback() function for each.
+ *
+ * A RTM_GETQDISC dump request is sent, and every message of the answer is
+ * passed to the callback together with a struct list_args wrapping the
+ * socket, the ifindex and @p arg.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ * @param[in] callback
+ *   The function to call for each QDISC.
+ * @param[in, out] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   -1 if either sending the netlink message failed, or if receiving the answer
+ *   failed, or finally if the callback returned a negative value for that
+ *   answer.
+ *   0 is returned otherwise.
+ */
+static int
+qdisc_iterate(int nlsk_fd, uint16_t ifindex,
+	      int (*callback)(struct nlmsghdr *, void *), void *arg)
+{
+	struct list_args ctx = {
+		.nlsk_fd = nlsk_fd,
+		.ifindex = ifindex,
+		.custom_arg = arg,
+	};
+	struct nlmsg req;
+
+	tc_init_msg(&req, ifindex, RTM_GETQDISC, NLM_F_REQUEST | NLM_F_DUMP);
+	if (nl_send(nlsk_fd, &req.nh) < 0 ||
+	    nl_recv(nlsk_fd, callback, &ctx) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Check whether a given QDISC already exists for the netdevice.
+ *
+ * A failure to list the QDISCs is not reported: it yields 0 unless a match
+ * was already seen.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex to check QDISC existence for.
+ * @param[in] handle
+ *   The handle of the QDISC to look for.
+ * @param[in] parent
+ *   The parent of the QDISC to look for.
+ *
+ * @return
+ *   1 if the qdisc exists, 0 otherwise.
+ */
+int
+qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle, uint32_t parent)
+{
+	struct qdisc_custom_arg lookup = {
+		.handle = handle,
+		.parent = parent,
+		.exists = 0,
+	};
+
+	qdisc_iterate(nlsk_fd, ifindex, qdisc_exist_cb, &lookup);
+	return lookup.exists ? 1 : 0;
+}
+
+/**
+ * Delete all QDISCs for a given netdevice.
+ *
+ * Lists the QDISCs with qdisc_iterate() and runs qdisc_del_cb() on each.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ *
+ * @return
+ *   -1 if the lookup or one of the deletions failed, 0 otherwise.
+ */
+int
+qdisc_flush(int nlsk_fd, uint16_t ifindex)
+{
+	return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL);
+}
+
+/**
+ * Create the multiqueue QDISC, only if it does not exist already.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Nothing to do when the root multiq QDISC is already there. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0), TC_H_ROOT))
+		return 0;
+	if (qdisc_add_multiq(nlsk_fd, ifindex) < 0) {
+		RTE_LOG(ERR, PMD, "Could not add multiq qdisc\n");
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * Create the ingress QDISC, only if it does not exist already.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the ingress QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	/* Nothing to do when the ingress QDISC is already there. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS))
+		return 0;
+	if (qdisc_add_ingress(nlsk_fd, ifindex) < 0) {
+		RTE_LOG(ERR, PMD, "Could not add ingress qdisc\n");
+		return -1;
+	}
+	return 0;
+}
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
new file mode 100644
index 000000000000..a571a56d6964
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -0,0 +1,63 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle,
+		 uint32_t parent);
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v2 4/4] net/tap: add basic flow API patterns and actions
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
                     ` (2 preceding siblings ...)
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-06 17:05   ` Pascal Mazon
  2017-03-07 15:05   ` [dpdk-dev] [PATCH v2 0/4] net/tap: support flow API Pascal Mazon
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-06 17:05 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Supported flow rules are now mapped to TC rules on the tap netdevice.
The netlink message used for creating the TC rule is stored in struct
rte_flow. That way, by simply changing a metadata field in it, we can request
the rule deletion without further parsing.

Supported items:
- eth: src and dst (with variable masks), and eth_type (0xffff mask).
- vlan: vid, pcp, tpid, but not eid.
- ipv4/6: src and dst (with variable masks), and ip_proto (0xff mask).
- udp/tcp: src and dst port (0xffff mask).

Supported actions:
- DROP
- QUEUE
- PASSTHRU

It is generally not possible to provide a "last" item. However, if the
"last" item, once masked, is identical to the masked spec, then it is
supported.

Only IPv4/6 and MAC addresses can use a variable mask. All other
items need a full mask (exact match).

Support for VLAN requires kernel headers >= 4.9, checked using
auto-config-h.sh.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |  40 ++
 drivers/net/tap/rte_eth_tap.c |  56 ++-
 drivers/net/tap/tap.h         |   2 +
 drivers/net/tap/tap_flow.c    | 925 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/tap/tap_flow.h    |  12 +
 5 files changed, 1020 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 4ae2ca6cfbab..bb8217fe48a2 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -41,6 +41,7 @@ LIBABIVER := 1
 
 CFLAGS += -O3
 CFLAGS += -I$(SRCDIR)
+CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 
 #
@@ -57,5 +58,44 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mbuf
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mempool
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_ether
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_kvargs
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_hash
 
 include $(RTE_SDK)/mk/rte.lib.mk
+
+# Generate and clean-up tap_autoconf.h.
+
+export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS
+export AUTO_CONFIG_CFLAGS = -Wno-error
+
+ifndef V
+AUTOCONF_OUTPUT := >/dev/null
+endif
+
+tap_autoconf.h.new: FORCE
+
+tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
+	$Q $(RM) -f -- '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_VLAN_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_VLAN_PRIO \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_FLOWER \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_UNSPEC \
+		$(AUTOCONF_OUTPUT)
+
+# Create tap_autoconf.h or update it in case it differs from the new one.
+
+tap_autoconf.h: tap_autoconf.h.new
+	$Q [ -f '$@' ] && \
+		cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \
+		mv '$<' '$@'
+
+$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h
+
+clean_tap: FORCE
+	$Q rm -f -- tap_autoconf.h tap_autoconf.h.new
+
+clean: clean_tap
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index f891def81af5..b7f34dd92fd7 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -42,17 +42,20 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
+#include <sys/utsname.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include <poll.h>
 #include <arpa/inet.h>
-#include <linux/if.h>
+#include <net/if.h>
 #include <linux/if_tun.h>
 #include <linux/if_ether.h>
+#include <linux/version.h>
 #include <fcntl.h>
 
 #include <tap.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -86,6 +89,8 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
+#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
+
 /* Tun/Tap allocation routine
  *
  * name is the number of the interface to use, unless NULL to take the host
@@ -594,9 +599,32 @@ tap_setup_queue(struct rte_eth_dev *dev,
 	rx->fd = fd;
 	tx->fd = fd;
 
-	if (qid == 0)
+	if (qid == 0) {
 		tap_mac_set(dev, &pmd->eth_addr);
 
+		pmd->if_index = if_nametoindex(pmd->name);
+		if (!pmd->if_index) {
+			RTE_LOG(ERR, PMD,
+				"Could not find ifindex for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (!pmd->flower_support)
+			return fd;
+		if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create ingress qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd; /* the queue itself is usable, only rte_flow is not */
+		}
+	}
+
 	return fd;
 }
 
@@ -793,6 +821,20 @@ static const struct eth_dev_ops ops = {
 };
 
 static int
+tap_flower_supported(void)
+{
+	struct utsname uts;
+	int major, minor, patch;
+
+	/* Unknown or unparsable kernel release: report flower as unsupported. */
+	if (uname(&uts) == -1)
+		return 0;
+	if (sscanf(uts.release, "%d.%d.%d", &major, &minor, &patch) != 3)
+		return 0;
+	return KERNEL_VERSION(major, minor, patch) >= FLOWER_KERNEL_VERSION;
+}
+
+static int
 eth_dev_tap_create(const char *name, char *tap_name)
 {
 	int numa_node = rte_socket_id();
@@ -853,7 +895,15 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	pmd->flower_support = tap_flower_supported();
+	if (!pmd->flower_support)
+		return 0;
 	LIST_INIT(&pmd->flows);
+	/*
+	 * If no netlink socket can be created, then it will fail when
+	 * creating/destroying flow rules.
+	 */
+	pmd->nlsk_fd = nl_init();
 
 	return 0;
 
@@ -970,6 +1020,8 @@ rte_pmd_tap_remove(const char *name)
 	tap_flow_flush(eth_dev, NULL);
 
 	internals = eth_dev->data->dev_private;
+	if (internals->flower_support && internals->nlsk_fd)
+		nl_final(internals->nlsk_fd);
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
index 60c0c58c7c1a..db157c60d1cb 100644
--- a/drivers/net/tap/tap.h
+++ b/drivers/net/tap/tap.h
@@ -67,6 +67,8 @@ struct pmd_internals {
 	uint16_t nb_queues;               /* Number of queues supported */
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
+	int nlsk_fd;                      /* Netlink socket fd */
+	int flower_support;               /* 1 if kernel supports, else 0 */
 	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index de41c127c920..9c299d82c00b 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -33,14 +33,68 @@
 
 #include <sys/queue.h>
 
+#include <rte_byteorder.h>
+#include <rte_jhash.h>
 #include <rte_malloc.h>
+#include <tap_autoconf.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 #include <tap.h>
 
+#ifndef HAVE_TC_FLOWER
+/*
+ * Fallback for headers < 4.2 lacking this linux/pkt_cls.h enum. Runtime checks
+ * will be made to avoid sending TC messages the kernel cannot understand.
+ */
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
+	__TCA_FLOWER_MAX,
+};
+#endif
+
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+	struct nlmsg msg;
+};
+
+struct convert_data {
+	uint16_t eth_type; /* EtherType as found in the item spec, 0 if none */
+	uint16_t ip_proto; /* IP protocol from the IPv4/IPv6 spec, 0 if none */
+#ifdef HAVE_TC_VLAN_ID
+	uint8_t vlan; /* set to 1 once a VLAN item has been processed */
+#endif
+	struct rte_flow *flow; /* flow whose msg is filled; NULL: checks only */
 };
 
+static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
+#ifdef HAVE_TC_VLAN_ID
+static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
+#endif
+static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
 static int
 tap_flow_validate(struct rte_eth_dev *dev,
 		  const struct rte_flow_attr *attr,
@@ -67,6 +121,761 @@ static const struct rte_flow_ops tap_flow_ops = {
 	.flush = tap_flow_flush,
 };
 
+/* Build an END-terminated compound-literal array of item types. */
+#define ITEMS(...) \
+	(const enum rte_flow_item_type []){ \
+		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
+	}
+
+/* Structure to generate a simple graph of layers supported by the NIC. */
+struct tap_flow_items {
+	/* Bit-mask corresponding to what is supported for this item. */
+	const void *mask;
+	const unsigned int mask_sz; /* Bit-mask size in bytes. */
+	/*
+	 * Bit-mask corresponding to the default mask, if none is provided
+	 * along with the item.
+	 */
+	const void *default_mask;
+	/**
+	 * Conversion function from rte_flow to netlink attributes.
+	 *
+	 * @param item
+	 *   rte_flow item to convert.
+	 * @param data
+	 *   Internal structure to store the conversion (struct convert_data).
+	 *
+	 * @return
+	 *   0 on success, negative value otherwise.
+	 */
+	int (*convert)(const struct rte_flow_item *item, void *data);
+	/** END-terminated list of item types allowed right after this one. */
+	const enum rte_flow_item_type *const items;
+};
+
+/* Graph of supported items: masks, converter and allowed following items. */
+static const struct tap_flow_items tap_flow_items[] = {
+	[RTE_FLOW_ITEM_TYPE_END] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+	},
+	[RTE_FLOW_ITEM_TYPE_ETH] = {
+		.items = ITEMS(
+#ifdef HAVE_TC_VLAN_ID
+			RTE_FLOW_ITEM_TYPE_VLAN,
+#endif
+			RTE_FLOW_ITEM_TYPE_IPV4,
+			RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_eth){
+			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.type = -1,
+		},
+		.mask_sz = sizeof(struct rte_flow_item_eth),
+		.default_mask = &rte_flow_item_eth_mask,
+		.convert = tap_flow_create_eth,
+	},
+#ifdef HAVE_TC_VLAN_ID
+	[RTE_FLOW_ITEM_TYPE_VLAN] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
+			       RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_vlan){
+			.tpid = -1,
+			/* DEI matching is not supported */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+			.tci = 0xffef, /* bytes in memory: 0xef 0xff */
+#else
+			.tci = 0xefff, /* bytes in memory: 0xef 0xff */
+#endif
+		},
+		.mask_sz = sizeof(struct rte_flow_item_vlan),
+		.default_mask = &rte_flow_item_vlan_mask,
+		.convert = tap_flow_create_vlan,
+	},
+#endif
+	[RTE_FLOW_ITEM_TYPE_IPV4] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv4){
+			.hdr = {
+				.src_addr = -1,
+				.dst_addr = -1,
+				.next_proto_id = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv4),
+		.default_mask = &rte_flow_item_ipv4_mask,
+		.convert = tap_flow_create_ipv4,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV6] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv6){
+			.hdr = {
+				.src_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.dst_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.proto = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv6),
+		.default_mask = &rte_flow_item_ipv6_mask,
+		.convert = tap_flow_create_ipv6,
+	},
+	[RTE_FLOW_ITEM_TYPE_UDP] = {
+		.mask = &(const struct rte_flow_item_udp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_udp),
+		.default_mask = &rte_flow_item_udp_mask,
+		.convert = tap_flow_create_udp, /* no .items: nothing may follow */
+	},
+	[RTE_FLOW_ITEM_TYPE_TCP] = {
+		.mask = &(const struct rte_flow_item_tcp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_tcp),
+		.default_mask = &rte_flow_item_tcp_mask,
+		.convert = tap_flow_create_tcp, /* no .items: nothing may follow */
+	},
+};
+
+/**
+ * Make as many checks as possible on an Ethernet item, and if a flow is
+ * provided, fill it appropriately with Ethernet info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared by all items; the EtherType is recorded in it.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_eth(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
+	/* TC does not support eth_type masking. Only accept if exact match. */
+	if (mask->type && mask->type != 0xffff)
+		return -1;
+	if (!spec)
+		return 0;
+	/* store eth_type for consistency if ipv4/6 pattern item comes next */
+	if (spec->type & mask->type)
+		info->eth_type = spec->type;
+	if (!flow)
+		return 0; /* validation only: no netlink message to fill */
+	msg = &flow->msg;
+	if (spec->type & mask->type)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
+					    (spec->type & mask->type));
+	if (!is_zero_ether_addr(&mask->dst)) { /* mask, not spec, selects */
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
+			   &spec->dst.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
+			   &mask->dst.addr_bytes);
+	}
+	if (!is_zero_ether_addr(&mask->src)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
+			   &spec->src.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
+			   &mask->src.addr_bytes);
+	}
+	return 0;
+}
+
+#ifdef HAVE_TC_VLAN_ID
+/**
+ * Make as many checks as possible on a VLAN item, and if a flow is provided,
+ * fill it appropriately with VLAN info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared by all items; its vlan flag gets set here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_vlan *spec = item->spec;
+	const struct rte_flow_item_vlan *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
+	/* TC does not support tpid masking. Only accept if exact match. */
+	if (mask->tpid && mask->tpid != 0xffff)
+		return -1;
+	/* Double-tagging not supported. */
+	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+		return -1;
+	info->vlan = 1;
+	if (!flow)
+		return 0; /* validation only: no netlink message to fill */
+	msg = &flow->msg;
+	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
+#define VLAN_PRIO(tci) ((tci) >> 13)
+#define VLAN_ID(tci) ((tci) & 0xfff)
+	if (!spec)
+		return 0;
+	if (spec->tci) {
+		uint16_t tci = ntohs(spec->tci & mask->tci); /* mask, then swap */
+		uint8_t prio = VLAN_PRIO(tci); /* 3-bit PCP */
+		uint16_t vid = VLAN_ID(tci); /* 12 bits: does not fit in 8 */
+
+		if (prio)
+			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
+		if (vid)
+			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
+	}
+	return 0;
+}
+#endif
+
+/**
+ * Make as many checks as possible on an IPv4 item, and if a flow is provided,
+ * fill it appropriately with IPv4 info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared by all items; ip_proto is recorded in it.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_ipv4 *spec = item->spec;
+	const struct rte_flow_item_ipv4 *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
+	/* check that previous eth type is compatible with ipv4 */
+	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
+		return -1;
+	/* store ip_proto for consistency if udp/tcp pattern item comes next */
+	if (spec)
+		info->ip_proto = spec->hdr.next_proto_id;
+	if (!flow)
+		return 0; /* validation only: no netlink message to fill */
+	msg = &flow->msg;
+	if (!info->eth_type)
+		info->eth_type = htons(ETH_P_IP);
+#ifdef HAVE_TC_VLAN_ID
+	if (!info->vlan) /* a VLAN item already set the 802.1Q protocol */
+#endif
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP));
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_addr) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
+			     spec->hdr.dst_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
+			     mask->hdr.dst_addr);
+	}
+	if (spec->hdr.src_addr) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
+			     spec->hdr.src_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+			     mask->hdr.src_addr);
+	}
+	if (spec->hdr.next_proto_id)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
+			    spec->hdr.next_proto_id);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on an IPv6 item, and if a flow is provided,
+ * fill it appropriately with IPv6 info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared by all items; ip_proto is recorded in it.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_ipv6 *spec = item->spec;
+	const struct rte_flow_item_ipv6 *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	uint8_t empty_addr[16] = { 0 }; /* all-zero address to compare with */
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
+	/* check that previous eth type is compatible with ipv6 */
+	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
+		return -1;
+	/* store ip_proto for consistency if udp/tcp pattern item comes next */
+	if (spec)
+		info->ip_proto = spec->hdr.proto;
+	if (!flow)
+		return 0; /* validation only: no netlink message to fill */
+	msg = &flow->msg;
+	if (!info->eth_type)
+		info->eth_type = htons(ETH_P_IPV6);
+#ifdef HAVE_TC_VLAN_ID
+	if (!info->vlan) /* a VLAN item already set the 802.1Q protocol */
+#endif
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6));
+	if (!spec)
+		return 0;
+	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
+			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
+	}
+	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
+			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
+	}
+	if (spec->hdr.proto)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a UDP item, and if a flow is provided,
+ * fill it appropriately with UDP info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared by all items; its ip_proto is checked here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_udp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_udp *spec = item->spec;
+	const struct rte_flow_item_udp *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
+	/* check that previous ip_proto is compatible with udp */
+	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
+		return -1;
+	if (!flow)
+		return 0; /* validation only: no netlink message to fill */
+	msg = &flow->msg;
+	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_port && /* added only if the mask keeps every bit */
+	    (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port && /* added only if the mask keeps every bit */
+	    (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a TCP item, and if a flow is provided,
+ * fill it appropriately with TCP info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared by all items; its ip_proto is checked here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_tcp *spec = item->spec;
+	const struct rte_flow_item_tcp *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
+	/* check that previous ip_proto is compatible with tcp */
+	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
+		return -1;
+	if (!flow)
+		return 0; /* validation only: no netlink message to fill */
+	msg = &flow->msg;
+	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_port && /* added only if the mask keeps every bit */
+	    (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port && /* added only if the mask keeps every bit */
+	    (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Check support for a given item.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param size
+ *   Bit-Mask size in bytes.
+ * @param[in] supported_mask
+ *   Bit-mask covering supported fields to compare with spec, last and mask in
+ *   \item.
+ * @param[in] default_mask
+ *   Bit-mask default mask if none is provided in \item.
+ *
+ * @return
+ *   0 on success, non-zero (-1 or a memcmp() result) if the item is rejected.
+ */
+static int
+tap_flow_item_validate(const struct rte_flow_item *item,
+		       unsigned int size,
+		       const uint8_t *supported_mask,
+		       const uint8_t *default_mask)
+{
+	int ret = 0;
+
+	/* An empty layer is allowed, as long as all fields are NULL */
+	if (!item->spec && (item->mask || item->last))
+		return -1;
+	/* Is the item spec compatible with what the NIC supports? */
+	if (item->spec && !item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->spec;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+		/* Is the default mask compatible with what the NIC supports? */
+		for (i = 0; i < size; i++)
+			if ((default_mask[i] | supported_mask[i]) !=
+			    supported_mask[i])
+				return -1;
+	}
+	/* Is the item last compatible with what the NIC supports? */
+	if (item->last && !item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->last;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+	}
+	/* Is the item mask compatible with what the NIC supports? */
+	if (item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->mask;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+	}
+	/**
+	 * Once masked, are item spec and item last equal?
+	 * TC does not support range so anything else is invalid.
+	 */
+	if (item->spec && item->last) {
+		uint8_t spec[size];
+		uint8_t last[size];
+		const uint8_t *apply = default_mask;
+		unsigned int i;
+
+		if (item->mask)
+			apply = item->mask;
+		for (i = 0; i < size; ++i) {
+			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
+			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
+		}
+		ret = memcmp(spec, last, size); /* non-zero: a real range */
+	}
+	return ret;
+}
+
+/**
+ * Transform a DROP/PASSTHRU action item in the provided flow for TC.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] action
+ *   Appropriate action to be set in the TCA_GACT_PARMS structure.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_gact(struct rte_flow *flow, int action)
+{
+	struct nlmsg *msg = &flow->msg;
+	size_t act_index = 1; /* type of the single nested action attribute */
+	struct tc_gact p = {
+		.action = action
+	};
+
+	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
+		return -1;
+	if (nlattr_nested_start(msg, act_index++) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
+	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
+	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
+	nlattr_nested_finish(msg); /* nested act_index */
+	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Transform a QUEUE action item in the provided flow for TC.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] queue
+ *   Queue id to use.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+{
+	struct nlmsg *msg = &flow->msg;
+	size_t act_index = 1; /* type of the single nested action attribute */
+	struct tc_skbedit p = {
+		.action = TC_ACT_PIPE
+	};
+
+	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
+		return -1;
+	if (nlattr_nested_start(msg, act_index++) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
+	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
+	nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
+	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
+	nlattr_nested_finish(msg); /* nested act_index */
+	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Validate a flow supported by TC.
+ * If flow param is not NULL, then also fill the netlink message inside.
+ *
+ * @param pmd
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] flow
+ *   Flow structure to update, or NULL to only run the checks.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_process(struct pmd_internals *pmd,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow *flow)
+{
+	const struct tap_flow_items *cur_item = tap_flow_items;
+	struct convert_data data = {
+		.eth_type = 0,
+		.ip_proto = 0,
+		.flow = flow,
+	};
+	int action = 0; /* Only one action authorized for now */
+
+	if (attr->group > MAX_GROUP) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			NULL, "group value too big: cannot exceed 15");
+		return -rte_errno;
+	}
+	if (attr->priority > MAX_PRIORITY) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			NULL, "priority value too big");
+		return -rte_errno;
+	} else if (flow) {
+		uint16_t group = attr->group << GROUP_SHIFT;
+		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
+		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
+						 flow->msg.t.tcm_info);
+	}
+	if (!attr->ingress) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR,
+				   NULL, "direction should be ingress");
+		return -rte_errno;
+	}
+	/* rte_flow ingress is actually egress as seen in the kernel */
+	if (attr->ingress && flow)
+		flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	if (flow) {
+		/* use flower filter type */
+		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
+		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
+			goto exit_item_not_supported;
+	}
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		const struct tap_flow_items *token = NULL;
+		unsigned int i;
+		int err = 0;
+
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		for (i = 0;
+		     cur_item->items &&
+		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
+		     ++i) {
+			if (cur_item->items[i] == items->type) {
+				token = &tap_flow_items[items->type];
+				break;
+			}
+		}
+		if (!token)
+			goto exit_item_not_supported;
+		cur_item = token;
+		err = tap_flow_item_validate(
+			items, cur_item->mask_sz,
+			(const uint8_t *)cur_item->mask,
+			(const uint8_t *)cur_item->default_mask);
+		if (err)
+			goto exit_item_not_supported;
+		if (cur_item->convert) { /* also run when flow is NULL */
+			err = cur_item->convert(items, &data);
+			if (err)
+				goto exit_item_not_supported;
+		}
+	}
+	if (flow) {
+#ifdef HAVE_TC_VLAN_ID
+		if (data.vlan) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     htons(ETH_P_8021Q));
+			nlattr_add16(&flow->msg.nh,
+				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+				     data.eth_type ?
+				     data.eth_type : htons(ETH_P_ALL));
+		} else
+#endif
+		if (data.eth_type)
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     data.eth_type);
+	}
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
+		int err = 0;
+
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
+			continue;
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_SHOT);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_UNSPEC);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
+			const struct rte_flow_action_queue *queue =
+				(const struct rte_flow_action_queue *)
+				actions->conf;
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (!queue || (queue->index >= pmd->nb_queues))
+				goto exit_action_not_supported;
+			if (flow)
+				err = add_action_skbedit(flow, queue->index);
+		} else {
+			goto exit_action_not_supported;
+		}
+		if (err)
+			goto exit_action_not_supported;
+	}
+	if (flow)
+		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
+	return 0;
+exit_item_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+			   items, "item not supported");
+	return -rte_errno;
+exit_action_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "action not supported");
+	return -rte_errno;
+}
+
+
+
 /**
  * Validate a flow.
  *
@@ -74,15 +883,54 @@ static const struct rte_flow_ops tap_flow_ops = {
  * @see rte_flow_ops
  */
 static int
-tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
-		  const struct rte_flow_attr *attr __rte_unused,
-		  const struct rte_flow_item items[] __rte_unused,
-		  const struct rte_flow_action actions[] __rte_unused,
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
 		  struct rte_flow_error *error)
 {
-	return -rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "not implemented yet");
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	return priv_flow_process(pmd, attr, items, actions, error, NULL);
+}
+
+/**
+ * Set a unique handle in a flow.
+ *
+ * The kernel supports TC rules with equal priority, as long as they use the
+ * same matching fields (e.g.: dst mac and ipv4) with different values (and
+ * full mask to ensure no collision is possible).
+ * In those rules, the handle (uint32_t) is the part that would identify
+ * specifically each rule.
+ *
+ * On 32-bit architectures, the handle can simply be the flow's pointer address.
+ * On 64-bit architectures, we rely on jhash(flow) to find a (sufficiently)
+ * unique handle.
+ *
+ * @param[in, out] flow
+ *   The flow that needs its handle set.
+ */
+static void
+tap_flow_set_handle(struct rte_flow *flow)
+{
+	uint32_t handle = 0;
+
+#if defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 8
+	handle = rte_jhash(&flow, sizeof(flow), 1);
+#else
+	if (sizeof(flow) == 4) {
+		/* 32-bit arch: the pointer value fits in the handle */
+		uintptr_t h = (uintptr_t)flow;
+
+		handle = (uint32_t)h;
+	} else {
+		handle = rte_jhash(&flow, sizeof(flow), 1);
+	}
+#endif
+	/* must be at least 1 to avoid letting the kernel choose one for us */
+	if (!handle)
+		handle = 1;
+	flow->msg.t.tcm_handle = handle;
 }
 
 /**
@@ -100,17 +948,46 @@ tap_flow_create(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rte_flow *flow = NULL;
+	struct nlmsg *msg = NULL;
+	int err;
 
-	if (tap_flow_validate(dev, attr, items, actions, error))
-		return NULL;
+	if (!pmd->if_index) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL,
+				   "can't create rule, ifindex not found");
+		goto fail;
+	}
 	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "cannot allocate memory for rte_flow");
-		return NULL;
+		goto fail;
+	}
+	msg = &flow->msg;
+	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+	tap_flow_set_handle(flow);
+	if (priv_flow_process(pmd, attr, items, actions, error, flow))
+		goto fail;
+	err = nl_send(pmd->nlsk_fd, &msg->nh);
+	if (err < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto fail;
+	}
+	err = nl_recv_ack(pmd->nlsk_fd);
+	if (err < 0) {
+		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "overlapping rules");
+		goto fail;
 	}
 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
 	return flow;
+fail:
+	if (flow)
+		rte_free(flow);
+	return NULL;
 }
 
 /**
@@ -120,13 +997,31 @@ tap_flow_create(struct rte_eth_dev *dev,
  * @see rte_flow_ops
  */
 static int
-tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+tap_flow_destroy(struct rte_eth_dev *dev,
 		 struct rte_flow *flow,
-		 struct rte_flow_error *error __rte_unused)
+		 struct rte_flow_error *error)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+	int ret = 0;
+
 	LIST_REMOVE(flow, next);
+	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
+
+	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
+	if (ret < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto end;
+	}
+	ret = nl_recv_ack(pmd->nlsk_fd);
+	if (ret < 0)
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"couldn't receive kernel ack to our request");
+end:
 	rte_free(flow);
-	return 0;
+	return ret;
 }
 
 /**
@@ -170,6 +1065,10 @@ tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 		    enum rte_filter_op filter_op,
 		    void *arg)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	if (!pmd->flower_support)
+		return -ENOTSUP;
 	switch (filter_type) {
 	case RTE_ETH_FILTER_GENERIC:
 		if (filter_op != RTE_ETH_FILTER_GET)
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index 377a9f7b758a..a05e945df523 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -37,6 +37,18 @@
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 
+/**
+ * In TC, priority 0 means we require the kernel to allocate one for us.
+ * In rte_flow, however, we want the priority 0 to be the most important one.
+ * Use an offset to have the most important priority being 1 in TC.
+ */
+#define PRIORITY_OFFSET 1
+#define PRIORITY_MASK (0xfff)
+#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
+#define GROUP_MASK (0xf)
+#define GROUP_SHIFT 12
+#define MAX_GROUP GROUP_MASK
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/4] net/tap: support flow API
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
                     ` (3 preceding siblings ...)
  2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
@ 2017-03-07 15:05   ` Pascal Mazon
  2017-03-07 15:08     ` Wiles, Keith
  2017-03-07 16:35   ` [dpdk-dev] [PATCH v3 " Pascal Mazon
                     ` (3 subsequent siblings)
  8 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-07 15:05 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev

Hi Keith,

I'm working on a v3 for that series:

- I added info regarding flow API support in tap.rst doc
- I fixed support for kernels where flower/vlan was not supported.

Do you have any other remarks, or can I send the v3 (hopefully ok for
integration)?

I'll send a v2 for the latest series (introducing remote capture),
because struct pmd_internals changed (whitespaces).
Same question there, do you have remarks?

Thank you.

Best regards,
Pascal

On Mon,  6 Mar 2017 18:05:26 +0100
Pascal Mazon <pascal.mazon@6wind.com> wrote:

> This series add support for the flow API in tap PMD.
> 
> It enables filtering specific packets incoming on the tap netdevice,
> to process only desired ones. Under the hood, it uses kernel TC
> (traffic control), which takes place very early in the stack, and
> supports most common pattern items and actions defined in the flow
> API.
> 
> This series applies on top of:
> 
>   [PATCH 0/6] net/tap: add additional management ops
> 
> v2 changes:
>   - support compilation on kernels < 4.2 (where flower support
> appeared)
>   - set whitespaces in tap.h
>   - remove unnecessary goto
> 
> Pascal Mazon (4):
>   net/tap: move private elements to external header
>   net/tap: add preliminary support for rte_flow
>   net/tap: add netlink back-end for flow API
>   net/tap: add basic flow API patterns and actions
> 
>  doc/guides/nics/features/tap.ini |    1 +
>  drivers/net/tap/Makefile         |   44 ++
>  drivers/net/tap/rte_eth_tap.c    |   94 ++--
>  drivers/net/tap/tap.h            |   77 +++
>  drivers/net/tap/tap_flow.c       | 1084
> ++++++++++++++++++++++++++++++++++++++
> drivers/net/tap/tap_flow.h       |   58 ++
> drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
> drivers/net/tap/tap_netlink.h    |   69 +++
> drivers/net/tap/tap_tcmsgs.c     |  378 +++++++++++++
> drivers/net/tap/tap_tcmsgs.h     |   63 +++ 10 files changed, 2202
> insertions(+), 33 deletions(-) create mode 100644
> drivers/net/tap/tap.h create mode 100644 drivers/net/tap/tap_flow.c
>  create mode 100644 drivers/net/tap/tap_flow.h
>  create mode 100644 drivers/net/tap/tap_netlink.c
>  create mode 100644 drivers/net/tap/tap_netlink.h
>  create mode 100644 drivers/net/tap/tap_tcmsgs.c
>  create mode 100644 drivers/net/tap/tap_tcmsgs.h
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/4] net/tap: support flow API
  2017-03-07 15:05   ` [dpdk-dev] [PATCH v2 0/4] net/tap: support flow API Pascal Mazon
@ 2017-03-07 15:08     ` Wiles, Keith
  0 siblings, 0 replies; 57+ messages in thread
From: Wiles, Keith @ 2017-03-07 15:08 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 7, 2017, at 9:05 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> Hi Keith,
> 
> I'm working on a v3 for that series:
> 
> - I added info regarding flow API support in tap.rst doc
> - I fixed support for kernels where flower/vlan was not supported.
> 
> Do you have any other remarks, or can I send the v3 (hopefully ok for
> integration)?
> 
> I'll send a v2 for the latest series (introducing remote capture),
> because struct pmd_internals changed (whitespaces).
> Same question there, do you have remarks?

I think I am ok with everything we discussed and you can send your v2/v3 when you want.

> 
> Thank you.
> 
> Best regards,
> Pascal
> 
> On Mon,  6 Mar 2017 18:05:26 +0100
> Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
>> This series add support for the flow API in tap PMD.
>> 
>> It enables filtering specific packets incoming on the tap netdevice,
>> to process only desired ones. Under the hood, it uses kernel TC
>> (traffic control), which takes place very early in the stack, and
>> supports most common pattern items and actions defined in the flow
>> API.
>> 
>> This series applies on top of:
>> 
>>  [PATCH 0/6] net/tap: add additional management ops
>> 
>> v2 changes:
>>  - support compilation on kernels < 4.2 (where flower support
>> appeared)
>>  - set whitespaces in tap.h
>>  - remove unnecessary goto
>> 
>> Pascal Mazon (4):
>>  net/tap: move private elements to external header
>>  net/tap: add preliminary support for rte_flow
>>  net/tap: add netlink back-end for flow API
>>  net/tap: add basic flow API patterns and actions
>> 
>> doc/guides/nics/features/tap.ini |    1 +
>> drivers/net/tap/Makefile         |   44 ++
>> drivers/net/tap/rte_eth_tap.c    |   94 ++--
>> drivers/net/tap/tap.h            |   77 +++
>> drivers/net/tap/tap_flow.c       | 1084
>> ++++++++++++++++++++++++++++++++++++++
>> drivers/net/tap/tap_flow.h       |   58 ++
>> drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
>> drivers/net/tap/tap_netlink.h    |   69 +++
>> drivers/net/tap/tap_tcmsgs.c     |  378 +++++++++++++
>> drivers/net/tap/tap_tcmsgs.h     |   63 +++ 10 files changed, 2202
>> insertions(+), 33 deletions(-) create mode 100644
>> drivers/net/tap/tap.h create mode 100644 drivers/net/tap/tap_flow.c
>> create mode 100644 drivers/net/tap/tap_flow.h
>> create mode 100644 drivers/net/tap/tap_netlink.c
>> create mode 100644 drivers/net/tap/tap_netlink.h
>> create mode 100644 drivers/net/tap/tap_tcmsgs.c
>> create mode 100644 drivers/net/tap/tap_tcmsgs.h
>> 
> 

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v3 0/4] net/tap: support flow API
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
                     ` (4 preceding siblings ...)
  2017-03-07 15:05   ` [dpdk-dev] [PATCH v2 0/4] net/tap: support flow API Pascal Mazon
@ 2017-03-07 16:35   ` Pascal Mazon
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 1/4] net/tap: move private elements to external header Pascal Mazon
                       ` (3 more replies)
  2017-03-14  8:29   ` [dpdk-dev] [PATCH v4 0/4] net/tap: support flow API Pascal Mazon
                     ` (2 subsequent siblings)
  8 siblings, 4 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-07 16:35 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

This series adds support for the flow API in the tap PMD.

It enables filtering specific packets incoming on the tap netdevice, to
process only desired ones. Under the hood, it uses kernel TC (traffic
control), which takes place very early in the stack, and supports most
common pattern items and actions defined in the flow API.

This series applies on top of:

  [PATCH 0/6] net/tap: add additional management ops

v2 changes:
  - support compilation on kernels < 4.2 (where flower support appeared)
  - set whitespaces in tap.h
  - remove unnecessary goto

v3 changes:
  - vlan patterns enabled depending on running kernel (4.9+)
  - update doc/guides/nics/tap.rst for Flow API support
  - rebase on top of "net/tap: add additional management ops" series

Pascal Mazon (4):
  net/tap: move private elements to external header
  net/tap: add preliminary support for rte_flow
  net/tap: add netlink back-end for flow API
  net/tap: add basic flow API patterns and actions

 doc/guides/nics/features/tap.ini |    1 +
 doc/guides/nics/tap.rst          |   23 +
 drivers/net/tap/Makefile         |   44 ++
 drivers/net/tap/rte_eth_tap.c    |   96 ++--
 drivers/net/tap/tap.h            |   78 +++
 drivers/net/tap/tap_flow.c       | 1078 ++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |   58 ++
 drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
 drivers/net/tap/tap_netlink.h    |   69 +++
 drivers/net/tap/tap_tcmsgs.c     |  378 +++++++++++++
 drivers/net/tap/tap_tcmsgs.h     |   63 +++
 11 files changed, 2223 insertions(+), 32 deletions(-)
 create mode 100644 drivers/net/tap/tap.h
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v3 1/4] net/tap: move private elements to external header
  2017-03-07 16:35   ` [dpdk-dev] [PATCH v3 " Pascal Mazon
@ 2017-03-07 16:35     ` Pascal Mazon
  2017-03-09 15:28       ` Ferruh Yigit
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-07 16:35 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

In the next patch, access to struct pmd_internals will be necessary in
tap_flow.c to store the flows.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |  1 +
 drivers/net/tap/rte_eth_tap.c | 34 ++------------------
 drivers/net/tap/tap.h         | 73 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 32 deletions(-)
 create mode 100644 drivers/net/tap/tap.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index e18f30c56f52..bdbe69e62a4e 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
 LIBABIVER := 1
 
 CFLAGS += -O3
+CFLAGS += -I$(SRCDIR)
 CFLAGS += $(WERROR_FLAGS)
 
 #
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index cbfb3b9641c8..308519d265cb 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -51,6 +51,8 @@
 #include <linux/if_ether.h>
 #include <fcntl.h>
 
+#include <tap.h>
+
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
 #define DEFAULT_TAP_NAME        "dtap"
@@ -83,38 +85,6 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
-struct pkt_stats {
-	uint64_t opackets;		/* Number of output packets */
-	uint64_t ipackets;		/* Number of input packets */
-	uint64_t obytes;		/* Number of bytes on output */
-	uint64_t ibytes;		/* Number of bytes on input */
-	uint64_t errs;			/* Number of error packets */
-};
-
-struct rx_queue {
-	struct rte_mempool *mp;		/* Mempool for RX packets */
-	uint16_t in_port;		/* Port ID */
-	int fd;
-
-	struct pkt_stats stats;		/* Stats for this RX queue */
-};
-
-struct tx_queue {
-	int fd;
-	struct pkt_stats stats;		/* Stats for this TX queue */
-};
-
-struct pmd_internals {
-	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
-	uint16_t nb_queues;		/* Number of queues supported */
-	struct ether_addr eth_addr;	/* Mac address of the device port */
-
-	int if_index;			/* IF_INDEX for the port */
-
-	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
-	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
-};
-
 /* Tun/Tap allocation routine
  *
  * name is the number of the interface to use, unless NULL to take the host
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
new file mode 100644
index 000000000000..abd1795b2a43
--- /dev/null
+++ b/drivers/net/tap/tap.h
@@ -0,0 +1,73 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_H_
+#define _TAP_H_
+
+#include <inttypes.h>
+
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+
+#define RTE_PMD_TAP_MAX_QUEUES 16
+
+struct pkt_stats {
+	uint64_t opackets; /* Number of output packets */
+	uint64_t ipackets; /* Number of input packets */
+	uint64_t obytes; /* Number of bytes on output */
+	uint64_t ibytes; /* Number of bytes on input */
+	uint64_t errs; /* Number of error packets */
+};
+
+struct rx_queue {
+	struct rte_mempool *mp; /* Mempool for RX packets */
+	uint16_t in_port; /* Port ID */
+	int fd;
+	struct pkt_stats stats; /* Stats for this RX queue */
+};
+
+struct tx_queue {
+	int fd;
+	struct pkt_stats stats; /* Stats for this TX queue */
+};
+
+struct pmd_internals {
+	char name[RTE_ETH_NAME_MAX_LEN];  /* Internal Tap device name */
+	uint16_t nb_queues;               /* Number of queues supported */
+	struct ether_addr eth_addr;       /* Mac address of the device port */
+	int if_index;                     /* IF_INDEX for the port */
+	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
+	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
+};
+
+#endif /* _TAP_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v3 2/4] net/tap: add preliminary support for rte_flow
  2017-03-07 16:35   ` [dpdk-dev] [PATCH v3 " Pascal Mazon
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-07 16:35     ` Pascal Mazon
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
  3 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-07 16:35 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

The flow API provides the ability to classify packets received by a tap
netdevice.

This patch only implements skeleton functions for flow API support; no
patterns are supported yet.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/features/tap.ini |   1 +
 drivers/net/tap/Makefile         |   1 +
 drivers/net/tap/rte_eth_tap.c    |   6 ++
 drivers/net/tap/tap.h            |   2 +
 drivers/net/tap/tap_flow.c       | 185 +++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |  46 ++++++++++
 6 files changed, 241 insertions(+)
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h

diff --git a/doc/guides/nics/features/tap.ini b/doc/guides/nics/features/tap.ini
index a51712dce066..9d73f61cca3b 100644
--- a/doc/guides/nics/features/tap.ini
+++ b/doc/guides/nics/features/tap.ini
@@ -9,6 +9,7 @@ Jumbo frame          = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Basic stats          = Y
+Flow API             = Y
 MTU update           = Y
 Multicast MAC filter = Y
 Speed capabilities   = Y
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index bdbe69e62a4e..386b8b0594d3 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -47,6 +47,7 @@ CFLAGS += $(WERROR_FLAGS)
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 308519d265cb..d4f2ff72e8b4 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -52,6 +52,7 @@
 #include <fcntl.h>
 
 #include <tap.h>
+#include <tap_flow.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -797,6 +798,7 @@ static const struct eth_dev_ops ops = {
 	.stats_get              = tap_stats_get,
 	.stats_reset            = tap_stats_reset,
 	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
+	.filter_ctrl            = tap_dev_filter_ctrl,
 };
 
 static int
@@ -860,6 +862,8 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	LIST_INIT(&pmd->flows);
+
 	return 0;
 
 error_exit:
@@ -972,6 +976,8 @@ rte_pmd_tap_remove(const char *name)
 	if (!eth_dev)
 		return 0;
 
+	tap_flow_flush(eth_dev, NULL);
+
 	internals = eth_dev->data->dev_private;
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
index abd1795b2a43..60c0c58c7c1a 100644
--- a/drivers/net/tap/tap.h
+++ b/drivers/net/tap/tap.h
@@ -34,6 +34,7 @@
 #ifndef _TAP_H_
 #define _TAP_H_
 
+#include <sys/queue.h>
 #include <inttypes.h>
 
 #include <rte_ethdev.h>
@@ -66,6 +67,7 @@ struct pmd_internals {
 	uint16_t nb_queues;               /* Number of queues supported */
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
+	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
 };
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
new file mode 100644
index 000000000000..de41c127c920
--- /dev/null
+++ b/drivers/net/tap/tap_flow.c
@@ -0,0 +1,185 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+
+#include <rte_malloc.h>
+#include <tap_flow.h>
+#include <tap.h>
+
+struct rte_flow {
+	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+};
+
+static int
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error);
+
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error);
+
+static int
+tap_flow_destroy(struct rte_eth_dev *dev,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error);
+
+static const struct rte_flow_ops tap_flow_ops = {
+	.validate = tap_flow_validate,
+	.create = tap_flow_create,
+	.destroy = tap_flow_destroy,
+	.flush = tap_flow_flush,
+};
+
+/**
+ * Validate a flow.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
+		  const struct rte_flow_attr *attr __rte_unused,
+		  const struct rte_flow_item items[] __rte_unused,
+		  const struct rte_flow_action actions[] __rte_unused,
+		  struct rte_flow_error *error)
+{
+	return -rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL, "not implemented yet");
+}
+
+/**
+ * Create a flow.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow = NULL;
+
+	if (tap_flow_validate(dev, attr, items, actions, error))
+		return NULL;
+	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+	if (!flow) {
+		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "cannot allocate memory for rte_flow");
+		return NULL;
+	}
+	LIST_INSERT_HEAD(&pmd->flows, flow, next);
+	return flow;
+}
+
+/**
+ * Destroy a flow.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error __rte_unused)
+{
+	LIST_REMOVE(flow, next);
+	rte_free(flow);
+	return 0;
+}
+
+/**
+ * Destroy all flows.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow;
+
+	while (!LIST_EMPTY(&pmd->flows)) {
+		flow = LIST_FIRST(&pmd->flows);
+		if (tap_flow_destroy(dev, flow, error) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+		    enum rte_filter_type filter_type,
+		    enum rte_filter_op filter_op,
+		    void *arg)
+{
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			return -EINVAL;
+		*(const void **)arg = &tap_flow_ops;
+		return 0;
+	default:
+		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
+			(void *)dev, filter_type);
+	}
+	return -EINVAL;
+}
+
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
new file mode 100644
index 000000000000..377a9f7b758a
--- /dev/null
+++ b/drivers/net/tap/tap_flow.h
@@ -0,0 +1,46 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_FLOW_H_
+#define _TAP_FLOW_H_
+
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+
+int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+			enum rte_filter_type filter_type,
+			enum rte_filter_op filter_op,
+			void *arg);
+int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
+
+#endif /* _TAP_FLOW_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v3 3/4] net/tap: add netlink back-end for flow API
  2017-03-07 16:35   ` [dpdk-dev] [PATCH v3 " Pascal Mazon
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 1/4] net/tap: move private elements to external header Pascal Mazon
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
@ 2017-03-07 16:35     ` Pascal Mazon
  2017-03-09 15:29       ` Ferruh Yigit
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
  3 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-07 16:35 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

Inside tap_netlink.c, functions are generic for any netlink messaging.
Inside tap_tcmsgs.c, functions are specific to dealing with TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +
 drivers/net/tap/tap_netlink.c | 367 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_netlink.h |  69 ++++++++
 drivers/net/tap/tap_tcmsgs.c  | 378 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_tcmsgs.h  |  63 +++++++
 5 files changed, 879 insertions(+)
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 386b8b0594d3..4ae2ca6cfbab 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/tap_netlink.c b/drivers/net/tap/tap_netlink.c
new file mode 100644
index 000000000000..10f00d1931c6
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <tap_netlink.h>
+#include <rte_random.h>
+
+/* Must be quite large to support dumping a huge list of QDISC or filters. */
+#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
+#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */
+#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */
+
+/*
+ * Bookkeeping entry for a nested attribute that has been started but not yet
+ * finished. nlattr_nested_start() pushes an entry on the message's
+ * nested_tails stack; nlattr_nested_finish() pops and frees it.
+ */
+struct nested_tail {
+	struct rtattr *tail; /* where the nested attribute begins in the msg */
+	struct nested_tail *prev; /* entry that was on top before this one */
+};
+
+/**
+ * Initialize a netlink socket for communicating with the kernel.
+ *
+ * The socket is created with SOCK_CLOEXEC for the NETLINK_ROUTE protocol, its
+ * send and receive buffers are sized to SNDBUF_SIZE and RCVBUF_SIZE, and it is
+ * bound to an AF_NETLINK address.
+ *
+ * @return
+ *   netlink socket file descriptor on success, -1 otherwise. When -1 is
+ *   returned, the socket (if it had been created) has been closed again.
+ */
+int
+nl_init(void)
+{
+	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
+	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
+
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (fd < 0) {
+		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size,
+		       sizeof(sndbuf_size))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
+		goto error;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size,
+		       sizeof(rcvbuf_size))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
+		goto error;
+	}
+	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
+		goto error;
+	}
+	return fd;
+error:
+	/* Do not leak the descriptor when a later setup step fails. */
+	close(fd);
+	return -1;
+}
+
+/**
+ * Clean up a netlink socket once all communication with the kernel is
+ * finished.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 if the socket was closed successfully, -1 otherwise (the close() error
+ *   is logged).
+ */
+int
+nl_final(int nlsk_fd)
+{
+	if (close(nlsk_fd)) {
+		RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
+			strerror(errno), errno);
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * Send a message to the kernel on the netlink socket.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in, out] nh
+ *   The netlink message to send to the kernel. nh->nlmsg_len bytes are sent.
+ *   Its nlmsg_pid field is reset to 0 and its nlmsg_seq field is overwritten
+ *   with a random value before sending.
+ *
+ * @return
+ *   the number of sent bytes on success, -1 otherwise.
+ */
+int
+nl_send(int nlsk_fd, struct nlmsghdr *nh)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa = {
+		.nl_family = AF_NETLINK,
+	};
+	struct iovec iov = {
+		.iov_base = nh,
+		.iov_len = nh->nlmsg_len,
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int send_bytes;
+
+	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+	nh->nlmsg_seq = (uint32_t)rte_rand();
+	send_bytes = sendmsg(nlsk_fd, &msg, 0);
+	if (send_bytes < 0) {
+		RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
+			strerror(errno), errno);
+		return -1;
+	}
+	return send_bytes;
+}
+
+/**
+ * Check that the kernel sends an appropriate ACK in response to an nl_send().
+ *
+ * This is nl_recv() without a callback: the answer is read and only checked
+ * for error messages.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 if receiving failed or the kernel reported an error.
+ */
+int
+nl_recv_ack(int nlsk_fd)
+{
+	return nl_recv(nlsk_fd, NULL, NULL);
+}
+
+/**
+ * Receive a message from the kernel on the netlink socket, following an
+ * nl_send().
+ *
+ * Every netlink message found in the received data, except NLMSG_DONE, is
+ * handed to the callback (this includes NLMSG_ERROR messages, i.e. ACKs).
+ * When a message carries the NLM_F_MULTI flag, the socket is read again until
+ * an NLMSG_DONE message has been seen.
+ *
+ * NOTE(review): the sequence number of the answer is not compared with the one
+ * set by nl_send() - confirm this is intended.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] cb
+ *   The callback function to call for each netlink message received. May be
+ *   NULL, in which case messages are only checked for errors.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, -1 if recvmsg() failed, if the kernel sent an NLMSG_ERROR
+ *   message with a non-zero error code, or if the callback returned a negative
+ *   value for any message.
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int recv_bytes = 0, done = 0, multipart = 0, error = 0;
+
+read:
+	recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+	if (recv_bytes < 0)
+		return -1;
+	for (nh = (struct nlmsghdr *)buf;
+	     NLMSG_OK(nh, (unsigned int)recv_bytes);
+	     nh = NLMSG_NEXT(nh, recv_bytes)) {
+		/*
+		 * Multi-part messages and their following DONE message have the
+		 * NLM_F_MULTI flag set. Make note, in order to read the DONE
+		 * message afterwards.
+		 */
+		if (nh->nlmsg_flags & NLM_F_MULTI)
+			multipart = 1;
+		if (nh->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+			if (err_data->error == 0)
+				RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n",
+					__func__);
+			else {
+				RTE_LOG(DEBUG, PMD,
+					"%s() error message recvd\n", __func__);
+				error = 1;
+			}
+		}
+		/* The end of multipart message. */
+		if (nh->nlmsg_type == NLMSG_DONE)
+			/* No need to call the callback for a DONE message. */
+			done = 1;
+		else if (cb)
+			if (cb(nh, arg) < 0)
+				error = 1;
+	}
+	if (multipart && !done)
+		goto read;
+	if (error)
+		return -1;
+	return 0;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * The attribute is written at the current (aligned) end of the message and
+ * nh->nlmsg_len is grown accordingly. No bounds check is done here: the caller
+ * must make sure the buffer behind nh has enough room left.
+ *
+ * @param[in, out] nh
+ *   The netlink message being built, to which the attribute is appended.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data_len
+ *   The length of the data to append. May be 0 for an attribute without
+ *   payload.
+ * @param[in] data
+ *   The data to append. May be NULL when data_len is 0.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta;
+
+	rta = (struct rtattr *)NLMSG_TAIL(nh);
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	/*
+	 * Payload-less attributes (such as the header of a nested attribute)
+	 * are added with data == NULL; memcpy() must not be handed a null
+	 * pointer, even for a zero length.
+	 */
+	if (data_len)
+		memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message being built, to which the attribute is appended.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(uint8_t), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * The value is copied as stored in memory; no byte-order conversion is done
+ * here.
+ *
+ * @param[in, out] nh
+ *   The netlink message being built, to which the attribute is appended.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(uint16_t), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * The value is copied as stored in memory; no byte-order conversion is done
+ * here.
+ *
+ * @param[in, out] nh
+ *   The netlink message being built, to which the attribute is appended.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(uint32_t), &data);
+}
+
+/**
+ * Open a nested netlink attribute.
+ * Every successful call must later be matched by nlattr_nested_finish(),
+ * which fixes up the length of the attribute opened here.
+ *
+ * @param[in, out] msg
+ *   The netlink message to append to; its nested_tails metadata is updated.
+ * @param[in] type
+ *   The type of the nested attribute to append.
+ *
+ * @return
+ *   0 on success, -1 if the bookkeeping entry could not be allocated (nothing
+ *   is appended to the message in that case).
+ */
+int
+nlattr_nested_start(struct nlmsg *msg, uint16_t type)
+{
+	struct nested_tail *entry = rte_zmalloc(NULL, sizeof(*entry), 0);
+
+	if (entry == NULL) {
+		RTE_LOG(ERR, PMD,
+			"Couldn't allocate memory for nested netlink"
+			" attribute\n");
+		return -1;
+	}
+	/* Record where the nested attribute begins before appending it. */
+	entry->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
+	entry->prev = msg->nested_tails;
+	/* Header only, no payload: the length is fixed up on finish. */
+	nlattr_add(&msg->nh, type, 0, NULL);
+	/* The new entry becomes the innermost open attribute. */
+	msg->nested_tails = entry;
+	return 0;
+}
+
+/**
+ * End a nested netlink attribute.
+ * It follows a call to nlattr_nested_start().
+ * In effect, it will modify the nested attribute length to include every byte
+ * from the nested attribute start, up to here. The bookkeeping entry of the
+ * innermost open attribute is then removed from the message and freed.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ */
+void
+nlattr_nested_finish(struct nlmsg *msg)
+{
+	struct nested_tail *tail = msg->nested_tails;
+
+	/* No attribute is open: nothing to close. */
+	if (!tail)
+		return;
+	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;
+	/*
+	 * Always pop, including when this was the outermost attribute, so that
+	 * msg->nested_tails never keeps pointing at the entry freed below.
+	 */
+	msg->nested_tails = tail->prev;
+	rte_free(tail);
+}
diff --git a/drivers/net/tap/tap_netlink.h b/drivers/net/tap/tap_netlink.h
new file mode 100644
index 000000000000..52ba8c030dcc
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_NETLINK_H_
+#define _TAP_NETLINK_H_
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <stdio.h>
+
+#include <rte_log.h>
+
+/* Size, in bytes, of the buf[] area of struct nlmsg. */
+#define NLMSG_BUF 512
+
+/*
+ * A netlink TC message: the netlink header, followed by the TC header,
+ * followed by a fixed-size area. Attributes are appended with nlattr_add() at
+ * NLMSG_TAIL(&nh), which presumably falls inside buf[] - nothing checks that
+ * the area is not overrun.
+ */
+struct nlmsg {
+	struct nlmsghdr nh;
+	struct tcmsg t;
+	char buf[NLMSG_BUF];
+	/* innermost nested attribute currently open (see nlattr_nested_*()) */
+	struct nested_tail *nested_tails;
+};
+
+/* Address located nlmsg_len bytes (rounded up by NLMSG_ALIGN) after nlh. */
+#define NLMSG_TAIL(nlh) (void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len))
+
+int nl_init(void);
+int nl_final(int nlsk_fd);
+int nl_send(int nlsk_fd, struct nlmsghdr *nh);
+int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg);
+int nl_recv_ack(int nlsk_fd);
+void nlattr_add(struct nlmsghdr *nh, unsigned short type,
+		unsigned int data_len, const void *data);
+void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data);
+void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data);
+void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data);
+int nlattr_nested_start(struct nlmsg *msg, uint16_t type);
+void nlattr_nested_finish(struct nlmsg *msg);
+
+#endif /* _TAP_NETLINK_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.c b/drivers/net/tap/tap_tcmsgs.c
new file mode 100644
index 000000000000..9a146d165b08
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.c
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <net/if.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <tap_tcmsgs.h>
+
+/* Identifies a QDISC on a netdevice; copied into tcm_handle and tcm_parent. */
+struct qdisc {
+	uint32_t handle;
+	uint32_t parent;
+};
+
+/* Argument handed by qdisc_iterate() to its per-message callback. */
+struct list_args {
+	int nlsk_fd; /* netlink socket the dump request was sent on */
+	uint16_t ifindex; /* netdevice the caller is interested in */
+	void *custom_arg; /* callback-specific data, may be NULL */
+};
+
+/* Callback-specific data for qdisc_exist_cb(). */
+struct qdisc_custom_arg {
+	uint32_t handle; /* handle of the QDISC looked for */
+	uint32_t parent; /* parent of the QDISC looked for */
+	uint8_t exists; /* incremented for each matching QDISC found */
+};
+
+/**
+ * Initialize a netlink message with a TC header.
+ *
+ * The message length is set to cover the netlink and TC headers only, the TC
+ * header is zeroed apart from its family and ifindex, and no nested attribute
+ * is considered open. Callers then fill in the remaining TC fields (handle,
+ * parent, ...) and append attributes.
+ *
+ * @param[in, out] msg
+ *   The netlink message to fill.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the rule will be applied.
+ * @param[in] type
+ *   The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.).
+ * @param[in] flags
+ *   Overrides the default netlink flags for this msg with those specified.
+ *   When 0, NLM_F_REQUEST | NLM_F_ACK is used.
+ */
+void
+tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags)
+{
+	struct nlmsghdr *n = &msg->nh;
+
+	n->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	n->nlmsg_type = type;
+	if (flags)
+		n->nlmsg_flags = flags;
+	else
+		n->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	/*
+	 * Messages are usually declared on the stack: zero every TC header
+	 * field not set explicitly, so that no uninitialized bytes are sent to
+	 * the kernel when the caller leaves some of them alone.
+	 */
+	msg->t = (struct tcmsg){
+		.tcm_family = AF_UNSPEC,
+		.tcm_ifindex = ifindex,
+	};
+	/* A fresh message has no nested attribute open. */
+	msg->nested_tails = NULL;
+}
+
+/**
+ * Delete a specific QDISC identified by its iface, and its handle and parent.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication. When 0, a
+ *   temporary socket is created for the request and closed before returning.
+ * @param[in] ifindex
+ *   The netdevice ifindex on which the deletion will happen.
+ * @param[in] qinfo
+ *   Additional info to identify the QDISC (handle and parent).
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+static int
+qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo)
+{
+	struct nlmsg msg;
+	int fd = 0;
+	int ret = -1;
+
+	tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0);
+	msg.t.tcm_handle = qinfo->handle;
+	msg.t.tcm_parent = qinfo->parent;
+	/* if no netlink socket is provided, create one */
+	if (!nlsk_fd) {
+		fd = nl_init();
+		if (fd < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not delete QDISC: null netlink socket\n");
+			return -1;
+		}
+	} else {
+		fd = nlsk_fd;
+	}
+	if (nl_send(fd, &msg.nh) >= 0 && nl_recv_ack(fd) >= 0)
+		ret = 0;
+	/*
+	 * Close the temporary socket on every path, not only on success;
+	 * a failure to close it is reported as a failure of the deletion.
+	 */
+	if (!nlsk_fd && nl_final(fd) < 0)
+		ret = -1;
+	return ret;
+}
+
+/**
+ * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle.
+ *
+ * The QDISC is requested as root QDISC of the netdevice, with the
+ * NLM_F_EXCL | NLM_F_CREATE flags.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Sent as is to the kernel: must not hold uninitialized stack data. */
+	struct tc_multiq_qopt opt = { 0 };
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	msg.t.tcm_parent = TC_H_ROOT;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq");
+	nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Add the ingress QDISC with default ffff: handle.
+ *
+ * The request is sent with the NLM_F_EXCL | NLM_F_CREATE flags and its
+ * acknowledgment is waited for.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the QDISC will be added.
+ *
+ * @return
+ *   0 once the kernel acknowledged the request, -1 if sending it failed or
+ *   if no positive acknowledgment was received.
+ */
+int
+qdisc_add_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	const uint16_t flags =
+		NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
+	struct nlmsg request;
+
+	tc_init_msg(&request, ifindex, RTM_NEWQDISC, flags);
+	request.t.tcm_parent = TC_H_INGRESS;
+	request.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	nlattr_add(&request.nh, TCA_KIND, sizeof("ingress"), "ingress");
+	/* The ACK is only waited for when the request went out. */
+	if (nl_send(nlsk_fd, &request.nh) < 0 || nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Callback function to check for QDISC existence.
+ * The "exists" counter of the custom arg is incremented when the message
+ * describes a QDISC of the requested netdevice whose handle and parent both
+ * match the ones looked for.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in, out] arg
+ *   Custom arguments for the callback (a struct list_args whose custom_arg
+ *   points to a struct qdisc_custom_arg).
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_exist_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *request = arg;
+	struct qdisc_custom_arg *wanted = request->custom_arg;
+	struct tcmsg *tc = NLMSG_DATA(nh);
+
+	/* Same iface, same handle and same parent: that is the one. */
+	if (request->ifindex == (unsigned int)tc->tcm_ifindex &&
+	    tc->tcm_handle == wanted->handle &&
+	    tc->tcm_parent == wanted->parent)
+		wanted->exists++;
+	return 0;
+}
+
+/**
+ * Callback function to delete a QDISC.
+ * QDISCs that belong to another netdevice than the requested one are left
+ * untouched.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in] arg
+ *   Custom arguments for the callback (a struct list_args).
+ *
+ * @return
+ *   0 if the QDISC was skipped or deleted, -1 if its deletion failed.
+ */
+static int
+qdisc_del_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *request = arg;
+	struct tcmsg *tc = NLMSG_DATA(nh);
+	struct qdisc target;
+
+	/* filter out other ifaces' qdiscs */
+	if (request->ifindex != (unsigned int)tc->tcm_ifindex)
+		return 0;
+	target.handle = tc->tcm_handle;
+	target.parent = tc->tcm_parent;
+	/*
+	 * Use another nlsk_fd (0) to avoid tampering with the current list
+	 * iteration.
+	 */
+	return qdisc_del(0, request->ifindex, &target);
+}
+
+/**
+ * Dump the QDISCs and run the callback() function on each answer message.
+ *
+ * The callback receives a struct list_args holding the socket, the ifindex
+ * and the caller-provided arg; filtering on the ifindex is up to the callback.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ * @param[in] callback
+ *   The function to call for each QDISC.
+ * @param[in, out] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   -1 if either sending the netlink message failed, or if receiving the answer
+ *   failed, or finally if the callback returned a negative value for that
+ *   answer.
+ *   0 is returned otherwise.
+ */
+static int
+qdisc_iterate(int nlsk_fd, uint16_t ifindex,
+	      int (*callback)(struct nlmsghdr *, void *), void *arg)
+{
+	struct list_args cb_args = {
+		.nlsk_fd = nlsk_fd,
+		.ifindex = ifindex,
+		.custom_arg = arg,
+	};
+	struct nlmsg request;
+
+	tc_init_msg(&request, ifindex, RTM_GETQDISC,
+		    NLM_F_REQUEST | NLM_F_DUMP);
+	if (nl_send(nlsk_fd, &request.nh) < 0)
+		return -1;
+	return nl_recv(nlsk_fd, callback, &cb_args) < 0 ? -1 : 0;
+}
+
+/**
+ * Check whether a given QDISC already exists for the netdevice.
+ *
+ * A failure to list the QDISCs is not reported: the QDISC is then considered
+ * missing, unless a match had already been found.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex to check QDISC existence for.
+ * @param[in] handle
+ *   The handle of the QDISC to look for.
+ * @param[in] parent
+ *   The parent of the QDISC to look for.
+ *
+ * @return
+ *   1 if the qdisc exists, 0 otherwise.
+ */
+int
+qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle, uint32_t parent)
+{
+	struct qdisc_custom_arg lookup = {
+		.handle = handle,
+		.parent = parent,
+		.exists = 0,
+	};
+
+	/* The outcome is read from the counter, not from the return value. */
+	(void)qdisc_iterate(nlsk_fd, ifindex, qdisc_exist_cb, &lookup);
+	return lookup.exists != 0;
+}
+
+/**
+ * Delete all QDISCs for a given netdevice.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ *
+ * @return
+ *   -1 if listing the QDISCs failed or if deleting one of them failed,
+ *   0 otherwise.
+ */
+int
+qdisc_flush(int nlsk_fd, uint16_t ifindex)
+{
+	return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL);
+}
+
+/**
+ * Make sure the multiqueue QDISC is present, adding it when it is missing.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   0 if the qdisc already exists or has been successfully added,
+ *   -1 otherwise.
+ */
+int
+qdisc_create_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Already there: nothing to do. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0), TC_H_ROOT))
+		return 0;
+	if (qdisc_add_multiq(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add multiq qdisc\n");
+	return -1;
+}
+
+/**
+ * Make sure the ingress QDISC is present, adding it when it is missing.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the ingress QDISC.
+ *
+ * @return
+ *   0 if the qdisc already exists or has been successfully added,
+ *   -1 otherwise.
+ */
+int
+qdisc_create_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	/* Already there: nothing to do. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS))
+		return 0;
+	if (qdisc_add_ingress(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add ingress qdisc\n");
+	return -1;
+}
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
new file mode 100644
index 000000000000..a571a56d6964
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -0,0 +1,63 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+/* Major part of the handle used for the multiqueue QDISC. */
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle,
+		 uint32_t parent);
+/*
+ * NOTE(review): qdisc_list() and filter_list_ingress() are declared in this
+ * header, but no definition of them appears in tap_tcmsgs.c as added by this
+ * patch - confirm they are provided elsewhere, or drop the declarations.
+ */
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v3 4/4] net/tap: add basic flow API patterns and actions
  2017-03-07 16:35   ` [dpdk-dev] [PATCH v3 " Pascal Mazon
                       ` (2 preceding siblings ...)
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-07 16:35     ` Pascal Mazon
  3 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-07 16:35 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Supported flow rules are now mapped to TC rules on the tap netdevice.
The netlink message used for creating the TC rule is stored in struct
rte_flow. That way, by simply changing some metadata in it, we can request
the rule's deletion without further parsing.

Supported items:
- eth: src and dst (with variable masks), and eth_type (0xffff mask).
- vlan: vid, pcp, tpid, but not eid.
- ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
- udp/tcp: src and dst port (0xffff mask).

Supported actions:
- DROP
- QUEUE
- PASSTHRU

It is generally not possible to provide a "last" item. However, if the
"last" item, once masked, is identical to the masked spec, then it is
supported.

Only IPv4/6 and MAC addresses can use a variable mask. All other
items need a full mask (exact match).

Support for VLAN requires kernel headers >= 4.9, checked using
auto-config.sh.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/tap.rst       |  23 ++
 drivers/net/tap/Makefile      |  40 ++
 drivers/net/tap/rte_eth_tap.c |  58 ++-
 drivers/net/tap/tap.h         |   3 +
 drivers/net/tap/tap_flow.c    | 919 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/tap/tap_flow.h    |  12 +
 6 files changed, 1041 insertions(+), 14 deletions(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c4f207be3b47..cdb528b5eae4 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -82,6 +82,29 @@ can utilize that stack to handle the network protocols. Plus you would be able
 to address the interface using an IP address assigned to the internal
 interface.
 
+Flow API support
+----------------
+
+The tap PMD supports major flow API pattern items and actions, when running on
+linux kernels above 4.2 ("Flower" classifier required). Supported items:
+
+- eth: src and dst (with variable masks), and eth_type (0xffff mask).
+- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
+- udp/tcp: src and dst port (0xffff mask).
+
+Supported actions:
+
+- DROP
+- QUEUE
+- PASSTHRU
+
+It is generally not possible to provide a "last" item. However, if the "last"
+item, once masked, is identical to the masked spec, then it is supported.
+
+Only IPv4/6 and MAC addresses can use a variable mask. All other items need a
+full mask (exact match).
+
 Example
 -------
 
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 4ae2ca6cfbab..a6542dad1d66 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -41,6 +41,7 @@ LIBABIVER := 1
 
 CFLAGS += -O3
 CFLAGS += -I$(SRCDIR)
+CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 
 #
@@ -57,5 +58,44 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mbuf
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mempool
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_ether
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_kvargs
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_hash
 
 include $(RTE_SDK)/mk/rte.lib.mk
+
+# Generate and clean-up tap_autoconf.h.
+
+export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS
+export AUTO_CONFIG_CFLAGS = -Wno-error
+
+ifndef V
+AUTOCONF_OUTPUT := >/dev/null
+endif
+
+tap_autoconf.h.new: FORCE
+
+tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
+	$Q $(RM) -f -- '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_FLOWER \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_UNSPEC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_VLAN_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_VLAN_PRIO \
+		$(AUTOCONF_OUTPUT)
+
+# Create tap_autoconf.h or update it in case it differs from the new one.
+
+tap_autoconf.h: tap_autoconf.h.new
+	$Q [ -f '$@' ] && \
+		cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \
+		mv '$<' '$@'
+
+$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h
+
+clean_tap: FORCE
+	$Q rm -f -- tap_autoconf.h tap_autoconf.h.new
+
+clean: clean_tap
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index d4f2ff72e8b4..5727f6228b17 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -42,17 +42,20 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
+#include <sys/utsname.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include <poll.h>
 #include <arpa/inet.h>
-#include <linux/if.h>
+#include <net/if.h>
 #include <linux/if_tun.h>
 #include <linux/if_ether.h>
+#include <linux/version.h>
 #include <fcntl.h>
 
 #include <tap.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -86,6 +89,9 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
+#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
+#define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
+
 /* Tun/Tap allocation routine
  *
  * name is the number of the interface to use, unless NULL to take the host
@@ -604,6 +610,28 @@ tap_setup_queue(struct rte_eth_dev *dev,
 			return fd;
 
 		tap_mac_set(dev, &pmd->eth_addr);
+
+		/*
+		 * rte_flow rules are attached by interface index, so resolve
+		 * it now; tap_flow_create() refuses to work when it is 0.
+		 */
+		pmd->if_index = if_nametoindex(pmd->name);
+		if (!pmd->if_index) {
+			RTE_LOG(ERR, PMD,
+				"Could not find ifindex for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		/* No qdisc is set up when the kernel lacks flower support. */
+		if (!pmd->flower_support)
+			return fd;
+		if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
+			/* Name the qdisc that actually failed. */
+			RTE_LOG(ERR, PMD,
+				"Could not create ingress qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
 	}
 
 	return fd;
@@ -802,6 +830,24 @@ static const struct eth_dev_ops ops = {
 };
 
 static int
+tap_kernel_support(struct pmd_internals *pmd)
+{
+	/*
+	 * Parse the running kernel release reported by uname() and record in
+	 * pmd whether it is recent enough for the flower classifier
+	 * (FLOWER_KERNEL_VERSION) and for its VLAN keys
+	 * (FLOWER_VLAN_KERNEL_VERSION).
+	 *
+	 * Returns 1 when the release string could be parsed, 0 otherwise; in
+	 * the latter case neither support flag is touched.
+	 */
+	struct utsname utsname;
+	int ver[3];
+	int version;
+
+	if (uname(&utsname) == -1 ||
+	    sscanf(utsname.release, "%d.%d.%d",
+		   &ver[0], &ver[1], &ver[2]) != 3)
+		return 0;
+	/*
+	 * KERNEL_VERSION() packs the sublevel into 8 bits; older
+	 * linux/version.h headers do not clamp it, so a sublevel above 255
+	 * would carry into the minor number and inflate the comparison.
+	 */
+	if (ver[2] > 255)
+		ver[2] = 255;
+	version = KERNEL_VERSION(ver[0], ver[1], ver[2]);
+	if (version >= FLOWER_KERNEL_VERSION)
+		pmd->flower_support = 1;
+	if (version >= FLOWER_VLAN_KERNEL_VERSION)
+		pmd->flower_vlan_support = 1;
+	return 1;
+}
+
+static int
 eth_dev_tap_create(const char *name, char *tap_name)
 {
 	int numa_node = rte_socket_id();
@@ -862,7 +908,15 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	tap_kernel_support(pmd);
+	if (!pmd->flower_support)
+		return 0;
 	LIST_INIT(&pmd->flows);
+	/*
+	 * If no netlink socket can be created, then it will fail when
+	 * creating/destroying flow rules.
+	 */
+	pmd->nlsk_fd = nl_init();
 
 	return 0;
 
@@ -979,6 +1033,8 @@ rte_pmd_tap_remove(const char *name)
 	tap_flow_flush(eth_dev, NULL);
 
 	internals = eth_dev->data->dev_private;
+	if (internals->flower_support && internals->nlsk_fd)
+		nl_final(internals->nlsk_fd);
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
index 60c0c58c7c1a..2c8cc7d5b485 100644
--- a/drivers/net/tap/tap.h
+++ b/drivers/net/tap/tap.h
@@ -67,6 +67,9 @@ struct pmd_internals {
 	uint16_t nb_queues;               /* Number of queues supported */
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
+	int nlsk_fd;                      /* Netlink socket fd */
+	int flower_support;               /* 1 if kernel supports, else 0 */
+	int flower_vlan_support;          /* 1 if kernel supports, else 0 */
 	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index de41c127c920..3fb28b1db917 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -33,14 +33,71 @@
 
 #include <sys/queue.h>
 
+#include <rte_byteorder.h>
+#include <rte_jhash.h>
 #include <rte_malloc.h>
+#include <tap_autoconf.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 #include <tap.h>
 
+#ifndef HAVE_TC_FLOWER
+/*
+ * Fallback definitions used when the build host's linux/pkt_cls.h does not
+ * provide the flower attributes (HAVE_TC_FLOWER is probed at build time into
+ * tap_autoconf.h). For kernels < 4.2, this enum is not defined. Runtime
+ * checks will be made to avoid sending TC messages the kernel cannot
+ * understand.
+ */
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
+};
+#endif
+#ifndef HAVE_TC_VLAN_ID
+/*
+ * Fallback for the flower VLAN keys, used when HAVE_TC_VLAN_ID was not
+ * detected at build time. The first value is UDP_DST + 2 so that the slot
+ * right after UDP_DST stays reserved for TCA_FLOWER_FLAGS, which is only
+ * mentioned in a comment here.
+ */
+enum {
+	/* TCA_FLOWER_FLAGS, */
+	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
+	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
+};
+#endif
+
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+	struct nlmsg msg;
+};
+
+/*
+ * State shared between the per-item conversion callbacks while one pattern is
+ * being walked by priv_flow_process().
+ */
+struct convert_data {
+	/* EtherType taken from the pattern; compared against htons() values. */
+	uint16_t eth_type;
+	/* L4 protocol taken from an IPv4/IPv6 item spec, checked by UDP/TCP. */
+	uint16_t ip_proto;
+	/* Set to 1 once a VLAN item has been seen. */
+	uint8_t vlan;
+	/* Flow whose netlink message is being filled; NULL to only validate. */
+	struct rte_flow *flow;
 };
 
+static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
 static int
 tap_flow_validate(struct rte_eth_dev *dev,
 		  const struct rte_flow_attr *attr,
@@ -67,6 +124,752 @@ static const struct rte_flow_ops tap_flow_ops = {
 	.flush = tap_flow_flush,
 };
 
+/*
+ * Static initializer for items: builds an anonymous constant array holding
+ * the given item types followed by an RTE_FLOW_ITEM_TYPE_END terminator.
+ */
+#define ITEMS(...) \
+	(const enum rte_flow_item_type []){ \
+		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
+	}
+
+/*
+ * Structure to generate a simple graph of layers supported by the NIC.
+ * One entry exists per pattern item type; it describes what can be matched
+ * on that item and which item types may come next.
+ */
+struct tap_flow_items {
+	/* Bit-mask corresponding to what is supported for this item. */
+	const void *mask;
+	const unsigned int mask_sz; /* Bit-mask size in bytes. */
+	/*
+	 * Bit-mask corresponding to the default mask, if none is provided
+	 * along with the item.
+	 */
+	const void *default_mask;
+	/**
+	 * Conversion function from rte_flow to netlink attributes.
+	 *
+	 * @param item
+	 *   rte_flow item to convert.
+	 * @param data
+	 *   Internal structure to store the conversion
+	 *   (the callbacks cast it to struct convert_data).
+	 *
+	 * @return
+	 *   0 on success, negative value otherwise.
+	 */
+	int (*convert)(const struct rte_flow_item *item, void *data);
+	/**
+	 * List of possible following items, RTE_FLOW_ITEM_TYPE_END
+	 * terminated. NULL when nothing may follow.
+	 */
+	const enum rte_flow_item_type *const items;
+};
+
+/*
+ * Graph of supported items, indexed by item type. Each entry gives the mask
+ * of fields that can be matched, the default mask, the conversion callback
+ * and the item types allowed to follow.
+ */
+static const struct tap_flow_items tap_flow_items[] = {
+	/*
+	 * priv_flow_process() starts its walk at element 0 of this array;
+	 * presumably that is this END entry, so a pattern has to start with
+	 * an ETH item -- TODO confirm RTE_FLOW_ITEM_TYPE_END == 0.
+	 */
+	[RTE_FLOW_ITEM_TYPE_END] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+	},
+	[RTE_FLOW_ITEM_TYPE_ETH] = {
+		.items = ITEMS(
+			RTE_FLOW_ITEM_TYPE_VLAN,
+			RTE_FLOW_ITEM_TYPE_IPV4,
+			RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_eth){
+			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.type = -1,
+		},
+		.mask_sz = sizeof(struct rte_flow_item_eth),
+		.default_mask = &rte_flow_item_eth_mask,
+		.convert = tap_flow_create_eth,
+	},
+	[RTE_FLOW_ITEM_TYPE_VLAN] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
+			       RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_vlan){
+			.tpid = -1,
+			/*
+			 * DEI matching is not supported: every TCI bit but
+			 * one is set. The literal depends on host endianness
+			 * because the conversion callback reads tci through
+			 * ntohs(), i.e. the field is in network byte order.
+			 */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+			.tci = 0xffef,
+#else
+			.tci = 0xefff,
+#endif
+		},
+		.mask_sz = sizeof(struct rte_flow_item_vlan),
+		.default_mask = &rte_flow_item_vlan_mask,
+		.convert = tap_flow_create_vlan,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV4] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		/* Only addresses and the next protocol can be matched. */
+		.mask = &(const struct rte_flow_item_ipv4){
+			.hdr = {
+				.src_addr = -1,
+				.dst_addr = -1,
+				.next_proto_id = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv4),
+		.default_mask = &rte_flow_item_ipv4_mask,
+		.convert = tap_flow_create_ipv4,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV6] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		/* Only addresses and the next header can be matched. */
+		.mask = &(const struct rte_flow_item_ipv6){
+			.hdr = {
+				.src_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.dst_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.proto = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv6),
+		.default_mask = &rte_flow_item_ipv6_mask,
+		.convert = tap_flow_create_ipv6,
+	},
+	/* UDP and TCP have no .items list: no item may follow them. */
+	[RTE_FLOW_ITEM_TYPE_UDP] = {
+		.mask = &(const struct rte_flow_item_udp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_udp),
+		.default_mask = &rte_flow_item_udp_mask,
+		.convert = tap_flow_create_udp,
+	},
+	[RTE_FLOW_ITEM_TYPE_TCP] = {
+		.mask = &(const struct rte_flow_item_tcp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_tcp),
+		.default_mask = &rte_flow_item_tcp_mask,
+		.convert = tap_flow_create_tcp,
+	},
+};
+
+/**
+ * Check an Ethernet pattern item and, when a flow is provided, append the
+ * matching flower attributes to its netlink message.
+ *
+ * The EtherType can only be matched exactly (mask 0 or 0xffff). Source and
+ * destination MAC addresses are emitted, together with their masks, whenever
+ * the corresponding mask is not all-zero.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data: receives the matched EtherType for the next layers
+ *   and carries the flow to fill (may be NULL).
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_eth(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
+	/* TC does not support eth_type masking. Only accept if exact match. */
+	if (mask->type && mask->type != 0xffff)
+		return -1;
+	/* An empty item matches any Ethernet frame: nothing to add. */
+	if (!spec)
+		return 0;
+	/* store eth_type for consistency if ipv4/6 pattern item comes next */
+	if (spec->type & mask->type)
+		info->eth_type = spec->type;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (spec->type & mask->type)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
+					    (spec->type & mask->type));
+	/*
+	 * A field takes part in the match when its mask is set, whatever the
+	 * spec value is; test the mask for dst exactly as is done for src.
+	 */
+	if (!is_zero_ether_addr(&mask->dst)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
+			   &spec->dst.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
+			   &mask->dst.addr_bytes);
+	}
+	if (!is_zero_ether_addr(&mask->src)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
+			   &spec->src.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
+			   &mask->src.addr_bytes);
+	}
+	return 0;
+}
+
+/**
+ * Check a VLAN pattern item and, when a flow is provided, append the matching
+ * flower attributes (priority and VLAN id) to its netlink message.
+ *
+ * The TPID can only be matched exactly and must be 802.1Q: double tagging is
+ * not supported.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data: its vlan flag is raised for the next layers, and it
+ *   carries the flow to fill (may be NULL).
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_vlan *spec = item->spec;
+	const struct rte_flow_item_vlan *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
+	/* TC does not support tpid masking. Only accept if exact match. */
+	if (mask->tpid && mask->tpid != 0xffff)
+		return -1;
+	/* Double-tagging not supported. */
+	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+		return -1;
+	info->vlan = 1;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
+#define VLAN_PRIO(tci) ((tci) >> 13)
+#define VLAN_ID(tci) ((tci) & 0xfff)
+	if (!spec)
+		return 0;
+	if (spec->tci) {
+		/*
+		 * spec and mask are both in network byte order: apply the
+		 * mask first, then convert the result to host order once.
+		 */
+		uint16_t tci = ntohs(spec->tci & mask->tci);
+		/* 3-bit priority fits a byte; the VLAN id needs 12 bits. */
+		uint8_t prio = VLAN_PRIO(tci);
+		uint16_t vid = VLAN_ID(tci);
+
+		if (prio)
+			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
+		if (vid)
+			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
+	}
+	return 0;
+}
+
+/**
+ * Check an IPv4 pattern item and, when a flow is provided, append the
+ * matching flower attributes to its netlink message.
+ *
+ * Fails when an EtherType other than IPv4 was recorded by a previous item.
+ * Source and destination addresses are emitted with their masks whenever the
+ * corresponding mask is not zero.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data: receives the IP protocol for the next layers and
+ *   carries the flow to fill (may be NULL).
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_ipv4 *spec = item->spec;
+	const struct rte_flow_item_ipv4 *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
+	/* check that previous eth type is compatible with ipv4 */
+	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
+		return -1;
+	/* store ip_proto for consistency if udp/tcp pattern item comes next */
+	if (spec)
+		info->ip_proto = spec->hdr.next_proto_id;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (!info->eth_type)
+		info->eth_type = htons(ETH_P_IP);
+	/* With a VLAN item, the VLAN callback already set the protocol. */
+	if (!info->vlan)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP));
+	if (!spec)
+		return 0;
+	/*
+	 * A field takes part in the match when its mask is set, whatever the
+	 * spec value is (a masked all-zero address is a valid match).
+	 */
+	if (mask->hdr.dst_addr) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
+			     spec->hdr.dst_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
+			     mask->hdr.dst_addr);
+	}
+	if (mask->hdr.src_addr) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
+			     spec->hdr.src_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+			     mask->hdr.src_addr);
+	}
+	if (spec->hdr.next_proto_id)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
+			    spec->hdr.next_proto_id);
+	return 0;
+}
+
+/**
+ * Check an IPv6 pattern item and, when a flow is provided, append the
+ * matching flower attributes to its netlink message.
+ *
+ * Fails when an EtherType other than IPv6 was recorded by a previous item.
+ * Source and destination addresses are emitted with their masks whenever the
+ * corresponding mask is not all-zero.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data: receives the IP protocol for the next layers and
+ *   carries the flow to fill (may be NULL).
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_ipv6 *spec = item->spec;
+	const struct rte_flow_item_ipv6 *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	uint8_t empty_addr[16] = { 0 };
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
+	/* check that previous eth type is compatible with ipv6 */
+	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
+		return -1;
+	/* store ip_proto for consistency if udp/tcp pattern item comes next */
+	if (spec)
+		info->ip_proto = spec->hdr.proto;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (!info->eth_type)
+		info->eth_type = htons(ETH_P_IPV6);
+	/* With a VLAN item, the VLAN callback already set the protocol. */
+	if (!info->vlan)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6));
+	if (!spec)
+		return 0;
+	/*
+	 * A field takes part in the match when its mask is set, whatever the
+	 * spec value is (a masked all-zero address is a valid match).
+	 */
+	if (memcmp(mask->hdr.dst_addr, empty_addr, 16)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
+			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
+	}
+	if (memcmp(mask->hdr.src_addr, empty_addr, 16)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
+			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
+	}
+	if (spec->hdr.proto)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
+	return 0;
+}
+
+/**
+ * Check a UDP pattern item and, when a flow is provided, append the matching
+ * flower attributes to its netlink message.
+ *
+ * Ports can only be matched exactly: a port mask must be either 0 (ignore)
+ * or 0xffff. Fails when a previous item recorded an IP protocol other than
+ * UDP.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data: provides the IP protocol recorded by the previous
+ *   layer and carries the flow to fill (may be NULL).
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_udp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_udp *spec = item->spec;
+	const struct rte_flow_item_udp *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
+	/*
+	 * Only the port values are sent to TC, never a port mask. Reject a
+	 * partial mask instead of silently turning it into an exact match or
+	 * dropping the port from the rule.
+	 */
+	if ((mask->hdr.dst_port && mask->hdr.dst_port != 0xffff) ||
+	    (mask->hdr.src_port && mask->hdr.src_port != 0xffff))
+		return -1;
+	/* check that previous ip_proto is compatible with udp */
+	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
+		return -1;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
+	if (!spec)
+		return 0;
+	/* A port is matched only when it is non-zero and fully masked. */
+	if (spec->hdr.dst_port && mask->hdr.dst_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port && mask->hdr.src_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Check a TCP pattern item and, when a flow is provided, append the matching
+ * flower attributes to its netlink message.
+ *
+ * Ports can only be matched exactly: a port mask must be either 0 (ignore)
+ * or 0xffff. Fails when a previous item recorded an IP protocol other than
+ * TCP.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data: provides the IP protocol recorded by the previous
+ *   layer and carries the flow to fill (may be NULL).
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_tcp *spec = item->spec;
+	const struct rte_flow_item_tcp *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
+	/*
+	 * Only the port values are sent to TC, never a port mask. Reject a
+	 * partial mask instead of silently turning it into an exact match or
+	 * dropping the port from the rule.
+	 */
+	if ((mask->hdr.dst_port && mask->hdr.dst_port != 0xffff) ||
+	    (mask->hdr.src_port && mask->hdr.src_port != 0xffff))
+		return -1;
+	/* check that previous ip_proto is compatible with tcp */
+	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
+		return -1;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
+	if (!spec)
+		return 0;
+	/* A port is matched only when it is non-zero and fully masked. */
+	if (spec->hdr.dst_port && mask->hdr.dst_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port && mask->hdr.src_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Tell whether a byte string only has bits set that a reference mask allows.
+ *
+ * @param[in] bytes
+ *   Byte string to examine.
+ * @param[in] allowed
+ *   Reference bit-mask, same length as \bytes.
+ * @param size
+ *   Length of both byte strings.
+ *
+ * @return
+ *   1 when no bit of \bytes falls outside \allowed, 0 otherwise.
+ */
+static int
+tap_flow_bits_within(const uint8_t *bytes, const uint8_t *allowed,
+		     unsigned int size)
+{
+	unsigned int pos;
+
+	for (pos = 0; pos != size; ++pos)
+		if (bytes[pos] & ~allowed[pos])
+			return 0;
+	return 1;
+}
+
+/**
+ * Check support for a given item.
+ *
+ * An item without spec must have neither mask nor last. Without an explicit
+ * mask, spec, last and the default mask must stay within the supported mask;
+ * with one, the mask itself must. Finally spec and last must be equal once
+ * masked, since TC cannot match ranges.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param size
+ *   Bit-Mask size in bytes.
+ * @param[in] supported_mask
+ *   Bit-mask covering supported fields to compare with spec, last and mask in
+ *   \item.
+ * @param[in] default_mask
+ *   Bit-mask default mask if none is provided in \item.
+ *
+ * @return
+ *   0 on success, a non-zero value otherwise.
+ */
+static int
+tap_flow_item_validate(const struct rte_flow_item *item,
+		       unsigned int size,
+		       const uint8_t *supported_mask,
+		       const uint8_t *default_mask)
+{
+	const uint8_t *spec_bytes = item->spec;
+	const uint8_t *last_bytes = item->last;
+	const uint8_t *mask_bytes = item->mask;
+
+	/* An empty layer is allowed, as long as all fields are NULL */
+	if (!spec_bytes && (mask_bytes || last_bytes))
+		return -1;
+	if (mask_bytes) {
+		/* An explicit mask must not reach unsupported fields. */
+		if (!tap_flow_bits_within(mask_bytes, supported_mask, size))
+			return -1;
+	} else {
+		/* No mask: spec, default mask and last must all fit. */
+		if (spec_bytes &&
+		    (!tap_flow_bits_within(spec_bytes, supported_mask, size) ||
+		     !tap_flow_bits_within(default_mask, supported_mask,
+					   size)))
+			return -1;
+		if (last_bytes &&
+		    !tap_flow_bits_within(last_bytes, supported_mask, size))
+			return -1;
+	}
+	/* Nothing left to compare unless both spec and last are given. */
+	if (!(spec_bytes && last_bytes))
+		return 0;
+	{
+		/*
+		 * TC does not support ranges: spec and last have to be
+		 * identical once the effective mask is applied.
+		 */
+		const uint8_t *applied = mask_bytes ? mask_bytes : default_mask;
+		uint8_t masked_spec[size];
+		uint8_t masked_last[size];
+		unsigned int pos;
+
+		for (pos = 0; pos != size; ++pos) {
+			masked_spec[pos] = spec_bytes[pos] & applied[pos];
+			masked_last[pos] = last_bytes[pos] & applied[pos];
+		}
+		return memcmp(masked_spec, masked_last, size);
+	}
+}
+
+/**
+ * Append a "gact" TC action (used for DROP and PASSTHRU) to the netlink
+ * message of a flow, as the first entry of a TCA_FLOWER_ACT list.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] action
+ *   Appropriate action to be set in the TCA_GACT_PARMS structure.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_gact(struct rte_flow *flow, int action)
+{
+	const size_t first_act = 1; /* index of the action in the list */
+	struct nlmsg *nl = &flow->msg;
+	struct tc_gact parms = {
+		.action = action
+	};
+
+	/* Open the action list, then its first entry. */
+	if (nlattr_nested_start(nl, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nl, first_act) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
+	if (nlattr_nested_start(nl, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_GACT_PARMS, sizeof(parms), &parms);
+	/* Close the three levels in reverse order of opening. */
+	nlattr_nested_finish(nl); /* TCA_ACT_OPTIONS */
+	nlattr_nested_finish(nl); /* action entry */
+	nlattr_nested_finish(nl); /* TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Append a "skbedit" TC action (used for QUEUE) to the netlink message of a
+ * flow, as the first entry of a TCA_FLOWER_ACT list. The action carries the
+ * queue mapping to apply.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] queue
+ *   Queue id to use.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+{
+	const size_t first_act = 1; /* index of the action in the list */
+	struct nlmsg *nl = &flow->msg;
+	struct tc_skbedit parms = {
+		.action = TC_ACT_PIPE
+	};
+
+	/* Open the action list, then its first entry. */
+	if (nlattr_nested_start(nl, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nl, first_act) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
+	if (nlattr_nested_start(nl, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_SKBEDIT_PARMS, sizeof(parms), &parms);
+	nlattr_add16(&nl->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
+	/* Close the three levels in reverse order of opening. */
+	nlattr_nested_finish(nl); /* TCA_ACT_OPTIONS */
+	nlattr_nested_finish(nl); /* action entry */
+	nlattr_nested_finish(nl); /* TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Validate a flow supported by TC.
+ * If flow param is not NULL, then also fill the netlink message inside.
+ *
+ * The pattern is walked through the tap_flow_items graph: each non-VOID item
+ * must be listed as a possible follower of the previous one and must pass
+ * tap_flow_item_validate(). Exactly one non-VOID action (DROP, PASSTHRU or
+ * QUEUE) is accepted.
+ *
+ * @param pmd
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] flow
+ *   Flow structure to update.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_process(struct pmd_internals *pmd,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow *flow)
+{
+	/* Start the walk at the first element of the graph. */
+	const struct tap_flow_items *cur_item = tap_flow_items;
+	/* The vlan member is not named here, hence zero-initialized. */
+	struct convert_data data = {
+		.eth_type = 0,
+		.ip_proto = 0,
+		.flow = flow,
+	};
+	int action = 0; /* Only one action authorized for now */
+
+	/*
+	 * NOTE(review): the message below hard-codes 15 while the check uses
+	 * MAX_GROUP, which is defined outside this view -- confirm they agree.
+	 */
+	if (attr->group > MAX_GROUP) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			NULL, "group value too big: cannot exceed 15");
+		return -rte_errno;
+	}
+	if (attr->priority > MAX_PRIORITY) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			NULL, "priority value too big");
+		return -rte_errno;
+	} else if (flow) {
+		/*
+		 * Group and priority are merged into one 16-bit value stored
+		 * in the upper half of tcm_info; the lower half is kept.
+		 */
+		uint16_t group = attr->group << GROUP_SHIFT;
+		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
+		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
+						 flow->msg.t.tcm_info);
+	}
+	if (!attr->ingress) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR,
+				   NULL, "direction should be ingress");
+		return -rte_errno;
+	}
+	/* rte_flow ingress is actually egress as seen in the kernel */
+	if (attr->ingress && flow)
+		flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	if (flow) {
+		/* use flower filter type */
+		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
+		/*
+		 * NOTE(review): a failure here is reported through the
+		 * "item not supported" exit although no item is involved.
+		 */
+		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
+			goto exit_item_not_supported;
+	}
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		const struct tap_flow_items *token = NULL;
+		unsigned int i;
+		int err = 0;
+
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		/* Is this item type allowed right after the previous one? */
+		for (i = 0;
+		     cur_item->items &&
+		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
+		     ++i) {
+			if (cur_item->items[i] == items->type) {
+				token = &tap_flow_items[items->type];
+				break;
+			}
+		}
+		if (!token)
+			goto exit_item_not_supported;
+		cur_item = token;
+		err = tap_flow_item_validate(
+			items, cur_item->mask_sz,
+			(const uint8_t *)cur_item->mask,
+			(const uint8_t *)cur_item->default_mask);
+		if (err)
+			goto exit_item_not_supported;
+		/*
+		 * NOTE(review): the conversion callbacks are only run when a
+		 * flow is given, so the checks they contain are skipped when
+		 * this function is called with flow == NULL.
+		 */
+		if (flow && cur_item->convert) {
+			if (!pmd->flower_vlan_support &&
+			    cur_item->convert == tap_flow_create_vlan)
+				goto exit_item_not_supported;
+			err = cur_item->convert(items, &data);
+			if (err)
+				goto exit_item_not_supported;
+		}
+	}
+	if (flow) {
+		/*
+		 * The EtherType key is emitted once the whole pattern is
+		 * known: with a VLAN item the outer type is 802.1Q and the
+		 * recorded type (or ETH_P_ALL) becomes the inner one.
+		 */
+		if (pmd->flower_vlan_support && data.vlan) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     htons(ETH_P_8021Q));
+			nlattr_add16(&flow->msg.nh,
+				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+				     data.eth_type ?
+				     data.eth_type : htons(ETH_P_ALL));
+		} else if (data.eth_type) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     data.eth_type);
+		}
+	}
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
+		int err = 0;
+
+		/* A second non-VOID action is always rejected. */
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
+			continue;
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_SHOT);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_UNSPEC);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
+			const struct rte_flow_action_queue *queue =
+				(const struct rte_flow_action_queue *)
+				actions->conf;
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			/* The target queue must exist on this port. */
+			if (!queue || (queue->index >= pmd->nb_queues))
+				goto exit_action_not_supported;
+			if (flow)
+				err = add_action_skbedit(flow, queue->index);
+		} else {
+			goto exit_action_not_supported;
+		}
+		if (err)
+			goto exit_action_not_supported;
+	}
+	if (flow)
+		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
+	return 0;
+exit_item_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+			   items, "item not supported");
+	return -rte_errno;
+exit_action_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "action not supported");
+	return -rte_errno;
+}
+
+
+
 /**
  * Validate a flow.
  *
@@ -74,15 +877,54 @@ static const struct rte_flow_ops tap_flow_ops = {
  * @see rte_flow_ops
  */
 static int
-tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
-		  const struct rte_flow_attr *attr __rte_unused,
-		  const struct rte_flow_item items[] __rte_unused,
-		  const struct rte_flow_action actions[] __rte_unused,
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
 		  struct rte_flow_error *error)
 {
-	return -rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "not implemented yet");
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	return priv_flow_process(pmd, attr, items, actions, error, NULL);
+}
+
+/**
+ * Set a unique handle in a flow.
+ *
+ * The kernel supports TC rules with equal priority, as long as they use the
+ * same matching fields (e.g.: dst mac and ipv4) with different values (and
+ * full mask to ensure no collision is possible).
+ * In those rules, the handle (uint32_t) is the part that would identify
+ * specifically each rule.
+ *
+ * When pointers fit in 32 bits, the handle is simply the flow's address.
+ * Otherwise, we rely on jhash(flow) to find a (sufficiently) unique handle.
+ * The pointer size is tested with sizeof() rather than the preprocessor, so
+ * both branches are always compiled and no wrong-sized pointer cast is
+ * needed.
+ *
+ * @param[in, out] flow
+ *   The flow that needs its handle set.
+ */
+static void
+tap_flow_set_handle(struct rte_flow *flow)
+{
+	uint32_t handle = 0;
+
+	if (sizeof(flow) > 4)
+		handle = rte_jhash(&flow, sizeof(flow), 1);
+	else
+		handle = (uintptr_t)flow;
+	/* must be at least 1 to avoid letting the kernel choose one for us */
+	if (!handle)
+		handle = 1;
+	flow->msg.t.tcm_handle = handle;
 }
 
 /**
@@ -100,17 +942,46 @@ tap_flow_create(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rte_flow *flow = NULL;
+	struct nlmsg *msg = NULL;
+	int err;
 
-	if (tap_flow_validate(dev, attr, items, actions, error))
-		return NULL;
+	if (!pmd->if_index) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL,
+				   "can't create rule, ifindex not found");
+		goto fail;
+	}
 	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "cannot allocate memory for rte_flow");
-		return NULL;
+		goto fail;
+	}
+	msg = &flow->msg;
+	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+	tap_flow_set_handle(flow);
+	if (priv_flow_process(pmd, attr, items, actions, error, flow))
+		goto fail;
+	err = nl_send(pmd->nlsk_fd, &msg->nh);
+	if (err < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto fail;
+	}
+	err = nl_recv_ack(pmd->nlsk_fd);
+	if (err < 0) {
+		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "overlapping rules");
+		goto fail;
 	}
 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
 	return flow;
+fail:
+	if (flow)
+		rte_free(flow);
+	return NULL;
 }
 
 /**
@@ -120,13 +991,31 @@ tap_flow_create(struct rte_eth_dev *dev,
  * @see rte_flow_ops
  */
 static int
-tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+tap_flow_destroy(struct rte_eth_dev *dev,
 		 struct rte_flow *flow,
-		 struct rte_flow_error *error __rte_unused)
+		 struct rte_flow_error *error)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+	int ret = 0;
+
 	LIST_REMOVE(flow, next);
+	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
+
+	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
+	if (ret < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto end;
+	}
+	ret = nl_recv_ack(pmd->nlsk_fd);
+	if (ret < 0)
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"couldn't receive kernel ack to our request");
+end:
 	rte_free(flow);
-	return 0;
+	return ret;
 }
 
 /**
@@ -170,6 +1059,10 @@ tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 		    enum rte_filter_op filter_op,
 		    void *arg)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	if (!pmd->flower_support)
+		return -ENOTSUP;
 	switch (filter_type) {
 	case RTE_ETH_FILTER_GENERIC:
 		if (filter_op != RTE_ETH_FILTER_GET)
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index 377a9f7b758a..a05e945df523 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -37,6 +37,18 @@
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 
+/**
+ * In TC, priority 0 means we require the kernel to allocate one for us.
+ * In rte_flow, however, we want the priority 0 to be the most important one.
+ * Use an offset to have the most important priority being 1 in TC.
+ */
+#define PRIORITY_OFFSET 1
+#define PRIORITY_MASK (0xfff)
+#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
+#define GROUP_MASK (0xf)
+#define GROUP_SHIFT 12
+#define MAX_GROUP GROUP_MASK
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/4] net/tap: move private elements to external header
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-09 15:28       ` Ferruh Yigit
  2017-03-10  9:40         ` Pascal Mazon
  0 siblings, 1 reply; 57+ messages in thread
From: Ferruh Yigit @ 2017-03-09 15:28 UTC (permalink / raw)
  To: Pascal Mazon, keith.wiles; +Cc: dev

On 3/7/2017 4:35 PM, Pascal Mazon wrote:
> In the next patch, access to struct pmd_internals will be necessary in
> tap_flow.c to store the flows.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>
> ---
>  drivers/net/tap/Makefile      |  1 +
>  drivers/net/tap/rte_eth_tap.c | 34 ++------------------
>  drivers/net/tap/tap.h         | 73 +++++++++++++++++++++++++++++++++++++++++++

tap.h is a generic name, I think rte_eth_tap.h fits better here.

<...>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/4] net/tap: add netlink back-end for flow API
  2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-09 15:29       ` Ferruh Yigit
  2017-03-10  9:39         ` Pascal Mazon
  0 siblings, 1 reply; 57+ messages in thread
From: Ferruh Yigit @ 2017-03-09 15:29 UTC (permalink / raw)
  To: Pascal Mazon, keith.wiles; +Cc: dev

On 3/7/2017 4:35 PM, Pascal Mazon wrote:
> Each kernel netdevice may have queueing disciplines set for it, which
> determine how to handle the packet (mostly on egress). That's part of
> the TC (Traffic Control) mechanism.

This is nice.
qdisc is egress part of the network stack right, is there any ingress
part of it?

> 
> Through TC, it is possible to set filter rules that match specific
> packets, and act according to what is in the rule. This is a perfect
> candidate to implement the flow API for the tap PMD, as it has an
> associated kernel netdevice automatically.
> 
> Each flow API rule will be translated into its TC counterpart.

What can be use cases here?

> 
> To leverage TC, it is necessary to communicate with the kernel using
> netlink. This patch introduces a library to help that communication.
> 

What do you think implementing these out of tap PMD? These can be used
by KNI too.

> Inside netlink.c, functions are generic for any netlink messaging.
> Inside tcmsgs.c, functions are specific to deal with TC rules.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>
<...>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/4] net/tap: add netlink back-end for flow API
  2017-03-09 15:29       ` Ferruh Yigit
@ 2017-03-10  9:39         ` Pascal Mazon
  0 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-10  9:39 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: keith.wiles, dev

On Thu, 9 Mar 2017 15:29:01 +0000
Ferruh Yigit <ferruh.yigit@intel.com> wrote:

> On 3/7/2017 4:35 PM, Pascal Mazon wrote:
> > Each kernel netdevice may have queueing disciplines set for it,
> > which determine how to handle the packet (mostly on egress). That's
> > part of the TC (Traffic Control) mechanism.
> 
> This is nice.
> qdisc is egress part of the network stack right, is there any ingress
> part of it?
> 

qdisc is mainly for egress (can range from 0 to fffe), but there is one
qdisc for ingress (ffff).

> > 
> > Through TC, it is possible to set filter rules that match specific
> > packets, and act according to what is in the rule. This is a perfect
> > candidate to implement the flow API for the tap PMD, as it has an
> > associated kernel netdevice automatically.
> > 
> > Each flow API rule will be translated into its TC counterpart.
> 
> What can be use cases here?

Well, it can be any use case of rte_flow, such as directing incoming
packets to specific queues for the application, dropping them, and any
kind of filtering (among those supported, see later patch).

> 
> > 
> > To leverage TC, it is necessary to communicate with the kernel using
> > netlink. This patch introduces a library to help that communication.
> > 
> 
> What do you think implementing these out of tap PMD? These can be used
> by KNI too.
> 

Well, I don't know about KNI, but I think setting it in tap PMD, which
is the current sole user for this, is a good start.
There will always be time later to make it more generic for other uses.

Regards,
Pascal

> > Inside netlink.c, functions are generic for any netlink messaging.
> > Inside tcmsgs.c, functions are specific to deal with TC rules.
> > 
> > Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> > Acked-by: Olga Shern <olgas@mellanox.com>
> <...>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/4] net/tap: move private elements to external header
  2017-03-09 15:28       ` Ferruh Yigit
@ 2017-03-10  9:40         ` Pascal Mazon
  0 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-10  9:40 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: keith.wiles, dev

On Thu, 9 Mar 2017 15:28:31 +0000
Ferruh Yigit <ferruh.yigit@intel.com> wrote:

> On 3/7/2017 4:35 PM, Pascal Mazon wrote:
> > In the next patch, access to struct pmd_internals will be necessary
> > in tap_flow.c to store the flows.
> > 
> > Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> > Acked-by: Olga Shern <olgas@mellanox.com>
> > ---
> >  drivers/net/tap/Makefile      |  1 +
> >  drivers/net/tap/rte_eth_tap.c | 34 ++------------------
> >  drivers/net/tap/tap.h         | 73
> > +++++++++++++++++++++++++++++++++++++++++++
> 
> tap.h is a generic name, I think rte_eth_tap.h fits better here.
> 
> <...>

I'm ok with that. I'll change it in my next version.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v4 0/4] net/tap: support flow API
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
                     ` (5 preceding siblings ...)
  2017-03-07 16:35   ` [dpdk-dev] [PATCH v3 " Pascal Mazon
@ 2017-03-14  8:29   ` Pascal Mazon
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 1/4] net/tap: move private elements to external header Pascal Mazon
                       ` (3 more replies)
  2017-03-15 14:54   ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Pascal Mazon
  2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
  8 siblings, 4 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-14  8:29 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

This series adds support for the flow API in the tap PMD.

It enables filtering specific packets incoming on the tap netdevice, to
process only desired ones. Under the hood, it uses kernel TC (traffic
control), which takes place very early in the stack, and supports most
common pattern items and actions defined in the flow API.

This series applies on top of:

  [PATCH 0/6] net/tap: add additional management ops

v2 changes:
  - support compilation on kernels < 4.2 (where flower support appeared)
  - set whitespaces in tap.h
  - remove unnecessary goto

v3 changes:
  - vlan patterns enabled depending on running kernel (4.9+)
  - update doc/guides/nics/tap.rst for Flow API support
  - rebase on top of "net/tap: add additional management ops" series

v4 changes:
  - rebase on top of "net/tap: add additional management ops" series
  - fix a few netlink doxygen comments
  - rename tap.h -> rte_eth_tap.h
  - flush flow rules only when applicable

Pascal Mazon (4):
  net/tap: move private elements to external header
  net/tap: add preliminary support for rte_flow
  net/tap: add netlink back-end for flow API
  net/tap: add basic flow API patterns and actions

 doc/guides/nics/features/tap.ini |    1 +
 doc/guides/nics/tap.rst          |   23 +
 drivers/net/tap/Makefile         |   44 ++
 drivers/net/tap/rte_eth_tap.c    |  100 ++--
 drivers/net/tap/rte_eth_tap.h    |   79 +++
 drivers/net/tap/tap_flow.c       | 1078 ++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |   58 ++
 drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
 drivers/net/tap/tap_netlink.h    |   69 +++
 drivers/net/tap/tap_tcmsgs.c     |  378 +++++++++++++
 drivers/net/tap/tap_tcmsgs.h     |   63 +++
 11 files changed, 2226 insertions(+), 34 deletions(-)
 create mode 100644 drivers/net/tap/rte_eth_tap.h
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v4 1/4] net/tap: move private elements to external header
  2017-03-14  8:29   ` [dpdk-dev] [PATCH v4 0/4] net/tap: support flow API Pascal Mazon
@ 2017-03-14  8:29     ` Pascal Mazon
  2017-03-14 14:05       ` Wiles, Keith
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-14  8:29 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

In the next patch, access to struct pmd_internals will be necessary in
tap_flow.c to store the flows.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |  1 +
 drivers/net/tap/rte_eth_tap.c | 35 ++------------------
 drivers/net/tap/rte_eth_tap.h | 74 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 33 deletions(-)
 create mode 100644 drivers/net/tap/rte_eth_tap.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index e18f30c56f52..bdbe69e62a4e 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
 LIBABIVER := 1
 
 CFLAGS += -O3
+CFLAGS += -I$(SRCDIR)
 CFLAGS += $(WERROR_FLAGS)
 
 #
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 7557abb2ebfc..536fd206e789 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -51,6 +51,8 @@
 #include <linux/if_ether.h>
 #include <fcntl.h>
 
+#include <rte_eth_tap.h>
+
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
 #define DEFAULT_TAP_NAME        "dtap"
@@ -81,39 +83,6 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
-struct pkt_stats {
-	uint64_t opackets;		/* Number of output packets */
-	uint64_t ipackets;		/* Number of input packets */
-	uint64_t obytes;		/* Number of bytes on output */
-	uint64_t ibytes;		/* Number of bytes on input */
-	uint64_t errs;			/* Number of error packets */
-};
-
-struct rx_queue {
-	struct rte_mempool *mp;		/* Mempool for RX packets */
-	uint16_t in_port;		/* Port ID */
-	int fd;
-
-	struct pkt_stats stats;		/* Stats for this RX queue */
-};
-
-struct tx_queue {
-	int fd;
-	struct pkt_stats stats;		/* Stats for this TX queue */
-};
-
-struct pmd_internals {
-	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
-	uint16_t nb_queues;		/* Number of queues supported */
-	struct ether_addr eth_addr;	/* Mac address of the device port */
-
-	int if_index;			/* IF_INDEX for the port */
-	int ioctl_sock;			/* socket for ioctl calls */
-
-	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
-	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
-};
-
 static int
 tap_ioctl(struct pmd_internals *pmd, unsigned long request,
 	  struct ifreq *ifr, int set);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
new file mode 100644
index 000000000000..880ec1b4fc8c
--- /dev/null
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ETH_TAP_H_
+#define _RTE_ETH_TAP_H_
+
+#include <inttypes.h>
+
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+
+#define RTE_PMD_TAP_MAX_QUEUES 16
+
+struct pkt_stats {
+	uint64_t opackets; /* Number of output packets */
+	uint64_t ipackets; /* Number of input packets */
+	uint64_t obytes; /* Number of bytes on output */
+	uint64_t ibytes; /* Number of bytes on input */
+	uint64_t errs; /* Number of error packets */
+};
+
+struct rx_queue {
+	struct rte_mempool *mp; /* Mempool for RX packets */
+	uint16_t in_port; /* Port ID */
+	int fd;
+	struct pkt_stats stats; /* Stats for this RX queue */
+};
+
+struct tx_queue {
+	int fd;
+	struct pkt_stats stats; /* Stats for this TX queue */
+};
+
+struct pmd_internals {
+	char name[RTE_ETH_NAME_MAX_LEN];  /* Internal Tap device name */
+	uint16_t nb_queues;               /* Number of queues supported */
+	struct ether_addr eth_addr;       /* Mac address of the device port */
+	int if_index;                     /* IF_INDEX for the port */
+	int ioctl_sock;                   /* socket for ioctl calls */
+	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
+	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
+};
+
+#endif /* _RTE_ETH_TAP_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v4 2/4] net/tap: add preliminary support for rte_flow
  2017-03-14  8:29   ` [dpdk-dev] [PATCH v4 0/4] net/tap: support flow API Pascal Mazon
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-14  8:29     ` Pascal Mazon
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
  3 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-14  8:29 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

The flow API provides the ability to classify packets received by a tap
netdevice.

This patch only implements skeleton functions for flow API support, no
patterns are supported yet.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/features/tap.ini |   1 +
 drivers/net/tap/Makefile         |   1 +
 drivers/net/tap/rte_eth_tap.c    |   6 ++
 drivers/net/tap/rte_eth_tap.h    |   2 +
 drivers/net/tap/tap_flow.c       | 185 +++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |  46 ++++++++++
 6 files changed, 241 insertions(+)
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h

diff --git a/doc/guides/nics/features/tap.ini b/doc/guides/nics/features/tap.ini
index a51712dce066..9d73f61cca3b 100644
--- a/doc/guides/nics/features/tap.ini
+++ b/doc/guides/nics/features/tap.ini
@@ -9,6 +9,7 @@ Jumbo frame          = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Basic stats          = Y
+Flow API             = Y
 MTU update           = Y
 Multicast MAC filter = Y
 Speed capabilities   = Y
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index bdbe69e62a4e..386b8b0594d3 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -47,6 +47,7 @@ CFLAGS += $(WERROR_FLAGS)
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 536fd206e789..78eac9a11ea0 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -52,6 +52,7 @@
 #include <fcntl.h>
 
 #include <rte_eth_tap.h>
+#include <tap_flow.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -435,6 +436,7 @@ tap_dev_close(struct rte_eth_dev *dev __rte_unused)
 	struct pmd_internals *internals = dev->data->dev_private;
 
 	tap_link_set_down(dev);
+	tap_flow_flush(dev, NULL);
 
 	for (i = 0; i < internals->nb_queues; i++) {
 		if (internals->rxq[i].fd != -1)
@@ -758,6 +760,7 @@ static const struct eth_dev_ops ops = {
 	.stats_get              = tap_stats_get,
 	.stats_reset            = tap_stats_reset,
 	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
+	.filter_ctrl            = tap_dev_filter_ctrl,
 };
 
 static int
@@ -829,6 +832,8 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	LIST_INIT(&pmd->flows);
+
 	return 0;
 
 error_exit:
@@ -942,6 +947,7 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	internals = eth_dev->data->dev_private;
+	tap_flow_flush(eth_dev, NULL);
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 880ec1b4fc8c..a64116f5a35e 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -34,6 +34,7 @@
 #ifndef _RTE_ETH_TAP_H_
 #define _RTE_ETH_TAP_H_
 
+#include <sys/queue.h>
 #include <inttypes.h>
 
 #include <rte_ethdev.h>
@@ -67,6 +68,7 @@ struct pmd_internals {
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
 	int ioctl_sock;                   /* socket for ioctl calls */
+	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
 };
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
new file mode 100644
index 000000000000..c32ed382d745
--- /dev/null
+++ b/drivers/net/tap/tap_flow.c
@@ -0,0 +1,185 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+
+#include <rte_malloc.h>
+#include <rte_eth_tap.h>
+#include <tap_flow.h>
+
+struct rte_flow {
+	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+};
+
+static int
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error);
+
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error);
+
+static int
+tap_flow_destroy(struct rte_eth_dev *dev,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error);
+
+static const struct rte_flow_ops tap_flow_ops = {
+	.validate = tap_flow_validate,
+	.create = tap_flow_create,
+	.destroy = tap_flow_destroy,
+	.flush = tap_flow_flush,
+};
+
+/**
+ * Validate a flow.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
+		  const struct rte_flow_attr *attr __rte_unused,
+		  const struct rte_flow_item items[] __rte_unused,
+		  const struct rte_flow_action actions[] __rte_unused,
+		  struct rte_flow_error *error)
+{
+	return -rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL, "not implemented yet");
+}
+
+/**
+ * Create a flow.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow = NULL;
+
+	if (tap_flow_validate(dev, attr, items, actions, error))
+		return NULL;
+	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+	if (!flow) {
+		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "cannot allocate memory for rte_flow");
+		return NULL;
+	}
+	LIST_INSERT_HEAD(&pmd->flows, flow, next);
+	return flow;
+}
+
+/**
+ * Destroy a flow.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error __rte_unused)
+{
+	LIST_REMOVE(flow, next);
+	rte_free(flow);
+	return 0;
+}
+
+/**
+ * Destroy all flows.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow;
+
+	while (!LIST_EMPTY(&pmd->flows)) {
+		flow = LIST_FIRST(&pmd->flows);
+		if (tap_flow_destroy(dev, flow, error) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type; only RTE_ETH_FILTER_GENERIC is handled.
+ * @param filter_op
+ *   Operation to perform; only RTE_ETH_FILTER_GET is accepted.
+ * @param arg
+ *   For RTE_ETH_FILTER_GET, receives a pointer to the tap rte_flow_ops.
+ *
+ * @return
+ *   0 on success, -EINVAL for any unsupported filter type or operation.
+ */
+int
+tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+		    enum rte_filter_type filter_type,
+		    enum rte_filter_op filter_op,
+		    void *arg)
+{
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			return -EINVAL;
+		*(const void **)arg = &tap_flow_ops;
+		return 0;
+	default:
+		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
+			(void *)dev, filter_type);
+	}
+	return -EINVAL;
+}
+
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
new file mode 100644
index 000000000000..377a9f7b758a
--- /dev/null
+++ b/drivers/net/tap/tap_flow.h
@@ -0,0 +1,46 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_FLOW_H_
+#define _TAP_FLOW_H_
+
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+
+int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+			enum rte_filter_type filter_type,
+			enum rte_filter_op filter_op,
+			void *arg);
+int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
+
+#endif /* _TAP_FLOW_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v4 3/4] net/tap: add netlink back-end for flow API
  2017-03-14  8:29   ` [dpdk-dev] [PATCH v4 0/4] net/tap: support flow API Pascal Mazon
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 1/4] net/tap: move private elements to external header Pascal Mazon
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
@ 2017-03-14  8:29     ` Pascal Mazon
  2017-03-14 14:03       ` Wiles, Keith
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
  3 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-14  8:29 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

Inside tap_netlink.c, functions are generic for any netlink messaging.
Inside tap_tcmsgs.c, functions are specific to dealing with TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +
 drivers/net/tap/tap_netlink.c | 367 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_netlink.h |  69 ++++++++
 drivers/net/tap/tap_tcmsgs.c  | 378 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_tcmsgs.h  |  63 +++++++
 5 files changed, 879 insertions(+)
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 386b8b0594d3..4ae2ca6cfbab 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/tap_netlink.c b/drivers/net/tap/tap_netlink.c
new file mode 100644
index 000000000000..10f00d1931c6
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <tap_netlink.h>
+#include <rte_random.h>
+
+/* Must be quite large to support dumping a huge list of QDISC or filters. */
+#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
+#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */
+#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */
+
+/*
+ * Bookkeeping node for one nested netlink attribute that has been started
+ * with nlattr_nested_start() and not yet closed by nlattr_nested_finish().
+ */
+struct nested_tail {
+	/* Where the nested attribute header sits inside the message. */
+	struct rtattr *tail;
+	/* Node of the nested attribute that was open before this one, if any. */
+	struct nested_tail *prev;
+};
+
+/**
+ * Initialize a netlink socket for communicating with the kernel.
+ *
+ * A NETLINK_ROUTE socket is created with SOCK_CLOEXEC, its send and receive
+ * buffer sizes are set to SNDBUF_SIZE and RCVBUF_SIZE, and it is bound to an
+ * AF_NETLINK address whose other fields are left at zero.
+ *
+ * @return
+ *   netlink socket file descriptor on success, -1 otherwise. On failure no
+ *   descriptor is left open.
+ */
+int
+nl_init(void)
+{
+	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
+	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
+
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (fd < 0) {
+		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
+		goto error;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
+		goto error;
+	}
+	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
+		goto error;
+	}
+	return fd;
+error:
+	/* The socket was created above: release it instead of leaking it. */
+	close(fd);
+	return -1;
+}
+
+/**
+ * Release a netlink socket once all communication with the kernel is over.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor to close.
+ *
+ * @return
+ *   0 if the socket was closed, -1 (after logging the errno) otherwise.
+ */
+int
+nl_final(int nlsk_fd)
+{
+	if (close(nlsk_fd) == 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Transmit one netlink message to the kernel.
+ *
+ * The nlmsg_pid and nlmsg_seq fields of the header are overwritten before
+ * sending: pid is forced to 0 and seq is drawn from rte_rand().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] nh
+ *   The netlink message sent to the kernel; nlmsg_len gives its size.
+ *
+ * @return
+ *   the number of sent bytes on success, -1 otherwise.
+ */
+int
+nl_send(int nlsk_fd, struct nlmsghdr *nh)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl kernel = {
+		.nl_family = AF_NETLINK,
+	};
+	struct iovec payload = {
+		.iov_base = nh,
+		.iov_len = nh->nlmsg_len,
+	};
+	struct msghdr request = {
+		.msg_name = &kernel,
+		.msg_namelen = sizeof(kernel),
+		.msg_iov = &payload,
+		.msg_iovlen = 1,
+	};
+	int sent;
+
+	/* communication with the kernel uses pid 0 */
+	nh->nlmsg_pid = 0;
+	nh->nlmsg_seq = (uint32_t)rte_rand();
+	sent = sendmsg(nlsk_fd, &request, 0);
+	if (sent >= 0)
+		return sent;
+	RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Check that the kernel sends an appropriate ACK in response to an nl_send().
+ *
+ * This is nl_recv() without any callback.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 if receiving failed or the kernel reported an error.
+ */
+int
+nl_recv_ack(int nlsk_fd)
+{
+	return nl_recv(nlsk_fd, NULL, NULL);
+}
+
+/**
+ * Receive a message from the kernel on the netlink socket, following an
+ * nl_send().
+ *
+ * Every netlink message found in the received data is examined; when any of
+ * them carries NLM_F_MULTI, the socket is read again until an NLMSG_DONE
+ * message has been seen.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] cb
+ *   The callback function to call for each netlink message received, except
+ *   NLMSG_DONE ones. May be NULL, in which case no callback is invoked.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, -1 if recvmsg() failed, if the kernel sent an NLMSG_ERROR
+ *   message with a non-zero error code, or if the callback returned a
+ *   negative value.
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int recv_bytes = 0, done = 0, multipart = 0, error = 0;
+
+read:
+	recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+	if (recv_bytes < 0)
+		return -1;
+	/* Walk every netlink message contained in the data just received. */
+	for (nh = (struct nlmsghdr *)buf;
+	     NLMSG_OK(nh, (unsigned int)recv_bytes);
+	     nh = NLMSG_NEXT(nh, recv_bytes)) {
+		/*
+		 * Multi-part messages and their following DONE message have the
+		 * NLM_F_MULTI flag set. Make note, in order to read the DONE
+		 * message afterwards.
+		 */
+		if (nh->nlmsg_flags & NLM_F_MULTI)
+			multipart = 1;
+		if (nh->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+			/* An error code of 0 is an acknowledgment. */
+			if (err_data->error == 0)
+				RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n",
+					__func__);
+			else {
+				RTE_LOG(DEBUG, PMD,
+					"%s() error message recvd\n", __func__);
+				/* Remember the failure, keep parsing the rest. */
+				error = 1;
+			}
+		}
+		/* The end of multipart message. */
+		if (nh->nlmsg_type == NLMSG_DONE)
+			/* No need to call the callback for a DONE message. */
+			done = 1;
+		else if (cb)
+			/* NLMSG_ERROR messages reach the callback as well. */
+			if (cb(nh, arg) < 0)
+				error = 1;
+	}
+	/* A multi-part answer is only complete once DONE has been seen. */
+	if (multipart && !done)
+		goto read;
+	if (error)
+		return -1;
+	return 0;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * The attribute is written at the current aligned end of the message and
+ * nlmsg_len is grown accordingly. No bounds checking is done here: the caller
+ * must make sure the buffer behind the header is large enough.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data_len
+ *   The length of the data to append. May be 0 for an attribute without
+ *   payload.
+ * @param[in] data
+ *   The data to append. Ignored (and allowed to be NULL) when data_len is 0.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta;
+
+	rta = (struct rtattr *)NLMSG_TAIL(nh);
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	/*
+	 * Payload-less attributes (such as the header opening a nested
+	 * attribute) are passed with a NULL data pointer, which must never be
+	 * handed to memcpy(), even for a zero-byte copy.
+	 */
+	if (data_len > 0)
+		memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(uint8_t), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(uint16_t), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(uint32_t), &data);
+}
+
+/**
+ * Open a nested netlink attribute.
+ * Every call must be matched later by a call to nlattr_nested_finish().
+ *
+ * The position of the nested attribute header is remembered in a node pushed
+ * on top of msg->nested_tails, then an empty attribute of the requested type
+ * is appended to the message.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ * @param[in] type
+ *   The nested attribute type to append.
+ *
+ * @return
+ *   -1 if the bookkeeping node could not be allocated, 0 otherwise.
+ */
+int
+nlattr_nested_start(struct nlmsg *msg, uint16_t type)
+{
+	struct nested_tail *node;
+
+	node = rte_zmalloc(NULL, sizeof(struct nested_tail), 0);
+	if (node == NULL) {
+		RTE_LOG(ERR, PMD,
+			"Couldn't allocate memory for nested netlink"
+			" attribute\n");
+		return -1;
+	}
+	/* The header about to be appended lands at the current tail. */
+	node->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
+	node->prev = msg->nested_tails;
+	nlattr_add(&msg->nh, type, 0, NULL);
+	msg->nested_tails = node;
+	return 0;
+}
+
+/**
+ * End a nested netlink attribute.
+ * It follows a call to nlattr_nested_start().
+ * In effect, it will modify the nested attribute length to include every byte
+ * from the nested attribute start, up to here.
+ *
+ * The bookkeeping node of the attribute is popped from msg->nested_tails and
+ * freed. Calling this function while no nested attribute is open is a no-op.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ */
+void
+nlattr_nested_finish(struct nlmsg *msg)
+{
+	struct nested_tail *tail = msg->nested_tails;
+
+	/* Nothing to close: unbalanced call. */
+	if (!tail)
+		return;
+
+	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;
+
+	/*
+	 * Always pop, including when closing the outermost attribute (prev is
+	 * NULL), so that nested_tails never keeps pointing at freed memory.
+	 */
+	msg->nested_tails = tail->prev;
+
+	rte_free(tail);
+}
diff --git a/drivers/net/tap/tap_netlink.h b/drivers/net/tap/tap_netlink.h
new file mode 100644
index 000000000000..52ba8c030dcc
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_NETLINK_H_
+#define _TAP_NETLINK_H_
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <stdio.h>
+
+#include <rte_log.h>
+
+/* Size in bytes of the buf[] area of struct nlmsg. */
+#define NLMSG_BUF 512
+
+/* A netlink traffic-control message together with its building metadata. */
+struct nlmsg {
+	/* Generic netlink header; nlmsg_len tracks the current message size. */
+	struct nlmsghdr nh;
+	/* Traffic-control header. */
+	struct tcmsg t;
+	/* Storage following the headers (presumably where attributes land). */
+	char buf[NLMSG_BUF];
+	/* Most recently opened nested attribute that is not finished yet. */
+	struct nested_tail *nested_tails;
+};
+
+/* Address right after the aligned end of message nlh, per its nlmsg_len. */
+#define NLMSG_TAIL(nlh) (void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len))
+
+int nl_init(void);
+int nl_final(int nlsk_fd);
+int nl_send(int nlsk_fd, struct nlmsghdr *nh);
+int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg);
+int nl_recv_ack(int nlsk_fd);
+void nlattr_add(struct nlmsghdr *nh, unsigned short type,
+		unsigned int data_len, const void *data);
+void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data);
+void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data);
+void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data);
+int nlattr_nested_start(struct nlmsg *msg, uint16_t type);
+void nlattr_nested_finish(struct nlmsg *msg);
+
+#endif /* _TAP_NETLINK_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.c b/drivers/net/tap/tap_tcmsgs.c
new file mode 100644
index 000000000000..9a146d165b08
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.c
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <net/if.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <tap_tcmsgs.h>
+
+/* Identification of a QDISC on a netdevice, as used by qdisc_del(). */
+struct qdisc {
+	uint32_t handle; /* copied to/from tcm_handle */
+	uint32_t parent; /* copied to/from tcm_parent */
+};
+
+/* Arguments handed by qdisc_iterate() to its per-message callback. */
+struct list_args {
+	int nlsk_fd; /* netlink socket the listing is performed on */
+	uint16_t ifindex; /* netdevice the callbacks restrict themselves to */
+	void *custom_arg; /* callback-specific data, may be NULL */
+};
+
+/* Callback-specific data for qdisc_exist_cb(). */
+struct qdisc_custom_arg {
+	uint32_t handle; /* handle of the QDISC being looked for */
+	uint32_t parent; /* parent of the QDISC being looked for */
+	uint8_t exists; /* incremented for each matching QDISC found */
+};
+
+/**
+ * Prepare the headers of a netlink TC message.
+ *
+ * Only nlmsg_len, nlmsg_type and nlmsg_flags of the netlink header, and
+ * tcm_family and tcm_ifindex of the TC header are written; every other field
+ * of msg is left untouched.
+ *
+ * @param[in, out] msg
+ *   The netlink message to fill.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the rule will be applied.
+ * @param[in] type
+ *   The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.).
+ * @param[in] flags
+ *   Netlink flags for this msg. When 0, NLM_F_REQUEST | NLM_F_ACK is used.
+ */
+void
+tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags)
+{
+	struct nlmsghdr *hdr = &msg->nh;
+	struct tcmsg *tc = &msg->t;
+
+	hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	hdr->nlmsg_type = type;
+	hdr->nlmsg_flags = flags ? flags : (NLM_F_REQUEST | NLM_F_ACK);
+	tc->tcm_family = AF_UNSPEC;
+	tc->tcm_ifindex = ifindex;
+}
+
+/**
+ * Delete a specific QDISC identified by its iface, and its handle and parent.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication. When 0, a
+ *   temporary socket is created for the request and closed before returning.
+ * @param[in] ifindex
+ *   The netdevice ifindex on which the deletion will happen.
+ * @param[in] qinfo
+ *   Additional info to identify the QDISC (handle and parent).
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+static int
+qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo)
+{
+	struct nlmsg msg;
+	int fd = 0;
+	int ret = -1;
+
+	tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0);
+	msg.t.tcm_handle = qinfo->handle;
+	msg.t.tcm_parent = qinfo->parent;
+	/* if no netlink socket is provided, create one */
+	if (!nlsk_fd) {
+		fd = nl_init();
+		if (fd < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not delete QDISC: null netlink socket\n");
+			return -1;
+		}
+	} else {
+		fd = nlsk_fd;
+	}
+	if (nl_send(fd, &msg.nh) < 0)
+		goto out;
+	if (nl_recv_ack(fd) < 0)
+		goto out;
+	ret = 0;
+out:
+	/*
+	 * A socket created here must be closed on every path, failures
+	 * included; a caller-provided socket is left open.
+	 */
+	if (!nlsk_fd && nl_final(fd) < 0)
+		ret = -1;
+	return ret;
+}
+
+/**
+ * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle.
+ *
+ * The QDISC is requested at the root of the netdevice, with NLM_F_EXCL set.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Zeroed: the options must not carry stack garbage to the kernel. */
+	struct tc_multiq_qopt opt = {0};
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	msg.t.tcm_parent = TC_H_ROOT;
+	/* sizeof() on the literal includes the terminating NUL byte. */
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq");
+	nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Request creation of the ingress QDISC, using the default ffff: handle.
+ *
+ * The request carries NLM_F_EXCL.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the QDISC will be added.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	struct nlmsg req;
+
+	tc_init_msg(&req, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	req.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	req.t.tcm_parent = TC_H_INGRESS;
+	nlattr_add(&req.nh, TCA_KIND, sizeof("ingress"), "ingress");
+	/* The ACK is only awaited when the request could be sent. */
+	if (nl_send(nlsk_fd, &req.nh) < 0 || nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Callback function to check for QDISC existence.
+ * A QDISC matches when its ifindex, handle and parent are all equal to the
+ * requested ones; each match increments "exists" in the custom arg.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in, out] arg
+ *   Custom arguments for the callback (a struct list_args whose custom_arg
+ *   is a struct qdisc_custom_arg).
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_exist_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *largs = (struct list_args *)arg;
+	struct qdisc_custom_arg *wanted = largs->custom_arg;
+	struct tcmsg *tc = NLMSG_DATA(nh);
+
+	if (largs->ifindex == (unsigned int)tc->tcm_ifindex &&
+	    tc->tcm_handle == wanted->handle &&
+	    tc->tcm_parent == wanted->parent)
+		wanted->exists++;
+	return 0;
+}
+
+/**
+ * Callback function to delete a QDISC.
+ *
+ * QDISCs belonging to another netdevice than the requested one are skipped.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in] arg
+ *   Custom arguments for the callback (a struct list_args).
+ *
+ * @return
+ *   0 if the QDISC was skipped or deleted, -1 if its deletion failed.
+ */
+static int
+qdisc_del_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *largs = arg;
+	struct tcmsg *tc = NLMSG_DATA(nh);
+	struct qdisc target;
+
+	/* filter out other ifaces' qdiscs */
+	if (largs->ifindex != (unsigned int)tc->tcm_ifindex)
+		return 0;
+	target.handle = tc->tcm_handle;
+	target.parent = tc->tcm_parent;
+	/*
+	 * Use another nlsk_fd (0) to avoid tampering with the current list
+	 * iteration.
+	 */
+	return qdisc_del(0, largs->ifindex, &target);
+}
+
+/**
+ * Dump the QDISCs and run callback() on each netlink message of the answer.
+ *
+ * The callback receives a struct list_args carrying nlsk_fd, ifindex and the
+ * caller-supplied arg (as custom_arg).
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ * @param[in] callback
+ *   The function to call for each QDISC.
+ * @param[in, out] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   -1 if either sending the netlink message failed, or if receiving the answer
+ *   failed, or finally if the callback returned a negative value for that
+ *   answer.
+ *   0 is returned otherwise.
+ */
+static int
+qdisc_iterate(int nlsk_fd, uint16_t ifindex,
+	      int (*callback)(struct nlmsghdr *, void *), void *arg)
+{
+	struct list_args cb_args = {
+		.nlsk_fd = nlsk_fd,
+		.ifindex = ifindex,
+		.custom_arg = arg,
+	};
+	struct nlmsg req;
+
+	tc_init_msg(&req, ifindex, RTM_GETQDISC, NLM_F_REQUEST | NLM_F_DUMP);
+	/* The answer is only read when the request could be sent. */
+	if (nl_send(nlsk_fd, &req.nh) < 0 ||
+	    nl_recv(nlsk_fd, callback, &cb_args) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Check whether a given QDISC already exists for the netdevice.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex to check QDISC existence for.
+ * @param[in] handle
+ *   The handle of the QDISC to look for.
+ * @param[in] parent
+ *   The parent of the QDISC to look for.
+ *
+ * @return
+ *   1 if the qdisc exists, 0 otherwise (a failed lookup is not reported
+ *   separately).
+ */
+int
+qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle, uint32_t parent)
+{
+	struct qdisc_custom_arg lookup = {
+		.handle = handle,
+		.parent = parent,
+		.exists = 0,
+	};
+
+	/* The iteration result is deliberately not looked at. */
+	qdisc_iterate(nlsk_fd, ifindex, qdisc_exist_cb, &lookup);
+	return lookup.exists ? 1 : 0;
+}
+
+/**
+ * Delete all QDISCs for a given netdevice.
+ *
+ * The QDISCs are listed with qdisc_iterate() and each one found on the
+ * netdevice is handed to qdisc_del_cb().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ *
+ * @return
+ *   -1 if the lookup failed or if a QDISC could not be deleted, 0 otherwise.
+ */
+int
+qdisc_flush(int nlsk_fd, uint16_t ifindex)
+{
+	return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL);
+}
+
+/**
+ * Make sure the multiqueue QDISC is present, adding it when it is missing.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Already there: nothing to do. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0), TC_H_ROOT))
+		return 0;
+	if (qdisc_add_multiq(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add multiq qdisc\n");
+	return -1;
+}
+
+/**
+ * Make sure the ingress QDISC is present, adding it when it is missing.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the ingress QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	/* Already there: nothing to do. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS))
+		return 0;
+	if (qdisc_add_ingress(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add ingress qdisc\n");
+	return -1;
+}
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
new file mode 100644
index 000000000000..a571a56d6964
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -0,0 +1,63 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle,
+		 uint32_t parent);
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v4 4/4] net/tap: add basic flow API patterns and actions
  2017-03-14  8:29   ` [dpdk-dev] [PATCH v4 0/4] net/tap: support flow API Pascal Mazon
                       ` (2 preceding siblings ...)
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-14  8:29     ` Pascal Mazon
  3 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-14  8:29 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Supported flow rules are now mapped to TC rules on the tap netdevice.
The netlink message used for creating the TC rule is stored in struct
rte_flow. That way, by simply changing a metadata field in it, we can
request the rule deletion without further parsing.

Supported items:
- eth: src and dst (with variable masks), and eth_type (0xffff mask).
- vlan: vid, pcp, tpid, but not eid.
- ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
- udp/tcp: src and dst port (0xffff mask).

Supported actions:
- DROP
- QUEUE
- PASSTHRU

It is generally not possible to provide a "last" item. However, if the
"last" item, once masked, is identical to the masked spec, then it is
supported.

Only IPv4/6 and MAC addresses can use a variable mask. All other
items need a full mask (exact match).

Support for VLAN requires kernel headers >= 4.9, checked using
auto-config.sh.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/tap.rst       |  23 ++
 drivers/net/tap/Makefile      |  40 ++
 drivers/net/tap/rte_eth_tap.c |  61 ++-
 drivers/net/tap/rte_eth_tap.h |   3 +
 drivers/net/tap/tap_flow.c    | 919 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/tap/tap_flow.h    |  12 +
 6 files changed, 1043 insertions(+), 15 deletions(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c4f207be3b47..cdb528b5eae4 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -82,6 +82,29 @@ can utilize that stack to handle the network protocols. Plus you would be able
 to address the interface using an IP address assigned to the internal
 interface.
 
+Flow API support
+----------------
+
+The tap PMD supports major flow API pattern items and actions, when running on
+Linux kernels 4.2 or later ("Flower" classifier required). Supported items:
+
+- eth: src and dst (with variable masks), and eth_type (0xffff mask).
+- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
+- udp/tcp: src and dst port (0xffff mask).
+
+Supported actions:
+
+- DROP
+- QUEUE
+- PASSTHRU
+
+It is generally not possible to provide a "last" item. However, if the "last"
+item, once masked, is identical to the masked spec, then it is supported.
+
+Only IPv4/6 and MAC addresses can use a variable mask. All other items need a
+full mask (exact match).
+
 Example
 -------
 
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 4ae2ca6cfbab..a6542dad1d66 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -41,6 +41,7 @@ LIBABIVER := 1
 
 CFLAGS += -O3
 CFLAGS += -I$(SRCDIR)
+CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 
 #
@@ -57,5 +58,44 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mbuf
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mempool
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_ether
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_kvargs
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_hash
 
 include $(RTE_SDK)/mk/rte.lib.mk
+
+# Generate and clean-up tap_autoconf.h.
+
+export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS
+export AUTO_CONFIG_CFLAGS = -Wno-error
+
+ifndef V
+AUTOCONF_OUTPUT := >/dev/null
+endif
+
+tap_autoconf.h.new: FORCE
+
+tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
+	$Q $(RM) -f -- '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_FLOWER \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_UNSPEC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_VLAN_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_VLAN_PRIO \
+		$(AUTOCONF_OUTPUT)
+
+# Create tap_autoconf.h or update it in case it differs from the new one.
+
+tap_autoconf.h: tap_autoconf.h.new
+	$Q [ -f '$@' ] && \
+		cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \
+		mv '$<' '$@'
+
+$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h
+
+clean_tap: FORCE
+	$Q rm -f -- tap_autoconf.h tap_autoconf.h.new
+
+clean: clean_tap
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 78eac9a11ea0..ed2099212e2a 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -42,17 +42,20 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
+#include <sys/utsname.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include <poll.h>
 #include <arpa/inet.h>
-#include <linux/if.h>
+#include <net/if.h>
 #include <linux/if_tun.h>
 #include <linux/if_ether.h>
+#include <linux/version.h>
 #include <fcntl.h>
 
 #include <rte_eth_tap.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -67,6 +70,9 @@
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
 
+#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
+#define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
+
 static struct rte_vdev_driver pmd_tap_drv;
 
 static const char *valid_arguments[] = {
@@ -159,6 +165,28 @@ tun_alloc(struct pmd_internals *pmd, uint16_t qid)
 			goto error;
 		rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
 			   ETHER_ADDR_LEN);
+
+		pmd->if_index = if_nametoindex(pmd->name);
+		if (!pmd->if_index) {
+			RTE_LOG(ERR, PMD,
+				"Could not find ifindex for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (!pmd->flower_support)
+			return fd;
+		if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create ingress qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
 	}
 
 	return fd;
@@ -764,6 +792,24 @@ static const struct eth_dev_ops ops = {
 };
 
 static int
+tap_kernel_support(struct pmd_internals *pmd)
+{
+	struct utsname utsname;
+	int ver[3]; /* major, minor, patch parsed from the uname release */
+
+	if (uname(&utsname) == -1 ||
+	    sscanf(utsname.release, "%d.%d.%d",
+		   &ver[0], &ver[1], &ver[2]) != 3)
+		return 0; /* version unknown: support flags left untouched */
+	if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= FLOWER_KERNEL_VERSION)
+		pmd->flower_support = 1; /* running kernel >= 4.2 */
+	if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >=
+	    FLOWER_VLAN_KERNEL_VERSION)
+		pmd->flower_vlan_support = 1; /* running kernel >= 4.9 */
+	return 1; /* version successfully parsed */
+}
+
+static int
 eth_dev_tap_create(const char *name, char *tap_name)
 {
 	int numa_node = rte_socket_id();
@@ -832,7 +878,15 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	tap_kernel_support(pmd);
+	if (!pmd->flower_support)
+		return 0;
 	LIST_INIT(&pmd->flows);
+	/*
+	 * If no netlink socket can be created, then it will fail when
+	 * creating/destroying flow rules.
+	 */
+	pmd->nlsk_fd = nl_init();
 
 	return 0;
 
@@ -947,7 +1001,10 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	internals = eth_dev->data->dev_private;
-	tap_flow_flush(eth_dev, NULL);
+	if (internals->flower_support && internals->nlsk_fd) {
+		tap_flow_flush(eth_dev, NULL);
+		nl_final(internals->nlsk_fd);
+	}
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index a64116f5a35e..15ae0b980ace 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -68,6 +68,9 @@ struct pmd_internals {
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
 	int ioctl_sock;                   /* socket for ioctl calls */
+	int nlsk_fd;                      /* Netlink socket fd */
+	int flower_support;               /* 1 if kernel supports, else 0 */
+	int flower_vlan_support;          /* 1 if kernel supports, else 0 */
 	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index c32ed382d745..b119c31b0dea 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -33,14 +33,71 @@
 
 #include <sys/queue.h>
 
+#include <rte_byteorder.h>
+#include <rte_jhash.h>
 #include <rte_malloc.h>
 #include <rte_eth_tap.h>
 #include <tap_flow.h>
+#include <tap_autoconf.h>
+#include <tap_tcmsgs.h>
+
+#ifndef HAVE_TC_FLOWER
+/*
+ * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
+ * avoid sending TC messages the kernel cannot understand.
+ */
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
+};
+#endif
+#ifndef HAVE_TC_VLAN_ID
+enum {
+	/* TCA_FLOWER_FLAGS, */
+	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
+	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
+};
+#endif
 
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+	struct nlmsg msg;
+};
+
+struct convert_data {
+	uint16_t eth_type;
+	uint16_t ip_proto;
+	uint8_t vlan;
+	struct rte_flow *flow;
 };
 
+static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
 static int
 tap_flow_validate(struct rte_eth_dev *dev,
 		  const struct rte_flow_attr *attr,
@@ -67,6 +124,752 @@ static const struct rte_flow_ops tap_flow_ops = {
 	.flush = tap_flow_flush,
 };
 
+/* Static initializer for items. */
+#define ITEMS(...) \
+	(const enum rte_flow_item_type []){ \
+		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
+	}
+
+/* Structure to generate a simple graph of layers supported by the NIC. */
+struct tap_flow_items {
+	/* Bit-mask corresponding to what is supported for this item. */
+	const void *mask;
+	const unsigned int mask_sz; /* Bit-mask size in bytes. */
+	/*
+	 * Bit-mask corresponding to the default mask, if none is provided
+	 * along with the item.
+	 */
+	const void *default_mask;
+	/**
+	 * Conversion function from rte_flow to netlink attributes.
+	 *
+	 * @param item
+	 *   rte_flow item to convert.
+	 * @param data
+	 *   Internal structure to store the conversion.
+	 *
+	 * @return
+	 *   0 on success, negative value otherwise.
+	 */
+	int (*convert)(const struct rte_flow_item *item, void *data);
+	/** List of possible following items.  */
+	const enum rte_flow_item_type *const items;
+};
+
+/* Graph of supported items and associated actions. */
+static const struct tap_flow_items tap_flow_items[] = {
+	[RTE_FLOW_ITEM_TYPE_END] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+	},
+	[RTE_FLOW_ITEM_TYPE_ETH] = {
+		.items = ITEMS(
+			RTE_FLOW_ITEM_TYPE_VLAN,
+			RTE_FLOW_ITEM_TYPE_IPV4,
+			RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_eth){
+			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.type = -1,
+		},
+		.mask_sz = sizeof(struct rte_flow_item_eth),
+		.default_mask = &rte_flow_item_eth_mask,
+		.convert = tap_flow_create_eth,
+	},
+	[RTE_FLOW_ITEM_TYPE_VLAN] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
+			       RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_vlan){
+			.tpid = -1,
+			/* DEI matching is not supported */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+			.tci = 0xffef,
+#else
+			.tci = 0xefff,
+#endif
+		},
+		.mask_sz = sizeof(struct rte_flow_item_vlan),
+		.default_mask = &rte_flow_item_vlan_mask,
+		.convert = tap_flow_create_vlan,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV4] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv4){
+			.hdr = {
+				.src_addr = -1,
+				.dst_addr = -1,
+				.next_proto_id = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv4),
+		.default_mask = &rte_flow_item_ipv4_mask,
+		.convert = tap_flow_create_ipv4,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV6] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv6){
+			.hdr = {
+				.src_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.dst_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.proto = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv6),
+		.default_mask = &rte_flow_item_ipv6_mask,
+		.convert = tap_flow_create_ipv6,
+	},
+	[RTE_FLOW_ITEM_TYPE_UDP] = {
+		.mask = &(const struct rte_flow_item_udp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_udp),
+		.default_mask = &rte_flow_item_udp_mask,
+		.convert = tap_flow_create_udp,
+	},
+	[RTE_FLOW_ITEM_TYPE_TCP] = {
+		.mask = &(const struct rte_flow_item_tcp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_tcp),
+		.default_mask = &rte_flow_item_tcp_mask,
+		.convert = tap_flow_create_tcp,
+	},
+};
+
+/**
+ * Make as many checks as possible on an Ethernet item, and if a flow is
+ * provided, fill it with the src/dst MAC keys and the ethertype.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); eth_type is recorded in it.
+ *
+ * @return
+ *   0 on success, -1 if the eth_type mask is neither 0 nor 0xffff.
+ */
+static int
+tap_flow_create_eth(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
+	/* TC does not support eth_type masking. Only accept if exact match. */
+	if (mask->type && mask->type != 0xffff)
+		return -1;
+	if (!spec)
+		return 0;
+	/* store eth_type for consistency if ipv4/6 pattern item comes next */
+	if (spec->type & mask->type)
+		info->eth_type = spec->type;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (spec->type & mask->type)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
+					    (spec->type & mask->type));
+	/* an address is matched only if its mask selects at least one bit */
+	if (!is_zero_ether_addr(&mask->dst)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
+			   &spec->dst.addr_bytes);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST_MASK,
+			   ETHER_ADDR_LEN, &mask->dst.addr_bytes);
+	}
+	if (!is_zero_ether_addr(&mask->src)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
+			   &spec->src.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
+			   &mask->src.addr_bytes);
+	}
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a VLAN item, and if a flow is provided,
+ * fill it with the VLAN priority and VLAN id taken from the masked TCI.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); its vlan flag is set here.
+ *
+ * @return
+ *   0 on success, -1 on a partial tpid mask or a tpid other than 802.1Q.
+ */
+static int
+tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_vlan *spec = item->spec;
+	const struct rte_flow_item_vlan *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
+	/* TC does not support tpid masking. Only accept if exact match. */
+	if (mask->tpid && mask->tpid != 0xffff)
+		return -1;
+	/* Double-tagging not supported. */
+	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+		return -1;
+	info->vlan = 1;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
+#define VLAN_PRIO(tci) ((tci) >> 13)
+#define VLAN_ID(tci) ((tci) & 0xfff)
+	if (!spec)
+		return 0;
+	if (spec->tci) {
+		uint16_t tci = ntohs(spec->tci & mask->tci); /* host order */
+		uint8_t prio = VLAN_PRIO(tci); /* 3-bit value */
+		uint16_t vid = VLAN_ID(tci); /* 12-bit value */
+
+		if (prio)
+			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
+		if (vid)
+			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
+	}
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on an IPv4 item, and if a flow is provided,
+ * fill it with the IPv4 addresses and next protocol found in the item spec.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data) shared with the other layers.
+ *
+ * @return
+ *   0 on success, -1 if a previously stored eth_type is not IPv4.
+ */
+static int
+tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_ipv4 *spec = item->spec;
+	const struct rte_flow_item_ipv4 *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
+	/* check that previous eth type is compatible with ipv4 */
+	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
+		return -1;
+	/* store ip_proto for consistency if udp/tcp pattern item comes next */
+	if (spec)
+		info->ip_proto = spec->hdr.next_proto_id;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (!info->eth_type)
+		info->eth_type = htons(ETH_P_IP);
+	if (!info->vlan)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP));
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_addr) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
+			     spec->hdr.dst_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
+			     mask->hdr.dst_addr);
+	}
+	if (spec->hdr.src_addr) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
+			     spec->hdr.src_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+			     mask->hdr.src_addr);
+	}
+	if (spec->hdr.next_proto_id)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
+			    spec->hdr.next_proto_id);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on an IPv6 item, and if a flow is provided,
+ * fill it with the IPv6 addresses and next protocol found in the item spec.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data) shared with the other layers.
+ *
+ * @return
+ *   0 on success, -1 if a previously stored eth_type is not IPv6.
+ */
+static int
+tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_ipv6 *spec = item->spec;
+	const struct rte_flow_item_ipv6 *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	uint8_t empty_addr[16] = { 0 };
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
+	/* check that previous eth type is compatible with ipv6 */
+	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
+		return -1;
+	/* store ip_proto for consistency if udp/tcp pattern item comes next */
+	if (spec)
+		info->ip_proto = spec->hdr.proto;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (!info->eth_type)
+		info->eth_type = htons(ETH_P_IPV6);
+	if (!info->vlan)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6));
+	if (!spec)
+		return 0;
+	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
+			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
+	}
+	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
+			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
+	}
+	if (spec->hdr.proto)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a UDP item, and if a flow is provided,
+ * fill it with the UDP protocol key and the ports found in the item spec.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data) shared with the other layers.
+ *
+ * @return
+ *   0 on success, -1 if a previously stored ip_proto is not UDP.
+ */
+static int
+tap_flow_create_udp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_udp *spec = item->spec;
+	const struct rte_flow_item_udp *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
+	/* check that previous ip_proto is compatible with udp */
+	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
+		return -1;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_port &&
+	    (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port &&
+	    (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a TCP item, and if a flow is provided,
+ * fill it with the TCP protocol key and the ports found in the item spec.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data) shared with the other layers.
+ *
+ * @return
+ *   0 on success, -1 if a previously stored ip_proto is not TCP.
+ */
+static int
+tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_tcp *spec = item->spec;
+	const struct rte_flow_item_tcp *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
+	/* check that previous ip_proto is compatible with tcp */
+	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
+		return -1;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_port &&
+	    (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port &&
+	    (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Check that spec, last and mask of an item only use supported bits.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param size
+ *   Bit-Mask size in bytes.
+ * @param[in] supported_mask
+ *   Bit-mask covering supported fields to compare with spec, last and mask in
+ *   \item.
+ * @param[in] default_mask
+ *   Bit-mask default mask if none is provided in \item.
+ *
+ * @return
+ *   0 on success, non-zero (-1 or the memcmp() result) otherwise.
+ */
+static int
+tap_flow_item_validate(const struct rte_flow_item *item,
+		       unsigned int size,
+		       const uint8_t *supported_mask,
+		       const uint8_t *default_mask)
+{
+	int ret = 0;
+
+	/* An empty layer is allowed, as long as all fields are NULL */
+	if (!item->spec && (item->mask || item->last))
+		return -1;
+	/* Is the item spec compatible with what the NIC supports? */
+	if (item->spec && !item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->spec;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+		/* Is the default mask compatible with what the NIC supports? */
+		for (i = 0; i < size; i++)
+			if ((default_mask[i] | supported_mask[i]) !=
+			    supported_mask[i])
+				return -1;
+	}
+	/* Is the item last compatible with what the NIC supports? */
+	if (item->last && !item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->last;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+	}
+	/* Is the item mask compatible with what the NIC supports? */
+	if (item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->mask;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+	}
+	/**
+	 * Once masked, are item spec and item last equal?
+	 * TC does not support ranges, so anything else is invalid.
+	 */
+	if (item->spec && item->last) {
+		uint8_t spec[size];
+		uint8_t last[size];
+		const uint8_t *apply = default_mask;
+		unsigned int i;
+
+		if (item->mask)
+			apply = item->mask;
+		for (i = 0; i < size; ++i) {
+			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
+			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
+		}
+		ret = memcmp(spec, last, size);
+	}
+	return ret;
+}
+
+/**
+ * Append a "gact" TC action (used for DROP/PASSTHRU) to the provided flow.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] action
+ *   Appropriate action to be set in the TCA_GACT_PARMS structure.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_gact(struct rte_flow *flow, int action)
+{
+	struct nlmsg *msg = &flow->msg;
+	size_t act_index = 1;
+	struct tc_gact p = {
+		.action = action
+	};
+
+	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
+		return -1;
+	if (nlattr_nested_start(msg, act_index++) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
+	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
+	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
+	nlattr_nested_finish(msg); /* nested act_index */
+	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Append a "skbedit" TC action (used for QUEUE) to the provided flow.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] queue
+ *   Queue id to use, sent as TCA_SKBEDIT_QUEUE_MAPPING.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+{
+	struct nlmsg *msg = &flow->msg;
+	size_t act_index = 1;
+	struct tc_skbedit p = {
+		.action = TC_ACT_PIPE
+	};
+
+	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
+		return -1;
+	if (nlattr_nested_start(msg, act_index++) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
+	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
+	nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
+	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
+	nlattr_nested_finish(msg); /* nested act_index */
+	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Validate a flow supported by TC.
+ * If flow param is not NULL, then also fill the netlink message inside.
+ *
+ * @param pmd
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] flow
+ *   Flow structure to update, or NULL to only run the checks.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_process(struct pmd_internals *pmd,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow *flow)
+{
+	const struct tap_flow_items *cur_item = tap_flow_items;
+	struct convert_data data = {
+		.eth_type = 0,
+		.ip_proto = 0,
+		.flow = flow,
+	};
+	int action = 0; /* Only one action authorized for now */
+
+	if (attr->group > MAX_GROUP) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			NULL, "group value too big: cannot exceed 15");
+		return -rte_errno;
+	}
+	if (attr->priority > MAX_PRIORITY) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			NULL, "priority value too big");
+		return -rte_errno;
+	} else if (flow) {
+		uint16_t group = attr->group << GROUP_SHIFT;
+		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
+		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
+						 flow->msg.t.tcm_info);
+	}
+	if (!attr->ingress) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR,
+				   NULL, "direction should be ingress");
+		return -rte_errno;
+	}
+	/* rte_flow ingress is actually egress as seen in the kernel */
+	if (attr->ingress && flow)
+		flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	if (flow) {
+		/* use flower filter type */
+		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
+		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
+			goto exit_item_not_supported;
+	}
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		const struct tap_flow_items *token = NULL;
+		unsigned int i;
+		int err = 0;
+
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		for (i = 0;
+		     cur_item->items &&
+		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
+		     ++i) {
+			if (cur_item->items[i] == items->type) {
+				token = &tap_flow_items[items->type];
+				break;
+			}
+		}
+		if (!token)
+			goto exit_item_not_supported;
+		cur_item = token;
+		err = tap_flow_item_validate(
+			items, cur_item->mask_sz,
+			(const uint8_t *)cur_item->mask,
+			(const uint8_t *)cur_item->default_mask);
+		if (err)
+			goto exit_item_not_supported;
+		if (cur_item->convert) { /* flow may be NULL: checks only */
+			if (!pmd->flower_vlan_support &&
+			    cur_item->convert == tap_flow_create_vlan)
+				goto exit_item_not_supported;
+			err = cur_item->convert(items, &data);
+			if (err)
+				goto exit_item_not_supported;
+		}
+	}
+	if (flow) {
+		if (pmd->flower_vlan_support && data.vlan) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     htons(ETH_P_8021Q));
+			nlattr_add16(&flow->msg.nh,
+				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+				     data.eth_type ?
+				     data.eth_type : htons(ETH_P_ALL));
+		} else if (data.eth_type) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     data.eth_type);
+		}
+	}
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
+		int err = 0;
+
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
+			continue;
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_SHOT);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_UNSPEC);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
+			const struct rte_flow_action_queue *queue =
+				(const struct rte_flow_action_queue *)
+				actions->conf;
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (!queue || (queue->index >= pmd->nb_queues))
+				goto exit_action_not_supported;
+			if (flow)
+				err = add_action_skbedit(flow, queue->index);
+		} else {
+			goto exit_action_not_supported;
+		}
+		if (err)
+			goto exit_action_not_supported;
+	}
+	if (flow)
+		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
+	return 0;
+exit_item_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+			   items, "item not supported");
+	return -rte_errno;
+exit_action_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "action not supported");
+	return -rte_errno;
+}
+
+
+
 /**
  * Validate a flow.
  *
@@ -74,15 +877,54 @@ static const struct rte_flow_ops tap_flow_ops = {
  * @see rte_flow_ops
  */
 static int
-tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
-		  const struct rte_flow_attr *attr __rte_unused,
-		  const struct rte_flow_item items[] __rte_unused,
-		  const struct rte_flow_action actions[] __rte_unused,
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
 		  struct rte_flow_error *error)
 {
-	return -rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "not implemented yet");
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	return priv_flow_process(pmd, attr, items, actions, error, NULL);
+}
+
+/**
+ * Set a unique handle in a flow.
+ *
+ * The kernel supports TC rules with equal priority, as long as they use the
+ * same matching fields (e.g.: dst mac and ipv4) with different values (and
+ * full mask to ensure no collision is possible).
+ * In those rules, the handle (uint32_t) is the part that would identify
+ * specifically each rule.
+ *
+ * On 32-bit architectures, the handle can simply be the flow's pointer address.
+ * On 64-bit architectures, we rely on jhash(flow) to find a (sufficiently)
+ * unique handle. A handle of 0 is never used; it is replaced by 1.
+ *
+ * @param[in, out] flow
+ *   The flow that needs its handle set.
+ */
+static void
+tap_flow_set_handle(struct rte_flow *flow)
+{
+	uint32_t handle = 0;
+
+#if defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 8
+	handle = rte_jhash(&flow, sizeof(flow), 1);
+#else
+	if (sizeof(flow) == 4) {
+		/* 32-bits arch: uintptr_t avoids a cast to a wider integer */
+		uintptr_t h = (uintptr_t)flow;
+
+		handle = (uint32_t)h;
+	} else {
+		handle = rte_jhash(&flow, sizeof(flow), 1);
+	}
+#endif
+	/* must be at least 1 to avoid letting the kernel choose one for us */
+	if (!handle)
+		handle = 1;
+	flow->msg.t.tcm_handle = handle;
 }
 
 /**
@@ -100,17 +942,46 @@ tap_flow_create(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rte_flow *flow = NULL;
+	struct nlmsg *msg = NULL;
+	int err;
 
-	if (tap_flow_validate(dev, attr, items, actions, error))
-		return NULL;
+	if (!pmd->if_index) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL,
+				   "can't create rule, ifindex not found");
+		goto fail;
+	}
 	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "cannot allocate memory for rte_flow");
-		return NULL;
+		goto fail;
+	}
+	msg = &flow->msg;
+	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+	tap_flow_set_handle(flow);
+	if (priv_flow_process(pmd, attr, items, actions, error, flow))
+		goto fail;
+	err = nl_send(pmd->nlsk_fd, &msg->nh);
+	if (err < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto fail;
+	}
+	err = nl_recv_ack(pmd->nlsk_fd);
+	if (err < 0) {
+		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "overlapping rules");
+		goto fail;
 	}
 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
 	return flow;
+fail:
+	if (flow)
+		rte_free(flow);
+	return NULL;
 }
 
 /**
@@ -120,13 +991,31 @@ tap_flow_create(struct rte_eth_dev *dev,
  * @see rte_flow_ops
  */
 static int
-tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+tap_flow_destroy(struct rte_eth_dev *dev,
 		 struct rte_flow *flow,
-		 struct rte_flow_error *error __rte_unused)
+		 struct rte_flow_error *error)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+	int ret = 0;
+
 	LIST_REMOVE(flow, next);
+	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
+
+	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
+	if (ret < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto end;
+	}
+	ret = nl_recv_ack(pmd->nlsk_fd);
+	if (ret < 0)
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"couldn't receive kernel ack to our request");
+end:
 	rte_free(flow);
-	return 0;
+	return ret;
 }
 
 /**
@@ -170,6 +1059,10 @@ tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 		    enum rte_filter_op filter_op,
 		    void *arg)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	if (!pmd->flower_support)
+		return -ENOTSUP;
 	switch (filter_type) {
 	case RTE_ETH_FILTER_GENERIC:
 		if (filter_op != RTE_ETH_FILTER_GET)
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index 377a9f7b758a..a05e945df523 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -37,6 +37,18 @@
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 
+/**
+ * In TC, priority 0 means we require the kernel to allocate one for us.
+ * In rte_flow, however, we want the priority 0 to be the most important one.
+ * Use an offset to have the most important priority being 1 in TC.
+ */
+#define PRIORITY_OFFSET 1
+#define PRIORITY_MASK (0xfff)
+#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
+#define GROUP_MASK (0xf)
+#define GROUP_SHIFT 12
+#define MAX_GROUP GROUP_MASK
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v4 3/4] net/tap: add netlink back-end for flow API
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-14 14:03       ` Wiles, Keith
  0 siblings, 0 replies; 57+ messages in thread
From: Wiles, Keith @ 2017-03-14 14:03 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 14, 2017, at 4:29 PM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> Each kernel netdevice may have queueing disciplines set for it, which
> determine how to handle the packet (mostly on egress). That's part of
> the TC (Traffic Control) mechanism.
> 
> Through TC, it is possible to set filter rules that match specific
> packets, and act according to what is in the rule. This is a perfect
> candidate to implement the flow API for the tap PMD, as it has an
> associated kernel netdevice automatically.
> 
> Each flow API rule will be translated into its TC counterpart.
> 
> To leverage TC, it is necessary to communicate with the kernel using
> netlink. This patch introduces a library to help that communication.
> 
> Inside netlink.c, functions are generic for any netlink messaging.
> Inside tcmsgs.c, functions are specific to deal with TC rules.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>

Acked-by: Keith Wiles <keith.wiles@intel.com>


Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/4] net/tap: move private elements to external header
  2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-14 14:05       ` Wiles, Keith
  0 siblings, 0 replies; 57+ messages in thread
From: Wiles, Keith @ 2017-03-14 14:05 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 14, 2017, at 4:29 PM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> In the next patch, access to struct pmd_internals will be necessary in
> tap_flow.c to store the flows.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>

Acked-by: Keith Wiles <keith.wiles@intel.com>

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
                     ` (6 preceding siblings ...)
  2017-03-14  8:29   ` [dpdk-dev] [PATCH v4 0/4] net/tap: support flow API Pascal Mazon
@ 2017-03-15 14:54   ` Pascal Mazon
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 1/4] net/tap: move private elements to external header Pascal Mazon
                       ` (4 more replies)
  2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
  8 siblings, 5 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-15 14:54 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

This series adds support for the flow API in the tap PMD.

It enables filtering specific packets incoming on the tap netdevice, to
process only desired ones. Under the hood, it uses kernel TC (traffic
control), which takes place very early in the stack, and supports most
common pattern items and actions defined in the flow API.

This series applies on top of:

  [PATCH 0/6] net/tap: add additional management ops

v2 changes:
  - support compilation on kernels < 4.2 (where flower support appeared)
  - set whitespaces in tap.h
  - remove unnecessary goto

v3 changes:
  - vlan patterns enabled depending on running kernel (4.9+)
  - update doc/guides/nics/tap.rst for Flow API support
  - rebase on top of "net/tap: add additional management ops" series

v4 changes:
  - rebase on top of "net/tap: add additional management ops" series
  - fix a few netlink doxygen comments
  - rename tap.h -> rte_eth_tap.h
  - flush flow rules only when applicable

v5 changes:
  - rebase after Adrien's patches on Tx poll and Rx signaling
  - better spaces for comments in rte_eth_tap.h

Pascal Mazon (4):
  net/tap: move private elements to external header
  net/tap: add preliminary support for rte_flow
  net/tap: add netlink back-end for flow API
  net/tap: add basic flow API patterns and actions

 doc/guides/nics/features/tap.ini |    1 +
 doc/guides/nics/tap.rst          |   23 +
 drivers/net/tap/Makefile         |   44 ++
 drivers/net/tap/rte_eth_tap.c    |  101 ++--
 drivers/net/tap/rte_eth_tap.h    |   80 +++
 drivers/net/tap/tap_flow.c       | 1078 ++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |   58 ++
 drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
 drivers/net/tap/tap_netlink.h    |   69 +++
 drivers/net/tap/tap_tcmsgs.c     |  378 +++++++++++++
 drivers/net/tap/tap_tcmsgs.h     |   63 +++
 11 files changed, 2227 insertions(+), 35 deletions(-)
 create mode 100644 drivers/net/tap/rte_eth_tap.h
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v5 1/4] net/tap: move private elements to external header
  2017-03-15 14:54   ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Pascal Mazon
@ 2017-03-15 14:54     ` Pascal Mazon
  2017-03-21 15:32       ` Wiles, Keith
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
                       ` (3 subsequent siblings)
  4 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-15 14:54 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

In the next patch, access to struct pmd_internals will be necessary in
tap_flow.c to store the flows.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |  1 +
 drivers/net/tap/rte_eth_tap.c | 36 ++-------------------
 drivers/net/tap/rte_eth_tap.h | 75 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 34 deletions(-)
 create mode 100644 drivers/net/tap/rte_eth_tap.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ddf87232d335..fa4658bd1e75 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
 LIBABIVER := 1
 
 CFLAGS += -O3
+CFLAGS += -I$(SRCDIR)
 CFLAGS += $(WERROR_FLAGS)
 
 #
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index f8d9cc7dc3b2..6bb63e5ec873 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -55,6 +55,8 @@
 #include <linux/if_ether.h>
 #include <fcntl.h>
 
+#include <rte_eth_tap.h>
+
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
 #define DEFAULT_TAP_NAME        "dtap"
@@ -87,40 +89,6 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
-struct pkt_stats {
-	uint64_t opackets;		/* Number of output packets */
-	uint64_t ipackets;		/* Number of input packets */
-	uint64_t obytes;		/* Number of bytes on output */
-	uint64_t ibytes;		/* Number of bytes on input */
-	uint64_t errs;			/* Number of error packets */
-};
-
-struct rx_queue {
-	struct rte_mempool *mp;		/* Mempool for RX packets */
-	uint32_t trigger_seen;		/* Last seen Rx trigger value */
-	uint16_t in_port;		/* Port ID */
-	int fd;
-
-	struct pkt_stats stats;		/* Stats for this RX queue */
-};
-
-struct tx_queue {
-	int fd;
-	struct pkt_stats stats;		/* Stats for this TX queue */
-};
-
-struct pmd_internals {
-	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
-	uint16_t nb_queues;		/* Number of queues supported */
-	struct ether_addr eth_addr;	/* Mac address of the device port */
-
-	int if_index;			/* IF_INDEX for the port */
-	int ioctl_sock;			/* socket for ioctl calls */
-
-	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
-	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
-};
-
 static void
 tap_trigger_cb(int sig __rte_unused)
 {
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
new file mode 100644
index 000000000000..aafdef1faa99
--- /dev/null
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -0,0 +1,75 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ETH_TAP_H_
+#define _RTE_ETH_TAP_H_
+
+#include <inttypes.h>
+
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+
+#define RTE_PMD_TAP_MAX_QUEUES 16
+
+struct pkt_stats {
+	uint64_t opackets;              /* Number of output packets */
+	uint64_t ipackets;              /* Number of input packets */
+	uint64_t obytes;                /* Number of bytes on output */
+	uint64_t ibytes;                /* Number of bytes on input */
+	uint64_t errs;                  /* Number of TX error packets */
+};
+
+struct rx_queue {
+	struct rte_mempool *mp;         /* Mempool for RX packets */
+	uint32_t trigger_seen;          /* Last seen Rx trigger value */
+	uint16_t in_port;               /* Port ID */
+	int fd;
+	struct pkt_stats stats;         /* Stats for this RX queue */
+};
+
+struct tx_queue {
+	int fd;
+	struct pkt_stats stats;         /* Stats for this TX queue */
+};
+
+struct pmd_internals {
+	char name[RTE_ETH_NAME_MAX_LEN];  /* Internal Tap device name */
+	uint16_t nb_queues;               /* Number of queues supported */
+	struct ether_addr eth_addr;       /* Mac address of the device port */
+	int if_index;                     /* IF_INDEX for the port */
+	int ioctl_sock;                   /* socket for ioctl calls */
+	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
+	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
+};
+
+#endif /* _RTE_ETH_TAP_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v5 2/4] net/tap: add preliminary support for rte_flow
  2017-03-15 14:54   ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Pascal Mazon
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-15 14:54     ` Pascal Mazon
  2017-03-21 15:35       ` Wiles, Keith
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
                       ` (2 subsequent siblings)
  4 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-15 14:54 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

The flow API provides the ability to classify packets received by a tap
netdevice.

This patch only implements skeleton functions for flow API support, no
patterns are supported yet.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/features/tap.ini |   1 +
 drivers/net/tap/Makefile         |   1 +
 drivers/net/tap/rte_eth_tap.c    |   6 ++
 drivers/net/tap/rte_eth_tap.h    |   2 +
 drivers/net/tap/tap_flow.c       | 185 +++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |  46 ++++++++++
 6 files changed, 241 insertions(+)
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h

diff --git a/doc/guides/nics/features/tap.ini b/doc/guides/nics/features/tap.ini
index a51712dce066..9d73f61cca3b 100644
--- a/doc/guides/nics/features/tap.ini
+++ b/doc/guides/nics/features/tap.ini
@@ -9,6 +9,7 @@ Jumbo frame          = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Basic stats          = Y
+Flow API             = Y
 MTU update           = Y
 Multicast MAC filter = Y
 Speed capabilities   = Y
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index fa4658bd1e75..45c67de8e970 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -47,6 +47,7 @@ CFLAGS += $(WERROR_FLAGS)
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 6bb63e5ec873..9127c739a214 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -56,6 +56,7 @@
 #include <fcntl.h>
 
 #include <rte_eth_tap.h>
+#include <tap_flow.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -482,6 +483,7 @@ tap_dev_close(struct rte_eth_dev *dev __rte_unused)
 	struct pmd_internals *internals = dev->data->dev_private;
 
 	tap_link_set_down(dev);
+	tap_flow_flush(dev, NULL);
 
 	for (i = 0; i < internals->nb_queues; i++) {
 		if (internals->rxq[i].fd != -1)
@@ -806,6 +808,7 @@ static const struct eth_dev_ops ops = {
 	.stats_get              = tap_stats_get,
 	.stats_reset            = tap_stats_reset,
 	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
+	.filter_ctrl            = tap_dev_filter_ctrl,
 };
 
 static int
@@ -877,6 +880,8 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	LIST_INIT(&pmd->flows);
+
 	return 0;
 
 error_exit:
@@ -990,6 +995,7 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	internals = eth_dev->data->dev_private;
+	tap_flow_flush(eth_dev, NULL);
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index aafdef1faa99..bf8226736627 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -34,6 +34,7 @@
 #ifndef _RTE_ETH_TAP_H_
 #define _RTE_ETH_TAP_H_
 
+#include <sys/queue.h>
 #include <inttypes.h>
 
 #include <rte_ethdev.h>
@@ -68,6 +69,7 @@ struct pmd_internals {
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
 	int ioctl_sock;                   /* socket for ioctl calls */
+	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
 };
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
new file mode 100644
index 000000000000..c32ed382d745
--- /dev/null
+++ b/drivers/net/tap/tap_flow.c
@@ -0,0 +1,185 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+
+#include <rte_malloc.h>
+#include <rte_eth_tap.h>
+#include <tap_flow.h>
+
+struct rte_flow {
+	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+};
+
+static int
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error);
+
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error);
+
+static int
+tap_flow_destroy(struct rte_eth_dev *dev,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error);
+
+static const struct rte_flow_ops tap_flow_ops = {
+	.validate = tap_flow_validate,
+	.create = tap_flow_create,
+	.destroy = tap_flow_destroy,
+	.flush = tap_flow_flush,
+};
+
+/**
+ * Validate a flow.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
+		  const struct rte_flow_attr *attr __rte_unused,
+		  const struct rte_flow_item items[] __rte_unused,
+		  const struct rte_flow_action actions[] __rte_unused,
+		  struct rte_flow_error *error)
+{
+	return -rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL, "not implemented yet");
+}
+
+/**
+ * Create a flow.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow = NULL;
+
+	if (tap_flow_validate(dev, attr, items, actions, error))
+		return NULL;
+	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+	if (!flow) {
+		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "cannot allocate memory for rte_flow");
+		return NULL;
+	}
+	LIST_INSERT_HEAD(&pmd->flows, flow, next);
+	return flow;
+}
+
+/**
+ * Destroy a flow.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error __rte_unused)
+{
+	LIST_REMOVE(flow, next);
+	rte_free(flow);
+	return 0;
+}
+
+/**
+ * Destroy all flows.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow;
+
+	while (!LIST_EMPTY(&pmd->flows)) {
+		flow = LIST_FIRST(&pmd->flows);
+		if (tap_flow_destroy(dev, flow, error) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+		    enum rte_filter_type filter_type,
+		    enum rte_filter_op filter_op,
+		    void *arg)
+{
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			return -EINVAL;
+		*(const void **)arg = &tap_flow_ops;
+		return 0;
+	default:
+		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported",
+			(void *)dev, filter_type);
+	}
+	return -EINVAL;
+}
+
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
new file mode 100644
index 000000000000..377a9f7b758a
--- /dev/null
+++ b/drivers/net/tap/tap_flow.h
@@ -0,0 +1,46 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_FLOW_H_
+#define _TAP_FLOW_H_
+
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+
+int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+			enum rte_filter_type filter_type,
+			enum rte_filter_op filter_op,
+			void *arg);
+int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
+
+#endif /* _TAP_FLOW_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v5 3/4] net/tap: add netlink back-end for flow API
  2017-03-15 14:54   ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Pascal Mazon
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 1/4] net/tap: move private elements to external header Pascal Mazon
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
@ 2017-03-15 14:54     ` Pascal Mazon
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
  2017-03-21 15:48     ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Wiles, Keith
  4 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-15 14:54 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

Inside tap_netlink.c, functions are generic for any netlink messaging.
Inside tap_tcmsgs.c, functions are specific to dealing with TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +
 drivers/net/tap/tap_netlink.c | 367 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_netlink.h |  69 ++++++++
 drivers/net/tap/tap_tcmsgs.c  | 378 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_tcmsgs.h  |  63 +++++++
 5 files changed, 879 insertions(+)
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 45c67de8e970..3a33b560d3b5 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/tap_netlink.c b/drivers/net/tap/tap_netlink.c
new file mode 100644
index 000000000000..9710e41a7801
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <tap_netlink.h>
+#include <rte_random.h>
+
+/* Must be quite large to support dumping a huge list of QDISC or filters. */
+#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
+#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */
+#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */
+
+/*
+ * Bookkeeping for one open nested attribute: nlattr_nested_start() pushes an
+ * entry and nlattr_nested_finish() pops it and patches the attribute length.
+ */
+struct nested_tail {
+	struct rtattr *tail; /* start of the nested attribute in the message */
+	struct nested_tail *prev; /* previously opened nested attribute, if any */
+};
+
+/**
+ * Initialize a netlink socket for communicating with the kernel.
+ *
+ * The NETLINK_ROUTE socket is created with SOCK_CLOEXEC, its send and receive
+ * buffers are sized to SNDBUF_SIZE and RCVBUF_SIZE, and it is bound using a
+ * sockaddr_nl in which only nl_family is set.
+ *
+ * @return
+ *   netlink socket file descriptor on success, -1 otherwise. On failure no
+ *   descriptor is left open.
+ */
+int
+nl_init(void)
+{
+	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
+	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
+
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (fd < 0) {
+		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
+		goto error;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
+		goto error;
+	}
+	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
+		goto error;
+	}
+	return fd;
+error:
+	/* Do not leak the descriptor when the socket cannot be set up. */
+	close(fd);
+	return -1;
+}
+
+/**
+ * Release a netlink socket once communication with the kernel is over.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor to close.
+ *
+ * @return
+ *   0 if the descriptor was closed, -1 (after logging errno) otherwise.
+ */
+int
+nl_final(int nlsk_fd)
+{
+	if (close(nlsk_fd) == 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Send a message to the kernel on the netlink socket.
+ *
+ * The header is stamped before sending: nlmsg_pid is set to 0 and nlmsg_seq
+ * receives a random value. nlmsg_len must already hold the message length.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in, out] nh
+ *   The netlink message sent to the kernel.
+ *
+ * @return
+ *   the number of sent bytes on success, -1 otherwise.
+ */
+int
+nl_send(int nlsk_fd, struct nlmsghdr *nh)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
+	struct iovec vec = { .iov_base = nh, .iov_len = nh->nlmsg_len };
+	struct msghdr hdr = {
+		.msg_iov = &vec,
+		.msg_iovlen = 1,
+		.msg_name = &kernel,
+		.msg_namelen = sizeof(kernel),
+	};
+	int sent;
+
+	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+	nh->nlmsg_seq = (uint32_t)rte_rand();
+	sent = sendmsg(nlsk_fd, &hdr, 0);
+	if (sent >= 0)
+		return sent;
+	RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Check that the kernel sends an appropriate ACK in response to an nl_send().
+ *
+ * This is nl_recv() without a callback: the answer is only inspected for an
+ * error report.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 if receiving failed or the kernel reported an error.
+ */
+int
+nl_recv_ack(int nlsk_fd)
+{
+	return nl_recv(nlsk_fd, NULL, NULL);
+}
+
+/**
+ * Receive a message from the kernel on the netlink socket, following an
+ * nl_send().
+ *
+ * Every netlink message found in the received data is handed to the callback,
+ * except NLMSG_DONE messages. For a multipart answer, reading continues until
+ * the NLMSG_DONE message has been seen.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] cb
+ *   The callback function to call for each netlink message received. May be
+ *   NULL, in which case messages are only checked for errors.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, -1 if recvmsg() failed, if the kernel sent an NLMSG_ERROR
+ *   with a non-zero error code, or if the callback returned a negative value.
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int recv_bytes = 0, done = 0, multipart = 0, error = 0;
+
+read:
+	recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+	if (recv_bytes < 0)
+		return -1;
+	/* Walk every netlink message packed in the received buffer. */
+	for (nh = (struct nlmsghdr *)buf;
+	     NLMSG_OK(nh, (unsigned int)recv_bytes);
+	     nh = NLMSG_NEXT(nh, recv_bytes)) {
+		/*
+		 * Multi-part messages and their following DONE message have the
+		 * NLM_F_MULTI flag set. Make note, in order to read the DONE
+		 * message afterwards.
+		 */
+		if (nh->nlmsg_flags & NLM_F_MULTI)
+			multipart = 1;
+		if (nh->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+			/* An error code of 0 is the kernel's acknowledgment. */
+			if (err_data->error == 0)
+				RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n",
+					__func__);
+			else {
+				RTE_LOG(DEBUG, PMD,
+					"%s() error message recvd\n", __func__);
+				error = 1;
+			}
+		}
+		/* The end of multipart message. */
+		if (nh->nlmsg_type == NLMSG_DONE)
+			/* No need to call the callback for a DONE message. */
+			done = 1;
+		else if (cb)
+			/* Note: NLMSG_ERROR messages reach the callback too. */
+			if (cb(nh, arg) < 0)
+				error = 1;
+	}
+	/* A multipart answer may span several reads; fetch the rest. */
+	if (multipart && !done)
+		goto read;
+	if (error)
+		return -1;
+	return 0;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * The attribute is written at the current (aligned) end of the message and
+ * the message length is increased accordingly. No bounds check is done here:
+ * the caller must make sure the buffer holding the message is large enough.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data_len
+ *   The length of the data to append; 0 creates an attribute with no payload.
+ * @param[in] data
+ *   The data to append; may be NULL when data_len is 0.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta;
+
+	rta = (struct rtattr *)NLMSG_TAIL(nh);
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	/*
+	 * memcpy() needs valid pointers even for a zero-length copy, and
+	 * nlattr_nested_start() passes a NULL payload of length 0.
+	 */
+	if (data_len > 0)
+		memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 8-bit value to append.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(uint8_t), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 16-bit value to append; its bytes are copied as stored in memory.
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(uint16_t), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 32-bit value to append; its bytes are copied as stored in memory.
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(uint32_t), &data);
+}
+
+/**
+ * Open a nested netlink attribute.
+ * Each call must be matched later by a call to nlattr_nested_finish().
+ *
+ * An empty attribute of the requested type is appended to the message, and
+ * its position is pushed on the message's nested_tails stack so that its
+ * length can be fixed up when the nested attribute is closed.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ * @param[in] type
+ *   The nested attribute type to append.
+ *
+ * @return
+ *   -1 if the bookkeeping entry could not be allocated, 0 otherwise.
+ */
+int
+nlattr_nested_start(struct nlmsg *msg, uint16_t type)
+{
+	struct nested_tail *entry = rte_zmalloc(NULL, sizeof(*entry), 0);
+
+	if (entry == NULL) {
+		RTE_LOG(ERR, PMD,
+			"Couldn't allocate memory for nested netlink"
+			" attribute\n");
+		return -1;
+	}
+	/* Record where the nested attribute begins before appending it. */
+	entry->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
+	entry->prev = msg->nested_tails;
+	msg->nested_tails = entry;
+	/* Header only for now; the real length is written on finish. */
+	nlattr_add(&msg->nh, type, 0, NULL);
+	return 0;
+}
+
+/**
+ * End a nested netlink attribute.
+ * It follows a call to nlattr_nested_start().
+ * In effect, it will modify the nested attribute length to include every byte
+ * from the nested attribute start, up to here.
+ *
+ * The bookkeeping entry of the attribute is popped from the nested_tails
+ * stack and freed. Calling this function while no nested attribute is open
+ * (nested_tails is NULL) does nothing.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ */
+void
+nlattr_nested_finish(struct nlmsg *msg)
+{
+	struct nested_tail *tail = msg->nested_tails;
+
+	/* Nothing to close: no matching nlattr_nested_start(). */
+	if (!tail)
+		return;
+
+	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;
+
+	/*
+	 * Always pop the entry, including the outermost one, so that
+	 * nested_tails never keeps pointing at the memory freed below.
+	 */
+	msg->nested_tails = tail->prev;
+
+	rte_free(tail);
+}
diff --git a/drivers/net/tap/tap_netlink.h b/drivers/net/tap/tap_netlink.h
new file mode 100644
index 000000000000..52ba8c030dcc
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_NETLINK_H_
+#define _TAP_NETLINK_H_
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <stdio.h>
+
+#include <rte_log.h>
+
+#define NLMSG_BUF 512
+
+/*
+ * A netlink TC message together with the metadata needed to build it.
+ * NOTE(review): attributes appended with nlattr_add() are presumably meant to
+ * land in buf, right after the TC header -- confirm no padding is expected.
+ */
+struct nlmsg {
+	struct nlmsghdr nh; /* netlink header, filled by tc_init_msg()/nl_send() */
+	struct tcmsg t; /* TC header (family, ifindex, handle, parent, ...) */
+	char buf[NLMSG_BUF]; /* room for the attributes */
+	struct nested_tail *nested_tails; /* stack of open nested attributes */
+};
+
+#define NLMSG_TAIL(nlh) (void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len))
+
+int nl_init(void);
+int nl_final(int nlsk_fd);
+int nl_send(int nlsk_fd, struct nlmsghdr *nh);
+int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg);
+int nl_recv_ack(int nlsk_fd);
+void nlattr_add(struct nlmsghdr *nh, unsigned short type,
+		unsigned int data_len, const void *data);
+void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data);
+void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data);
+void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data);
+int nlattr_nested_start(struct nlmsg *msg, uint16_t type);
+void nlattr_nested_finish(struct nlmsg *msg);
+
+#endif /* _TAP_NETLINK_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.c b/drivers/net/tap/tap_tcmsgs.c
new file mode 100644
index 000000000000..9a146d165b08
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.c
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <net/if.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <tap_tcmsgs.h>
+
+/* Identification of a QDISC on a netdevice, as used by qdisc_del(). */
+struct qdisc {
+	uint32_t handle; /* copied to tcm_handle */
+	uint32_t parent; /* copied to tcm_parent */
+};
+
+/* Context built by qdisc_iterate() and handed to each nl_recv() callback. */
+struct list_args {
+	int nlsk_fd; /* netlink socket the dump was requested on */
+	uint16_t ifindex; /* netdevice the caller is interested in */
+	void *custom_arg; /* callback-specific data supplied by the caller */
+};
+
+/* Lookup key and result used by qdisc_exists() / qdisc_exist_cb(). */
+struct qdisc_custom_arg {
+	uint32_t handle; /* handle of the QDISC searched for */
+	uint32_t parent; /* parent of the QDISC searched for */
+	uint8_t exists; /* incremented by qdisc_exist_cb() on each match */
+};
+
+/**
+ * Prepare the netlink and TC headers of a message.
+ *
+ * Only nlmsg_len, nlmsg_type, nlmsg_flags, tcm_family and tcm_ifindex are
+ * written; every other field of the message is left untouched.
+ *
+ * @param[in, out] msg
+ *   The netlink message to fill.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the rule will be applied.
+ * @param[in] type
+ *   The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.).
+ * @param[in] flags
+ *   Netlink flags for this message; 0 selects the default
+ *   NLM_F_REQUEST | NLM_F_ACK.
+ */
+void
+tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags)
+{
+	struct nlmsghdr *hdr = &msg->nh;
+	struct tcmsg *tc = &msg->t;
+
+	hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	hdr->nlmsg_type = type;
+	hdr->nlmsg_flags = flags ? flags : (NLM_F_REQUEST | NLM_F_ACK);
+	tc->tcm_family = AF_UNSPEC;
+	tc->tcm_ifindex = ifindex;
+}
+
+/**
+ * Delete a specific QDISC identified by its iface, and its handle and parent.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication. When 0, a
+ *   temporary netlink socket is created for the request and closed before
+ *   returning, whether the deletion succeeded or not.
+ * @param[in] ifindex
+ *   The netdevice ifindex on whom the deletion will happen.
+ * @param[in] qinfo
+ *   Additional info to identify the QDISC (handle and parent).
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+static int
+qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo)
+{
+	struct nlmsg msg;
+	int fd = nlsk_fd;
+	int ret = -1;
+
+	tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0);
+	msg.t.tcm_handle = qinfo->handle;
+	msg.t.tcm_parent = qinfo->parent;
+	/* if no netlink socket is provided, create one */
+	if (!nlsk_fd) {
+		fd = nl_init();
+		if (fd < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not delete QDISC: null netlink socket\n");
+			return -1;
+		}
+	}
+	if (nl_send(fd, &msg.nh) < 0)
+		goto out;
+	if (nl_recv_ack(fd) < 0)
+		goto out;
+	ret = 0;
+out:
+	/* Close the temporary socket on every path, not only on success. */
+	if (!nlsk_fd && nl_final(fd) < 0)
+		ret = -1;
+	return ret;
+}
+
+/**
+ * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle.
+ *
+ * The QDISC is requested as root of the netdevice, with NLM_F_EXCL so that
+ * the request fails if it already exists. Its options are sent zeroed.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Zeroed: never hand indeterminate stack content to the kernel. */
+	struct tc_multiq_qopt opt = {0};
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	msg.t.tcm_parent = TC_H_ROOT;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq");
+	nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Add the ingress QDISC with default ffff: handle.
+ *
+ * The request carries NLM_F_EXCL, so it fails if the QDISC already exists.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the QDISC will be added.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	static const char kind[] = "ingress";
+	const uint16_t flags =
+		NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC, flags);
+	msg.t.tcm_parent = TC_H_INGRESS;
+	msg.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	/* The kind is sent with its terminating NUL byte. */
+	nlattr_add(&msg.nh, TCA_KIND, sizeof(kind), kind);
+	if (nl_send(nlsk_fd, &msg.nh) < 0 || nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * nl_recv() callback checking for the existence of a QDISC.
+ * A message counts as a match when it concerns the requested netdevice and
+ * carries the handle and parent stored in the custom arg; "exists" is then
+ * incremented in the custom arg.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in, out] arg
+ *   The struct list_args whose custom_arg is a struct qdisc_custom_arg.
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_exist_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *args = arg;
+	struct qdisc_custom_arg *wanted = args->custom_arg;
+	const struct tcmsg *tc = NLMSG_DATA(nh);
+
+	if (args->ifindex == (unsigned int)tc->tcm_ifindex &&
+	    tc->tcm_handle == wanted->handle &&
+	    tc->tcm_parent == wanted->parent)
+		wanted->exists++;
+	return 0;
+}
+
+/**
+ * nl_recv() callback deleting the QDISC described by a netlink message.
+ * Messages about other netdevices than the requested one are skipped.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in] arg
+ *   The struct list_args holding the requested ifindex.
+ *
+ * @return
+ *   0 if the message was skipped or the QDISC deleted, -1 if the deletion
+ *   failed.
+ */
+static int
+qdisc_del_cb(struct nlmsghdr *nh, void *arg)
+{
+	const struct tcmsg *tc = NLMSG_DATA(nh);
+	const struct list_args *args = arg;
+	struct qdisc victim;
+
+	/* filter out other ifaces' qdiscs */
+	if (args->ifindex != (unsigned int)tc->tcm_ifindex)
+		return 0;
+	victim.handle = tc->tcm_handle;
+	victim.parent = tc->tcm_parent;
+	/*
+	 * Use another nlsk_fd (0) to avoid tampering with the current list
+	 * iteration.
+	 */
+	return qdisc_del(0, args->ifindex, &victim);
+}
+
+/**
+ * Dump the QDISCs and run a callback on every netlink message of the answer.
+ *
+ * The callback receives a struct list_args holding the socket, the ifindex
+ * and the caller's own argument (as custom_arg).
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ * @param[in] callback
+ *   The function to call for each QDISC.
+ * @param[in, out] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   -1 if sending the request failed, if receiving the answer failed, or if
+ *   the callback returned a negative value for a message of that answer.
+ *   0 is returned otherwise.
+ */
+static int
+qdisc_iterate(int nlsk_fd, uint16_t ifindex,
+	      int (*callback)(struct nlmsghdr *, void *), void *arg)
+{
+	struct list_args cb_args = {
+		.custom_arg = arg,
+		.ifindex = ifindex,
+		.nlsk_fd = nlsk_fd,
+	};
+	struct nlmsg request;
+
+	tc_init_msg(&request, ifindex, RTM_GETQDISC,
+		    NLM_F_REQUEST | NLM_F_DUMP);
+	if (nl_send(nlsk_fd, &request.nh) < 0 ||
+	    nl_recv(nlsk_fd, callback, &cb_args) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Check whether a given QDISC already exists for the netdevice.
+ *
+ * A failure of the underlying dump is not reported: the QDISC is then
+ * considered absent unless a match had already been seen.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex to check QDISC existence for.
+ * @param[in] handle
+ *   The handle of the QDISC to look for.
+ * @param[in] parent
+ *   The parent of the QDISC to look for.
+ *
+ * @return
+ *   1 if the qdisc exists, 0 otherwise.
+ */
+int
+qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle, uint32_t parent)
+{
+	struct qdisc_custom_arg lookup = {
+		.exists = 0,
+		.parent = parent,
+		.handle = handle,
+	};
+
+	qdisc_iterate(nlsk_fd, ifindex, qdisc_exist_cb, &lookup);
+	return lookup.exists ? 1 : 0;
+}
+
+/**
+ * Delete all QDISCs for a given netdevice.
+ *
+ * The QDISCs are dumped with qdisc_iterate() and each one belonging to the
+ * netdevice is deleted by qdisc_del_cb().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ *
+ * @return
+ *   -1 if the lookup or one of the deletions failed, 0 otherwise.
+ */
+int
+qdisc_flush(int nlsk_fd, uint16_t ifindex)
+{
+	return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL);
+}
+
+/**
+ * Make sure the multiqueue QDISC is present, adding it when missing.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   0 if the qdisc already exists or has been successfully added,
+ *   -1 otherwise.
+ */
+int
+qdisc_create_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Already there: nothing to do. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0), TC_H_ROOT))
+		return 0;
+	if (qdisc_add_multiq(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add multiq qdisc\n");
+	return -1;
+}
+
+/**
+ * Make sure the ingress QDISC is present, adding it when missing.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the ingress QDISC.
+ *
+ * @return
+ *   0 if the qdisc already exists or has been successfully added,
+ *   -1 otherwise.
+ */
+int
+qdisc_create_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	/* Already there: nothing to do. */
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS))
+		return 0;
+	if (qdisc_add_ingress(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add ingress qdisc\n");
+	return -1;
+}
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
new file mode 100644
index 000000000000..a571a56d6964
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -0,0 +1,63 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle,
+		 uint32_t parent);
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v5 4/4] net/tap: add basic flow API patterns and actions
  2017-03-15 14:54   ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Pascal Mazon
                       ` (2 preceding siblings ...)
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-15 14:54     ` Pascal Mazon
  2017-03-21 17:10       ` Ferruh Yigit
  2017-03-21 15:48     ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Wiles, Keith
  4 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-15 14:54 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Supported flow rules are now mapped to TC rules on the tap netdevice.
The netlink message used for creating the TC rule is stored in struct
rte_flow. That way, by simply changing a metadata field in it, we can
request the rule's deletion without further parsing.

Supported items:
- eth: src and dst (with variable masks), and eth_type (0xffff mask).
- vlan: vid, pcp, tpid, but not eid.
- ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
- udp/tcp: src and dst port (0xffff mask).

Supported actions:
- DROP
- QUEUE
- PASSTHRU

It is generally not possible to provide a "last" item. However, if the
"last" item, once masked, is identical to the masked spec, then it is
supported.

Only IPv4/6 and MAC addresses can use a variable mask. All other
items need a full mask (exact match).

Support for VLAN requires kernel headers >= 4.9, checked using
auto-config-h.sh.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/tap.rst       |  23 ++
 drivers/net/tap/Makefile      |  40 ++
 drivers/net/tap/rte_eth_tap.c |  61 ++-
 drivers/net/tap/rte_eth_tap.h |   3 +
 drivers/net/tap/tap_flow.c    | 919 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/tap/tap_flow.h    |  12 +
 6 files changed, 1043 insertions(+), 15 deletions(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c4f207be3b47..cdb528b5eae4 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -82,6 +82,29 @@ can utilize that stack to handle the network protocols. Plus you would be able
 to address the interface using an IP address assigned to the internal
 interface.
 
+Flow API support
+----------------
+
+The tap PMD supports major flow API pattern items and actions, when running on
+Linux kernel 4.2 or later ("Flower" classifier required). Supported items:
+
+- eth: src and dst (with variable masks), and eth_type (0xffff mask).
+- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
+- udp/tcp: src and dst port (0xffff mask).
+
+Supported actions:
+
+- DROP
+- QUEUE
+- PASSTHRU
+
+It is generally not possible to provide a "last" item. However, if the "last"
+item, once masked, is identical to the masked spec, then it is supported.
+
+Only IPv4/6 and MAC addresses can use a variable mask. All other items need a
+full mask (exact match).
+
 Example
 -------
 
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 3a33b560d3b5..c42a680555e9 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -41,6 +41,7 @@ LIBABIVER := 1
 
 CFLAGS += -O3
 CFLAGS += -I$(SRCDIR)
+CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 
 #
@@ -58,5 +59,44 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mempool
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_ether
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_kvargs
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_net
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_hash
 
 include $(RTE_SDK)/mk/rte.lib.mk
+
+# Generate and clean-up tap_autoconf.h.
+
+export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS
+export AUTO_CONFIG_CFLAGS = -Wno-error
+
+ifndef V
+AUTOCONF_OUTPUT := >/dev/null
+endif
+
+tap_autoconf.h.new: FORCE
+
+tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
+	$Q $(RM) -f -- '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_FLOWER \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_UNSPEC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_VLAN_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_VLAN_PRIO \
+		$(AUTOCONF_OUTPUT)
+
+# Create tap_autoconf.h or update it in case it differs from the new one.
+
+tap_autoconf.h: tap_autoconf.h.new
+	$Q [ -f '$@' ] && \
+		cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \
+		mv '$<' '$@'
+
+$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h
+
+clean_tap: FORCE
+	$Q rm -f -- tap_autoconf.h tap_autoconf.h.new
+
+clean: clean_tap
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 9127c739a214..c711b36c3222 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -44,19 +44,22 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
+#include <sys/utsname.h>
 #include <sys/mman.h>
 #include <errno.h>
 #include <signal.h>
 #include <stdint.h>
 #include <unistd.h>
 #include <arpa/inet.h>
-#include <linux/if.h>
+#include <net/if.h>
 #include <linux/if_tun.h>
 #include <linux/if_ether.h>
+#include <linux/version.h>
 #include <fcntl.h>
 
 #include <rte_eth_tap.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -71,6 +74,9 @@
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
 
+#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
+#define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
+
 static struct rte_vdev_driver pmd_tap_drv;
 
 static const char *valid_arguments[] = {
@@ -209,6 +215,28 @@ tun_alloc(struct pmd_internals *pmd, uint16_t qid)
 			goto error;
 		rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
 			   ETHER_ADDR_LEN);
+
+		pmd->if_index = if_nametoindex(pmd->name);
+		if (!pmd->if_index) {
+			RTE_LOG(ERR, PMD,
+				"Could not find ifindex for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd; /* the tap fd is still returned */
+		}
+		if (!pmd->flower_support) /* no flower: skip qdisc setup */
+			return fd;
+		if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create ingress qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
 	}
 
 	return fd;
@@ -812,6 +840,24 @@ static const struct eth_dev_ops ops = {
 };
 
 static int
+tap_kernel_support(struct pmd_internals *pmd)
+{
+	struct utsname uts;
+	int major, minor, patch, running;
+	/* Parse "major.minor.patch" from the running kernel release string. */
+	if (uname(&uts) == -1)
+		return 0;
+	if (sscanf(uts.release, "%d.%d.%d", &major, &minor, &patch) != 3)
+		return 0;
+	running = KERNEL_VERSION(major, minor, patch);
+	if (running >= FLOWER_KERNEL_VERSION)
+		pmd->flower_support = 1;
+	if (running >= FLOWER_VLAN_KERNEL_VERSION)
+		pmd->flower_vlan_support = 1;
+	return 1; /* release string parsed; flags updated as applicable */
+}
+
+static int
 eth_dev_tap_create(const char *name, char *tap_name)
 {
 	int numa_node = rte_socket_id();
@@ -880,7 +926,15 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	tap_kernel_support(pmd);
+	if (!pmd->flower_support)
+		return 0;
 	LIST_INIT(&pmd->flows);
+	/*
+	 * If no netlink socket can be created, then it will fail when
+	 * creating/destroying flow rules.
+	 */
+	pmd->nlsk_fd = nl_init();
 
 	return 0;
 
@@ -995,7 +1049,10 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	internals = eth_dev->data->dev_private;
-	tap_flow_flush(eth_dev, NULL);
+	if (internals->flower_support && internals->nlsk_fd) {
+		tap_flow_flush(eth_dev, NULL);
+		nl_final(internals->nlsk_fd);
+	}
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index bf8226736627..741ec5350886 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -69,6 +69,9 @@ struct pmd_internals {
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
 	int ioctl_sock;                   /* socket for ioctl calls */
+	int nlsk_fd;                      /* Netlink socket fd */
+	int flower_support;               /* 1 if kernel supports, else 0 */
+	int flower_vlan_support;          /* 1 if kernel supports, else 0 */
 	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index c32ed382d745..b119c31b0dea 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -33,14 +33,71 @@
 
 #include <sys/queue.h>
 
+#include <rte_byteorder.h>
+#include <rte_jhash.h>
 #include <rte_malloc.h>
 #include <rte_eth_tap.h>
 #include <tap_flow.h>
+#include <tap_autoconf.h>
+#include <tap_tcmsgs.h>
+
+#ifndef HAVE_TC_FLOWER
+/*
+ * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
+ * avoid sending TC messages the kernel cannot understand.
+ */
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
+};
+#endif
+#ifndef HAVE_TC_VLAN_ID /* VLAN keys not found at build time */
+enum {
+	/* TCA_FLOWER_FLAGS presumably takes UDP_DST + 1, skipped here. */
+	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
+	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
+};
+#endif
 
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+	struct nlmsg msg; /* Netlink TC message describing this rule */
+};
+
+struct convert_data {
+	uint16_t eth_type; /* EtherType recorded by earlier items, 0 if none */
+	uint16_t ip_proto; /* IP protocol recorded by earlier items, 0 if none */
+	uint8_t vlan; /* Set to 1 once a VLAN item has been converted */
+	struct rte_flow *flow; /* Flow to fill; converters only check if NULL */
 };
 
+static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
 static int
 tap_flow_validate(struct rte_eth_dev *dev,
 		  const struct rte_flow_attr *attr,
@@ -67,6 +124,752 @@ static const struct rte_flow_ops tap_flow_ops = {
 	.flush = tap_flow_flush,
 };
 
+/* Build an anonymous, END-terminated array of item types from the arguments. */
+#define ITEMS(...) \
+	(const enum rte_flow_item_type []){ \
+		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
+	}
+
+/* One node of the graph of pattern layers this PMD can convert to TC. */
+struct tap_flow_items {
+	/* Bit-mask corresponding to what is supported for this item. */
+	const void *mask;
+	const unsigned int mask_sz; /* Bit-mask size in bytes. */
+	/*
+	 * Bit-mask corresponding to the default mask, if none is provided
+	 * along with the item.
+	 */
+	const void *default_mask;
+	/**
+	 * Conversion function from rte_flow to netlink attributes.
+	 *
+	 * @param item
+	 *   rte_flow item to convert.
+	 * @param data
+	 *   Internal structure to store the conversion.
+	 *
+	 * @return
+	 *   0 on success, negative value otherwise.
+	 */
+	int (*convert)(const struct rte_flow_item *item, void *data);
+	/** Item types allowed to follow this one (END-terminated), or NULL. */
+	const enum rte_flow_item_type *const items;
+};
+
+/* Graph of supported pattern items, indexed by rte_flow item type. */
+static const struct tap_flow_items tap_flow_items[] = {
+	[RTE_FLOW_ITEM_TYPE_END] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+	},
+	[RTE_FLOW_ITEM_TYPE_ETH] = {
+		.items = ITEMS(
+			RTE_FLOW_ITEM_TYPE_VLAN,
+			RTE_FLOW_ITEM_TYPE_IPV4,
+			RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_eth){
+			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.type = -1,
+		},
+		.mask_sz = sizeof(struct rte_flow_item_eth),
+		.default_mask = &rte_flow_item_eth_mask,
+		.convert = tap_flow_create_eth,
+	},
+	[RTE_FLOW_ITEM_TYPE_VLAN] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
+			       RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_vlan){
+			.tpid = -1,
+			/* DEI matching is not supported */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+			.tci = 0xffef,
+#else
+			.tci = 0xefff,
+#endif
+		},
+		.mask_sz = sizeof(struct rte_flow_item_vlan),
+		.default_mask = &rte_flow_item_vlan_mask,
+		.convert = tap_flow_create_vlan,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV4] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv4){
+			.hdr = {
+				.src_addr = -1,
+				.dst_addr = -1,
+				.next_proto_id = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv4),
+		.default_mask = &rte_flow_item_ipv4_mask,
+		.convert = tap_flow_create_ipv4,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV6] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv6){
+			.hdr = {
+				.src_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.dst_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.proto = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv6),
+		.default_mask = &rte_flow_item_ipv6_mask,
+		.convert = tap_flow_create_ipv6,
+	},
+	[RTE_FLOW_ITEM_TYPE_UDP] = {
+		.mask = &(const struct rte_flow_item_udp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_udp),
+		.default_mask = &rte_flow_item_udp_mask,
+		.convert = tap_flow_create_udp,
+	},
+	[RTE_FLOW_ITEM_TYPE_TCP] = {
+		.mask = &(const struct rte_flow_item_tcp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_tcp),
+		.default_mask = &rte_flow_item_tcp_mask,
+		.convert = tap_flow_create_tcp,
+	},
+};
+
+/**
+ * Make as many checks as possible on an Ethernet item, and if a flow is
+ * provided, fill it appropriately with Ethernet info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); records the EtherType seen.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_eth(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
+	/* TC does not support eth_type masking. Only accept if exact match. */
+	if (mask->type && mask->type != 0xffff)
+		return -1;
+	if (!spec)
+		return 0;
+	/* store eth_type for consistency if ipv4/6 pattern item comes next */
+	if (spec->type & mask->type)
+		info->eth_type = spec->type;
+	if (!flow) /* nothing to fill: checks only */
+		return 0;
+	msg = &flow->msg;
+	if (spec->type & mask->type)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
+					    (spec->type & mask->type));
+	if (!is_zero_ether_addr(&mask->dst)) { /* gated on mask, like src */
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
+			   &spec->dst.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
+			   &mask->dst.addr_bytes);
+	}
+	if (!is_zero_ether_addr(&mask->src)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
+			   &spec->src.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
+			   &mask->src.addr_bytes);
+	}
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a VLAN item, and if a flow is provided,
+ * fill it appropriately with VLAN info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); its vlan flag is set here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_vlan *spec = item->spec;
+	const struct rte_flow_item_vlan *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
+	/* TC does not support tpid masking. Only accept if exact match. */
+	if (mask->tpid && mask->tpid != 0xffff)
+		return -1;
+	/* Double-tagging not supported. */
+	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+		return -1;
+	info->vlan = 1;
+	if (!flow) /* nothing to fill: checks only */
+		return 0;
+	msg = &flow->msg;
+	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
+#define VLAN_PRIO(tci) ((tci) >> 13)
+#define VLAN_ID(tci) ((tci) & 0xfff)
+	if (!spec)
+		return 0;
+	if (spec->tci) {
+		/* spec and mask are both big-endian: mask first, then swap */
+		uint16_t tci = ntohs(spec->tci & mask->tci);
+		uint16_t prio = VLAN_PRIO(tci);
+		uint16_t vid = VLAN_ID(tci); /* 12 bits: too wide for uint8_t */
+		if (prio)
+			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
+		if (vid)
+			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
+	}
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on an IPv4 item, and if a flow is provided,
+ * fill it appropriately with IPv4 info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); records the IP protocol seen.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_ipv4 *spec = item->spec;
+	const struct rte_flow_item_ipv4 *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
+	/* check that previous eth type is compatible with ipv4 */
+	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
+		return -1;
+	/* store ip_proto for consistency if udp/tcp pattern item comes next */
+	if (spec)
+		info->ip_proto = spec->hdr.next_proto_id;
+	if (!flow) /* nothing to fill: checks only */
+		return 0;
+	msg = &flow->msg;
+	if (!info->eth_type)
+		info->eth_type = htons(ETH_P_IP);
+	if (!info->vlan) /* a VLAN item already set the protocol otherwise */
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP));
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_addr) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
+			     spec->hdr.dst_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
+			     mask->hdr.dst_addr);
+	}
+	if (spec->hdr.src_addr) {
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
+			     spec->hdr.src_addr);
+		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+			     mask->hdr.src_addr);
+	}
+	if (spec->hdr.next_proto_id)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
+			    spec->hdr.next_proto_id);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on an IPv6 item, and if a flow is provided,
+ * fill it appropriately with IPv6 info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); records the IP protocol seen.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_ipv6 *spec = item->spec;
+	const struct rte_flow_item_ipv6 *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	uint8_t empty_addr[16] = { 0 }; /* all-zero address for comparisons */
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
+	/* check that previous eth type is compatible with ipv6 */
+	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
+		return -1;
+	/* store ip_proto for consistency if udp/tcp pattern item comes next */
+	if (spec)
+		info->ip_proto = spec->hdr.proto;
+	if (!flow) /* nothing to fill: checks only */
+		return 0;
+	msg = &flow->msg;
+	if (!info->eth_type)
+		info->eth_type = htons(ETH_P_IPV6);
+	if (!info->vlan) /* a VLAN item already set the protocol otherwise */
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6));
+	if (!spec)
+		return 0;
+	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
+			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
+	}
+	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
+			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
+	}
+	if (spec->hdr.proto)
+		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a UDP item, and if a flow is provided,
+ * fill it appropriately with UDP info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); its ip_proto is checked here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_udp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_udp *spec = item->spec;
+	const struct rte_flow_item_udp *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
+	/* check that previous ip_proto is compatible with udp */
+	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
+		return -1;
+	if (!flow) /* nothing to fill: checks only */
+		return 0;
+	msg = &flow->msg;
+	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_port && /* port added only if fully covered by mask */
+	    (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port &&
+	    (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a TCP item, and if a flow is provided,
+ * fill it appropriately with TCP info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); its ip_proto is checked here.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_tcp *spec = item->spec;
+	const struct rte_flow_item_tcp *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
+	/* check that previous ip_proto is compatible with tcp */
+	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
+		return -1;
+	if (!flow) /* nothing to fill: checks only */
+		return 0;
+	msg = &flow->msg;
+	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
+	if (!spec)
+		return 0;
+	if (spec->hdr.dst_port && /* port added only if fully covered by mask */
+	    (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port &&
+	    (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port)
+		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Check support for a given item.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param size
+ *   Bit-Mask size in bytes.
+ * @param[in] supported_mask
+ *   Bit-mask covering supported fields to compare with spec, last and mask in
+ *   \item.
+ * @param[in] default_mask
+ *   Bit-mask default mask if none is provided in \item.
+ *
+ * @return
+ *   0 on success, non-zero if the item is unsupported or describes a range.
+ */
+static int
+tap_flow_item_validate(const struct rte_flow_item *item,
+		       unsigned int size,
+		       const uint8_t *supported_mask,
+		       const uint8_t *default_mask)
+{
+	int ret = 0;
+
+	/* An empty layer is allowed, as long as all fields are NULL */
+	if (!item->spec && (item->mask || item->last))
+		return -1;
+	/* Is the item spec compatible with what the NIC supports? */
+	if (item->spec && !item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->spec;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+		/* Is the default mask compatible with what the NIC supports? */
+		for (i = 0; i < size; i++)
+			if ((default_mask[i] | supported_mask[i]) !=
+			    supported_mask[i])
+				return -1;
+	}
+	/* Is the item last compatible with what the NIC supports? */
+	if (item->last && !item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->last;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+	}
+	/* Is the item mask compatible with what the NIC supports? */
+	if (item->mask) {
+		unsigned int i;
+		const uint8_t *spec = item->mask;
+
+		for (i = 0; i < size; ++i)
+			if ((spec[i] | supported_mask[i]) != supported_mask[i])
+				return -1;
+	}
+	/**
+	 * Once masked, are item spec and item last equal?
+	 * TC does not support ranges, so anything else is invalid.
+	 */
+	if (item->spec && item->last) {
+		uint8_t spec[size];
+		uint8_t last[size];
+		const uint8_t *apply = default_mask;
+		unsigned int i;
+
+		if (item->mask)
+			apply = item->mask;
+		for (i = 0; i < size; ++i) {
+			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
+			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
+		}
+		ret = memcmp(spec, last, size);
+	}
+	return ret;
+}
+
+/**
+ * Transform a DROP/PASSTHRU action item in the provided flow for TC.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] action
+ *   Appropriate action to be set in the TCA_GACT_PARMS structure.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_gact(struct rte_flow *flow, int action)
+{
+	struct nlmsg *msg = &flow->msg;
+	size_t act_index = 1;
+	struct tc_gact p = {
+		.action = action
+	};
+
+	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
+		return -1;
+	if (nlattr_nested_start(msg, act_index++) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
+	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
+	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
+	nlattr_nested_finish(msg); /* nested act_index */
+	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Transform a QUEUE action item in the provided flow for TC.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] queue
+ *   Queue id to use.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+{
+	struct nlmsg *msg = &flow->msg;
+	size_t act_index = 1;
+	struct tc_skbedit p = {
+		.action = TC_ACT_PIPE
+	};
+
+	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
+		return -1;
+	if (nlattr_nested_start(msg, act_index++) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
+	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
+	nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
+	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
+	nlattr_nested_finish(msg); /* nested act_index */
+	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Validate a flow supported by TC.
+ * If flow param is not NULL, then also fill the netlink message inside.
+ *
+ * @param pmd
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] flow
+ *   Flow structure to update, or NULL to only validate.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_process(struct pmd_internals *pmd,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow *flow)
+{
+	const struct tap_flow_items *cur_item = tap_flow_items;
+	struct convert_data data = {
+		.eth_type = 0,
+		.ip_proto = 0,
+		.flow = flow,
+	};
+	int action = 0; /* Only one action authorized for now */
+
+	if (attr->group > MAX_GROUP) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			NULL, "group value too big: cannot exceed 15");
+		return -rte_errno;
+	}
+	if (attr->priority > MAX_PRIORITY) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			NULL, "priority value too big");
+		return -rte_errno;
+	} else if (flow) {
+		uint16_t group = attr->group << GROUP_SHIFT;
+		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
+		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
+						 flow->msg.t.tcm_info);
+	}
+	if (!attr->ingress) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR,
+				   NULL, "direction should be ingress");
+		return -rte_errno;
+	}
+	/* rte_flow ingress is actually egress as seen in the kernel */
+	if (attr->ingress && flow)
+		flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	if (flow) {
+		/* use flower filter type */
+		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
+		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
+			goto exit_item_not_supported;
+	}
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		const struct tap_flow_items *token = NULL;
+		unsigned int i;
+		int err = 0;
+
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		for (i = 0;
+		     cur_item->items &&
+		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
+		     ++i) {
+			if (cur_item->items[i] == items->type) {
+				token = &tap_flow_items[items->type];
+				break;
+			}
+		}
+		if (!token)
+			goto exit_item_not_supported;
+		cur_item = token;
+		err = tap_flow_item_validate(
+			items, cur_item->mask_sz,
+			(const uint8_t *)cur_item->mask,
+			(const uint8_t *)cur_item->default_mask);
+		if (err)
+			goto exit_item_not_supported;
+		if (cur_item->convert) { /* run even when only validating */
+			if (!pmd->flower_vlan_support &&
+			    cur_item->convert == tap_flow_create_vlan)
+				goto exit_item_not_supported;
+			err = cur_item->convert(items, &data);
+			if (err)
+				goto exit_item_not_supported;
+		}
+	}
+	if (flow) {
+		if (pmd->flower_vlan_support && data.vlan) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     htons(ETH_P_8021Q));
+			nlattr_add16(&flow->msg.nh,
+				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+				     data.eth_type ?
+				     data.eth_type : htons(ETH_P_ALL));
+		} else if (data.eth_type) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     data.eth_type);
+		}
+	}
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
+		int err = 0;
+
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
+			continue;
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_SHOT);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_UNSPEC);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
+			const struct rte_flow_action_queue *queue =
+				(const struct rte_flow_action_queue *)
+				actions->conf;
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (!queue || (queue->index >= pmd->nb_queues))
+				goto exit_action_not_supported;
+			if (flow)
+				err = add_action_skbedit(flow, queue->index);
+		} else {
+			goto exit_action_not_supported;
+		}
+		if (err)
+			goto exit_action_not_supported;
+	}
+	if (flow)
+		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
+	return 0;
+exit_item_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+			   items, "item not supported");
+	return -rte_errno;
+exit_action_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "action not supported");
+	return -rte_errno;
+}
+
+
+
 /**
  * Validate a flow.
  *
@@ -74,15 +877,54 @@ static const struct rte_flow_ops tap_flow_ops = {
  * @see rte_flow_ops
  */
 static int
-tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
-		  const struct rte_flow_attr *attr __rte_unused,
-		  const struct rte_flow_item items[] __rte_unused,
-		  const struct rte_flow_action actions[] __rte_unused,
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
 		  struct rte_flow_error *error)
 {
-	return -rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "not implemented yet");
+	struct pmd_internals *pmd = dev->data->dev_private;
+	/* A NULL flow requests validation only: no netlink message is built. */
+	return priv_flow_process(pmd, attr, items, actions, error, NULL);
+}
+
+/**
+ * Set a unique handle in a flow.
+ *
+ * The kernel supports TC rules with equal priority, as long as they use the
+ * same matching fields (e.g.: dst mac and ipv4) with different values (and
+ * full mask to ensure no collision is possible).
+ * In those rules, the handle (uint32_t) is the part that would identify
+ * specifically each rule.
+ *
+ * On 32-bit architectures, the handle can simply be the flow's pointer address.
+ * On 64-bit architectures, we rely on a jhash of the flow's pointer value to
+ * find a (sufficiently) unique handle.
+ *
+ * @param[in, out] flow
+ *   The flow that needs its handle set.
+ */
+static void
+tap_flow_set_handle(struct rte_flow *flow)
+{
+	uint32_t handle = 0;
+
+#if defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 8
+	handle = rte_jhash(&flow, sizeof(flow), 1);
+#else
+	if (sizeof(flow) == 4) {
+		/* 32-bits arch */
+		uintptr_t h = (uintptr_t)flow;
+
+		handle = (uint32_t)h;
+	} else {
+		handle = rte_jhash(&flow, sizeof(flow), 1);
+	}
+#endif
+	/* must be at least 1 to avoid letting the kernel choose one for us */
+	if (!handle)
+		handle = 1;
+	flow->msg.t.tcm_handle = handle;
 }
 
 /**
@@ -100,17 +942,46 @@ tap_flow_create(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rte_flow *flow = NULL;
+	struct nlmsg *msg = NULL;
+	int err;
 
-	if (tap_flow_validate(dev, attr, items, actions, error))
-		return NULL;
+	if (!pmd->if_index) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL,
+				   "can't create rule, ifindex not found");
+		goto fail;
+	}
 	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "cannot allocate memory for rte_flow");
-		return NULL;
+		goto fail;
+	}
+	msg = &flow->msg;
+	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+	tap_flow_set_handle(flow);
+	if (priv_flow_process(pmd, attr, items, actions, error, flow))
+		goto fail;
+	err = nl_send(pmd->nlsk_fd, &msg->nh);
+	if (err < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto fail;
+	}
+	err = nl_recv_ack(pmd->nlsk_fd);
+	if (err < 0) {
+		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "overlapping rules");
+		goto fail;
 	}
 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
 	return flow;
+fail:
+	if (flow)
+		rte_free(flow);
+	return NULL;
 }
 
 /**
@@ -120,13 +991,31 @@ tap_flow_create(struct rte_eth_dev *dev,
  * @see rte_flow_ops
  */
 static int
-tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+tap_flow_destroy(struct rte_eth_dev *dev,
 		 struct rte_flow *flow,
-		 struct rte_flow_error *error __rte_unused)
+		 struct rte_flow_error *error)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+	int ret = 0;
+
 	LIST_REMOVE(flow, next);
+	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
+
+	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
+	if (ret < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto end;
+	}
+	ret = nl_recv_ack(pmd->nlsk_fd);
+	if (ret < 0)
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"couldn't receive kernel ack to our request");
+end:
 	rte_free(flow);
-	return 0;
+	return ret;
 }
 
 /**
@@ -170,6 +1059,10 @@ tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 		    enum rte_filter_op filter_op,
 		    void *arg)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	if (!pmd->flower_support)
+		return -ENOTSUP;
 	switch (filter_type) {
 	case RTE_ETH_FILTER_GENERIC:
 		if (filter_op != RTE_ETH_FILTER_GET)
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index 377a9f7b758a..a05e945df523 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -37,6 +37,18 @@
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 
+/**
+ * In TC, priority 0 means we require the kernel to allocate one for us.
+ * In rte_flow, however, we want the priority 0 to be the most important one.
+ * Use an offset to have the most important priority being 1 in TC.
+ */
+#define PRIORITY_OFFSET 1
+#define PRIORITY_MASK (0xfff)
+#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
+#define GROUP_MASK (0xf)
+#define GROUP_SHIFT 12
+#define MAX_GROUP GROUP_MASK
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
-- 
2.8.0.rc0

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/4] net/tap: move private elements to external header
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-21 15:32       ` Wiles, Keith
  2017-03-21 16:57         ` Pascal Mazon
  0 siblings, 1 reply; 57+ messages in thread
From: Wiles, Keith @ 2017-03-21 15:32 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 15, 2017, at 9:54 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> In the next patch, access to struct pmd_internals will be necessary in
> tap_flow.c to store the flows.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>
> ---
> drivers/net/tap/Makefile      |  1 +
> drivers/net/tap/rte_eth_tap.c | 36 ++-------------------
> drivers/net/tap/rte_eth_tap.h | 75 +++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 78 insertions(+), 34 deletions(-)
> create mode 100644 drivers/net/tap/rte_eth_tap.h
> 
> diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
> index ddf87232d335..fa4658bd1e75 100644
> --- a/drivers/net/tap/Makefile
> +++ b/drivers/net/tap/Makefile
> @@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
> LIBABIVER := 1
> 
> CFLAGS += -O3
> +CFLAGS += -I$(SRCDIR)
> CFLAGS += $(WERROR_FLAGS)
> 
> #
> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
> index f8d9cc7dc3b2..6bb63e5ec873 100644
> --- a/drivers/net/tap/rte_eth_tap.c
> +++ b/drivers/net/tap/rte_eth_tap.c
> @@ -55,6 +55,8 @@
> #include <linux/if_ether.h>
> #include <fcntl.h>
> 
> +#include <rte_eth_tap.h>
> +
> /* Linux based path to the TUN device */
> #define TUN_TAP_DEV_PATH        "/dev/net/tun"
> #define DEFAULT_TAP_NAME        "dtap"
> @@ -87,40 +89,6 @@ static struct rte_eth_link pmd_link = {
> 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
> };
> 
> -struct pkt_stats {
> -	uint64_t opackets;		/* Number of output packets */
> -	uint64_t ipackets;		/* Number of input packets */
> -	uint64_t obytes;		/* Number of bytes on output */
> -	uint64_t ibytes;		/* Number of bytes on input */
> -	uint64_t errs;			/* Number of error packets */
> -};
> -
> -struct rx_queue {
> -	struct rte_mempool *mp;		/* Mempool for RX packets */
> -	uint32_t trigger_seen;		/* Last seen Rx trigger value */
> -	uint16_t in_port;		/* Port ID */
> -	int fd;
> -
> -	struct pkt_stats stats;		/* Stats for this RX queue */
> -};
> -
> -struct tx_queue {
> -	int fd;
> -	struct pkt_stats stats;		/* Stats for this TX queue */
> -};
> -
> -struct pmd_internals {
> -	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
> -	uint16_t nb_queues;		/* Number of queues supported */
> -	struct ether_addr eth_addr;	/* Mac address of the device port */
> -
> -	int if_index;			/* IF_INDEX for the port */
> -	int ioctl_sock;			/* socket for ioctl calls */
> -
> -	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
> -	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
> -};
> -
> static void
> tap_trigger_cb(int sig __rte_unused)
> {
> diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
> new file mode 100644
> index 000000000000..aafdef1faa99
> --- /dev/null
> +++ b/drivers/net/tap/rte_eth_tap.h
> @@ -0,0 +1,75 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright 2017 6WIND S.A.
> + *   Copyright 2017 Mellanox.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of 6WIND S.A. nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _RTE_ETH_TAP_H_
> +#define _RTE_ETH_TAP_H_
> +
> +#include <inttypes.h>
> +
> +#include <rte_ethdev.h>
> +#include <rte_ether.h>

Just noticed this new header does not have the C++ ifdefs. Create a new patch to fix this problem, unless you need to update this patch series. Just starting my review of this one, sorry was traveling last week.

> +
> +#define RTE_PMD_TAP_MAX_QUEUES 16
> +
> +struct pkt_stats {
> +	uint64_t opackets;              /* Number of output packets */
> +	uint64_t ipackets;              /* Number of input packets */
> +	uint64_t obytes;                /* Number of bytes on output */
> +	uint64_t ibytes;                /* Number of bytes on input */
> +	uint64_t errs;                  /* Number of TX error packets */
> +};
> +
> +struct rx_queue {
> +	struct rte_mempool *mp;         /* Mempool for RX packets */
> +	uint32_t trigger_seen;          /* Last seen Rx trigger value */
> +	uint16_t in_port;               /* Port ID */
> +	int fd;
> +	struct pkt_stats stats;         /* Stats for this RX queue */
> +};
> +
> +struct tx_queue {
> +	int fd;
> +	struct pkt_stats stats;         /* Stats for this TX queue */
> +};
> +
> +struct pmd_internals {
> +	char name[RTE_ETH_NAME_MAX_LEN];  /* Internal Tap device name */
> +	uint16_t nb_queues;               /* Number of queues supported */
> +	struct ether_addr eth_addr;       /* Mac address of the device port */
> +	int if_index;                     /* IF_INDEX for the port */
> +	int ioctl_sock;                   /* socket for ioctl calls */
> +	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
> +	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
> +};
> +
> +#endif /* _RTE_ETH_TAP_H_ */
> -- 
> 2.8.0.rc0
> 

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] net/tap: add preliminary support for rte_flow
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
@ 2017-03-21 15:35       ` Wiles, Keith
  0 siblings, 0 replies; 57+ messages in thread
From: Wiles, Keith @ 2017-03-21 15:35 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 15, 2017, at 9:54 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> The flow API provides the ability to classify packets received by a tap
> netdevice.
> 
> This patch only implements skeleton functions for flow API support, no
> patterns are supported yet.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>
> +#ifndef _TAP_FLOW_H_
> +#define _TAP_FLOW_H_
> +
> +#include <rte_flow.h>
> +#include <rte_flow_driver.h>

Missing the C++ ifdefs.

> +
> +int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
> +			enum rte_filter_type filter_type,
> +			enum rte_filter_op filter_op,
> +			void *arg);
> +int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
> +
> +#endif /* _TAP_FLOW_H_ */
> -- 
> 2.8.0.rc0
> 

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API
  2017-03-15 14:54   ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Pascal Mazon
                       ` (3 preceding siblings ...)
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
@ 2017-03-21 15:48     ` Wiles, Keith
  4 siblings, 0 replies; 57+ messages in thread
From: Wiles, Keith @ 2017-03-21 15:48 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 15, 2017, at 9:54 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> This series add support for the flow API in tap PMD.
> 
> It enables filtering specific packets incoming on the tap netdevice, to
> process only desired ones. Under the hood, it uses kernel TC (traffic
> control), which takes place very early in the stack, and supports most
> common pattern items and actions defined in the flow API.
> 
> This series applies on top of:
> 
>  [PATCH 0/6] net/tap: add additional management ops
> 
> v2 changes:
>  - support compilation on kernels < 4.2 (where flower support appeared)
>  - set whitespaces in tap.h
>  - remove unnecessary goto
> 
> v3 changes:
>  - vlan patterns enabled depending on running kernel (4.9+)
>  - update doc/guides/nics/tap.rst for Flow API support
>  - rebase on top of "net/tap: add additional management ops" series
> 
> v4 changes:
>  - rebase on top of "net/tap: add additional management ops" series
>  - fix a few netlink doxygen comments
>  - rename tap.h -> rte_eth_tap.h
>  - flush flow rules only when applicable
> 
> v5 changes:
>  - rebase after adrien's patches on Tx poll and Rx signaling
>  - better spaces for comments in rte_eth_tap.h
> 
> Pascal Mazon (4):
>  net/tap: move private elements to external header
>  net/tap: add preliminary support for rte_flow
>  net/tap: add netlink back-end for flow API
>  net/tap: add basic flow API patterns and actions
> 
> doc/guides/nics/features/tap.ini |    1 +
> doc/guides/nics/tap.rst          |   23 +
> drivers/net/tap/Makefile         |   44 ++
> drivers/net/tap/rte_eth_tap.c    |  101 ++--
> drivers/net/tap/rte_eth_tap.h    |   80 +++
> drivers/net/tap/tap_flow.c       | 1078 ++++++++++++++++++++++++++++++++++++++
> drivers/net/tap/tap_flow.h       |   58 ++
> drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
> drivers/net/tap/tap_netlink.h    |   69 +++
> drivers/net/tap/tap_tcmsgs.c     |  378 +++++++++++++
> drivers/net/tap/tap_tcmsgs.h     |   63 +++
> 11 files changed, 2227 insertions(+), 35 deletions(-)
> create mode 100644 drivers/net/tap/rte_eth_tap.h
> create mode 100644 drivers/net/tap/tap_flow.c
> create mode 100644 drivers/net/tap/tap_flow.h
> create mode 100644 drivers/net/tap/tap_netlink.c
> create mode 100644 drivers/net/tap/tap_netlink.h
> create mode 100644 drivers/net/tap/tap_tcmsgs.c
> create mode 100644 drivers/net/tap/tap_tcmsgs.h
> 
> -- 
> 2.8.0.rc0
> 

Going to Ack the series, but I am not an expert on flows. The only thing I saw was the missing C++ ifdefs in the headers; if that is required, you can send that update in a new patch instead of reissuing this one, unless you want to.

Acked-by: Keith.Wiles <keith.wiles@intel.com> for the series.

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/4] net/tap: move private elements to external header
  2017-03-21 15:32       ` Wiles, Keith
@ 2017-03-21 16:57         ` Pascal Mazon
  0 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-21 16:57 UTC (permalink / raw)
  To: Wiles, Keith; +Cc: dev

On Tue, 21 Mar 2017 15:32:06 +0000
"Wiles, Keith" <keith.wiles@intel.com> wrote:

> 
> Just noticed this new header does not have the C++ ifdefs. Create a
> new patch to fix this problem, unless you need to update this patch
> series. Just starting my review of this one, sorry was traveling last
> week.
> 
> Regards,
> Keith
> 

Hi Keith,

rte_eth_tap.h header is only presenting functions for local use within
the tap driver. That part of the code is completely internal to the
driver, compiled as pure C, and is not to be used directly by the
(potentially c++) user application linking itself with DPDK.

It's thus normal to keep it standard C without the need for c++ ifdefs.

The other PMDs in drivers/net also don't use the c++ ifdefs in their
headers, by the way.

Best regards,
Pascal

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v5 4/4] net/tap: add basic flow API patterns and actions
  2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
@ 2017-03-21 17:10       ` Ferruh Yigit
  0 siblings, 0 replies; 57+ messages in thread
From: Ferruh Yigit @ 2017-03-21 17:10 UTC (permalink / raw)
  To: Pascal Mazon, keith.wiles; +Cc: dev

On 3/15/2017 2:54 PM, Pascal Mazon wrote:
> Supported flow rules are now mapped to TC rules on the tap netdevice.
> The netlink message used for creating the TC rule is stored in struct
> rte_flow. That way, by simply changing a metadata in it, we can require
> for the rule deletion without further parsing.
> 
> Supported items:
> - eth: src and dst (with variable masks), and eth_type (0xffff mask).
> - vlan: vid, pcp, tpid, but not eid.
> - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
> - udp/tcp: src and dst port (0xffff) mask.
> 
> Supported actions:
> - DROP
> - QUEUE
> - PASSTHRU
> 
> It is generally not possible to provide a "last" item. However, if the
> "last" item, once masked, is identical to the masked spec, then it is
> supported.
> 
> Only IPv4/6 and MAC addresses can use a variable mask. All other
> items need a full mask (exact match).
> 
> Support for VLAN requires kernel headers >= 4.9, checked using
> auto-config.sh.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>

<...>

> +/**
> + * Set a unique handle in a flow.
> + *
> + * The kernel supports TC rules with equal priority, as long as they use the
> + * same matching fields (e.g.: dst mac and ipv4) with different values (and
> + * full mask to ensure no collision is possible).
> + * In those rules, the handle (uint32_t) is the part that would identify
> + * specifically each rule.
> + *
> + * On 32-bit architectures, the handle can simply be the flow's pointer address.
> + * On 64-bit architectures, we rely on jhash(flow) to find a (sufficiently)
> + * unique handle.
> + *
> + * @param[in, out] flow
> + *   The flow that needs its handle set.
> + */
> +static void
> +tap_flow_set_handle(struct rte_flow *flow)
> +{
> +	uint32_t handle = 0;
> +
> +#if !defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 8
> +	handle = rte_jhash(&flow, sizeof(flow), 1);
> +#else
> +	if (sizeof(flow) == 4) {
> +		/* 32-bits arch */
> +		uint64_t h = (uint64_t)flow;

This line is causing build error for i686 target:

.../drivers/net/tap/tap_flow.c: In function ‘tap_flow_set_handle’:
.../drivers/net/tap/tap_flow.c:917:16:
error: cast from pointer to integer of different size
[-Werror=pointer-to-int-cast]
   uint64_t h = (uint64_t)flow;
                ^

> +
> +		handle = (uint32_t)h;
> +	} else {
> +		handle = rte_jhash(&flow, sizeof(flow), 1);
> +	}
> +#endif
> +	/* must be at least 1 to avoid letting the kernel choose one for us */
> +	if (!handle)
> +		handle = 1;
> +	flow->msg.t.tcm_handle = handle;
>  }

<...>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v6 0/4] net/tap: support flow API
  2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
                     ` (7 preceding siblings ...)
  2017-03-15 14:54   ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Pascal Mazon
@ 2017-03-22  9:48   ` Pascal Mazon
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 1/4] net/tap: move private elements to external header Pascal Mazon
                       ` (5 more replies)
  8 siblings, 6 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-22  9:48 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

This series adds support for the flow API in tap PMD.

It enables filtering specific packets incoming on the tap netdevice, to
process only desired ones. Under the hood, it uses kernel TC (traffic
control), which takes place very early in the stack, and supports most
common pattern items and actions defined in the flow API.

v6 changes:
  - fix compilation issue on i686 (wrong cast for rte flow handle)

v5 changes:
  - rebase after adrien's patches on Tx poll and Rx signaling
  - better spaces for comments in rte_eth_tap.h

v4 changes:
  - rebase on top of "net/tap: add additional management ops" series
  - fix a few netlink doxygen comments
  - rename tap.h -> rte_eth_tap.h
  - flush flow rules only when applicable

v3 changes:
  - vlan patterns enabled depending on running kernel (4.9+)
  - update doc/guides/nics/tap.rst for Flow API support
  - rebase on top of "net/tap: add additional management ops" series

v2 changes:
  - support compilation on kernels < 4.2 (where flower support appeared)
  - set whitespaces in tap.h
  - remove unnecessary goto

Pascal Mazon (4):
  net/tap: move private elements to external header
  net/tap: add preliminary support for rte_flow
  net/tap: add netlink back-end for flow API
  net/tap: add basic flow API patterns and actions

 doc/guides/nics/features/tap.ini |    1 +
 doc/guides/nics/tap.rst          |   23 +
 drivers/net/tap/Makefile         |   44 ++
 drivers/net/tap/rte_eth_tap.c    |  101 ++--
 drivers/net/tap/rte_eth_tap.h    |   80 +++
 drivers/net/tap/tap_flow.c       | 1070 ++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |   58 +++
 drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
 drivers/net/tap/tap_netlink.h    |   69 +++
 drivers/net/tap/tap_tcmsgs.c     |  378 ++++++++++++++
 drivers/net/tap/tap_tcmsgs.h     |   63 +++
 11 files changed, 2219 insertions(+), 35 deletions(-)
 create mode 100644 drivers/net/tap/rte_eth_tap.h
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v6 1/4] net/tap: move private elements to external header
  2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
@ 2017-03-22  9:48     ` Pascal Mazon
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
                       ` (4 subsequent siblings)
  5 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-22  9:48 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

In the next patch, access to struct pmd_internals will be necessary in
tap_flow.c to store the flows.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |  1 +
 drivers/net/tap/rte_eth_tap.c | 36 ++-------------------
 drivers/net/tap/rte_eth_tap.h | 75 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 34 deletions(-)
 create mode 100644 drivers/net/tap/rte_eth_tap.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ddf87232d335..fa4658bd1e75 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
 LIBABIVER := 1
 
 CFLAGS += -O3
+CFLAGS += -I$(SRCDIR)
 CFLAGS += $(WERROR_FLAGS)
 
 #
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index f8d9cc7dc3b2..6bb63e5ec873 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -55,6 +55,8 @@
 #include <linux/if_ether.h>
 #include <fcntl.h>
 
+#include <rte_eth_tap.h>
+
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
 #define DEFAULT_TAP_NAME        "dtap"
@@ -87,40 +89,6 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
-struct pkt_stats {
-	uint64_t opackets;		/* Number of output packets */
-	uint64_t ipackets;		/* Number of input packets */
-	uint64_t obytes;		/* Number of bytes on output */
-	uint64_t ibytes;		/* Number of bytes on input */
-	uint64_t errs;			/* Number of error packets */
-};
-
-struct rx_queue {
-	struct rte_mempool *mp;		/* Mempool for RX packets */
-	uint32_t trigger_seen;		/* Last seen Rx trigger value */
-	uint16_t in_port;		/* Port ID */
-	int fd;
-
-	struct pkt_stats stats;		/* Stats for this RX queue */
-};
-
-struct tx_queue {
-	int fd;
-	struct pkt_stats stats;		/* Stats for this TX queue */
-};
-
-struct pmd_internals {
-	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
-	uint16_t nb_queues;		/* Number of queues supported */
-	struct ether_addr eth_addr;	/* Mac address of the device port */
-
-	int if_index;			/* IF_INDEX for the port */
-	int ioctl_sock;			/* socket for ioctl calls */
-
-	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
-	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
-};
-
 static void
 tap_trigger_cb(int sig __rte_unused)
 {
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
new file mode 100644
index 000000000000..aafdef1faa99
--- /dev/null
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -0,0 +1,75 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ETH_TAP_H_
+#define _RTE_ETH_TAP_H_
+
+#include <inttypes.h>
+
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+
+#define RTE_PMD_TAP_MAX_QUEUES 16
+
+struct pkt_stats {
+	uint64_t opackets;              /* Number of output packets */
+	uint64_t ipackets;              /* Number of input packets */
+	uint64_t obytes;                /* Number of bytes on output */
+	uint64_t ibytes;                /* Number of bytes on input */
+	uint64_t errs;                  /* Number of TX error packets */
+};
+
+struct rx_queue {
+	struct rte_mempool *mp;         /* Mempool for RX packets */
+	uint32_t trigger_seen;          /* Last seen Rx trigger value */
+	uint16_t in_port;               /* Port ID */
+	int fd;                         /* Queue file descriptor, -1 if unset */
+	struct pkt_stats stats;         /* Stats for this RX queue */
+};
+
+struct tx_queue {
+	int fd;                         /* Queue file descriptor, -1 if unset */
+	struct pkt_stats stats;         /* Stats for this TX queue */
+};
+
+struct pmd_internals {
+	char name[RTE_ETH_NAME_MAX_LEN];  /* Internal Tap device name */
+	uint16_t nb_queues;               /* Number of queues supported */
+	struct ether_addr eth_addr;       /* Mac address of the device port */
+	int if_index;                     /* IF_INDEX for the port */
+	int ioctl_sock;                   /* socket for ioctl calls */
+	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
+	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
+};
+
+#endif /* _RTE_ETH_TAP_H_ */
-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v6 2/4] net/tap: add preliminary support for rte_flow
  2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-22  9:48     ` Pascal Mazon
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
                       ` (3 subsequent siblings)
  5 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-22  9:48 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

The flow API provides the ability to classify packets received by a tap
netdevice.

This patch only implements skeleton functions for flow API support, no
patterns are supported yet.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/features/tap.ini |   1 +
 drivers/net/tap/Makefile         |   1 +
 drivers/net/tap/rte_eth_tap.c    |   6 ++
 drivers/net/tap/rte_eth_tap.h    |   2 +
 drivers/net/tap/tap_flow.c       | 185 +++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |  46 ++++++++++
 6 files changed, 241 insertions(+)
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h

diff --git a/doc/guides/nics/features/tap.ini b/doc/guides/nics/features/tap.ini
index a51712dce066..9d73f61cca3b 100644
--- a/doc/guides/nics/features/tap.ini
+++ b/doc/guides/nics/features/tap.ini
@@ -9,6 +9,7 @@ Jumbo frame          = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Basic stats          = Y
+Flow API             = Y
 MTU update           = Y
 Multicast MAC filter = Y
 Speed capabilities   = Y
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index fa4658bd1e75..45c67de8e970 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -47,6 +47,7 @@ CFLAGS += $(WERROR_FLAGS)
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 6bb63e5ec873..9127c739a214 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -56,6 +56,7 @@
 #include <fcntl.h>
 
 #include <rte_eth_tap.h>
+#include <tap_flow.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -482,6 +483,7 @@ tap_dev_close(struct rte_eth_dev *dev __rte_unused)
 	struct pmd_internals *internals = dev->data->dev_private;
 
 	tap_link_set_down(dev);
+	tap_flow_flush(dev, NULL);
 
 	for (i = 0; i < internals->nb_queues; i++) {
 		if (internals->rxq[i].fd != -1)
@@ -806,6 +808,7 @@ static const struct eth_dev_ops ops = {
 	.stats_get              = tap_stats_get,
 	.stats_reset            = tap_stats_reset,
 	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
+	.filter_ctrl            = tap_dev_filter_ctrl,
 };
 
 static int
@@ -877,6 +880,8 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	LIST_INIT(&pmd->flows);
+
 	return 0;
 
 error_exit:
@@ -990,6 +995,7 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	internals = eth_dev->data->dev_private;
+	tap_flow_flush(eth_dev, NULL);
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index aafdef1faa99..bf8226736627 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -34,6 +34,7 @@
 #ifndef _RTE_ETH_TAP_H_
 #define _RTE_ETH_TAP_H_
 
+#include <sys/queue.h>
 #include <inttypes.h>
 
 #include <rte_ethdev.h>
@@ -68,6 +69,7 @@ struct pmd_internals {
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
 	int ioctl_sock;                   /* socket for ioctl calls */
+	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
 };
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
new file mode 100644
index 000000000000..c32ed382d745
--- /dev/null
+++ b/drivers/net/tap/tap_flow.c
@@ -0,0 +1,185 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+
+#include <rte_malloc.h>
+#include <rte_eth_tap.h>
+#include <tap_flow.h>
+
+struct rte_flow {
+	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+};
+
+static int
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error);
+
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error);
+
+static int
+tap_flow_destroy(struct rte_eth_dev *dev,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error);
+
+static const struct rte_flow_ops tap_flow_ops = {
+	.validate = tap_flow_validate,
+	.create = tap_flow_create,
+	.destroy = tap_flow_destroy,
+	.flush = tap_flow_flush,
+};
+
+/**
+ * Validate a flow.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
+		  const struct rte_flow_attr *attr __rte_unused,
+		  const struct rte_flow_item items[] __rte_unused,
+		  const struct rte_flow_action actions[] __rte_unused,
+		  struct rte_flow_error *error)
+{
+	return -rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL, "not implemented yet");
+}
+
+/**
+ * Create a flow.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow = NULL;
+
+	if (tap_flow_validate(dev, attr, items, actions, error))
+		return NULL;
+	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+	if (!flow) {
+		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "cannot allocate memory for rte_flow");
+		return NULL;
+	}
+	LIST_INSERT_HEAD(&pmd->flows, flow, next);
+	return flow;
+}
+
+/**
+ * Destroy a flow.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error __rte_unused)
+{
+	LIST_REMOVE(flow, next);
+	rte_free(flow);
+	return 0;
+}
+
+/**
+ * Destroy all flows.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *flow;
+
+	while (!LIST_EMPTY(&pmd->flows)) {
+		flow = LIST_FIRST(&pmd->flows);
+		if (tap_flow_destroy(dev, flow, error) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * Only RTE_ETH_FILTER_GENERIC combined with RTE_ETH_FILTER_GET is handled:
+ * it hands the rte_flow operations of this PMD back through arg.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure. For RTE_ETH_FILTER_GET it is
+ *   written through as a (const void **) and receives &tap_flow_ops.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+		    enum rte_filter_type filter_type,
+		    enum rte_filter_op filter_op,
+		    void *arg)
+{
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			return -EINVAL;
+		*(const void **)arg = &tap_flow_ops;
+		return 0;
+	default:
+		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
+			(void *)dev, filter_type);
+	}
+	return -EINVAL;
+}
+
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
new file mode 100644
index 000000000000..377a9f7b758a
--- /dev/null
+++ b/drivers/net/tap/tap_flow.h
@@ -0,0 +1,46 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_FLOW_H_
+#define _TAP_FLOW_H_
+
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+
+int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+			enum rte_filter_type filter_type,
+			enum rte_filter_op filter_op,
+			void *arg);
+int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
+
+#endif /* _TAP_FLOW_H_ */
-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v6 3/4] net/tap: add netlink back-end for flow API
  2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 1/4] net/tap: move private elements to external header Pascal Mazon
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
@ 2017-03-22  9:48     ` Pascal Mazon
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
                       ` (2 subsequent siblings)
  5 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-22  9:48 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

Inside tap_netlink.c, functions are generic for any netlink messaging.
Inside tap_tcmsgs.c, functions are specific to dealing with TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +
 drivers/net/tap/tap_netlink.c | 367 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_netlink.h |  69 ++++++++
 drivers/net/tap/tap_tcmsgs.c  | 378 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_tcmsgs.h  |  63 +++++++
 5 files changed, 879 insertions(+)
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 45c67de8e970..3a33b560d3b5 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/tap_netlink.c b/drivers/net/tap/tap_netlink.c
new file mode 100644
index 000000000000..9710e41a7801
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <tap_netlink.h>
+#include <rte_random.h>
+
+/* Must be quite large to support dumping a huge list of QDISC or filters. */
+#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
+#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */
+#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */
+
+struct nested_tail {
+	struct rtattr *tail;
+	struct nested_tail *prev;
+};
+
+/**
+ * Initialize a netlink socket for communicating with the kernel.
+ *
+ * The socket is created close-on-exec, its send and receive buffers are sized
+ * to SNDBUF_SIZE and RCVBUF_SIZE, and it is bound to a local netlink address.
+ *
+ * @return
+ *   netlink socket file descriptor on success, -1 otherwise. On failure no
+ *   descriptor is left open.
+ */
+int
+nl_init(void)
+{
+	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
+	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
+
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (fd < 0) {
+		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
+		goto error;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
+		goto error;
+	}
+	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
+		goto error;
+	}
+	return fd;
+
+error:
+	/* Setup failed after socket() succeeded: release the descriptor. */
+	close(fd);
+	return -1;
+}
+
+/**
+ * Clean up a netlink socket once all communication with the kernel is finished.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+int
+nl_final(int nlsk_fd)
+{
+	if (close(nlsk_fd)) {
+		RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
+			strerror(errno), errno);
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * Send a message to the kernel on the netlink socket.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] nh
+ *   The netlink message sent to the kernel.
+ *
+ * @return
+ *   the number of sent bytes on success, -1 otherwise.
+ */
+int
+nl_send(int nlsk_fd, struct nlmsghdr *nh)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa = {
+		.nl_family = AF_NETLINK,
+	};
+	struct iovec iov = {
+		.iov_base = nh,
+		.iov_len = nh->nlmsg_len,
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int send_bytes;
+
+	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+	nh->nlmsg_seq = (uint32_t)rte_rand();
+	send_bytes = sendmsg(nlsk_fd, &msg, 0);
+	if (send_bytes < 0) {
+		RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
+			strerror(errno), errno);
+		return -1;
+	}
+	return send_bytes;
+}
+
+/**
+ * Check that the kernel sends an appropriate ACK in response to an nl_send().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+int
+nl_recv_ack(int nlsk_fd)
+{
+	return nl_recv(nlsk_fd, NULL, NULL);
+}
+
+/**
+ * Receive a message from the kernel on the netlink socket, following an
+ * nl_send().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] cb
+ *   The callback function to call for each netlink message received.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int recv_bytes = 0, done = 0, multipart = 0, error = 0;
+
+read:
+	recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+	if (recv_bytes < 0)
+		return -1;
+	for (nh = (struct nlmsghdr *)buf;
+	     NLMSG_OK(nh, (unsigned int)recv_bytes);
+	     nh = NLMSG_NEXT(nh, recv_bytes)) {
+		/*
+		 * Multi-part messages and their following DONE message have the
+		 * NLM_F_MULTI flag set. Make note, in order to read the DONE
+		 * message afterwards.
+		 */
+		if (nh->nlmsg_flags & NLM_F_MULTI)
+			multipart = 1;
+		if (nh->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+			if (err_data->error == 0)
+				RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n",
+					__func__);
+			else {
+				RTE_LOG(DEBUG, PMD,
+					"%s() error message recvd\n", __func__);
+				error = 1;
+			}
+		}
+		/* The end of multipart message. */
+		if (nh->nlmsg_type == NLMSG_DONE)
+			/* No need to call the callback for a DONE message. */
+			done = 1;
+		else if (cb)
+			if (cb(nh, arg) < 0)
+				error = 1;
+	}
+	if (multipart && !done)
+		goto read;
+	if (error)
+		return -1;
+	return 0;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * The attribute is written at the aligned end of the message and nlmsg_len is
+ * grown accordingly. No bounds check is performed here: the caller must make
+ * sure the storage following nh is large enough.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data_len
+ *   The length of the data to append.
+ * @param[in] data
+ *   The data to append. May be NULL when data_len is 0.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta;
+
+	rta = (struct rtattr *)NLMSG_TAIL(nh);
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	/* memcpy() must not be handed a NULL pointer, even for 0 bytes. */
+	if (data_len)
+		memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(uint8_t), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(uint16_t), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(uint32_t), &data);
+}
+
+/**
+ * Start a nested netlink attribute.
+ * It must be followed later by a call to nlattr_nested_finish().
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ * @param[in] type
+ *   The nested attribute type to append.
+ *
+ * @return
+ *   -1 if adding a nested netlink attribute failed, 0 otherwise.
+ */
+int
+nlattr_nested_start(struct nlmsg *msg, uint16_t type)
+{
+	struct nested_tail *tail;
+
+	tail = rte_zmalloc(NULL, sizeof(struct nested_tail), 0);
+	if (!tail) {
+		RTE_LOG(ERR, PMD,
+			"Couldn't allocate memory for nested netlink"
+			" attribute\n");
+		return -1;
+	}
+
+	tail->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
+
+	nlattr_add(&msg->nh, type, 0, NULL);
+
+	tail->prev = msg->nested_tails;
+
+	msg->nested_tails = tail;
+
+	return 0;
+}
+
+/**
+ * End a nested netlink attribute.
+ * It follows a call to nlattr_nested_start().
+ * In effect, it will modify the nested attribute length to include every byte
+ * from the nested attribute start, up to here, then pop and free the
+ * bookkeeping entry pushed by nlattr_nested_start().
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ */
+void
+nlattr_nested_finish(struct nlmsg *msg)
+{
+	struct nested_tail *tail = msg->nested_tails;
+
+	/* Nothing to close: no nlattr_nested_start() is pending. */
+	if (!tail)
+		return;
+
+	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;
+
+	/*
+	 * Always pop the entry, including when it is the outermost one
+	 * (prev == NULL), so msg never keeps a pointer to freed memory.
+	 */
+	msg->nested_tails = tail->prev;
+
+	rte_free(tail);
+}
diff --git a/drivers/net/tap/tap_netlink.h b/drivers/net/tap/tap_netlink.h
new file mode 100644
index 000000000000..52ba8c030dcc
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_NETLINK_H_
+#define _TAP_NETLINK_H_
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <stdio.h>
+
+#include <rte_log.h>
+
+#define NLMSG_BUF 512
+
+struct nlmsg {
+	struct nlmsghdr nh;
+	struct tcmsg t;
+	char buf[NLMSG_BUF];
+	struct nested_tail *nested_tails;
+};
+
+#define NLMSG_TAIL(nlh) (void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len))
+
+int nl_init(void);
+int nl_final(int nlsk_fd);
+int nl_send(int nlsk_fd, struct nlmsghdr *nh);
+int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg);
+int nl_recv_ack(int nlsk_fd);
+void nlattr_add(struct nlmsghdr *nh, unsigned short type,
+		unsigned int data_len, const void *data);
+void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data);
+void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data);
+void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data);
+int nlattr_nested_start(struct nlmsg *msg, uint16_t type);
+void nlattr_nested_finish(struct nlmsg *msg);
+
+#endif /* _TAP_NETLINK_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.c b/drivers/net/tap/tap_tcmsgs.c
new file mode 100644
index 000000000000..9a146d165b08
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.c
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <net/if.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <tap_tcmsgs.h>
+
+struct qdisc {
+	uint32_t handle;
+	uint32_t parent;
+};
+
+struct list_args {
+	int nlsk_fd;
+	uint16_t ifindex;
+	void *custom_arg;
+};
+
+struct qdisc_custom_arg {
+	uint32_t handle;
+	uint32_t parent;
+	uint8_t exists;
+};
+
+/**
+ * Initialize a netlink message with a TC header.
+ *
+ * The TC header and the nested attribute stack are cleared first, then the
+ * netlink length, type and flags and the TC family and ifindex are filled in.
+ * Any other TC field (handle, parent, info) must be set by the caller after
+ * this call.
+ *
+ * @param[in, out] msg
+ *   The netlink message to fill.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the rule will be applied.
+ * @param[in] type
+ *   The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.).
+ * @param[in] flags
+ *   Overrides the default netlink flags for this msg with those specified.
+ *   When 0, NLM_F_REQUEST | NLM_F_ACK is used.
+ */
+void
+tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags)
+{
+	struct nlmsghdr *n = &msg->nh;
+
+	/*
+	 * Callers in this file pass stack-allocated messages: clear the TC
+	 * header and the nested attribute stack so that no stale bytes are
+	 * sent to the kernel or mistaken for a pending nested attribute.
+	 */
+	memset(&msg->t, 0, sizeof(msg->t));
+	msg->nested_tails = NULL;
+	n->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	n->nlmsg_type = type;
+	if (flags)
+		n->nlmsg_flags = flags;
+	else
+		n->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	msg->t.tcm_family = AF_UNSPEC;
+	msg->t.tcm_ifindex = ifindex;
+}
+
+/**
+ * Delete a specific QDISC identified by its iface, and its handle and parent.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication. When 0, a
+ *   temporary socket is opened for this request and closed before returning.
+ * @param[in] ifindex
+ *   The netdevice ifindex on which the deletion will happen.
+ * @param[in] qinfo
+ *   Additional info to identify the QDISC (handle and parent).
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+static int
+qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo)
+{
+	struct nlmsg msg;
+	int fd = 0;
+	int ret = -1;
+
+	tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0);
+	msg.t.tcm_handle = qinfo->handle;
+	msg.t.tcm_parent = qinfo->parent;
+	/* if no netlink socket is provided, create one */
+	if (!nlsk_fd) {
+		fd = nl_init();
+		if (fd < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not delete QDISC: null netlink socket\n");
+			return -1;
+		}
+	} else {
+		fd = nlsk_fd;
+	}
+	if (nl_send(fd, &msg.nh) < 0)
+		goto out;
+	if (nl_recv_ack(fd) < 0)
+		goto out;
+	ret = 0;
+out:
+	/* Close only a socket opened here, on success and failure alike. */
+	if (!nlsk_fd && nl_final(fd) < 0)
+		ret = -1;
+	return ret;
+}
+
+/**
+ * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle.
+ *
+ * The QDISC is attached at the root (TC_H_ROOT) of the netdevice and is
+ * requested with NLM_F_EXCL, so the request fails if it already exists.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Zeroed so that no uninitialized stack bytes are sent to the kernel. */
+	struct tc_multiq_qopt opt = {0};
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	msg.t.tcm_parent = TC_H_ROOT;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq");
+	nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Add the ingress QDISC with default ffff: handle.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the QDISC will be added.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	msg.t.tcm_parent = TC_H_INGRESS;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("ingress"), "ingress");
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Callback function to check for QDISC existence.
+ * If the QDISC is found to exist, increment "exists" in the custom arg.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_exist_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *args = (struct list_args *)arg;
+	struct qdisc_custom_arg *custom = args->custom_arg;
+	struct tcmsg *t = NLMSG_DATA(nh);
+
+	/* filter by request iface */
+	if (args->ifindex != (unsigned int)t->tcm_ifindex)
+		return 0;
+	if (t->tcm_handle != custom->handle || t->tcm_parent != custom->parent)
+		return 0;
+	custom->exists++;
+	return 0;
+}
+
+/**
+ * Callback function to delete a QDISC.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_del_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct tcmsg *t = NLMSG_DATA(nh);
+	struct list_args *args = arg;
+
+	struct qdisc qinfo = {
+		.handle = t->tcm_handle,
+		.parent = t->tcm_parent,
+	};
+
+	/* filter out other ifaces' qdiscs */
+	if (args->ifindex != (unsigned int)t->tcm_ifindex)
+		return 0;
+	/*
+	 * Use another nlsk_fd (0) to avoid tampering with the current list
+	 * iteration.
+	 */
+	return qdisc_del(0, args->ifindex, &qinfo);
+}
+
+/**
+ * Iterate over all QDISC, and call the callback() function for each.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ * @param[in] callback
+ *   The function to call for each QDISC.
+ * @param[in, out] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   -1 if either sending the netlink message failed, or if receiving the answer
+ *   failed, or finally if the callback returned a negative value for that
+ *   answer.
+ *   0 is returned otherwise.
+ */
+static int
+qdisc_iterate(int nlsk_fd, uint16_t ifindex,
+	      int (*callback)(struct nlmsghdr *, void *), void *arg)
+{
+	struct nlmsg msg;
+	struct list_args args = {
+		.nlsk_fd = nlsk_fd,
+		.ifindex = ifindex,
+		.custom_arg = arg,
+	};
+
+	tc_init_msg(&msg, ifindex, RTM_GETQDISC, NLM_F_REQUEST | NLM_F_DUMP);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv(nlsk_fd, callback, &args) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Check whether a given QDISC already exists for the netdevice.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex to check QDISC existence for.
+ * @param[in] handle
+ *   The handle of the QDISC to look for.
+ * @param[in] parent
+ *   The parent of the QDISC to look for.
+ *
+ * @return
+ *   1 if the qdisc exists, 0 otherwise.
+ */
+int
+qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle, uint32_t parent)
+{
+	struct qdisc_custom_arg arg = {
+		.handle = handle,
+		.parent = parent,
+		.exists = 0,
+	};
+
+	qdisc_iterate(nlsk_fd, ifindex, qdisc_exist_cb, &arg);
+	if (arg.exists)
+		return 1;
+	return 0;
+}
+
+/**
+ * Delete all QDISCs for a given netdevice.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ *
+ * @return
+ *   -1 if the lookup failed, 0 otherwise.
+ */
+int
+qdisc_flush(int nlsk_fd, uint16_t ifindex)
+{
+	return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL);
+}
+
+/**
+ * Create the multiqueue QDISC, only if it does not exist already.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	if (!qdisc_exists(nlsk_fd, ifindex,
+			  TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0), TC_H_ROOT)) {
+		if (qdisc_add_multiq(nlsk_fd, ifindex) < 0) {
+			RTE_LOG(ERR, PMD, "Could not add multiq qdisc\n");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * Create the ingress QDISC, only if it does not exist already.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the ingress QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	if (!qdisc_exists(nlsk_fd, ifindex,
+			  TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS)) {
+		if (qdisc_add_ingress(nlsk_fd, ifindex) < 0) {
+			RTE_LOG(ERR, PMD, "Could not add ingress qdisc\n");
+			return -1;
+		}
+	}
+	return 0;
+}
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
new file mode 100644
index 000000000000..a571a56d6964
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -0,0 +1,63 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle,
+		 uint32_t parent);
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v6 4/4] net/tap: add basic flow API patterns and actions
  2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
                       ` (2 preceding siblings ...)
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-22  9:48     ` Pascal Mazon
  2017-03-22 13:56       ` Ferruh Yigit
  2017-03-22 14:22     ` [dpdk-dev] [PATCH v6 0/4] net/tap: support flow API Wiles, Keith
  2017-03-23  8:33     ` [dpdk-dev] [PATCH v7 " Pascal Mazon
  5 siblings, 1 reply; 57+ messages in thread
From: Pascal Mazon @ 2017-03-22  9:48 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Supported flow rules are now mapped to TC rules on the tap netdevice.
The netlink message used for creating the TC rule is stored in struct
rte_flow. That way, by simply changing a metadata field in it, we can
request the rule's deletion without further parsing.

Supported items:
- eth: src and dst (with variable masks), and eth_type (0xffff mask).
- vlan: vid, pcp, tpid, but not dei.
- ipv4/6: src and dst (with variable masks), and ip_proto (0xff mask).
- udp/tcp: src and dst port (0xffff mask).

Supported actions:
- DROP
- QUEUE
- PASSTHRU

It is generally not possible to provide a "last" item. However, if the
"last" item, once masked, is identical to the masked spec, then it is
supported.

Only IPv4/6 and MAC addresses can use a variable mask. All other
items need a full mask (exact match).

Support for VLAN requires kernel headers >= 4.9, checked using
auto-config.sh.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/tap.rst       |  23 ++
 drivers/net/tap/Makefile      |  40 ++
 drivers/net/tap/rte_eth_tap.c |  61 ++-
 drivers/net/tap/rte_eth_tap.h |   3 +
 drivers/net/tap/tap_flow.c    | 911 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/tap/tap_flow.h    |  12 +
 6 files changed, 1035 insertions(+), 15 deletions(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c4f207be3b47..cdb528b5eae4 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -82,6 +82,29 @@ can utilize that stack to handle the network protocols. Plus you would be able
 to address the interface using an IP address assigned to the internal
 interface.
 
+Flow API support
+----------------
+
+The tap PMD supports major flow API pattern items and actions, when running on
+Linux kernels 4.2 and above ("Flower" classifier required). Supported items:
+
+- eth: src and dst (with variable masks), and eth_type (0xffff mask).
+- vlan: vid, pcp, tpid, but not dei. (requires kernel 4.9)
+- ipv4/6: src and dst (with variable masks), and ip_proto (0xff mask).
+- udp/tcp: src and dst port (0xffff mask).
+
+Supported actions:
+
+- DROP
+- QUEUE
+- PASSTHRU
+
+It is generally not possible to provide a "last" item. However, if the "last"
+item, once masked, is identical to the masked spec, then it is supported.
+
+Only IPv4/6 and MAC addresses can use a variable mask. All other items need a
+full mask (exact match).
+
 Example
 -------
 
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 3a33b560d3b5..c42a680555e9 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -41,6 +41,7 @@ LIBABIVER := 1
 
 CFLAGS += -O3
 CFLAGS += -I$(SRCDIR)
+CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 
 #
@@ -58,5 +59,44 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mempool
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_ether
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_kvargs
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_net
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_hash
 
 include $(RTE_SDK)/mk/rte.lib.mk
+
+# Generate and clean-up tap_autoconf.h.
+
+export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS
+export AUTO_CONFIG_CFLAGS = -Wno-error
+
+ifndef V
+AUTOCONF_OUTPUT := >/dev/null
+endif
+
+tap_autoconf.h.new: FORCE
+
+tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
+	$Q $(RM) -f -- '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_FLOWER \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_UNSPEC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_VLAN_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_VLAN_PRIO \
+		$(AUTOCONF_OUTPUT)
+
+# Create tap_autoconf.h or update it in case it differs from the new one.
+
+tap_autoconf.h: tap_autoconf.h.new
+	$Q [ -f '$@' ] && \
+		cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \
+		mv '$<' '$@'
+
+$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h
+
+clean_tap: FORCE
+	$Q rm -f -- tap_autoconf.h tap_autoconf.h.new
+
+clean: clean_tap
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 9127c739a214..c711b36c3222 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -44,19 +44,22 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
+#include <sys/utsname.h>
 #include <sys/mman.h>
 #include <errno.h>
 #include <signal.h>
 #include <stdint.h>
 #include <unistd.h>
 #include <arpa/inet.h>
-#include <linux/if.h>
+#include <net/if.h>
 #include <linux/if_tun.h>
 #include <linux/if_ether.h>
+#include <linux/version.h>
 #include <fcntl.h>
 
 #include <rte_eth_tap.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -71,6 +74,9 @@
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
 
+#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
+#define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
+
 static struct rte_vdev_driver pmd_tap_drv;
 
 static const char *valid_arguments[] = {
@@ -209,6 +215,28 @@ tun_alloc(struct pmd_internals *pmd, uint16_t qid)
 			goto error;
 		rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
 			   ETHER_ADDR_LEN);
+
+		pmd->if_index = if_nametoindex(pmd->name);
+		if (!pmd->if_index) {
+			RTE_LOG(ERR, PMD,
+				"Could not find ifindex for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (!pmd->flower_support)
+			return fd;
+		if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		/*
+		 * Name the ingress qdisc in the log so this failure can be
+		 * told apart from the multiq one reported just above.
+		 */
+		if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create ingress qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
 	}
 
 	return fd;
@@ -812,6 +840,24 @@ static const struct eth_dev_ops ops = {
 };
 
 static int
+tap_kernel_support(struct pmd_internals *pmd)
+{
+	struct utsname utsname;
+	int ver[3];
+
+	if (uname(&utsname) == -1 ||
+	    sscanf(utsname.release, "%d.%d.%d",
+		   &ver[0], &ver[1], &ver[2]) != 3)
+		return 0;
+	if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= FLOWER_KERNEL_VERSION)
+		pmd->flower_support = 1;
+	if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >=
+	    FLOWER_VLAN_KERNEL_VERSION)
+		pmd->flower_vlan_support = 1;
+	return 1;
+}
+
+static int
 eth_dev_tap_create(const char *name, char *tap_name)
 {
 	int numa_node = rte_socket_id();
@@ -880,7 +926,15 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	tap_kernel_support(pmd);
+	if (!pmd->flower_support)
+		return 0;
 	LIST_INIT(&pmd->flows);
+	/*
+	 * If no netlink socket can be created, then it will fail when
+	 * creating/destroying flow rules.
+	 */
+	pmd->nlsk_fd = nl_init();
 
 	return 0;
 
@@ -995,7 +1049,10 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	internals = eth_dev->data->dev_private;
-	tap_flow_flush(eth_dev, NULL);
+	if (internals->flower_support && internals->nlsk_fd) {
+		tap_flow_flush(eth_dev, NULL);
+		nl_final(internals->nlsk_fd);
+	}
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index bf8226736627..741ec5350886 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -69,6 +69,9 @@ struct pmd_internals {
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
 	int ioctl_sock;                   /* socket for ioctl calls */
+	int nlsk_fd;                      /* Netlink socket fd */
+	int flower_support;               /* 1 if kernel supports, else 0 */
+	int flower_vlan_support;          /* 1 if kernel supports, else 0 */
 	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index c32ed382d745..6adacdc22d4a 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -33,14 +33,71 @@
 
 #include <sys/queue.h>
 
+#include <rte_byteorder.h>
+#include <rte_jhash.h>
 #include <rte_malloc.h>
 #include <rte_eth_tap.h>
 #include <tap_flow.h>
+#include <tap_autoconf.h>
+#include <tap_tcmsgs.h>
+
+#ifndef HAVE_TC_FLOWER
+/*
+ * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
+ * avoid sending TC messages the kernel cannot understand.
+ * HAVE_TC_FLOWER comes from the generated tap_autoconf.h; when it is absent
+ * the flower attribute identifiers are declared here so the file still builds.
+ */
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
+};
+#endif
+#ifndef HAVE_TC_VLAN_ID
+enum {
+	/*
+	 * TCA_FLOWER_FLAGS, -- not declared here; the "+ 2" below skips the
+	 * slot it would occupy right after TCA_FLOWER_KEY_UDP_DST.
+	 */
+	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
+	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
+};
+#endif
 
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+	struct nlmsg msg; /* Netlink message describing the TC rule */
+};
+
+struct convert_data {
+	uint16_t eth_type; /* EtherType seen so far, 0 if none recorded */
+	uint16_t ip_proto; /* IP protocol seen so far, 0 if none recorded */
+	uint8_t vlan; /* Set to 1 once a VLAN item has been processed */
+	struct rte_flow *flow; /* Flow to fill; NULL when only validating */
 };
 
+static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
 static int
 tap_flow_validate(struct rte_eth_dev *dev,
 		  const struct rte_flow_attr *attr,
@@ -67,6 +124,752 @@ static const struct rte_flow_ops tap_flow_ops = {
 	.flush = tap_flow_flush,
 };
 
+/*
+ * Static initializer for items: builds an anonymous array of item types
+ * terminated by RTE_FLOW_ITEM_TYPE_END.
+ */
+#define ITEMS(...) \
+	(const enum rte_flow_item_type []){ \
+		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
+	}
+
+/* Structure to generate a simple graph of layers supported by the tap PMD. */
+struct tap_flow_items {
+	/* Bit-mask corresponding to what is supported for this item. */
+	const void *mask;
+	const unsigned int mask_sz; /* Bit-mask size in bytes. */
+	/*
+	 * Bit-mask corresponding to the default mask, if none is provided
+	 * along with the item.
+	 */
+	const void *default_mask;
+	/**
+	 * Conversion function from rte_flow to netlink attributes.
+	 *
+	 * @param item
+	 *   rte_flow item to convert.
+	 * @param data
+	 *   Internal structure to store the conversion.
+	 *
+	 * @return
+	 *   0 on success, negative value otherwise.
+	 */
+	int (*convert)(const struct rte_flow_item *item, void *data);
+	/** List of possible following items, as built by ITEMS().  */
+	const enum rte_flow_item_type *const items;
+};
+
+/*
+ * Graph of supported items, indexed by item type. Each entry gives the
+ * supported bit-mask, the default mask, the conversion callback and the item
+ * types allowed to follow. The RTE_FLOW_ITEM_TYPE_END entry is the starting
+ * point: a pattern must begin with an Ethernet item.
+ */
+static const struct tap_flow_items tap_flow_items[] = {
+	[RTE_FLOW_ITEM_TYPE_END] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+	},
+	[RTE_FLOW_ITEM_TYPE_ETH] = {
+		.items = ITEMS(
+			RTE_FLOW_ITEM_TYPE_VLAN,
+			RTE_FLOW_ITEM_TYPE_IPV4,
+			RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_eth){
+			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.type = -1,
+		},
+		.mask_sz = sizeof(struct rte_flow_item_eth),
+		.default_mask = &rte_flow_item_eth_mask,
+		.convert = tap_flow_create_eth,
+	},
+	[RTE_FLOW_ITEM_TYPE_VLAN] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
+			       RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_vlan){
+			.tpid = -1,
+			/*
+			 * DEI matching is not supported: the mask below is
+			 * 0xefff in big-endian order, i.e. the TCI without
+			 * bit 12.
+			 */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+			.tci = 0xffef,
+#else
+			.tci = 0xefff,
+#endif
+		},
+		.mask_sz = sizeof(struct rte_flow_item_vlan),
+		.default_mask = &rte_flow_item_vlan_mask,
+		.convert = tap_flow_create_vlan,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV4] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv4){
+			.hdr = {
+				.src_addr = -1,
+				.dst_addr = -1,
+				.next_proto_id = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv4),
+		.default_mask = &rte_flow_item_ipv4_mask,
+		.convert = tap_flow_create_ipv4,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV6] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv6){
+			.hdr = {
+				.src_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.dst_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.proto = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv6),
+		.default_mask = &rte_flow_item_ipv6_mask,
+		.convert = tap_flow_create_ipv6,
+	},
+	[RTE_FLOW_ITEM_TYPE_UDP] = {
+		.mask = &(const struct rte_flow_item_udp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_udp),
+		.default_mask = &rte_flow_item_udp_mask,
+		.convert = tap_flow_create_udp,
+	},
+	[RTE_FLOW_ITEM_TYPE_TCP] = {
+		.mask = &(const struct rte_flow_item_tcp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_tcp),
+		.default_mask = &rte_flow_item_tcp_mask,
+		.convert = tap_flow_create_tcp,
+	},
+};
+
+/**
+ * Make as many checks as possible on an Ethernet item, and if a flow is
+ * provided, fill it appropriately with Ethernet info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); the matched eth_type is stored
+ *   in it so that following layers can check their consistency with it.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise (partial eth_type mask).
+ */
+static int
+tap_flow_create_eth(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
+	/* TC does not support eth_type masking. Only accept if exact match. */
+	if (mask->type && mask->type != 0xffff)
+		return -1;
+	if (!spec)
+		return 0;
+	/* store eth_type for consistency if ipv4/6 pattern item comes next */
+	if (spec->type & mask->type)
+		info->eth_type = spec->type;
+	/* No flow to fill: this was a validation-only pass. */
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (spec->type & mask->type)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
+					    (spec->type & mask->type));
+	/*
+	 * Decide on the mask, not on the spec: an all-zero destination
+	 * address is a legitimate value to match as soon as a non-zero mask
+	 * selects it. This is the same test as for the source address below.
+	 */
+	if (!is_zero_ether_addr(&mask->dst)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
+			   &spec->dst.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
+			   &mask->dst.addr_bytes);
+	}
+	if (!is_zero_ether_addr(&mask->src)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
+			   &spec->src.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
+			   &mask->src.addr_bytes);
+	}
+	return 0;
+}
+
+/**
+ * Make as many checks as possible on a VLAN item, and if a flow is provided,
+ * fill it appropriately with VLAN info.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); its vlan flag is set so that
+ *   following layers know a VLAN item has been seen.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise (partial tpid mask, or a tpid
+ *   other than 802.1Q).
+ */
+static int
+tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_vlan *spec = item->spec;
+	const struct rte_flow_item_vlan *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
+	/* TC does not support tpid masking. Only accept if exact match. */
+	if (mask->tpid && mask->tpid != 0xffff)
+		return -1;
+	/* Double-tagging not supported. */
+	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+		return -1;
+	info->vlan = 1;
+	/* No flow to fill: this was a validation-only pass. */
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
+#define VLAN_PRIO(tci) ((tci) >> 13)
+#define VLAN_ID(tci) ((tci) & 0xfff)
+	if (!spec)
+		return 0;
+	if (spec->tci) {
+		/*
+		 * spec and mask are both in network byte order: apply the
+		 * mask first, then convert the result to host order.
+		 */
+		uint16_t tci = ntohs(spec->tci & mask->tci);
+		uint8_t prio = VLAN_PRIO(tci);
+		/* The VLAN ID is 12 bits wide: it needs a 16-bit variable. */
+		uint16_t vid = VLAN_ID(tci);
+
+		if (prio)
+			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
+		if (vid)
+			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
+	}
+	return 0;
+}
+
+/**
+ * Check an IPv4 item as far as possible and, when a flow is being built,
+ * append the matching IPv4 attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); receives the IP protocol found
+ *   in the spec so that a following UDP/TCP item can be checked against it.
+ *
+ * @return
+ *   0 if checks are alright, -1 if a previous eth_type is not IPv4.
+ */
+static int
+tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *conv = (struct convert_data *)data;
+	const struct rte_flow_item_ipv4 *ip_spec = item->spec;
+	const struct rte_flow_item_ipv4 *ip_mask = item->mask;
+	struct rte_flow *rule = conv->flow;
+	struct nlmsg *nl;
+
+	/* Fall back on the default mask when the item carries none. */
+	if (ip_mask == NULL)
+		ip_mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
+	/* An eth_type recorded earlier must designate IPv4. */
+	if (conv->eth_type != 0 && conv->eth_type != htons(ETH_P_IP))
+		return -1;
+	/* Remember ip_proto for the UDP/TCP item that may come next. */
+	if (ip_spec != NULL)
+		conv->ip_proto = ip_spec->hdr.next_proto_id;
+	/* Validation only: there is no message to fill. */
+	if (rule == NULL)
+		return 0;
+	nl = &rule->msg;
+	if (conv->eth_type == 0)
+		conv->eth_type = htons(ETH_P_IP);
+	/* With a VLAN item, the protocol in tcm_info is left untouched. */
+	if (!conv->vlan)
+		nl->t.tcm_info = TC_H_MAKE(nl->t.tcm_info, htons(ETH_P_IP));
+	if (ip_spec == NULL)
+		return 0;
+	if (ip_spec->hdr.dst_addr != 0) {
+		nlattr_add32(&nl->nh, TCA_FLOWER_KEY_IPV4_DST,
+			     ip_spec->hdr.dst_addr);
+		nlattr_add32(&nl->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
+			     ip_mask->hdr.dst_addr);
+	}
+	if (ip_spec->hdr.src_addr != 0) {
+		nlattr_add32(&nl->nh, TCA_FLOWER_KEY_IPV4_SRC,
+			     ip_spec->hdr.src_addr);
+		nlattr_add32(&nl->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+			     ip_mask->hdr.src_addr);
+	}
+	if (ip_spec->hdr.next_proto_id != 0)
+		nlattr_add8(&nl->nh, TCA_FLOWER_KEY_IP_PROTO,
+			    ip_spec->hdr.next_proto_id);
+	return 0;
+}
+
+/**
+ * Check an IPv6 item as far as possible and, when a flow is being built,
+ * append the matching IPv6 attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data); receives the IP protocol found
+ *   in the spec so that a following UDP/TCP item can be checked against it.
+ *
+ * @return
+ *   0 if checks are alright, -1 if a previous eth_type is not IPv6.
+ */
+static int
+tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *conv = (struct convert_data *)data;
+	const struct rte_flow_item_ipv6 *ip_spec = item->spec;
+	const struct rte_flow_item_ipv6 *ip_mask = item->mask;
+	struct rte_flow *rule = conv->flow;
+	uint8_t unset_addr[16] = { 0 };
+	struct nlmsg *nl;
+
+	/* Fall back on the default mask when the item carries none. */
+	if (ip_mask == NULL)
+		ip_mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
+	/* An eth_type recorded earlier must designate IPv6. */
+	if (conv->eth_type != 0 && conv->eth_type != htons(ETH_P_IPV6))
+		return -1;
+	/* Remember ip_proto for the UDP/TCP item that may come next. */
+	if (ip_spec != NULL)
+		conv->ip_proto = ip_spec->hdr.proto;
+	/* Validation only: there is no message to fill. */
+	if (rule == NULL)
+		return 0;
+	nl = &rule->msg;
+	if (conv->eth_type == 0)
+		conv->eth_type = htons(ETH_P_IPV6);
+	/* With a VLAN item, the protocol in tcm_info is left untouched. */
+	if (!conv->vlan)
+		nl->t.tcm_info = TC_H_MAKE(nl->t.tcm_info, htons(ETH_P_IPV6));
+	if (ip_spec == NULL)
+		return 0;
+	/* Addresses left all-zero in the spec are not matched on. */
+	if (memcmp(ip_spec->hdr.dst_addr, unset_addr, 16) != 0) {
+		nlattr_add(&nl->nh, TCA_FLOWER_KEY_IPV6_DST,
+			   sizeof(ip_spec->hdr.dst_addr),
+			   &ip_spec->hdr.dst_addr);
+		nlattr_add(&nl->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+			   sizeof(ip_mask->hdr.dst_addr),
+			   &ip_mask->hdr.dst_addr);
+	}
+	if (memcmp(ip_spec->hdr.src_addr, unset_addr, 16) != 0) {
+		nlattr_add(&nl->nh, TCA_FLOWER_KEY_IPV6_SRC,
+			   sizeof(ip_spec->hdr.src_addr),
+			   &ip_spec->hdr.src_addr);
+		nlattr_add(&nl->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+			   sizeof(ip_mask->hdr.src_addr),
+			   &ip_mask->hdr.src_addr);
+	}
+	if (ip_spec->hdr.proto != 0)
+		nlattr_add8(&nl->nh, TCA_FLOWER_KEY_IP_PROTO,
+			    ip_spec->hdr.proto);
+	return 0;
+}
+
+/**
+ * Check a UDP item as far as possible and, when a flow is being built,
+ * append the matching UDP attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data) filled by the previous layers.
+ *
+ * @return
+ *   0 if checks are alright, -1 if a previous ip_proto is not UDP.
+ */
+static int
+tap_flow_create_udp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *conv = (struct convert_data *)data;
+	const struct rte_flow_item_udp *udp_spec = item->spec;
+	const struct rte_flow_item_udp *udp_mask = item->mask;
+	struct rte_flow *rule = conv->flow;
+	struct nlmsg *nl;
+	uint16_t port;
+
+	/* Fall back on the default mask when the item carries none. */
+	if (udp_mask == NULL)
+		udp_mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
+	/* An ip_proto recorded earlier must designate UDP. */
+	if (conv->ip_proto != 0 && conv->ip_proto != IPPROTO_UDP)
+		return -1;
+	/* Validation only: there is no message to fill. */
+	if (rule == NULL)
+		return 0;
+	nl = &rule->msg;
+	nlattr_add8(&nl->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
+	if (udp_spec == NULL)
+		return 0;
+	/* A port is only added when the mask keeps every bit set in it. */
+	port = udp_spec->hdr.dst_port;
+	if (port != 0 && (port & udp_mask->hdr.dst_port) == port)
+		nlattr_add16(&nl->nh, TCA_FLOWER_KEY_UDP_DST, port);
+	port = udp_spec->hdr.src_port;
+	if (port != 0 && (port & udp_mask->hdr.src_port) == port)
+		nlattr_add16(&nl->nh, TCA_FLOWER_KEY_UDP_SRC, port);
+	return 0;
+}
+
+/**
+ * Check a TCP item as far as possible and, when a flow is being built,
+ * append the matching TCP attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   Conversion state (struct convert_data) filled by the previous layers.
+ *
+ * @return
+ *   0 if checks are alright, -1 if a previous ip_proto is not TCP.
+ */
+static int
+tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *conv = (struct convert_data *)data;
+	const struct rte_flow_item_tcp *tcp_spec = item->spec;
+	const struct rte_flow_item_tcp *tcp_mask = item->mask;
+	struct rte_flow *rule = conv->flow;
+	struct nlmsg *nl;
+	uint16_t port;
+
+	/* Fall back on the default mask when the item carries none. */
+	if (tcp_mask == NULL)
+		tcp_mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
+	/* An ip_proto recorded earlier must designate TCP. */
+	if (conv->ip_proto != 0 && conv->ip_proto != IPPROTO_TCP)
+		return -1;
+	/* Validation only: there is no message to fill. */
+	if (rule == NULL)
+		return 0;
+	nl = &rule->msg;
+	nlattr_add8(&nl->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
+	if (tcp_spec == NULL)
+		return 0;
+	/* A port is only added when the mask keeps every bit set in it. */
+	port = tcp_spec->hdr.dst_port;
+	if (port != 0 && (port & tcp_mask->hdr.dst_port) == port)
+		nlattr_add16(&nl->nh, TCA_FLOWER_KEY_TCP_DST, port);
+	port = tcp_spec->hdr.src_port;
+	if (port != 0 && (port & tcp_mask->hdr.src_port) == port)
+		nlattr_add16(&nl->nh, TCA_FLOWER_KEY_TCP_SRC, port);
+	return 0;
+}
+
+/**
+ * Tell whether every bit set in a byte string is also set in a reference
+ * bit-mask of the same size.
+ *
+ * @param[in] bytes
+ *   Byte string to check.
+ * @param[in] allowed
+ *   Reference bit-mask.
+ * @param size
+ *   Size in bytes of both buffers.
+ *
+ * @return
+ *   1 if no bit of \bytes falls outside \allowed, 0 otherwise.
+ */
+static int
+tap_flow_bits_within(const uint8_t *bytes, const uint8_t *allowed,
+		     unsigned int size)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; ++i)
+		if ((bytes[i] | allowed[i]) != allowed[i])
+			return 0;
+	return 1;
+}
+
+/**
+ * Check support for a given item.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param size
+ *   Bit-Mask size in bytes.
+ * @param[in] supported_mask
+ *   Bit-mask covering supported fields to compare with spec, last and mask in
+ *   \item.
+ * @param[in] default_mask
+ *   Bit-mask default mask if none is provided in \item.
+ *
+ * @return
+ *   0 on success, -1 if a field falls outside the supported mask, or the
+ *   non-zero result of memcmp() when the masked spec and last differ.
+ */
+static int
+tap_flow_item_validate(const struct rte_flow_item *item,
+		       unsigned int size,
+		       const uint8_t *supported_mask,
+		       const uint8_t *default_mask)
+{
+	const uint8_t *spec = item->spec;
+	const uint8_t *last = item->last;
+	const uint8_t *mask = item->mask;
+
+	/* An empty layer is allowed, as long as all fields are NULL */
+	if (!spec && (mask || last))
+		return -1;
+	if (!mask) {
+		/*
+		 * Without an explicit mask, spec, the default mask and last
+		 * must each stay within what is supported.
+		 */
+		if (spec &&
+		    (!tap_flow_bits_within(spec, supported_mask, size) ||
+		     !tap_flow_bits_within(default_mask, supported_mask,
+					   size)))
+			return -1;
+		if (last && !tap_flow_bits_within(last, supported_mask, size))
+			return -1;
+	} else if (!tap_flow_bits_within(mask, supported_mask, size)) {
+		/* The explicit mask itself must be supported. */
+		return -1;
+	}
+	/*
+	 * TC does not support ranges: once masked, spec and last must be
+	 * identical.
+	 */
+	if (spec && last) {
+		const uint8_t *apply = mask ? mask : default_mask;
+		uint8_t masked_spec[size];
+		uint8_t masked_last[size];
+		unsigned int i;
+
+		for (i = 0; i < size; ++i) {
+			masked_spec[i] = spec[i] & apply[i];
+			masked_last[i] = last[i] & apply[i];
+		}
+		return memcmp(masked_spec, masked_last, size);
+	}
+	return 0;
+}
+
+/**
+ * Append a "gact" TC action (used for DROP/PASSTHRU) to the netlink message
+ * of the provided flow.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] action
+ *   Appropriate action to be set in the TCA_GACT_PARMS structure.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_gact(struct rte_flow *flow, int action)
+{
+	const size_t first_act = 1; /* index of the single action */
+	struct tc_gact gact_parms = {
+		.action = action
+	};
+	struct nlmsg *nl = &flow->msg;
+
+	if (nlattr_nested_start(nl, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nl, first_act) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
+	if (nlattr_nested_start(nl, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_GACT_PARMS, sizeof(gact_parms), &gact_parms);
+	/* Close the three nested attributes, innermost first. */
+	nlattr_nested_finish(nl); /* TCA_ACT_OPTIONS */
+	nlattr_nested_finish(nl); /* action index */
+	nlattr_nested_finish(nl); /* TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Append a "skbedit" TC action (used for QUEUE) to the netlink message of
+ * the provided flow.
+ *
+ * @param[in, out] flow
+ *   Flow to be filled.
+ * @param[in] queue
+ *   Queue id to use.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+{
+	const size_t first_act = 1; /* index of the single action */
+	struct tc_skbedit skbedit_parms = {
+		.action = TC_ACT_PIPE
+	};
+	struct nlmsg *nl = &flow->msg;
+
+	if (nlattr_nested_start(nl, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nl, first_act) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
+	if (nlattr_nested_start(nl, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_SKBEDIT_PARMS, sizeof(skbedit_parms),
+		   &skbedit_parms);
+	nlattr_add16(&nl->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
+	/* Close the three nested attributes, innermost first. */
+	nlattr_nested_finish(nl); /* TCA_ACT_OPTIONS */
+	nlattr_nested_finish(nl); /* action index */
+	nlattr_nested_finish(nl); /* TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Validate a flow supported by TC.
+ * If flow param is not NULL, then also fill the netlink message inside.
+ *
+ * Pattern items are checked against the tap_flow_items graph: each item must
+ * be listed as a valid successor of the previous one. At most one non-VOID
+ * action (DROP, PASSTHRU or QUEUE) is accepted.
+ *
+ * @param pmd
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] flow
+ *   Flow structure to update.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_process(struct pmd_internals *pmd,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow *flow)
+{
+	const struct tap_flow_items *cur_item = tap_flow_items;
+	struct convert_data data = {
+		.eth_type = 0,
+		.ip_proto = 0,
+		.flow = flow,
+	};
+	int action = 0; /* Only one action authorized for now */
+
+	if (attr->group > MAX_GROUP) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			NULL, "group value too big: cannot exceed 15");
+		return -rte_errno;
+	}
+	if (attr->priority > MAX_PRIORITY) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			NULL, "priority value too big");
+		return -rte_errno;
+	} else if (flow) {
+		uint16_t group = attr->group << GROUP_SHIFT;
+		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
+		/*
+		 * prio would be promoted to a signed int before the shift;
+		 * for group >= 8 its top bit is set and shifting it into the
+		 * sign bit is undefined behaviour. Shift as unsigned 32-bit.
+		 */
+		flow->msg.t.tcm_info = TC_H_MAKE((uint32_t)prio << 16,
+						 flow->msg.t.tcm_info);
+	}
+	if (!attr->ingress) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR,
+				   NULL, "direction should be ingress");
+		return -rte_errno;
+	}
+	if (flow) {
+		/* rte_flow ingress is actually egress as seen in the kernel */
+		flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+		/* use flower filter type */
+		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
+		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
+			goto exit_item_not_supported;
+	}
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		const struct tap_flow_items *token = NULL;
+		unsigned int i;
+		int err = 0;
+
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		/* the item must be an allowed successor of the previous one */
+		for (i = 0;
+		     cur_item->items &&
+		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
+		     ++i) {
+			if (cur_item->items[i] == items->type) {
+				token = &tap_flow_items[items->type];
+				break;
+			}
+		}
+		if (!token)
+			goto exit_item_not_supported;
+		cur_item = token;
+		err = tap_flow_item_validate(
+			items, cur_item->mask_sz,
+			(const uint8_t *)cur_item->mask,
+			(const uint8_t *)cur_item->default_mask);
+		if (err)
+			goto exit_item_not_supported;
+		/* only convert to netlink attributes when creating a flow */
+		if (flow && cur_item->convert) {
+			if (!pmd->flower_vlan_support &&
+			    cur_item->convert == tap_flow_create_vlan)
+				goto exit_item_not_supported;
+			err = cur_item->convert(items, &data);
+			if (err)
+				goto exit_item_not_supported;
+		}
+	}
+	if (flow) {
+		if (pmd->flower_vlan_support && data.vlan) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     htons(ETH_P_8021Q));
+			nlattr_add16(&flow->msg.nh,
+				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+				     data.eth_type ?
+				     data.eth_type : htons(ETH_P_ALL));
+		} else if (data.eth_type) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     data.eth_type);
+		}
+	}
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
+		int err = 0;
+
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
+			continue;
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_SHOT);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_UNSPEC);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
+			const struct rte_flow_action_queue *queue =
+				(const struct rte_flow_action_queue *)
+				actions->conf;
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (!queue || (queue->index >= pmd->nb_queues))
+				goto exit_action_not_supported;
+			if (flow)
+				err = add_action_skbedit(flow, queue->index);
+		} else {
+			goto exit_action_not_supported;
+		}
+		if (err)
+			goto exit_action_not_supported;
+	}
+	if (flow)
+		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
+	return 0;
+exit_item_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+			   items, "item not supported");
+	return -rte_errno;
+exit_action_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "action not supported");
+	return -rte_errno;
+}
+
+
+
 /**
  * Validate a flow.
  *
@@ -74,15 +877,46 @@ static const struct rte_flow_ops tap_flow_ops = {
  * @see rte_flow_ops
  */
 static int
-tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
-		  const struct rte_flow_attr *attr __rte_unused,
-		  const struct rte_flow_item items[] __rte_unused,
-		  const struct rte_flow_action actions[] __rte_unused,
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
 		  struct rte_flow_error *error)
 {
-	return -rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "not implemented yet");
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	return priv_flow_process(pmd, attr, items, actions, error, NULL);
+}
+
+/**
+ * Give a flow a handle that identifies its TC rule.
+ *
+ * The kernel accepts several TC rules with the same priority, provided they
+ * match on the same fields (e.g.: dst mac and ipv4) with different values
+ * (and a full mask, so that no collision is possible). The handle (uint32_t)
+ * is then what tells those rules apart.
+ *
+ * When pointers fit in 32 bits, the flow's address is used directly as the
+ * handle. Otherwise the pointer value is hashed with rte_jhash() to get a
+ * (sufficiently) unique 32-bit handle.
+ *
+ * @param[in, out] flow
+ *   The flow that needs its handle set.
+ */
+static void
+tap_flow_set_handle(struct rte_flow *flow)
+{
+	uint32_t id;
+
+	if (sizeof(flow) <= 4)
+		id = (uintptr_t)flow;
+	else
+		id = rte_jhash(&flow, sizeof(flow), 1);
+	/* a handle of 0 would let the kernel choose one for us: use 1 */
+	flow->msg.t.tcm_handle = id ? id : 1;
 }
 
 /**
@@ -100,17 +934,46 @@ tap_flow_create(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rte_flow *flow = NULL;
+	struct nlmsg *msg = NULL;
+	int err;
 
-	if (tap_flow_validate(dev, attr, items, actions, error))
-		return NULL;
+	if (!pmd->if_index) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL,
+				   "can't create rule, ifindex not found");
+		goto fail;
+	}
 	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "cannot allocate memory for rte_flow");
-		return NULL;
+		goto fail;
+	}
+	msg = &flow->msg;
+	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+	tap_flow_set_handle(flow);
+	if (priv_flow_process(pmd, attr, items, actions, error, flow))
+		goto fail;
+	err = nl_send(pmd->nlsk_fd, &msg->nh);
+	if (err < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto fail;
+	}
+	err = nl_recv_ack(pmd->nlsk_fd);
+	if (err < 0) {
+		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "overlapping rules");
+		goto fail;
 	}
 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
 	return flow;
+fail:
+	if (flow)
+		rte_free(flow);
+	return NULL;
 }
 
 /**
@@ -120,13 +983,31 @@ tap_flow_create(struct rte_eth_dev *dev,
  * @see rte_flow_ops
  */
 static int
-tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+tap_flow_destroy(struct rte_eth_dev *dev,
 		 struct rte_flow *flow,
-		 struct rte_flow_error *error __rte_unused)
+		 struct rte_flow_error *error)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+	int ret = 0;
+
 	LIST_REMOVE(flow, next);
+	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
+
+	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
+	if (ret < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto end;
+	}
+	ret = nl_recv_ack(pmd->nlsk_fd);
+	if (ret < 0)
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"couldn't receive kernel ack to our request");
+end:
 	rte_free(flow);
-	return 0;
+	return ret;
 }
 
 /**
@@ -170,6 +1051,10 @@ tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 		    enum rte_filter_op filter_op,
 		    void *arg)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	if (!pmd->flower_support)
+		return -ENOTSUP;
 	switch (filter_type) {
 	case RTE_ETH_FILTER_GENERIC:
 		if (filter_op != RTE_ETH_FILTER_GET)
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index 377a9f7b758a..a05e945df523 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -37,6 +37,18 @@
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 
+/**
+ * In TC, priority 0 means we require the kernel to allocate one for us.
+ * In rte_flow, however, we want the priority 0 to be the most important one.
+ * Use an offset so that the most important rte_flow priority is 1 in TC.
+ *
+ * The rte_flow group and priority are packed into one 16-bit TC priority:
+ * the group is shifted by GROUP_SHIFT into the upper 4 bits and the offset
+ * priority takes the lower 12 bits, hence the MAX_GROUP and MAX_PRIORITY
+ * limits below.
+ */
+#define PRIORITY_OFFSET 1
+#define PRIORITY_MASK (0xfff)
+#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
+#define GROUP_MASK (0xf)
+#define GROUP_SHIFT 12
+#define MAX_GROUP GROUP_MASK
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v6 4/4] net/tap: add basic flow API patterns and actions
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
@ 2017-03-22 13:56       ` Ferruh Yigit
  0 siblings, 0 replies; 57+ messages in thread
From: Ferruh Yigit @ 2017-03-22 13:56 UTC (permalink / raw)
  To: Pascal Mazon, keith.wiles; +Cc: dev

On 3/22/2017 9:48 AM, Pascal Mazon wrote:
> Supported flow rules are now mapped to TC rules on the tap netdevice.
> The netlink message used for creating the TC rule is stored in struct
> rte_flow. That way, by simply changing a metadata in it, we can require
> for the rule deletion without further parsing.
> 
> Supported items:
> - eth: src and dst (with variable masks), and eth_type (0xffff mask).
> - vlan: vid, pcp, tpid, but not eid.
> - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
> - udp/tcp: src and dst port (0xffff) mask.
> 
> Supported actions:
> - DROP
> - QUEUE
> - PASSTHRU
> 
> It is generally not possible to provide a "last" item. However, if the
> "last" item, once masked, is identical to the masked spec, then it is
> supported.
> 
> Only IPv4/6 and MAC addresses can use a variable mask. All other
> items need a full mask (exact match).
> 
> Support for VLAN requires kernel headers >= 4.9, checked using
> auto-config.sh.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> Acked-by: Olga Shern <olgas@mellanox.com>

<...>

> diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
> index c4f207be3b47..cdb528b5eae4 100644
> --- a/doc/guides/nics/tap.rst
> +++ b/doc/guides/nics/tap.rst
> @@ -82,6 +82,29 @@ can utilize that stack to handle the network protocols. Plus you would be able
>  to address the interface using an IP address assigned to the internal
>  interface.
>  
> +Flow API support
> +----------------
> +
> +The tap PMD supports major flow API pattern items and actions, when running on
> +linux kernels above 4.2 ("Flower" classifier required). Supported items:
> +
> +- eth: src and dst (with variable masks), and eth_type (0xffff mask).
> +- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
> +- ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
> +- udp/tcp: src and dst port (0xffff) mask.
> +
> +Supported actions:
> +
> +- DROP
> +- QUEUE
> +- PASSTHRU
> +
> +It is generally not possible to provide a "last" item. However, if the "last"
> +item, once masked, is identical to the masked spec, then it is supported.
> +
> +Only IPv4/6 and MAC addresses can use a variable mask. All other items need a
> +full mask (exact match).
> +
>  Example
>  -------

Hi Pascal,

I believe this is a good feature that deserves more explanation, would
you mind adding more documentation, more use cases and a few testpmd
usage samples?

btw, you can keep Ack from Keith for next version.

Thanks,
ferruh

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] net/tap: support flow API
  2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
                       ` (3 preceding siblings ...)
  2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
@ 2017-03-22 14:22     ` Wiles, Keith
  2017-03-23  8:33     ` [dpdk-dev] [PATCH v7 " Pascal Mazon
  5 siblings, 0 replies; 57+ messages in thread
From: Wiles, Keith @ 2017-03-22 14:22 UTC (permalink / raw)
  To: Pascal Mazon; +Cc: dev


> On Mar 22, 2017, at 4:48 AM, Pascal Mazon <pascal.mazon@6wind.com> wrote:
> 
> This series add support for the flow API in tap PMD.
> 
> It enables filtering specific packets incoming on the tap netdevice, to
> process only desired ones. Under the hood, it uses kernel TC (traffic
> control), which takes place very early in the stack, and supports most
> common pattern items and actions defined in the flow API.
> 
> v6 changes:
>  - fix compilation issue on i686 (wrong cast for rte flow handle)
> 
> v5 changes:
>  - rebase after adrien's patches on Tx poll and Rx signaling
>  - better spaces for comments in rte_eth_tap.h
> 
> v4 changes:
>  - rebase on top of "net/tap: add additional management ops" series
>  - fix a few netlink doxygen comments
>  - rename tap.h -> rte_eth_tap.h
>  - flush flow rules only when applicable
> 
> v3 changes:
>  - vlan patterns enabled depending on running kernel (4.9+)
>  - update doc/guides/nics/tap.rst for Flow API support
>  - rebase on top of "net/tap: add additional management ops" series
> 
> v2 changes:
>  - support compilation on kernels < 4.2 (where flower support appeared)
>  - set whitespaces in tap.h
>  - remove unnecessary goto
> 
> Pascal Mazon (4):
>  net/tap: move private elements to external header
>  net/tap: add preliminary support for rte_flow
>  net/tap: add netlink back-end for flow API
>  net/tap: add basic flow API patterns and actions
> 
> doc/guides/nics/features/tap.ini |    1 +
> doc/guides/nics/tap.rst          |   23 +
> drivers/net/tap/Makefile         |   44 ++
> drivers/net/tap/rte_eth_tap.c    |  101 ++--
> drivers/net/tap/rte_eth_tap.h    |   80 +++
> drivers/net/tap/tap_flow.c       | 1070 ++++++++++++++++++++++++++++++++++++++
> drivers/net/tap/tap_flow.h       |   58 +++
> drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
> drivers/net/tap/tap_netlink.h    |   69 +++
> drivers/net/tap/tap_tcmsgs.c     |  378 ++++++++++++++
> drivers/net/tap/tap_tcmsgs.h     |   63 +++
> 11 files changed, 2219 insertions(+), 35 deletions(-)
> create mode 100644 drivers/net/tap/rte_eth_tap.h
> create mode 100644 drivers/net/tap/tap_flow.c
> create mode 100644 drivers/net/tap/tap_flow.h
> create mode 100644 drivers/net/tap/tap_netlink.c
> create mode 100644 drivers/net/tap/tap_netlink.h
> create mode 100644 drivers/net/tap/tap_tcmsgs.c
> create mode 100644 drivers/net/tap/tap_tcmsgs.h
> 
> -- 
> 2.12.0.306.g4a9b9b3
> 

Acked-by: Keith Wiles <keith.wiles@intel.com>

Did not add my ack to the list, but that is OK I guess.

I still think all headers in DPDK should have the C++ ifdefs, as it does not hurt in any way and just makes sure someone does not include it directly.

Regards,
Keith

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v7 0/4] net/tap: support flow API
  2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
                       ` (4 preceding siblings ...)
  2017-03-22 14:22     ` [dpdk-dev] [PATCH v6 0/4] net/tap: support flow API Wiles, Keith
@ 2017-03-23  8:33     ` Pascal Mazon
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 1/4] net/tap: move private elements to external header Pascal Mazon
                         ` (4 more replies)
  5 siblings, 5 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-23  8:33 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

This series adds support for the flow API in tap PMD.

It enables filtering specific packets incoming on the tap netdevice, to
process only desired ones. Under the hood, it uses kernel TC (traffic
control), which takes place very early in the stack, and supports most
common pattern items and actions defined in the flow API.

v7 changes:
   - provide more details in doc/guides/nics/tap.rst

v6 changes:
  - fix compilation issue on i686 (wrong cast for rte flow handle)

v5 changes:
  - rebase after adrien's patches on Tx poll and Rx signaling
  - better spaces for comments in rte_eth_tap.h

v4 changes:
  - rebase on top of "net/tap: add additional management ops" series
  - fix a few netlink doxygen comments
  - rename tap.h -> rte_eth_tap.h
  - flush flow rules only when applicable

v3 changes:
  - vlan patterns enabled depending on running kernel (4.9+)
  - update doc/guides/nics/tap.rst for Flow API support
  - rebase on top of "net/tap: add additional management ops" series

v2 changes:
  - support compilation on kernels < 4.2 (where flower support appeared)
  - set whitespaces in tap.h
  - remove unnecessary goto

Pascal Mazon (4):
  net/tap: move private elements to external header
  net/tap: add preliminary support for rte_flow
  net/tap: add netlink back-end for flow API
  net/tap: add basic flow API patterns and actions

 doc/guides/nics/features/tap.ini |    1 +
 doc/guides/nics/tap.rst          |   45 ++
 drivers/net/tap/Makefile         |   44 ++
 drivers/net/tap/rte_eth_tap.c    |  101 ++--
 drivers/net/tap/rte_eth_tap.h    |   80 +++
 drivers/net/tap/tap_flow.c       | 1070 ++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |   58 +++
 drivers/net/tap/tap_netlink.c    |  367 +++++++++++++
 drivers/net/tap/tap_netlink.h    |   69 +++
 drivers/net/tap/tap_tcmsgs.c     |  378 ++++++++++++++
 drivers/net/tap/tap_tcmsgs.h     |   63 +++
 11 files changed, 2241 insertions(+), 35 deletions(-)
 create mode 100644 drivers/net/tap/rte_eth_tap.h
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v7 1/4] net/tap: move private elements to external header
  2017-03-23  8:33     ` [dpdk-dev] [PATCH v7 " Pascal Mazon
@ 2017-03-23  8:33       ` Pascal Mazon
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
                         ` (3 subsequent siblings)
  4 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-23  8:33 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

In the next patch, access to struct pmd_internals will be necessary in
tap_flow.c to store the flows.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
Acked-by: Keith Wiles <keith.wiles@intel.com>
---
 drivers/net/tap/Makefile      |  1 +
 drivers/net/tap/rte_eth_tap.c | 36 ++-------------------
 drivers/net/tap/rte_eth_tap.h | 75 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 34 deletions(-)
 create mode 100644 drivers/net/tap/rte_eth_tap.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ddf87232d335..fa4658bd1e75 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -40,6 +40,7 @@ EXPORT_MAP := rte_pmd_tap_version.map
 LIBABIVER := 1
 
 CFLAGS += -O3
+CFLAGS += -I$(SRCDIR)
 CFLAGS += $(WERROR_FLAGS)
 
 #
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index f8d9cc7dc3b2..6bb63e5ec873 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -55,6 +55,8 @@
 #include <linux/if_ether.h>
 #include <fcntl.h>
 
+#include <rte_eth_tap.h>
+
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
 #define DEFAULT_TAP_NAME        "dtap"
@@ -87,40 +89,6 @@ static struct rte_eth_link pmd_link = {
 	.link_autoneg = ETH_LINK_SPEED_AUTONEG
 };
 
-struct pkt_stats {
-	uint64_t opackets;		/* Number of output packets */
-	uint64_t ipackets;		/* Number of input packets */
-	uint64_t obytes;		/* Number of bytes on output */
-	uint64_t ibytes;		/* Number of bytes on input */
-	uint64_t errs;			/* Number of error packets */
-};
-
-struct rx_queue {
-	struct rte_mempool *mp;		/* Mempool for RX packets */
-	uint32_t trigger_seen;		/* Last seen Rx trigger value */
-	uint16_t in_port;		/* Port ID */
-	int fd;
-
-	struct pkt_stats stats;		/* Stats for this RX queue */
-};
-
-struct tx_queue {
-	int fd;
-	struct pkt_stats stats;		/* Stats for this TX queue */
-};
-
-struct pmd_internals {
-	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
-	uint16_t nb_queues;		/* Number of queues supported */
-	struct ether_addr eth_addr;	/* Mac address of the device port */
-
-	int if_index;			/* IF_INDEX for the port */
-	int ioctl_sock;			/* socket for ioctl calls */
-
-	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
-	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
-};
-
 static void
 tap_trigger_cb(int sig __rte_unused)
 {
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
new file mode 100644
index 000000000000..aafdef1faa99
--- /dev/null
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -0,0 +1,75 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ETH_TAP_H_
+#define _RTE_ETH_TAP_H_
+
+#include <inttypes.h>
+
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+
+#define RTE_PMD_TAP_MAX_QUEUES 16
+
+struct pkt_stats {                      /* Counters kept for one queue */
+	uint64_t opackets;              /* Number of output packets */
+	uint64_t ipackets;              /* Number of input packets */
+	uint64_t obytes;                /* Number of bytes on output */
+	uint64_t ibytes;                /* Number of bytes on input */
+	uint64_t errs;                  /* Number of TX error packets */
+};
+
+struct rx_queue {
+	struct rte_mempool *mp;         /* Mempool for RX packets */
+	uint32_t trigger_seen;          /* Last seen Rx trigger value */
+	uint16_t in_port;               /* Port ID */
+	int fd;                         /* Queue fd, -1 if not open */
+	struct pkt_stats stats;         /* Stats for this RX queue */
+};
+
+struct tx_queue {
+	int fd;                         /* Queue fd, -1 if not open */
+	struct pkt_stats stats;         /* Stats for this TX queue */
+};
+
+struct pmd_internals {                    /* Per-port private data (dev_private) */
+	char name[RTE_ETH_NAME_MAX_LEN];  /* Internal Tap device name */
+	uint16_t nb_queues;               /* Number of queues supported */
+	struct ether_addr eth_addr;       /* Mac address of the device port */
+	int if_index;                     /* IF_INDEX for the port */
+	int ioctl_sock;                   /* socket for ioctl calls */
+	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
+	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
+};
+
+#endif /* _RTE_ETH_TAP_H_ */
-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v7 2/4] net/tap: add preliminary support for rte_flow
  2017-03-23  8:33     ` [dpdk-dev] [PATCH v7 " Pascal Mazon
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 1/4] net/tap: move private elements to external header Pascal Mazon
@ 2017-03-23  8:33       ` Pascal Mazon
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
                         ` (2 subsequent siblings)
  4 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-23  8:33 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

The flow API provides the ability to classify packets received by a tap
netdevice.

This patch only implements skeleton functions for flow API support, no
patterns are supported yet.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
Acked-by: Keith Wiles <keith.wiles@intel.com>
---
 doc/guides/nics/features/tap.ini |   1 +
 drivers/net/tap/Makefile         |   1 +
 drivers/net/tap/rte_eth_tap.c    |   6 ++
 drivers/net/tap/rte_eth_tap.h    |   2 +
 drivers/net/tap/tap_flow.c       | 185 +++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_flow.h       |  46 ++++++++++
 6 files changed, 241 insertions(+)
 create mode 100644 drivers/net/tap/tap_flow.c
 create mode 100644 drivers/net/tap/tap_flow.h

diff --git a/doc/guides/nics/features/tap.ini b/doc/guides/nics/features/tap.ini
index a51712dce066..9d73f61cca3b 100644
--- a/doc/guides/nics/features/tap.ini
+++ b/doc/guides/nics/features/tap.ini
@@ -9,6 +9,7 @@ Jumbo frame          = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Basic stats          = Y
+Flow API             = Y
 MTU update           = Y
 Multicast MAC filter = Y
 Speed capabilities   = Y
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index fa4658bd1e75..45c67de8e970 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -47,6 +47,7 @@ CFLAGS += $(WERROR_FLAGS)
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 6bb63e5ec873..9127c739a214 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -56,6 +56,7 @@
 #include <fcntl.h>
 
 #include <rte_eth_tap.h>
+#include <tap_flow.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -482,6 +483,7 @@ tap_dev_close(struct rte_eth_dev *dev __rte_unused)
 	struct pmd_internals *internals = dev->data->dev_private;
 
 	tap_link_set_down(dev);
+	tap_flow_flush(dev, NULL);
 
 	for (i = 0; i < internals->nb_queues; i++) {
 		if (internals->rxq[i].fd != -1)
@@ -806,6 +808,7 @@ static const struct eth_dev_ops ops = {
 	.stats_get              = tap_stats_get,
 	.stats_reset            = tap_stats_reset,
 	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
+	.filter_ctrl            = tap_dev_filter_ctrl,
 };
 
 static int
@@ -877,6 +880,8 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	LIST_INIT(&pmd->flows);
+
 	return 0;
 
 error_exit:
@@ -990,6 +995,7 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	internals = eth_dev->data->dev_private;
+	tap_flow_flush(eth_dev, NULL);
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index aafdef1faa99..bf8226736627 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -34,6 +34,7 @@
 #ifndef _RTE_ETH_TAP_H_
 #define _RTE_ETH_TAP_H_
 
+#include <sys/queue.h>
 #include <inttypes.h>
 
 #include <rte_ethdev.h>
@@ -68,6 +69,7 @@ struct pmd_internals {
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
 	int ioctl_sock;                   /* socket for ioctl calls */
+	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
 };
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
new file mode 100644
index 000000000000..c32ed382d745
--- /dev/null
+++ b/drivers/net/tap/tap_flow.c
@@ -0,0 +1,185 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+
+#include <rte_malloc.h>
+#include <rte_eth_tap.h>
+#include <tap_flow.h>
+
+struct rte_flow {
+	LIST_ENTRY(rte_flow) next; /* Linkage in the per-port pmd->flows list */
+};
+
+static int
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error);
+
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error);
+
+static int
+tap_flow_destroy(struct rte_eth_dev *dev,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error);
+
+static const struct rte_flow_ops tap_flow_ops = {
+	.validate = tap_flow_validate,
+	.create = tap_flow_create,
+	.destroy = tap_flow_destroy,
+	.flush = tap_flow_flush, /* non-static, declared in tap_flow.h */
+};
+
+/**
+ * Validate a flow.
+ *
+ * Placeholder implementation: none of the arguments are inspected and every
+ * rule is rejected, with ENOTSUP and the message "not implemented yet"
+ * recorded through rte_flow_error_set().
+ *
+ * @return
+ *   The negated return value of rte_flow_error_set().
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
+		  const struct rte_flow_attr *attr __rte_unused,
+		  const struct rte_flow_item items[] __rte_unused,
+		  const struct rte_flow_action actions[] __rte_unused,
+		  struct rte_flow_error *error)
+{
+	return -rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL, "not implemented yet");
+}
+
+/**
+ * Create a flow.
+ *
+ * The rule is first passed to tap_flow_validate(); only if that succeeds is a
+ * new rte_flow handle allocated and linked at the head of the port flow list.
+ *
+ * @return
+ *   The new flow handle, or NULL if validation or memory allocation failed.
+ *   On allocation failure, ENOMEM is recorded in the error structure.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+static struct rte_flow *
+tap_flow_create(struct rte_eth_dev *dev,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct rte_flow *new_flow;
+
+	if (tap_flow_validate(dev, attr, items, actions, error) != 0)
+		return NULL;
+	new_flow = rte_malloc(__func__, sizeof(*new_flow), 0);
+	if (new_flow != NULL) {
+		LIST_INSERT_HEAD(&internals->flows, new_flow, next);
+		return new_flow;
+	}
+	rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+			   NULL, "cannot allocate memory for rte_flow");
+	return NULL;
+}
+
+/**
+ * Destroy a flow.
+ *
+ * Unlinks the flow from the list it belongs to and releases its memory.
+ * The device and error arguments are not used. Always returns 0.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+static int
+tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+		 struct rte_flow *flow,
+		 struct rte_flow_error *error __rte_unused)
+{
+	LIST_REMOVE(flow, next);
+	rte_free(flow);
+	return 0;
+}
+
+/**
+ * Destroy all flows.
+ *
+ * Repeatedly destroys the first entry of the port flow list until the list is
+ * empty.
+ *
+ * @return
+ *   0 once the list is empty, -1 as soon as destroying a flow fails.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct rte_flow *head;
+
+	for (head = LIST_FIRST(&internals->flows); head != NULL;
+	     head = LIST_FIRST(&internals->flows))
+		if (tap_flow_destroy(dev, head, error) < 0)
+			return -1;
+	return 0;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * Only the generic filter type is handled, and only for the GET operation:
+ * in that case the address of the tap rte_flow operations table is stored
+ * in the location pointed to by arg.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure. For RTE_ETH_FILTER_GENERIC it
+ *   is written through as a (const void **).
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+		    enum rte_filter_type filter_type,
+		    enum rte_filter_op filter_op,
+		    void *arg)
+{
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			return -EINVAL;
+		*(const void **)arg = &tap_flow_ops;
+		return 0;
+	default:
+		/* Terminate the message so the next log entry starts clean. */
+		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
+			(void *)dev, filter_type);
+	}
+	return -EINVAL;
+}
+
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
new file mode 100644
index 000000000000..377a9f7b758a
--- /dev/null
+++ b/drivers/net/tap/tap_flow.h
@@ -0,0 +1,46 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_FLOW_H_
+#define _TAP_FLOW_H_
+
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+
+int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
+			enum rte_filter_type filter_type,
+			enum rte_filter_op filter_op,
+			void *arg);
+int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
+
+#endif /* _TAP_FLOW_H_ */
-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v7 3/4] net/tap: add netlink back-end for flow API
  2017-03-23  8:33     ` [dpdk-dev] [PATCH v7 " Pascal Mazon
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 1/4] net/tap: move private elements to external header Pascal Mazon
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
@ 2017-03-23  8:33       ` Pascal Mazon
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
  2017-03-23 12:50       ` [dpdk-dev] [PATCH v7 0/4] net/tap: support flow API Ferruh Yigit
  4 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-23  8:33 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

Inside tap_netlink.c, functions are generic for any netlink messaging.
Inside tap_tcmsgs.c, functions are specific to TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
Acked-by: Keith Wiles <keith.wiles@intel.com>
---
 drivers/net/tap/Makefile      |   2 +
 drivers/net/tap/tap_netlink.c | 367 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_netlink.h |  69 ++++++++
 drivers/net/tap/tap_tcmsgs.c  | 378 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_tcmsgs.h  |  63 +++++++
 5 files changed, 879 insertions(+)
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 45c67de8e970..3a33b560d3b5 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/tap_netlink.c b/drivers/net/tap/tap_netlink.c
new file mode 100644
index 000000000000..9710e41a7801
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <tap_netlink.h>
+#include <rte_random.h>
+
+/* Must be quite large to support dumping a huge list of QDISC or filters. */
+#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
+#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */
+#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */
+
+struct nested_tail {
+	struct rtattr *tail; /* Start of the nested attribute being built */
+	struct nested_tail *prev; /* Entry that was on top before this one */
+};
+
+/**
+ * Initialize a netlink socket for communicating with the kernel.
+ *
+ * The socket is created close-on-exec for NETLINK_ROUTE, its send and receive
+ * buffer sizes are set to SNDBUF_SIZE and RCVBUF_SIZE, and it is then bound.
+ * If any step after the socket creation fails, the descriptor is closed
+ * before returning so that it is not leaked.
+ *
+ * @return
+ *   netlink socket file descriptor on success, -1 otherwise.
+ */
+int
+nl_init(void)
+{
+	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
+	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
+
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (fd < 0) {
+		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
+		goto error;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
+		goto error;
+	}
+	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
+		goto error;
+	}
+	return fd;
+
+error:
+	/* The socket is of no use to the caller: release it. */
+	close(fd);
+	return -1;
+}
+
+/**
+ * Release a netlink socket once communication with the kernel is over.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 if the descriptor was closed, -1 (after logging the errno) otherwise.
+ */
+int
+nl_final(int nlsk_fd)
+{
+	if (close(nlsk_fd) == 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Send a message to the kernel on the netlink socket.
+ *
+ * The message pid is forced to 0 and its sequence number is overwritten with
+ * a random value before sending.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] nh
+ *   The netlink message to send to the kernel; nlmsg_len gives its size.
+ *
+ * @return
+ *   the number of sent bytes on success, -1 otherwise.
+ */
+int
+nl_send(int nlsk_fd, struct nlmsghdr *nh)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl kernel_addr = {
+		.nl_family = AF_NETLINK,
+	};
+	struct iovec payload = {
+		.iov_base = nh,
+		.iov_len = nh->nlmsg_len,
+	};
+	struct msghdr request = {
+		.msg_name = &kernel_addr,
+		.msg_namelen = sizeof(kernel_addr),
+		.msg_iov = &payload,
+		.msg_iovlen = 1,
+	};
+	int sent;
+
+	nh->nlmsg_seq = (uint32_t)rte_rand();
+	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+	sent = sendmsg(nlsk_fd, &request, 0);
+	if (sent >= 0)
+		return sent;
+	RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Check that the kernel sends an appropriate ACK in response to an nl_send().
+ * This is nl_recv() called without any callback or callback argument.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+int
+nl_recv_ack(int nlsk_fd)
+{
+	return nl_recv(nlsk_fd, NULL, NULL);
+}
+
+/**
+ * Receive a message from the kernel on the netlink socket, following an
+ * nl_send().
+ *
+ * Reading goes on until a DONE message has been seen whenever any received
+ * message carries the NLM_F_MULTI flag; otherwise a single read is done.
+ * The callback, when provided, is invoked for every message except DONE.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] cb
+ *   The callback function to call for each netlink message received.
+ *   May be NULL.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, -1 if reading failed, if the kernel reported a non-zero
+ *   error, or if the callback returned a negative value.
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int recv_bytes = 0, done = 0, multipart = 0, error = 0;
+
+	do {
+		recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+		if (recv_bytes < 0)
+			return -1;
+		for (nh = (struct nlmsghdr *)buf;
+		     NLMSG_OK(nh, (unsigned int)recv_bytes);
+		     nh = NLMSG_NEXT(nh, recv_bytes)) {
+			/*
+			 * Multi-part messages and their trailing DONE message
+			 * carry NLM_F_MULTI: remember it so that reading
+			 * continues until DONE shows up.
+			 */
+			if (nh->nlmsg_flags & NLM_F_MULTI)
+				multipart = 1;
+			if (nh->nlmsg_type == NLMSG_ERROR) {
+				struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+				if (err_data->error != 0) {
+					RTE_LOG(DEBUG, PMD,
+						"%s() error message recvd\n",
+						__func__);
+					error = 1;
+				} else {
+					RTE_LOG(DEBUG, PMD,
+						"%s() ack message recvd\n",
+						__func__);
+				}
+			}
+			/* DONE ends a multipart message; no callback for it. */
+			if (nh->nlmsg_type == NLMSG_DONE)
+				done = 1;
+			else if (cb && cb(nh, arg) < 0)
+				error = 1;
+		}
+	} while (multipart && !done);
+	return error ? -1 : 0;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * The attribute is written at the aligned end of the message and nlmsg_len
+ * is grown accordingly. No bounds check is done here: the caller must make
+ * sure the buffer holding the message has room for the attribute.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data_len
+ *   The length of the data to append. May be 0 for an attribute without
+ *   payload, as used when opening a nested attribute.
+ * @param[in] data
+ *   The data to append. Not read when data_len is 0, so NULL is allowed then.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta;
+
+	rta = (struct rtattr *)NLMSG_TAIL(nh);
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	/* memcpy() must not be handed a NULL source, even to copy 0 bytes. */
+	if (data_len)
+		memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(uint8_t), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(uint16_t), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message to append the attribute to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The data to append.
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(uint32_t), &data);
+}
+
+/**
+ * Open a nested netlink attribute.
+ * Each call must be matched later by a call to nlattr_nested_finish().
+ *
+ * The position of the new attribute is saved on the message's nested_tails
+ * stack, then an attribute header without payload is appended.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ * @param[in] type
+ *   The nested attribute type to append.
+ *
+ * @return
+ *   -1 if adding a nested netlink attribute failed, 0 otherwise.
+ */
+int
+nlattr_nested_start(struct nlmsg *msg, uint16_t type)
+{
+	struct nested_tail *entry = rte_zmalloc(NULL, sizeof(*entry), 0);
+
+	if (entry == NULL) {
+		RTE_LOG(ERR, PMD,
+			"Couldn't allocate memory for nested netlink"
+			" attribute\n");
+		return -1;
+	}
+	/* Record where the attribute starts before its header is appended. */
+	entry->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
+	entry->prev = msg->nested_tails;
+	msg->nested_tails = entry;
+	nlattr_add(&msg->nh, type, 0, NULL);
+	return 0;
+}
+
+/**
+ * End a nested netlink attribute.
+ * It follows a call to nlattr_nested_start().
+ * In effect, it sets the nested attribute length so that it includes every
+ * byte from the start of the nested attribute up to the current end of the
+ * message, then pops the attribute from the nested_tails stack.
+ *
+ * Calling it while no nested attribute is open is a no-op.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ */
+void
+nlattr_nested_finish(struct nlmsg *msg)
+{
+	struct nested_tail *tail = msg->nested_tails;
+
+	if (!tail)
+		return;
+
+	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;
+
+	/*
+	 * Always pop, even when this was the outermost attribute: otherwise
+	 * nested_tails would keep pointing at the entry freed below.
+	 */
+	msg->nested_tails = tail->prev;
+
+	rte_free(tail);
+}
diff --git a/drivers/net/tap/tap_netlink.h b/drivers/net/tap/tap_netlink.h
new file mode 100644
index 000000000000..52ba8c030dcc
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_NETLINK_H_
+#define _TAP_NETLINK_H_
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <stdio.h>
+
+#include <rte_log.h>
+
+#define NLMSG_BUF 512
+
+struct nlmsg {
+	struct nlmsghdr nh; /* netlink header, holds the total message length */
+	struct tcmsg t; /* TC header, filled in by tc_init_msg() */
+	char buf[NLMSG_BUF]; /* room for attributes added by nlattr_add() */
+	struct nested_tail *nested_tails; /* open nested attributes, last first */
+};
+
+/*
+ * Address just past the (aligned) end of a netlink message, i.e. where the
+ * next attribute is to be written. The whole expansion is parenthesized so
+ * that the cast applies before any operator following the macro call.
+ */
+#define NLMSG_TAIL(nlh) \
+	((void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len)))
+
+int nl_init(void);
+int nl_final(int nlsk_fd);
+int nl_send(int nlsk_fd, struct nlmsghdr *nh);
+int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg);
+int nl_recv_ack(int nlsk_fd);
+void nlattr_add(struct nlmsghdr *nh, unsigned short type,
+		unsigned int data_len, const void *data);
+void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data);
+void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data);
+void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data);
+int nlattr_nested_start(struct nlmsg *msg, uint16_t type);
+void nlattr_nested_finish(struct nlmsg *msg);
+
+#endif /* _TAP_NETLINK_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.c b/drivers/net/tap/tap_tcmsgs.c
new file mode 100644
index 000000000000..9a146d165b08
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.c
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <net/if.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <tap_tcmsgs.h>
+
+struct qdisc {
+	uint32_t handle; /* copied to/from tcmsg.tcm_handle */
+	uint32_t parent; /* copied to/from tcmsg.tcm_parent */
+};
+
+struct list_args {
+	int nlsk_fd; /* netlink socket the iteration runs on */
+	uint16_t ifindex; /* netdevice whose QDISCs are of interest */
+	void *custom_arg; /* callback-specific data, may be NULL */
+};
+
+struct qdisc_custom_arg {
+	uint32_t handle; /* handle of the QDISC looked for */
+	uint32_t parent; /* parent of the QDISC looked for */
+	uint8_t exists; /* incremented by qdisc_exist_cb() on each match */
+};
+
+/**
+ * Set up the netlink and TC headers of a message.
+ *
+ * Only the message length, type and flags, and the TC family and ifindex are
+ * written; every other field of the message is left as it was.
+ *
+ * @param[in, out] msg
+ *   The netlink message to fill.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the rule will be applied.
+ * @param[in] type
+ *   The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.).
+ * @param[in] flags
+ *   Netlink flags for this msg. When 0, NLM_F_REQUEST | NLM_F_ACK is used.
+ */
+void
+tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags)
+{
+	struct nlmsghdr *hdr = &msg->nh;
+
+	hdr->nlmsg_type = type;
+	hdr->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	/* A zero flags argument selects the default request-with-ack flags. */
+	hdr->nlmsg_flags = flags ? flags : (NLM_F_REQUEST | NLM_F_ACK);
+	msg->t.tcm_ifindex = ifindex;
+	msg->t.tcm_family = AF_UNSPEC;
+}
+
+/**
+ * Delete a specific QDISC identified by its iface, and its handle and parent.
+ *
+ * When no netlink socket is provided (nlsk_fd is 0), a temporary one is
+ * opened for the request and closed again before returning, whether the
+ * request succeeded or not.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication, or 0 to use
+ *   a temporary socket.
+ * @param[in] ifindex
+ *   The ifindex of the netdevice on which the deletion will happen.
+ * @param[in] qinfo
+ *   Additional info to identify the QDISC (handle and parent).
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+static int
+qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo)
+{
+	struct nlmsg msg;
+	int fd = nlsk_fd;
+	int ret = -1;
+
+	/* Do not hand uninitialized stack bytes over to the kernel. */
+	memset(&msg, 0, sizeof(msg));
+	tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0);
+	msg.t.tcm_handle = qinfo->handle;
+	msg.t.tcm_parent = qinfo->parent;
+	/* if no netlink socket is provided, create one */
+	if (!nlsk_fd) {
+		fd = nl_init();
+		if (fd < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not delete QDISC: null netlink socket\n");
+			return -1;
+		}
+	}
+	if (nl_send(fd, &msg.nh) >= 0 && nl_recv_ack(fd) >= 0)
+		ret = 0;
+	/* A socket opened here must be closed on every path, errors too. */
+	if (!nlsk_fd && nl_final(fd) < 0)
+		ret = -1;
+	return ret;
+}
+
+/**
+ * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle.
+ *
+ * The request is exclusive (NLM_F_EXCL | NLM_F_CREATE) and targets the root
+ * of the netdevice. The multiq options are sent zeroed.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	struct tc_multiq_qopt opt;
+	struct nlmsg msg;
+
+	/* Do not hand uninitialized stack bytes over to the kernel. */
+	memset(&opt, 0, sizeof(opt));
+	memset(&msg, 0, sizeof(msg));
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	msg.t.tcm_parent = TC_H_ROOT;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq");
+	nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Add the ingress QDISC with default ffff: handle.
+ *
+ * The request is exclusive (NLM_F_EXCL | NLM_F_CREATE).
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the QDISC will be added.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	struct nlmsg msg;
+
+	/* Do not hand uninitialized stack bytes over to the kernel. */
+	memset(&msg, 0, sizeof(msg));
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	msg.t.tcm_parent = TC_H_INGRESS;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("ingress"), "ingress");
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Callback function to check for QDISC existence.
+ * A message counts as a match when its ifindex, handle and parent all equal
+ * the requested ones; "exists" in the custom arg is incremented per match.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in, out] arg
+ *   A struct list_args whose custom_arg is a struct qdisc_custom_arg.
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_exist_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *args = (struct list_args *)arg;
+	struct qdisc_custom_arg *wanted = args->custom_arg;
+	struct tcmsg *t = NLMSG_DATA(nh);
+
+	if (args->ifindex == (unsigned int)t->tcm_ifindex &&
+	    t->tcm_handle == wanted->handle &&
+	    t->tcm_parent == wanted->parent)
+		wanted->exists++;
+	return 0;
+}
+
+/**
+ * Callback function to delete a QDISC.
+ * Messages about other netdevices than the requested one are ignored.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in] arg
+ *   A struct list_args carrying the ifindex of interest.
+ *
+ * @return
+ *   0 when the message is ignored, the result of qdisc_del() otherwise.
+ */
+static int
+qdisc_del_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *args = arg;
+	struct tcmsg *t = NLMSG_DATA(nh);
+	struct qdisc qinfo;
+
+	/* filter out other ifaces' qdiscs */
+	if (args->ifindex != (unsigned int)t->tcm_ifindex)
+		return 0;
+	qinfo.handle = t->tcm_handle;
+	qinfo.parent = t->tcm_parent;
+	/*
+	 * Pass 0 as nlsk_fd so that the deletion goes through a socket of its
+	 * own and does not tamper with the current list iteration.
+	 */
+	return qdisc_del(0, args->ifindex, &qinfo);
+}
+
+/**
+ * Iterate over all QDISC, and call the callback() function for each.
+ *
+ * A RTM_GETQDISC dump request is sent; the callback receives each answer
+ * together with a struct list_args wrapping nlsk_fd, ifindex and arg.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ * @param[in] callback
+ *   The function to call for each QDISC.
+ * @param[in, out] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   -1 if either sending the netlink message failed, or if receiving the answer
+ *   failed, or finally if the callback returned a negative value for that
+ *   answer.
+ *   0 is returned otherwise.
+ */
+static int
+qdisc_iterate(int nlsk_fd, uint16_t ifindex,
+	      int (*callback)(struct nlmsghdr *, void *), void *arg)
+{
+	struct nlmsg msg;
+	struct list_args args = {
+		.nlsk_fd = nlsk_fd,
+		.ifindex = ifindex,
+		.custom_arg = arg,
+	};
+
+	/* Do not hand uninitialized stack bytes over to the kernel. */
+	memset(&msg, 0, sizeof(msg));
+	tc_init_msg(&msg, ifindex, RTM_GETQDISC, NLM_F_REQUEST | NLM_F_DUMP);
+	if (nl_send(nlsk_fd, &msg.nh) < 0)
+		return -1;
+	if (nl_recv(nlsk_fd, callback, &args) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Check whether a given QDISC already exists for the netdevice.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex to check QDISC existence for.
+ * @param[in] handle
+ *   The handle of the QDISC to look for.
+ * @param[in] parent
+ *   The parent of the QDISC to look for.
+ *
+ * @return
+ *   1 if the qdisc exists, 0 otherwise.
+ */
+int
+qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle, uint32_t parent)
+{
+	struct qdisc_custom_arg lookup = {
+		.handle = handle,
+		.parent = parent,
+		.exists = 0,
+	};
+
+	/* The iteration status is not checked, only the match counter is. */
+	qdisc_iterate(nlsk_fd, ifindex, qdisc_exist_cb, &lookup);
+	return lookup.exists ? 1 : 0;
+}
+
+/**
+ * Delete all QDISCs for a given netdevice.
+ * Runs qdisc_iterate() with qdisc_del_cb() as callback.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ *
+ * @return
+ *   -1 if the lookup failed or the deletion callback reported a failure,
+ *   0 otherwise.
+ */
+int
+qdisc_flush(int nlsk_fd, uint16_t ifindex)
+{
+	return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL);
+}
+
+/**
+ * Create the multiqueue QDISC, unless it is already there.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0), TC_H_ROOT))
+		return 0;
+	if (qdisc_add_multiq(nlsk_fd, ifindex) < 0) {
+		RTE_LOG(ERR, PMD, "Could not add multiq qdisc\n");
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * Create the ingress QDISC, unless it is already there.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the ingress QDISC.
+ *
+ * @return
+ *   0 if the qdisc exists or if it has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	if (qdisc_exists(nlsk_fd, ifindex,
+			 TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS))
+		return 0;
+	if (qdisc_add_ingress(nlsk_fd, ifindex) < 0) {
+		RTE_LOG(ERR, PMD, "Could not add ingress qdisc\n");
+		return -1;
+	}
+	return 0;
+}
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
new file mode 100644
index 000000000000..a571a56d6964
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -0,0 +1,63 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle,
+		 uint32_t parent);
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [dpdk-dev] [PATCH v7 4/4] net/tap: add basic flow API patterns and actions
  2017-03-23  8:33     ` [dpdk-dev] [PATCH v7 " Pascal Mazon
                         ` (2 preceding siblings ...)
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
@ 2017-03-23  8:33       ` Pascal Mazon
  2017-03-23 12:50       ` [dpdk-dev] [PATCH v7 0/4] net/tap: support flow API Ferruh Yigit
  4 siblings, 0 replies; 57+ messages in thread
From: Pascal Mazon @ 2017-03-23  8:33 UTC (permalink / raw)
  To: keith.wiles; +Cc: dev, Pascal Mazon

Supported flow rules are now mapped to TC rules on the tap netdevice.
The netlink message used for creating the TC rule is stored in struct
rte_flow. That way, by simply changing a metadata field in it, we can
request the rule's deletion without further parsing.

Supported items:
- eth: src and dst (with variable masks), and eth_type (0xffff mask).
- vlan: vid, pcp, tpid, but not eid.
- ipv4/6: src and dst (with variable masks), and ip_proto (0xff mask).
- udp/tcp: src and dst port (0xffff mask).

Supported actions:
- DROP
- QUEUE
- PASSTHRU

It is generally not possible to provide a "last" item. However, if the
"last" item, once masked, is identical to the masked spec, then it is
supported.

Only IPv4/6 and MAC addresses can use a variable mask. All other
items need a full mask (exact match).

Support for VLAN requires kernel headers >= 4.9, checked using
auto-config.sh.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
Acked-by: Keith Wiles <keith.wiles@intel.com>
---
 doc/guides/nics/tap.rst       |  45 +++
 drivers/net/tap/Makefile      |  40 ++
 drivers/net/tap/rte_eth_tap.c |  61 ++-
 drivers/net/tap/rte_eth_tap.h |   3 +
 drivers/net/tap/tap_flow.c    | 911 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/tap/tap_flow.h    |  12 +
 6 files changed, 1057 insertions(+), 15 deletions(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c4f207be3b47..4986e47e9f57 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -82,6 +82,51 @@ can utilize that stack to handle the network protocols. Plus you would be able
 to address the interface using an IP address assigned to the internal
 interface.
 
+Flow API support
+----------------
+
+The tap PMD supports major flow API pattern items and actions, when running on
+Linux kernels 4.2 and above ("Flower" classifier required). Supported items:
+
+- eth: src and dst (with variable masks), and eth_type (0xffff mask).
+- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- ipv4/6: src and dst (with variable masks), and ip_proto (0xff mask).
+- udp/tcp: src and dst port (0xffff mask).
+
+Supported actions:
+
+- DROP
+- QUEUE
+- PASSTHRU
+
+It is generally not possible to provide a "last" item. However, if the "last"
+item, once masked, is identical to the masked spec, then it is supported.
+
+Only IPv4/6 and MAC addresses can use a variable mask. All other items need a
+full mask (exact match).
+
+As rules are translated to TC, it is possible to show them with something like::
+
+   tc -s filter show dev tap1 parent 1:
+
+Examples of testpmd flow rules
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Drop packets for destination IP 1.1.1.1::
+
+   testpmd> flow create 0 priority 1 ingress pattern eth / ipv4 dst is 1.1.1.1 \
+            / end actions drop / end
+
+Ensure packets from a given MAC address are received on queue 2::
+
+   testpmd> flow create 0 priority 2 ingress pattern eth src is 06:05:04:03:02:01 \
+            / end actions queue index 2 / end
+
+Drop UDP packets in vlan 3::
+
+   testpmd> flow create 0 priority 3 ingress pattern eth / vlan vid is 3 / \
+            ipv4 proto is 17 / end actions drop / end
+
 Example
 -------
 
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 3a33b560d3b5..c42a680555e9 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -41,6 +41,7 @@ LIBABIVER := 1
 
 CFLAGS += -O3
 CFLAGS += -I$(SRCDIR)
+CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 
 #
@@ -58,5 +59,44 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mempool
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_ether
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_kvargs
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_net
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_hash
 
 include $(RTE_SDK)/mk/rte.lib.mk
+
+# Generate and clean-up tap_autoconf.h.
+
+export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS
+export AUTO_CONFIG_CFLAGS = -Wno-error
+
+ifndef V
+AUTOCONF_OUTPUT := >/dev/null
+endif
+
+tap_autoconf.h.new: FORCE
+
+tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
+	$Q $(RM) -f -- '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_FLOWER \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_UNSPEC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_VLAN_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_VLAN_PRIO \
+		$(AUTOCONF_OUTPUT)
+
+# Create tap_autoconf.h or update it in case it differs from the new one.
+
+tap_autoconf.h: tap_autoconf.h.new
+	$Q [ -f '$@' ] && \
+		cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \
+		mv '$<' '$@'
+
+$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h
+
+clean_tap: FORCE
+	$Q rm -f -- tap_autoconf.h tap_autoconf.h.new
+
+clean: clean_tap
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 9127c739a214..c711b36c3222 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -44,19 +44,22 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
+#include <sys/utsname.h>
 #include <sys/mman.h>
 #include <errno.h>
 #include <signal.h>
 #include <stdint.h>
 #include <unistd.h>
 #include <arpa/inet.h>
-#include <linux/if.h>
+#include <net/if.h>
 #include <linux/if_tun.h>
 #include <linux/if_ether.h>
+#include <linux/version.h>
 #include <fcntl.h>
 
 #include <rte_eth_tap.h>
 #include <tap_flow.h>
+#include <tap_tcmsgs.h>
 
 /* Linux based path to the TUN device */
 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
@@ -71,6 +74,9 @@
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
 
+#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
+#define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
+
 static struct rte_vdev_driver pmd_tap_drv;
 
 static const char *valid_arguments[] = {
@@ -209,6 +215,28 @@ tun_alloc(struct pmd_internals *pmd, uint16_t qid)
 			goto error;
 		rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
 			   ETHER_ADDR_LEN);
+
+		pmd->if_index = if_nametoindex(pmd->name);
+		if (!pmd->if_index) {
+			RTE_LOG(ERR, PMD,
+				"Could not find ifindex for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (!pmd->flower_support)
+			return fd;
+		if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd;
+		}
+		if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not create ingress qdisc for %s: rte_flow won't be usable.\n",
+				pmd->name);
+			return fd; /* fd is still returned: only rte_flow is lost */
+		}
 	}
 
 	return fd;
@@ -812,6 +840,24 @@ static const struct eth_dev_ops ops = {
 };
 
 static int
+tap_kernel_support(struct pmd_internals *pmd)
+{
+	struct utsname uts;
+	int major, minor, patch, running;
+
+	/* Without a parsable x.y.z release, both support flags stay as is. */
+	if (uname(&uts) == -1 ||
+	    sscanf(uts.release, "%d.%d.%d", &major, &minor, &patch) != 3)
+		return 0;
+	running = KERNEL_VERSION(major, minor, patch);
+	if (running >= FLOWER_KERNEL_VERSION)
+		pmd->flower_support = 1;
+	if (running >= FLOWER_VLAN_KERNEL_VERSION)
+		pmd->flower_vlan_support = 1;
+	return 1;
+}
+
+static int
 eth_dev_tap_create(const char *name, char *tap_name)
 {
 	int numa_node = rte_socket_id();
@@ -880,7 +926,15 @@ eth_dev_tap_create(const char *name, char *tap_name)
 		pmd->txq[i].fd = -1;
 	}
 
+	tap_kernel_support(pmd);
+	if (!pmd->flower_support)
+		return 0;
 	LIST_INIT(&pmd->flows);
+	/*
+	 * If no netlink socket can be created, then it will fail when
+	 * creating/destroying flow rules.
+	 */
+	pmd->nlsk_fd = nl_init();
 
 	return 0;
 
@@ -995,7 +1049,10 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	internals = eth_dev->data->dev_private;
-	tap_flow_flush(eth_dev, NULL);
+	if (internals->flower_support && internals->nlsk_fd) {
+		tap_flow_flush(eth_dev, NULL);
+		nl_final(internals->nlsk_fd);
+	}
 	for (i = 0; i < internals->nb_queues; i++)
 		if (internals->rxq[i].fd != -1)
 			close(internals->rxq[i].fd);
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index bf8226736627..741ec5350886 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -69,6 +69,9 @@ struct pmd_internals {
 	struct ether_addr eth_addr;       /* Mac address of the device port */
 	int if_index;                     /* IF_INDEX for the port */
 	int ioctl_sock;                   /* socket for ioctl calls */
+	int nlsk_fd;                      /* Netlink socket fd */
+	int flower_support;               /* 1 if kernel supports, else 0 */
+	int flower_vlan_support;          /* 1 if kernel supports, else 0 */
 	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index c32ed382d745..6adacdc22d4a 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -33,14 +33,71 @@
 
 #include <sys/queue.h>
 
+#include <rte_byteorder.h>
+#include <rte_jhash.h>
 #include <rte_malloc.h>
 #include <rte_eth_tap.h>
 #include <tap_flow.h>
+#include <tap_autoconf.h>
+#include <tap_tcmsgs.h>
+
+#ifndef HAVE_TC_FLOWER
+/*
+ * Kernel headers older than 4.2 do not define this enum: provide it here.
+ * Runtime checks avoid sending TC messages the kernel cannot understand.
+ */
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
+	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
+};
+#endif
+#ifndef HAVE_TC_VLAN_ID
+enum {
+	/* TCA_FLOWER_FLAGS would come first here, hence the + 2 below */
+	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
+	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
+};
+#endif
 
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+	struct nlmsg msg; /* Netlink message of the TC rule for this flow */
+};
+
+struct convert_data {
+	uint16_t eth_type; /* EtherType recorded so far, 0 if none */
+	uint16_t ip_proto; /* IP protocol recorded so far, 0 if none */
+	uint8_t vlan; /* set to 1 once a VLAN item has been processed */
+	struct rte_flow *flow; /* flow to fill, NULL when only checking */
 };
 
+static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
+static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
 static int
 tap_flow_validate(struct rte_eth_dev *dev,
 		  const struct rte_flow_attr *attr,
@@ -67,6 +124,752 @@ static const struct rte_flow_ops tap_flow_ops = {
 	.flush = tap_flow_flush,
 };
 
+/* Anonymous array of item types, terminated by RTE_FLOW_ITEM_TYPE_END. */
+#define ITEMS(...) \
+	(const enum rte_flow_item_type []){ \
+		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
+	}
+
+/* One node of the graph of pattern items (layers) the tap PMD supports. */
+struct tap_flow_items {
+	/* Bit-mask of the fields that can be matched on for this item. */
+	const void *mask;
+	const unsigned int mask_sz; /* Size of the bit-masks, in bytes. */
+	/*
+	 * Bit-mask applied to the item spec when the item itself comes
+	 * without a mask.
+	 */
+	const void *default_mask;
+	/**
+	 * Conversion function from rte_flow to netlink attributes.
+	 *
+	 * @param item
+	 *   rte_flow item to convert.
+	 * @param data
+	 *   Conversion state (struct convert_data in the converters here).
+	 *
+	 * @return
+	 *   0 on success, negative value otherwise.
+	 */
+	int (*convert)(const struct rte_flow_item *item, void *data);
+	/** Item types allowed right after this one (see ITEMS()). */
+	const enum rte_flow_item_type *const items;
+};
+
+/* Graph of supported items: masks, converters and allowed followers. */
+static const struct tap_flow_items tap_flow_items[] = {
+	[RTE_FLOW_ITEM_TYPE_END] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+	},
+	[RTE_FLOW_ITEM_TYPE_ETH] = {
+		.items = ITEMS(
+			RTE_FLOW_ITEM_TYPE_VLAN,
+			RTE_FLOW_ITEM_TYPE_IPV4,
+			RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_eth){
+			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			.type = -1,
+		},
+		.mask_sz = sizeof(struct rte_flow_item_eth),
+		.default_mask = &rte_flow_item_eth_mask,
+		.convert = tap_flow_create_eth,
+	},
+	[RTE_FLOW_ITEM_TYPE_VLAN] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
+			       RTE_FLOW_ITEM_TYPE_IPV6),
+		.mask = &(const struct rte_flow_item_vlan){
+			.tpid = -1,
+			/* DEI bit left out: DEI matching is not supported */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+			.tci = 0xffef,
+#else
+			.tci = 0xefff,
+#endif
+		},
+		.mask_sz = sizeof(struct rte_flow_item_vlan),
+		.default_mask = &rte_flow_item_vlan_mask,
+		.convert = tap_flow_create_vlan,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV4] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv4){
+			.hdr = {
+				.src_addr = -1,
+				.dst_addr = -1,
+				.next_proto_id = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv4),
+		.default_mask = &rte_flow_item_ipv4_mask,
+		.convert = tap_flow_create_ipv4,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV6] = {
+		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
+			       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask = &(const struct rte_flow_item_ipv6){
+			.hdr = {
+				.src_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.dst_addr = {
+					"\xff\xff\xff\xff\xff\xff\xff\xff"
+					"\xff\xff\xff\xff\xff\xff\xff\xff",
+				},
+				.proto = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_ipv6),
+		.default_mask = &rte_flow_item_ipv6_mask,
+		.convert = tap_flow_create_ipv6,
+	},
+	[RTE_FLOW_ITEM_TYPE_UDP] = {
+		.mask = &(const struct rte_flow_item_udp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_udp),
+		.default_mask = &rte_flow_item_udp_mask,
+		.convert = tap_flow_create_udp,
+	},
+	[RTE_FLOW_ITEM_TYPE_TCP] = {
+		.mask = &(const struct rte_flow_item_tcp){
+			.hdr = {
+				.src_port = -1,
+				.dst_port = -1,
+			},
+		},
+		.mask_sz = sizeof(struct rte_flow_item_tcp),
+		.default_mask = &rte_flow_item_tcp_mask,
+		.convert = tap_flow_create_tcp,
+	},
+};
+
+/**
+ * Check an Ethernet item as thoroughly as possible and, when a flow is
+ * provided, append the matching Ethernet attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data: eth_type is recorded for the layers that follow.
+ *
+ * @return
+ *   0 if checks are alright, -1 if the eth_type mask is only partial.
+ */
+static int
+tap_flow_create_eth(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
+	/* TC does not support eth_type masking. Only accept if exact match. */
+	if (mask->type && mask->type != 0xffff)
+		return -1;
+	if (!spec)
+		return 0;
+	/* store eth_type for consistency if ipv4/6 pattern item comes next */
+	if (spec->type & mask->type)
+		info->eth_type = spec->type;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	if (spec->type & mask->type)
+		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
+					    (spec->type & mask->type));
+	/* The mask, not the spec, decides whether an address is matched. */
+	if (!is_zero_ether_addr(&mask->dst)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
+			   &spec->dst.addr_bytes);
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST_MASK,
+			   ETHER_ADDR_LEN, &mask->dst.addr_bytes);
+	}
+	if (!is_zero_ether_addr(&mask->src)) {
+		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
+			   &spec->src.addr_bytes);
+		nlattr_add(&msg->nh,
+			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
+			   &mask->src.addr_bytes);
+	}
+	return 0;
+}
+
+/**
+ * Check a VLAN item and, if a flow is provided, add its VLAN attributes.
+ * TCI spec and mask are ANDed in network order, then swapped to host order.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data: vlan is set for the layers that follow.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
+{
+	struct convert_data *info = (struct convert_data *)data;
+	const struct rte_flow_item_vlan *spec = item->spec;
+	const struct rte_flow_item_vlan *mask = item->mask;
+	struct rte_flow *flow = info->flow;
+	struct nlmsg *msg;
+
+	/* use default mask if none provided */
+	if (!mask)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
+	/* TC does not support tpid masking. Only accept if exact match. */
+	if (mask->tpid && mask->tpid != 0xffff)
+		return -1;
+	/* Double-tagging not supported. */
+	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+		return -1;
+	info->vlan = 1;
+	if (!flow)
+		return 0;
+	msg = &flow->msg;
+	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
+#define VLAN_PRIO(tci) ((tci) >> 13)
+#define VLAN_ID(tci) ((tci) & 0xfff)
+	if (!spec)
+		return 0;
+	if (spec->tci) {
+		uint16_t tci = ntohs(spec->tci & mask->tci);
+		uint16_t prio = VLAN_PRIO(tci);
+		uint16_t vid = VLAN_ID(tci); /* 12 bits wide */
+
+		if (prio)
+			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
+		if (vid)
+			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
+	}
+	return 0;
+}
+
+/**
+ * Check an IPv4 item against the layers seen so far and, when a flow is
+ * given, append the matching IPv4 attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared with the other items of the pattern.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
+{
+	const struct rte_flow_item_ipv4 *spec = item->spec;
+	const struct rte_flow_item_ipv4 *mask = item->mask;
+	struct convert_data *cd = data;
+	struct rte_flow *flow = cd->flow;
+	struct nlmsg *nlm;
+
+	/* fall back on the default mask when the item carries none */
+	if (mask == NULL)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
+	/* an eth_type recorded by a previous item has to be IPv4 */
+	if (cd->eth_type != 0 && cd->eth_type != htons(ETH_P_IP))
+		return -1;
+	/* remember ip_proto so that a following udp/tcp item can check it */
+	if (spec != NULL)
+		cd->ip_proto = spec->hdr.next_proto_id;
+	if (flow == NULL)
+		return 0;
+	nlm = &flow->msg;
+	if (cd->eth_type == 0)
+		cd->eth_type = htons(ETH_P_IP);
+	if (cd->vlan == 0)
+		nlm->t.tcm_info = TC_H_MAKE(nlm->t.tcm_info, htons(ETH_P_IP));
+	if (spec == NULL)
+		return 0;
+	if (spec->hdr.dst_addr != 0) {
+		nlattr_add32(&nlm->nh, TCA_FLOWER_KEY_IPV4_DST,
+			     spec->hdr.dst_addr);
+		nlattr_add32(&nlm->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
+			     mask->hdr.dst_addr);
+	}
+	if (spec->hdr.src_addr != 0) {
+		nlattr_add32(&nlm->nh, TCA_FLOWER_KEY_IPV4_SRC,
+			     spec->hdr.src_addr);
+		nlattr_add32(&nlm->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+			     mask->hdr.src_addr);
+	}
+	if (spec->hdr.next_proto_id != 0)
+		nlattr_add8(&nlm->nh, TCA_FLOWER_KEY_IP_PROTO,
+			    spec->hdr.next_proto_id);
+	return 0;
+}
+
+/**
+ * Check an IPv6 item against the layers seen so far and, when a flow is
+ * given, append the matching IPv6 attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared with the other items of the pattern.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
+{
+	const struct rte_flow_item_ipv6 *spec = item->spec;
+	const struct rte_flow_item_ipv6 *mask = item->mask;
+	const uint8_t unset_addr[16] = { 0 };
+	struct convert_data *cd = data;
+	struct rte_flow *flow = cd->flow;
+	struct nlmsg *nlm;
+
+	/* fall back on the default mask when the item carries none */
+	if (mask == NULL)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
+	/* an eth_type recorded by a previous item has to be IPv6 */
+	if (cd->eth_type != 0 && cd->eth_type != htons(ETH_P_IPV6))
+		return -1;
+	/* remember ip_proto so that a following udp/tcp item can check it */
+	if (spec != NULL)
+		cd->ip_proto = spec->hdr.proto;
+	if (flow == NULL)
+		return 0;
+	nlm = &flow->msg;
+	if (cd->eth_type == 0)
+		cd->eth_type = htons(ETH_P_IPV6);
+	if (cd->vlan == 0)
+		nlm->t.tcm_info = TC_H_MAKE(nlm->t.tcm_info, htons(ETH_P_IPV6));
+	if (spec == NULL)
+		return 0;
+	if (memcmp(spec->hdr.dst_addr, unset_addr, 16) != 0) {
+		nlattr_add(&nlm->nh, TCA_FLOWER_KEY_IPV6_DST,
+			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
+		nlattr_add(&nlm->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
+	}
+	if (memcmp(spec->hdr.src_addr, unset_addr, 16) != 0) {
+		nlattr_add(&nlm->nh, TCA_FLOWER_KEY_IPV6_SRC,
+			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
+		nlattr_add(&nlm->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
+	}
+	if (spec->hdr.proto != 0)
+		nlattr_add8(&nlm->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
+	return 0;
+}
+
+/**
+ * Check a UDP item against the layers seen so far and, when a flow is
+ * given, append the matching UDP attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared with the other items of the pattern.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_udp(const struct rte_flow_item *item, void *data)
+{
+	const struct rte_flow_item_udp *spec = item->spec;
+	const struct rte_flow_item_udp *mask = item->mask;
+	struct convert_data *cd = data;
+	struct rte_flow *flow = cd->flow;
+	struct nlmsg *nlm;
+
+	/* fall back on the default mask when the item carries none */
+	if (mask == NULL)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
+	/* an ip_proto recorded by a previous item has to be UDP */
+	if (cd->ip_proto != 0 && cd->ip_proto != IPPROTO_UDP)
+		return -1;
+	if (flow == NULL)
+		return 0;
+	nlm = &flow->msg;
+	nlattr_add8(&nlm->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
+	if (spec == NULL)
+		return 0;
+	if (spec->hdr.dst_port != 0 &&
+	    spec->hdr.dst_port == (spec->hdr.dst_port & mask->hdr.dst_port))
+		nlattr_add16(&nlm->nh, TCA_FLOWER_KEY_UDP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port != 0 &&
+	    spec->hdr.src_port == (spec->hdr.src_port & mask->hdr.src_port))
+		nlattr_add16(&nlm->nh, TCA_FLOWER_KEY_UDP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/**
+ * Check a TCP item against the layers seen so far and, when a flow is
+ * given, append the matching TCP attributes to its netlink message.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] data
+ *   struct convert_data shared with the other items of the pattern.
+ *
+ * @return
+ *   0 if checks are alright, -1 otherwise.
+ */
+static int
+tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
+{
+	const struct rte_flow_item_tcp *spec = item->spec;
+	const struct rte_flow_item_tcp *mask = item->mask;
+	struct convert_data *cd = data;
+	struct rte_flow *flow = cd->flow;
+	struct nlmsg *nlm;
+
+	/* fall back on the default mask when the item carries none */
+	if (mask == NULL)
+		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
+	/* an ip_proto recorded by a previous item has to be TCP */
+	if (cd->ip_proto != 0 && cd->ip_proto != IPPROTO_TCP)
+		return -1;
+	if (flow == NULL)
+		return 0;
+	nlm = &flow->msg;
+	nlattr_add8(&nlm->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
+	if (spec == NULL)
+		return 0;
+	if (spec->hdr.dst_port != 0 &&
+	    spec->hdr.dst_port == (spec->hdr.dst_port & mask->hdr.dst_port))
+		nlattr_add16(&nlm->nh, TCA_FLOWER_KEY_TCP_DST,
+			     spec->hdr.dst_port);
+	if (spec->hdr.src_port != 0 &&
+	    spec->hdr.src_port == (spec->hdr.src_port & mask->hdr.src_port))
+		nlattr_add16(&nlm->nh, TCA_FLOWER_KEY_TCP_SRC,
+			     spec->hdr.src_port);
+	return 0;
+}
+
+/* Tell whether no bit of bytes[] is set outside of supported[]. */
+static int
+tap_flow_mask_fits(const uint8_t *bytes, const uint8_t *supported,
+		   unsigned int size)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; ++i)
+		if ((bytes[i] | supported[i]) != supported[i])
+			return 0;
+	return 1;
+}
+
+/**
+ * Check that an item only asks for what is supported. Rules enforced:
+ * - mask or last are only accepted along with a spec;
+ * - an explicit mask must not have bits set outside of supported_mask;
+ * - without an explicit mask, spec, default_mask and last must not have
+ *   bits set outside of supported_mask;
+ * - once masked (item mask if any, default_mask otherwise), last must be
+ *   identical to spec, as TC does not support ranges.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param size
+ *   Bit-Mask size in bytes.
+ * @param[in] supported_mask
+ *   Bit-mask covering supported fields to compare with spec, last and mask in
+ *   \item.
+ * @param[in] default_mask
+ *   Bit-mask default mask if none is provided in \item.
+ *
+ * @return
+ *   0 on success, -1 when a bit-mask rule is broken, or the non-zero
+ *   memcmp() result when masked spec and last differ.
+ */
+static int
+tap_flow_item_validate(const struct rte_flow_item *item,
+		       unsigned int size,
+		       const uint8_t *supported_mask,
+		       const uint8_t *default_mask)
+{
+	const uint8_t *spec = item->spec;
+	const uint8_t *last = item->last;
+	const uint8_t *mask = item->mask;
+
+	/* mask and last make no sense without a spec */
+	if (spec == NULL && (mask != NULL || last != NULL))
+		return -1;
+	if (mask != NULL) {
+		/* an explicit mask only has to fit in the supported one */
+		if (!tap_flow_mask_fits(mask, supported_mask, size))
+			return -1;
+	} else if (spec != NULL) {
+		/* no mask: spec, default mask, and last if any, must fit */
+		if (!tap_flow_mask_fits(spec, supported_mask, size) ||
+		    !tap_flow_mask_fits(default_mask, supported_mask, size))
+			return -1;
+		if (last != NULL &&
+		    !tap_flow_mask_fits(last, supported_mask, size))
+			return -1;
+	}
+	/*
+	 * TC does not support ranges: last is only tolerated when, once
+	 * masked, it is identical to spec.
+	 */
+	if (spec != NULL && last != NULL) {
+		const uint8_t *apply = mask != NULL ? mask : default_mask;
+		uint8_t masked_spec[size];
+		uint8_t masked_last[size];
+		unsigned int i;
+
+		for (i = 0; i < size; ++i) {
+			masked_spec[i] = spec[i] & apply[i];
+			masked_last[i] = last[i] & apply[i];
+		}
+		return memcmp(masked_spec, masked_last, size);
+	}
+	return 0;
+}
+
+/**
+ * Append a "gact" action (used for DROP/PASSTHRU) to the flow TC message.
+ *
+ * @param[in, out] flow
+ *   Flow whose netlink message is completed.
+ * @param[in] action
+ *   Action value stored in the struct tc_gact sent as TCA_GACT_PARMS.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_gact(struct rte_flow *flow, int action)
+{
+	struct tc_gact parms = {
+		.action = action
+	};
+	const size_t first_act = 1; /* first slot of the action list */
+	struct nlmsg *nlm = &flow->msg;
+
+	if (nlattr_nested_start(nlm, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nlm, first_act) < 0)
+		return -1;
+	nlattr_add(&nlm->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
+	if (nlattr_nested_start(nlm, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nlm->nh, TCA_GACT_PARMS, sizeof(parms), &parms);
+	/* close the nests from the innermost outwards */
+	nlattr_nested_finish(nlm); /* TCA_ACT_OPTIONS */
+	nlattr_nested_finish(nlm); /* first_act */
+	nlattr_nested_finish(nlm); /* TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Append a "skbedit" action (used for QUEUE) to the flow TC message.
+ *
+ * @param[in, out] flow
+ *   Flow whose netlink message is completed.
+ * @param[in] queue
+ *   Queue id sent as TCA_SKBEDIT_QUEUE_MAPPING.
+ *
+ * @return
+ *   0 on success, -1 if a nested attribute could not be started.
+ */
+static int
+add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+{
+	struct tc_skbedit parms = {
+		.action = TC_ACT_PIPE
+	};
+	const size_t first_act = 1; /* first slot of the action list */
+	struct nlmsg *nlm = &flow->msg;
+
+	if (nlattr_nested_start(nlm, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nlm, first_act) < 0)
+		return -1;
+	nlattr_add(&nlm->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
+	if (nlattr_nested_start(nlm, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nlm->nh, TCA_SKBEDIT_PARMS, sizeof(parms), &parms);
+	nlattr_add16(&nlm->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
+	/* close the nests from the innermost outwards */
+	nlattr_nested_finish(nlm); /* TCA_ACT_OPTIONS */
+	nlattr_nested_finish(nlm); /* first_act */
+	nlattr_nested_finish(nlm); /* TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
+ * Validate a flow supported by TC.
+ * If flow param is not NULL, then also fill the netlink message inside.
+ *
+ * @param pmd
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] flow
+ *   Flow structure to update.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_process(struct pmd_internals *pmd,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow *flow)
+{
+	const struct tap_flow_items *cur_item = tap_flow_items;
+	struct convert_data data = {
+		.eth_type = 0,
+		.ip_proto = 0,
+		.flow = flow,
+	};
+	int action = 0; /* Only one action authorized for now */
+
+	if (attr->group > MAX_GROUP) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			NULL, "group value too big: cannot exceed 15");
+		return -rte_errno;
+	}
+	if (attr->priority > MAX_PRIORITY) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			NULL, "priority value too big");
+		return -rte_errno;
+	} else if (flow) {
+		uint16_t group = attr->group << GROUP_SHIFT;
+		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
+		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
+						 flow->msg.t.tcm_info);
+	}
+	if (!attr->ingress) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR,
+				   NULL, "direction should be ingress");
+		return -rte_errno;
+	}
+	/* rte_flow ingress is actually egress as seen in the kernel */
+	if (attr->ingress && flow)
+		flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	if (flow) {
+		/* use flower filter type */
+		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
+		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
+			goto exit_item_not_supported;
+	}
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		const struct tap_flow_items *token = NULL;
+		unsigned int i;
+		int err = 0;
+
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		for (i = 0;
+		     cur_item->items &&
+		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
+		     ++i) {
+			if (cur_item->items[i] == items->type) {
+				token = &tap_flow_items[items->type];
+				break;
+			}
+		}
+		if (!token)
+			goto exit_item_not_supported;
+		cur_item = token;
+		err = tap_flow_item_validate(
+			items, cur_item->mask_sz,
+			(const uint8_t *)cur_item->mask,
+			(const uint8_t *)cur_item->default_mask);
+		if (err)
+			goto exit_item_not_supported;
+		if (flow && cur_item->convert) {
+			if (!pmd->flower_vlan_support &&
+			    cur_item->convert == tap_flow_create_vlan)
+				goto exit_item_not_supported;
+			err = cur_item->convert(items, &data);
+			if (err)
+				goto exit_item_not_supported;
+		}
+	}
+	if (flow) {
+		if (pmd->flower_vlan_support && data.vlan) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     htons(ETH_P_8021Q));
+			nlattr_add16(&flow->msg.nh,
+				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+				     data.eth_type ?
+				     data.eth_type : htons(ETH_P_ALL));
+		} else if (data.eth_type) {
+			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
+				     data.eth_type);
+		}
+	}
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
+		int err = 0;
+
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
+			continue;
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_SHOT);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (flow)
+				err = add_action_gact(flow, TC_ACT_UNSPEC);
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
+			const struct rte_flow_action_queue *queue =
+				(const struct rte_flow_action_queue *)
+				actions->conf;
+			if (action)
+				goto exit_action_not_supported;
+			action = 1;
+			if (!queue || (queue->index >= pmd->nb_queues))
+				goto exit_action_not_supported;
+			if (flow)
+				err = add_action_skbedit(flow, queue->index);
+		} else {
+			goto exit_action_not_supported;
+		}
+		if (err)
+			goto exit_action_not_supported;
+	}
+	if (flow)
+		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
+	return 0;
+exit_item_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+			   items, "item not supported");
+	return -rte_errno;
+exit_action_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "action not supported");
+	return -rte_errno;
+}
+
+
+
 /**
  * Validate a flow.
  *
@@ -74,15 +877,46 @@ static const struct rte_flow_ops tap_flow_ops = {
  * @see rte_flow_ops
  */
 static int
-tap_flow_validate(struct rte_eth_dev *dev __rte_unused,
-		  const struct rte_flow_attr *attr __rte_unused,
-		  const struct rte_flow_item items[] __rte_unused,
-		  const struct rte_flow_action actions[] __rte_unused,
+tap_flow_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
 		  struct rte_flow_error *error)
 {
-	return -rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "not implemented yet");
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	return priv_flow_process(pmd, attr, items, actions, error, NULL);
+}
+
+/**
+ * Set a unique handle in a flow.
+ *
+ * The kernel supports TC rules with equal priority, as long as they use the
+ * same matching fields (e.g.: dst mac and ipv4) with different values (and
+ * full mask to ensure no collision is possible).
+ * In those rules, the handle (uint32_t) is the part that would identify
+ * specifically each rule.
+ *
+ * On 32-bit architectures, the handle can simply be the flow's pointer address.
+ * On 64-bit architectures, we rely on a jhash of the flow's pointer value to
+ * find a (sufficiently) unique handle.
+ *
+ * @param[in, out] flow
+ *   The flow that needs its handle set.
+ */
+static void
+tap_flow_set_handle(struct rte_flow *flow)
+{
+	uint32_t handle = 0;
+
+	if (sizeof(flow) > 4)
+		handle = rte_jhash(&flow, sizeof(flow), 1);
+	else
+		handle = (uintptr_t)flow;
+	/* must be at least 1 to avoid letting the kernel choose one for us */
+	if (!handle)
+		handle = 1;
+	flow->msg.t.tcm_handle = handle;
 }
 
 /**
@@ -100,17 +934,46 @@ tap_flow_create(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rte_flow *flow = NULL;
+	struct nlmsg *msg = NULL;
+	int err;
 
-	if (tap_flow_validate(dev, attr, items, actions, error))
-		return NULL;
+	if (!pmd->if_index) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL,
+				   "can't create rule, ifindex not found");
+		goto fail;
+	}
 	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "cannot allocate memory for rte_flow");
-		return NULL;
+		goto fail;
+	}
+	msg = &flow->msg;
+	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+	tap_flow_set_handle(flow);
+	if (priv_flow_process(pmd, attr, items, actions, error, flow))
+		goto fail;
+	err = nl_send(pmd->nlsk_fd, &msg->nh);
+	if (err < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto fail;
+	}
+	err = nl_recv_ack(pmd->nlsk_fd);
+	if (err < 0) {
+		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "overlapping rules");
+		goto fail;
 	}
 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
 	return flow;
+fail:
+	if (flow)
+		rte_free(flow);
+	return NULL;
 }
 
 /**
@@ -120,13 +983,31 @@ tap_flow_create(struct rte_eth_dev *dev,
  * @see rte_flow_ops
  */
 static int
-tap_flow_destroy(struct rte_eth_dev *dev __rte_unused,
+tap_flow_destroy(struct rte_eth_dev *dev,
 		 struct rte_flow *flow,
-		 struct rte_flow_error *error __rte_unused)
+		 struct rte_flow_error *error)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+	int ret = 0;
+
 	LIST_REMOVE(flow, next);
+	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
+
+	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
+	if (ret < 0) {
+		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				   NULL, "couldn't send request to kernel");
+		goto end;
+	}
+	ret = nl_recv_ack(pmd->nlsk_fd);
+	if (ret < 0)
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"couldn't receive kernel ack to our request");
+end:
 	rte_free(flow);
-	return 0;
+	return ret;
 }
 
 /**
@@ -170,6 +1051,10 @@ tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 		    enum rte_filter_op filter_op,
 		    void *arg)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
+
+	if (!pmd->flower_support)
+		return -ENOTSUP;
 	switch (filter_type) {
 	case RTE_ETH_FILTER_GENERIC:
 		if (filter_op != RTE_ETH_FILTER_GET)
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index 377a9f7b758a..a05e945df523 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -37,6 +37,18 @@
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 
+/**
+ * In TC, priority 0 means we require the kernel to allocate one for us.
+ * In rte_flow, however, we want priority 0 to be the most important one.
+ * Use an offset so that the most important priority is 1 in TC.
+ */
+#define PRIORITY_OFFSET 1
+#define PRIORITY_MASK (0xfff)
+#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET)
+#define GROUP_MASK (0xf)
+#define GROUP_SHIFT 12
+#define MAX_GROUP GROUP_MASK
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
-- 
2.12.0.306.g4a9b9b3

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [dpdk-dev] [PATCH v7 0/4] net/tap: support flow API
  2017-03-23  8:33     ` [dpdk-dev] [PATCH v7 " Pascal Mazon
                         ` (3 preceding siblings ...)
  2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
@ 2017-03-23 12:50       ` Ferruh Yigit
  4 siblings, 0 replies; 57+ messages in thread
From: Ferruh Yigit @ 2017-03-23 12:50 UTC (permalink / raw)
  To: Pascal Mazon, keith.wiles; +Cc: dev

On 3/23/2017 8:33 AM, Pascal Mazon wrote:
> This series adds support for the flow API in tap PMD.
> 
> It enables filtering specific packets incoming on the tap netdevice, to
> process only desired ones. Under the hood, it uses kernel TC (traffic
> control), which takes place very early in the stack, and supports most
> common pattern items and actions defined in the flow API.
> 
> v7 changes:
>    - provide more details in doc/guides/nics/tap.rst
> 
> v6 changes:
>   - fix compilation issue on i686 (wrong cast for rte flow handle)
> 
> v5 changes:
>   - rebase after adrien's patches on Tx poll and Rx signaling
>   - better spaces for comments in rte_eth_tap.h
> 
> v4 changes:
>   - rebase on top of "net/tap: add additional management ops" series
>   - fix a few netlink doxygen comments
>   - rename tap.h -> rte_eth_tap.h
>   - flush flow rules only when applicable
> 
> v3 changes:
>   - vlan patterns enabled depending on running kernel (4.9+)
>   - update doc/guides/nics/tap.rst for Flow API support
>   - rebase on top of "net/tap: add additional management ops" series
> 
> v2 changes:
>   - support compilation on kernels < 4.2 (where flower support appeared)
>   - set whitespaces in tap.h
>   - remove unnecessary goto
> 
> Pascal Mazon (4):
>   net/tap: move private elements to external header
>   net/tap: add preliminary support for rte_flow
>   net/tap: add netlink back-end for flow API
>   net/tap: add basic flow API patterns and actions

Series applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2017-03-23 12:50 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-03 10:45 [dpdk-dev] [PATCH 0/4] net/tap: support flow API Pascal Mazon
2017-03-03 10:45 ` [dpdk-dev] [PATCH 1/4] net/tap: move private elements to external header Pascal Mazon
2017-03-03 15:38   ` Wiles, Keith
2017-03-06 14:18     ` Pascal Mazon
2017-03-06 14:51       ` Wiles, Keith
2017-03-03 10:45 ` [dpdk-dev] [PATCH 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
2017-03-03 10:45 ` [dpdk-dev] [PATCH 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
2017-03-03 10:45 ` [dpdk-dev] [PATCH 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
2017-03-03 15:47   ` Wiles, Keith
2017-03-06 14:22     ` Pascal Mazon
2017-03-03 15:54 ` [dpdk-dev] [PATCH 0/4] net/tap: support flow API Wiles, Keith
2017-03-06 17:05 ` [dpdk-dev] [PATCH v2 " Pascal Mazon
2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 1/4] net/tap: move private elements to external header Pascal Mazon
2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
2017-03-06 17:05   ` [dpdk-dev] [PATCH v2 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
2017-03-07 15:05   ` [dpdk-dev] [PATCH v2 0/4] net/tap: support flow API Pascal Mazon
2017-03-07 15:08     ` Wiles, Keith
2017-03-07 16:35   ` [dpdk-dev] [PATCH v3 " Pascal Mazon
2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 1/4] net/tap: move private elements to external header Pascal Mazon
2017-03-09 15:28       ` Ferruh Yigit
2017-03-10  9:40         ` Pascal Mazon
2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
2017-03-09 15:29       ` Ferruh Yigit
2017-03-10  9:39         ` Pascal Mazon
2017-03-07 16:35     ` [dpdk-dev] [PATCH v3 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
2017-03-14  8:29   ` [dpdk-dev] [PATCH v4 0/4] net/tap: support flow API Pascal Mazon
2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 1/4] net/tap: move private elements to external header Pascal Mazon
2017-03-14 14:05       ` Wiles, Keith
2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
2017-03-14 14:03       ` Wiles, Keith
2017-03-14  8:29     ` [dpdk-dev] [PATCH v4 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
2017-03-15 14:54   ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Pascal Mazon
2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 1/4] net/tap: move private elements to external header Pascal Mazon
2017-03-21 15:32       ` Wiles, Keith
2017-03-21 16:57         ` Pascal Mazon
2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
2017-03-21 15:35       ` Wiles, Keith
2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
2017-03-15 14:54     ` [dpdk-dev] [PATCH v5 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
2017-03-21 17:10       ` Ferruh Yigit
2017-03-21 15:48     ` [dpdk-dev] [PATCH v5 0/4] net/tap: support flow API Wiles, Keith
2017-03-22  9:48   ` [dpdk-dev] [PATCH v6 " Pascal Mazon
2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 1/4] net/tap: move private elements to external header Pascal Mazon
2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
2017-03-22  9:48     ` [dpdk-dev] [PATCH v6 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
2017-03-22 13:56       ` Ferruh Yigit
2017-03-22 14:22     ` [dpdk-dev] [PATCH v6 0/4] net/tap: support flow API Wiles, Keith
2017-03-23  8:33     ` [dpdk-dev] [PATCH v7 " Pascal Mazon
2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 1/4] net/tap: move private elements to external header Pascal Mazon
2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 2/4] net/tap: add preliminary support for rte_flow Pascal Mazon
2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 3/4] net/tap: add netlink back-end for flow API Pascal Mazon
2017-03-23  8:33       ` [dpdk-dev] [PATCH v7 4/4] net/tap: add basic flow API patterns and actions Pascal Mazon
2017-03-23 12:50       ` [dpdk-dev] [PATCH v7 0/4] net/tap: support flow API Ferruh Yigit

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).