DPDK patches and discussions
 help / color / mirror / Atom feed
From: Andrzej Ostruszka <aostruszka@marvell.com>
To: <dev@dpdk.org>, Thomas Monjalon <thomas@monjalon.net>
Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>,
	Nithin Kumar Dabilpuram <ndabilpuram@marvell.com>,
	Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>,
	Kiran Kumar Kokkilagadda <kirankumark@marvell.com>,
	Krzysztof Kanas <kkanas@marvell.com>
Subject: [dpdk-dev] [RFC PATCH 2/3] if_proxy: add preliminary Linux implementation
Date: Tue, 14 Jan 2020 15:25:16 +0100	[thread overview]
Message-ID: <20200114142517.29522-3-aostruszka@marvell.com> (raw)
In-Reply-To: <20200114142517.29522-1-aostruszka@marvell.com>

This commit adds a preliminary Linux implementation of the IF Proxy
library.  It should allow one to play around with the idea and check its
usefulness.

Signed-off-by: Andrzej Ostruszka <aostruszka@marvell.com>
---
 config/common_base                            |   5 +
 lib/Makefile                                  |   2 +
 .../common/include/rte_eal_interrupts.h       |   2 +
 lib/librte_eal/linux/eal/eal_interrupts.c     |  14 +-
 lib/librte_if_proxy/Makefile                  |  25 +
 lib/librte_if_proxy/meson.build               |   7 +
 lib/librte_if_proxy/rte_if_proxy.c            | 803 ++++++++++++++++++
 lib/meson.build                               |   2 +-
 8 files changed, 855 insertions(+), 5 deletions(-)
 create mode 100644 lib/librte_if_proxy/Makefile
 create mode 100644 lib/librte_if_proxy/meson.build
 create mode 100644 lib/librte_if_proxy/rte_if_proxy.c

diff --git a/config/common_base b/config/common_base
index 7dec7ed45..f20296750 100644
--- a/config/common_base
+++ b/config/common_base
@@ -1056,6 +1056,11 @@ CONFIG_RTE_LIBRTE_BPF_ELF=n
 #
 CONFIG_RTE_LIBRTE_IPSEC=y
 
+#
+# Compile librte_if_proxy
+#
+CONFIG_RTE_LIBRTE_IF_PROXY=y
+
 #
 # Compile the test application
 #
diff --git a/lib/Makefile b/lib/Makefile
index 46b91ae1a..0a60f3656 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -118,6 +118,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry
 DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev
 DIRS-$(CONFIG_RTE_LIBRTE_RCU) += librte_rcu
 DEPDIRS-librte_rcu := librte_eal
+DIRS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += librte_if_proxy
+DEPDIRS-librte_if_proxy := librte_eal
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUX),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h
index b370c0d26..f3d39a5ce 100644
--- a/lib/librte_eal/common/include/rte_eal_interrupts.h
+++ b/lib/librte_eal/common/include/rte_eal_interrupts.h
@@ -35,7 +35,9 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_EXT,          /**< external handler */
 	RTE_INTR_HANDLE_VDEV,         /**< virtual device */
 	RTE_INTR_HANDLE_DEV_EVENT,    /**< device event handle */
+	RTE_INTR_HANDLE_NETLINK,      /**< netlink notification handle */
 	RTE_INTR_HANDLE_VFIO_REQ,     /**< VFIO request handle */
+
 	RTE_INTR_HANDLE_MAX           /**< count of elements */
 };
 
diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c
index 14ebb108c..ccdd94002 100644
--- a/lib/librte_eal/linux/eal/eal_interrupts.c
+++ b/lib/librte_eal/linux/eal/eal_interrupts.c
@@ -680,6 +680,9 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle)
 		break;
 	/* not used at this moment */
 	case RTE_INTR_HANDLE_ALARM:
+#if RTE_LIBRTE_IF_PROXY
+	case RTE_INTR_HANDLE_NETLINK:
+#endif
 		return -1;
 #ifdef VFIO_PRESENT
 	case RTE_INTR_HANDLE_VFIO_MSIX:
@@ -796,6 +799,9 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle)
 		break;
 	/* not used at this moment */
 	case RTE_INTR_HANDLE_ALARM:
+#if RTE_LIBRTE_IF_PROXY
+	case RTE_INTR_HANDLE_NETLINK:
+#endif
 		return -1;
 #ifdef VFIO_PRESENT
 	case RTE_INTR_HANDLE_VFIO_MSIX:
@@ -889,12 +895,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
 			break;
 #endif
 #endif
-		case RTE_INTR_HANDLE_VDEV:
 		case RTE_INTR_HANDLE_EXT:
-			bytes_read = 0;
-			call = true;
-			break;
+		case RTE_INTR_HANDLE_VDEV:
 		case RTE_INTR_HANDLE_DEV_EVENT:
+#if RTE_LIBRTE_IF_PROXY
+		case RTE_INTR_HANDLE_NETLINK:
+#endif
 			bytes_read = 0;
 			call = true;
 			break;
diff --git a/lib/librte_if_proxy/Makefile b/lib/librte_if_proxy/Makefile
new file mode 100644
index 000000000..9dd5f4791
--- /dev/null
+++ b/lib/librte_if_proxy/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2019 Marvell International Ltd.
+
+# Legacy make-based build description for the IF Proxy library.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_if_proxy.a
+
+# Compiler/linker settings: experimental API allowed, warnings per
+# WERROR_FLAGS, only EAL is linked explicitly.
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+LDLIBS += -lrte_eal
+
+# Symbol version map.  NOTE(review): the map file is not created by this
+# patch - confirm it is provided elsewhere in the series.
+EXPORT_MAP := rte_if_proxy_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) := rte_if_proxy.c
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_IF_PROXY)-include := rte_if_proxy.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_if_proxy/meson.build b/lib/librte_if_proxy/meson.build
new file mode 100644
index 000000000..f9ed410b6
--- /dev/null
+++ b/lib/librte_if_proxy/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2019 Marvell International Ltd.
+
+# Meson build description for the IF Proxy library: a single source file
+# and a single public header, built with experimental APIs allowed.
+version = 1
+allow_experimental_apis = true
+sources = files('rte_if_proxy.c')
+headers = files('rte_if_proxy.h')
diff --git a/lib/librte_if_proxy/rte_if_proxy.c b/lib/librte_if_proxy/rte_if_proxy.c
new file mode 100644
index 000000000..770462702
--- /dev/null
+++ b/lib/librte_if_proxy/rte_if_proxy.c
@@ -0,0 +1,803 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+
+#include <rte_if_proxy.h>
+#include <rte_interrupts.h>
+#include <rte_spinlock.h>
+#include <rte_string_fns.h>
+
+#include <stdbool.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+
+/* Log type used by all messages of this library. */
+static int ifpx_log_type;
+
+/* Logging helper: prepends the name of the calling function and appends
+ * a newline, so messages passed in should not end with one.
+ */
+#define IFPX_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, ifpx_log_type, "%s(): " fmt "\n", \
+		__func__, ##args)
+
+/* Interrupt handle wrapping the netlink socket.  A descriptor of -1 means
+ * that the library is not listening.
+ */
+static struct rte_intr_handle ifpx_irq = {
+	.fd = -1,
+	.type = RTE_INTR_HANDLE_NETLINK,
+};
+
+/* Netlink port ID of our socket (as reported by getsockname()). */
+static unsigned int ifpx_pid;
+
+/* Port to proxy mapping table: indexed by port id, holds the id of the
+ * proxy bound to that port or RTE_MAX_ETHPORTS when the port is unbound.
+ */
+static uint16_t ifpx_p2p[RTE_MAX_ETHPORTS];
+
+/* This library lives on the slow/configuration path, so a single lock is
+ * used for all of its data structures.  Only the proxy and callback lists
+ * are guarded by it - the port to proxy table above is deliberately
+ * accessed without locking.
+ */
+static rte_spinlock_t ifpx_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Element of the list of configured proxies. */
+struct ifpx_proxies_node {
+	TAILQ_ENTRY(ifpx_proxies_node) elem;
+	uint16_t proxy_id;
+	struct rte_ifpx_info info;
+};
+
+/* List of configured proxies. */
+static TAILQ_HEAD(ifpx_proxies_head, ifpx_proxies_node) ifpx_proxies =
+	TAILQ_HEAD_INITIALIZER(ifpx_proxies);
+
+/* Element of the list of registered callback sets. */
+struct ifpx_cbs_node {
+	TAILQ_ENTRY(ifpx_cbs_node) elem;
+	struct rte_ifpx_callbacks cbs;
+};
+
+/* List of registered callbacks. */
+static TAILQ_HEAD(ifpx_cbs_head, ifpx_cbs_node) ifpx_callbacks =
+	TAILQ_HEAD_INITIALIZER(ifpx_callbacks);
+
+/* Defined further down; needed earlier by the binding code. */
+static int request_info(int type, int index);
+
+/* Return the bitmask of RTE_IFPX_* notification types that this
+ * implementation reports.
+ */
+uint64_t rte_ifpx_callbacks_available(void)
+{
+	const uint64_t link_events = RTE_IFPX_MAC_CHANGE |
+				     RTE_IFPX_MTU_CHANGE |
+				     RTE_IFPX_LINK_CHANGE;
+	const uint64_t addr_events = RTE_IFPX_ADDR_ADD | RTE_IFPX_ADDR_DEL |
+				     RTE_IFPX_ADDR6_ADD | RTE_IFPX_ADDR6_DEL;
+	const uint64_t route_events = RTE_IFPX_ROUTE_ADD | RTE_IFPX_ROUTE_DEL |
+				      RTE_IFPX_ROUTE6_ADD |
+				      RTE_IFPX_ROUTE6_DEL;
+
+	return link_events | addr_events | route_events;
+}
+
+/* Create a proxy port of the requested type.
+ *
+ * The device name is built from the driver name ("net_tap" or "net_kni")
+ * followed by the number of already existing ports served by that driver.
+ *
+ * Returns the value of rte_ifpx_create_by_devarg() for that name, or
+ * RTE_MAX_ETHPORTS when the proxy type is not recognised.
+ */
+uint16_t rte_ifpx_create(enum rte_ifpx_type type)
+{
+	char devargs[16] = { '\0' };
+	const char *drv_name;
+	int same_drv_ports = 0;
+	uint16_t port_id;
+	int nlen;
+
+	if (type == RTE_IFPX_DEFAULT || type == RTE_IFPX_TAP) {
+		drv_name = "net_tap";
+	} else if (type == RTE_IFPX_KNI) {
+		drv_name = "net_kni";
+	} else {
+		IFPX_LOG(ERR, "Unknown proxy type: %d", type);
+		return RTE_MAX_ETHPORTS;
+	}
+	nlen = strlcpy(devargs, drv_name, sizeof(devargs));
+
+	/* Count the ports that already use the selected driver. */
+	RTE_ETH_FOREACH_DEV(port_id) {
+		const char *name =
+			rte_eth_devices[port_id].device->driver->name;
+
+		if (strcmp(name, devargs) != 0)
+			continue;
+		++same_drv_ports;
+	}
+	snprintf(devargs + nlen, sizeof(devargs) - nlen, "%d",
+		 same_drv_ports);
+
+	return rte_ifpx_create_by_devarg(devargs);
+}
+
+/* Create a proxy port from the given device argument string.
+ *
+ * The device is probed and then the first ethdev port matching 'devarg'
+ * is looked up.
+ *
+ * Returns the id of that port, or RTE_MAX_ETHPORTS when probing fails or
+ * no matching port is found.
+ */
+uint16_t rte_ifpx_create_by_devarg(const char *devarg)
+{
+	uint16_t port_id = RTE_MAX_ETHPORTS;
+	struct rte_dev_iterator iter;
+
+	if (rte_dev_probe(devarg) < 0) {
+		/* IFPX_LOG() terminates the line itself. */
+		IFPX_LOG(ERR, "Failed to create proxy port %s", devarg);
+		return RTE_MAX_ETHPORTS;
+	}
+
+	/* Only the first match is of interest. */
+	RTE_ETH_FOREACH_MATCHING_DEV(port_id, devarg, &iter) {
+		break;
+	}
+	/* The iteration has been abandoned early, so release the iterator
+	 * explicitly.
+	 */
+	if (port_id != RTE_MAX_ETHPORTS)
+		rte_eth_iterator_cleanup(&iter);
+
+	return port_id;
+}
+
+/* Destroy the given proxy.
+ *
+ * The proxy is removed from the list of configured proxies, every port
+ * bound to it is unbound and the underlying device is removed.
+ *
+ * Returns -EINVAL when 'proxy_id' is not a configured proxy, otherwise the
+ * result of rte_dev_remove().
+ */
+int rte_ifpx_destroy(uint16_t proxy_id)
+{
+	struct ifpx_proxies_node *px;
+	unsigned int i;
+	int ec = 0;
+
+	rte_spinlock_lock(&ifpx_lock);
+	/* Stop at the matching node: TAILQ_FOREACH leaves 'px' NULL only when
+	 * the whole list has been walked without a match.
+	 */
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == proxy_id)
+			break;
+	}
+	if (!px) {
+		ec = -EINVAL;
+		goto exit;
+	}
+	TAILQ_REMOVE(&ifpx_proxies, px, elem);
+	free(px);
+
+	/* Clear any bindings for this proxy. */
+	for (i = 0; i < RTE_DIM(ifpx_p2p); ++i) {
+		if (ifpx_p2p[i] == proxy_id)
+			ifpx_p2p[i] = RTE_MAX_ETHPORTS;
+	}
+
+	ec = rte_dev_remove(rte_eth_devices[proxy_id].device);
+exit:
+	rte_spinlock_unlock(&ifpx_lock);
+	return ec;
+}
+
+/* Bind a port to a proxy.
+ *
+ * If the proxy is not yet on the list of configured proxies, a new entry
+ * is created and filled with the interface index, MTU and MAC address read
+ * from the proxy device.  A port that is already bound gets rebound with a
+ * warning.  When the library is listening, link information for the proxy
+ * interface is requested from the system.
+ *
+ * Returns 0 on success, -EINVAL on invalid port/proxy id or when the proxy
+ * has no interface index, -ENOMEM when the proxy entry cannot be
+ * allocated, or the error reported by rte_eth_dev_info_get().
+ */
+int rte_ifpx_port_bind(uint16_t port_id, uint16_t proxy_id)
+{
+	struct rte_eth_dev_info proxy_eth_info;
+	struct ifpx_proxies_node *px;
+	unsigned int if_index;
+	int ec;
+
+	if (port_id >= RTE_MAX_ETHPORTS || proxy_id >= RTE_MAX_ETHPORTS) {
+		IFPX_LOG(ERR, "Invalid port_id/proxy_id: %d/%d", port_id,
+			 proxy_id);
+		return -EINVAL;
+	}
+
+	/* Do automatic rebinding but issue a warning since this is not
+	 * considered to be a valid behaviour.
+	 */
+	if (ifpx_p2p[port_id] != RTE_MAX_ETHPORTS) {
+		IFPX_LOG(WARNING, "Port already bound: %d -> %d", port_id,
+			 ifpx_p2p[port_id]);
+	}
+
+	ec = rte_eth_dev_info_get(proxy_id, &proxy_eth_info);
+	if (ec < 0) {
+		IFPX_LOG(ERR, "Failed to read proxy dev info: %d", ec);
+		return ec;
+	}
+	if (proxy_eth_info.if_index == 0) {
+		IFPX_LOG(ERR, "Proxy with no IF index");
+		return -EINVAL;
+	}
+
+	/* Search for existing proxy - if not found add one to the list. */
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == proxy_id)
+			break;
+	}
+	if (!px) {
+		px = malloc(sizeof(*px));
+		if (!px) {
+			rte_spinlock_unlock(&ifpx_lock);
+			return -ENOMEM;
+		}
+		/* Start from an all-zero info so that MTU/MAC have a defined
+		 * value even if the getters below fail.
+		 */
+		memset(&px->info, 0, sizeof(px->info));
+		px->proxy_id = proxy_id;
+		px->info.if_index = proxy_eth_info.if_index;
+		rte_eth_dev_get_mtu(proxy_id, &px->info.mtu);
+		rte_eth_macaddr_get(proxy_id, &px->info.mac);
+		TAILQ_INSERT_TAIL(&ifpx_proxies, px, elem);
+	}
+	/* Copy what is needed later: once the lock is released the node may
+	 * be freed by a concurrent destroy/close.
+	 */
+	if_index = px->info.if_index;
+	rte_spinlock_unlock(&ifpx_lock);
+	ifpx_p2p[port_id] = proxy_id;
+
+	if (ifpx_irq.fd != -1)
+		request_info(RTM_GETLINK, if_index);
+
+	return 0;
+}
+
+/* Remove the binding of the given port.
+ *
+ * Returns 0 on success or -EINVAL when the port id is out of range or the
+ * port is not bound to any proxy.
+ */
+int rte_ifpx_port_unbind(uint16_t port_id)
+{
+	if (port_id >= RTE_MAX_ETHPORTS)
+		return -EINVAL;
+	if (ifpx_p2p[port_id] == RTE_MAX_ETHPORTS)
+		return -EINVAL;
+
+	/* A proxy that is left without any port is fine - a freshly created
+	 * proxy is in the same state and it can still report routing
+	 * information - so no check is made for that here.
+	 */
+	ifpx_p2p[port_id] = RTE_MAX_ETHPORTS;
+
+	return 0;
+}
+
+/* Register a set of callbacks.
+ *
+ * The structure pointed to by 'cbs' is copied into a newly allocated list
+ * node and a handle to that copy is returned.  When 'cbs' is itself the
+ * address of an already registered copy, it is returned unchanged.
+ *
+ * Returns NULL when 'cbs' is NULL or the node cannot be allocated.
+ */
+rte_ifpx_cbs_hndl rte_ifpx_callbacks_register(const
+					      struct rte_ifpx_callbacks *cbs)
+{
+	rte_ifpx_cbs_hndl cb_hndl = NULL;
+	struct ifpx_cbs_node *node;
+
+	if (cbs == NULL)
+		return NULL;
+
+	rte_spinlock_lock(&ifpx_lock);
+
+	TAILQ_FOREACH(node, &ifpx_callbacks, elem) {
+		if (&node->cbs == cbs)
+			break;
+	}
+
+	if (node != NULL) {
+		/* Already on the list. */
+		cb_hndl = cbs;
+	} else {
+		node = malloc(sizeof(*node));
+		if (node != NULL) {
+			node->cbs = *cbs;
+			TAILQ_INSERT_TAIL(&ifpx_callbacks, node, elem);
+			cb_hndl = &node->cbs;
+		}
+	}
+
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return cb_hndl;
+}
+
+/* Unregister a set of callbacks identified by the handle obtained from
+ * rte_ifpx_callbacks_register().
+ *
+ * Returns 0 when the handle has been found and removed, -EINVAL otherwise
+ * (including a NULL handle).
+ */
+int rte_ifpx_callbacks_unregister(rte_ifpx_cbs_hndl cbs)
+{
+	struct ifpx_cbs_node *node;
+
+	if (cbs == NULL)
+		return -EINVAL;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(node, &ifpx_callbacks, elem) {
+		if (&node->cbs == cbs)
+			break;
+	}
+	if (node != NULL) {
+		TAILQ_REMOVE(&ifpx_callbacks, node, elem);
+		free(node);
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return node != NULL ? 0 : -EINVAL;
+}
+
+/* Return the id of the proxy bound to 'port_id', or RTE_MAX_ETHPORTS when
+ * the port id is out of range or the port is not bound.
+ */
+uint16_t rte_ifpx_proxy_get(uint16_t port_id)
+{
+	return port_id < RTE_MAX_ETHPORTS ? ifpx_p2p[port_id]
+					  : RTE_MAX_ETHPORTS;
+}
+
+/* Get the ports bound to the given proxy.
+ *
+ * 'ports' (may be NULL) receives at most 'num' ids of ports bound to
+ * 'proxy_id'.
+ *
+ * Returns the total number of ports bound to the proxy, which may be
+ * larger than 'num'.
+ */
+unsigned int rte_ifpx_port_get(uint16_t proxy_id,
+                               uint16_t *ports, unsigned int num)
+{
+	unsigned int p, cnt = 0;
+
+	for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+		if (ifpx_p2p[p] == proxy_id) {
+			++cnt;
+			if (ports && num > 0) {
+				/* Report the bound port - the table index -
+				 * not the table value, which is the proxy id.
+				 */
+				*ports++ = p;
+				--num;
+			}
+		}
+	}
+	return cnt;
+}
+
+/* Get the information stored for the proxy bound to the given port.
+ *
+ * Returns a pointer to the info kept in the proxy list, or NULL when the
+ * port id is out of range, the port is not bound, or the proxy is not on
+ * the list.
+ *
+ * NOTE(review): the returned pointer refers to list storage that is freed
+ * by rte_ifpx_destroy()/rte_ifpx_close() - callers presumably must not use
+ * it past such calls; confirm against the API documentation.
+ */
+const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id)
+{
+	const struct rte_ifpx_info *info = NULL;
+	struct ifpx_proxies_node *px;
+	uint16_t proxy_id;
+
+	if (port_id >= RTE_MAX_ETHPORTS)
+		return NULL;
+
+	/* Read the mapping once so that the check and the search below use
+	 * the same value even if the port gets rebound meanwhile.
+	 */
+	proxy_id = ifpx_p2p[port_id];
+	if (proxy_id == RTE_MAX_ETHPORTS)
+		return NULL;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == proxy_id) {
+			info = &px->info;
+			break;
+		}
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+
+	/* NULL when the proxy is gone - never an address derived from NULL. */
+	return info;
+}
+
+/* Handle RTM_NEWLINK/RTM_DELLINK message.
+ *
+ * Messages for interfaces that are not proxies are dropped.  For a proxy
+ * interface the registered link/MTU/MAC callbacks are called for every
+ * port bound to it (MTU and MAC only when the value differs from the one
+ * stored in the proxy info, which is then updated).  When the message is
+ * a reply to our own request, either a follow-up address request is sent
+ * (request for specific interface) or the interface name is stored
+ * (global dump).
+ *
+ * The proxy lookup, the callbacks and the info updates are all done in
+ * one critical section, so the proxy node cannot be freed while in use.
+ * Callbacks are therefore called with the library lock held.
+ */
+static
+void handle_link(const struct nlmsghdr *h)
+{
+	const struct ifinfomsg *ifi = NLMSG_DATA(h);
+	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+	const struct rtattr *attrs[IFLA_MAX+1] = { NULL };
+	const struct rtattr *attr;
+	struct ifpx_proxies_node *px;
+	struct ifpx_cbs_node *cb;
+	bool follow_up = false;
+	uint16_t p;
+
+	IFPX_LOG(DEBUG, "\tLink action (%u): %u, 0x%x/0x%x (flags/changed)",
+		 ifi->ifi_index, h->nlmsg_type, ifi->ifi_flags,
+		 ifi->ifi_change);
+
+	/* When message is a reply to request for specific interface then keep
+	 * it only when it contains info for this interface.
+	 */
+	if (h->nlmsg_pid == ifpx_pid && h->nlmsg_seq >> 8 &&
+	    (h->nlmsg_seq >> 8) != (unsigned int)ifi->ifi_index)
+		return;
+
+	/* Index the attributes by type; unknown types are ignored. */
+	for (attr = IFLA_RTA(ifi); RTA_OK(attr, alen);
+				   attr = RTA_NEXT(attr, alen)) {
+		if (attr->rta_type > IFLA_MAX)
+			continue;
+		attrs[attr->rta_type] = attr;
+	}
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->info.if_index == (unsigned int)ifi->ifi_index)
+			break;
+	}
+	/* Drop messages that are not associated with any proxy */
+	if (!px) {
+		rte_spinlock_unlock(&ifpx_lock);
+		return;
+	}
+
+	if (ifi->ifi_change & IFF_UP) {
+		TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+			if (!cb->cbs.link_change)
+				continue;
+			for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+				if (ifpx_p2p[p] != px->proxy_id)
+					continue;
+				cb->cbs.link_change(p,
+						    ifi->ifi_flags & IFF_UP);
+			}
+		}
+	}
+	if (attrs[IFLA_MTU]) {
+		uint16_t mtu = *(const int *)RTA_DATA(attrs[IFLA_MTU]);
+		if (mtu != px->info.mtu) {
+			px->info.mtu = mtu;
+			TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+				if (!cb->cbs.mtu_change)
+					continue;
+				for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+					if (ifpx_p2p[p] != px->proxy_id)
+						continue;
+					cb->cbs.mtu_change(p, mtu);
+				}
+			}
+		}
+	}
+	/* Only accept an address of Ethernet size - anything else would make
+	 * the compare/copy below read outside of the attribute payload.
+	 */
+	if (attrs[IFLA_ADDRESS] &&
+	    RTA_PAYLOAD(attrs[IFLA_ADDRESS]) == RTE_ETHER_ADDR_LEN) {
+		const struct rte_ether_addr *mac =
+		                RTA_DATA(attrs[IFLA_ADDRESS]);
+
+		if (memcmp(mac, &px->info.mac, RTE_ETHER_ADDR_LEN) != 0) {
+			memcpy(px->info.mac.addr_bytes, mac, RTE_ETHER_ADDR_LEN);
+			TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+				if (!cb->cbs.mac_change)
+					continue;
+				for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+					if (ifpx_p2p[p] != px->proxy_id)
+						continue;
+					cb->cbs.mac_change(p, mac);
+				}
+			}
+		}
+	}
+	if (h->nlmsg_pid == ifpx_pid) {
+		RTE_ASSERT((h->nlmsg_seq & 0xFF) == RTM_GETLINK);
+		/* If this is reply for specific link request (not initial
+		 * global dump) then follow up with address request, otherwise
+		 * just store the interface name.
+		 */
+		if (h->nlmsg_seq >> 8)
+			follow_up = true;
+		else if (!px->info.if_name[0] && attrs[IFLA_IFNAME])
+			strlcpy(px->info.if_name, RTA_DATA(attrs[IFLA_IFNAME]),
+				sizeof(px->info.if_name));
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+
+	/* Sending does not touch the lists, so do it outside of the lock. */
+	if (follow_up)
+		request_info(RTM_GETADDR, ifi->ifi_index);
+}
+
+/* Handle RTM_NEWADDR/RTM_DELADDR message.
+ *
+ * Messages for interfaces that are not proxies, or carrying no (or too
+ * short) IFA_ADDRESS attribute, are dropped.  Otherwise the IPv4/IPv6
+ * address add/del callbacks are called for every port bound to the proxy;
+ * 'needs_del' selects between the "del" and "add" variant.
+ *
+ * The proxy lookup and the callbacks are done in one critical section, so
+ * the proxy node cannot be freed while in use.  Callbacks are therefore
+ * called with the library lock held.
+ */
+static
+void handle_addr(const struct nlmsghdr *h, bool needs_del)
+{
+	const struct ifaddrmsg *ifa = NLMSG_DATA(h);
+	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifa));
+	const struct rtattr *attrs[IFA_MAX+1] = { NULL };
+	const struct rtattr *attr;
+	struct ifpx_proxies_node *px;
+	struct ifpx_cbs_node *cb;
+	const uint8_t *ip;
+	uint32_t ipv4 = 0;
+	uint16_t p;
+
+	/* When message is a reply to request for specific interface then keep
+	 * it only when it contains info for this interface.
+	 */
+	if (h->nlmsg_pid == ifpx_pid && h->nlmsg_seq >> 8 &&
+	    (h->nlmsg_seq >> 8) != ifa->ifa_index)
+		return;
+
+	/* Index the attributes by type; unknown types are ignored. */
+	for (attr = IFA_RTA(ifa); RTA_OK(attr, alen);
+				  attr = RTA_NEXT(attr, alen)) {
+		if (attr->rta_type > IFA_MAX)
+			continue;
+		attrs[attr->rta_type] = attr;
+	}
+	if (!attrs[IFA_ADDRESS])
+		return;
+
+	/* Make sure the payload really holds a full address of the given
+	 * family before it is read or handed to the callbacks.
+	 */
+	ip = RTA_DATA(attrs[IFA_ADDRESS]);
+	if (ifa->ifa_family == AF_INET) {
+		if (RTA_PAYLOAD(attrs[IFA_ADDRESS]) < 4)
+			return;
+		/* address is in network order */
+		ipv4 = RTE_IPV4(ip[0], ip[1], ip[2], ip[3]);
+	} else if (ifa->ifa_family == AF_INET6) {
+		if (RTA_PAYLOAD(attrs[IFA_ADDRESS]) < 16)
+			return;
+	} else {
+		return;
+	}
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->info.if_index == ifa->ifa_index)
+			break;
+	}
+	/* Drop messages that are not associated with any proxy */
+	if (!px) {
+		rte_spinlock_unlock(&ifpx_lock);
+		return;
+	}
+
+	TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+		const struct rte_ifpx_callbacks *cbs = &cb->cbs;
+
+		for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+			if (ifpx_p2p[p] != px->proxy_id)
+				continue;
+			if (ifa->ifa_family == AF_INET) {
+				if (needs_del && cbs->addr_del)
+					cbs->addr_del(p, ipv4);
+				else if (!needs_del && cbs->addr_add)
+					cbs->addr_add(p, ipv4);
+			} else {
+				if (needs_del && cbs->addr6_del)
+					cbs->addr6_del(p, ip);
+				else if (!needs_del && cbs->addr6_add)
+					cbs->addr6_add(p, ip);
+			}
+		}
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Handle RTM_NEWROUTE/RTM_DELROUTE message.
+ *
+ * When the message carries a destination (RTA_DST), every registered set
+ * of callbacks gets its IPv4 or IPv6 route callback invoked with the
+ * destination and its prefix length; 'needs_del' selects between the
+ * "del" and "add" variant.  Callbacks are called with the library lock
+ * held.
+ */
+static
+void handle_route(const struct nlmsghdr *h, bool needs_del)
+{
+	const struct rtmsg *rtm = NLMSG_DATA(h);
+	const struct rtattr *tb[RTA_MAX+1] = { NULL };
+	const struct rtattr *rta = RTM_RTA(rtm);
+	int left = h->nlmsg_len - NLMSG_LENGTH(sizeof(*rtm));
+	struct ifpx_cbs_node *node;
+	const uint8_t *dst;
+
+	/* Index the attributes by type; unknown types are ignored. */
+	while (RTA_OK(rta, left)) {
+		if (rta->rta_type <= RTA_MAX)
+			tb[rta->rta_type] = rta;
+		rta = RTA_NEXT(rta, left);
+	}
+
+	rte_spinlock_lock(&ifpx_lock);
+	if (tb[RTA_DST] != NULL) {
+		dst = RTA_DATA(tb[RTA_DST]);
+
+		TAILQ_FOREACH(node, &ifpx_callbacks, elem) {
+			const struct rte_ifpx_callbacks *cbs = &node->cbs;
+
+			switch (rtm->rtm_family) {
+			case AF_INET: {
+				/* address is in network order */
+				uint32_t ipv4 = RTE_IPV4(dst[0], dst[1],
+							 dst[2], dst[3]);
+
+				if (needs_del) {
+					if (cbs->route_del)
+						cbs->route_del(ipv4,
+							rtm->rtm_dst_len);
+				} else if (cbs->route_add) {
+					cbs->route_add(ipv4,
+						       rtm->rtm_dst_len);
+				}
+				break;
+			}
+			case AF_INET6:
+				if (needs_del) {
+					if (cbs->route6_del)
+						cbs->route6_del(dst,
+							rtm->rtm_dst_len);
+				} else if (cbs->route6_add) {
+					cbs->route6_add(dst,
+							rtm->rtm_dst_len);
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Send a netlink dump request.
+ *
+ * 'type' is one of RTM_GETLINK, RTM_GETADDR or RTM_GETROUTE.  'index' is
+ * the interface index the request is about (0 for a global request); it
+ * is placed in the link/address request header and, together with 'type',
+ * encoded in the sequence number so that the reply can be recognised.
+ *
+ * Returns the result of send() (negative on failure, with rte_errno set)
+ * or -EINVAL for an unsupported 'type'.
+ */
+static
+int request_info(int type, int index)
+{
+	static rte_spinlock_t send_lock = RTE_SPINLOCK_INITIALIZER;
+	struct info_get {
+		struct nlmsghdr h;
+		union {
+			struct ifinfomsg ifm;
+			struct ifaddrmsg ifa;
+			struct rtmsg rtm;
+		} __rte_aligned(NLMSG_ALIGNTO);
+	} info_req;
+	int ret;
+
+	IFPX_LOG(DEBUG, "\tRequesting msg %d for: %u", type, index);
+
+	memset(&info_req, 0, sizeof(info_req));
+	/* First byte of these messages is family, so just make sure that this
+	 * memset is enough to get all families.
+	 */
+	RTE_ASSERT(AF_UNSPEC == 0);
+
+	info_req.h.nlmsg_pid = ifpx_pid;
+	info_req.h.nlmsg_type = type;
+	info_req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	info_req.h.nlmsg_len = offsetof(struct info_get, ifm);
+
+	switch (type) {
+	case RTM_GETLINK:
+		info_req.h.nlmsg_len += sizeof(info_req.ifm);
+		info_req.ifm.ifi_index = index;
+		break;
+	case RTM_GETADDR:
+		info_req.h.nlmsg_len += sizeof(info_req.ifa);
+		info_req.ifa.ifa_index = index;
+		break;
+	case RTM_GETROUTE:
+		info_req.h.nlmsg_len += sizeof(info_req.rtm);
+		break;
+	default:
+		return -EINVAL;
+	}
+	/* Store request type (and if it is global or link specific) in 'seq'.
+	 * Later it is used during handling of reply to continue requesting of
+	 * information dump from system - if needed.
+	 */
+	info_req.h.nlmsg_seq = index << 8 | type;
+
+	rte_spinlock_lock(&send_lock);
+	ret = send(ifpx_irq.fd, &info_req, info_req.h.nlmsg_len, 0);
+	if (ret < 0) {
+		/* Save errno first - logging may overwrite it. */
+		int err = errno;
+
+		IFPX_LOG(ERR, "Failed to send netlink msg: %d", err);
+		rte_errno = err;
+	}
+	rte_spinlock_unlock(&send_lock);
+
+	return ret;
+}
+
+/* Call the 'cfg_finished' callback of every registered callback set that
+ * provides one.  Callbacks are called with the library lock held.
+ */
+static
+void notify_cfg_finished(void)
+{
+	struct ifpx_cbs_node *node;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(node, &ifpx_callbacks, elem) {
+		if (node->cbs.cfg_finished != NULL)
+			node->cbs.cfg_finished();
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Interrupt callback of the netlink socket.
+ *
+ * Reads one datagram from the socket and dispatches every netlink message
+ * found in it to the link/address/route handler.  Completion of our own
+ * global dump requests (NLMSG_DONE) triggers the next request in the
+ * chain link -> address -> route, and finally the "configuration
+ * finished" notification.
+ */
+static
+void if_proxy_intr_callback(void *arg __rte_unused)
+{
+	struct nlmsghdr *h;
+	struct sockaddr_nl addr;
+	socklen_t addr_len;
+	/* The buffer is accessed as a sequence of netlink headers. */
+	char buf[8192] __rte_aligned(NLMSG_ALIGNTO);
+	ssize_t len;
+
+restart:
+	/* Value-result argument: must hold the size of 'addr' on entry. */
+	addr_len = sizeof(addr);
+	len = recvfrom(ifpx_irq.fd, buf, sizeof(buf), 0,
+		       (struct sockaddr *)&addr, &addr_len);
+	if (len < 0) {
+		if (errno == EINTR) {
+			IFPX_LOG(DEBUG, "recvmsg() interrupted");
+			goto restart;
+		}
+		IFPX_LOG(ERR, "Failed to read netlink msg: %zd (errno %d)",
+			 len, errno);
+		return;
+	}
+	if (addr_len != sizeof(addr)) {
+		IFPX_LOG(ERR, "Invalid netlink addr size: %u", addr_len);
+		return;
+	}
+	/* Messages from the kernel carry port ID 0 - do not trust anything
+	 * sent by another user space process.
+	 */
+	if (addr.nl_pid != 0) {
+		IFPX_LOG(WARNING, "Ignoring netlink msg from non-kernel "
+			 "sender: %u", addr.nl_pid);
+		return;
+	}
+	IFPX_LOG(DEBUG, "Read %zd bytes (buf %zu) from %u/%u", len,
+		 sizeof(buf), addr.nl_pid, addr.nl_groups);
+
+	for (h = (struct nlmsghdr *)buf; NLMSG_OK(h, len);
+					 h = NLMSG_NEXT(h, len)) {
+		IFPX_LOG(DEBUG, "Recv msg: %u (%u/%u/%u seq/flags/pid)",
+			 h->nlmsg_type, h->nlmsg_seq, h->nlmsg_flags,
+			 h->nlmsg_pid);
+
+		switch (h->nlmsg_type) {
+		case RTM_NEWLINK:
+		case RTM_DELLINK:
+			handle_link(h);
+			break;
+		case RTM_NEWADDR:
+		case RTM_DELADDR:
+			handle_addr(h, h->nlmsg_type == RTM_DELADDR);
+			break;
+		case RTM_NEWROUTE:
+		case RTM_DELROUTE:
+			handle_route(h, h->nlmsg_type == RTM_DELROUTE);
+			break;
+		}
+
+		/* If this is a reply for global request then follow up with
+		 * additional requests and notify about finish.
+		 */
+		if (h->nlmsg_pid == ifpx_pid && (h->nlmsg_seq >> 8) == 0 &&
+		    h->nlmsg_type == NLMSG_DONE) {
+			if ((h->nlmsg_seq & 0xFF) == RTM_GETLINK)
+				request_info(RTM_GETADDR, 0);
+			else if ((h->nlmsg_seq & 0xFF) == RTM_GETADDR)
+				request_info(RTM_GETROUTE, 0);
+			else {
+				RTE_ASSERT((h->nlmsg_seq & 0xFF) ==
+								RTM_GETROUTE);
+				notify_cfg_finished();
+			}
+		}
+	}
+	IFPX_LOG(DEBUG, "Finished msg loop: %zd bytes left", len);
+}
+
+/* Older C library headers may lack the netlink socket option level. */
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+/* Start listening to the network configuration events.
+ *
+ * Opens a NETLINK_ROUTE socket subscribed to link, IPv4/IPv6 address and
+ * IPv4/IPv6 route groups, registers it with the interrupt thread and
+ * requests the initial dump of links.
+ *
+ * Returns 0 on success and -1 on failure with rte_errno set (EBUSY when
+ * the library is already listening).
+ */
+int rte_ifpx_listen(void)
+{
+	struct sockaddr_nl addr = {
+		.nl_family = AF_NETLINK,
+		.nl_pid = 0,
+	};
+	socklen_t addr_len = sizeof(addr);
+	int ret, err;
+
+	if (ifpx_irq.fd != -1) {
+		rte_errno = EBUSY;
+		return -1;
+	}
+
+	addr.nl_groups = 1 << (RTNLGRP_LINK-1)
+			| 1 << (RTNLGRP_IPV4_IFADDR-1)
+			| 1 << (RTNLGRP_IPV6_IFADDR-1)
+			| 1 << (RTNLGRP_IPV4_ROUTE-1)
+			| 1 << (RTNLGRP_IPV6_ROUTE-1);
+
+	ifpx_irq.fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+				 NETLINK_ROUTE);
+	if (ifpx_irq.fd == -1) {
+		/* Save errno before logging, which may overwrite it. */
+		err = errno;
+		IFPX_LOG(ERR, "Failed to create netlink socket: %d", err);
+		goto error;
+	}
+	/* Starting with kernel 4.19 you can request dump for a specific
+	 * interface and kernel will filter out and send only relevant info.
+	 * Otherwise NLM_F_DUMP will generate info for all interfaces and you
+	 * need to filter them yourself.
+	 */
+#ifdef NETLINK_DUMP_STRICT_CHK
+	ret = 1; /* use this var also as an input param */
+	/* This is a netlink level option - at SOL_SOCKET level the same
+	 * number means a completely different option.
+	 */
+	ret = setsockopt(ifpx_irq.fd, SOL_NETLINK, NETLINK_DUMP_STRICT_CHK,
+			 &ret, sizeof(ret));
+	if (ret < 0) {
+		/* Not fatal: the kernel may be older than the headers and
+		 * replies are filtered in the handlers anyway.
+		 */
+		IFPX_LOG(WARNING, "Failed to set socket option: %d", errno);
+	}
+#endif
+
+	ret = bind(ifpx_irq.fd, (struct sockaddr *)&addr, addr_len);
+	if (ret < 0) {
+		err = errno;
+		IFPX_LOG(ERR, "Failed to bind socket: %d", err);
+		goto error;
+	}
+	ret = getsockname(ifpx_irq.fd, (struct sockaddr *)&addr, &addr_len);
+	if (ret < 0) {
+		err = errno;
+		IFPX_LOG(ERR, "Failed to get socket addr: %d", err);
+		goto error;
+	} else {
+		ifpx_pid = addr.nl_pid;
+		IFPX_LOG(DEBUG, "Assigned port ID: %u", addr.nl_pid);
+	}
+
+	ret = rte_intr_callback_register(&ifpx_irq, if_proxy_intr_callback,
+					 NULL);
+	if (ret < 0) {
+		/* The error is in the return value here, not in errno. */
+		err = -ret;
+		goto error;
+	}
+
+	request_info(RTM_GETLINK, 0);
+
+	return 0;
+
+error:
+	rte_errno = err;
+	if (ifpx_irq.fd != -1) {
+		close(ifpx_irq.fd);
+		ifpx_irq.fd = -1;
+	}
+	return -1;
+}
+
+/* Stop listening and release everything held by the library.
+ *
+ * The interrupt callback is unregistered, the netlink socket is closed,
+ * all registered callbacks and configured proxies are freed and every
+ * port binding is cleared.
+ *
+ * Returns 0 on success or -EBADFD when the library is not listening.
+ */
+int rte_ifpx_close(void)
+{
+	struct ifpx_proxies_node *px;
+	struct ifpx_cbs_node *cbs;
+	unsigned int p;
+	int ec;
+
+	if (ifpx_irq.fd < 0)
+		return -EBADFD;
+
+	/* Retry for as long as the unregistration reports -EAGAIN
+	 * (unlikely but possible - at least I think so).
+	 */
+	do {
+		ec = rte_intr_callback_unregister(&ifpx_irq,
+						  if_proxy_intr_callback,
+						  NULL);
+	} while (ec == -EAGAIN);
+
+	rte_spinlock_lock(&ifpx_lock);
+
+	close(ifpx_irq.fd);
+	ifpx_irq.fd = -1;
+	ifpx_pid = 0;
+
+	/* Clear callbacks. */
+	while ((cbs = TAILQ_FIRST(&ifpx_callbacks)) != NULL) {
+		TAILQ_REMOVE(&ifpx_callbacks, cbs, elem);
+		free(cbs);
+	}
+
+	/* Clear proxies. */
+	while ((px = TAILQ_FIRST(&ifpx_proxies)) != NULL) {
+		TAILQ_REMOVE(&ifpx_proxies, px, elem);
+		free(px);
+	}
+
+	/* Clear bindings. */
+	for (p = 0; p < RTE_DIM(ifpx_p2p); ++p)
+		ifpx_p2p[p] = RTE_MAX_ETHPORTS;
+
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return 0;
+}
+
+/* Library constructor: registers the log type (default level WARNING) and
+ * marks every port as not bound to any proxy.
+ */
+RTE_INIT(if_proxy_init)
+{
+	unsigned int port;
+
+	ifpx_log_type = rte_log_register("lib.if_proxy");
+	if (ifpx_log_type >= 0)
+		rte_log_set_level(ifpx_log_type, RTE_LOG_WARNING);
+
+	for (port = 0; port < RTE_DIM(ifpx_p2p); ++port)
+		ifpx_p2p[port] = RTE_MAX_ETHPORTS;
+}
diff --git a/lib/meson.build b/lib/meson.build
index 0af3efab2..c913b33dd 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -19,7 +19,7 @@ libraries = [
 	'acl', 'bbdev', 'bitratestats', 'cfgfile',
 	'compressdev', 'cryptodev',
 	'distributor', 'efd', 'eventdev',
-	'gro', 'gso', 'ip_frag', 'jobstats',
+	'gro', 'gso', 'if_proxy', 'ip_frag', 'jobstats',
 	'kni', 'latencystats', 'lpm', 'member',
 	'power', 'pdump', 'rawdev',
 	'rcu', 'rib', 'reorder', 'sched', 'security', 'stack', 'vhost',
-- 
2.17.1


  parent reply	other threads:[~2020-01-14 14:25 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-14 14:25 [dpdk-dev] [RFC PATCH 0/3] introduce IF proxy library Andrzej Ostruszka
2020-01-14 14:25 ` [dpdk-dev] [RFC PATCH 1/3] lib: introduce IF proxy library (API) Andrzej Ostruszka
2020-01-14 14:25 ` Andrzej Ostruszka [this message]
2020-01-14 14:25 ` [dpdk-dev] [RFC PATCH 3/3] if_proxy: add example, test and documentation Andrzej Ostruszka
2020-01-14 15:16 ` [dpdk-dev] [RFC PATCH 0/3] introduce IF proxy library Morten Brørup
2020-01-14 17:38   ` Andrzej Ostruszka
2020-01-15 10:15     ` Bruce Richardson
2020-01-15 11:27       ` Jerin Jacob
2020-01-15 12:28       ` Morten Brørup
2020-01-15 12:57         ` Jerin Jacob
2020-01-15 15:30           ` Morten Brørup
2020-01-15 16:04             ` Jerin Jacob
2020-01-15 18:15               ` Morten Brørup
2020-01-16  7:15                 ` Jerin Jacob
2020-01-16  9:11                   ` Morten Brørup
2020-01-16  9:09                 ` Andrzej Ostruszka
2020-01-16  9:30                   ` Morten Brørup
2020-01-16 10:42                     ` Andrzej Ostruszka
2020-01-16 10:58                       ` Morten Brørup
2020-01-16 12:06                         ` Andrzej Ostruszka
2020-01-15 14:09         ` Bruce Richardson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200114142517.29522-3-aostruszka@marvell.com \
    --to=aostruszka@marvell.com \
    --cc=dev@dpdk.org \
    --cc=jerinj@marvell.com \
    --cc=kirankumark@marvell.com \
    --cc=kkanas@marvell.com \
    --cc=ndabilpuram@marvell.com \
    --cc=pbhagavatula@marvell.com \
    --cc=thomas@monjalon.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).