From: Andrzej Ostruszka <aostruszka@marvell.com>
To: <dev@dpdk.org>, Thomas Monjalon <thomas@monjalon.net>
Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>,
Nithin Kumar Dabilpuram <ndabilpuram@marvell.com>,
Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>,
Kiran Kumar Kokkilagadda <kirankumark@marvell.com>,
Krzysztof Kanas <kkanas@marvell.com>
Subject: [dpdk-dev] [RFC PATCH 2/3] if_proxy: add preliminary Linux implementation
Date: Tue, 14 Jan 2020 15:25:16 +0100 [thread overview]
Message-ID: <20200114142517.29522-3-aostruszka@marvell.com> (raw)
In-Reply-To: <20200114142517.29522-1-aostruszka@marvell.com>
This commit adds a preliminary Linux implementation of the IF Proxy
library. It should allow one to play around with the idea and check its
usefulness.
Signed-off-by: Andrzej Ostruszka <aostruszka@marvell.com>
---
config/common_base | 5 +
lib/Makefile | 2 +
.../common/include/rte_eal_interrupts.h | 2 +
lib/librte_eal/linux/eal/eal_interrupts.c | 14 +-
lib/librte_if_proxy/Makefile | 25 +
lib/librte_if_proxy/meson.build | 7 +
lib/librte_if_proxy/rte_if_proxy.c | 803 ++++++++++++++++++
lib/meson.build | 2 +-
8 files changed, 855 insertions(+), 5 deletions(-)
create mode 100644 lib/librte_if_proxy/Makefile
create mode 100644 lib/librte_if_proxy/meson.build
create mode 100644 lib/librte_if_proxy/rte_if_proxy.c
diff --git a/config/common_base b/config/common_base
index 7dec7ed45..f20296750 100644
--- a/config/common_base
+++ b/config/common_base
@@ -1056,6 +1056,11 @@ CONFIG_RTE_LIBRTE_BPF_ELF=n
#
CONFIG_RTE_LIBRTE_IPSEC=y
+#
+# Compile librte_if_proxy
+#
+CONFIG_RTE_LIBRTE_IF_PROXY=y
+
#
# Compile the test application
#
diff --git a/lib/Makefile b/lib/Makefile
index 46b91ae1a..0a60f3656 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -118,6 +118,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry
DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev
DIRS-$(CONFIG_RTE_LIBRTE_RCU) += librte_rcu
DEPDIRS-librte_rcu := librte_eal
+DIRS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += librte_if_proxy
+DEPDIRS-librte_if_proxy := librte_eal
ifeq ($(CONFIG_RTE_EXEC_ENV_LINUX),y)
DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h
index b370c0d26..f3d39a5ce 100644
--- a/lib/librte_eal/common/include/rte_eal_interrupts.h
+++ b/lib/librte_eal/common/include/rte_eal_interrupts.h
@@ -35,7 +35,9 @@ enum rte_intr_handle_type {
RTE_INTR_HANDLE_EXT, /**< external handler */
RTE_INTR_HANDLE_VDEV, /**< virtual device */
RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */
+ RTE_INTR_HANDLE_NETLINK, /**< netlink notification handle */
RTE_INTR_HANDLE_VFIO_REQ, /**< VFIO request handle */
+
RTE_INTR_HANDLE_MAX /**< count of elements */
};
diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c
index 14ebb108c..ccdd94002 100644
--- a/lib/librte_eal/linux/eal/eal_interrupts.c
+++ b/lib/librte_eal/linux/eal/eal_interrupts.c
@@ -680,6 +680,9 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle)
break;
/* not used at this moment */
case RTE_INTR_HANDLE_ALARM:
+#if RTE_LIBRTE_IF_PROXY
+ case RTE_INTR_HANDLE_NETLINK:
+#endif
return -1;
#ifdef VFIO_PRESENT
case RTE_INTR_HANDLE_VFIO_MSIX:
@@ -796,6 +799,9 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle)
break;
/* not used at this moment */
case RTE_INTR_HANDLE_ALARM:
+#if RTE_LIBRTE_IF_PROXY
+ case RTE_INTR_HANDLE_NETLINK:
+#endif
return -1;
#ifdef VFIO_PRESENT
case RTE_INTR_HANDLE_VFIO_MSIX:
@@ -889,12 +895,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
break;
#endif
#endif
- case RTE_INTR_HANDLE_VDEV:
case RTE_INTR_HANDLE_EXT:
- bytes_read = 0;
- call = true;
- break;
+ case RTE_INTR_HANDLE_VDEV:
case RTE_INTR_HANDLE_DEV_EVENT:
+#if RTE_LIBRTE_IF_PROXY
+ case RTE_INTR_HANDLE_NETLINK:
+#endif
bytes_read = 0;
call = true;
break;
diff --git a/lib/librte_if_proxy/Makefile b/lib/librte_if_proxy/Makefile
new file mode 100644
index 000000000..9dd5f4791
--- /dev/null
+++ b/lib/librte_if_proxy/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2019 Marvell International Ltd.
+
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_if_proxy.a
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+LDLIBS += -lrte_eal
+
+EXPORT_MAP := rte_if_proxy_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) := rte_if_proxy.c
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_IF_PROXY)-include := rte_if_proxy.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_if_proxy/meson.build b/lib/librte_if_proxy/meson.build
new file mode 100644
index 000000000..f9ed410b6
--- /dev/null
+++ b/lib/librte_if_proxy/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2019 Marvell International Ltd.
+
+version = 1
+allow_experimental_apis = true
+sources = files('rte_if_proxy.c')
+headers = files('rte_if_proxy.h')
diff --git a/lib/librte_if_proxy/rte_if_proxy.c b/lib/librte_if_proxy/rte_if_proxy.c
new file mode 100644
index 000000000..770462702
--- /dev/null
+++ b/lib/librte_if_proxy/rte_if_proxy.c
@@ -0,0 +1,803 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+
+#include <rte_if_proxy.h>
+#include <rte_interrupts.h>
+#include <rte_spinlock.h>
+#include <rte_string_fns.h>
+
+#include <stdbool.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+
+/* Log type used by IFPX_LOG(); assigned by rte_log_register() at init time. */
+static int ifpx_log_type;
+#define IFPX_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, ifpx_log_type, "%s(): " fmt "\n", \
+		__func__, ##args)
+
+/* Interrupt handle wrapping the netlink socket.  The descriptor stays at -1
+ * for as long as the socket is not open.
+ */
+static struct rte_intr_handle ifpx_irq = {
+	.fd = -1,
+	.type = RTE_INTR_HANDLE_NETLINK,
+};
+
+/* Netlink port ID of our socket - used to recognise replies to our requests. */
+static unsigned int ifpx_pid;
+
+/* Port to proxy mapping table: indexed by port ID, holds the ID of the proxy
+ * the port is bound to or RTE_MAX_ETHPORTS when the port is not bound.
+ */
+static uint16_t ifpx_p2p[RTE_MAX_ETHPORTS];
+
+/* This library belongs to the slow/config path, so a single lock guards all
+ * of its data structures.  Only the callback and proxy lists are protected;
+ * the port to proxy map above is not expected to need protection.
+ */
+static rte_spinlock_t ifpx_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Element of the list of configured proxies. */
+struct ifpx_proxies_node {
+	TAILQ_ENTRY(ifpx_proxies_node) elem;
+	uint16_t proxy_id;
+	struct rte_ifpx_info info;
+};
+static TAILQ_HEAD(ifpx_proxies_head, ifpx_proxies_node) ifpx_proxies =
+	TAILQ_HEAD_INITIALIZER(ifpx_proxies);
+
+/* Element of the list of registered callback sets. */
+struct ifpx_cbs_node {
+	TAILQ_ENTRY(ifpx_cbs_node) elem;
+	struct rte_ifpx_callbacks cbs;
+};
+static TAILQ_HEAD(ifpx_cbs_head, ifpx_cbs_node) ifpx_callbacks =
+	TAILQ_HEAD_INITIALIZER(ifpx_callbacks);
+
+/* Defined further down; needed earlier by the binding code. */
+static int request_info(int type, int index);
+
+/* Return the bitwise OR of the RTE_IFPX_* event flags for which this
+ * implementation is able to invoke callbacks.
+ */
+uint64_t rte_ifpx_callbacks_available(void)
+{
+	const uint64_t link_events = RTE_IFPX_MAC_CHANGE |
+				     RTE_IFPX_MTU_CHANGE |
+				     RTE_IFPX_LINK_CHANGE;
+	const uint64_t addr_events = RTE_IFPX_ADDR_ADD |
+				     RTE_IFPX_ADDR_DEL |
+				     RTE_IFPX_ADDR6_ADD |
+				     RTE_IFPX_ADDR6_DEL;
+	const uint64_t route_events = RTE_IFPX_ROUTE_ADD |
+				      RTE_IFPX_ROUTE_DEL |
+				      RTE_IFPX_ROUTE6_ADD |
+				      RTE_IFPX_ROUTE6_DEL;
+
+	return link_events | addr_events | route_events;
+}
+
+/* Create a proxy port of the requested type.
+ *
+ * The device name is built from the driver name ("net_tap" or "net_kni")
+ * followed by the number of already existing ports using that driver, and is
+ * then handed to rte_ifpx_create_by_devarg().
+ *
+ * Returns the value of rte_ifpx_create_by_devarg(), or RTE_MAX_ETHPORTS when
+ * the type is not recognised.
+ */
+uint16_t rte_ifpx_create(enum rte_ifpx_type type)
+{
+	char name[16] = { '\0' };
+	const char *drv;
+	uint16_t pid;
+	int existing = 0, len;
+
+	if (type == RTE_IFPX_DEFAULT || type == RTE_IFPX_TAP) {
+		drv = "net_tap";
+	} else if (type == RTE_IFPX_KNI) {
+		drv = "net_kni";
+	} else {
+		IFPX_LOG(ERR, "Unknown proxy type: %d", type);
+		return RTE_MAX_ETHPORTS;
+	}
+	len = strlcpy(name, drv, sizeof(name));
+
+	/* Count the ports already driven by the selected driver. */
+	RTE_ETH_FOREACH_DEV(pid) {
+		if (!strcmp(rte_eth_devices[pid].device->driver->name, name))
+			existing++;
+	}
+	snprintf(name + len, sizeof(name) - len, "%d", existing);
+
+	return rte_ifpx_create_by_devarg(name);
+}
+
+/* Probe the device described by 'devarg' and return the ID of the first
+ * ethdev port matching it.
+ *
+ * Returns RTE_MAX_ETHPORTS when probing fails or when no port matches.
+ */
+uint16_t rte_ifpx_create_by_devarg(const char *devarg)
+{
+	uint16_t port_id = RTE_MAX_ETHPORTS;
+	struct rte_dev_iterator iter;
+
+	if (rte_dev_probe(devarg) < 0) {
+		/* IFPX_LOG() appends the newline itself. */
+		IFPX_LOG(ERR, "Failed to create proxy port %s", devarg);
+		return RTE_MAX_ETHPORTS;
+	}
+
+	/* Only the first match is of interest. */
+	RTE_ETH_FOREACH_MATCHING_DEV(port_id, devarg, &iter) {
+		break;
+	}
+	/* The iteration was abandoned early, so release the iterator here. */
+	if (port_id != RTE_MAX_ETHPORTS)
+		rte_eth_iterator_cleanup(&iter);
+
+	return port_id;
+}
+
+/* Remove the proxy with the given ID from the list of proxies, drop every
+ * port binding pointing at it and remove the underlying device.
+ *
+ * Returns -EINVAL when 'proxy_id' is not on the list of proxies, otherwise
+ * the result of rte_dev_remove().
+ */
+int rte_ifpx_destroy(uint16_t proxy_id)
+{
+	struct ifpx_proxies_node *px;
+	unsigned int i;
+	int ec = 0;
+
+	rte_spinlock_lock(&ifpx_lock);
+	/* Stop on the matching node; px ends up NULL when nothing matches. */
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == proxy_id)
+			break;
+	}
+	if (!px) {
+		ec = -EINVAL;
+		goto exit;
+	}
+	TAILQ_REMOVE(&ifpx_proxies, px, elem);
+	free(px);
+
+	/* Clear any bindings for this proxy. */
+	for (i = 0; i < RTE_DIM(ifpx_p2p); ++i) {
+		if (ifpx_p2p[i] == proxy_id)
+			ifpx_p2p[i] = RTE_MAX_ETHPORTS;
+	}
+
+	ec = rte_dev_remove(rte_eth_devices[proxy_id].device);
+exit:
+	rte_spinlock_unlock(&ifpx_lock);
+	return ec;
+}
+
+/* Bind 'port_id' to the proxy 'proxy_id'.
+ *
+ * A proxy seen for the first time is added to the list of proxies together
+ * with its interface index, MTU and MAC address.  When the netlink socket is
+ * already open a link info request is sent for the proxy interface.
+ *
+ * Returns 0 on success, -EINVAL for out of range IDs or a proxy without an
+ * interface index, -ENOMEM when the proxy node cannot be allocated, or the
+ * error returned by rte_eth_dev_info_get().
+ */
+int rte_ifpx_port_bind(uint16_t port_id, uint16_t proxy_id)
+{
+	struct rte_eth_dev_info proxy_eth_info;
+	struct ifpx_proxies_node *px;
+	unsigned int if_index;
+	int ec;
+
+	if (port_id >= RTE_MAX_ETHPORTS || proxy_id >= RTE_MAX_ETHPORTS) {
+		IFPX_LOG(ERR, "Invalid port_id/proxy_id: %d/%d", port_id,
+			 proxy_id);
+		return -EINVAL;
+	}
+
+	/* Do automatic rebinding but issue a warning since this is not
+	 * considered to be a valid behaviour.
+	 */
+	if (ifpx_p2p[port_id] != RTE_MAX_ETHPORTS) {
+		IFPX_LOG(WARNING, "Port already bound: %d -> %d", port_id,
+			 ifpx_p2p[port_id]);
+	}
+
+	ec = rte_eth_dev_info_get(proxy_id, &proxy_eth_info);
+	if (ec < 0) {
+		IFPX_LOG(ERR, "Failed to read proxy dev info: %d", ec);
+		return ec;
+	}
+	if (proxy_eth_info.if_index == 0) {
+		IFPX_LOG(ERR, "Proxy with no IF index");
+		return -EINVAL;
+	}
+
+	/* Search for existing proxy - if not found add one to the list. */
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == proxy_id)
+			break;
+	}
+	if (!px) {
+		px = malloc(sizeof(*px));
+		if (!px) {
+			rte_spinlock_unlock(&ifpx_lock);
+			return -ENOMEM;
+		}
+		px->proxy_id = proxy_id;
+		/* Start from a zeroed info so that MTU/MAC have a defined
+		 * value even if the (unchecked) getters below fail.
+		 */
+		memset(&px->info, 0, sizeof(px->info));
+		px->info.if_index = proxy_eth_info.if_index;
+		rte_eth_dev_get_mtu(proxy_id, &px->info.mtu);
+		rte_eth_macaddr_get(proxy_id, &px->info.mac);
+		TAILQ_INSERT_TAIL(&ifpx_proxies, px, elem);
+	}
+	/* Copy what is needed later - the node must not be touched once the
+	 * lock is released since it may be freed by then.
+	 */
+	if_index = px->info.if_index;
+	/* Record the binding under the same lock rte_ifpx_destroy() holds
+	 * when it clears the bindings of a proxy.
+	 */
+	ifpx_p2p[port_id] = proxy_id;
+	rte_spinlock_unlock(&ifpx_lock);
+
+	if (ifpx_irq.fd != -1)
+		request_info(RTM_GETLINK, if_index);
+
+	return 0;
+}
+
+/* Drop the binding of 'port_id'.
+ *
+ * Returns 0 on success and -EINVAL when the port ID is out of range or the
+ * port is not bound to any proxy.
+ */
+int rte_ifpx_port_unbind(uint16_t port_id)
+{
+	if (port_id >= RTE_MAX_ETHPORTS)
+		return -EINVAL;
+	if (ifpx_p2p[port_id] == RTE_MAX_ETHPORTS)
+		return -EINVAL;
+
+	/* A proxy left without any bound port is fine: a freshly created
+	 * proxy is in the same state and it can still report routing
+	 * information, hence no check for that here.
+	 */
+	ifpx_p2p[port_id] = RTE_MAX_ETHPORTS;
+
+	return 0;
+}
+
+/* Register a set of callbacks.
+ *
+ * The set is copied into a newly allocated list node and the address of that
+ * copy is returned as the handle.  When 'cbs' already is the address of a
+ * registered copy it is returned unchanged.  Returns NULL for a NULL argument
+ * or when the node cannot be allocated.
+ */
+rte_ifpx_cbs_hndl rte_ifpx_callbacks_register(const
+					     struct rte_ifpx_callbacks *cbs)
+{
+	rte_ifpx_cbs_hndl hndl = NULL;
+	struct ifpx_cbs_node *it;
+
+	if (cbs == NULL)
+		return NULL;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(it, &ifpx_callbacks, elem) {
+		if (&it->cbs == cbs)
+			break;
+	}
+	if (it != NULL) {
+		/* Already registered - hand back the same handle. */
+		hndl = cbs;
+	} else {
+		it = malloc(sizeof(*it));
+		if (it != NULL) {
+			it->cbs = *cbs;
+			TAILQ_INSERT_TAIL(&ifpx_callbacks, it, elem);
+			hndl = &it->cbs;
+		}
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return hndl;
+}
+
+/* Unregister the callback set identified by the handle 'cbs'.
+ *
+ * Returns 0 when the set was found and released, -EINVAL for a NULL or
+ * unknown handle.
+ */
+int rte_ifpx_callbacks_unregister(rte_ifpx_cbs_hndl cbs)
+{
+	struct ifpx_cbs_node *it;
+	int ec;
+
+	if (cbs == NULL)
+		return -EINVAL;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(it, &ifpx_callbacks, elem) {
+		if (&it->cbs == cbs)
+			break;
+	}
+	if (it == NULL) {
+		ec = -EINVAL;
+	} else {
+		TAILQ_REMOVE(&ifpx_callbacks, it, elem);
+		free(it);
+		ec = 0;
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return ec;
+}
+
+/* Return the ID of the proxy 'port_id' is bound to.  RTE_MAX_ETHPORTS is
+ * returned for an out of range port ID; for a valid one the table entry is
+ * returned as is (RTE_MAX_ETHPORTS there marks an unbound port).
+ */
+uint16_t rte_ifpx_proxy_get(uint16_t port_id)
+{
+	return port_id < RTE_MAX_ETHPORTS ? ifpx_p2p[port_id]
+					  : RTE_MAX_ETHPORTS;
+}
+
+/* Collect the ports bound to the proxy 'proxy_id'.
+ *
+ * At most 'num' port IDs are written to 'ports' ('ports' may be NULL, in
+ * which case nothing is stored).  The return value is the total number of
+ * ports bound to the proxy, which may be larger than 'num'.
+ */
+unsigned int rte_ifpx_port_get(uint16_t proxy_id,
+			       uint16_t *ports, unsigned int num)
+{
+	unsigned int p, cnt = 0;
+
+	for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+		if (ifpx_p2p[p] == proxy_id) {
+			++cnt;
+			if (ports && num > 0) {
+				/* The table index is the port ID; the table
+				 * value is the proxy we were asked about.
+				 */
+				*ports++ = p;
+				--num;
+			}
+		}
+	}
+	return cnt;
+}
+
+/* Return the interface info of the proxy that 'port_id' is bound to.
+ *
+ * Returns NULL when the port ID is out of range, the port is not bound or no
+ * proxy node exists for the binding.
+ */
+const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id)
+{
+	struct ifpx_proxies_node *px;
+	uint16_t proxy_id;
+
+	if (port_id >= RTE_MAX_ETHPORTS)
+		return NULL;
+	/* Read the binding once so that the check and the search agree. */
+	proxy_id = ifpx_p2p[port_id];
+	if (proxy_id == RTE_MAX_ETHPORTS)
+		return NULL;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == proxy_id)
+			break;
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+	RTE_ASSERT(px && "Internal IF Proxy library error");
+
+	/* RTE_ASSERT() may be compiled out - never derive a pointer from a
+	 * NULL node.
+	 */
+	return px ? &px->info : NULL;
+}
+
+/* Handle a netlink link message (RTM_NEWLINK/RTM_DELLINK).
+ *
+ * Messages for interfaces that are not proxies are dropped.  For a proxy the
+ * link state, MTU and MAC address are compared against the cached info and
+ * the matching callbacks are invoked for every port bound to the proxy.
+ *
+ * The proxy node is looked up and used within a single critical section so
+ * that it cannot be released in between by rte_ifpx_destroy()/close().
+ */
+static
+void handle_link(const struct nlmsghdr *h)
+{
+	const struct ifinfomsg *ifi = NLMSG_DATA(h);
+	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+	const struct rtattr *attrs[IFLA_MAX+1] = { NULL };
+	const struct rtattr *attr;
+	struct ifpx_proxies_node *px;
+	struct ifpx_cbs_node *cb;
+	/* Is this a reply to one of our own requests ... */
+	const bool own_reply = h->nlmsg_pid == ifpx_pid;
+	/* ... and if so for which interface (0 means global dump). */
+	const unsigned int req_index = h->nlmsg_seq >> 8;
+	uint16_t p;
+
+	IFPX_LOG(DEBUG, "\tLink action (%u): %u, 0x%x/0x%x (flags/changed)",
+		 ifi->ifi_index, h->nlmsg_type, ifi->ifi_flags,
+		 ifi->ifi_change);
+
+	/* When message is a reply to request for specific interface then keep
+	 * it only when it contains info for this interface.
+	 */
+	if (own_reply && req_index &&
+	    req_index != (unsigned int)ifi->ifi_index)
+		return;
+
+	for (attr = IFLA_RTA(ifi); RTA_OK(attr, alen);
+	     attr = RTA_NEXT(attr, alen)) {
+		if (attr->rta_type > IFLA_MAX)
+			continue;
+		attrs[attr->rta_type] = attr;
+	}
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->info.if_index == (unsigned int)ifi->ifi_index)
+			break;
+	}
+	/* Drop messages that are not associated with any proxy */
+	if (!px) {
+		rte_spinlock_unlock(&ifpx_lock);
+		return;
+	}
+
+	if (ifi->ifi_change & IFF_UP) {
+		TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+			if (!cb->cbs.link_change)
+				continue;
+			for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+				if (ifpx_p2p[p] != px->proxy_id)
+					continue;
+				cb->cbs.link_change(p,
+						ifi->ifi_flags & IFF_UP);
+			}
+		}
+	}
+	if (attrs[IFLA_MTU]) {
+		uint16_t mtu = *(const int *)RTA_DATA(attrs[IFLA_MTU]);
+		if (mtu != px->info.mtu) {
+			px->info.mtu = mtu;
+			TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+				if (!cb->cbs.mtu_change)
+					continue;
+				for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+					if (ifpx_p2p[p] != px->proxy_id)
+						continue;
+					cb->cbs.mtu_change(p, mtu);
+				}
+			}
+		}
+	}
+	/* Only an Ethernet sized address can be compared with and copied into
+	 * the cached MAC - anything else is ignored.
+	 */
+	if (attrs[IFLA_ADDRESS] &&
+	    RTA_PAYLOAD(attrs[IFLA_ADDRESS]) == RTE_ETHER_ADDR_LEN) {
+		const struct rte_ether_addr *mac =
+				RTA_DATA(attrs[IFLA_ADDRESS]);
+
+		if (memcmp(mac, &px->info.mac, RTE_ETHER_ADDR_LEN) != 0) {
+			memcpy(px->info.mac.addr_bytes, mac,
+			       RTE_ETHER_ADDR_LEN);
+			TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+				if (!cb->cbs.mac_change)
+					continue;
+				for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+					if (ifpx_p2p[p] != px->proxy_id)
+						continue;
+					cb->cbs.mac_change(p, mac);
+				}
+			}
+		}
+	}
+	/* Reply to the initial global dump: just store the interface name. */
+	if (own_reply && !req_index &&
+	    !px->info.if_name[0] && attrs[IFLA_IFNAME])
+		strlcpy(px->info.if_name, RTA_DATA(attrs[IFLA_IFNAME]),
+			sizeof(px->info.if_name));
+	rte_spinlock_unlock(&ifpx_lock);
+
+	if (own_reply) {
+		RTE_ASSERT((h->nlmsg_seq & 0xFF) == RTM_GETLINK);
+		/* If this is reply for specific link request (not initial
+		 * global dump) then follow up with address request.
+		 */
+		if (req_index)
+			request_info(RTM_GETADDR, ifi->ifi_index);
+	}
+}
+
+/* Handle a netlink address message (RTM_NEWADDR/RTM_DELADDR).
+ *
+ * Messages for interfaces that are not proxies are dropped.  For a proxy the
+ * IPv4/IPv6 add or delete callbacks ('needs_del' selects which) are invoked
+ * for every port bound to that proxy.
+ *
+ * The proxy node is looked up and used within a single critical section so
+ * that it cannot be released in between by rte_ifpx_destroy()/close().
+ */
+static
+void handle_addr(const struct nlmsghdr *h, bool needs_del)
+{
+	const struct ifaddrmsg *ifa = NLMSG_DATA(h);
+	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifa));
+	const struct rtattr *attrs[IFA_MAX+1] = { NULL };
+	const struct rtattr *attr;
+	struct ifpx_proxies_node *px;
+	struct ifpx_cbs_node *cb;
+	const uint8_t *ip;
+	uint16_t p;
+
+	/* When message is a reply to request for specific interface then keep
+	 * it only when it contains info for this interface.
+	 */
+	if (h->nlmsg_pid == ifpx_pid && h->nlmsg_seq >> 8 &&
+	    (h->nlmsg_seq >> 8) != ifa->ifa_index)
+		return;
+
+	for (attr = IFA_RTA(ifa); RTA_OK(attr, alen);
+	     attr = RTA_NEXT(attr, alen)) {
+		if (attr->rta_type > IFA_MAX)
+			continue;
+		attrs[attr->rta_type] = attr;
+	}
+	/* Without an address there is nothing to report. */
+	if (!attrs[IFA_ADDRESS])
+		return;
+	ip = RTA_DATA(attrs[IFA_ADDRESS]);
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->info.if_index == ifa->ifa_index)
+			break;
+	}
+	/* Drop messages that are not associated with any proxy */
+	if (!px) {
+		rte_spinlock_unlock(&ifpx_lock);
+		return;
+	}
+
+	TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+		struct rte_ifpx_callbacks *cbs = &cb->cbs;
+
+		if (ifa->ifa_family == AF_INET) {
+			/* address is in network order */
+			uint32_t ipv4 = RTE_IPV4(ip[0], ip[1], ip[2], ip[3]);
+
+			for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+				if (ifpx_p2p[p] != px->proxy_id)
+					continue;
+				if (needs_del && cbs->addr_del)
+					cbs->addr_del(p, ipv4);
+				else if (!needs_del && cbs->addr_add)
+					cbs->addr_add(p, ipv4);
+			}
+		} else if (ifa->ifa_family == AF_INET6) {
+			for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+				if (ifpx_p2p[p] != px->proxy_id)
+					continue;
+				if (needs_del && cbs->addr6_del)
+					cbs->addr6_del(p, ip);
+				else if (!needs_del && cbs->addr6_add)
+					cbs->addr6_add(p, ip);
+			}
+		}
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Handle a netlink route message (RTM_NEWROUTE/RTM_DELROUTE).
+ *
+ * For every registered callback set the IPv4/IPv6 route add or delete
+ * callback ('needs_del' selects which) is invoked with the destination and
+ * its prefix length.  Messages carrying no RTA_DST attribute are not reported.
+ */
+static
+void handle_route(const struct nlmsghdr *h, bool needs_del)
+{
+	const struct rtmsg *r = NLMSG_DATA(h);
+	int left = h->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
+	const struct rtattr *attrs[RTA_MAX+1] = { NULL };
+	const struct rtattr *rta;
+	struct ifpx_cbs_node *it;
+	const uint8_t *dst = NULL;
+	uint32_t dst4 = 0;
+
+	for (rta = RTM_RTA(r); RTA_OK(rta, left); rta = RTA_NEXT(rta, left)) {
+		if (rta->rta_type <= RTA_MAX)
+			attrs[rta->rta_type] = rta;
+	}
+
+	if (attrs[RTA_DST]) {
+		dst = RTA_DATA(attrs[RTA_DST]);
+		/* address is in network order */
+		if (r->rtm_family == AF_INET)
+			dst4 = RTE_IPV4(dst[0], dst[1], dst[2], dst[3]);
+	}
+
+	rte_spinlock_lock(&ifpx_lock);
+	if (dst != NULL) {
+		TAILQ_FOREACH(it, &ifpx_callbacks, elem) {
+			struct rte_ifpx_callbacks *cbs = &it->cbs;
+
+			switch (r->rtm_family) {
+			case AF_INET:
+				if (needs_del && cbs->route_del)
+					cbs->route_del(dst4, r->rtm_dst_len);
+				else if (!needs_del && cbs->route_add)
+					cbs->route_add(dst4, r->rtm_dst_len);
+				break;
+			case AF_INET6:
+				if (needs_del && cbs->route6_del)
+					cbs->route6_del(dst, r->rtm_dst_len);
+				else if (!needs_del && cbs->route6_add)
+					cbs->route6_add(dst, r->rtm_dst_len);
+				break;
+			default:
+				break;
+			}
+		}
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Send a netlink dump request of the given type (RTM_GETLINK, RTM_GETADDR or
+ * RTM_GETROUTE).  'index' is the interface index the request is for, 0 is
+ * used for global requests.
+ *
+ * Returns the result of send() (negative on failure, with rte_errno set) or
+ * -EINVAL for an unsupported request type.
+ */
+static
+int request_info(int type, int index)
+{
+	static rte_spinlock_t send_lock = RTE_SPINLOCK_INITIALIZER;
+	struct info_get {
+		struct nlmsghdr h;
+		union {
+			struct ifinfomsg ifm;
+			struct ifaddrmsg ifa;
+			struct rtmsg rtm;
+		} __rte_aligned(NLMSG_ALIGNTO);
+	} info_req;
+	int ret;
+
+	IFPX_LOG(DEBUG, "\tRequesting msg %d for: %d", type, index);
+
+	memset(&info_req, 0, sizeof(info_req));
+	/* First byte of these messages is family, so just make sure that this
+	 * memset is enough to get all families.
+	 */
+	RTE_ASSERT(AF_UNSPEC == 0);
+
+	info_req.h.nlmsg_pid = ifpx_pid;
+	info_req.h.nlmsg_type = type;
+	info_req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	info_req.h.nlmsg_len = offsetof(struct info_get, ifm);
+
+	switch (type) {
+	case RTM_GETLINK:
+		info_req.h.nlmsg_len += sizeof(info_req.ifm);
+		info_req.ifm.ifi_index = index;
+		break;
+	case RTM_GETADDR:
+		info_req.h.nlmsg_len += sizeof(info_req.ifa);
+		info_req.ifa.ifa_index = index;
+		break;
+	case RTM_GETROUTE:
+		info_req.h.nlmsg_len += sizeof(info_req.rtm);
+		break;
+	default:
+		return -EINVAL;
+	}
+	/* Store request type (and if it is global or link specific) in 'seq'.
+	 * Later it is used during handling of reply to continue requesting of
+	 * information dump from system - if needed.
+	 */
+	info_req.h.nlmsg_seq = index << 8 | type;
+
+	rte_spinlock_lock(&send_lock);
+	ret = send(ifpx_irq.fd, &info_req, info_req.h.nlmsg_len, 0);
+	if (ret < 0) {
+		/* Latch errno first - logging may overwrite it. */
+		int err = errno;
+
+		IFPX_LOG(ERR, "Failed to send netlink msg: %d", err);
+		rte_errno = err;
+	}
+	rte_spinlock_unlock(&send_lock);
+
+	return ret;
+}
+
+/* Invoke the 'cfg_finished' callback of every registered callback set that
+ * provides one.  The callbacks run with the library lock held.
+ */
+static
+void notify_cfg_finished(void)
+{
+	struct ifpx_cbs_node *it;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(it, &ifpx_callbacks, elem) {
+		if (it->cbs.cfg_finished)
+			it->cbs.cfg_finished();
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Interrupt callback of the netlink socket.
+ *
+ * Reads one datagram from the socket and dispatches every netlink message
+ * found in it to the link/address/route handler.  When the end of a reply to
+ * one of our own global dump requests is seen the next dump in the sequence
+ * link -> address -> route is requested; after the route dump the
+ * 'cfg_finished' callbacks are notified.
+ */
+static
+void if_proxy_intr_callback(void *arg __rte_unused)
+{
+	struct nlmsghdr *h;
+	struct sockaddr_nl addr;
+	/* Value-result argument: must hold the size of 'addr' on input. */
+	socklen_t addr_len = sizeof(addr);
+	/* The buffer is accessed as struct nlmsghdr - align it accordingly. */
+	char buf[8192] __rte_aligned(NLMSG_ALIGNTO);
+	ssize_t len;
+
+restart:
+	len = recvfrom(ifpx_irq.fd, buf, sizeof(buf), 0,
+		       (struct sockaddr *)&addr, &addr_len);
+	if (len < 0) {
+		if (errno == EINTR) {
+			IFPX_LOG(DEBUG, "recvfrom() interrupted");
+			goto restart;
+		}
+		IFPX_LOG(ERR, "Failed to read netlink msg: %zd (errno %d)",
+			 len, errno);
+		return;
+	}
+	if (addr_len != sizeof(addr)) {
+		IFPX_LOG(ERR, "Invalid netlink addr size: %u", addr_len);
+		return;
+	}
+	IFPX_LOG(DEBUG, "Read %zd bytes (buf %zu) from %u/%u", len,
+		 sizeof(buf), addr.nl_pid, addr.nl_groups);
+
+	for (h = (struct nlmsghdr *)buf; NLMSG_OK(h, len);
+	     h = NLMSG_NEXT(h, len)) {
+		IFPX_LOG(DEBUG, "Recv msg: %u (%u/%u/%u seq/flags/pid)",
+			 h->nlmsg_type, h->nlmsg_seq, h->nlmsg_flags,
+			 h->nlmsg_pid);
+
+		switch (h->nlmsg_type) {
+		case RTM_NEWLINK:
+		case RTM_DELLINK:
+			handle_link(h);
+			break;
+		case RTM_NEWADDR:
+		case RTM_DELADDR:
+			handle_addr(h, h->nlmsg_type == RTM_DELADDR);
+			break;
+		case RTM_NEWROUTE:
+		case RTM_DELROUTE:
+			handle_route(h, h->nlmsg_type == RTM_DELROUTE);
+			break;
+		default:
+			/* Other message types need no per-type handling. */
+			break;
+		}
+
+		/* If this is a reply for global request then follow up with
+		 * additional requests and notify about finish.
+		 */
+		if (h->nlmsg_pid == ifpx_pid && (h->nlmsg_seq >> 8) == 0 &&
+		    h->nlmsg_type == NLMSG_DONE) {
+			if ((h->nlmsg_seq & 0xFF) == RTM_GETLINK)
+				request_info(RTM_GETADDR, 0);
+			else if ((h->nlmsg_seq & 0xFF) == RTM_GETADDR)
+				request_info(RTM_GETROUTE, 0);
+			else {
+				RTE_ASSERT((h->nlmsg_seq & 0xFF) ==
+					   RTM_GETROUTE);
+				notify_cfg_finished();
+			}
+		}
+	}
+	IFPX_LOG(DEBUG, "Finished msg loop: %zd bytes left", len);
+}
+
+/* Netlink socket options live on their own level; provide the value for
+ * C libraries that do not define it.
+ */
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+/* Open the netlink socket, subscribe to link, address and route notifications
+ * and register the interrupt callback.  On success the initial global link
+ * dump is requested.
+ *
+ * Returns 0 on success and -1 with rte_errno set on failure (EBUSY when the
+ * socket is already open).
+ */
+int rte_ifpx_listen(void)
+{
+	struct sockaddr_nl addr = {
+		.nl_family = AF_NETLINK,
+		.nl_pid = 0,
+	};
+	socklen_t addr_len = sizeof(addr);
+	int ret, err;
+
+	if (ifpx_irq.fd != -1) {
+		rte_errno = EBUSY;
+		return -1;
+	}
+
+	addr.nl_groups = 1 << (RTNLGRP_LINK-1)
+			| 1 << (RTNLGRP_IPV4_IFADDR-1)
+			| 1 << (RTNLGRP_IPV6_IFADDR-1)
+			| 1 << (RTNLGRP_IPV4_ROUTE-1)
+			| 1 << (RTNLGRP_IPV6_ROUTE-1);
+
+	ifpx_irq.fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+			     NETLINK_ROUTE);
+	if (ifpx_irq.fd == -1) {
+		/* Latch errno before logging - logging may overwrite it. */
+		err = errno;
+		IFPX_LOG(ERR, "Failed to create netlink socket: %d", err);
+		goto error;
+	}
+	/* Starting with kernel 4.19 you can request dump for a specific
+	 * interface and kernel will filter out and send only relevant info.
+	 * Otherwise NLM_F_DUMP will generate info for all interfaces and you
+	 * need to filter them yourself.
+	 */
+#ifdef NETLINK_DUMP_STRICT_CHK
+	ret = 1; /* use this var also as an input param */
+	ret = setsockopt(ifpx_irq.fd, SOL_NETLINK, NETLINK_DUMP_STRICT_CHK,
+			 &ret, sizeof(ret));
+	/* Not fatal: a kernel without this option sends unfiltered dumps and
+	 * the message handlers filter replies by interface index themselves.
+	 */
+	if (ret < 0)
+		IFPX_LOG(WARNING, "Failed to set socket option: %d", errno);
+#endif
+
+	ret = bind(ifpx_irq.fd, (struct sockaddr *)&addr, addr_len);
+	if (ret < 0) {
+		err = errno;
+		IFPX_LOG(ERR, "Failed to bind socket: %d", err);
+		goto error;
+	}
+	ret = getsockname(ifpx_irq.fd, (struct sockaddr *)&addr, &addr_len);
+	if (ret < 0) {
+		err = errno;
+		IFPX_LOG(ERR, "Failed to get socket addr: %d", err);
+		goto error;
+	}
+	ifpx_pid = addr.nl_pid;
+	IFPX_LOG(DEBUG, "Assigned port ID: %u", addr.nl_pid);
+
+	ret = rte_intr_callback_register(&ifpx_irq, if_proxy_intr_callback,
+					 NULL);
+	if (ret < 0) {
+		/* Reports the error as a negative value, not via errno. */
+		err = -ret;
+		goto error;
+	}
+
+	request_info(RTM_GETLINK, 0);
+
+	return 0;
+
+error:
+	rte_errno = err;
+	if (ifpx_irq.fd != -1) {
+		close(ifpx_irq.fd);
+		ifpx_irq.fd = -1;
+	}
+	return -1;
+}
+
+/* Stop listening: unregister the interrupt callback, close the netlink socket
+ * and release all registered callbacks, proxies and port bindings.
+ *
+ * Returns 0 on success and -EBADFD when the socket is not open.
+ */
+int rte_ifpx_close(void)
+{
+	struct ifpx_proxies_node *px;
+	struct ifpx_cbs_node *cbs;
+	unsigned int p;
+	int ec;
+
+	if (ifpx_irq.fd < 0)
+		return -EBADFD;
+
+	/* Retry for as long as the unregistration reports -EAGAIN -
+	 * unlikely but possible.
+	 */
+	do {
+		ec = rte_intr_callback_unregister(&ifpx_irq,
+						  if_proxy_intr_callback,
+						  NULL);
+	} while (ec == -EAGAIN);
+
+	rte_spinlock_lock(&ifpx_lock);
+
+	close(ifpx_irq.fd);
+	ifpx_irq.fd = -1;
+	ifpx_pid = 0;
+
+	/* Clear callbacks. */
+	while ((cbs = TAILQ_FIRST(&ifpx_callbacks)) != NULL) {
+		TAILQ_REMOVE(&ifpx_callbacks, cbs, elem);
+		free(cbs);
+	}
+
+	/* Clear proxies. */
+	while ((px = TAILQ_FIRST(&ifpx_proxies)) != NULL) {
+		TAILQ_REMOVE(&ifpx_proxies, px, elem);
+		free(px);
+	}
+
+	/* Drop all bindings. */
+	for (p = 0; p < RTE_DIM(ifpx_p2p); ++p)
+		ifpx_p2p[p] = RTE_MAX_ETHPORTS;
+
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return 0;
+}
+
+/* Library constructor: mark every port as unbound and register the log type
+ * of the library (default level: warning).
+ */
+RTE_INIT(if_proxy_init)
+{
+	unsigned int port;
+
+	for (port = 0; port < RTE_DIM(ifpx_p2p); port++)
+		ifpx_p2p[port] = RTE_MAX_ETHPORTS;
+
+	ifpx_log_type = rte_log_register("lib.if_proxy");
+	if (ifpx_log_type < 0)
+		return;
+	rte_log_set_level(ifpx_log_type, RTE_LOG_WARNING);
+}
diff --git a/lib/meson.build b/lib/meson.build
index 0af3efab2..c913b33dd 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -19,7 +19,7 @@ libraries = [
'acl', 'bbdev', 'bitratestats', 'cfgfile',
'compressdev', 'cryptodev',
'distributor', 'efd', 'eventdev',
- 'gro', 'gso', 'ip_frag', 'jobstats',
+ 'gro', 'gso', 'if_proxy', 'ip_frag', 'jobstats',
'kni', 'latencystats', 'lpm', 'member',
'power', 'pdump', 'rawdev',
'rcu', 'rib', 'reorder', 'sched', 'security', 'stack', 'vhost',
--
2.17.1
next prev parent reply other threads:[~2020-01-14 14:25 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-01-14 14:25 [dpdk-dev] [RFC PATCH 0/3] introduce IF proxy library Andrzej Ostruszka
2020-01-14 14:25 ` [dpdk-dev] [RFC PATCH 1/3] lib: introduce IF proxy library (API) Andrzej Ostruszka
2020-01-14 14:25 ` Andrzej Ostruszka [this message]
2020-01-14 14:25 ` [dpdk-dev] [RFC PATCH 3/3] if_proxy: add example, test and documentation Andrzej Ostruszka
2020-01-14 15:16 ` [dpdk-dev] [RFC PATCH 0/3] introduce IF proxy library Morten Brørup
2020-01-14 17:38 ` Andrzej Ostruszka
2020-01-15 10:15 ` Bruce Richardson
2020-01-15 11:27 ` Jerin Jacob
2020-01-15 12:28 ` Morten Brørup
2020-01-15 12:57 ` Jerin Jacob
2020-01-15 15:30 ` Morten Brørup
2020-01-15 16:04 ` Jerin Jacob
2020-01-15 18:15 ` Morten Brørup
2020-01-16 7:15 ` Jerin Jacob
2020-01-16 9:11 ` Morten Brørup
2020-01-16 9:09 ` Andrzej Ostruszka
2020-01-16 9:30 ` Morten Brørup
2020-01-16 10:42 ` Andrzej Ostruszka
2020-01-16 10:58 ` Morten Brørup
2020-01-16 12:06 ` Andrzej Ostruszka
2020-01-15 14:09 ` Bruce Richardson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200114142517.29522-3-aostruszka@marvell.com \
--to=aostruszka@marvell.com \
--cc=dev@dpdk.org \
--cc=jerinj@marvell.com \
--cc=kirankumark@marvell.com \
--cc=kkanas@marvell.com \
--cc=ndabilpuram@marvell.com \
--cc=pbhagavatula@marvell.com \
--cc=thomas@monjalon.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).