DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
@ 2022-12-14 15:34 Shibin Koikkara Reeny
  0 siblings, 0 replies; 8+ messages in thread
From: Shibin Koikkara Reeny @ 2022-12-14 15:34 UTC (permalink / raw)
  To: dev, anatoly.burakov, bruce.richardson
  Cc: ciara.loftus, qi.z.zhang, Shibin Koikkara Reeny

Integrate support for the AF_XDP CNI and device plugin [1] so that the
DPDK AF_XDP PMD can work in an unprivileged container environment.
Part of the AF_XDP PMD initialization process involves loading
an eBPF program onto the given netdev. This operation requires
privileges, which prevents the PMD from being able to work in an
unprivileged container (without root access). The plugin CNI handles
the program loading. CNI open Unix Domain Socket (UDS) and waits
listening for a client to make requests over that UDS. The client(DPDK)
connects and a "handshake" occurs, then the File Descriptor which points
to the XSKMAP associated with the loaded eBPF program is handed over
to the client. The client can then proceed with creating an AF_XDP
socket and inserting the socket into the XSKMAP pointed to by the
FD received on the UDS.

A new vdev arg "use_cni" is created to indicate user wishes to run
the PMD in unprivileged mode and to receive the XSKMAP FD from the CNI.
When this flag is set, the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD libbpf flag
should be used when creating the socket, which tells libbpf not to load the
default libbpf program on the netdev. We tell libbpf not to do this because
the loading is handled by the CNI in this scenario.

[1]: https://github.com/intel/afxdp-plugins-for-kubernetes

Signed-off-by: Shibin Koikkara Reeny <shibin.koikkara.reeny@intel.com>
---
 drivers/net/af_xdp/rte_eth_af_xdp.c | 337 +++++++++++++++++++++++++++-
 1 file changed, 325 insertions(+), 12 deletions(-)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
index b6ec9bf490..196d98ad97 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -7,6 +7,7 @@
 #include <string.h>
 #include <netinet/in.h>
 #include <net/if.h>
+#include <sys/un.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <linux/if_ether.h>
@@ -81,6 +82,24 @@ RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype, NOTICE);
 
 #define ETH_AF_XDP_MP_KEY "afxdp_mp_send_fds"
 
+#define MAX_LONG_OPT_SZ			64
+#define UDS_MAX_FD_NUM			2
+#define UDS_MAX_CMD_LEN			64
+#define UDS_MAX_CMD_RESP		128
+#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
+#define UDS_SOCK			"/tmp/afxdp.sock"
+#define UDS_CONNECT_MSG			"/connect"
+#define UDS_HOST_OK_MSG			"/host_ok"
+#define UDS_HOST_NAK_MSG		"/host_nak"
+#define UDS_VERSION_MSG			"/version"
+#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
+#define UDS_XSK_SOCKET_MSG		"/xsk_socket"
+#define UDS_FD_ACK_MSG			"/fd_ack"
+#define UDS_FD_NAK_MSG			"/fd_nak"
+#define UDS_FIN_MSG			"/fin"
+#define UDS_FIN_ACK_MSG			"/fin_ack"
+
+
 static int afxdp_dev_count;
 
 /* Message header to synchronize fds via IPC */
@@ -151,6 +170,7 @@ struct pmd_internals {
 	char prog_path[PATH_MAX];
 	bool custom_prog_configured;
 	bool force_copy;
+	bool use_cni;
 	struct bpf_map *map;
 
 	struct rte_ether_addr eth_addr;
@@ -170,6 +190,7 @@ struct pmd_process_private {
 #define ETH_AF_XDP_PROG_ARG			"xdp_prog"
 #define ETH_AF_XDP_BUDGET_ARG			"busy_budget"
 #define ETH_AF_XDP_FORCE_COPY_ARG		"force_copy"
+#define ETH_AF_XDP_USE_CNI_ARG			"use_cni"
 
 static const char * const valid_arguments[] = {
 	ETH_AF_XDP_IFACE_ARG,
@@ -179,8 +200,8 @@ static const char * const valid_arguments[] = {
 	ETH_AF_XDP_PROG_ARG,
 	ETH_AF_XDP_BUDGET_ARG,
 	ETH_AF_XDP_FORCE_COPY_ARG,
-	NULL
-};
+	ETH_AF_XDP_USE_CNI_ARG,
+	NULL};
 
 static const struct rte_eth_link pmd_link = {
 	.link_speed = RTE_ETH_SPEED_NUM_10G,
@@ -1129,7 +1150,8 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
 		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
 				&rxq->fq, &rxq->cq, &usr_config);
 		if (ret) {
-			AF_XDP_LOG(ERR, "Failed to create umem\n");
+			AF_XDP_LOG(ERR, "Failed to create umem [%d]: [%s]\n",
+				   errno, strerror(errno));
 			goto err;
 		}
 		umem->buffer = base_addr;
@@ -1314,6 +1336,245 @@ configure_preferred_busy_poll(struct pkt_rx_queue *rxq)
 	return 0;
 }
 
+static int
+init_uds_sock(struct sockaddr_un *server)
+{
+	int sock;
+
+	sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+	if (sock < 0) {
+		AF_XDP_LOG(ERR, "Failed to opening stream socket\n");
+		return -1;
+	}
+
+	server->sun_family = AF_UNIX;
+	strlcpy(server->sun_path, UDS_SOCK, sizeof(server->sun_path));
+
+	if (connect(sock, (struct sockaddr *)server, sizeof(struct sockaddr_un)) < 0) {
+		close(sock);
+		AF_XDP_LOG(ERR, "Error connecting stream socket errno = [%d]: [%s]\n",
+			   errno, strerror(errno));
+		return -1;
+	}
+
+	return sock;
+}
+
+struct msg_internal {
+	char response[UDS_MAX_CMD_RESP];
+	int len_param;
+	int num_fds;
+	int fds[UDS_MAX_FD_NUM];
+};
+
+static int
+send_msg(int sock, char *request, int *fd)
+{
+	int snd;
+	struct iovec iov;
+	struct msghdr msgh;
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+	char control[CMSG_SPACE(sizeof(*fd))];
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	strlcpy(dst.sun_path, UDS_SOCK, sizeof(dst.sun_path));
+
+	/* Initialize message header structure */
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+	iov.iov_base = request;
+	iov.iov_len = strlen(request);
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	/* Translate the FD. */
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(sizeof(*fd));
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), fd, sizeof(*fd));
+
+	/* Send the request message. */
+	do {
+		snd = sendmsg(sock, &msgh, 0);
+	} while (snd < 0 && errno == EINTR);
+
+	return snd;
+}
+
+static int
+read_msg(int sock, char *response, struct sockaddr_un *s, int *fd)
+{
+	int msglen;
+	struct msghdr msgh;
+	struct iovec iov;
+	char control[CMSG_SPACE(sizeof(*fd))];
+	struct cmsghdr *cmsg;
+
+	/* Initialize message header structure */
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = response;
+	iov.iov_len = UDS_MAX_CMD_RESP;
+
+	msgh.msg_name = s;
+	msgh.msg_namelen = sizeof(*s);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	msglen = recvmsg(sock, &msgh, 0);
+
+	/* zero length message means socket was closed */
+	if (msglen == 0)
+		return 0;
+
+	if (msglen < 0) {
+		AF_XDP_LOG(ERR, "recvmsg failed, %s\n", strerror(errno));
+		return -1;
+	}
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+			cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if (cmsg->cmsg_level == SOL_SOCKET &&
+				cmsg->cmsg_type == SCM_RIGHTS) {
+			memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd));
+			break;
+		}
+	}
+
+	response[msglen] = '\0';
+	return msglen;
+}
+
+static int
+make_request_cni(int sock, struct sockaddr_un *server, char *request,
+		 int *req_fd, char *response, int *out_fd)
+{
+	int rval;
+
+	AF_XDP_LOG(INFO, "Request: [%s]\n", request);
+
+	/* if no file descriptor to send then directly write to socket.
+	 * else use sendmsg() to send the file descriptor.
+	 */
+	if (req_fd == NULL)
+		rval = write(sock, request, strlen(request));
+	else
+		rval = send_msg(sock, request, req_fd);
+
+	if (rval < 0) {
+		AF_XDP_LOG(ERR, "Write error %s\n", strerror(errno));
+		return -1;
+	}
+
+	rval = read_msg(sock, response, server, out_fd);
+	if (rval <= 0) {
+		AF_XDP_LOG(ERR, "Read error %d\n", rval);
+		return -1;
+	}
+	AF_XDP_LOG(INFO, "Response: [%s]\n", request);
+
+	return 0;
+}
+
+static int
+check_response(char *response, char *exp_resp, long size)
+{
+	return strncmp(response, exp_resp, size);
+}
+
+static int
+get_cni_fd(char *if_name)
+{
+	char request[UDS_MAX_CMD_LEN], response[UDS_MAX_CMD_RESP];
+	char hostname[MAX_LONG_OPT_SZ], exp_resp[UDS_MAX_CMD_RESP];
+	struct sockaddr_un server;
+	int xsk_map_fd = -1, out_fd = 0;
+	int sock, err;
+
+	err = gethostname(hostname, MAX_LONG_OPT_SZ - 1);
+	if (err)
+		return -1;
+
+	memset(&server, 0, sizeof(server));
+	sock = init_uds_sock(&server);
+
+	/* Initiates handshake to CNI send: /connect,hostname */
+	snprintf(request, sizeof(request), "%s,%s", UDS_CONNECT_MSG, hostname);
+	memset(response, 0, sizeof(response));
+	if (make_request_cni(sock, &server, request, NULL, response, &out_fd) < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	/* Expect /host_ok */
+	strlcpy(exp_resp, UDS_HOST_OK_MSG, UDS_MAX_CMD_LEN);
+	if (check_response(response, exp_resp, strlen(exp_resp)) < 0) {
+		AF_XDP_LOG(ERR, "Unexpected response [%s]\n", response);
+		goto err_close;
+	}
+	/* Request for "/version" */
+	strlcpy(request, UDS_VERSION_MSG, UDS_MAX_CMD_LEN);
+	memset(response, 0, sizeof(response));
+	if (make_request_cni(sock, &server, request, NULL, response, &out_fd) < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	/* Request for file descriptor for netdev name*/
+	snprintf(request, sizeof(request), "%s,%s", UDS_XSK_MAP_FD_MSG, if_name);
+	memset(response, 0, sizeof(response));
+	if (make_request_cni(sock, &server, request, NULL, response, &out_fd) < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	if (out_fd < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	xsk_map_fd = out_fd;
+
+	/* Expect fd_ack with file descriptor */
+	strlcpy(exp_resp, UDS_FD_ACK_MSG, UDS_MAX_CMD_LEN);
+	if (check_response(response, exp_resp, strlen(exp_resp)) < 0) {
+		AF_XDP_LOG(ERR, "Unexpected response [%s]\n", response);
+		goto err_close;
+	}
+
+	/* Initiate close connection */
+	strlcpy(request, UDS_FIN_MSG, UDS_MAX_CMD_LEN);
+	memset(response, 0, sizeof(response));
+	if (make_request_cni(sock, &server, request, NULL, response, &out_fd) < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	/* Connection close */
+	strlcpy(exp_resp, UDS_FIN_ACK_MSG, UDS_MAX_CMD_LEN);
+	if (check_response(response, exp_resp, strlen(exp_resp)) < 0) {
+		AF_XDP_LOG(ERR, "Unexpected response [%s]\n", response);
+		goto err_close;
+	}
+	close(sock);
+
+	return xsk_map_fd;
+
+err_close:
+	close(sock);
+	return -1;
+}
+
 static int
 xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
 	      int ring_size)
@@ -1362,6 +1623,10 @@ xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
 	cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
 #endif
 
+	/* Disable libbpf from loading XDP program */
+	if (internals->use_cni)
+		cfg.libbpf_flags |= XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
+
 	if (strnlen(internals->prog_path, PATH_MAX)) {
 		if (!internals->custom_prog_configured) {
 			ret = load_custom_xdp_prog(internals->prog_path,
@@ -1413,7 +1678,23 @@ xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
 		}
 	}
 
-	if (rxq->busy_budget) {
+	if (internals->use_cni) {
+		int err, fd, map_fd;
+
+		/* get socket fd from CNI plugin */
+		map_fd = get_cni_fd(internals->if_name);
+		if (map_fd < 0) {
+			AF_XDP_LOG(ERR, "Failed to receive CNI plugin fd\n");
+			goto out_xsk;
+		}
+		/* get socket fd */
+		fd = xsk_socket__fd(rxq->xsk);
+		err = bpf_map_update_elem(map_fd, &rxq->xsk_queue_idx, &fd, 0);
+		if (err) {
+			AF_XDP_LOG(ERR, "Failed to insert unprivileged xsk in map.\n");
+			goto out_xsk;
+		}
+	} else if (rxq->busy_budget) {
 		ret = configure_preferred_busy_poll(rxq);
 		if (ret) {
 			AF_XDP_LOG(ERR, "Failed configure busy polling.\n");
@@ -1584,6 +1865,26 @@ static const struct eth_dev_ops ops = {
 	.get_monitor_addr = eth_get_monitor_addr,
 };
 
+/* CNI option works in unprivileged container environment
+ * and ethernet device functionality will be reduced. So
+ * additional customiszed eth_dev_ops struct is needed
+ * for cni.
+ **/
+static const struct eth_dev_ops ops_cni = {
+	.dev_start = eth_dev_start,
+	.dev_stop = eth_dev_stop,
+	.dev_close = eth_dev_close,
+	.dev_configure = eth_dev_configure,
+	.dev_infos_get = eth_dev_info,
+	.mtu_set = eth_dev_mtu_set,
+	.rx_queue_setup = eth_rx_queue_setup,
+	.tx_queue_setup = eth_tx_queue_setup,
+	.link_update = eth_link_update,
+	.stats_get = eth_stats_get,
+	.stats_reset = eth_stats_reset,
+	.get_monitor_addr = eth_get_monitor_addr,
+};
+
 /** parse busy_budget argument */
 static int
 parse_budget_arg(const char *key __rte_unused,
@@ -1704,8 +2005,8 @@ xdp_get_channels_info(const char *if_name, int *max_queues,
 
 static int
 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
-			int *queue_cnt, int *shared_umem, char *prog_path,
-			int *busy_budget, int *force_copy)
+		 int *queue_cnt, int *shared_umem, char *prog_path,
+		 int *busy_budget, int *force_copy, int *use_cni)
 {
 	int ret;
 
@@ -1746,6 +2047,11 @@ parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
 	if (ret < 0)
 		goto free_kvlist;
 
+	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_USE_CNI_ARG,
+				 &parse_integer_arg, use_cni);
+	if (ret < 0)
+		goto free_kvlist;
+
 free_kvlist:
 	rte_kvargs_free(kvlist);
 	return ret;
@@ -1783,8 +2089,9 @@ get_iface_info(const char *if_name,
 
 static struct rte_eth_dev *
 init_internals(struct rte_vdev_device *dev, const char *if_name,
-		int start_queue_idx, int queue_cnt, int shared_umem,
-		const char *prog_path, int busy_budget, int force_copy)
+	       int start_queue_idx, int queue_cnt, int shared_umem,
+	       const char *prog_path, int busy_budget, int force_copy,
+	       int use_cni)
 {
 	const char *name = rte_vdev_device_name(dev);
 	const unsigned int numa_node = dev->device.numa_node;
@@ -1813,6 +2120,7 @@ init_internals(struct rte_vdev_device *dev, const char *if_name,
 #endif
 	internals->shared_umem = shared_umem;
 	internals->force_copy = force_copy;
+	internals->use_cni = use_cni;
 
 	if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
 				  &internals->combined_queue_cnt)) {
@@ -1871,7 +2179,11 @@ init_internals(struct rte_vdev_device *dev, const char *if_name,
 	eth_dev->data->dev_link = pmd_link;
 	eth_dev->data->mac_addrs = &internals->eth_addr;
 	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
-	eth_dev->dev_ops = &ops;
+	if (!internals->use_cni)
+		eth_dev->dev_ops = &ops;
+	else
+		eth_dev->dev_ops = &ops_cni;
+
 	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
 	eth_dev->tx_pkt_burst = eth_af_xdp_tx;
 	eth_dev->process_private = process_private;
@@ -1998,6 +2310,7 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 	char prog_path[PATH_MAX] = {'\0'};
 	int busy_budget = -1, ret;
 	int force_copy = 0;
+	int use_cni = 0;
 	struct rte_eth_dev *eth_dev = NULL;
 	const char *name = rte_vdev_device_name(dev);
 
@@ -2043,7 +2356,7 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 
 	if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
 			     &xsk_queue_cnt, &shared_umem, prog_path,
-			     &busy_budget, &force_copy) < 0) {
+			     &busy_budget, &force_copy, &use_cni) < 0) {
 		AF_XDP_LOG(ERR, "Invalid kvargs value\n");
 		return -EINVAL;
 	}
@@ -2057,8 +2370,8 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 					busy_budget;
 
 	eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
-					xsk_queue_cnt, shared_umem, prog_path,
-					busy_budget, force_copy);
+				 xsk_queue_cnt, shared_umem, prog_path,
+				 busy_budget, force_copy, use_cni);
 	if (eth_dev == NULL) {
 		AF_XDP_LOG(ERR, "Failed to init internals\n");
 		return -1;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
  2023-02-02 15:18         ` Bruce Richardson
@ 2023-02-02 17:02           ` Koikkara Reeny, Shibin
  0 siblings, 0 replies; 8+ messages in thread
From: Koikkara Reeny, Shibin @ 2023-02-02 17:02 UTC (permalink / raw)
  To: Richardson, Bruce, Zhang, Qi Z; +Cc: dev, Burakov, Anatoly, Loftus, Ciara



> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Thursday, February 2, 2023 3:19 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>
> Cc: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>;
> dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>; Loftus,
> Ciara <ciara.loftus@intel.com>
> Subject: Re: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> 
> On Thu, Feb 02, 2023 at 02:49:52PM +0000, Zhang, Qi Z wrote:
> >
> >
> > > -----Original Message-----
> > > From: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>
> > > Sent: Thursday, January 19, 2023 11:10 PM
> > > To: Zhang, Qi Z <qi.z.zhang@intel.com>; dev@dpdk.org; Burakov,
> > > Anatoly <anatoly.burakov@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>
> > > Cc: Loftus, Ciara <ciara.loftus@intel.com>
> > > Subject: RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> > >
> > >
> > > > -----Original Message-----
> > > > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > > > Sent: Wednesday, January 18, 2023 12:10 PM
> > > > To: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>;
> > > > dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > > > Richardson, Bruce <bruce.richardson@intel.com>
> > > > Cc: Loftus, Ciara <ciara.loftus@intel.com>
> > > > Subject: RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>
> > > > > Sent: Wednesday, December 14, 2022 11:41 PM
> > > > > To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > > > > Richardson, Bruce <bruce.richardson@intel.com>
> > > > > Cc: Loftus, Ciara <ciara.loftus@intel.com>; Zhang, Qi Z
> > > > > <qi.z.zhang@intel.com>; Koikkara Reeny, Shibin
> > > > > <shibin.koikkara.reeny@intel.com>
> > > > > Subject: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> > > > >
> > > > > Integrate support for the AF_XDP CNI and device plugin [1] so
> > > > > that the DPDK AF_XDP PMD can work in an unprivileged container
> > > environment.
> > > > > Part of the AF_XDP PMD initialization process involves loading
> > > > > an eBPF program onto the given netdev. This operation requires
> > > > > privileges, which prevents the PMD from being able to work in an
> > > > > unprivileged container (without root access). The plugin CNI
> > > > > handles the program loading. CNI open Unix Domain Socket (UDS)
> > > > > and waits listening for a client to make requests over that UDS.
> > > > > The
> > > > > client(DPDK) connects and a "handshake" occurs, then the File
> > > > > Descriptor which points to the XSKMAP associated with the loaded
> > > > > eBPF program is handed over to the client. The client can then
> > > > > proceed with creating an AF_XDP socket and inserting the socket
> > > > > into the XSKMAP pointed to by the FD received on the
> > > > UDS.
> > > > >
> > > > > A new vdev arg "use_cni" is created to indicate user wishes to
> > > > > run the PMD in unprivileged mode and to receive the XSKMAP FD
> > > > > from the
> > > CNI.
> > > > > When this flag is set, the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD
> > > > > libbpf flag should be used when creating the socket, which tells
> > > > > libbpf not to load the default libbpf program on the netdev. We
> > > > > tell libbpf not to do this because the loading is handled by the
> > > > > CNI in this
> > > scenario.
> > > > >
> > > > > [1]: https://github.com/intel/afxdp-plugins-for-kubernetes
> > > > >
> > > > > Signed-off-by: Shibin Koikkara Reeny
> > > > > <shibin.koikkara.reeny@intel.com>
> > > > > ---
> > > > >  drivers/net/af_xdp/rte_eth_af_xdp.c | 337
> > > > > +++++++++++++++++++++++++++-
> > > > >  1 file changed, 325 insertions(+), 12 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > > > b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > > > index b6ec9bf490..196d98ad97 100644
> > > > > --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > > > +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > > > @@ -7,6 +7,7 @@
> > > > >  #include <string.h>
> > > > >  #include <netinet/in.h>
> > > > >  #include <net/if.h>
> > > > > +#include <sys/un.h>
> > > > >  #include <sys/socket.h>
> > > > >  #include <sys/ioctl.h>
> > > > >  #include <linux/if_ether.h>
> > > > > @@ -81,6 +82,24 @@ RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype,
> > > > > NOTICE);
> > > > >
> > > > >  #define ETH_AF_XDP_MP_KEY "afxdp_mp_send_fds"
> > > > >
> > > > > +#define MAX_LONG_OPT_SZ			64
> > > > > +#define UDS_MAX_FD_NUM			2
> > > > > +#define UDS_MAX_CMD_LEN			64
> > > > > +#define UDS_MAX_CMD_RESP		128
> > > > > +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> > > > > +#define UDS_SOCK			"/tmp/afxdp.sock"
> > > > > +#define UDS_CONNECT_MSG			"/connect"
> > > > > +#define UDS_HOST_OK_MSG			"/host_ok"
> > > > > +#define UDS_HOST_NAK_MSG		"/host_nak"
> > > > > +#define UDS_VERSION_MSG			"/version"
> > > > > +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> > > > > +#define UDS_XSK_SOCKET_MSG		"/xsk_socket"
> > > > > +#define UDS_FD_ACK_MSG			"/fd_ack"
> > > > > +#define UDS_FD_NAK_MSG			"/fd_nak"
> > > > > +#define UDS_FIN_MSG			"/fin"
> > > > > +#define UDS_FIN_ACK_MSG			"/fin_ack"
> > > > > +
> > > > > +
> > > > >  static int afxdp_dev_count;
> > > > >
> > > > >  /* Message header to synchronize fds via IPC */ @@ -151,6
> > > > > +170,7 @@ struct pmd_internals {
> > > > >  	char prog_path[PATH_MAX];
> > > > >  	bool custom_prog_configured;
> > > > >  	bool force_copy;
> > > > > +	bool use_cni;
> > > > >  	struct bpf_map *map;
> > > > >
> > > > >  	struct rte_ether_addr eth_addr; @@ -170,6 +190,7 @@
> struct
> > > > > pmd_process_private {
> > > > >  #define ETH_AF_XDP_PROG_ARG			"xdp_prog"
> > > > >  #define ETH_AF_XDP_BUDGET_ARG
> > > > 	"busy_budget"
> > > > >  #define ETH_AF_XDP_FORCE_COPY_ARG		"force_copy"
> > > > > +#define ETH_AF_XDP_USE_CNI_ARG			"use_cni"
> > > > >
> > > > >  static const char * const valid_arguments[] = {
> > > > >  	ETH_AF_XDP_IFACE_ARG,
> > > > > @@ -179,8 +200,8 @@ static const char * const valid_arguments[] = {
> > > > >  	ETH_AF_XDP_PROG_ARG,
> > > > >  	ETH_AF_XDP_BUDGET_ARG,
> > > > >  	ETH_AF_XDP_FORCE_COPY_ARG,
> > > > > -	NULL
> > > > > -};
> > > > > +	ETH_AF_XDP_USE_CNI_ARG,
> > > > > +	NULL};
> > > > >
> > > > >  static const struct rte_eth_link pmd_link = {
> > > > >  	.link_speed = RTE_ETH_SPEED_NUM_10G, @@ -1129,7
> +1150,8
> > > @@
> > > > > xsk_umem_info *xdp_umem_configure(struct pmd_internals
> *internals,
> > > > >  		ret = xsk_umem__create(&umem->umem,
> base_addr,
> > > > umem_size,
> > > > >  				&rxq->fq, &rxq->cq, &usr_config);
> > > > >  		if (ret) {
> > > > > -			AF_XDP_LOG(ERR, "Failed to create
> umem\n");
> > > > > +			AF_XDP_LOG(ERR, "Failed to create umem
> [%d]:
> > > > > [%s]\n",
> > > > > +				   errno, strerror(errno));
> > > > >  			goto err;
> > > > >  		}
> > > > >  		umem->buffer = base_addr;
> > > > > @@ -1314,6 +1336,245 @@ configure_preferred_busy_poll(struct
> > > > > pkt_rx_queue *rxq)
> > > > >  	return 0;
> > > > >  }
> > > > >
> > > > > +static int
> > > > > +init_uds_sock(struct sockaddr_un *server) {
> > > > > +	int sock;
> > > > > +
> > > > > +	sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> > > > > +	if (sock < 0) {
> > > > > +		AF_XDP_LOG(ERR, "Failed to opening stream
> socket\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	server->sun_family = AF_UNIX;
> > > > > +	strlcpy(server->sun_path, UDS_SOCK, sizeof(server-
> >sun_path));
> > > > > +
> > > > > +	if (connect(sock, (struct sockaddr *)server, sizeof(struct
> > > > > sockaddr_un)) < 0) {
> > > >
> > > > 	seems the server address is hard coded as  "/tmp/afxdp.sock", is
> > > > any spec we follows, or should we parse this as a devargs?
> > > > 	better add some comment or external link that help to explain
> > > > this
> > >
> > > It was already hardcoded in the afxdp-plugins
> > > https://github.com/intel/afxdp-plugins-for-
> > > kubernetes/blob/main/constants/constants.go .
> >
> > OK, I saw this has been explained in your new doc patch.
> >
> > Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
> >
> I would suggest the doc patch should be merged into this patch - code and
> doc should go together. Please do a v3 with both patches together, thanks.
> 
> Assuming Qi is ok with it, you can probably keep his ack on the new version.
> 
I have merged the doc patch with code patch together. 
Patch V3 :http://patches.dpdk.org/project/dpdk/patch/20230202165513.31012-1-shibin.koikkara.reeny@intel.com/

> /Bruce

Regards,
Shibin

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
  2023-02-02 14:49       ` Zhang, Qi Z
@ 2023-02-02 15:18         ` Bruce Richardson
  2023-02-02 17:02           ` Koikkara Reeny, Shibin
  0 siblings, 1 reply; 8+ messages in thread
From: Bruce Richardson @ 2023-02-02 15:18 UTC (permalink / raw)
  To: Zhang, Qi Z; +Cc: Koikkara Reeny, Shibin, dev, Burakov, Anatoly, Loftus, Ciara

On Thu, Feb 02, 2023 at 02:49:52PM +0000, Zhang, Qi Z wrote:
> 
> 
> > -----Original Message-----
> > From: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>
> > Sent: Thursday, January 19, 2023 11:10 PM
> > To: Zhang, Qi Z <qi.z.zhang@intel.com>; dev@dpdk.org; Burakov, Anatoly
> > <anatoly.burakov@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > Cc: Loftus, Ciara <ciara.loftus@intel.com>
> > Subject: RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> > 
> > 
> > > -----Original Message-----
> > > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > > Sent: Wednesday, January 18, 2023 12:10 PM
> > > To: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>;
> > > dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > > Richardson, Bruce <bruce.richardson@intel.com>
> > > Cc: Loftus, Ciara <ciara.loftus@intel.com>
> > > Subject: RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>
> > > > Sent: Wednesday, December 14, 2022 11:41 PM
> > > > To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > > > Richardson, Bruce <bruce.richardson@intel.com>
> > > > Cc: Loftus, Ciara <ciara.loftus@intel.com>; Zhang, Qi Z
> > > > <qi.z.zhang@intel.com>; Koikkara Reeny, Shibin
> > > > <shibin.koikkara.reeny@intel.com>
> > > > Subject: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> > > >
> > > > Integrate support for the AF_XDP CNI and device plugin [1] so that
> > > > the DPDK AF_XDP PMD can work in an unprivileged container
> > environment.
> > > > Part of the AF_XDP PMD initialization process involves loading an
> > > > eBPF program onto the given netdev. This operation requires
> > > > privileges, which prevents the PMD from being able to work in an
> > > > unprivileged container (without root access). The plugin CNI handles
> > > > the program loading. CNI open Unix Domain Socket (UDS) and waits
> > > > listening for a client to make requests over that UDS. The
> > > > client(DPDK) connects and a "handshake" occurs, then the File
> > > > Descriptor which points to the XSKMAP associated with the loaded
> > > > eBPF program is handed over to the client. The client can then
> > > > proceed with creating an AF_XDP socket and inserting the socket into
> > > > the XSKMAP pointed to by the FD received on the
> > > UDS.
> > > >
> > > > A new vdev arg "use_cni" is created to indicate user wishes to run
> > > > the PMD in unprivileged mode and to receive the XSKMAP FD from the
> > CNI.
> > > > When this flag is set, the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD
> > > > libbpf flag should be used when creating the socket, which tells
> > > > libbpf not to load the default libbpf program on the netdev. We tell
> > > > libbpf not to do this because the loading is handled by the CNI in this
> > scenario.
> > > >
> > > > [1]: https://github.com/intel/afxdp-plugins-for-kubernetes
> > > >
> > > > Signed-off-by: Shibin Koikkara Reeny
> > > > <shibin.koikkara.reeny@intel.com>
> > > > ---
> > > >  drivers/net/af_xdp/rte_eth_af_xdp.c | 337
> > > > +++++++++++++++++++++++++++-
> > > >  1 file changed, 325 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > > b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > > index b6ec9bf490..196d98ad97 100644
> > > > --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > > +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > > @@ -7,6 +7,7 @@
> > > >  #include <string.h>
> > > >  #include <netinet/in.h>
> > > >  #include <net/if.h>
> > > > +#include <sys/un.h>
> > > >  #include <sys/socket.h>
> > > >  #include <sys/ioctl.h>
> > > >  #include <linux/if_ether.h>
> > > > @@ -81,6 +82,24 @@ RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype,
> > > > NOTICE);
> > > >
> > > >  #define ETH_AF_XDP_MP_KEY "afxdp_mp_send_fds"
> > > >
> > > > +#define MAX_LONG_OPT_SZ			64
> > > > +#define UDS_MAX_FD_NUM			2
> > > > +#define UDS_MAX_CMD_LEN			64
> > > > +#define UDS_MAX_CMD_RESP		128
> > > > +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> > > > +#define UDS_SOCK			"/tmp/afxdp.sock"
> > > > +#define UDS_CONNECT_MSG			"/connect"
> > > > +#define UDS_HOST_OK_MSG			"/host_ok"
> > > > +#define UDS_HOST_NAK_MSG		"/host_nak"
> > > > +#define UDS_VERSION_MSG			"/version"
> > > > +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> > > > +#define UDS_XSK_SOCKET_MSG		"/xsk_socket"
> > > > +#define UDS_FD_ACK_MSG			"/fd_ack"
> > > > +#define UDS_FD_NAK_MSG			"/fd_nak"
> > > > +#define UDS_FIN_MSG			"/fin"
> > > > +#define UDS_FIN_ACK_MSG			"/fin_ack"
> > > > +
> > > > +
> > > >  static int afxdp_dev_count;
> > > >
> > > >  /* Message header to synchronize fds via IPC */ @@ -151,6 +170,7 @@
> > > > struct pmd_internals {
> > > >  	char prog_path[PATH_MAX];
> > > >  	bool custom_prog_configured;
> > > >  	bool force_copy;
> > > > +	bool use_cni;
> > > >  	struct bpf_map *map;
> > > >
> > > >  	struct rte_ether_addr eth_addr;
> > > > @@ -170,6 +190,7 @@ struct pmd_process_private {
> > > >  #define ETH_AF_XDP_PROG_ARG			"xdp_prog"
> > > >  #define ETH_AF_XDP_BUDGET_ARG
> > > 	"busy_budget"
> > > >  #define ETH_AF_XDP_FORCE_COPY_ARG		"force_copy"
> > > > +#define ETH_AF_XDP_USE_CNI_ARG			"use_cni"
> > > >
> > > >  static const char * const valid_arguments[] = {
> > > >  	ETH_AF_XDP_IFACE_ARG,
> > > > @@ -179,8 +200,8 @@ static const char * const valid_arguments[] = {
> > > >  	ETH_AF_XDP_PROG_ARG,
> > > >  	ETH_AF_XDP_BUDGET_ARG,
> > > >  	ETH_AF_XDP_FORCE_COPY_ARG,
> > > > -	NULL
> > > > -};
> > > > +	ETH_AF_XDP_USE_CNI_ARG,
> > > > +	NULL};
> > > >
> > > >  static const struct rte_eth_link pmd_link = {
> > > >  	.link_speed = RTE_ETH_SPEED_NUM_10G, @@ -1129,7 +1150,8
> > @@
> > > > xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> > > >  		ret = xsk_umem__create(&umem->umem, base_addr,
> > > umem_size,
> > > >  				&rxq->fq, &rxq->cq, &usr_config);
> > > >  		if (ret) {
> > > > -			AF_XDP_LOG(ERR, "Failed to create umem\n");
> > > > +			AF_XDP_LOG(ERR, "Failed to create umem [%d]:
> > > > [%s]\n",
> > > > +				   errno, strerror(errno));
> > > >  			goto err;
> > > >  		}
> > > >  		umem->buffer = base_addr;
> > > > @@ -1314,6 +1336,245 @@ configure_preferred_busy_poll(struct
> > > > pkt_rx_queue *rxq)
> > > >  	return 0;
> > > >  }
> > > >
> > > > +static int
> > > > +init_uds_sock(struct sockaddr_un *server) {
> > > > +	int sock;
> > > > +
> > > > +	sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> > > > +	if (sock < 0) {
> > > > +		AF_XDP_LOG(ERR, "Failed to opening stream socket\n");
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	server->sun_family = AF_UNIX;
> > > > +	strlcpy(server->sun_path, UDS_SOCK, sizeof(server->sun_path));
> > > > +
> > > > +	if (connect(sock, (struct sockaddr *)server, sizeof(struct
> > > > sockaddr_un)) < 0) {
> > >
> > > 	seems the server address is hard coded as  "/tmp/afxdp.sock", is any
> > > spec we follows, or should we parse this as a devargs?
> > > 	better add some comment or external link that help to explain this
> > 
> > It was already hardcoded in the afxdp-plugins
> > https://github.com/intel/afxdp-plugins-for-
> > kubernetes/blob/main/constants/constants.go .
> 
> OK, I saw this has been explained in your new doc patch.
> 
> Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
> 
I would suggest the doc patch should be merged into this patch - code and
doc should go together. Please do a v3 with both patches together, thanks.

Assuming Qi is ok with it, you can probably keep his ack on the new
version.

/Bruce

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
  2023-01-19 15:10     ` Koikkara Reeny, Shibin
@ 2023-02-02 14:49       ` Zhang, Qi Z
  2023-02-02 15:18         ` Bruce Richardson
  0 siblings, 1 reply; 8+ messages in thread
From: Zhang, Qi Z @ 2023-02-02 14:49 UTC (permalink / raw)
  To: Koikkara Reeny, Shibin, dev, Burakov, Anatoly, Richardson, Bruce
  Cc: Loftus, Ciara



> -----Original Message-----
> From: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>
> Sent: Thursday, January 19, 2023 11:10 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: Loftus, Ciara <ciara.loftus@intel.com>
> Subject: RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> 
> 
> > -----Original Message-----
> > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > Sent: Wednesday, January 18, 2023 12:10 PM
> > To: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>;
> > dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > Richardson, Bruce <bruce.richardson@intel.com>
> > Cc: Loftus, Ciara <ciara.loftus@intel.com>
> > Subject: RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> >
> >
> >
> > > -----Original Message-----
> > > From: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>
> > > Sent: Wednesday, December 14, 2022 11:41 PM
> > > To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > > Richardson, Bruce <bruce.richardson@intel.com>
> > > Cc: Loftus, Ciara <ciara.loftus@intel.com>; Zhang, Qi Z
> > > <qi.z.zhang@intel.com>; Koikkara Reeny, Shibin
> > > <shibin.koikkara.reeny@intel.com>
> > > Subject: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> > >
> > > Integrate support for the AF_XDP CNI and device plugin [1] so that
> > > the DPDK AF_XDP PMD can work in an unprivileged container
> environment.
> > > Part of the AF_XDP PMD initialization process involves loading an
> > > eBPF program onto the given netdev. This operation requires
> > > privileges, which prevents the PMD from being able to work in an
> > > unprivileged container (without root access). The plugin CNI handles
> > > the program loading. CNI open Unix Domain Socket (UDS) and waits
> > > listening for a client to make requests over that UDS. The
> > > client(DPDK) connects and a "handshake" occurs, then the File
> > > Descriptor which points to the XSKMAP associated with the loaded
> > > eBPF program is handed over to the client. The client can then
> > > proceed with creating an AF_XDP socket and inserting the socket into
> > > the XSKMAP pointed to by the FD received on the
> > UDS.
> > >
> > > A new vdev arg "use_cni" is created to indicate user wishes to run
> > > the PMD in unprivileged mode and to receive the XSKMAP FD from the
> CNI.
> > > When this flag is set, the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD
> > > libbpf flag should be used when creating the socket, which tells
> > > libbpf not to load the default libbpf program on the netdev. We tell
> > > libbpf not to do this because the loading is handled by the CNI in this
> scenario.
> > >
> > > [1]: https://github.com/intel/afxdp-plugins-for-kubernetes
> > >
> > > Signed-off-by: Shibin Koikkara Reeny
> > > <shibin.koikkara.reeny@intel.com>
> > > ---
> > >  drivers/net/af_xdp/rte_eth_af_xdp.c | 337
> > > +++++++++++++++++++++++++++-
> > >  1 file changed, 325 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > index b6ec9bf490..196d98ad97 100644
> > > --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > @@ -7,6 +7,7 @@
> > >  #include <string.h>
> > >  #include <netinet/in.h>
> > >  #include <net/if.h>
> > > +#include <sys/un.h>
> > >  #include <sys/socket.h>
> > >  #include <sys/ioctl.h>
> > >  #include <linux/if_ether.h>
> > > @@ -81,6 +82,24 @@ RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype,
> > > NOTICE);
> > >
> > >  #define ETH_AF_XDP_MP_KEY "afxdp_mp_send_fds"
> > >
> > > +#define MAX_LONG_OPT_SZ			64
> > > +#define UDS_MAX_FD_NUM			2
> > > +#define UDS_MAX_CMD_LEN			64
> > > +#define UDS_MAX_CMD_RESP		128
> > > +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> > > +#define UDS_SOCK			"/tmp/afxdp.sock"
> > > +#define UDS_CONNECT_MSG			"/connect"
> > > +#define UDS_HOST_OK_MSG			"/host_ok"
> > > +#define UDS_HOST_NAK_MSG		"/host_nak"
> > > +#define UDS_VERSION_MSG			"/version"
> > > +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> > > +#define UDS_XSK_SOCKET_MSG		"/xsk_socket"
> > > +#define UDS_FD_ACK_MSG			"/fd_ack"
> > > +#define UDS_FD_NAK_MSG			"/fd_nak"
> > > +#define UDS_FIN_MSG			"/fin"
> > > +#define UDS_FIN_ACK_MSG			"/fin_ack"
> > > +
> > > +
> > >  static int afxdp_dev_count;
> > >
> > >  /* Message header to synchronize fds via IPC */ @@ -151,6 +170,7 @@
> > > struct pmd_internals {
> > >  	char prog_path[PATH_MAX];
> > >  	bool custom_prog_configured;
> > >  	bool force_copy;
> > > +	bool use_cni;
> > >  	struct bpf_map *map;
> > >
> > >  	struct rte_ether_addr eth_addr;
> > > @@ -170,6 +190,7 @@ struct pmd_process_private {
> > >  #define ETH_AF_XDP_PROG_ARG			"xdp_prog"
> > >  #define ETH_AF_XDP_BUDGET_ARG
> > 	"busy_budget"
> > >  #define ETH_AF_XDP_FORCE_COPY_ARG		"force_copy"
> > > +#define ETH_AF_XDP_USE_CNI_ARG			"use_cni"
> > >
> > >  static const char * const valid_arguments[] = {
> > >  	ETH_AF_XDP_IFACE_ARG,
> > > @@ -179,8 +200,8 @@ static const char * const valid_arguments[] = {
> > >  	ETH_AF_XDP_PROG_ARG,
> > >  	ETH_AF_XDP_BUDGET_ARG,
> > >  	ETH_AF_XDP_FORCE_COPY_ARG,
> > > -	NULL
> > > -};
> > > +	ETH_AF_XDP_USE_CNI_ARG,
> > > +	NULL};
> > >
> > >  static const struct rte_eth_link pmd_link = {
> > >  	.link_speed = RTE_ETH_SPEED_NUM_10G, @@ -1129,7 +1150,8
> @@
> > > xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> > >  		ret = xsk_umem__create(&umem->umem, base_addr,
> > umem_size,
> > >  				&rxq->fq, &rxq->cq, &usr_config);
> > >  		if (ret) {
> > > -			AF_XDP_LOG(ERR, "Failed to create umem\n");
> > > +			AF_XDP_LOG(ERR, "Failed to create umem [%d]:
> > > [%s]\n",
> > > +				   errno, strerror(errno));
> > >  			goto err;
> > >  		}
> > >  		umem->buffer = base_addr;
> > > @@ -1314,6 +1336,245 @@ configure_preferred_busy_poll(struct
> > > pkt_rx_queue *rxq)
> > >  	return 0;
> > >  }
> > >
> > > +static int
> > > +init_uds_sock(struct sockaddr_un *server) {
> > > +	int sock;
> > > +
> > > +	sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> > > +	if (sock < 0) {
> > > +		AF_XDP_LOG(ERR, "Failed to opening stream socket\n");
> > > +		return -1;
> > > +	}
> > > +
> > > +	server->sun_family = AF_UNIX;
> > > +	strlcpy(server->sun_path, UDS_SOCK, sizeof(server->sun_path));
> > > +
> > > +	if (connect(sock, (struct sockaddr *)server, sizeof(struct
> > > sockaddr_un)) < 0) {
> >
> > 	seems the server address is hard coded as  "/tmp/afxdp.sock", is any
> > spec we follows, or should we parse this as a devargs?
> > 	better add some comment or external link that help to explain this
> 
> It was already hardcoded in the afxdp-plugins
> https://github.com/intel/afxdp-plugins-for-
> kubernetes/blob/main/constants/constants.go .

OK, I saw this has been explained in your new doc patch.

Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>

> 
> If we want to go with the devargs we will still need to put a default
> hardcoded value as there is no documentation showing the where and what
> is the name of the socket in the repo without going through the code.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
  2022-12-14 15:41 ` Shibin Koikkara Reeny
  2023-01-18 12:10   ` Zhang, Qi Z
@ 2023-01-25 16:43   ` Burakov, Anatoly
  1 sibling, 0 replies; 8+ messages in thread
From: Burakov, Anatoly @ 2023-01-25 16:43 UTC (permalink / raw)
  To: Shibin Koikkara Reeny, dev, bruce.richardson; +Cc: ciara.loftus, qi.z.zhang

On 12/14/2022 3:41 PM, Shibin Koikkara Reeny wrote:
> Integrate support for the AF_XDP CNI and device plugin [1] so that the
> DPDK AF_XDP PMD can work in an unprivileged container environment.
> Part of the AF_XDP PMD initialization process involves loading
> an eBPF program onto the given netdev. This operation requires
> privileges, which prevents the PMD from being able to work in an
> unprivileged container (without root access). The plugin CNI handles
> the program loading. CNI open Unix Domain Socket (UDS) and waits
> listening for a client to make requests over that UDS. The client(DPDK)
> connects and a "handshake" occurs, then the File Descriptor which points
> to the XSKMAP associated with the loaded eBPF program is handed over
> to the client. The client can then proceed with creating an AF_XDP
> socket and inserting the socket into the XSKMAP pointed to by the
> FD received on the UDS.
> 
> A new vdev arg "use_cni" is created to indicate user wishes to run
> the PMD in unprivileged mode and to receive the XSKMAP FD from the CNI.
> When this flag is set, the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD libbpf flag
> should be used when creating the socket, which tells libbpf not to load the
> default libbpf program on the netdev. We tell libbpf not to do this because
> the loading is handled by the CNI in this scenario.
> 
> [1]: https://github.com/intel/afxdp-plugins-for-kubernetes
> 
> Signed-off-by: Shibin Koikkara Reeny <shibin.koikkara.reeny@intel.com>
> ---

Tested-by: Anatoly Burakov <anatoly.burakov@intel.com>

The testing setup is a bit involved, but everything seems to work once 
all the pieces fall into place.

-- 
Thanks,
Anatoly


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
  2023-01-18 12:10   ` Zhang, Qi Z
@ 2023-01-19 15:10     ` Koikkara Reeny, Shibin
  2023-02-02 14:49       ` Zhang, Qi Z
  0 siblings, 1 reply; 8+ messages in thread
From: Koikkara Reeny, Shibin @ 2023-01-19 15:10 UTC (permalink / raw)
  To: Zhang, Qi Z, dev, Burakov, Anatoly, Richardson, Bruce; +Cc: Loftus, Ciara


> -----Original Message-----
> From: Zhang, Qi Z <qi.z.zhang@intel.com>
> Sent: Wednesday, January 18, 2023 12:10 PM
> To: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>;
> dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>; Richardson,
> Bruce <bruce.richardson@intel.com>
> Cc: Loftus, Ciara <ciara.loftus@intel.com>
> Subject: RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> 
> 
> 
> > -----Original Message-----
> > From: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>
> > Sent: Wednesday, December 14, 2022 11:41 PM
> > To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> > Richardson, Bruce <bruce.richardson@intel.com>
> > Cc: Loftus, Ciara <ciara.loftus@intel.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>; Koikkara Reeny, Shibin
> > <shibin.koikkara.reeny@intel.com>
> > Subject: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> >
> > Integrate support for the AF_XDP CNI and device plugin [1] so that the
> > DPDK AF_XDP PMD can work in an unprivileged container environment.
> > Part of the AF_XDP PMD initialization process involves loading an eBPF
> > program onto the given netdev. This operation requires privileges,
> > which prevents the PMD from being able to work in an unprivileged
> > container (without root access). The plugin CNI handles the program
> > loading. CNI open Unix Domain Socket (UDS) and waits listening for a
> > client to make requests over that UDS. The client(DPDK) connects and a
> > "handshake" occurs, then the File Descriptor which points to the
> > XSKMAP associated with the loaded eBPF program is handed over to the
> > client. The client can then proceed with creating an AF_XDP socket and
> > inserting the socket into the XSKMAP pointed to by the FD received on the
> UDS.
> >
> > A new vdev arg "use_cni" is created to indicate user wishes to run the
> > PMD in unprivileged mode and to receive the XSKMAP FD from the CNI.
> > When this flag is set, the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD libbpf
> > flag should be used when creating the socket, which tells libbpf not
> > to load the default libbpf program on the netdev. We tell libbpf not
> > to do this because the loading is handled by the CNI in this scenario.
> >
> > [1]: https://github.com/intel/afxdp-plugins-for-kubernetes
> >
> > Signed-off-by: Shibin Koikkara Reeny <shibin.koikkara.reeny@intel.com>
> > ---
> >  drivers/net/af_xdp/rte_eth_af_xdp.c | 337
> > +++++++++++++++++++++++++++-
> >  1 file changed, 325 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > index b6ec9bf490..196d98ad97 100644
> > --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > @@ -7,6 +7,7 @@
> >  #include <string.h>
> >  #include <netinet/in.h>
> >  #include <net/if.h>
> > +#include <sys/un.h>
> >  #include <sys/socket.h>
> >  #include <sys/ioctl.h>
> >  #include <linux/if_ether.h>
> > @@ -81,6 +82,24 @@ RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype,
> > NOTICE);
> >
> >  #define ETH_AF_XDP_MP_KEY "afxdp_mp_send_fds"
> >
> > +#define MAX_LONG_OPT_SZ			64
> > +#define UDS_MAX_FD_NUM			2
> > +#define UDS_MAX_CMD_LEN			64
> > +#define UDS_MAX_CMD_RESP		128
> > +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> > +#define UDS_SOCK			"/tmp/afxdp.sock"
> > +#define UDS_CONNECT_MSG			"/connect"
> > +#define UDS_HOST_OK_MSG			"/host_ok"
> > +#define UDS_HOST_NAK_MSG		"/host_nak"
> > +#define UDS_VERSION_MSG			"/version"
> > +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> > +#define UDS_XSK_SOCKET_MSG		"/xsk_socket"
> > +#define UDS_FD_ACK_MSG			"/fd_ack"
> > +#define UDS_FD_NAK_MSG			"/fd_nak"
> > +#define UDS_FIN_MSG			"/fin"
> > +#define UDS_FIN_ACK_MSG			"/fin_ack"
> > +
> > +
> >  static int afxdp_dev_count;
> >
> >  /* Message header to synchronize fds via IPC */ @@ -151,6 +170,7 @@
> > struct pmd_internals {
> >  	char prog_path[PATH_MAX];
> >  	bool custom_prog_configured;
> >  	bool force_copy;
> > +	bool use_cni;
> >  	struct bpf_map *map;
> >
> >  	struct rte_ether_addr eth_addr;
> > @@ -170,6 +190,7 @@ struct pmd_process_private {
> >  #define ETH_AF_XDP_PROG_ARG			"xdp_prog"
> >  #define ETH_AF_XDP_BUDGET_ARG
> 	"busy_budget"
> >  #define ETH_AF_XDP_FORCE_COPY_ARG		"force_copy"
> > +#define ETH_AF_XDP_USE_CNI_ARG			"use_cni"
> >
> >  static const char * const valid_arguments[] = {
> >  	ETH_AF_XDP_IFACE_ARG,
> > @@ -179,8 +200,8 @@ static const char * const valid_arguments[] = {
> >  	ETH_AF_XDP_PROG_ARG,
> >  	ETH_AF_XDP_BUDGET_ARG,
> >  	ETH_AF_XDP_FORCE_COPY_ARG,
> > -	NULL
> > -};
> > +	ETH_AF_XDP_USE_CNI_ARG,
> > +	NULL};
> >
> >  static const struct rte_eth_link pmd_link = {
> >  	.link_speed = RTE_ETH_SPEED_NUM_10G, @@ -1129,7 +1150,8 @@
> > xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> >  		ret = xsk_umem__create(&umem->umem, base_addr,
> umem_size,
> >  				&rxq->fq, &rxq->cq, &usr_config);
> >  		if (ret) {
> > -			AF_XDP_LOG(ERR, "Failed to create umem\n");
> > +			AF_XDP_LOG(ERR, "Failed to create umem [%d]:
> > [%s]\n",
> > +				   errno, strerror(errno));
> >  			goto err;
> >  		}
> >  		umem->buffer = base_addr;
> > @@ -1314,6 +1336,245 @@ configure_preferred_busy_poll(struct
> > pkt_rx_queue *rxq)
> >  	return 0;
> >  }
> >
> > +static int
> > +init_uds_sock(struct sockaddr_un *server) {
> > +	int sock;
> > +
> > +	sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> > +	if (sock < 0) {
> > +		AF_XDP_LOG(ERR, "Failed to opening stream socket\n");
> > +		return -1;
> > +	}
> > +
> > +	server->sun_family = AF_UNIX;
> > +	strlcpy(server->sun_path, UDS_SOCK, sizeof(server->sun_path));
> > +
> > +	if (connect(sock, (struct sockaddr *)server, sizeof(struct
> > sockaddr_un)) < 0) {
> 
> 	seems the server address is hard coded as  "/tmp/afxdp.sock", is any
> spec we follows, or should we parse this as a devargs?
> 	better add some comment or external link that help to explain this

It was already hardcoded in the afxdp-plugins https://github.com/intel/afxdp-plugins-for-kubernetes/blob/main/constants/constants.go .

If we want to go with the devargs we will still need to put a default hardcoded value as there is no documentation showing the where and what is the name of the socket in the repo without going through the code.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
  2022-12-14 15:41 ` Shibin Koikkara Reeny
@ 2023-01-18 12:10   ` Zhang, Qi Z
  2023-01-19 15:10     ` Koikkara Reeny, Shibin
  2023-01-25 16:43   ` Burakov, Anatoly
  1 sibling, 1 reply; 8+ messages in thread
From: Zhang, Qi Z @ 2023-01-18 12:10 UTC (permalink / raw)
  To: Koikkara Reeny, Shibin, dev, Burakov, Anatoly, Richardson, Bruce
  Cc: Loftus, Ciara



> -----Original Message-----
> From: Koikkara Reeny, Shibin <shibin.koikkara.reeny@intel.com>
> Sent: Wednesday, December 14, 2022 11:41 PM
> To: dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>
> Cc: Loftus, Ciara <ciara.loftus@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Koikkara Reeny, Shibin
> <shibin.koikkara.reeny@intel.com>
> Subject: [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
> 
> Integrate support for the AF_XDP CNI and device plugin [1] so that the DPDK
> AF_XDP PMD can work in an unprivileged container environment.
> Part of the AF_XDP PMD initialization process involves loading an eBPF
> program onto the given netdev. This operation requires privileges, which
> prevents the PMD from being able to work in an unprivileged container
> (without root access). The plugin CNI handles the program loading. CNI open
> Unix Domain Socket (UDS) and waits listening for a client to make requests
> over that UDS. The client(DPDK) connects and a "handshake" occurs, then
> the File Descriptor which points to the XSKMAP associated with the loaded
> eBPF program is handed over to the client. The client can then proceed with
> creating an AF_XDP socket and inserting the socket into the XSKMAP pointed
> to by the FD received on the UDS.
> 
> A new vdev arg "use_cni" is created to indicate user wishes to run the PMD in
> unprivileged mode and to receive the XSKMAP FD from the CNI.
> When this flag is set, the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD libbpf
> flag should be used when creating the socket, which tells libbpf not to load
> the default libbpf program on the netdev. We tell libbpf not to do this
> because the loading is handled by the CNI in this scenario.
> 
> [1]: https://github.com/intel/afxdp-plugins-for-kubernetes
> 
> Signed-off-by: Shibin Koikkara Reeny <shibin.koikkara.reeny@intel.com>
> ---
>  drivers/net/af_xdp/rte_eth_af_xdp.c | 337
> +++++++++++++++++++++++++++-
>  1 file changed, 325 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
> b/drivers/net/af_xdp/rte_eth_af_xdp.c
> index b6ec9bf490..196d98ad97 100644
> --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> @@ -7,6 +7,7 @@
>  #include <string.h>
>  #include <netinet/in.h>
>  #include <net/if.h>
> +#include <sys/un.h>
>  #include <sys/socket.h>
>  #include <sys/ioctl.h>
>  #include <linux/if_ether.h>
> @@ -81,6 +82,24 @@ RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype,
> NOTICE);
> 
>  #define ETH_AF_XDP_MP_KEY "afxdp_mp_send_fds"
> 
> +#define MAX_LONG_OPT_SZ			64
> +#define UDS_MAX_FD_NUM			2
> +#define UDS_MAX_CMD_LEN			64
> +#define UDS_MAX_CMD_RESP		128
> +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> +#define UDS_SOCK			"/tmp/afxdp.sock"
> +#define UDS_CONNECT_MSG			"/connect"
> +#define UDS_HOST_OK_MSG			"/host_ok"
> +#define UDS_HOST_NAK_MSG		"/host_nak"
> +#define UDS_VERSION_MSG			"/version"
> +#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
> +#define UDS_XSK_SOCKET_MSG		"/xsk_socket"
> +#define UDS_FD_ACK_MSG			"/fd_ack"
> +#define UDS_FD_NAK_MSG			"/fd_nak"
> +#define UDS_FIN_MSG			"/fin"
> +#define UDS_FIN_ACK_MSG			"/fin_ack"
> +
> +
>  static int afxdp_dev_count;
> 
>  /* Message header to synchronize fds via IPC */ @@ -151,6 +170,7 @@
> struct pmd_internals {
>  	char prog_path[PATH_MAX];
>  	bool custom_prog_configured;
>  	bool force_copy;
> +	bool use_cni;
>  	struct bpf_map *map;
> 
>  	struct rte_ether_addr eth_addr;
> @@ -170,6 +190,7 @@ struct pmd_process_private {
>  #define ETH_AF_XDP_PROG_ARG			"xdp_prog"
>  #define ETH_AF_XDP_BUDGET_ARG			"busy_budget"
>  #define ETH_AF_XDP_FORCE_COPY_ARG		"force_copy"
> +#define ETH_AF_XDP_USE_CNI_ARG			"use_cni"
> 
>  static const char * const valid_arguments[] = {
>  	ETH_AF_XDP_IFACE_ARG,
> @@ -179,8 +200,8 @@ static const char * const valid_arguments[] = {
>  	ETH_AF_XDP_PROG_ARG,
>  	ETH_AF_XDP_BUDGET_ARG,
>  	ETH_AF_XDP_FORCE_COPY_ARG,
> -	NULL
> -};
> +	ETH_AF_XDP_USE_CNI_ARG,
> +	NULL};
> 
>  static const struct rte_eth_link pmd_link = {
>  	.link_speed = RTE_ETH_SPEED_NUM_10G,
> @@ -1129,7 +1150,8 @@ xsk_umem_info *xdp_umem_configure(struct
> pmd_internals *internals,
>  		ret = xsk_umem__create(&umem->umem, base_addr,
> umem_size,
>  				&rxq->fq, &rxq->cq, &usr_config);
>  		if (ret) {
> -			AF_XDP_LOG(ERR, "Failed to create umem\n");
> +			AF_XDP_LOG(ERR, "Failed to create umem [%d]:
> [%s]\n",
> +				   errno, strerror(errno));
>  			goto err;
>  		}
>  		umem->buffer = base_addr;
> @@ -1314,6 +1336,245 @@ configure_preferred_busy_poll(struct
> pkt_rx_queue *rxq)
>  	return 0;
>  }
> 
> +static int
> +init_uds_sock(struct sockaddr_un *server) {
> +	int sock;
> +
> +	sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> +	if (sock < 0) {
> +		AF_XDP_LOG(ERR, "Failed to opening stream socket\n");
> +		return -1;
> +	}
> +
> +	server->sun_family = AF_UNIX;
> +	strlcpy(server->sun_path, UDS_SOCK, sizeof(server->sun_path));
> +
> +	if (connect(sock, (struct sockaddr *)server, sizeof(struct
> sockaddr_un)) < 0) {

	seems the server address is hard coded as  "/tmp/afxdp.sock", is any spec we follows, or should we parse this as a devargs?
	better add some comment or external link that help to explain this


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration
       [not found] <to=20221213104746.805221-1-shibin.koikkara.reeny@intel.com>
@ 2022-12-14 15:41 ` Shibin Koikkara Reeny
  2023-01-18 12:10   ` Zhang, Qi Z
  2023-01-25 16:43   ` Burakov, Anatoly
  0 siblings, 2 replies; 8+ messages in thread
From: Shibin Koikkara Reeny @ 2022-12-14 15:41 UTC (permalink / raw)
  To: dev, anatoly.burakov, bruce.richardson
  Cc: ciara.loftus, qi.z.zhang, Shibin Koikkara Reeny

Integrate support for the AF_XDP CNI and device plugin [1] so that the
DPDK AF_XDP PMD can work in an unprivileged container environment.
Part of the AF_XDP PMD initialization process involves loading
an eBPF program onto the given netdev. This operation requires
privileges, which prevents the PMD from being able to work in an
unprivileged container (without root access). The plugin CNI handles
the program loading. CNI open Unix Domain Socket (UDS) and waits
listening for a client to make requests over that UDS. The client(DPDK)
connects and a "handshake" occurs, then the File Descriptor which points
to the XSKMAP associated with the loaded eBPF program is handed over
to the client. The client can then proceed with creating an AF_XDP
socket and inserting the socket into the XSKMAP pointed to by the
FD received on the UDS.

A new vdev arg "use_cni" is created to indicate user wishes to run
the PMD in unprivileged mode and to receive the XSKMAP FD from the CNI.
When this flag is set, the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD libbpf flag
should be used when creating the socket, which tells libbpf not to load the
default libbpf program on the netdev. We tell libbpf not to do this because
the loading is handled by the CNI in this scenario.

[1]: https://github.com/intel/afxdp-plugins-for-kubernetes

Signed-off-by: Shibin Koikkara Reeny <shibin.koikkara.reeny@intel.com>
---
 drivers/net/af_xdp/rte_eth_af_xdp.c | 337 +++++++++++++++++++++++++++-
 1 file changed, 325 insertions(+), 12 deletions(-)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
index b6ec9bf490..196d98ad97 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -7,6 +7,7 @@
 #include <string.h>
 #include <netinet/in.h>
 #include <net/if.h>
+#include <sys/un.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <linux/if_ether.h>
@@ -81,6 +82,24 @@ RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype, NOTICE);
 
 #define ETH_AF_XDP_MP_KEY "afxdp_mp_send_fds"
 
+#define MAX_LONG_OPT_SZ			64
+#define UDS_MAX_FD_NUM			2
+#define UDS_MAX_CMD_LEN			64
+#define UDS_MAX_CMD_RESP		128
+#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
+#define UDS_SOCK			"/tmp/afxdp.sock"
+#define UDS_CONNECT_MSG			"/connect"
+#define UDS_HOST_OK_MSG			"/host_ok"
+#define UDS_HOST_NAK_MSG		"/host_nak"
+#define UDS_VERSION_MSG			"/version"
+#define UDS_XSK_MAP_FD_MSG		"/xsk_map_fd"
+#define UDS_XSK_SOCKET_MSG		"/xsk_socket"
+#define UDS_FD_ACK_MSG			"/fd_ack"
+#define UDS_FD_NAK_MSG			"/fd_nak"
+#define UDS_FIN_MSG			"/fin"
+#define UDS_FIN_ACK_MSG			"/fin_ack"
+
+
 static int afxdp_dev_count;
 
 /* Message header to synchronize fds via IPC */
@@ -151,6 +170,7 @@ struct pmd_internals {
 	char prog_path[PATH_MAX];
 	bool custom_prog_configured;
 	bool force_copy;
+	bool use_cni;
 	struct bpf_map *map;
 
 	struct rte_ether_addr eth_addr;
@@ -170,6 +190,7 @@ struct pmd_process_private {
 #define ETH_AF_XDP_PROG_ARG			"xdp_prog"
 #define ETH_AF_XDP_BUDGET_ARG			"busy_budget"
 #define ETH_AF_XDP_FORCE_COPY_ARG		"force_copy"
+#define ETH_AF_XDP_USE_CNI_ARG			"use_cni"
 
 static const char * const valid_arguments[] = {
 	ETH_AF_XDP_IFACE_ARG,
@@ -179,8 +200,8 @@ static const char * const valid_arguments[] = {
 	ETH_AF_XDP_PROG_ARG,
 	ETH_AF_XDP_BUDGET_ARG,
 	ETH_AF_XDP_FORCE_COPY_ARG,
-	NULL
-};
+	ETH_AF_XDP_USE_CNI_ARG,
+	NULL};
 
 static const struct rte_eth_link pmd_link = {
 	.link_speed = RTE_ETH_SPEED_NUM_10G,
@@ -1129,7 +1150,8 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
 		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
 				&rxq->fq, &rxq->cq, &usr_config);
 		if (ret) {
-			AF_XDP_LOG(ERR, "Failed to create umem\n");
+			AF_XDP_LOG(ERR, "Failed to create umem [%d]: [%s]\n",
+				   errno, strerror(errno));
 			goto err;
 		}
 		umem->buffer = base_addr;
@@ -1314,6 +1336,245 @@ configure_preferred_busy_poll(struct pkt_rx_queue *rxq)
 	return 0;
 }
 
+static int
+init_uds_sock(struct sockaddr_un *server)
+{
+	int sock;
+
+	sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+	if (sock < 0) {
+		AF_XDP_LOG(ERR, "Failed to opening stream socket\n");
+		return -1;
+	}
+
+	server->sun_family = AF_UNIX;
+	strlcpy(server->sun_path, UDS_SOCK, sizeof(server->sun_path));
+
+	if (connect(sock, (struct sockaddr *)server, sizeof(struct sockaddr_un)) < 0) {
+		close(sock);
+		AF_XDP_LOG(ERR, "Error connecting stream socket errno = [%d]: [%s]\n",
+			   errno, strerror(errno));
+		return -1;
+	}
+
+	return sock;
+}
+
+struct msg_internal {
+	char response[UDS_MAX_CMD_RESP];
+	int len_param;
+	int num_fds;
+	int fds[UDS_MAX_FD_NUM];
+};
+
+static int
+send_msg(int sock, char *request, int *fd)
+{
+	int snd;
+	struct iovec iov;
+	struct msghdr msgh;
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+	char control[CMSG_SPACE(sizeof(*fd))];
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	strlcpy(dst.sun_path, UDS_SOCK, sizeof(dst.sun_path));
+
+	/* Initialize message header structure */
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+	iov.iov_base = request;
+	iov.iov_len = strlen(request);
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	/* Translate the FD. */
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(sizeof(*fd));
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), fd, sizeof(*fd));
+
+	/* Send the request message. */
+	do {
+		snd = sendmsg(sock, &msgh, 0);
+	} while (snd < 0 && errno == EINTR);
+
+	return snd;
+}
+
+static int
+read_msg(int sock, char *response, struct sockaddr_un *s, int *fd)
+{
+	int msglen;
+	struct msghdr msgh;
+	struct iovec iov;
+	char control[CMSG_SPACE(sizeof(*fd))];
+	struct cmsghdr *cmsg;
+
+	/* Initialize message header structure */
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = response;
+	iov.iov_len = UDS_MAX_CMD_RESP;
+
+	msgh.msg_name = s;
+	msgh.msg_namelen = sizeof(*s);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	msglen = recvmsg(sock, &msgh, 0);
+
+	/* zero length message means socket was closed */
+	if (msglen == 0)
+		return 0;
+
+	if (msglen < 0) {
+		AF_XDP_LOG(ERR, "recvmsg failed, %s\n", strerror(errno));
+		return -1;
+	}
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+			cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if (cmsg->cmsg_level == SOL_SOCKET &&
+				cmsg->cmsg_type == SCM_RIGHTS) {
+			memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd));
+			break;
+		}
+	}
+
+	response[msglen] = '\0';
+	return msglen;
+}
+
+static int
+make_request_cni(int sock, struct sockaddr_un *server, char *request,
+		 int *req_fd, char *response, int *out_fd)
+{
+	int rval;
+
+	AF_XDP_LOG(INFO, "Request: [%s]\n", request);
+
+	/* if no file descriptor to send then directly write to socket.
+	 * else use sendmsg() to send the file descriptor.
+	 */
+	if (req_fd == NULL)
+		rval = write(sock, request, strlen(request));
+	else
+		rval = send_msg(sock, request, req_fd);
+
+	if (rval < 0) {
+		AF_XDP_LOG(ERR, "Write error %s\n", strerror(errno));
+		return -1;
+	}
+
+	rval = read_msg(sock, response, server, out_fd);
+	if (rval <= 0) {
+		AF_XDP_LOG(ERR, "Read error %d\n", rval);
+		return -1;
+	}
+	AF_XDP_LOG(INFO, "Response: [%s]\n", request);
+
+	return 0;
+}
+
+static int
+check_response(char *response, char *exp_resp, long size)
+{
+	return strncmp(response, exp_resp, size);
+}
+
+static int
+get_cni_fd(char *if_name)
+{
+	char request[UDS_MAX_CMD_LEN], response[UDS_MAX_CMD_RESP];
+	char hostname[MAX_LONG_OPT_SZ], exp_resp[UDS_MAX_CMD_RESP];
+	struct sockaddr_un server;
+	int xsk_map_fd = -1, out_fd = 0;
+	int sock, err;
+
+	err = gethostname(hostname, MAX_LONG_OPT_SZ - 1);
+	if (err)
+		return -1;
+
+	memset(&server, 0, sizeof(server));
+	sock = init_uds_sock(&server);
+
+	/* Initiates handshake to CNI send: /connect,hostname */
+	snprintf(request, sizeof(request), "%s,%s", UDS_CONNECT_MSG, hostname);
+	memset(response, 0, sizeof(response));
+	if (make_request_cni(sock, &server, request, NULL, response, &out_fd) < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	/* Expect /host_ok */
+	strlcpy(exp_resp, UDS_HOST_OK_MSG, UDS_MAX_CMD_LEN);
+	if (check_response(response, exp_resp, strlen(exp_resp)) < 0) {
+		AF_XDP_LOG(ERR, "Unexpected response [%s]\n", response);
+		goto err_close;
+	}
+	/* Request for "/version" */
+	strlcpy(request, UDS_VERSION_MSG, UDS_MAX_CMD_LEN);
+	memset(response, 0, sizeof(response));
+	if (make_request_cni(sock, &server, request, NULL, response, &out_fd) < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	/* Request for file descriptor for netdev name*/
+	snprintf(request, sizeof(request), "%s,%s", UDS_XSK_MAP_FD_MSG, if_name);
+	memset(response, 0, sizeof(response));
+	if (make_request_cni(sock, &server, request, NULL, response, &out_fd) < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	if (out_fd < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	xsk_map_fd = out_fd;
+
+	/* Expect fd_ack with file descriptor */
+	strlcpy(exp_resp, UDS_FD_ACK_MSG, UDS_MAX_CMD_LEN);
+	if (check_response(response, exp_resp, strlen(exp_resp)) < 0) {
+		AF_XDP_LOG(ERR, "Unexpected response [%s]\n", response);
+		goto err_close;
+	}
+
+	/* Initiate close connection */
+	strlcpy(request, UDS_FIN_MSG, UDS_MAX_CMD_LEN);
+	memset(response, 0, sizeof(response));
+	if (make_request_cni(sock, &server, request, NULL, response, &out_fd) < 0) {
+		AF_XDP_LOG(ERR, "Error in processing cmd [%s]\n", request);
+		goto err_close;
+	}
+
+	/* Connection close */
+	strlcpy(exp_resp, UDS_FIN_ACK_MSG, UDS_MAX_CMD_LEN);
+	if (check_response(response, exp_resp, strlen(exp_resp)) < 0) {
+		AF_XDP_LOG(ERR, "Unexpected response [%s]\n", response);
+		goto err_close;
+	}
+	close(sock);
+
+	return xsk_map_fd;
+
+err_close:
+	close(sock);
+	return -1;
+}
+
 static int
 xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
 	      int ring_size)
@@ -1362,6 +1623,10 @@ xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
 	cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
 #endif
 
+	/* Disable libbpf from loading XDP program */
+	if (internals->use_cni)
+		cfg.libbpf_flags |= XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
+
 	if (strnlen(internals->prog_path, PATH_MAX)) {
 		if (!internals->custom_prog_configured) {
 			ret = load_custom_xdp_prog(internals->prog_path,
@@ -1413,7 +1678,23 @@ xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
 		}
 	}
 
-	if (rxq->busy_budget) {
+	if (internals->use_cni) {
+		int err, fd, map_fd;
+
+		/* get socket fd from CNI plugin */
+		map_fd = get_cni_fd(internals->if_name);
+		if (map_fd < 0) {
+			AF_XDP_LOG(ERR, "Failed to receive CNI plugin fd\n");
+			goto out_xsk;
+		}
+		/* get socket fd */
+		fd = xsk_socket__fd(rxq->xsk);
+		err = bpf_map_update_elem(map_fd, &rxq->xsk_queue_idx, &fd, 0);
+		if (err) {
+			AF_XDP_LOG(ERR, "Failed to insert unprivileged xsk in map.\n");
+			goto out_xsk;
+		}
+	} else if (rxq->busy_budget) {
 		ret = configure_preferred_busy_poll(rxq);
 		if (ret) {
 			AF_XDP_LOG(ERR, "Failed configure busy polling.\n");
@@ -1584,6 +1865,26 @@ static const struct eth_dev_ops ops = {
 	.get_monitor_addr = eth_get_monitor_addr,
 };
 
+/* CNI option works in unprivileged container environment
+ * and ethernet device functionality will be reduced. So
+ * additional customiszed eth_dev_ops struct is needed
+ * for cni.
+ **/
+static const struct eth_dev_ops ops_cni = {
+	.dev_start = eth_dev_start,
+	.dev_stop = eth_dev_stop,
+	.dev_close = eth_dev_close,
+	.dev_configure = eth_dev_configure,
+	.dev_infos_get = eth_dev_info,
+	.mtu_set = eth_dev_mtu_set,
+	.rx_queue_setup = eth_rx_queue_setup,
+	.tx_queue_setup = eth_tx_queue_setup,
+	.link_update = eth_link_update,
+	.stats_get = eth_stats_get,
+	.stats_reset = eth_stats_reset,
+	.get_monitor_addr = eth_get_monitor_addr,
+};
+
 /** parse busy_budget argument */
 static int
 parse_budget_arg(const char *key __rte_unused,
@@ -1704,8 +2005,8 @@ xdp_get_channels_info(const char *if_name, int *max_queues,
 
 static int
 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
-			int *queue_cnt, int *shared_umem, char *prog_path,
-			int *busy_budget, int *force_copy)
+		 int *queue_cnt, int *shared_umem, char *prog_path,
+		 int *busy_budget, int *force_copy, int *use_cni)
 {
 	int ret;
 
@@ -1746,6 +2047,11 @@ parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
 	if (ret < 0)
 		goto free_kvlist;
 
+	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_USE_CNI_ARG,
+				 &parse_integer_arg, use_cni);
+	if (ret < 0)
+		goto free_kvlist;
+
 free_kvlist:
 	rte_kvargs_free(kvlist);
 	return ret;
@@ -1783,8 +2089,9 @@ get_iface_info(const char *if_name,
 
 static struct rte_eth_dev *
 init_internals(struct rte_vdev_device *dev, const char *if_name,
-		int start_queue_idx, int queue_cnt, int shared_umem,
-		const char *prog_path, int busy_budget, int force_copy)
+	       int start_queue_idx, int queue_cnt, int shared_umem,
+	       const char *prog_path, int busy_budget, int force_copy,
+	       int use_cni)
 {
 	const char *name = rte_vdev_device_name(dev);
 	const unsigned int numa_node = dev->device.numa_node;
@@ -1813,6 +2120,7 @@ init_internals(struct rte_vdev_device *dev, const char *if_name,
 #endif
 	internals->shared_umem = shared_umem;
 	internals->force_copy = force_copy;
+	internals->use_cni = use_cni;
 
 	if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
 				  &internals->combined_queue_cnt)) {
@@ -1871,7 +2179,11 @@ init_internals(struct rte_vdev_device *dev, const char *if_name,
 	eth_dev->data->dev_link = pmd_link;
 	eth_dev->data->mac_addrs = &internals->eth_addr;
 	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
-	eth_dev->dev_ops = &ops;
+	if (!internals->use_cni)
+		eth_dev->dev_ops = &ops;
+	else
+		eth_dev->dev_ops = &ops_cni;
+
 	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
 	eth_dev->tx_pkt_burst = eth_af_xdp_tx;
 	eth_dev->process_private = process_private;
@@ -1998,6 +2310,7 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 	char prog_path[PATH_MAX] = {'\0'};
 	int busy_budget = -1, ret;
 	int force_copy = 0;
+	int use_cni = 0;
 	struct rte_eth_dev *eth_dev = NULL;
 	const char *name = rte_vdev_device_name(dev);
 
@@ -2043,7 +2356,7 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 
 	if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
 			     &xsk_queue_cnt, &shared_umem, prog_path,
-			     &busy_budget, &force_copy) < 0) {
+			     &busy_budget, &force_copy, &use_cni) < 0) {
 		AF_XDP_LOG(ERR, "Invalid kvargs value\n");
 		return -EINVAL;
 	}
@@ -2057,8 +2370,8 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 					busy_budget;
 
 	eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
-					xsk_queue_cnt, shared_umem, prog_path,
-					busy_budget, force_copy);
+				 xsk_queue_cnt, shared_umem, prog_path,
+				 busy_budget, force_copy, use_cni);
 	if (eth_dev == NULL) {
 		AF_XDP_LOG(ERR, "Failed to init internals\n");
 		return -1;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2023-02-02 17:03 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-14 15:34 [PATCH v2] net/af_xdp: AF_XDP PMD CNI Integration Shibin Koikkara Reeny
     [not found] <to=20221213104746.805221-1-shibin.koikkara.reeny@intel.com>
2022-12-14 15:41 ` Shibin Koikkara Reeny
2023-01-18 12:10   ` Zhang, Qi Z
2023-01-19 15:10     ` Koikkara Reeny, Shibin
2023-02-02 14:49       ` Zhang, Qi Z
2023-02-02 15:18         ` Bruce Richardson
2023-02-02 17:02           ` Koikkara Reeny, Shibin
2023-01-25 16:43   ` Burakov, Anatoly

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).