* [RFC] eal: increase the number of availble file descriptors for MP
@ 2024-03-08 18:54 Stephen Hemminger
  2024-03-08 20:36 ` [RFC v2] eal: increase passed max multi-process file descriptors Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Stephen Hemminger @ 2024-03-08 18:54 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Anatoly Burakov
The current limit on file descriptors is too low; it should have
been set to the maximum that can be sent across a Unix domain
socket.
This is an attempt to allow increasing it without breaking ABI.
But in the process it exposes what is broken about how symbol
versions are checked in check-symbol-maps.sh. That script is
broken in that it won't allow adding a backward-compatible
version hook like this.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/eal/common/eal_common_proc.c | 118 ++++++++++++++++++++++++++-----
 lib/eal/common/meson.build       |   2 +
 lib/eal/include/rte_eal.h        |  13 +++-
 lib/eal/version.map              |   9 +++
 4 files changed, 125 insertions(+), 17 deletions(-)
diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
index d24093937c1d..c08113a8d9e0 100644
--- a/lib/eal/common/eal_common_proc.c
+++ b/lib/eal/common/eal_common_proc.c
@@ -27,6 +27,7 @@
 #include <rte_lcore.h>
 #include <rte_log.h>
 #include <rte_thread.h>
+#include <rte_function_versioning.h>
 
 #include "eal_memcfg.h"
 #include "eal_private.h"
@@ -796,7 +797,7 @@ mp_send(struct rte_mp_msg *msg, const char *peer, int type)
 }
 
 static int
-check_input(const struct rte_mp_msg *msg)
+check_input(const struct rte_mp_msg *msg, int max_fd)
 {
 	if (msg == NULL) {
 		EAL_LOG(ERR, "Msg cannot be NULL");
@@ -825,9 +826,8 @@ check_input(const struct rte_mp_msg *msg)
 		return -1;
 	}
 
-	if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
-		EAL_LOG(ERR, "Cannot send more than %d FDs",
-			RTE_MP_MAX_FD_NUM);
+	if (msg->num_fds > max_fd) {
+		EAL_LOG(ERR, "Cannot send more than %d FDs", max_fd);
 		rte_errno = E2BIG;
 		return -1;
 	}
@@ -835,13 +835,13 @@ check_input(const struct rte_mp_msg *msg)
 	return 0;
 }
 
-int
-rte_mp_sendmsg(struct rte_mp_msg *msg)
+static int
+mp_sendmsg(struct rte_mp_msg *msg, int max_fd)
 {
 	const struct internal_config *internal_conf =
 		eal_get_internal_configuration();
 
-	if (check_input(msg) != 0)
+	if (check_input(msg, max_fd) != 0)
 		return -1;
 
 	if (internal_conf->no_shconf) {
@@ -854,6 +854,24 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
 	return mp_send(msg, NULL, MP_MSG);
 }
 
+int rte_mp_sendmsg_V23(struct rte_mp_old_msg *msg);
+int rte_mp_sendmsg_V24(struct rte_mp_msg *msg);
+
+int
+rte_mp_sendmsg_V23(struct rte_mp_old_msg *omsg)
+{
+	return mp_sendmsg((struct rte_mp_msg *)omsg, RTE_MP_MAX_OLD_FD_NUM);
+}
+VERSION_SYMBOL(rte_mp_sendmsg, _V23, 23);
+
+int
+rte_mp_sendmsg_V24(struct rte_mp_msg *msg)
+{
+	return mp_sendmsg(msg, RTE_MP_MAX_FD_NUM);
+}
+BIND_DEFAULT_SYMBOL(rte_mp_sendmsg, _V24, 24);
+MAP_STATIC_SYMBOL(int rte_mp_sendmsg(struct rte_mp_msg *msg), rte_mp_sendmsg_V24);
+
 static int
 mp_request_async(const char *dst, struct rte_mp_msg *req,
 		struct async_request_param *param, const struct timespec *ts)
@@ -988,9 +1006,9 @@ mp_request_sync(const char *dst, struct rte_mp_msg *req,
 	return 0;
 }
 
-int
-rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
-		const struct timespec *ts)
+static int
+__rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		      const struct timespec *ts, int max_fd)
 {
 	int dir_fd, ret = -1;
 	DIR *mp_dir;
@@ -1005,7 +1023,7 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
 	reply->nb_received = 0;
 	reply->msgs = NULL;
 
-	if (check_input(req) != 0)
+	if (check_input(req, max_fd) != 0)
 		goto end;
 
 	if (internal_conf->no_shconf) {
@@ -1085,9 +1103,34 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
 	return ret;
 }
 
+int rte_mp_request_sync_V23(struct rte_mp_old_msg *req, struct rte_mp_reply *reply,
+			    const struct timespec *ts);
+int rte_mp_request_sync_V24(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+			    const struct timespec *ts);
+
+
+int
+rte_mp_request_sync_V23(struct rte_mp_old_msg *req, struct rte_mp_reply *reply,
+		    const struct timespec *ts)
+{
+	return __rte_mp_request_sync((struct rte_mp_msg *)req, reply, ts, RTE_MP_MAX_OLD_FD_NUM);
+}
+VERSION_SYMBOL(rte_mp_request_sync, _V23, 23);
+
 int
-rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
-		rte_mp_async_reply_t clb)
+rte_mp_request_sync_V24(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+			const struct timespec *ts)
+{
+	return __rte_mp_request_sync(req, reply, ts, RTE_MP_MAX_FD_NUM);
+}
+BIND_DEFAULT_SYMBOL(rte_mp_request_sync, _V24, 24);
+MAP_STATIC_SYMBOL(int rte_mp_request_sync(struct rte_mp_msg *req, \
+					  struct rte_mp_reply *reply, \
+					  const struct timespec *ts), rte_mp_request_sync_V24);
+
+static int
+__rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
+		       rte_mp_async_reply_t clb, int max_fd)
 {
 	struct rte_mp_msg *copy;
 	struct pending_request *dummy;
@@ -1104,7 +1147,7 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 
 	EAL_LOG(DEBUG, "request: %s", req->name);
 
-	if (check_input(req) != 0)
+	if (check_input(req, max_fd) != 0)
 		return -1;
 
 	if (internal_conf->no_shconf) {
@@ -1237,14 +1280,38 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
 	return -1;
 }
 
+int rte_mp_request_async_V23(struct rte_mp_old_msg *req, const struct timespec *ts,
+			     rte_mp_async_reply_t clb);
+int rte_mp_request_async_V24(struct rte_mp_msg *req, const struct timespec *ts,
+			     rte_mp_async_reply_t clb);
+
 int
-rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+rte_mp_request_async_V23(struct rte_mp_old_msg *req, const struct timespec *ts,
+			 rte_mp_async_reply_t clb)
+{
+	return __rte_mp_request_async((struct rte_mp_msg *)req, ts, clb, RTE_MP_MAX_OLD_FD_NUM);
+}
+VERSION_SYMBOL(rte_mp_request_async, _V23, 23);
+
+int
+rte_mp_request_async_V24(struct rte_mp_msg *req, const struct timespec *ts,
+			 rte_mp_async_reply_t clb)
+{
+	return __rte_mp_request_async(req, ts, clb, RTE_MP_MAX_FD_NUM);
+}
+BIND_DEFAULT_SYMBOL(rte_mp_request_async, _V24, 24);
+MAP_STATIC_SYMBOL(int rte_mp_request_async(struct rte_mp_msg *req,	\
+					   const struct timespec *ts,	\
+					   rte_mp_async_reply_t clb), rte_mp_request_async_V24);
+
+static int
+mp_reply(struct rte_mp_msg *msg, const char *peer, int max_fd)
 {
 	EAL_LOG(DEBUG, "reply: %s", msg->name);
 	const struct internal_config *internal_conf =
 		eal_get_internal_configuration();
 
-	if (check_input(msg) != 0)
+	if (check_input(msg, max_fd) != 0)
 		return -1;
 
 	if (peer == NULL) {
@@ -1261,6 +1328,25 @@ rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
 	return mp_send(msg, peer, MP_REP);
 }
 
+int rte_mp_reply_V23(struct rte_mp_old_msg *msg, const char *peer);
+int rte_mp_reply_V24(struct rte_mp_msg *msg, const char *peer);
+
+int
+rte_mp_reply_V23(struct rte_mp_old_msg *msg, const char *peer)
+{
+	return mp_reply((struct rte_mp_msg *)msg, peer, RTE_MP_MAX_OLD_FD_NUM);
+}
+VERSION_SYMBOL(rte_mp_reply, _V23, 23);
+
+int
+rte_mp_reply_V24(struct rte_mp_msg *msg, const char *peer)
+{
+	return mp_reply(msg, peer, RTE_MP_MAX_FD_NUM);
+}
+BIND_DEFAULT_SYMBOL(rte_mp_reply, _V24, 24);
+MAP_STATIC_SYMBOL(int rte_mp_reply(struct rte_mp_msg *msg, const char *peer), rte_mp_reply_V24);
+
+
 /* Internally, the status of the mp feature is represented as a three-state:
  * - "unknown" as long as no secondary process attached to a primary process
  *   and there was no call to rte_mp_disable yet,
diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build
index 22a626ba6fc7..3faf0c20e798 100644
--- a/lib/eal/common/meson.build
+++ b/lib/eal/common/meson.build
@@ -3,6 +3,8 @@
 
 includes += include_directories('.')
 
+use_function_versioning = true
+
 cflags += [ '-DABI_VERSION="@0@"'.format(abi_version) ]
 
 sources += files(
diff --git a/lib/eal/include/rte_eal.h b/lib/eal/include/rte_eal.h
index c2256f832e51..0d0761c50409 100644
--- a/lib/eal/include/rte_eal.h
+++ b/lib/eal/include/rte_eal.h
@@ -155,9 +155,10 @@ int rte_eal_primary_proc_alive(const char *config_file_path);
  */
 bool rte_mp_disable(void);
 
-#define RTE_MP_MAX_FD_NUM	8    /* The max amount of fds */
+#define RTE_MP_MAX_FD_NUM	253  /* The max number of fds (SCM_MAX_FD) */
 #define RTE_MP_MAX_NAME_LEN	64   /* The max length of action name */
 #define RTE_MP_MAX_PARAM_LEN	256  /* The max length of param */
+
 struct rte_mp_msg {
 	char name[RTE_MP_MAX_NAME_LEN];
 	int len_param;
@@ -166,6 +167,16 @@ struct rte_mp_msg {
 	int fds[RTE_MP_MAX_FD_NUM];
 };
 
+/* Legacy API version */
+#define RTE_MP_MAX_OLD_FD_NUM	8    /* The legacy limit on fds */
+struct rte_mp_old_msg {
+	char name[RTE_MP_MAX_NAME_LEN];
+	int len_param;
+	int num_fds;
+	uint8_t param[RTE_MP_MAX_PARAM_LEN];
+	int fds[RTE_MP_MAX_OLD_FD_NUM];
+};
+
 struct rte_mp_reply {
 	int nb_sent;
 	int nb_received;
diff --git a/lib/eal/version.map b/lib/eal/version.map
index c06ceaad5097..264ff2d0818b 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -344,6 +344,15 @@ DPDK_24 {
 	local: *;
 };
 
+DPDK_23 {
+	global:
+
+	rte_mp_reply;
+	rte_mp_request_async;
+	rte_mp_request_sync;
+	rte_mp_sendmsg;
+} DPDK_24;
+
 EXPERIMENTAL {
 	global:
 
-- 
2.43.0
^ permalink raw reply	[flat|nested] 6+ messages in thread
* [RFC v2] eal: increase passed max multi-process file descriptors
  2024-03-08 18:54 [RFC] eal: increase the number of availble file descriptors for MP Stephen Hemminger
@ 2024-03-08 20:36 ` Stephen Hemminger
  2024-03-14 15:23   ` Stephen Hemminger
  2024-03-09 18:12 ` [RFC v3] tap: do not duplicate fd's Stephen Hemminger
  2024-03-14 14:40 ` [RFC] eal: increase the number of availble file descriptors for MP David Marchand
  2 siblings, 1 reply; 6+ messages in thread
From: Stephen Hemminger @ 2024-03-08 20:36 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Anatoly Burakov, Jianfeng Tan
Both XDP and TAP devices are limited in the number of queues
because of the limit on the number of file descriptors that
are allowed. The original choice of 8 was too low; the allowed
maximum is 253 according to the unix(7) man page.
This may look like a serious ABI breakage but it is not.
It is simpler for everyone if the limit is increased rather than
building a parallel set of calls.
The case that matters is an older application registering MP support
with a newer version of EAL. In this case, since the old application
will always send the more compact structure (fewer possible fds),
it is OK.
Request (for up to 8 fds) sent to EAL.
   - EAL only references up to num_fds.
   - The area past the old fd array is not accessed.
Reply callback:
   - EAL will pass pointer to the new (larger structure),
     the old callback will only look at the first part of
     the fd array (num_fds <= 8).
   - Since primary and secondary must both be from the same DPDK version
     there is no normal way that a reply with more fd's could be possible.
     The only case is the same as above, where the application requested
     something that would break in the old version and now succeeds.
The one possible incompatibility is if an application passed
a larger number of fd's (32?) and expected an error. Now it will
succeed and get passed through.
Fixes: bacaa2754017 ("eal: add channel for multi-process communication")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
v2 - show the simpler way to address with some minor ABI issue
 doc/guides/rel_notes/release_24_03.rst | 4 ++++
 lib/eal/include/rte_eal.h              | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index 932688ca4d82..1d33cfa15dfb 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -225,6 +225,10 @@ API Changes
 * ethdev: Renamed structure ``rte_flow_action_modify_data`` to be
   ``rte_flow_field_data`` for more generic usage.
 
+* eal: The maximum number of file descriptors allowed to be passed in
+  multi-process requests is increased from 8 to 253, the maximum possible
+  on Linux Unix domain sockets. This allows for more queues on XDP and
+  TAP devices.
 
 ABI Changes
 -----------
diff --git a/lib/eal/include/rte_eal.h b/lib/eal/include/rte_eal.h
index c2256f832e51..cd84fcdd1bdb 100644
--- a/lib/eal/include/rte_eal.h
+++ b/lib/eal/include/rte_eal.h
@@ -155,7 +155,7 @@ int rte_eal_primary_proc_alive(const char *config_file_path);
  */
 bool rte_mp_disable(void);
 
-#define RTE_MP_MAX_FD_NUM	8    /* The max amount of fds */
+#define RTE_MP_MAX_FD_NUM	253    /* The max amount of fds */
 #define RTE_MP_MAX_NAME_LEN	64   /* The max length of action name */
 #define RTE_MP_MAX_PARAM_LEN	256  /* The max length of param */
 struct rte_mp_msg {
-- 
2.43.0
^ permalink raw reply	[flat|nested] 6+ messages in thread
* [RFC v3] tap: do not duplicate fd's
  2024-03-08 18:54 [RFC] eal: increase the number of availble file descriptors for MP Stephen Hemminger
  2024-03-08 20:36 ` [RFC v2] eal: increase passed max multi-process file descriptors Stephen Hemminger
@ 2024-03-09 18:12 ` Stephen Hemminger
  2024-03-14 14:40 ` [RFC] eal: increase the number of availble file descriptors for MP David Marchand
  2 siblings, 0 replies; 6+ messages in thread
From: Stephen Hemminger @ 2024-03-09 18:12 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger
The TAP device can use the same file descriptor for both rx and tx queues.
This allows up to 8 queues (versus 4) and reduces some resource consumption.
Also, reduce TAP_MAX_QUEUES to what the multi-process restrictions
will allow.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
v3 - This is a more limited patch; it only addresses the tap device
     and only gets tap up from 4 to 8 queues.
     It is still better to fix the underlying EAL issue, but that
     requires overriding strict ABI rules.
 drivers/net/tap/meson.build   |   2 +-
 drivers/net/tap/rte_eth_tap.c | 197 +++++++++++++++-------------------
 drivers/net/tap/rte_eth_tap.h |   3 +-
 drivers/net/tap/tap_flow.c    |   3 +-
 drivers/net/tap/tap_intr.c    |  12 ++-
 5 files changed, 95 insertions(+), 122 deletions(-)
diff --git a/drivers/net/tap/meson.build b/drivers/net/tap/meson.build
index 5099ccdff11b..9cd124d53e23 100644
--- a/drivers/net/tap/meson.build
+++ b/drivers/net/tap/meson.build
@@ -16,7 +16,7 @@ sources = files(
 
 deps = ['bus_vdev', 'gso', 'hash']
 
-cflags += '-DTAP_MAX_QUEUES=16'
+cflags += '-DTAP_MAX_QUEUES=8'
 
 # input array for meson symbol search:
 # [ "MACRO to define if found", "header for the search",
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 69d9da695bed..38a1b2d825f9 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -124,8 +124,7 @@ enum ioctl_mode {
 /* Message header to synchronize queues via IPC */
 struct ipc_queues {
 	char port_name[RTE_DEV_NAME_MAX_LEN];
-	int rxq_count;
-	int txq_count;
+	int q_count;
 	/*
 	 * The file descriptors are in the dedicated part
 	 * of the Unix message to be translated by the kernel.
@@ -446,7 +445,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		uint16_t data_off = rte_pktmbuf_headroom(mbuf);
 		int len;
 
-		len = readv(process_private->rxq_fds[rxq->queue_id],
+		len = readv(process_private->fds[rxq->queue_id],
 			*rxq->iovecs,
 			1 + (rxq->rxmode->offloads & RTE_ETH_RX_OFFLOAD_SCATTER ?
 			     rxq->nb_rx_desc : 1));
@@ -643,7 +642,7 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
 		}
 
 		/* copy the tx frame data */
-		n = writev(process_private->txq_fds[txq->queue_id], iovecs, k);
+		n = writev(process_private->fds[txq->queue_id], iovecs, k);
 		if (n <= 0)
 			return -1;
 
@@ -851,7 +850,6 @@ tap_mp_req_on_rxtx(struct rte_eth_dev *dev)
 	struct rte_mp_msg msg;
 	struct ipc_queues *request_param = (struct ipc_queues *)msg.param;
 	int err;
-	int fd_iterator = 0;
 	struct pmd_process_private *process_private = dev->process_private;
 	int i;
 
@@ -859,16 +857,13 @@ tap_mp_req_on_rxtx(struct rte_eth_dev *dev)
 	strlcpy(msg.name, TAP_MP_REQ_START_RXTX, sizeof(msg.name));
 	strlcpy(request_param->port_name, dev->data->name, sizeof(request_param->port_name));
 	msg.len_param = sizeof(*request_param);
-	for (i = 0; i < dev->data->nb_tx_queues; i++) {
-		msg.fds[fd_iterator++] = process_private->txq_fds[i];
-		msg.num_fds++;
-		request_param->txq_count++;
-	}
-	for (i = 0; i < dev->data->nb_rx_queues; i++) {
-		msg.fds[fd_iterator++] = process_private->rxq_fds[i];
-		msg.num_fds++;
-		request_param->rxq_count++;
-	}
+
+	/* rx and tx share file descriptors and nb_tx_queues == nb_rx_queues */
+	for (i = 0; i < dev->data->nb_rx_queues; i++)
+		msg.fds[i] = process_private->fds[i];
+
+	request_param->q_count = dev->data->nb_rx_queues;
+	msg.num_fds = dev->data->nb_rx_queues;
 
 	err = rte_mp_sendmsg(&msg);
 	if (err < 0) {
@@ -910,8 +905,6 @@ tap_mp_req_start_rxtx(const struct rte_mp_msg *request, __rte_unused const void
 	struct rte_eth_dev *dev;
 	const struct ipc_queues *request_param =
 		(const struct ipc_queues *)request->param;
-	int fd_iterator;
-	int queue;
 	struct pmd_process_private *process_private;
 
 	dev = rte_eth_dev_get_by_name(request_param->port_name);
@@ -920,14 +913,13 @@ tap_mp_req_start_rxtx(const struct rte_mp_msg *request, __rte_unused const void
 			request_param->port_name);
 		return -1;
 	}
+
 	process_private = dev->process_private;
-	fd_iterator = 0;
-	TAP_LOG(DEBUG, "tap_attach rx_q:%d tx_q:%d\n", request_param->rxq_count,
-		request_param->txq_count);
-	for (queue = 0; queue < request_param->txq_count; queue++)
-		process_private->txq_fds[queue] = request->fds[fd_iterator++];
-	for (queue = 0; queue < request_param->rxq_count; queue++)
-		process_private->rxq_fds[queue] = request->fds[fd_iterator++];
+	TAP_LOG(DEBUG, "tap_attach q:%d\n", request_param->q_count);
+
+	for (int q = 0; q < request_param->q_count; q++)
+		process_private->fds[q] = request->fds[q];
+
 
 	return 0;
 }
@@ -1121,7 +1113,6 @@ tap_dev_close(struct rte_eth_dev *dev)
 	int i;
 	struct pmd_internals *internals = dev->data->dev_private;
 	struct pmd_process_private *process_private = dev->process_private;
-	struct rx_queue *rxq;
 
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
 		rte_free(dev->process_private);
@@ -1141,19 +1132,18 @@ tap_dev_close(struct rte_eth_dev *dev)
 	}
 
 	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
-		if (process_private->rxq_fds[i] != -1) {
-			rxq = &internals->rxq[i];
-			close(process_private->rxq_fds[i]);
-			process_private->rxq_fds[i] = -1;
-			tap_rxq_pool_free(rxq->pool);
-			rte_free(rxq->iovecs);
-			rxq->pool = NULL;
-			rxq->iovecs = NULL;
-		}
-		if (process_private->txq_fds[i] != -1) {
-			close(process_private->txq_fds[i]);
-			process_private->txq_fds[i] = -1;
-		}
+		struct rx_queue *rxq = &internals->rxq[i];
+
+		if (process_private->fds[i] == -1)
+			continue;
+
+		close(process_private->fds[i]);
+		process_private->fds[i] = -1;
+
+		tap_rxq_pool_free(rxq->pool);
+		rte_free(rxq->iovecs);
+		rxq->pool = NULL;
+		rxq->iovecs = NULL;
 	}
 
 	if (internals->remote_if_index) {
@@ -1198,6 +1188,15 @@ tap_dev_close(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static void
+tap_queue_close(struct pmd_process_private *process_private, uint16_t qid)
+{
+	if (process_private->fds[qid] != -1) {
+		close(process_private->fds[qid]);
+		process_private->fds[qid] = -1;
+	}
+}
+
 static void
 tap_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
@@ -1206,15 +1205,16 @@ tap_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 
 	if (!rxq)
 		return;
+
 	process_private = rte_eth_devices[rxq->in_port].process_private;
-	if (process_private->rxq_fds[rxq->queue_id] != -1) {
-		close(process_private->rxq_fds[rxq->queue_id]);
-		process_private->rxq_fds[rxq->queue_id] = -1;
-		tap_rxq_pool_free(rxq->pool);
-		rte_free(rxq->iovecs);
-		rxq->pool = NULL;
-		rxq->iovecs = NULL;
-	}
+
+	tap_rxq_pool_free(rxq->pool);
+	rte_free(rxq->iovecs);
+	rxq->pool = NULL;
+	rxq->iovecs = NULL;
+
+	if (dev->data->tx_queues[qid] == NULL)
+		tap_queue_close(process_private, qid);
 }
 
 static void
@@ -1225,12 +1225,10 @@ tap_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 
 	if (!txq)
 		return;
-	process_private = rte_eth_devices[txq->out_port].process_private;
 
-	if (process_private->txq_fds[txq->queue_id] != -1) {
-		close(process_private->txq_fds[txq->queue_id]);
-		process_private->txq_fds[txq->queue_id] = -1;
-	}
+	process_private = rte_eth_devices[txq->out_port].process_private;
+	if (dev->data->rx_queues[qid] == NULL)
+		tap_queue_close(process_private, qid);
 }
 
 static int
@@ -1482,52 +1480,34 @@ tap_setup_queue(struct rte_eth_dev *dev,
 		uint16_t qid,
 		int is_rx)
 {
-	int ret;
-	int *fd;
-	int *other_fd;
-	const char *dir;
+	int fd, ret;
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct pmd_process_private *process_private = dev->process_private;
 	struct rx_queue *rx = &internals->rxq[qid];
 	struct tx_queue *tx = &internals->txq[qid];
-	struct rte_gso_ctx *gso_ctx;
+	struct rte_gso_ctx *gso_ctx = NULL;
+	const char *dir = is_rx ? "rx" : "tx";
 
-	if (is_rx) {
-		fd = &process_private->rxq_fds[qid];
-		other_fd = &process_private->txq_fds[qid];
-		dir = "rx";
-		gso_ctx = NULL;
-	} else {
-		fd = &process_private->txq_fds[qid];
-		other_fd = &process_private->rxq_fds[qid];
-		dir = "tx";
+	if (is_rx)
 		gso_ctx = &tx->gso_ctx;
-	}
-	if (*fd != -1) {
+
+	fd = process_private->fds[qid];
+	if (fd != -1) {
 		/* fd for this queue already exists */
 		TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
-			pmd->name, *fd, dir, qid);
+			pmd->name, fd, dir, qid);
 		gso_ctx = NULL;
-	} else if (*other_fd != -1) {
-		/* Only other_fd exists. dup it */
-		*fd = dup(*other_fd);
-		if (*fd < 0) {
-			*fd = -1;
-			TAP_LOG(ERR, "%s: dup() failed.", pmd->name);
-			return -1;
-		}
-		TAP_LOG(DEBUG, "%s: dup fd %d for %s queue qid %d (%d)",
-			pmd->name, *other_fd, dir, qid, *fd);
 	} else {
-		/* Both RX and TX fds do not exist (equal -1). Create fd */
-		*fd = tun_alloc(pmd, 0, 0);
-		if (*fd < 0) {
-			*fd = -1; /* restore original value */
+		fd = tun_alloc(pmd, 0, 0);
+		if (fd < 0) {
 			TAP_LOG(ERR, "%s: tun_alloc() failed.", pmd->name);
 			return -1;
 		}
+
 		TAP_LOG(DEBUG, "%s: add %s queue for qid %d fd %d",
-			pmd->name, dir, qid, *fd);
+			pmd->name, dir, qid, fd);
+
+		process_private->fds[qid] = fd;
 	}
 
 	tx->mtu = &dev->data->mtu;
@@ -1540,7 +1520,7 @@ tap_setup_queue(struct rte_eth_dev *dev,
 
 	tx->type = pmd->type;
 
-	return *fd;
+	return fd;
 }
 
 static int
@@ -1620,7 +1600,7 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
 
 	TAP_LOG(DEBUG, "  RX TUNTAP device name %s, qid %d on fd %d",
 		internals->name, rx_queue_id,
-		process_private->rxq_fds[rx_queue_id]);
+		process_private->fds[rx_queue_id]);
 
 	return 0;
 
@@ -1664,7 +1644,7 @@ tap_tx_queue_setup(struct rte_eth_dev *dev,
 	TAP_LOG(DEBUG,
 		"  TX TUNTAP device name %s, qid %d on fd %d csum %s",
 		internals->name, tx_queue_id,
-		process_private->txq_fds[tx_queue_id],
+		process_private->fds[tx_queue_id],
 		txq->csum ? "on" : "off");
 
 	return 0;
@@ -2001,10 +1981,9 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, const char *tap_name,
 	dev->intr_handle = pmd->intr_handle;
 
 	/* Presetup the fds to -1 as being not valid */
-	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
-		process_private->rxq_fds[i] = -1;
-		process_private->txq_fds[i] = -1;
-	}
+	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++)
+		process_private->fds[i] = -1;
+
 
 	if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
 		if (rte_is_zero_ether_addr(mac_addr))
@@ -2332,7 +2311,6 @@ tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
 	struct ipc_queues *request_param = (struct ipc_queues *)request.param;
 	struct ipc_queues *reply_param;
 	struct pmd_process_private *process_private = dev->process_private;
-	int queue, fd_iterator;
 
 	/* Prepare the request */
 	memset(&request, 0, sizeof(request));
@@ -2352,18 +2330,17 @@ tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
 	TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
 
 	/* Attach the queues from received file descriptors */
-	if (reply_param->rxq_count + reply_param->txq_count != reply->num_fds) {
+	if (reply_param->q_count != reply->num_fds) {
 		TAP_LOG(ERR, "Unexpected number of fds received");
 		return -1;
 	}
 
-	dev->data->nb_rx_queues = reply_param->rxq_count;
-	dev->data->nb_tx_queues = reply_param->txq_count;
-	fd_iterator = 0;
-	for (queue = 0; queue < reply_param->rxq_count; queue++)
-		process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
-	for (queue = 0; queue < reply_param->txq_count; queue++)
-		process_private->txq_fds[queue] = reply->fds[fd_iterator++];
+	dev->data->nb_rx_queues = reply_param->q_count;
+	dev->data->nb_tx_queues = reply_param->q_count;
+
+	for (int q = 0; q < reply_param->q_count; q++)
+		process_private->fds[q] = reply->fds[q];
+
 	free(reply);
 	return 0;
 }
@@ -2393,25 +2370,19 @@ tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
 
 	/* Fill file descriptors for all queues */
 	reply.num_fds = 0;
-	reply_param->rxq_count = 0;
-	if (dev->data->nb_rx_queues + dev->data->nb_tx_queues >
-			RTE_MP_MAX_FD_NUM){
-		TAP_LOG(ERR, "Number of rx/tx queues exceeds max number of fds");
+	reply_param->q_count = 0;
+
+	RTE_ASSERT(dev->data->nb_rx_queues == dev->data->nb_tx_queues);
+	if (dev->data->nb_rx_queues > RTE_MP_MAX_FD_NUM) {
+		TAP_LOG(ERR, "Number of rx/tx queues %u exceeds max number of fds %u",
+			dev->data->nb_rx_queues, RTE_MP_MAX_FD_NUM);
 		return -1;
 	}
 
 	for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
-		reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
-		reply_param->rxq_count++;
-	}
-	RTE_ASSERT(reply_param->rxq_count == dev->data->nb_rx_queues);
-
-	reply_param->txq_count = 0;
-	for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
-		reply.fds[reply.num_fds++] = process_private->txq_fds[queue];
-		reply_param->txq_count++;
+		reply.fds[reply.num_fds++] = process_private->fds[queue];
+		reply_param->q_count++;
 	}
-	RTE_ASSERT(reply_param->txq_count == dev->data->nb_tx_queues);
 
 	/* Send reply */
 	strlcpy(reply.name, request->name, sizeof(reply.name));
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 5ac93f93e961..dc8201020b5f 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -96,8 +96,7 @@ struct pmd_internals {
 };
 
 struct pmd_process_private {
-	int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
-	int txq_fds[RTE_PMD_TAP_MAX_QUEUES];
+	int fds[RTE_PMD_TAP_MAX_QUEUES];
 };
 
 /* tap_intr.c */
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index fa50fe45d7b7..a78fd50cd494 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1595,8 +1595,9 @@ tap_flow_isolate(struct rte_eth_dev *dev,
 	 * If netdevice is there, setup appropriate flow rules immediately.
 	 * Otherwise it will be set when bringing up the netdevice (tun_alloc).
 	 */
-	if (!process_private->rxq_fds[0])
+	if (process_private->fds[0] == -1)
 		return 0;
+
 	if (set) {
 		struct rte_flow *remote_flow;
 
diff --git a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c
index a9097def1a32..bc953791635e 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -68,20 +68,22 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
 	}
 	for (i = 0; i < n; i++) {
 		struct rx_queue *rxq = pmd->dev->data->rx_queues[i];
+		int fd = process_private->fds[i];
 
 		/* Skip queues that cannot request interrupts. */
-		if (!rxq || process_private->rxq_fds[i] == -1) {
+		if (!rxq || fd == -1) {
 			/* Use invalid intr_vec[] index to disable entry. */
 			if (rte_intr_vec_list_index_set(intr_handle, i,
-			RTE_INTR_VEC_RXTX_OFFSET + RTE_MAX_RXTX_INTR_VEC_ID))
+						RTE_INTR_VEC_RXTX_OFFSET + RTE_MAX_RXTX_INTR_VEC_ID))
 				return -rte_errno;
 			continue;
 		}
+
 		if (rte_intr_vec_list_index_set(intr_handle, i,
-					RTE_INTR_VEC_RXTX_OFFSET + count))
+						RTE_INTR_VEC_RXTX_OFFSET + count))
 			return -rte_errno;
-		if (rte_intr_efds_index_set(intr_handle, count,
-						   process_private->rxq_fds[i]))
+
+		if (rte_intr_efds_index_set(intr_handle, count, fd))
 			return -rte_errno;
 		count++;
 	}
-- 
2.43.0
^ permalink raw reply	[flat|nested] 6+ messages in thread
* Re: [RFC] eal: increase the number of availble file descriptors for MP
  2024-03-08 18:54 [RFC] eal: increase the number of availble file descriptors for MP Stephen Hemminger
  2024-03-08 20:36 ` [RFC v2] eal: increase passed max multi-process file descriptors Stephen Hemminger
  2024-03-09 18:12 ` [RFC v3] tap: do not duplicate fd's Stephen Hemminger
@ 2024-03-14 14:40 ` David Marchand
  2 siblings, 0 replies; 6+ messages in thread
From: David Marchand @ 2024-03-14 14:40 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Anatoly Burakov, Ferruh Yigit, Thomas Monjalon
On Fri, Mar 8, 2024 at 7:54 PM Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> The current limit of file descriptors is too low, it should have
> been set to the maximum possible to send across an unix domain
> socket.
>
> This is an attempt to allow increasing it without breaking ABI.
> But in the process it exposes what is broken about how symbol
> versions are checked in check-symbol-maps.sh. That script is
> broken in that it won't allow adding a backwards compatiable
> version hook like this.
- It could be enhanced maybe, but I see no problem with the script.
The versions for compat symbols in this patch are wrong.
We want to keep compat with ABI 24, not 23.
And next ABI will be 25.
- rte_mp_old_msg does not have to be exported as public in rte_eal.h.
- I think the patch is not complete:
 * rte_mp_action_register and rte_mp_request_async need versioning too,
 * because of the former point, handling of msg requests probably
needs to keep track of accepted length per registered callbacks,
-- 
David Marchand
^ permalink raw reply	[flat|nested] 6+ messages in thread
* Re: [RFC v2] eal: increase passed max multi-process file descriptors
  2024-03-08 20:36 ` [RFC v2] eal: increase passed max multi-process file descriptors Stephen Hemminger
@ 2024-03-14 15:23   ` Stephen Hemminger
  2024-03-14 15:38     ` David Marchand
  0 siblings, 1 reply; 6+ messages in thread
From: Stephen Hemminger @ 2024-03-14 15:23 UTC (permalink / raw)
  To: dev; +Cc: Anatoly Burakov, Jianfeng Tan
On Fri,  8 Mar 2024 12:36:39 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:
> Both XDP and TAP device are limited in the number of queues
> because of limitations on the number of file descriptors that
> are allowed. The original choice of 8 was too low; the allowed
> maximum is 253 according to unix(7) man page.
> 
> This may look like a serious ABI breakage but it is not.
> It is simpler for everyone if the limit is increased rather than
> building a parallel set of calls.
> 
> The case that matters is older application registering MP support
> with the newer version of EAL. In this case, since the old application
> will always send the more compact structure (less possible fd's)
> it is OK.
> 
> Request (for up to 8 fds) sent to EAL.
>    - EAL only references up to num_fds.
>    - The area past the old fd array is not accessed.
> 
> Reply callback:
>    - EAL will pass pointer to the new (larger structure),
>      the old callback will only look at the first part of
>      the fd array (num_fds <= 8).
> 
>    - Since primary and secondary must both be from same DPDK version
>      there is normal way that a reply with more fd's could be possible.
>      The only case is the same as above, where application requested
>      something that would break in old version and now succeeds.
> 
> The one possible incompatibility is that if application passed
> a larger number of fd's (32?) and expected an error. Now it will
> succeed and get passed through.
> 
> Fixes: bacaa2754017 ("eal: add channel for multi-process communication")
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> v2 - show the simpler way to address with some minor ABI issue
> 
>  doc/guides/rel_notes/release_24_03.rst | 4 ++++
>  lib/eal/include/rte_eal.h              | 2 +-
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
> index 932688ca4d82..1d33cfa15dfb 100644
> --- a/doc/guides/rel_notes/release_24_03.rst
> +++ b/doc/guides/rel_notes/release_24_03.rst
> @@ -225,6 +225,10 @@ API Changes
>  * ethdev: Renamed structure ``rte_flow_action_modify_data`` to be
>    ``rte_flow_field_data`` for more generic usage.
>  
> +* eal: The maximum number of file descriptors allowed to be passed in
> +  multi-process requests is increased from 8 to the maximum possible on
> +  Linux unix domain sockets 253. This allows for more queues on XDP and
> +  TAP device.
>  
>  ABI Changes
>  -----------
> diff --git a/lib/eal/include/rte_eal.h b/lib/eal/include/rte_eal.h
> index c2256f832e51..cd84fcdd1bdb 100644
> --- a/lib/eal/include/rte_eal.h
> +++ b/lib/eal/include/rte_eal.h
> @@ -155,7 +155,7 @@ int rte_eal_primary_proc_alive(const char *config_file_path);
>   */
>  bool rte_mp_disable(void);
>  
> -#define RTE_MP_MAX_FD_NUM	8    /* The max amount of fds */
> +#define RTE_MP_MAX_FD_NUM	253    /* The max amount of fds */
>  #define RTE_MP_MAX_NAME_LEN	64   /* The max length of action name */
>  #define RTE_MP_MAX_PARAM_LEN	256  /* The max length of param */
>  struct rte_mp_msg {
Rather than mess with versioning everything, probably better to just
hold off to 24.11 release and do the change there.
It will limit xdp and tap PMD's to 8 queues but no user has been
demanding more yet.
^ permalink raw reply	[flat|nested] 6+ messages in thread
* Re: [RFC v2] eal: increase passed max multi-process file descriptors
  2024-03-14 15:23   ` Stephen Hemminger
@ 2024-03-14 15:38     ` David Marchand
  0 siblings, 0 replies; 6+ messages in thread
From: David Marchand @ 2024-03-14 15:38 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Anatoly Burakov, Jianfeng Tan
On Thu, Mar 14, 2024 at 4:23 PM Stephen Hemminger
<stephen@networkplumber.org> wrote:
> Rather than mess with versioning everything, probably better to just
> hold off to 24.11 release and do the change there.
>
> It will limit xdp and tap PMD's to 8 queues but no user has been
> demanding more yet.
IIUC, this limitation only applies to multiprocess setups.
Waiting for next ABI seems the simpler approach if there is no
explicit ask for this change.
Until someone wants to send more than 253 fds :-).
-- 
David Marchand
^ permalink raw reply	[flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-03-14 15:38 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-08 18:54 [RFC] eal: increase the number of availble file descriptors for MP Stephen Hemminger
2024-03-08 20:36 ` [RFC v2] eal: increase passed max multi-process file descriptors Stephen Hemminger
2024-03-14 15:23   ` Stephen Hemminger
2024-03-14 15:38     ` David Marchand
2024-03-09 18:12 ` [RFC v3] tap: do not duplicate fd's Stephen Hemminger
2024-03-14 14:40 ` [RFC] eal: increase the number of availble file descriptors for MP David Marchand
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).