DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] net/af_packet: allow disabling packet fanout
@ 2024-12-12  8:04 Tudor Cornea
  2024-12-12 17:11 ` Stephen Hemminger
  2025-01-06 15:35 ` [PATCH v2] net/af_packet: allow changing fanout mode Tudor Cornea
  0 siblings, 2 replies; 5+ messages in thread
From: Tudor Cornea @ 2024-12-12  8:04 UTC (permalink / raw)
  To: linville; +Cc: ferruh.yigit, andrew.rybchenko, dev, Tudor Cornea

This allows us to control whether the PMD will attempt to use
the PACKET_FANOUT socket option, and allows the binary compiled
against newer kernel headers to run on an older kernel, which
lacks support for it.

Signed-off-by: Tudor Cornea <tudor.cornea@gmail.com>
---
 doc/guides/nics/af_packet.rst             |  4 ++-
 drivers/net/af_packet/rte_eth_af_packet.c | 32 ++++++++++++++++++-----
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/doc/guides/nics/af_packet.rst b/doc/guides/nics/af_packet.rst
index a343d3a961..42837031de 100644
--- a/doc/guides/nics/af_packet.rst
+++ b/doc/guides/nics/af_packet.rst
@@ -25,6 +25,8 @@ Some of these, in turn, will be used to configure the PACKET_MMAP settings.
 *   ``qpairs`` - number of Rx and Tx queues (optional, default 1);
 *   ``qdisc_bypass`` - set PACKET_QDISC_BYPASS option in AF_PACKET (optional,
     disabled by default);
+*   ``packet_fanout`` - set PACKET_FANOUT option in AF_PACKET (optional,
+    enabled by default);
 *   ``blocksz`` - PACKET_MMAP block size (optional, default 4096);
 *   ``framesz`` - PACKET_MMAP frame size (optional, default 2048B; Note: multiple
     of 16B);
@@ -64,7 +66,7 @@ framecnt=512):
 
 .. code-block:: console
 
-    --vdev=eth_af_packet0,iface=tap0,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0
+    --vdev=eth_af_packet0,iface=tap0,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0,packet_fanout=0
 
 Features and Limitations
 ------------------------
diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index ceb8d9356a..521dbef317 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -36,6 +36,7 @@
 #define ETH_AF_PACKET_FRAMESIZE_ARG	"framesz"
 #define ETH_AF_PACKET_FRAMECOUNT_ARG	"framecnt"
 #define ETH_AF_PACKET_QDISC_BYPASS_ARG	"qdisc_bypass"
+#define ETH_AF_PACKET_PACKET_FANOUT_ARG	"packet_fanout"
 
 #define DFLT_FRAME_SIZE		(1 << 11)
 #define DFLT_FRAME_COUNT	(1 << 9)
@@ -96,6 +97,7 @@ static const char *valid_arguments[] = {
 	ETH_AF_PACKET_FRAMESIZE_ARG,
 	ETH_AF_PACKET_FRAMECOUNT_ARG,
 	ETH_AF_PACKET_QDISC_BYPASS_ARG,
+	ETH_AF_PACKET_PACKET_FANOUT_ARG,
 	NULL
 };
 
@@ -709,6 +711,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
                        unsigned int framesize,
                        unsigned int framecnt,
 		       unsigned int qdisc_bypass,
+		       unsigned int packet_fanout,
                        struct pmd_internals **internals,
                        struct rte_eth_dev **eth_dev,
                        struct rte_kvargs *kvlist)
@@ -926,14 +929,17 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
 			goto error;
 		}
 
+		if (packet_fanout) {
 #if defined(PACKET_FANOUT)
-		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
-				&fanout_arg, sizeof(fanout_arg));
-		if (rc == -1) {
-			PMD_LOG_ERRNO(ERR,
-				"%s: could not set PACKET_FANOUT on AF_PACKET socket for %s",
-				name, pair->value);
-			goto error;
+			rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
+					&fanout_arg, sizeof(fanout_arg));
+			if (rc == -1) {
+				PMD_LOG_ERRNO(ERR,
+					"%s: could not set PACKET_FANOUT "
+					"on AF_PACKET socket for %s",
+					name, pair->value);
+				goto error;
+			}
 		}
 #endif
 	}
@@ -1003,6 +1009,7 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 	unsigned int framecount = DFLT_FRAME_COUNT;
 	unsigned int qpairs = 1;
 	unsigned int qdisc_bypass = 1;
+	unsigned int packet_fanout = 1;
 
 	/* do some parameter checking */
 	if (*sockfd < 0)
@@ -1065,6 +1072,16 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 			}
 			continue;
 		}
+		if (strstr(pair->key, ETH_AF_PACKET_PACKET_FANOUT_ARG) != NULL) {
+			packet_fanout = atoi(pair->value);
+			if (packet_fanout > 1) {
+				PMD_LOG(ERR,
+					"%s: invalid packet fanout value",
+					name);
+				return -1;
+			}
+			continue;
+		}
 	}
 
 	if (framesize > blocksize) {
@@ -1091,6 +1108,7 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 				   blocksize, blockcount,
 				   framesize, framecount,
 				   qdisc_bypass,
+				   packet_fanout,
 				   &internals, &eth_dev,
 				   kvlist) < 0)
 		return -1;
-- 
2.34.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] net/af_packet: allow disabling packet fanout
  2024-12-12  8:04 [PATCH] net/af_packet: allow disabling packet fanout Tudor Cornea
@ 2024-12-12 17:11 ` Stephen Hemminger
  2024-12-16  9:58   ` Tudor Cornea
  2025-01-06 15:35 ` [PATCH v2] net/af_packet: allow changing fanout mode Tudor Cornea
  1 sibling, 1 reply; 5+ messages in thread
From: Stephen Hemminger @ 2024-12-12 17:11 UTC (permalink / raw)
  To: Tudor Cornea; +Cc: linville, ferruh.yigit, andrew.rybchenko, dev

On Thu, 12 Dec 2024 10:04:42 +0200
Tudor Cornea <tudor.cornea@gmail.com> wrote:

> This allows us to control whether the PMD will attempt to use
> the PACKET_FANOUT socket option, and allows the binary compiled
> against newer kernel headers to run on an older kernel, which
> lacks support for it.
> 
> Signed-off-by: Tudor Cornea <tudor.cornea@gmail.com>

Controlling fanout more is a good idea but not sure what this patch
is trying to do with it.

- DPDK minimum kernel version is now 4.19 so no point in worrying about
   backward compatibility. According to man page for packet, fanout
   was added in 3.1 kernel.

- It would be useful to allow application to control fanout in more detail.
  According to man page:
         •  The  load-balance  mode  PACKET_FANOUT_LB  implements a round-
            robin algorithm.

         •  PACKET_FANOUT_CPU selects the socket based on the CPU that the
            packet arrived on.

          •  PACKET_FANOUT_ROLLOVER processes all data on a single  socket,
             moving to the next when one becomes backlogged.

          •  PACKET_FANOUT_RND  selects  the  socket  using a pseudo-random
             number generator.

          •  PACKET_FANOUT_QM (available  since  Linux  3.14)  selects  the
             socket using the recorded queue_mapping of the received skb.

The default should be for packet to behave like a hardware NIC if RSS is
enabled. And use a single queue if RSS is not enabled. The PMD kind of does
this now but not the same.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] net/af_packet: allow disabling packet fanout
  2024-12-12 17:11 ` Stephen Hemminger
@ 2024-12-16  9:58   ` Tudor Cornea
  2025-01-06 15:29     ` Tudor Cornea
  0 siblings, 1 reply; 5+ messages in thread
From: Tudor Cornea @ 2024-12-16  9:58 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linville, ferruh.yigit, andrew.rybchenko, dev

> Controlling fanout more is a good idea but not sure what this patch
> is trying to do with it.

I am maintaining a DPDK application which should run on a large number
of setups.
Unfortunately, I do not have a lot of control over the environment the
application runs on (e.g kernel).

The problem I was trying to solve is that the Af_Packet PMD seems to
fail to initialize properly on some setups.
I managed to reproduce the issue with testpmd.

sudo ./x86_64-native-linuxapp-gcc/app/dpdk-testpmd \
     -l 0-3 \
     -m 1024 \
     --no-huge \
     --no-shconf \
     --no-pci \
     --vdev=net_af_packet0,iface=eth1,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0
\
     --vdev=net_af_packet1,iface=eth2,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0
\
     -- \
     -i

Error: Unknown device type.
EAL: Detected CPU lcores: 8
EAL: Detected NUMA nodes: 1
EAL: Detected static linkage of DPDK
EAL: Selected IOVA mode 'VA'
AFPACKET: rte_pmd_init_internals(): net_af_packet0: could not set
PACKET_FANOUT on AF_PACKET socket for eth1:Invalid argument
VDEV_BUS: vdev_probe(): failed to initialize net_af_packet0 device
AFPACKET: rte_pmd_init_internals(): net_af_packet1: could not set
PACKET_FANOUT on AF_PACKET socket for eth1:Invalid argument
VDEV_BUS: vdev_probe(): failed to initialize net_af_packet1 device
EAL: Bus (vdev) probe failed.
testpmd: No probed ethernet devices

It seems that the call to PACKET_FANOUT setsockopt failed with EINVAL.
I debugged the issue further. It seems that the root cause is that
when the interface is down (e.g eth1), after the bind operation
succeeds, the PACKET_FANOUT setsockopt will fail.

This is a behavior of AF_Packet in Linux which I'm not sure is that
well documented.
It seems to be fixed in a recent Linux Kernel commit. I re-compiled
the kernel with the change, and did not see the issue afterwards.

Commit 2cee3e6e2e4b74bec96694169f01cd3feec1f264
af_packet: allow fanout_add when socket is not RUNNING

[1] https://github.com/torvalds/linux/commit/2cee3e6e2e4b74bec96694169f01cd3feec1f264

I was trying to find a solution for my application to run on setups
which don't have the kernel patch.
Introducing a devarg to control when packet_fanout is enabled seemed
like a possible way around the issue.

As a second solution around the issue, I was also thinking of using
the PACKET_FANOUT setsockopt only when nb_queues > 1.
Conceptually, I think it would make some sense, because fanout doesn't
really help when having a single socket.
We would need more sockets in order to load balance incoming packets.
And this would still allow me to control the behavior from the
existing 'qpairs' devarg.

The idea would be something along these lines

if (nb_queues > 1)  {
   rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT, &fanout_arg,
sizeof(fanout_arg));
 /* ... */
}

Would a patch using this idea be considered more suitable ?

> - DPDK minimum kernel version is now 4.19 so no point in worrying about
>    backward compatibility. According to man page for packet, fanout
>    was added in 3.1 kernel.
>
> - It would be useful to allow application to control fanout in more detail.

This is a great idea. Would introducing a new devarg, (e.g
'fanout_mode') be the proper way to allow the application to customize
fanout in more detail ?

--vdev=net_af_packet0,iface=eth1,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,fanout_mode=[fanout_hash|fanout_cpu|fanout_rnd|fanout_qm]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] net/af_packet: allow disabling packet fanout
  2024-12-16  9:58   ` Tudor Cornea
@ 2025-01-06 15:29     ` Tudor Cornea
  0 siblings, 0 replies; 5+ messages in thread
From: Tudor Cornea @ 2025-01-06 15:29 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linville, ferruh.yigit, andrew.rybchenko, dev

> This is a great idea. Would introducing a new devarg, (e.g
> 'fanout_mode') be the proper way to allow the application to customize
> fanout in more detail ?
>
> --vdev=net_af_packet0,iface=eth1,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,fanout_mode=[fanout_hash|fanout_cpu|fanout_rnd|fanout_qm]

I'll send a new version of the patch which will add some additional
customization

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2] net/af_packet: allow changing fanout mode
  2024-12-12  8:04 [PATCH] net/af_packet: allow disabling packet fanout Tudor Cornea
  2024-12-12 17:11 ` Stephen Hemminger
@ 2025-01-06 15:35 ` Tudor Cornea
  1 sibling, 0 replies; 5+ messages in thread
From: Tudor Cornea @ 2025-01-06 15:35 UTC (permalink / raw)
  To: stephen; +Cc: linville, ferruh.yigit, andrew.rybchenko, dev, Tudor Cornea

This allows us to control the algorithm used to spread traffic between
sockets, adding more fine grained control. If the user does not
specify a fanout mode, the PMD driver will default to
PACKET_FANOUT_HASH.

Signed-off-by: Tudor Cornea <tudor.cornea@gmail.com>

---
v2:
* Renamed the patch
* Replaced packet_fanout argument with fanout_mode, which allows
more fine grained control
---
 doc/guides/nics/af_packet.rst             |  4 +-
 drivers/net/af_packet/rte_eth_af_packet.c | 92 ++++++++++++++++++++---
 2 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/doc/guides/nics/af_packet.rst b/doc/guides/nics/af_packet.rst
index a343d3a961..3443f95004 100644
--- a/doc/guides/nics/af_packet.rst
+++ b/doc/guides/nics/af_packet.rst
@@ -25,6 +25,8 @@ Some of these, in turn, will be used to configure the PACKET_MMAP settings.
 *   ``qpairs`` - number of Rx and Tx queues (optional, default 1);
 *   ``qdisc_bypass`` - set PACKET_QDISC_BYPASS option in AF_PACKET (optional,
     disabled by default);
+*   ``fanout_mode`` - set fanout algorithm. Possible choices: hash,lb,cpu,rollover,rnd,qm (optional,
+    default hash);
 *   ``blocksz`` - PACKET_MMAP block size (optional, default 4096);
 *   ``framesz`` - PACKET_MMAP frame size (optional, default 2048B; Note: multiple
     of 16B);
@@ -64,7 +66,7 @@ framecnt=512):
 
 .. code-block:: console
 
-    --vdev=eth_af_packet0,iface=tap0,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0
+    --vdev=eth_af_packet0,iface=tap0,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0,fanout_mode=hash
 
 Features and Limitations
 ------------------------
diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index ceb8d9356a..8449975384 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -36,6 +36,7 @@
 #define ETH_AF_PACKET_FRAMESIZE_ARG	"framesz"
 #define ETH_AF_PACKET_FRAMECOUNT_ARG	"framecnt"
 #define ETH_AF_PACKET_QDISC_BYPASS_ARG	"qdisc_bypass"
+#define ETH_AF_PACKET_FANOUT_MODE_ARG	"fanout_mode"
 
 #define DFLT_FRAME_SIZE		(1 << 11)
 #define DFLT_FRAME_COUNT	(1 << 9)
@@ -96,6 +97,7 @@ static const char *valid_arguments[] = {
 	ETH_AF_PACKET_FRAMESIZE_ARG,
 	ETH_AF_PACKET_FRAMECOUNT_ARG,
 	ETH_AF_PACKET_QDISC_BYPASS_ARG,
+	ETH_AF_PACKET_FANOUT_MODE_ARG,
 	NULL
 };
 
@@ -700,6 +702,61 @@ open_packet_iface(const char *key __rte_unused,
 	return 0;
 }
 
+#if defined(PACKET_FANOUT)
+#define PACKET_FANOUT_INVALID -1
+
+static int
+get_fanout_group_id(int if_index)
+{
+	return (getpid() ^ if_index) & 0xffff;
+}
+
+static int
+get_fanout_mode(const char *fanout_mode)
+{
+	int mode = PACKET_FANOUT_FLAG_DEFRAG;
+
+#if defined(PACKET_FANOUT_FLAG_ROLLOVER)
+	mode |= PACKET_FANOUT_FLAG_ROLLOVER;
+#endif
+
+	if (!fanout_mode) {
+		/* Default */
+		mode |= PACKET_FANOUT_HASH;
+	} else if (!strcmp(fanout_mode, "hash")) {
+		mode |= PACKET_FANOUT_HASH;
+	} else if (!strcmp(fanout_mode, "lb")) {
+		mode |= PACKET_FANOUT_LB;
+	} else if (!strcmp(fanout_mode, "cpu")) {
+		mode |= PACKET_FANOUT_CPU;
+	} else if (!strcmp(fanout_mode, "rollover")) {
+		mode |= PACKET_FANOUT_ROLLOVER;
+	} else if (!strcmp(fanout_mode, "rnd")) {
+		mode |= PACKET_FANOUT_RND;
+	} else if (!strcmp(fanout_mode, "qm")) {
+		mode |= PACKET_FANOUT_QM;
+	} else {
+		/* Invalid Fanout Mode */
+		mode = PACKET_FANOUT_INVALID;
+	}
+
+	return mode;
+}
+
+static int
+get_fanout(const char *fanout_mode, int if_index)
+{
+	int group_id = get_fanout_group_id(if_index);
+	int mode = get_fanout_mode(fanout_mode);
+	int fanout = PACKET_FANOUT_INVALID;
+
+	if (mode != PACKET_FANOUT_INVALID)
+		fanout = group_id | (mode << 16);
+
+	return fanout;
+}
+#endif
+
 static int
 rte_pmd_init_internals(struct rte_vdev_device *dev,
                        const int sockfd,
@@ -709,6 +766,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
                        unsigned int framesize,
                        unsigned int framecnt,
 		       unsigned int qdisc_bypass,
+		       const char *fanout_mode,
                        struct pmd_internals **internals,
                        struct rte_eth_dev **eth_dev,
                        struct rte_kvargs *kvlist)
@@ -810,11 +868,12 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
 	sockaddr.sll_ifindex = (*internals)->if_index;
 
 #if defined(PACKET_FANOUT)
-	fanout_arg = (getpid() ^ (*internals)->if_index) & 0xffff;
-	fanout_arg |= (PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16;
-#if defined(PACKET_FANOUT_FLAG_ROLLOVER)
-	fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
-#endif
+	fanout_arg = get_fanout(fanout_mode, (*internals)->if_index);
+
+	if (fanout_arg == PACKET_FANOUT_INVALID) {
+		PMD_LOG(ERR, "Invalid fanout mode: %s", fanout_mode);
+		goto error;
+	}
 #endif
 
 	for (q = 0; q < nb_queues; q++) {
@@ -927,13 +986,16 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
 		}
 
 #if defined(PACKET_FANOUT)
-		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
-				&fanout_arg, sizeof(fanout_arg));
-		if (rc == -1) {
-			PMD_LOG_ERRNO(ERR,
-				"%s: could not set PACKET_FANOUT on AF_PACKET socket for %s",
-				name, pair->value);
-			goto error;
+		if (nb_queues > 1) {
+			rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
+					&fanout_arg, sizeof(fanout_arg));
+			if (rc == -1) {
+				PMD_LOG_ERRNO(ERR,
+					"%s: could not set PACKET_FANOUT "
+					"on AF_PACKET socket for %s",
+					name, pair->value);
+				goto error;
+			}
 		}
 #endif
 	}
@@ -1003,6 +1065,7 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 	unsigned int framecount = DFLT_FRAME_COUNT;
 	unsigned int qpairs = 1;
 	unsigned int qdisc_bypass = 1;
+	const char *fanout_mode = NULL;
 
 	/* do some parameter checking */
 	if (*sockfd < 0)
@@ -1065,6 +1128,10 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 			}
 			continue;
 		}
+		if (strstr(pair->key, ETH_AF_PACKET_FANOUT_MODE_ARG) != NULL) {
+			fanout_mode = pair->value;
+			continue;
+		}
 	}
 
 	if (framesize > blocksize) {
@@ -1091,6 +1158,7 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 				   blocksize, blockcount,
 				   framesize, framecount,
 				   qdisc_bypass,
+				   fanout_mode,
 				   &internals, &eth_dev,
 				   kvlist) < 0)
 		return -1;
-- 
2.34.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-01-06 15:35 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-12-12  8:04 [PATCH] net/af_packet: allow disabling packet fanout Tudor Cornea
2024-12-12 17:11 ` Stephen Hemminger
2024-12-16  9:58   ` Tudor Cornea
2025-01-06 15:29     ` Tudor Cornea
2025-01-06 15:35 ` [PATCH v2] net/af_packet: allow changing fanout mode Tudor Cornea

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).