* [PATCH v02] net/af_packet: add rollover and defrag options
@ 2024-10-31 13:50 Gur Stavi
2024-11-01 22:49 ` Ferruh Yigit
0 siblings, 1 reply; 3+ messages in thread
From: Gur Stavi @ 2024-10-31 13:50 UTC (permalink / raw)
To: Gur Stavi; +Cc: dev, John W. Linville, Stephen Hemminger, Ferruh Yigit
net_af_packet PMD multi "queue" support relies on Linux FANOUT capability.
Linux FANOUT is a SW based load balancer that is similar to HW RSS which
is more common for DPDK PMDs. Instead of multiple HW descriptor queues,
AF PACKET uses multiple sockets.
HW RSS will typically drop a packet if its selected RX queue is empty.
However, Linux FANOUT, as a SW load balancer, can be configured to avoid
this packet drop by rolling over to the next socket.
This rollover functionality was ALWAYS enabled in net_af_packet. It is
surrounded by ifdef, but only to allow compilation on ancient Linux
versions that did not have it.
Since DPDK applications are usually designed for HW based PMDs, this
rollover functionality, which the developers are likely unaware of, could
be confusing.
Another option that is part of Linux FANOUT is DEFRAG that instructs Linux
to compose complete IP packet out of fragments before delivering it to the
PACKET socket. Again, this behavior typically does not exist for HW based
PMDs and may confuse users.
This patch adds 2 options to control these features:
rollover=[0|1],defrag=[0|1]
For backward compatibility both features are enabled by default even
though most users will probably want both of them disabled.
Signed-off-by: Gur Stavi <gur.stavi@huawei.com>
---
v2:
* Improve error handling for parsing of arg strings. Add a wrapper around
strtoul to replace atoi.
* Fix checkpatch indentation error due to copy-paste.
v1: https://mails.dpdk.org/archives/dev/2024-October/307451.html
---
doc/guides/nics/af_packet.rst | 4 ++
drivers/net/af_packet/rte_eth_af_packet.c | 77 ++++++++++++++++++-----
2 files changed, 65 insertions(+), 16 deletions(-)
diff --git a/doc/guides/nics/af_packet.rst b/doc/guides/nics/af_packet.rst
index 66b977e1a2..4bc748b50b 100644
--- a/doc/guides/nics/af_packet.rst
+++ b/doc/guides/nics/af_packet.rst
@@ -29,6 +29,8 @@ Some of these, in turn, will be used to configure the PACKET_MMAP settings.
* ``framesz`` - PACKET_MMAP frame size (optional, default 2048B; Note: multiple
of 16B);
* ``framecnt`` - PACKET_MMAP frame count (optional, default 512).
+* ``rollover`` - disable(0)/enable(1) PACKET_FANOUT_ROLLOVER (optional, default 1).
+* ``defrag`` - disable(0)/enable(1) PACKET_FANOUT_FLAG_DEFRAG (optional, default 1).
Because this implementation is based on PACKET_MMAP, and PACKET_MMAP has its
own pre-requisites, it should be noted that the inner workings of PACKET_MMAP
@@ -47,6 +49,8 @@ For the full details behind PACKET_MMAP's structures and settings, consider
reading the `PACKET_MMAP documentation in the Kernel
<https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt>`_.
+For details of ``rollover`` anb ``defrag`` refer to the *packet(7)* man page.
+
Prerequisites
-------------
diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 68870dd3e2..1a043e8398 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -36,6 +36,8 @@
#define ETH_AF_PACKET_FRAMESIZE_ARG "framesz"
#define ETH_AF_PACKET_FRAMECOUNT_ARG "framecnt"
#define ETH_AF_PACKET_QDISC_BYPASS_ARG "qdisc_bypass"
+#define ETH_AF_PACKET_ROLLOVER "rollover"
+#define ETH_AF_PACKET_DEFRAG "defrag"
#define DFLT_FRAME_SIZE (1 << 11)
#define DFLT_FRAME_COUNT (1 << 9)
@@ -91,6 +93,8 @@ static const char *valid_arguments[] = {
ETH_AF_PACKET_FRAMESIZE_ARG,
ETH_AF_PACKET_FRAMECOUNT_ARG,
ETH_AF_PACKET_QDISC_BYPASS_ARG,
+ ETH_AF_PACKET_ROLLOVER,
+ ETH_AF_PACKET_DEFRAG,
NULL
};
@@ -659,6 +663,20 @@ open_packet_iface(const char *key __rte_unused,
return 0;
}
+static int
+uint_arg_parse(const char *arg_str, unsigned int *arg_val)
+{
+ unsigned long val;
+ char *endptr;
+
+ val = strtoul(arg_str, &endptr, 10);
+ if (*arg_str == '\0' || *endptr != '\0')
+ return -EINVAL;
+
+ *arg_val = val;
+ return 0;
+}
+
static int
rte_pmd_init_internals(struct rte_vdev_device *dev,
const int sockfd,
@@ -676,6 +694,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
const unsigned int numa_node = dev->device.numa_node;
struct rte_eth_dev_data *data = NULL;
struct rte_kvargs_pair *pair = NULL;
+ const char *ifname = NULL;
struct ifreq ifr;
size_t ifnamelen;
unsigned k_idx;
@@ -686,6 +705,8 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
int rc, tpver, discard;
int qsockfd = -1;
unsigned int i, q, rdsize;
+ unsigned int rollover = 1;
+ unsigned int defrag = 1;
#if defined(PACKET_FANOUT)
int fanout_arg;
#endif
@@ -693,9 +714,30 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
pair = &kvlist->pairs[k_idx];
if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL)
- break;
+ ifname = pair->value;
+ if (strcmp(pair->key, ETH_AF_PACKET_ROLLOVER) == 0) {
+ rc = uint_arg_parse(pair->value, &rollover);
+ if (rc || rollover > 1) {
+ PMD_LOG(ERR,
+ "%s: invalid rollover value: %s",
+ name, pair->value);
+ return -1;
+ }
+ continue;
+ }
+ if (strcmp(pair->key, ETH_AF_PACKET_DEFRAG) == 0) {
+ rc = uint_arg_parse(pair->value, &defrag);
+ if (rc || defrag > 1) {
+ PMD_LOG(ERR,
+ "%s: invalid defrag value: %s",
+ name, pair->value);
+ return -1;
+ }
+ continue;
+ }
}
- if (pair == NULL) {
+
+ if (ifname == NULL) {
PMD_LOG(ERR,
"%s: no interface specified for AF_PACKET ethdev",
name);
@@ -738,21 +780,21 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
req->tp_frame_size = framesize;
req->tp_frame_nr = framecnt;
- ifnamelen = strlen(pair->value);
+ ifnamelen = strlen(ifname);
if (ifnamelen < sizeof(ifr.ifr_name)) {
- memcpy(ifr.ifr_name, pair->value, ifnamelen);
+ memcpy(ifr.ifr_name, ifname, ifnamelen);
ifr.ifr_name[ifnamelen] = '\0';
} else {
PMD_LOG(ERR,
"%s: I/F name too long (%s)",
- name, pair->value);
+ name, ifname);
goto free_internals;
}
if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) {
PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name);
goto free_internals;
}
- (*internals)->if_name = strdup(pair->value);
+ (*internals)->if_name = strdup(ifname);
if ((*internals)->if_name == NULL)
goto free_internals;
(*internals)->if_index = ifr.ifr_ifindex;
@@ -770,9 +812,12 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
#if defined(PACKET_FANOUT)
fanout_arg = (getpid() ^ (*internals)->if_index) & 0xffff;
- fanout_arg |= (PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16;
+ fanout_arg |= PACKET_FANOUT_HASH << 16;
+ if (defrag)
+ fanout_arg |= PACKET_FANOUT_FLAG_DEFRAG << 16;
#if defined(PACKET_FANOUT_FLAG_ROLLOVER)
- fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
+ if (rollover)
+ fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
#endif
#endif
@@ -792,7 +837,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
if (rc == -1) {
PMD_LOG_ERRNO(ERR,
"%s: could not set PACKET_VERSION on AF_PACKET socket for %s",
- name, pair->value);
+ name, ifname);
goto error;
}
@@ -802,7 +847,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
if (rc == -1) {
PMD_LOG_ERRNO(ERR,
"%s: could not set PACKET_LOSS on AF_PACKET socket for %s",
- name, pair->value);
+ name, ifname);
goto error;
}
@@ -813,7 +858,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
if (rc == -1) {
PMD_LOG_ERRNO(ERR,
"%s: could not set PACKET_QDISC_BYPASS on AF_PACKET socket for %s",
- name, pair->value);
+ name, ifname);
goto error;
}
#endif
@@ -823,7 +868,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
if (rc == -1) {
PMD_LOG_ERRNO(ERR,
"%s: could not set PACKET_RX_RING on AF_PACKET socket for %s",
- name, pair->value);
+ name, ifname);
goto error;
}
@@ -831,7 +876,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
if (rc == -1) {
PMD_LOG_ERRNO(ERR,
"%s: could not set PACKET_TX_RING on AF_PACKET "
- "socket for %s", name, pair->value);
+ "socket for %s", name, ifname);
goto error;
}
@@ -844,7 +889,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
if (rx_queue->map == MAP_FAILED) {
PMD_LOG_ERRNO(ERR,
"%s: call to mmap failed on AF_PACKET socket for %s",
- name, pair->value);
+ name, ifname);
goto error;
}
@@ -881,7 +926,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
if (rc == -1) {
PMD_LOG_ERRNO(ERR,
"%s: could not bind AF_PACKET socket to %s",
- name, pair->value);
+ name, ifname);
goto error;
}
@@ -891,7 +936,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
if (rc == -1) {
PMD_LOG_ERRNO(ERR,
"%s: could not set PACKET_FANOUT on AF_PACKET socket for %s",
- name, pair->value);
+ name, ifname);
goto error;
}
#endif
base-commit: 98613d32e3dac58d685f4f236cf8cc9733abaaf3
--
2.40.1
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH v02] net/af_packet: add rollover and defrag options
2024-10-31 13:50 [PATCH v02] net/af_packet: add rollover and defrag options Gur Stavi
@ 2024-11-01 22:49 ` Ferruh Yigit
2024-11-03 7:27 ` Gur Stavi
0 siblings, 1 reply; 3+ messages in thread
From: Ferruh Yigit @ 2024-11-01 22:49 UTC (permalink / raw)
To: Gur Stavi; +Cc: dev, John W. Linville, Stephen Hemminger
On 10/31/2024 1:50 PM, Gur Stavi wrote:
> net_af_packet PMD multi "queue" support relies on Linux FANOUT capability.
> Linux FANOUT is a SW based load balancer that is similar to HW RSS which
> is more common for DPDK PMDs. Instead of multiple HW descriptor queues,
> AF PACKET uses multiple sockets.
> HW RSS will typically drop a packet if its selected RX queue is empty.
> However, Linux FANOUT, as a SW load balancer, can be configured to avoid
> this packet drop by rolling over to the next socket.
> This rollover functionality was ALWAYS enabled in net_af_packet. It is
> surrounded by ifdef, but only to allow compilation on ancient Linux
> versions that did not have it.
>
> Since DPDK applications are usually designed for HW based PMDs, this
> rollover functionality, which the developers are likely unaware of, could
> be confusing.
>
> Another option that is part of Linux FANOUT is DEFRAG that instructs Linux
> to compose complete IP packet out of fragments before delivering it to the
> PACKET socket. Again, this behavior typically does not exist for HW based
> PMDs and may confuse users.
>
> This patch adds 2 options to control these features:
> rollover=[0|1],defrag=[0|1]
> For backward compatibility both features are enabled by default even
> though most users will probably want both of them disabled.
>
> Signed-off-by: Gur Stavi <gur.stavi@huawei.com>
> ---
> v2:
> * Improve error handling for parsing of arg strings. Add a wrapper around
> strtoul to replace atoi.
> * Fix checkpatch indentation error due to copy-paste.
>
> v1: https://mails.dpdk.org/archives/dev/2024-October/307451.html
> ---
> doc/guides/nics/af_packet.rst | 4 ++
> drivers/net/af_packet/rte_eth_af_packet.c | 77 ++++++++++++++++++-----
> 2 files changed, 65 insertions(+), 16 deletions(-)
>
> diff --git a/doc/guides/nics/af_packet.rst b/doc/guides/nics/af_packet.rst
> index 66b977e1a2..4bc748b50b 100644
> --- a/doc/guides/nics/af_packet.rst
> +++ b/doc/guides/nics/af_packet.rst
> @@ -29,6 +29,8 @@ Some of these, in turn, will be used to configure the PACKET_MMAP settings.
> * ``framesz`` - PACKET_MMAP frame size (optional, default 2048B; Note: multiple
> of 16B);
> * ``framecnt`` - PACKET_MMAP frame count (optional, default 512).
> +* ``rollover`` - disable(0)/enable(1) PACKET_FANOUT_ROLLOVER (optional, default 1).
> +* ``defrag`` - disable(0)/enable(1) PACKET_FANOUT_FLAG_DEFRAG (optional, default 1).
>
+1 to introduce both options, thanks.
Do you have any practical usecase to disable them?
> Because this implementation is based on PACKET_MMAP, and PACKET_MMAP has its
> own pre-requisites, it should be noted that the inner workings of PACKET_MMAP
> @@ -47,6 +49,8 @@ For the full details behind PACKET_MMAP's structures and settings, consider
> reading the `PACKET_MMAP documentation in the Kernel
> <https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt>`_.
>
> +For details of ``rollover`` anb ``defrag`` refer to the *packet(7)* man page.
> +
> Prerequisites
> -------------
>
> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
> index 68870dd3e2..1a043e8398 100644
> --- a/drivers/net/af_packet/rte_eth_af_packet.c
> +++ b/drivers/net/af_packet/rte_eth_af_packet.c
> @@ -36,6 +36,8 @@
> #define ETH_AF_PACKET_FRAMESIZE_ARG "framesz"
> #define ETH_AF_PACKET_FRAMECOUNT_ARG "framecnt"
> #define ETH_AF_PACKET_QDISC_BYPASS_ARG "qdisc_bypass"
> +#define ETH_AF_PACKET_ROLLOVER "rollover"
> +#define ETH_AF_PACKET_DEFRAG "defrag"
>
Can you please add new arguments to 'RTE_PMD_REGISTER_PARAM_STRING'
macro below?
> #define DFLT_FRAME_SIZE (1 << 11)
> #define DFLT_FRAME_COUNT (1 << 9)
> @@ -91,6 +93,8 @@ static const char *valid_arguments[] = {
> ETH_AF_PACKET_FRAMESIZE_ARG,
> ETH_AF_PACKET_FRAMECOUNT_ARG,
> ETH_AF_PACKET_QDISC_BYPASS_ARG,
> + ETH_AF_PACKET_ROLLOVER,
> + ETH_AF_PACKET_DEFRAG,
> NULL
> };
>
> @@ -659,6 +663,20 @@ open_packet_iface(const char *key __rte_unused,
> return 0;
> }
>
> +static int
> +uint_arg_parse(const char *arg_str, unsigned int *arg_val)
> +{
> + unsigned long val;
> + char *endptr;
> +
> + val = strtoul(arg_str, &endptr, 10);
> + if (*arg_str == '\0' || *endptr != '\0')
> + return -EINVAL;
> +
> + *arg_val = val;
> + return 0;
> +}
> +
> static int
> rte_pmd_init_internals(struct rte_vdev_device *dev,
> const int sockfd,
> @@ -676,6 +694,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> const unsigned int numa_node = dev->device.numa_node;
> struct rte_eth_dev_data *data = NULL;
> struct rte_kvargs_pair *pair = NULL;
> + const char *ifname = NULL;
> struct ifreq ifr;
> size_t ifnamelen;
> unsigned k_idx;
> @@ -686,6 +705,8 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> int rc, tpver, discard;
> int qsockfd = -1;
> unsigned int i, q, rdsize;
> + unsigned int rollover = 1;
> + unsigned int defrag = 1;
> #if defined(PACKET_FANOUT)
> int fanout_arg;
> #endif
> @@ -693,9 +714,30 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
> pair = &kvlist->pairs[k_idx];
> if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL)
> - break;
> + ifname = pair->value;
> + if (strcmp(pair->key, ETH_AF_PACKET_ROLLOVER) == 0) {
> + rc = uint_arg_parse(pair->value, &rollover);
> + if (rc || rollover > 1) {
> + PMD_LOG(ERR,
> + "%s: invalid rollover value: %s",
> + name, pair->value);
> + return -1;
> + }
> + continue;
> + }
> + if (strcmp(pair->key, ETH_AF_PACKET_DEFRAG) == 0) {
> + rc = uint_arg_parse(pair->value, &defrag);
> + if (rc || defrag > 1) {
> + PMD_LOG(ERR,
> + "%s: invalid defrag value: %s",
> + name, pair->value);
> + return -1;
> + }
> + continue;
> + }
>
Instead of adding parameter parsing here, they are already in
'rte_eth_from_packet()' function, better to add there,
BUT, before adding more paramter, can you please fix the parameter
parsing in the af_packet PMD? This is in my list for a long time now.
Instead of accessing to 'kvlist' internals (kvlist->count,
kvlist->pairs[]), and use 'strstr' (or 'strcmp'), there is already
existing 'rte_kvargs_process()' API for it, you can find multiple
samples in other drivers.
> }
> - if (pair == NULL) {
> +
> + if (ifname == NULL) {
> PMD_LOG(ERR,
> "%s: no interface specified for AF_PACKET ethdev",
> name);
> @@ -738,21 +780,21 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> req->tp_frame_size = framesize;
> req->tp_frame_nr = framecnt;
>
> - ifnamelen = strlen(pair->value);
> + ifnamelen = strlen(ifname);
> if (ifnamelen < sizeof(ifr.ifr_name)) {
> - memcpy(ifr.ifr_name, pair->value, ifnamelen);
> + memcpy(ifr.ifr_name, ifname, ifnamelen);
> ifr.ifr_name[ifnamelen] = '\0';
> } else {
> PMD_LOG(ERR,
> "%s: I/F name too long (%s)",
> - name, pair->value);
> + name, ifname);
> goto free_internals;
> }
> if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) {
> PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name);
> goto free_internals;
> }
> - (*internals)->if_name = strdup(pair->value);
> + (*internals)->if_name = strdup(ifname);
> if ((*internals)->if_name == NULL)
> goto free_internals;
> (*internals)->if_index = ifr.ifr_ifindex;
> @@ -770,9 +812,12 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
>
> #if defined(PACKET_FANOUT)
> fanout_arg = (getpid() ^ (*internals)->if_index) & 0xffff;
> - fanout_arg |= (PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16;
> + fanout_arg |= PACKET_FANOUT_HASH << 16;
> + if (defrag)
> + fanout_arg |= PACKET_FANOUT_FLAG_DEFRAG << 16;
> #if defined(PACKET_FANOUT_FLAG_ROLLOVER)
> - fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
> + if (rollover)
> + fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
> #endif
> #endif
>
> @@ -792,7 +837,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> if (rc == -1) {
> PMD_LOG_ERRNO(ERR,
> "%s: could not set PACKET_VERSION on AF_PACKET socket for %s",
> - name, pair->value);
> + name, ifname);
> goto error;
> }
>
> @@ -802,7 +847,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> if (rc == -1) {
> PMD_LOG_ERRNO(ERR,
> "%s: could not set PACKET_LOSS on AF_PACKET socket for %s",
> - name, pair->value);
> + name, ifname);
> goto error;
> }
>
> @@ -813,7 +858,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> if (rc == -1) {
> PMD_LOG_ERRNO(ERR,
> "%s: could not set PACKET_QDISC_BYPASS on AF_PACKET socket for %s",
> - name, pair->value);
> + name, ifname);
> goto error;
> }
> #endif
> @@ -823,7 +868,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> if (rc == -1) {
> PMD_LOG_ERRNO(ERR,
> "%s: could not set PACKET_RX_RING on AF_PACKET socket for %s",
> - name, pair->value);
> + name, ifname);
> goto error;
> }
>
> @@ -831,7 +876,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> if (rc == -1) {
> PMD_LOG_ERRNO(ERR,
> "%s: could not set PACKET_TX_RING on AF_PACKET "
> - "socket for %s", name, pair->value);
> + "socket for %s", name, ifname);
> goto error;
> }
>
> @@ -844,7 +889,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> if (rx_queue->map == MAP_FAILED) {
> PMD_LOG_ERRNO(ERR,
> "%s: call to mmap failed on AF_PACKET socket for %s",
> - name, pair->value);
> + name, ifname);
> goto error;
> }
>
> @@ -881,7 +926,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> if (rc == -1) {
> PMD_LOG_ERRNO(ERR,
> "%s: could not bind AF_PACKET socket to %s",
> - name, pair->value);
> + name, ifname);
> goto error;
> }
>
> @@ -891,7 +936,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> if (rc == -1) {
> PMD_LOG_ERRNO(ERR,
> "%s: could not set PACKET_FANOUT on AF_PACKET socket for %s",
> - name, pair->value);
> + name, ifname);
> goto error;
> }
> #endif
>
> base-commit: 98613d32e3dac58d685f4f236cf8cc9733abaaf3
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* RE: [PATCH v02] net/af_packet: add rollover and defrag options
2024-11-01 22:49 ` Ferruh Yigit
@ 2024-11-03 7:27 ` Gur Stavi
0 siblings, 0 replies; 3+ messages in thread
From: Gur Stavi @ 2024-11-03 7:27 UTC (permalink / raw)
To: 'Ferruh Yigit'; +Cc: dev, John W. Linville, Stephen Hemminger
> > net_af_packet PMD multi "queue" support relies on Linux FANOUT
> capability.
> > Linux FANOUT is a SW based load balancer that is similar to HW RSS
> which
> > is more common for DPDK PMDs. Instead of multiple HW descriptor queues,
> > AF PACKET uses multiple sockets.
> > HW RSS will typically drop a packet if its selected RX queue is empty.
> > However, Linux FANOUT, as a SW load balancer, can be configured to
> avoid
> > this packet drop by rolling over to the next socket.
> > This rollover functionality was ALWAYS enabled in net_af_packet. It is
> > surrounded by ifdef, but only to allow compilation on ancient Linux
> > versions that did not have it.
> >
> > Since DPDK applications are usually designed for HW based PMDs, this
> > rollover functionality, which the developers are likely unaware of,
> could
> > be confusing.
> >
> > Another option that is part of Linux FANOUT is DEFRAG that instructs
> Linux
> > to compose complete IP packet out of fragments before delivering it to
> the
> > PACKET socket. Again, this behavior typically does not exist for HW
> based
> > PMDs and may confuse users.
> >
> > This patch adds 2 options to control these features:
> > rollover=[0|1],defrag=[0|1]
> > For backward compatibility both features are enabled by default even
> > though most users will probably want both of them disabled.
> >
> > Signed-off-by: Gur Stavi <gur.stavi@huawei.com>
> > ---
> > v2:
> > * Improve error handling for parsing of arg strings. Add a wrapper
> around
> > strtoul to replace atoi.
> > * Fix checkpatch indentation error due to copy-paste.
> >
> > v1: https://mails.dpdk.org/archives/dev/2024-October/307451.html
> > ---
> > doc/guides/nics/af_packet.rst | 4 ++
> > drivers/net/af_packet/rte_eth_af_packet.c | 77 ++++++++++++++++++-----
> > 2 files changed, 65 insertions(+), 16 deletions(-)
> >
> > diff --git a/doc/guides/nics/af_packet.rst
> b/doc/guides/nics/af_packet.rst
> > index 66b977e1a2..4bc748b50b 100644
> > --- a/doc/guides/nics/af_packet.rst
> > +++ b/doc/guides/nics/af_packet.rst
> > @@ -29,6 +29,8 @@ Some of these, in turn, will be used to configure the
> PACKET_MMAP settings.
> > * ``framesz`` - PACKET_MMAP frame size (optional, default 2048B;
> Note: multiple
> > of 16B);
> > * ``framecnt`` - PACKET_MMAP frame count (optional, default 512).
> > +* ``rollover`` - disable(0)/enable(1) PACKET_FANOUT_ROLLOVER
> (optional, default 1).
> > +* ``defrag`` - disable(0)/enable(1) PACKET_FANOUT_FLAG_DEFRAG
> (optional, default 1).
> >
>
> +1 to introduce both options, thanks.
> Do you have any practical usecase to disable them?
Not a very specific use case, but generic though experiments:
A DPDK should assume that all packets of a UDP flow will arrive to a
specific queue/core (or be dropped). It will maintain a
non-synchronized SW state for that flow only accessed by the designated
core. Having rollover deliver a flow packet to another queue/core could
cause various buggy behaviors over this state.
IP fragmentation is strongly discouraged. An app may want to explicitly
Ignore/detect fragments to help system maintainers identify offenders and
Fix the configuration that leads to fragmentation. Having Linux "secretly"
handling fragmentation can prevent this.
>
>
> > Because this implementation is based on PACKET_MMAP, and PACKET_MMAP
> has its
> > own pre-requisites, it should be noted that the inner workings of
> PACKET_MMAP
> > @@ -47,6 +49,8 @@ For the full details behind PACKET_MMAP's structures
> and settings, consider
> > reading the `PACKET_MMAP documentation in the Kernel
> >
> <https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt>`_.
> >
> > +For details of ``rollover`` anb ``defrag`` refer to the *packet(7)*
> man page.
> > +
> > Prerequisites
> > -------------
> >
> > diff --git a/drivers/net/af_packet/rte_eth_af_packet.c
> b/drivers/net/af_packet/rte_eth_af_packet.c
> > index 68870dd3e2..1a043e8398 100644
> > --- a/drivers/net/af_packet/rte_eth_af_packet.c
> > +++ b/drivers/net/af_packet/rte_eth_af_packet.c
> > @@ -36,6 +36,8 @@
> > #define ETH_AF_PACKET_FRAMESIZE_ARG "framesz"
> > #define ETH_AF_PACKET_FRAMECOUNT_ARG "framecnt"
> > #define ETH_AF_PACKET_QDISC_BYPASS_ARG "qdisc_bypass"
> > +#define ETH_AF_PACKET_ROLLOVER "rollover"
> > +#define ETH_AF_PACKET_DEFRAG "defrag"
> >
>
> Can you please add new arguments to 'RTE_PMD_REGISTER_PARAM_STRING'
> macro below?
>
> > #define DFLT_FRAME_SIZE (1 << 11)
> > #define DFLT_FRAME_COUNT (1 << 9)
> > @@ -91,6 +93,8 @@ static const char *valid_arguments[] = {
> > ETH_AF_PACKET_FRAMESIZE_ARG,
> > ETH_AF_PACKET_FRAMECOUNT_ARG,
> > ETH_AF_PACKET_QDISC_BYPASS_ARG,
> > + ETH_AF_PACKET_ROLLOVER,
> > + ETH_AF_PACKET_DEFRAG,
> > NULL
> > };
> >
> > @@ -659,6 +663,20 @@ open_packet_iface(const char *key __rte_unused,
> > return 0;
> > }
> >
> > +static int
> > +uint_arg_parse(const char *arg_str, unsigned int *arg_val)
> > +{
> > + unsigned long val;
> > + char *endptr;
> > +
> > + val = strtoul(arg_str, &endptr, 10);
> > + if (*arg_str == '\0' || *endptr != '\0')
> > + return -EINVAL;
> > +
> > + *arg_val = val;
> > + return 0;
> > +}
> > +
> > static int
> > rte_pmd_init_internals(struct rte_vdev_device *dev,
> > const int sockfd,
> > @@ -676,6 +694,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > const unsigned int numa_node = dev->device.numa_node;
> > struct rte_eth_dev_data *data = NULL;
> > struct rte_kvargs_pair *pair = NULL;
> > + const char *ifname = NULL;
> > struct ifreq ifr;
> > size_t ifnamelen;
> > unsigned k_idx;
> > @@ -686,6 +705,8 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > int rc, tpver, discard;
> > int qsockfd = -1;
> > unsigned int i, q, rdsize;
> > + unsigned int rollover = 1;
> > + unsigned int defrag = 1;
> > #if defined(PACKET_FANOUT)
> > int fanout_arg;
> > #endif
> > @@ -693,9 +714,30 @@ rte_pmd_init_internals(struct rte_vdev_device
> *dev,
> > for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
> > pair = &kvlist->pairs[k_idx];
> > if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL)
> > - break;
> > + ifname = pair->value;
> > + if (strcmp(pair->key, ETH_AF_PACKET_ROLLOVER) == 0) {
> > + rc = uint_arg_parse(pair->value, &rollover);
> > + if (rc || rollover > 1) {
> > + PMD_LOG(ERR,
> > + "%s: invalid rollover value: %s",
> > + name, pair->value);
> > + return -1;
> > + }
> > + continue;
> > + }
> > + if (strcmp(pair->key, ETH_AF_PACKET_DEFRAG) == 0) {
> > + rc = uint_arg_parse(pair->value, &defrag);
> > + if (rc || defrag > 1) {
> > + PMD_LOG(ERR,
> > + "%s: invalid defrag value: %s",
> > + name, pair->value);
> > + return -1;
> > + }
> > + continue;
> > + }
> >
>
> Instead of adding parameter parsing here, they are already in
> 'rte_eth_from_packet()' function, better to add there,
>
> BUT, before adding more paramter, can you please fix the parameter
> parsing in the af_packet PMD? This is in my list for a long time now.
>
> Instead of accessing to 'kvlist' internals (kvlist->count,
> kvlist->pairs[]), and use 'strstr' (or 'strcmp'), there is already
> existing 'rte_kvargs_process()' API for it, you can find multiple
> samples in other drivers.
>
>
> > }
> > - if (pair == NULL) {
> > +
> > + if (ifname == NULL) {
> > PMD_LOG(ERR,
> > "%s: no interface specified for AF_PACKET ethdev",
> > name);
> > @@ -738,21 +780,21 @@ rte_pmd_init_internals(struct rte_vdev_device
> *dev,
> > req->tp_frame_size = framesize;
> > req->tp_frame_nr = framecnt;
> >
> > - ifnamelen = strlen(pair->value);
> > + ifnamelen = strlen(ifname);
> > if (ifnamelen < sizeof(ifr.ifr_name)) {
> > - memcpy(ifr.ifr_name, pair->value, ifnamelen);
> > + memcpy(ifr.ifr_name, ifname, ifnamelen);
> > ifr.ifr_name[ifnamelen] = '\0';
> > } else {
> > PMD_LOG(ERR,
> > "%s: I/F name too long (%s)",
> > - name, pair->value);
> > + name, ifname);
> > goto free_internals;
> > }
> > if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) {
> > PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name);
> > goto free_internals;
> > }
> > - (*internals)->if_name = strdup(pair->value);
> > + (*internals)->if_name = strdup(ifname);
> > if ((*internals)->if_name == NULL)
> > goto free_internals;
> > (*internals)->if_index = ifr.ifr_ifindex;
> > @@ -770,9 +812,12 @@ rte_pmd_init_internals(struct rte_vdev_device
> *dev,
> >
> > #if defined(PACKET_FANOUT)
> > fanout_arg = (getpid() ^ (*internals)->if_index) & 0xffff;
> > - fanout_arg |= (PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) <<
> 16;
> > + fanout_arg |= PACKET_FANOUT_HASH << 16;
> > + if (defrag)
> > + fanout_arg |= PACKET_FANOUT_FLAG_DEFRAG << 16;
> > #if defined(PACKET_FANOUT_FLAG_ROLLOVER)
> > - fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
> > + if (rollover)
> > + fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
> > #endif
> > #endif
> >
> > @@ -792,7 +837,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > if (rc == -1) {
> > PMD_LOG_ERRNO(ERR,
> > "%s: could not set PACKET_VERSION on AF_PACKET
> socket for %s",
> > - name, pair->value);
> > + name, ifname);
> > goto error;
> > }
> >
> > @@ -802,7 +847,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > if (rc == -1) {
> > PMD_LOG_ERRNO(ERR,
> > "%s: could not set PACKET_LOSS on AF_PACKET socket
> for %s",
> > - name, pair->value);
> > + name, ifname);
> > goto error;
> > }
> >
> > @@ -813,7 +858,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > if (rc == -1) {
> > PMD_LOG_ERRNO(ERR,
> > "%s: could not set PACKET_QDISC_BYPASS on
> AF_PACKET socket for %s",
> > - name, pair->value);
> > + name, ifname);
> > goto error;
> > }
> > #endif
> > @@ -823,7 +868,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > if (rc == -1) {
> > PMD_LOG_ERRNO(ERR,
> > "%s: could not set PACKET_RX_RING on AF_PACKET
> socket for %s",
> > - name, pair->value);
> > + name, ifname);
> > goto error;
> > }
> >
> > @@ -831,7 +876,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > if (rc == -1) {
> > PMD_LOG_ERRNO(ERR,
> > "%s: could not set PACKET_TX_RING on AF_PACKET "
> > - "socket for %s", name, pair->value);
> > + "socket for %s", name, ifname);
> > goto error;
> > }
> >
> > @@ -844,7 +889,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > if (rx_queue->map == MAP_FAILED) {
> > PMD_LOG_ERRNO(ERR,
> > "%s: call to mmap failed on AF_PACKET socket for
> %s",
> > - name, pair->value);
> > + name, ifname);
> > goto error;
> > }
> >
> > @@ -881,7 +926,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > if (rc == -1) {
> > PMD_LOG_ERRNO(ERR,
> > "%s: could not bind AF_PACKET socket to %s",
> > - name, pair->value);
> > + name, ifname);
> > goto error;
> > }
> >
> > @@ -891,7 +936,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
> > if (rc == -1) {
> > PMD_LOG_ERRNO(ERR,
> > "%s: could not set PACKET_FANOUT on AF_PACKET
> socket for %s",
> > - name, pair->value);
> > + name, ifname);
> > goto error;
> > }
> > #endif
> >
> > base-commit: 98613d32e3dac58d685f4f236cf8cc9733abaaf3
> > --
> > 2.40.1
> >
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2024-11-03 7:27 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-10-31 13:50 [PATCH v02] net/af_packet: add rollover and defrag options Gur Stavi
2024-11-01 22:49 ` Ferruh Yigit
2024-11-03 7:27 ` Gur Stavi
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).