* [PATCH v1] net/mlx5: fix probe optimization race condition
@ 2025-08-28 3:21 Rongwei Liu
2025-08-28 4:40 ` Stephen Hemminger
0 siblings, 1 reply; 4+ messages in thread
From: Rongwei Liu @ 2025-08-28 3:21 UTC (permalink / raw)
To: dev, matan, viacheslavo, orika, suanmingm, thomas
Cc: rongweil, stable, Dariusz Sosnowski, Bing Zhao
With dedicated RDMA link monitor, there are two threads
which can update the IB device port information.
Add a new flag to avoid the race condition. Update should
go through RDMA link monitor once ready.
Fixes: 51fb5c40c826 ("net/mlx5: optimize device probing")
Cc: rongweil@nvidia.com
Cc: stable@dpdk.org
Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
drivers/common/mlx5/linux/mlx5_nl.c | 7 ++-
drivers/common/mlx5/mlx5_common.h | 1 +
drivers/net/mlx5/linux/mlx5_ethdev_os.c | 69 ++++---------------------
drivers/net/mlx5/linux/mlx5_os.c | 9 +++-
4 files changed, 25 insertions(+), 61 deletions(-)
diff --git a/drivers/common/mlx5/linux/mlx5_nl.c b/drivers/common/mlx5/linux/mlx5_nl.c
index dd69e229e3..84c12efdc7 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.c
+++ b/drivers/common/mlx5/linux/mlx5_nl.c
@@ -1171,8 +1171,12 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex, struct mlx5_dev_info
data.ibindex = dev_info->ibindex;
}
+ /* Update should be done via monitor thread to avoid race condition */
+ if (dev_info->async_mon_ready) {
+ rte_errno = ENODEV;
+ return 0;
+ }
ret = mlx5_nl_port_info(nl, pindex, &data);
-
if (dev_info->probe_opt && !strcmp(dev_info->ibname, name)) {
if ((!ret || ret == -ENODEV) && dev_info->port_info &&
pindex <= dev_info->port_num) {
@@ -1182,7 +1186,6 @@ mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex, struct mlx5_dev_info
dev_info->port_info[pindex].valid = 1;
}
}
-
return ret ? 0 : data.ifindex;
}
diff --git a/drivers/common/mlx5/mlx5_common.h b/drivers/common/mlx5/mlx5_common.h
index bea1382911..b49f0c850e 100644
--- a/drivers/common/mlx5/mlx5_common.h
+++ b/drivers/common/mlx5/mlx5_common.h
@@ -185,6 +185,7 @@ struct mlx5_dev_info {
uint32_t ibindex;
char ibname[MLX5_FS_NAME_MAX];
uint8_t probe_opt;
+ uint8_t async_mon_ready;
struct mlx5_port_nl_info *port_info;
};
diff --git a/drivers/net/mlx5/linux/mlx5_ethdev_os.c b/drivers/net/mlx5/linux/mlx5_ethdev_os.c
index a371c2c747..180fd60f3a 100644
--- a/drivers/net/mlx5/linux/mlx5_ethdev_os.c
+++ b/drivers/net/mlx5/linux/mlx5_ethdev_os.c
@@ -704,59 +704,6 @@ mlx5_link_update_bond(struct rte_eth_dev *dev)
((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING));
}
-static void
-mlx5_handle_port_info_update(struct mlx5_dev_info *dev_info, uint32_t if_index,
- uint16_t msg_type)
-{
- struct mlx5_switch_info info = {
- .master = 0,
- .representor = 0,
- .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
- .port_name = 0,
- .switch_id = 0,
- };
- uint32_t i;
- int nl_route;
-
- if (dev_info->port_num <= 1 || dev_info->port_info == NULL)
- return;
-
- DRV_LOG(DEBUG, "IB device %s ifindex %u received netlink event %u",
- dev_info->ibname, if_index, msg_type);
- for (i = 1; i <= dev_info->port_num; i++) {
- if (!dev_info->port_info[i].valid)
- continue;
- if (dev_info->port_info[i].ifindex == if_index)
- break;
- }
- if (msg_type == RTM_NEWLINK && i > dev_info->port_num) {
- nl_route = mlx5_nl_init(NETLINK_ROUTE, 0);
- if (nl_route < 0)
- goto flush_all;
-
- if (mlx5_nl_switch_info(nl_route, if_index, &info)) {
- if (mlx5_sysfs_switch_info(if_index, &info))
- goto flush_all;
- }
-
- if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_PFSF ||
- info.name_type == MLX5_PHYS_PORT_NAME_TYPE_PFVF)
- goto flush_all;
- close(nl_route);
- } else if (msg_type == RTM_DELLINK && i <= dev_info->port_num) {
- memset(dev_info->port_info + i, 0, sizeof(struct mlx5_port_nl_info));
- }
-
- return;
-flush_all:
- if (nl_route >= 0)
- close(nl_route);
- for (i = 1; i <= dev_info->port_num; i++) {
- if (!dev_info->port_info[i].ifindex)
- dev_info->port_info[i].valid = 0;
- }
-}
-
static void
mlx5_dev_interrupt_nl_cb(struct nlmsghdr *hdr, void *cb_arg)
{
@@ -766,8 +713,6 @@ mlx5_dev_interrupt_nl_cb(struct nlmsghdr *hdr, void *cb_arg)
if (mlx5_nl_parse_link_status_update(hdr, &if_index) < 0)
return;
- if (sh->cdev->config.probe_opt && sh->cdev->dev_info.port_num > 1 && !sh->rdma_monitor_supp)
- mlx5_handle_port_info_update(&sh->cdev->dev_info, if_index, hdr->nlmsg_type);
for (i = 0; i < sh->max_port; i++) {
struct mlx5_dev_shared_port *port = &sh->port[i];
@@ -970,10 +915,18 @@ mlx5_dev_interrupt_handler_ib(void *arg)
return;
if (data.event_type == MLX5_NL_RDMA_NETDEV_ATTACH_EVENT &&
- !(data.flags & MLX5_NL_CMD_GET_NET_INDEX))
+ !(data.flags & MLX5_NL_CMD_GET_NET_INDEX)) {
+ DRV_LOG(WARNING, "Incomplete RDMA ATTACH event for ibdev[%d]",
+ dev_info->ibindex);
+ if (data.flags & MLX5_NL_CMD_GET_PORT_INDEX)
+ memset(dev_info->port_info + data.portnum, 0,
+ sizeof(struct mlx5_port_nl_info));
+ else
+ goto flush_all;
return;
+ }
- DRV_LOG(DEBUG, "Event info: type %d, ibindex %d, ifindex %d, portnum %d,",
+ DRV_LOG(INFO, "Event info: type %d, ibindex %d, ifindex %d, portnum %d,",
data.event_type, data.ibindex, data.ifindex, data.portnum);
/* Changes found in number of SF/VF ports. All information is likely unreliable. */
@@ -992,7 +945,7 @@ mlx5_dev_interrupt_handler_ib(void *arg)
goto flush_all;
}
} else if (data.event_type == MLX5_NL_RDMA_NETDEV_DETACH_EVENT) {
- memset(dev_info->port_info + data.portnum, 0, sizeof(struct mlx5_port_nl_info));
+ dev_info->port_info[data.portnum].ifindex = 0;
}
return;
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 85b3fabaf5..edfe61ea55 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -3051,7 +3051,7 @@ mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
DRV_LOG(ERR, "Failed to allocate intr_handle.");
return;
}
- if (sh->cdev->config.probe_opt &&
+ if (sh->cdev->dev_info.probe_opt &&
sh->cdev->dev_info.port_num > 1 &&
!sh->rdma_monitor_supp) {
nlsk_fd = mlx5_nl_rdma_monitor_init();
@@ -3076,8 +3076,15 @@ mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
close(nlsk_fd);
return;
}
+ sh->cdev->dev_info.async_mon_ready = 1;
} else {
close(nlsk_fd);
+ if (sh->cdev->dev_info.probe_opt) {
+ DRV_LOG(INFO, "Failed to create rdma link monitor, disable probe optimization");
+ sh->cdev->dev_info.probe_opt = 0;
+ mlx5_free(sh->cdev->dev_info.port_info);
+ sh->cdev->dev_info.port_info = NULL;
+ }
}
}
nlsk_fd = mlx5_nl_init(NETLINK_ROUTE, RTMGRP_LINK);
--
2.27.0
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v1] net/mlx5: fix probe optimization race condition
2025-08-28 3:21 [PATCH v1] net/mlx5: fix probe optimization race condition Rongwei Liu
@ 2025-08-28 4:40 ` Stephen Hemminger
2025-08-28 4:49 ` rongwei liu
0 siblings, 1 reply; 4+ messages in thread
From: Stephen Hemminger @ 2025-08-28 4:40 UTC (permalink / raw)
To: Rongwei Liu
Cc: dev, matan, viacheslavo, orika, suanmingm, thomas, stable,
Dariusz Sosnowski, Bing Zhao
On Thu, 28 Aug 2025 06:21:34 +0300
Rongwei Liu <rongweil@nvidia.com> wrote:
> With dedicated RDMA link monitor, there are two threads
> which can update the IB device port information.
>
> Add a new flag to avoid the race condition. Update should
> go through RDMA link monitor once ready.
>
> Fixes: 51fb5c40c826 ("net/mlx5: optimize device probing")
> Cc: rongweil@nvidia.com
> Cc: stable@dpdk.org
> Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
> Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
> ---
If variable is modified (with out locking) on two threads it
needs to atomic or volatile.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v1] net/mlx5: fix probe optimization race condition
2025-08-28 4:40 ` Stephen Hemminger
@ 2025-08-28 4:49 ` rongwei liu
2025-08-28 13:37 ` Thomas Monjalon
0 siblings, 1 reply; 4+ messages in thread
From: rongwei liu @ 2025-08-28 4:49 UTC (permalink / raw)
To: Stephen Hemminger
Cc: dev, matan, viacheslavo, orika, suanmingm, thomas, stable,
Dariusz Sosnowski, Bing Zhao
On 2025/8/28 12:40, Stephen Hemminger wrote:
> On Thu, 28 Aug 2025 06:21:34 +0300
> Rongwei Liu <rongweil@nvidia.com> wrote:
>
>> With dedicated RDMA link monitor, there are two threads
>> which can update the IB device port information.
>>
>> Add a new flag to avoid the race condition. Update should
>> go through RDMA link monitor once ready.
>>
>> Fixes: 51fb5c40c826 ("net/mlx5: optimize device probing")
>> Cc: rongweil@nvidia.com
>> Cc: stable@dpdk.org
>> Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
>> Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
>> ---
>
> If variable is modified (with out locking) on two threads it
> needs to atomic or volatile.
Exactly. Before this patch, it' user responsebility to seperate probe probing and sf manipualtion.
Obviously, customer didn't follow this very well.
Now logic change to:
1. Update all port information in probing thread.
2. Probe thread initiate the dedicated rdma monitor thread. Once ready, all port update will go to this thread.
3. Next port probing won't trigger PMD port information update.
No lock is required then.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v1] net/mlx5: fix probe optimization race condition
2025-08-28 4:49 ` rongwei liu
@ 2025-08-28 13:37 ` Thomas Monjalon
0 siblings, 0 replies; 4+ messages in thread
From: Thomas Monjalon @ 2025-08-28 13:37 UTC (permalink / raw)
To: rongwei liu
Cc: Stephen Hemminger, dev, matan, viacheslavo, orika, suanmingm,
stable, Dariusz Sosnowski, Bing Zhao
28/08/2025 06:49, rongwei liu:
>
> On 2025/8/28 12:40, Stephen Hemminger wrote:
> > On Thu, 28 Aug 2025 06:21:34 +0300
> > Rongwei Liu <rongweil@nvidia.com> wrote:
> >
> >> With dedicated RDMA link monitor, there are two threads
> >> which can update the IB device port information.
> >>
> >> Add a new flag to avoid the race condition. Update should
> >> go through RDMA link monitor once ready.
> >>
> >> Fixes: 51fb5c40c826 ("net/mlx5: optimize device probing")
> >> Cc: rongweil@nvidia.com
> >> Cc: stable@dpdk.org
> >> Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
> >> Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
> >> ---
> >
> > If variable is modified (with out locking) on two threads it
> > needs to atomic or volatile.
> Exactly. Before this patch, it' user responsebility to seperate probe probing and sf manipualtion.
> Obviously, customer didn't follow this very well.
> Now logic change to:
> 1. Update all port information in probing thread.
> 2. Probe thread initiate the dedicated rdma monitor thread. Once ready, all port update will go to this thread.
> 3. Next port probing won't trigger PMD port information update.
>
> No lock is required then.
This is the kind of info we need in a commit log.
Please make a v2, thanks.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2025-08-28 13:38 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-08-28 3:21 [PATCH v1] net/mlx5: fix probe optimization race condition Rongwei Liu
2025-08-28 4:40 ` Stephen Hemminger
2025-08-28 4:49 ` rongwei liu
2025-08-28 13:37 ` Thomas Monjalon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).