DPDK patches and discussions
* [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event
@ 2019-03-06  3:05 Zhaohui (zhaohui, Polestar)
  2019-03-06  6:02 ` Shahaf Shuler
  0 siblings, 1 reply; 6+ messages in thread
From: Zhaohui (zhaohui, Polestar) @ 2019-03-06  3:05 UTC (permalink / raw)
  To: dev, shahafs, yskoh; +Cc: xudingke, chenchanghu, wangyunjian

Hi,
    I think the flow list may be accessed at the same time by two different threads, which may cause errors. Does it need a lock to protect the flow list?

                                                                                                   Thanks
                                                                                                              Yunjian
(gdb) bt
#0 0x00007f54c9641237 in raise () from /usr/lib64/libc.so.6
#1 0x00007f54c9642928 in abort () from /usr/lib64/libc.so.6
#2 0x00000000006a8749 in PAT_abort ()
#3 0x00000000006a588d in patchIllInsHandler ()
#4 <signal handler called>
#5 0x00007f54c6acd2c8 in flow_list_destroy (dev=dev@entry=0xad8940 <rte_eth_devices+16512>, flow=0x1444b1b00, list=0x14455e618) at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2150
#6 0x00007f54c6acfe1b in mlx5_flow_list_flush (dev=0xad8940 <rte_eth_devices+16512>, list=0x14455e618) at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2170
#7 0x00007f54c6ac5cc4 in mlx5_traffic_disable (dev=<optimized out>) at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_trigger.c:384
#8 0x00007f54c6ac637d in mlx5_traffic_restart (dev=0xad8940 <rte_eth_devices+16512>) at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_trigger.c:400
#9 0x00007f54d1db3bba in rte_eth_dev_default_mac_addr_set (port_id=<optimized out>, addr=0x140200f40) at /usr/src/debug/dpdk-18.11/lib/librte_ethdev/rte_ethdev.c:3230
#10 0x00007f54cd8dee81 in mac_address_slaves_update (bonded_eth_dev=bonded_eth_dev@entry=0xad48c0 <rte_eth_devices>) at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:1842
#11 0x00007f54cd8e0c31 in bond_ethdev_lsc_event_callback (port_id=<optimized out>, type=<optimized out>, param=<optimized out>, ret_param=<optimized out>) at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:3070
#12 0x00007f54cd8e117b in bond_ethdev_slave_lsc_delay (cb_arg=0xad48c0 <rte_eth_devices>) at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:2298
#13 0x00007f54d25ebe5f in eal_alarm_callback (arg=<optimized out>) at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_alarm.c:90
#14 0x00007f54d25ea8aa in eal_intr_process_interrupts (nfds=<optimized out>, events=<optimized out>) at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:838
#15 eal_intr_handle_interrupts (totalfds=<optimized out>, pfd=21) at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:885
#16 eal_intr_thread_main (arg=<optimized out>) at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:965
#17 0x00007f54cade6dd5 in start_thread () from /usr/lib64/libpthread.so.0
#18 0x00007f54c970950d in clone () from /usr/lib64/libc.so.6


To solve this problem (core dump), I modified the code as follows (looking forward to your reply):

From: zhaohui8 <zhaohui8@huawei.com>
---
 drivers/net/mlx5/mlx5.c         |  1 +
 drivers/net/mlx5/mlx5.h         |  1 +
 drivers/net/mlx5/mlx5_flow.c    | 33 ++++++++++++++++++++++++++-------
 drivers/net/mlx5/mlx5_trigger.c | 12 +++++++++++-
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 9e5cab1..e8ae816 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1195,6 +1195,7 @@
 			priv->tcf_context = NULL;
 		}
 	}
+	rte_rwlock_init(&priv->flows_rwlock);
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index bc500b2..cb8657c 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -202,6 +202,7 @@ struct priv {
 	unsigned int (*reta_idx)[]; /* RETA index table. */
 	unsigned int reta_idx_n; /* RETA index size. */
 	struct mlx5_drop drop_queue; /* Flow drop queues. */
+	rte_rwlock_t flows_rwlock; /* Flow list lock. */
 	struct mlx5_flows flows; /* RTE Flow rules. */
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 97dc3e1..2c18602 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2121,9 +2121,13 @@ struct rte_flow *
 		 const struct rte_flow_action actions[],
 		 struct rte_flow_error *error)
 {
-	return flow_list_create(dev,
+	struct rte_flow *flow;
+	rte_rwlock_write_lock(&((struct priv *)dev->data->dev_private)->flows_rwlock);
+	flow = flow_list_create(dev,
 				&((struct priv *)dev->data->dev_private)->flows,
 				attr, items, actions, error);
+	rte_rwlock_write_unlock(&((struct priv *)dev->data->dev_private)->flows_rwlock);
+	return flow;
 }
 
 /**
@@ -2235,12 +2239,13 @@ struct rte_flow *
 	struct priv *priv = dev->data->dev_private;
 	struct rte_flow *flow;
 	int ret = 0;
-
+	rte_rwlock_read_lock(&priv->flows_rwlock);
 	TAILQ_FOREACH(flow, &priv->flows, next) {
 		DRV_LOG(DEBUG, "port %u flow %p still referenced",
 			dev->data->port_id, (void *)flow);
 		++ret;
 	}
+	rte_rwlock_read_unlock(&priv->flows_rwlock);
 	return ret;
 }
 
@@ -2320,10 +2325,14 @@ struct rte_flow *
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
 		queue[i] = (*priv->reta_idx)[i];
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_list_create(dev, &priv->ctrl_flows,
 				&attr, items, actions, &error);
-	if (!flow)
+	if (!flow) {
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		return -rte_errno;
+	}
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }
 
@@ -2360,8 +2369,9 @@ struct rte_flow *
 		  struct rte_flow_error *error __rte_unused)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow_list_destroy(dev, &priv->flows, flow);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }
 
@@ -2376,8 +2386,9 @@ struct rte_flow *
 		struct rte_flow_error *error __rte_unused)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }
 
@@ -2729,17 +2740,22 @@ struct rte_flow *
 	ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow);
 	if (ret)
 		goto error;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_fdir_filter_lookup(dev, fdir_flow);
 	if (flow) {
 		rte_errno = EEXIST;
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		goto error;
 	}
 	flow = flow_list_create(dev, &priv->flows, &fdir_flow->attr,
 				fdir_flow->items, fdir_flow->actions, NULL);
-	if (!flow)
+	if (!flow) {
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		goto error;
+	}
 	assert(!flow->fdir);
 	flow->fdir = fdir_flow;
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	DRV_LOG(DEBUG, "port %u created FDIR flow %p",
 		dev->data->port_id, (void *)flow);
 	return 0;
@@ -2773,6 +2789,7 @@ struct rte_flow *
 	ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow);
 	if (ret)
 		return -rte_errno;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_fdir_filter_lookup(dev, &fdir_flow);
 	if (!flow) {
 		rte_errno = ENOENT;
@@ -2781,6 +2798,7 @@ struct rte_flow *
 	flow_list_destroy(dev, &priv->flows, flow);
 	DRV_LOG(DEBUG, "port %u deleted FDIR flow %p",
 		dev->data->port_id, (void *)flow);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }
 
@@ -2817,8 +2835,9 @@ struct rte_flow *
 flow_fdir_filter_flush(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index e2a9bb7..b95c7cf 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -188,12 +188,15 @@
 			dev->data->port_id);
 		goto error;
 	}
+	rte_rwlock_read_lock(&priv->flows_rwlock);
 	ret = mlx5_flow_start(dev, &priv->flows);
 	if (ret) {
 		DRV_LOG(DEBUG, "port %u failed to set flows",
 			dev->data->port_id);
+		rte_rwlock_read_unlock(&priv->flows_rwlock);
 		goto error;
 	}
+	rte_rwlock_read_unlock(&priv->flows_rwlock);
 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
 	mlx5_dev_interrupt_handler_install(dev);
@@ -202,7 +205,9 @@
 	ret = rte_errno; /* Save rte_errno before cleanup. */
 	/* Rollback. */
 	dev->data->dev_started = 0;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_stop(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	mlx5_traffic_disable(dev);
 	mlx5_txq_stop(dev);
 	mlx5_rxq_stop(dev);
@@ -230,7 +235,9 @@
 	rte_wmb();
 	usleep(1000 * priv->rxqs_n);
 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_stop(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	mlx5_traffic_disable(dev);
 	mlx5_rx_intr_vec_disable(dev);
 	mlx5_dev_interrupt_handler_uninstall(dev);
@@ -364,7 +371,9 @@
 	return 0;
 error:
 	ret = rte_errno; /* Save rte_errno before cleanup. */
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->ctrl_flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	rte_errno = ret; /* Restore rte_errno. */
 	return -rte_errno;
 }
@@ -380,8 +389,9 @@
 mlx5_traffic_disable(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->ctrl_flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 }
 

-----Original Message-----
From: wangyunjian 
Sent: February 22, 2019 15:34
To: dev@dpdk.org; shahafs@mellanox.com; yskoh@mellanox.com
Cc: xudingke <xudingke@huawei.com>; Zhaohui (zhaohui, Polestar) <zhaohui8@huawei.com>
Subject: [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event



* Re: [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event
  2019-03-06  3:05 [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event Zhaohui (zhaohui, Polestar)
@ 2019-03-06  6:02 ` Shahaf Shuler
  2019-03-06  9:42   ` [dpdk-dev] Re: " Zhaohui (zhaohui, Polestar)
  0 siblings, 1 reply; 6+ messages in thread
From: Shahaf Shuler @ 2019-03-06  6:02 UTC (permalink / raw)
  To: Zhaohui (zhaohui, Polestar), dev, Yongseok Koh
  Cc: xudingke, chenchanghu, wangyunjian

Hi Yunjian,

Wednesday, March 6, 2019 5:06 AM, Zhaohui (zhaohui, Polestar):
> Subject: [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event
> 
> Hi,
>     I think the flow list may be accessed at the same time by two different
> threads, which may cause errors. Does it need a lock to protect the flow list?

In general it is the application's responsibility to synchronize control operations between different threads. The PMD does not support multi-threaded control whatsoever.

In this specific case, when working on top of a bond, the bond reacts to the LSC event while, I assume, your application is inserting/deleting rules using the rte_flow APIs.

The right fix would be for the bond PMD to take a mutex before calling rte_eth_dev_default_mac_addr_set.
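
For illustration, a minimal sketch of that direction (the lock and the helper name here are hypothetical, not the actual bonding code):

#include <pthread.h>
#include <rte_ethdev.h>

/* Hypothetical lock serializing the bond PMD's LSC-driven control path
 * against other control-plane calls on the slave port. In a real fix it
 * would live in struct bond_dev_private rather than being global. */
static pthread_mutex_t bond_ctrl_lock = PTHREAD_MUTEX_INITIALIZER;

/* Sketch: the LSC callback path would go through a helper like this
 * instead of calling rte_eth_dev_default_mac_addr_set() directly. */
static int
bond_set_slave_mac(uint16_t slave_port_id, struct ether_addr *mac)
{
	int ret;

	pthread_mutex_lock(&bond_ctrl_lock);
	ret = rte_eth_dev_default_mac_addr_set(slave_port_id, mac);
	pthread_mutex_unlock(&bond_ctrl_lock);
	return ret;
}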

> [original backtrace and patch snipped]



* [dpdk-dev] Re: Segfault when eal thread executing mlx5 nic's lsc event
  2019-03-06  6:02 ` Shahaf Shuler
@ 2019-03-06  9:42   ` Zhaohui (zhaohui, Polestar)
  2019-03-06 18:59     ` Chas Williams
  0 siblings, 1 reply; 6+ messages in thread
From: Zhaohui (zhaohui, Polestar) @ 2019-03-06  9:42 UTC (permalink / raw)
  To: Shahaf Shuler, dev, Yongseok Koh, Chas Williams, Declan Doherty
  Cc: xudingke, chenchanghu, wangyunjian

Hi Williams,
	We have some questions about "Segfault when eal thread executing mlx5 nic's lsc event" and need your help. What should we do to solve this problem (core dump)? (Looking forward to your reply.)
								Thanks,
								zhaohui

-----Original Message-----
From: Shahaf Shuler [mailto:shahafs@mellanox.com] 
Sent: March 6, 2019 14:03
To: Zhaohui (zhaohui, Polestar) <zhaohui8@huawei.com>; dev@dpdk.org; Yongseok Koh <yskoh@mellanox.com>
Cc: xudingke <xudingke@huawei.com>; chenchanghu <chenchanghu@huawei.com>; wangyunjian <wangyunjian@huawei.com>
Subject: RE: [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event

[quoted reply snipped; identical to Shahaf's message above]



* Re: [dpdk-dev] Re: Segfault when eal thread executing mlx5 nic's lsc event
  2019-03-06  9:42   ` [dpdk-dev] Re: " Zhaohui (zhaohui, Polestar)
@ 2019-03-06 18:59     ` Chas Williams
  0 siblings, 0 replies; 6+ messages in thread
From: Chas Williams @ 2019-03-06 18:59 UTC (permalink / raw)
  To: Zhaohui (zhaohui, Polestar),
	Shahaf Shuler, dev, Yongseok Koh, Chas Williams, Declan Doherty
  Cc: xudingke, chenchanghu, wangyunjian



On 3/6/19 4:42 AM, Zhaohui (zhaohui, Polestar) wrote:
> Hi Williams,
> 	We have some questions about "Segfault when eal thread executing mlx5 nic's lsc event" and need your help. What should we do to solve this problem (core dump)? (Looking forward to your reply.)

The link state change monitor does run on the EAL interrupt thread.
So something is going to need to coordinate updates to the mlx5 list
structures, since there are two threads that can access them.

You could also stop the bonding device. That should stop the link
state change monitoring thread from attempting to make any updates.
But that's likely to impact performance when adding flows.
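
A rough sketch of that second option from the application side (the helper and port ids are placeholders, and the performance caveat above applies):

#include <rte_errno.h>
#include <rte_ethdev.h>
#include <rte_flow.h>

/* Quiesce the bond port so its LSC handling cannot touch the slave's
 * flow list, apply the rte_flow update on the slave, then restart. */
static int
flow_create_quiesced(uint16_t bond_port_id, uint16_t slave_port_id,
		     const struct rte_flow_attr *attr,
		     const struct rte_flow_item items[],
		     const struct rte_flow_action actions[])
{
	struct rte_flow_error err;
	struct rte_flow *flow;

	rte_eth_dev_stop(bond_port_id);	/* stops LSC-driven updates */
	flow = rte_flow_create(slave_port_id, attr, items, actions, &err);
	if (rte_eth_dev_start(bond_port_id) != 0)
		return -rte_errno;
	return flow == NULL ? -rte_errno : 0;
}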

> [remainder of quoted message snipped]


* [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event
@ 2020-10-29 16:42 Weifeng LI
  0 siblings, 0 replies; 6+ messages in thread
From: Weifeng LI @ 2020-10-29 16:42 UTC (permalink / raw)
  To: dev, shahafs, matan, viacheslavo, chas3
  Cc: liweifeng2, 863348577, Weifeng Li, zhaohui8

Hi,

     I am using the DPDK bond PMD on top of mlx5. There is a
segmentation fault in the process of starting the bond port: the EAL
interrupt thread is processing an LSC interrupt while slave_configure
is executing rte_eth_dev_rss_reta_update, which also uses the mlx5
flow list.

     I've also found another discussion about this issue:
https://mails.dpdk.org/archives/dev/2019-March/125929.html

     Does it need a lock to protect the mlx5 flow list?


int
slave_configure(struct rte_eth_dev *bonded_eth_dev,
         struct rte_eth_dev *slave_eth_dev)
{

     ...

     /* Start device */
     errval = rte_eth_dev_start(slave_eth_dev->data->port_id);
     if (errval != 0) {
         RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)",
                 slave_eth_dev->data->port_id, errval);
         return -1;
     }

     /* If RSS is enabled for bonding, synchronize RETA */
     if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
         int i;
         struct bond_dev_private *internals;

         internals = bonded_eth_dev->data->dev_private;

         for (i = 0; i < internals->slave_count; i++) {
             if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) {
                 errval = rte_eth_dev_rss_reta_update(
                         slave_eth_dev->data->port_id,
                         &internals->reta_conf[0],
                         internals->slaves[i].reta_size);
                 if (errval != 0) {
                     RTE_BOND_LOG(WARNING,
                              "rte_eth_dev_rss_reta_update on slave port %d fails (err %d)."
                              " RSS Configuration for bonding may be inconsistent.",
                              slave_eth_dev->data->port_id, errval);
                 }
                 break;
             }
         }
     }
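
For comparison, a sketch of purely application-side serialization (placeholder names; note it cannot cover the EAL interrupt thread, which is exactly the race described here):

#include <rte_flow.h>
#include <rte_rwlock.h>

/* Application-level lock: all application threads funnel their rte_flow
 * control calls through it. The EAL interrupt thread does not take it,
 * so this alone does not fix the crash above. */
static rte_rwlock_t app_flow_lock = RTE_RWLOCK_INITIALIZER;

static struct rte_flow *
app_flow_create(uint16_t port_id, const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error)
{
	struct rte_flow *flow;

	rte_rwlock_write_lock(&app_flow_lock);
	flow = rte_flow_create(port_id, attr, items, actions, error);
	rte_rwlock_write_unlock(&app_flow_lock);
	return flow;
}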




* [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event
@ 2019-02-22  7:33 wangyunjian
  0 siblings, 0 replies; 6+ messages in thread
From: wangyunjian @ 2019-02-22  7:33 UTC (permalink / raw)
  To: dev, shahafs, yskoh; +Cc: xudingke, Zhaohui (zhaohui, Polestar)

I think the flow list may be accessed at the same time by two
different threads, which may cause errors. Does it need a lock
to protect the flow list?

Thanks
          Yunjian

(gdb) bt
[backtrace snipped; identical to the one quoted at the top of this thread]

