* [dpdk-dev] [PATCH v3 1/5] eal: use sizeof to avoid a double use of a define
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 0/5] " Olivier Matz
@ 2018-04-24 14:46 ` Olivier Matz
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 2/5] eal: new function to create control threads Olivier Matz
` (5 subsequent siblings)
6 siblings, 0 replies; 41+ messages in thread
From: Olivier Matz @ 2018-04-24 14:46 UTC (permalink / raw)
To: dev; +Cc: Anatoly Burakov
Only a cosmetic change: the *_LEN defines are already used
when defining the buffer. Using sizeof() ensures that the length
stays consistent, even if the definition is modified.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
examples/tep_termination/main.c | 2 +-
examples/vhost/main.c | 2 +-
lib/librte_eal/bsdapp/eal/eal.c | 4 ++--
lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
lib/librte_eal/common/eal_common_proc.c | 4 ++--
lib/librte_eal/linuxapp/eal/eal.c | 4 ++--
lib/librte_eal/linuxapp/eal/eal_interrupts.c | 2 +-
lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
lib/librte_eal/linuxapp/eal/eal_timer.c | 2 +-
lib/librte_vhost/socket.c | 4 ++--
10 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
index 8543a9803..52b95025c 100644
--- a/examples/tep_termination/main.c
+++ b/examples/tep_termination/main.c
@@ -1208,7 +1208,7 @@ main(int argc, char *argv[])
ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
if (ret != 0)
rte_exit(EXIT_FAILURE, "Cannot create print-stats thread\n");
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
+ snprintf(thread_name, sizeof(thread_name), "print-stats");
ret = rte_thread_setname(tid, thread_name);
if (ret != 0)
RTE_LOG(DEBUG, VHOST_CONFIG, "Cannot set print-stats name\n");
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 84e0d6366..e33a0f3c9 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1499,7 +1499,7 @@ main(int argc, char *argv[])
"Cannot create print-stats thread\n");
/* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
+ snprintf(thread_name, sizeof(thread_name), "print-stats");
ret = rte_thread_setname(tid, thread_name);
if (ret != 0)
RTE_LOG(DEBUG, VHOST_CONFIG,
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index d315cdef8..10d8dc03f 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -657,7 +657,7 @@ rte_eal_init(int argc, char **argv)
eal_thread_init_master(rte_config.master_lcore);
- ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
+ ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n",
rte_config.master_lcore, thread_id, cpuset,
@@ -683,7 +683,7 @@ rte_eal_init(int argc, char **argv)
rte_panic("Cannot create thread\n");
/* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+ snprintf(thread_name, sizeof(thread_name),
"lcore-slave-%d", i);
rte_thread_setname(lcore_config[i].thread_id, thread_name);
}
diff --git a/lib/librte_eal/bsdapp/eal/eal_thread.c b/lib/librte_eal/bsdapp/eal/eal_thread.c
index d602daf81..309b58726 100644
--- a/lib/librte_eal/bsdapp/eal/eal_thread.c
+++ b/lib/librte_eal/bsdapp/eal/eal_thread.c
@@ -119,7 +119,7 @@ eal_thread_loop(__attribute__((unused)) void *arg)
if (eal_thread_set_affinity() < 0)
rte_panic("cannot set affinity\n");
- ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
+ ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%p;cpuset=[%s%s])\n",
lcore_id, thread_id, cpuset, ret == 0 ? "" : "...");
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 9136fb0a3..c450c848f 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -682,11 +682,11 @@ rte_mp_channel_init(void)
}
/* try best to set thread name */
- strlcpy(thread_name, "rte_mp_handle", RTE_MAX_THREAD_NAME_LEN);
+ strlcpy(thread_name, "rte_mp_handle", sizeof(thread_name));
rte_thread_setname(mp_handle_tid, thread_name);
/* try best to set thread name */
- strlcpy(thread_name, "rte_mp_async_handle", RTE_MAX_THREAD_NAME_LEN);
+ strlcpy(thread_name, "rte_mp_async_handle", sizeof(thread_name));
rte_thread_setname(async_reply_handle_tid, thread_name);
/* unlock the directory */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 5b23bf0e0..200e879d2 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -895,7 +895,7 @@ rte_eal_init(int argc, char **argv)
eal_thread_init_master(rte_config.master_lcore);
- ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
+ ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
rte_config.master_lcore, (int)thread_id, cpuset,
@@ -926,7 +926,7 @@ rte_eal_init(int argc, char **argv)
rte_panic("Cannot create thread\n");
/* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+ snprintf(thread_name, sizeof(thread_name),
"lcore-slave-%d", i);
ret = rte_thread_setname(lcore_config[i].thread_id,
thread_name);
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 58e93281a..b5bcfc59a 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -877,7 +877,7 @@ rte_eal_intr_init(void)
"Failed to create thread for interrupt handling\n");
} else {
/* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+ snprintf(thread_name, sizeof(thread_name),
"eal-intr-thread");
ret_1 = rte_thread_setname(intr_thread, thread_name);
if (ret_1 != 0)
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index 08e150b77..f652ff988 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -119,7 +119,7 @@ eal_thread_loop(__attribute__((unused)) void *arg)
if (eal_thread_set_affinity() < 0)
rte_panic("cannot set affinity\n");
- ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
+ ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "...");
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
index 161322f23..098e22a65 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -189,7 +189,7 @@ rte_eal_hpet_init(int make_default)
/*
* Set thread_name for aid in debugging.
*/
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "hpet-msb-inc");
+ snprintf(thread_name, sizeof(thread_name), "hpet-msb-inc");
ret = rte_thread_setname(msb_inc_thread_id, thread_name);
if (ret != 0)
RTE_LOG(DEBUG, EAL,
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 636fc25c6..6eec427e4 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -485,7 +485,7 @@ vhost_user_reconnect_init(void)
"failed to destroy reconnect mutex");
}
} else {
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+ snprintf(thread_name, sizeof(thread_name),
"vhost-reconn");
if (rte_thread_setname(reconn_tid, thread_name)) {
@@ -1029,7 +1029,7 @@ rte_vhost_driver_start(const char *path)
fdset_pipe_uninit(&vhost_user.fdset);
return -1;
} else {
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+ snprintf(thread_name, sizeof(thread_name),
"vhost-events");
if (rte_thread_setname(fdset_tid, thread_name)) {
--
2.11.0
^ permalink raw reply [flat|nested] 41+ messages in thread
* [dpdk-dev] [PATCH v3 2/5] eal: new function to create control threads
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 0/5] " Olivier Matz
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 1/5] eal: use sizeof to avoid a double use of a define Olivier Matz
@ 2018-04-24 14:46 ` Olivier Matz
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread Olivier Matz
` (4 subsequent siblings)
6 siblings, 0 replies; 41+ messages in thread
From: Olivier Matz @ 2018-04-24 14:46 UTC (permalink / raw)
To: dev; +Cc: Anatoly Burakov
Many parts of dpdk use their own management threads. Introduce a new
wrapper for thread creation that will be extended in next commits to set
the name and affinity.
To be consistent with other DPDK APIs, the return value is negative in
case of error, which was not the case for pthread_create().
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
drivers/net/kni/Makefile | 1 +
drivers/net/kni/rte_eth_kni.c | 2 +-
lib/librte_eal/common/eal_common_proc.c | 4 ++--
lib/librte_eal/common/eal_common_thread.c | 8 ++++++++
lib/librte_eal/common/include/rte_lcore.h | 21 +++++++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_interrupts.c | 6 +++---
lib/librte_eal/linuxapp/eal/eal_timer.c | 2 +-
lib/librte_eal/rte_eal_version.map | 1 +
lib/librte_vhost/socket.c | 6 +++---
9 files changed, 41 insertions(+), 10 deletions(-)
diff --git a/drivers/net/kni/Makefile b/drivers/net/kni/Makefile
index 01eaef056..562e8d2da 100644
--- a/drivers/net/kni/Makefile
+++ b/drivers/net/kni/Makefile
@@ -10,6 +10,7 @@ LIB = librte_pmd_kni.a
CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
LDLIBS += -lpthread
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_kni
diff --git a/drivers/net/kni/rte_eth_kni.c b/drivers/net/kni/rte_eth_kni.c
index 08fc6a35e..fdd592554 100644
--- a/drivers/net/kni/rte_eth_kni.c
+++ b/drivers/net/kni/rte_eth_kni.c
@@ -149,7 +149,7 @@ eth_kni_dev_start(struct rte_eth_dev *dev)
}
if (internals->no_request_thread == 0) {
- ret = pthread_create(&internals->thread, NULL,
+ ret = rte_ctrl_thread_create(&internals->thread, NULL,
kni_handle_request, internals);
if (ret) {
RTE_LOG(ERR, PMD,
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index c450c848f..abaa085c4 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -662,7 +662,7 @@ rte_mp_channel_init(void)
return -1;
}
- if (pthread_create(&mp_handle_tid, NULL, mp_handle, NULL) < 0) {
+ if (rte_ctrl_thread_create(&mp_handle_tid, NULL, mp_handle, NULL) < 0) {
RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
strerror(errno));
close(mp_fd);
@@ -671,7 +671,7 @@ rte_mp_channel_init(void)
return -1;
}
- if (pthread_create(&async_reply_handle_tid, NULL,
+ if (rte_ctrl_thread_create(&async_reply_handle_tid, NULL,
async_reply_handle, NULL) < 0) {
RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
strerror(errno));
diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c
index 40902e49b..efbccddbc 100644
--- a/lib/librte_eal/common/eal_common_thread.c
+++ b/lib/librte_eal/common/eal_common_thread.c
@@ -140,3 +140,11 @@ eal_thread_dump_affinity(char *str, unsigned size)
return ret;
}
+
+__rte_experimental int
+rte_ctrl_thread_create(pthread_t *thread,
+ const pthread_attr_t *attr,
+ void *(*start_routine)(void *), void *arg)
+{
+ return pthread_create(thread, attr, start_routine, arg);
+}
diff --git a/lib/librte_eal/common/include/rte_lcore.h b/lib/librte_eal/common/include/rte_lcore.h
index 731297574..30749d0ab 100644
--- a/lib/librte_eal/common/include/rte_lcore.h
+++ b/lib/librte_eal/common/include/rte_lcore.h
@@ -277,6 +277,27 @@ void rte_thread_get_affinity(rte_cpuset_t *cpusetp);
int rte_thread_setname(pthread_t id, const char *name);
/**
+ * Create a control thread.
+ *
+ * Wrapper to pthread_create().
+ *
+ * @param thread
+ * Filled with the thread id of the new created thread.
+ * @param attr
+ * Attributes for the new thread.
+ * @param start_routine
+ * Function to be executed by the new thread.
+ * @param arg
+ * Argument passed to start_routine.
+ * @return
+ * On success, returns 0; on error, it returns a negative value
+ * corresponding to the error number.
+ */
+__rte_experimental int
+rte_ctrl_thread_create(pthread_t *thread, const pthread_attr_t *attr,
+ void *(*start_routine)(void *), void *arg);
+
+/**
* Test if the core supplied has a specific role
*
* @param lcore_id
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index b5bcfc59a..9a49ae6a5 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -869,10 +869,10 @@ rte_eal_intr_init(void)
}
/* create the host thread to wait/handle the interrupt */
- ret = pthread_create(&intr_thread, NULL,
+ ret = rte_ctrl_thread_create(&intr_thread, NULL,
eal_intr_thread_main, NULL);
if (ret != 0) {
- rte_errno = ret;
+ rte_errno = -ret;
RTE_LOG(ERR, EAL,
"Failed to create thread for interrupt handling\n");
} else {
@@ -885,7 +885,7 @@ rte_eal_intr_init(void)
"Failed to set thread name for interrupt handling\n");
}
- return -ret;
+ return ret;
}
static void
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
index 098e22a65..6f5d9fee6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -178,7 +178,7 @@ rte_eal_hpet_init(int make_default)
/* create a thread that will increment a global variable for
* msb (hpet is 32 bits by default under linux) */
- ret = pthread_create(&msb_inc_thread_id, NULL,
+ ret = rte_ctrl_thread_create(&msb_inc_thread_id, NULL,
(void *(*)(void *))hpet_msb_inc, NULL);
if (ret != 0) {
RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index d02d80b8a..bc3c02dcc 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -213,6 +213,7 @@ DPDK_18.02 {
EXPERIMENTAL {
global:
+ rte_ctrl_thread_create;
rte_dev_event_callback_register;
rte_dev_event_callback_unregister;
rte_dev_event_monitor_start;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 6eec427e4..db9be6a99 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -476,7 +476,7 @@ vhost_user_reconnect_init(void)
}
TAILQ_INIT(&reconn_list.head);
- ret = pthread_create(&reconn_tid, NULL,
+ ret = rte_ctrl_thread_create(&reconn_tid, NULL,
vhost_user_client_reconnect, NULL);
if (ret != 0) {
RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
@@ -1020,8 +1020,8 @@ rte_vhost_driver_start(const char *path)
return -1;
}
- int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch,
- &vhost_user.fdset);
+ int ret = rte_ctrl_thread_create(&fdset_tid, NULL,
+ fdset_event_dispatch, &vhost_user.fdset);
if (ret != 0) {
RTE_LOG(ERR, VHOST_CONFIG,
"failed to create fdset handling thread");
--
2.11.0
^ permalink raw reply [flat|nested] 41+ messages in thread
* [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 0/5] " Olivier Matz
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 1/5] eal: use sizeof to avoid a double use of a define Olivier Matz
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 2/5] eal: new function to create control threads Olivier Matz
@ 2018-04-24 14:46 ` Olivier Matz
2018-04-24 16:08 ` Burakov, Anatoly
2018-04-27 15:46 ` Tan, Jianfeng
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 4/5] eal: set affinity for control threads Olivier Matz
` (3 subsequent siblings)
6 siblings, 2 replies; 41+ messages in thread
From: Olivier Matz @ 2018-04-24 14:46 UTC (permalink / raw)
To: dev; +Cc: Anatoly Burakov
To avoid code duplication, add a parameter to rte_ctrl_thread_create()
to specify the name of the thread.
This requires to add a wrapper for the thread start routine in
rte_thread_init(), which will first wait that the thread is configured.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
drivers/net/kni/rte_eth_kni.c | 3 +-
lib/librte_eal/common/eal_common_proc.c | 15 +++-----
lib/librte_eal/common/eal_common_thread.c | 52 +++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_lcore.h | 7 ++--
lib/librte_eal/linuxapp/eal/eal_interrupts.c | 13 ++-----
lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +------
lib/librte_vhost/socket.c | 25 +++----------
7 files changed, 66 insertions(+), 61 deletions(-)
diff --git a/drivers/net/kni/rte_eth_kni.c b/drivers/net/kni/rte_eth_kni.c
index fdd592554..35a6d3ef7 100644
--- a/drivers/net/kni/rte_eth_kni.c
+++ b/drivers/net/kni/rte_eth_kni.c
@@ -149,7 +149,8 @@ eth_kni_dev_start(struct rte_eth_dev *dev)
}
if (internals->no_request_thread == 0) {
- ret = rte_ctrl_thread_create(&internals->thread, NULL,
+ ret = rte_ctrl_thread_create(&internals->thread,
+ "kni_handle_req", NULL,
kni_handle_request, internals);
if (ret) {
RTE_LOG(ERR, PMD,
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index abaa085c4..611aba299 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -622,7 +622,6 @@ unlink_sockets(const char *filter)
int
rte_mp_channel_init(void)
{
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
char path[PATH_MAX];
int dir_fd;
pthread_t mp_handle_tid, async_reply_handle_tid;
@@ -662,7 +661,8 @@ rte_mp_channel_init(void)
return -1;
}
- if (rte_ctrl_thread_create(&mp_handle_tid, NULL, mp_handle, NULL) < 0) {
+ if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle",
+ NULL, mp_handle, NULL) < 0) {
RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
strerror(errno));
close(mp_fd);
@@ -671,7 +671,8 @@ rte_mp_channel_init(void)
return -1;
}
- if (rte_ctrl_thread_create(&async_reply_handle_tid, NULL,
+ if (rte_ctrl_thread_create(&async_reply_handle_tid,
+ "rte_mp_async", NULL,
async_reply_handle, NULL) < 0) {
RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
strerror(errno));
@@ -681,14 +682,6 @@ rte_mp_channel_init(void)
return -1;
}
- /* try best to set thread name */
- strlcpy(thread_name, "rte_mp_handle", sizeof(thread_name));
- rte_thread_setname(mp_handle_tid, thread_name);
-
- /* try best to set thread name */
- strlcpy(thread_name, "rte_mp_async_handle", sizeof(thread_name));
- rte_thread_setname(async_reply_handle_tid, thread_name);
-
/* unlock the directory */
flock(dir_fd, LOCK_UN);
close(dir_fd);
diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c
index efbccddbc..94d2a6e42 100644
--- a/lib/librte_eal/common/eal_common_thread.c
+++ b/lib/librte_eal/common/eal_common_thread.c
@@ -7,6 +7,7 @@
#include <stdint.h>
#include <unistd.h>
#include <pthread.h>
+#include <signal.h>
#include <sched.h>
#include <assert.h>
#include <string.h>
@@ -141,10 +142,53 @@ eal_thread_dump_affinity(char *str, unsigned size)
return ret;
}
+
+struct rte_thread_ctrl_params {
+ void *(*start_routine)(void *);
+ void *arg;
+ pthread_barrier_t configured;
+};
+
+static void *rte_thread_init(void *arg)
+{
+ struct rte_thread_ctrl_params *params = arg;
+ void *(*start_routine)(void *) = params->start_routine;
+ void *routine_arg = params->arg;
+
+ pthread_barrier_wait(¶ms->configured);
+
+ return start_routine(routine_arg);
+}
+
__rte_experimental int
-rte_ctrl_thread_create(pthread_t *thread,
- const pthread_attr_t *attr,
- void *(*start_routine)(void *), void *arg)
+rte_ctrl_thread_create(pthread_t *thread, const char *name,
+ const pthread_attr_t *attr,
+ void *(*start_routine)(void *), void *arg)
{
- return pthread_create(thread, attr, start_routine, arg);
+ struct rte_thread_ctrl_params params = {
+ .start_routine = start_routine,
+ .arg = arg,
+ };
+ int ret;
+
+ pthread_barrier_init(¶ms.configured, NULL, 2);
+
+ ret = pthread_create(thread, attr, rte_thread_init, (void *)¶ms);
+ if (ret != 0)
+ return ret;
+
+ if (name != NULL) {
+ ret = rte_thread_setname(*thread, name);
+ if (ret < 0)
+ goto fail;
+ }
+
+ pthread_barrier_wait(¶ms.configured);
+
+ return 0;
+
+fail:
+ pthread_cancel(*thread);
+ pthread_join(*thread, NULL);
+ return ret;
}
diff --git a/lib/librte_eal/common/include/rte_lcore.h b/lib/librte_eal/common/include/rte_lcore.h
index 30749d0ab..eb39c2a8b 100644
--- a/lib/librte_eal/common/include/rte_lcore.h
+++ b/lib/librte_eal/common/include/rte_lcore.h
@@ -279,10 +279,12 @@ int rte_thread_setname(pthread_t id, const char *name);
/**
* Create a control thread.
*
- * Wrapper to pthread_create().
+ * Wrapper to pthread_create() and pthread_setname_np().
*
* @param thread
* Filled with the thread id of the new created thread.
+ * @param name
+ * The name of the control thread (max 16 characters including '\0').
* @param attr
* Attributes for the new thread.
* @param start_routine
@@ -294,7 +296,8 @@ int rte_thread_setname(pthread_t id, const char *name);
* corresponding to the error number.
*/
__rte_experimental int
-rte_ctrl_thread_create(pthread_t *thread, const pthread_attr_t *attr,
+rte_ctrl_thread_create(pthread_t *thread, const char *name,
+ const pthread_attr_t *attr,
void *(*start_routine)(void *), void *arg);
/**
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 9a49ae6a5..056d41c12 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -853,8 +853,7 @@ eal_intr_thread_main(__rte_unused void *arg)
int
rte_eal_intr_init(void)
{
- int ret = 0, ret_1 = 0;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ int ret = 0;
/* init the global interrupt source head */
TAILQ_INIT(&intr_sources);
@@ -869,20 +868,12 @@ rte_eal_intr_init(void)
}
/* create the host thread to wait/handle the interrupt */
- ret = rte_ctrl_thread_create(&intr_thread, NULL,
+ ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
eal_intr_thread_main, NULL);
if (ret != 0) {
rte_errno = -ret;
RTE_LOG(ERR, EAL,
"Failed to create thread for interrupt handling\n");
- } else {
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, sizeof(thread_name),
- "eal-intr-thread");
- ret_1 = rte_thread_setname(intr_thread, thread_name);
- if (ret_1 != 0)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for interrupt handling\n");
}
return ret;
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
index 6f5d9fee6..2766bd789 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -137,7 +137,6 @@ int
rte_eal_hpet_init(int make_default)
{
int fd, ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
if (internal_config.no_hpet) {
RTE_LOG(NOTICE, EAL, "HPET is disabled\n");
@@ -178,7 +177,7 @@ rte_eal_hpet_init(int make_default)
/* create a thread that will increment a global variable for
* msb (hpet is 32 bits by default under linux) */
- ret = rte_ctrl_thread_create(&msb_inc_thread_id, NULL,
+ ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
(void *(*)(void *))hpet_msb_inc, NULL);
if (ret != 0) {
RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
@@ -186,15 +185,6 @@ rte_eal_hpet_init(int make_default)
return -1;
}
- /*
- * Set thread_name for aid in debugging.
- */
- snprintf(thread_name, sizeof(thread_name), "hpet-msb-inc");
- ret = rte_thread_setname(msb_inc_thread_id, thread_name);
- if (ret != 0)
- RTE_LOG(DEBUG, EAL,
- "Cannot set HPET timer thread name!\n");
-
if (make_default)
eal_timer_source = EAL_TIMER_HPET;
return 0;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index db9be6a99..4a561add9 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -467,7 +467,6 @@ static int
vhost_user_reconnect_init(void)
{
int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
ret = pthread_mutex_init(&reconn_list.mutex, NULL);
if (ret < 0) {
@@ -476,7 +475,7 @@ vhost_user_reconnect_init(void)
}
TAILQ_INIT(&reconn_list.head);
- ret = rte_ctrl_thread_create(&reconn_tid, NULL,
+ ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
vhost_user_client_reconnect, NULL);
if (ret != 0) {
RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
@@ -484,14 +483,6 @@ vhost_user_reconnect_init(void)
RTE_LOG(ERR, VHOST_CONFIG,
"failed to destroy reconnect mutex");
}
- } else {
- snprintf(thread_name, sizeof(thread_name),
- "vhost-reconn");
-
- if (rte_thread_setname(reconn_tid, thread_name)) {
- RTE_LOG(DEBUG, VHOST_CONFIG,
- "failed to set reconnect thread name");
- }
}
return ret;
@@ -1000,7 +991,6 @@ rte_vhost_driver_start(const char *path)
{
struct vhost_user_socket *vsocket;
static pthread_t fdset_tid;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
pthread_mutex_lock(&vhost_user.mutex);
vsocket = find_vhost_user_socket(path);
@@ -1020,22 +1010,15 @@ rte_vhost_driver_start(const char *path)
return -1;
}
- int ret = rte_ctrl_thread_create(&fdset_tid, NULL,
- fdset_event_dispatch, &vhost_user.fdset);
+ int ret = rte_ctrl_thread_create(&fdset_tid,
+ "vhost-events", NULL, fdset_event_dispatch,
+ &vhost_user.fdset);
if (ret != 0) {
RTE_LOG(ERR, VHOST_CONFIG,
"failed to create fdset handling thread");
fdset_pipe_uninit(&vhost_user.fdset);
return -1;
- } else {
- snprintf(thread_name, sizeof(thread_name),
- "vhost-events");
-
- if (rte_thread_setname(fdset_tid, thread_name)) {
- RTE_LOG(DEBUG, VHOST_CONFIG,
- "failed to set vhost-event thread name");
- }
}
}
--
2.11.0
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread Olivier Matz
@ 2018-04-24 16:08 ` Burakov, Anatoly
2018-04-27 15:46 ` Tan, Jianfeng
1 sibling, 0 replies; 41+ messages in thread
From: Burakov, Anatoly @ 2018-04-24 16:08 UTC (permalink / raw)
To: Olivier Matz, dev
On 24-Apr-18 3:46 PM, Olivier Matz wrote:
> To avoid code duplication, add a parameter to rte_ctrl_thread_create()
> to specify the name of the thread.
>
> This requires to add a wrapper for the thread start routine in
> rte_thread_init(), which will first wait that the thread is configured.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
For IPC/thread parts,
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread Olivier Matz
2018-04-24 16:08 ` Burakov, Anatoly
@ 2018-04-27 15:46 ` Tan, Jianfeng
2018-04-27 16:17 ` Tan, Jianfeng
1 sibling, 1 reply; 41+ messages in thread
From: Tan, Jianfeng @ 2018-04-27 15:46 UTC (permalink / raw)
To: Olivier Matz, dev; +Cc: Anatoly Burakov
Hi Olivier,
After this patch, I find the two IPC threads block at
pthread_barrier_wait(), and never wake up. Please refer below for more
information. The system is Ubuntu 16.04.
On 4/24/2018 10:46 PM, Olivier Matz wrote:
> To avoid code duplication, add a parameter to rte_ctrl_thread_create()
> to specify the name of the thread.
>
> This requires to add a wrapper for the thread start routine in
> rte_thread_init(), which will first wait that the thread is configured.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
> drivers/net/kni/rte_eth_kni.c | 3 +-
> lib/librte_eal/common/eal_common_proc.c | 15 +++-----
> lib/librte_eal/common/eal_common_thread.c | 52 +++++++++++++++++++++++++---
> lib/librte_eal/common/include/rte_lcore.h | 7 ++--
> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 13 ++-----
> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +------
> lib/librte_vhost/socket.c | 25 +++----------
> 7 files changed, 66 insertions(+), 61 deletions(-)
[...]
> diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c
> index efbccddbc..94d2a6e42 100644
> --- a/lib/librte_eal/common/eal_common_thread.c
> +++ b/lib/librte_eal/common/eal_common_thread.c
> @@ -7,6 +7,7 @@
> #include <stdint.h>
> #include <unistd.h>
> #include <pthread.h>
> +#include <signal.h>
> #include <sched.h>
> #include <assert.h>
> #include <string.h>
> @@ -141,10 +142,53 @@ eal_thread_dump_affinity(char *str, unsigned size)
> return ret;
> }
>
> +
> +struct rte_thread_ctrl_params {
> + void *(*start_routine)(void *);
> + void *arg;
> + pthread_barrier_t configured;
> +};
> +
> +static void *rte_thread_init(void *arg)
> +{
> + struct rte_thread_ctrl_params *params = arg;
> + void *(*start_routine)(void *) = params->start_routine;
> + void *routine_arg = params->arg;
> +
> + pthread_barrier_wait(¶ms->configured);
This thread never wakes up. The call trace as below:
#0 0x00007ffff72a8154 in futex_wait (private=0, expected=0,
futex_word=0x7fffffffcff4)
at ../sysdeps/unix/sysv/linux/futex-internal.h:61
#1 futex_wait_simple (private=0, expected=0, futex_word=0x7fffffffcff4)
at ../sysdeps/nptl/futex-internal.h:135
#2 __pthread_barrier_wait (barrier=0x7fffffffcff0) at
pthread_barrier_wait.c:184
#3 0x000000000055216a in rte_thread_init (arg=0x7fffffffcfe0) at
/home/tan/git/dpdk/lib/librte_eal/common/eal_common_thread.c:160
#4 0x00007ffff72a16ba in start_thread (arg=0x7ffff6ecf700) at
pthread_create.c:333
#5 0x00007ffff6fd741d in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:109
> +
> + return start_routine(routine_arg);
> +}
> +
> __rte_experimental int
> -rte_ctrl_thread_create(pthread_t *thread,
> - const pthread_attr_t *attr,
> - void *(*start_routine)(void *), void *arg)
> +rte_ctrl_thread_create(pthread_t *thread, const char *name,
> + const pthread_attr_t *attr,
> + void *(*start_routine)(void *), void *arg)
> {
> - return pthread_create(thread, attr, start_routine, arg);
> + struct rte_thread_ctrl_params params = {
> + .start_routine = start_routine,
> + .arg = arg,
> + };
> + int ret;
> +
> + pthread_barrier_init(¶ms.configured, NULL, 2);
> +
> + ret = pthread_create(thread, attr, rte_thread_init, (void *)¶ms);
> + if (ret != 0)
> + return ret;
> +
> + if (name != NULL) {
> + ret = rte_thread_setname(*thread, name);
> + if (ret < 0)
> + goto fail;
> + }
> +
> + pthread_barrier_wait(¶ms.configured);
Here, the thread wakes up normally, and continues.
Any idea on what's going on?
Thanks,
Jianfeng
> +
> + return 0;
> +
> +fail:
> + pthread_cancel(*thread);
> + pthread_join(*thread, NULL);
> + return ret;
> }
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread
2018-04-27 15:46 ` Tan, Jianfeng
@ 2018-04-27 16:17 ` Tan, Jianfeng
2018-04-27 16:46 ` Burakov, Anatoly
0 siblings, 1 reply; 41+ messages in thread
From: Tan, Jianfeng @ 2018-04-27 16:17 UTC (permalink / raw)
To: Olivier Matz, dev; +Cc: Anatoly Burakov
On 4/27/2018 11:46 PM, Tan, Jianfeng wrote:
> Hi Olivier,
>
> After this patch, I find the two IPC threads block at
> pthread_barrier_wait(), and never wake up. Please refer below for more
> information. The system is Ubuntu 16.04.
>
> On 4/24/2018 10:46 PM, Olivier Matz wrote:
>> To avoid code duplication, add a parameter to rte_ctrl_thread_create()
>> to specify the name of the thread.
>>
>> This requires to add a wrapper for the thread start routine in
>> rte_thread_init(), which will first wait that the thread is configured.
>>
>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>> ---
>> drivers/net/kni/rte_eth_kni.c | 3 +-
>> lib/librte_eal/common/eal_common_proc.c | 15 +++-----
>> lib/librte_eal/common/eal_common_thread.c | 52
>> +++++++++++++++++++++++++---
>> lib/librte_eal/common/include/rte_lcore.h | 7 ++--
>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 13 ++-----
>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +------
>> lib/librte_vhost/socket.c | 25 +++----------
>> 7 files changed, 66 insertions(+), 61 deletions(-)
> [...]
>> diff --git a/lib/librte_eal/common/eal_common_thread.c
>> b/lib/librte_eal/common/eal_common_thread.c
>> index efbccddbc..94d2a6e42 100644
>> --- a/lib/librte_eal/common/eal_common_thread.c
>> +++ b/lib/librte_eal/common/eal_common_thread.c
>> @@ -7,6 +7,7 @@
>> #include <stdint.h>
>> #include <unistd.h>
>> #include <pthread.h>
>> +#include <signal.h>
>> #include <sched.h>
>> #include <assert.h>
>> #include <string.h>
>> @@ -141,10 +142,53 @@ eal_thread_dump_affinity(char *str, unsigned size)
>> return ret;
>> }
>> +
>> +struct rte_thread_ctrl_params {
>> + void *(*start_routine)(void *);
>> + void *arg;
>> + pthread_barrier_t configured;
>> +};
>> +
>> +static void *rte_thread_init(void *arg)
>> +{
>> + struct rte_thread_ctrl_params *params = arg;
>> + void *(*start_routine)(void *) = params->start_routine;
>> + void *routine_arg = params->arg;
>> +
>> + pthread_barrier_wait(¶ms->configured);
>
> This thread never wakes up. The call trace as below:
>
> #0 0x00007ffff72a8154 in futex_wait (private=0, expected=0,
> futex_word=0x7fffffffcff4)
> at ../sysdeps/unix/sysv/linux/futex-internal.h:61
> #1 futex_wait_simple (private=0, expected=0,
> futex_word=0x7fffffffcff4) at ../sysdeps/nptl/futex-internal.h:135
> #2 __pthread_barrier_wait (barrier=0x7fffffffcff0) at
> pthread_barrier_wait.c:184
> #3 0x000000000055216a in rte_thread_init (arg=0x7fffffffcfe0) at
> /home/tan/git/dpdk/lib/librte_eal/common/eal_common_thread.c:160
> #4 0x00007ffff72a16ba in start_thread (arg=0x7ffff6ecf700) at
> pthread_create.c:333
> #5 0x00007ffff6fd741d in clone () at
> ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
>
>> +
>> + return start_routine(routine_arg);
>> +}
>> +
>> __rte_experimental int
>> -rte_ctrl_thread_create(pthread_t *thread,
>> - const pthread_attr_t *attr,
>> - void *(*start_routine)(void *), void *arg)
>> +rte_ctrl_thread_create(pthread_t *thread, const char *name,
>> + const pthread_attr_t *attr,
>> + void *(*start_routine)(void *), void *arg)
>> {
>> - return pthread_create(thread, attr, start_routine, arg);
>> + struct rte_thread_ctrl_params params = {
>> + .start_routine = start_routine,
>> + .arg = arg,
>> + };
Update:
I doubt it's due to that we defined this variable, params, on the stack;
and the value seems be overwritten by following code. Will send a patch
to fix it.
Thanks,
Jianfeng
>> + int ret;
>> +
>> + pthread_barrier_init(¶ms.configured, NULL, 2);
>> +
>> + ret = pthread_create(thread, attr, rte_thread_init, (void
>> *)¶ms);
>> + if (ret != 0)
>> + return ret;
>> +
>> + if (name != NULL) {
>> + ret = rte_thread_setname(*thread, name);
>> + if (ret < 0)
>> + goto fail;
>> + }
>> +
>> + pthread_barrier_wait(¶ms.configured);
>
> Here, the thread wakes up normally, and continues.
>
> Any idea on what's going on?
>
> Thanks,
> Jianfeng
>
>> +
>> + return 0;
>> +
>> +fail:
>> + pthread_cancel(*thread);
>> + pthread_join(*thread, NULL);
>> + return ret;
>> }
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread
2018-04-27 16:17 ` Tan, Jianfeng
@ 2018-04-27 16:46 ` Burakov, Anatoly
0 siblings, 0 replies; 41+ messages in thread
From: Burakov, Anatoly @ 2018-04-27 16:46 UTC (permalink / raw)
To: Tan, Jianfeng, Olivier Matz, dev
On 27-Apr-18 5:17 PM, Tan, Jianfeng wrote:
>
>
> On 4/27/2018 11:46 PM, Tan, Jianfeng wrote:
>> Hi Olivier,
>>
>> After this patch, I find the two IPC threads block at
>> pthread_barrier_wait(), and never wake up. Please refer below for more
>> information. The system is Ubuntu 16.04.
>>
>> On 4/24/2018 10:46 PM, Olivier Matz wrote:
>>> To avoid code duplication, add a parameter to rte_ctrl_thread_create()
>>> to specify the name of the thread.
>>>
>>> This requires to add a wrapper for the thread start routine in
>>> rte_thread_init(), which will first wait that the thread is configured.
>>>
>>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>>> ---
>>> drivers/net/kni/rte_eth_kni.c | 3 +-
>>> lib/librte_eal/common/eal_common_proc.c | 15 +++-----
>>> lib/librte_eal/common/eal_common_thread.c | 52
>>> +++++++++++++++++++++++++---
>>> lib/librte_eal/common/include/rte_lcore.h | 7 ++--
>>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 13 ++-----
>>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +------
>>> lib/librte_vhost/socket.c | 25 +++----------
>>> 7 files changed, 66 insertions(+), 61 deletions(-)
>> [...]
>>> diff --git a/lib/librte_eal/common/eal_common_thread.c
>>> b/lib/librte_eal/common/eal_common_thread.c
>>> index efbccddbc..94d2a6e42 100644
>>> --- a/lib/librte_eal/common/eal_common_thread.c
>>> +++ b/lib/librte_eal/common/eal_common_thread.c
>>> @@ -7,6 +7,7 @@
>>> #include <stdint.h>
>>> #include <unistd.h>
>>> #include <pthread.h>
>>> +#include <signal.h>
>>> #include <sched.h>
>>> #include <assert.h>
>>> #include <string.h>
>>> @@ -141,10 +142,53 @@ eal_thread_dump_affinity(char *str, unsigned size)
>>> return ret;
>>> }
>>> +
>>> +struct rte_thread_ctrl_params {
>>> + void *(*start_routine)(void *);
>>> + void *arg;
>>> + pthread_barrier_t configured;
>>> +};
>>> +
>>> +static void *rte_thread_init(void *arg)
>>> +{
>>> + struct rte_thread_ctrl_params *params = arg;
>>> + void *(*start_routine)(void *) = params->start_routine;
>>> + void *routine_arg = params->arg;
>>> +
>>> + pthread_barrier_wait(¶ms->configured);
>>
>> This thread never wakes up. The call trace as below:
>>
>> #0 0x00007ffff72a8154 in futex_wait (private=0, expected=0,
>> futex_word=0x7fffffffcff4)
>> at ../sysdeps/unix/sysv/linux/futex-internal.h:61
>> #1 futex_wait_simple (private=0, expected=0,
>> futex_word=0x7fffffffcff4) at ../sysdeps/nptl/futex-internal.h:135
>> #2 __pthread_barrier_wait (barrier=0x7fffffffcff0) at
>> pthread_barrier_wait.c:184
>> #3 0x000000000055216a in rte_thread_init (arg=0x7fffffffcfe0) at
>> /home/tan/git/dpdk/lib/librte_eal/common/eal_common_thread.c:160
>> #4 0x00007ffff72a16ba in start_thread (arg=0x7ffff6ecf700) at
>> pthread_create.c:333
>> #5 0x00007ffff6fd741d in clone () at
>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
>>
>>> +
>>> + return start_routine(routine_arg);
>>> +}
>>> +
>>> __rte_experimental int
>>> -rte_ctrl_thread_create(pthread_t *thread,
>>> - const pthread_attr_t *attr,
>>> - void *(*start_routine)(void *), void *arg)
>>> +rte_ctrl_thread_create(pthread_t *thread, const char *name,
>>> + const pthread_attr_t *attr,
>>> + void *(*start_routine)(void *), void *arg)
>>> {
>>> - return pthread_create(thread, attr, start_routine, arg);
>>> + struct rte_thread_ctrl_params params = {
>>> + .start_routine = start_routine,
>>> + .arg = arg,
>>> + };
>
> Update:
>
> I doubt it's due to that we defined this variable, params, on the stack;
> and the value seems be overwritten by following code. Will send a patch
> to fix it.
I'm not sure i follow you, but looking forward to the fix :)
As far as i can tell, even if the variable is on the stack, we're making
copies of values there before destroying them, so even if param somehow
got destroyed before the thread had a chance to start, we've already got
all data we needed from it. I can't see how that value being allocated
on the stack makes a difference.
Just about the only thing i can see that's slightly wrong here is lack
of pthread_barrier_destroy(). Perhaps add that as well? :)
>
> Thanks,
> Jianfeng
>
>
>>> + int ret;
>>> +
>>> + pthread_barrier_init(¶ms.configured, NULL, 2);
>>> +
>>> + ret = pthread_create(thread, attr, rte_thread_init, (void
>>> *)¶ms);
>>> + if (ret != 0)
>>> + return ret;
>>> +
>>> + if (name != NULL) {
>>> + ret = rte_thread_setname(*thread, name);
>>> + if (ret < 0)
>>> + goto fail;
>>> + }
>>> +
>>> + pthread_barrier_wait(¶ms.configured);
>>
>> Here, the thread wakes up normally, and continues.
>>
>> Any idea on what's going on?
>>
>> Thanks,
>> Jianfeng
>>
>>> +
>>> + return 0;
>>> +
>>> +fail:
>>> + pthread_cancel(*thread);
>>> + pthread_join(*thread, NULL);
>>> + return ret;
>>> }
>>
>
>
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 41+ messages in thread
* [dpdk-dev] [PATCH v3 4/5] eal: set affinity for control threads
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 0/5] " Olivier Matz
` (2 preceding siblings ...)
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 3/5] eal: set name when creating a control thread Olivier Matz
@ 2018-04-24 14:46 ` Olivier Matz
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 5/5] examples: use new API to create " Olivier Matz
` (2 subsequent siblings)
6 siblings, 0 replies; 41+ messages in thread
From: Olivier Matz @ 2018-04-24 14:46 UTC (permalink / raw)
To: dev; +Cc: Anatoly Burakov
The management threads must not bother the dataplane or service cores.
Set the affinity of these threads accordingly.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
lib/librte_eal/common/eal_common_thread.c | 22 +++++++++++++++++++++-
lib/librte_eal/common/include/rte_lcore.h | 4 +++-
2 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c
index 94d2a6e42..4e75cb81f 100644
--- a/lib/librte_eal/common/eal_common_thread.c
+++ b/lib/librte_eal/common/eal_common_thread.c
@@ -16,6 +16,7 @@
#include <rte_memory.h>
#include <rte_log.h>
+#include "eal_private.h"
#include "eal_thread.h"
RTE_DECLARE_PER_LCORE(unsigned , _socket_id);
@@ -169,7 +170,9 @@ rte_ctrl_thread_create(pthread_t *thread, const char *name,
.start_routine = start_routine,
.arg = arg,
};
- int ret;
+ unsigned int lcore_id;
+ rte_cpuset_t cpuset;
+ int cpu_found, ret;
pthread_barrier_init(¶ms.configured, NULL, 2);
@@ -183,6 +186,23 @@ rte_ctrl_thread_create(pthread_t *thread, const char *name,
goto fail;
}
+ cpu_found = 0;
+ CPU_ZERO(&cpuset);
+ for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+ if (eal_cpu_detected(lcore_id) &&
+ rte_lcore_has_role(lcore_id, ROLE_OFF)) {
+ CPU_SET(lcore_id, &cpuset);
+ cpu_found = 1;
+ }
+ }
+ /* if no detected cpu is off, use master core */
+ if (!cpu_found)
+ CPU_SET(rte_get_master_lcore(), &cpuset);
+
+ ret = pthread_setaffinity_np(*thread, sizeof(cpuset), &cpuset);
+ if (ret < 0)
+ goto fail;
+
pthread_barrier_wait(¶ms.configured);
return 0;
diff --git a/lib/librte_eal/common/include/rte_lcore.h b/lib/librte_eal/common/include/rte_lcore.h
index eb39c2a8b..334a0629e 100644
--- a/lib/librte_eal/common/include/rte_lcore.h
+++ b/lib/librte_eal/common/include/rte_lcore.h
@@ -279,7 +279,9 @@ int rte_thread_setname(pthread_t id, const char *name);
/**
* Create a control thread.
*
- * Wrapper to pthread_create() and pthread_setname_np().
+ * Wrapper to pthread_create(), pthread_setname_np() and
+ * pthread_setaffinity_np(). The dataplane and service lcores are
+ * excluded from the affinity of the new thread.
*
* @param thread
* Filled with the thread id of the new created thread.
--
2.11.0
^ permalink raw reply [flat|nested] 41+ messages in thread
* [dpdk-dev] [PATCH v3 5/5] examples: use new API to create control threads
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 0/5] " Olivier Matz
` (3 preceding siblings ...)
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 4/5] eal: set affinity for control threads Olivier Matz
@ 2018-04-24 14:46 ` Olivier Matz
2018-04-24 22:53 ` [dpdk-dev] [PATCH v3 0/5] fix control thread affinities Thomas Monjalon
2018-04-30 15:45 ` [dpdk-dev] pthread_barrier_deadlock in -rc1 (was: "Re: [PATCH v3 0/5] fix control thread affinities") Maxime Coquelin
6 siblings, 0 replies; 41+ messages in thread
From: Olivier Matz @ 2018-04-24 14:46 UTC (permalink / raw)
To: dev; +Cc: Anatoly Burakov
A new API was introduced to create control thread:
rte_ctrl_thread_create(). Use it in examples when relevant.
While at it, change the prototype of the thread start functions: it's
not a good idea to cast it in (void *) since the compiler won't check
that the prototype is compatible.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
examples/tep_termination/main.c | 16 +++++++---------
examples/vhost/main.c | 19 +++++++------------
2 files changed, 14 insertions(+), 21 deletions(-)
diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
index 52b95025c..7795d0894 100644
--- a/examples/tep_termination/main.c
+++ b/examples/tep_termination/main.c
@@ -1061,8 +1061,8 @@ static const struct vhost_device_ops virtio_net_device_ops = {
* This is a thread will wake up after a period to print stats if the user has
* enabled them.
*/
-static void
-print_stats(void)
+static void *
+print_stats(__rte_unused void *arg)
{
struct virtio_net_data_ll *dev_ll;
uint64_t tx_dropped, rx_dropped;
@@ -1119,6 +1119,8 @@ print_stats(void)
}
printf("\n================================================\n");
}
+
+ return NULL;
}
/**
@@ -1134,7 +1136,6 @@ main(int argc, char *argv[])
uint16_t portid;
uint16_t queue_id;
static pthread_t tid;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
/* init EAL */
ret = rte_eal_init(argc, argv);
@@ -1205,13 +1206,10 @@ main(int argc, char *argv[])
/* Enable stats if the user option is set. */
if (enable_stats) {
- ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
- if (ret != 0)
+ ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
+ print_stats, NULL);
+ if (ret < 0)
rte_exit(EXIT_FAILURE, "Cannot create print-stats thread\n");
- snprintf(thread_name, sizeof(thread_name), "print-stats");
- ret = rte_thread_setname(tid, thread_name);
- if (ret != 0)
- RTE_LOG(DEBUG, VHOST_CONFIG, "Cannot set print-stats name\n");
}
/* Launch all data cores. */
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index e33a0f3c9..1659ef315 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1292,8 +1292,8 @@ static const struct vhost_device_ops virtio_net_device_ops =
* This is a thread will wake up after a period to print stats if the user has
* enabled them.
*/
-static void
-print_stats(void)
+static void *
+print_stats(__rte_unused void *arg)
{
struct vhost_dev *vdev;
uint64_t tx_dropped, rx_dropped;
@@ -1332,6 +1332,8 @@ print_stats(void)
printf("===================================================\n");
}
+
+ return NULL;
}
static void
@@ -1420,7 +1422,6 @@ main(int argc, char *argv[])
int ret, i;
uint16_t portid;
static pthread_t tid;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
uint64_t flags = 0;
signal(SIGINT, sigint_handler);
@@ -1493,17 +1494,11 @@ main(int argc, char *argv[])
/* Enable stats if the user option is set. */
if (enable_stats) {
- ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
- if (ret != 0)
+ ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
+ print_stats, NULL);
+ if (ret < 0)
rte_exit(EXIT_FAILURE,
"Cannot create print-stats thread\n");
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, sizeof(thread_name), "print-stats");
- ret = rte_thread_setname(tid, thread_name);
- if (ret != 0)
- RTE_LOG(DEBUG, VHOST_CONFIG,
- "Cannot set print-stats name\n");
}
/* Launch all data cores. */
--
2.11.0
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/5] fix control thread affinities
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 0/5] " Olivier Matz
` (4 preceding siblings ...)
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 5/5] examples: use new API to create " Olivier Matz
@ 2018-04-24 22:53 ` Thomas Monjalon
2018-04-30 15:45 ` [dpdk-dev] pthread_barrier_deadlock in -rc1 (was: "Re: [PATCH v3 0/5] fix control thread affinities") Maxime Coquelin
6 siblings, 0 replies; 41+ messages in thread
From: Thomas Monjalon @ 2018-04-24 22:53 UTC (permalink / raw)
To: Olivier Matz; +Cc: dev, Anatoly Burakov
> Olivier Matz (5):
> eal: use sizeof to avoid a double use of a define
> eal: new function to create control threads
> eal: set name when creating a control thread
> eal: set affinity for control threads
> examples: use new API to create control threads
Applied, thanks
^ permalink raw reply [flat|nested] 41+ messages in thread
* [dpdk-dev] pthread_barrier_deadlock in -rc1 (was: "Re: [PATCH v3 0/5] fix control thread affinities")
2018-04-24 14:46 ` [dpdk-dev] [PATCH v3 0/5] " Olivier Matz
` (5 preceding siblings ...)
2018-04-24 22:53 ` [dpdk-dev] [PATCH v3 0/5] fix control thread affinities Thomas Monjalon
@ 2018-04-30 15:45 ` Maxime Coquelin
2018-04-30 18:46 ` Olivier Matz
6 siblings, 1 reply; 41+ messages in thread
From: Maxime Coquelin @ 2018-04-30 15:45 UTC (permalink / raw)
To: Olivier Matz, dev; +Cc: Anatoly Burakov, Jianfeng Tan, Thomas Monjalon
Hi Olivier,
On 04/24/2018 04:46 PM, Olivier Matz wrote:
> Some parts of dpdk use their own management threads. Most of the time,
> the affinity of the thread is not properly set: it should not be scheduled
> on the dataplane cores, because interrupting them can cause packet losses.
>
> This patchset introduces a new wrapper for thread creation that does
> the job automatically, avoiding code duplication.
>
> v3:
> * new patch: use this API in examples when relevant.
> * replace pthread_kill by pthread_cancel. Note that pthread_join()
> is still needed.
> * rebase: vfio and pdump do not have control pthreads anymore, and eal
> has 2 new pthreads
> * remove all calls to snprintf/strlcpy that truncate the thread name:
> all strings lengths are already < 16.
>
> v2:
> * set affinity to master core if no core is off, as suggested by
> Anatoly
>
> Olivier Matz (5):
> eal: use sizeof to avoid a double use of a define
> eal: new function to create control threads
> eal: set name when creating a control thread
> eal: set affinity for control threads
> examples: use new API to create control threads
>
> drivers/net/kni/Makefile | 1 +
> drivers/net/kni/rte_eth_kni.c | 3 +-
> examples/tep_termination/main.c | 16 +++----
> examples/vhost/main.c | 19 +++-----
> lib/librte_eal/bsdapp/eal/eal.c | 4 +-
> lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
> lib/librte_eal/common/eal_common_proc.c | 15 ++----
> lib/librte_eal/common/eal_common_thread.c | 72 ++++++++++++++++++++++++++++
> lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
> lib/librte_eal/linuxapp/eal/eal.c | 4 +-
> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
> lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
> lib/librte_eal/rte_eal_version.map | 1 +
> lib/librte_vhost/socket.c | 25 ++--------
> 15 files changed, 135 insertions(+), 84 deletions(-)
>
I face a deadlock issue with your series, that Jianfeng patch does not
resolve ("eal: fix threads block on barrier"). Reverting the series and
Jianfeng patch makes the issue to disappear.
I face the problem in a VM (not seen on the host):
# ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type auto
--file-prefix pg -- --portmask=3 --forward-mode=macswap
--port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
--txd=256 --nb-cores=2 --auto-start
EAL: Detected 3 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: Auto-detected process type: PRIMARY
EAL: Multi-process socket /var/run/.pg_unix
Then it is stuck. Attaching with GDB, I get below backtrace information:
(gdb) info threads
Id Target Id Frame
3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
0x00007f63e2591bfd in recvmsg () at ../sysdeps/unix/syscall-template.S:81
2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
pthread_barrier_wait () at
../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
* 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
() at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
(gdb) bt full
#0 pthread_barrier_wait () at
../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
No locals.
#1 0x0000000000520c54 in rte_ctrl_thread_create
(thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
"rte_mp_async", attr=attr@entry=0x0,
start_routine=start_routine@entry=0x521030 <async_reply_handle>,
arg=arg@entry=0x0)
at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
params = 0x17b1e40
lcore_id = <optimized out>
cpuset = {__bits = {1, 0 <repeats 15 times>}}
cpu_found = <optimized out>
ret = 0
#2 0x00000000005220b6 in rte_mp_channel_init () at
/root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
path = "/var/run\000.pg_unix_*", '\000' <repeats 1301 times>...
dir_fd = 4
mp_handle_tid = 140066969745152
async_reply_handle_tid = 140066961352448
#3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
argv=argv@entry=0x7ffe5c896378) at
/root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
i = <optimized out>
fctret = 11
ret = <optimized out>
thread_id = 140066989861888
run_once = {cnt = 1}
logid = 0x17b1e00 "testpmd"
cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
"\020", '\000' <repeats 116 times>...
thread_name = "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
__func__ = "rte_eal_init"
#4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
/root/src/dpdk/app/test-pmd/testpmd.c:2597
diag = <optimized out>
port_id = <optimized out>
ret = <optimized out>
__func__ = "main"
(gdb) thread 2
[Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
#0 pthread_barrier_wait () at
../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
71 cmpl %edx, (%rdi)
(gdb) bt full
#0 pthread_barrier_wait () at
../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
No locals.
#1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
/root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
params = <optimized out>
start_routine = 0x521030 <async_reply_handle>
routine_arg = 0x0
#2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
pthread_create.c:308
__res = <optimized out>
pd = 0x7f63e179e700
now = <optimized out>
unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
1212869169857371576, 0, 8392704, 0, 140066961352448,
-1291626103561052744, -1291619793368703560}, mask_was_saved = 0}}, priv
= {pad = {0x0, 0x0, 0x0, 0x0}, data = {
prev = 0x0, cleanup = 0x0, canceltype = 0}}}
not_first_call = <optimized out>
pagesize_m1 = <optimized out>
sp = <optimized out>
freesize = <optimized out>
#3 0x00007f63e22b4b3d in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:113
No locals.
(gdb) thread 3
[Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
#0 0x00007f63e2591bfd in recvmsg () at
../sysdeps/unix/syscall-template.S:81
81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
(gdb) bt full
#0 0x00007f63e2591bfd in recvmsg () at
../sysdeps/unix/syscall-template.S:81
No locals.
#1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0, m=0x7f63e1f9d5a0)
at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
msglen = <optimized out>
control =
"\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
<repeats 23 times>, "\360\371\033\342c\177\000"
cmsg = <optimized out>
iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
msg_controllen = 48, msg_flags = 0}
#2 mp_handle (arg=<optimized out>) at
/root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
'\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
sa = {sun_family = 55104,
sun_path =
"\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
'\000' <repeats 34 times>, "\200\037\000\000\377\377"}
#3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
pthread_create.c:308
__res = <optimized out>
pd = 0x7f63e1f9f700
now = <optimized out>
unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
1212869169857371576, 0, 8392704, 0, 140066969745152,
-1291625004586295880, -1291619793368703560}, mask_was_saved = 0}}, priv
= {pad = {0x0, 0x0, 0x0, 0x0}, data = {
prev = 0x0, cleanup = 0x0, canceltype = 0}}}
not_first_call = <optimized out>
pagesize_m1 = <optimized out>
sp = <optimized out>
freesize = <optimized out>
#4 0x00007f63e22b4b3d in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:113
No locals.
I don't have more info for now.
Maxime
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1 (was: "Re: [PATCH v3 0/5] fix control thread affinities")
2018-04-30 15:45 ` [dpdk-dev] pthread_barrier_deadlock in -rc1 (was: "Re: [PATCH v3 0/5] fix control thread affinities") Maxime Coquelin
@ 2018-04-30 18:46 ` Olivier Matz
2018-05-01 8:59 ` Thomas Monjalon
2018-05-02 8:19 ` [dpdk-dev] pthread_barrier_deadlock in -rc1 Tan, Jianfeng
0 siblings, 2 replies; 41+ messages in thread
From: Olivier Matz @ 2018-04-30 18:46 UTC (permalink / raw)
To: Maxime Coquelin, dev; +Cc: Anatoly Burakov, Jianfeng Tan, Thomas Monjalon
Hi Maxime,
Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin <maxime.coquelin@redhat.com> a écrit :
>Hi Olivier,
>
>On 04/24/2018 04:46 PM, Olivier Matz wrote:
>> Some parts of dpdk use their own management threads. Most of the
>time,
>> the affinity of the thread is not properly set: it should not be
>scheduled
>> on the dataplane cores, because interrupting them can cause packet
>losses.
>>
>> This patchset introduces a new wrapper for thread creation that does
>> the job automatically, avoiding code duplication.
>>
>> v3:
>> * new patch: use this API in examples when relevant.
>> * replace pthread_kill by pthread_cancel. Note that pthread_join()
>> is still needed.
>> * rebase: vfio and pdump do not have control pthreads anymore, and
>eal
>> has 2 new pthreads
>> * remove all calls to snprintf/strlcpy that truncate the thread name:
>> all strings lengths are already < 16.
>>
>> v2:
>> * set affinity to master core if no core is off, as suggested by
>> Anatoly
>>
>> Olivier Matz (5):
>> eal: use sizeof to avoid a double use of a define
>> eal: new function to create control threads
>> eal: set name when creating a control thread
>> eal: set affinity for control threads
>> examples: use new API to create control threads
>>
>> drivers/net/kni/Makefile | 1 +
>> drivers/net/kni/rte_eth_kni.c | 3 +-
>> examples/tep_termination/main.c | 16 +++----
>> examples/vhost/main.c | 19 +++-----
>> lib/librte_eal/bsdapp/eal/eal.c | 4 +-
>> lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
>> lib/librte_eal/common/eal_common_proc.c | 15 ++----
>> lib/librte_eal/common/eal_common_thread.c | 72
>++++++++++++++++++++++++++++
>> lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
>> lib/librte_eal/linuxapp/eal/eal.c | 4 +-
>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
>> lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
>> lib/librte_eal/rte_eal_version.map | 1 +
>> lib/librte_vhost/socket.c | 25 ++--------
>> 15 files changed, 135 insertions(+), 84 deletions(-)
>>
>
>I face a deadlock issue with your series, that Jianfeng patch does not
>resolve ("eal: fix threads block on barrier"). Reverting the series and
>Jianfeng patch makes the issue to disappear.
>
>I face the problem in a VM (not seen on the host):
># ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
>auto
>--file-prefix pg -- --portmask=3 --forward-mode=macswap
>--port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
>--txd=256 --nb-cores=2 --auto-start
>EAL: Detected 3 lcore(s)
>EAL: Detected 1 NUMA nodes
>EAL: Auto-detected process type: PRIMARY
>EAL: Multi-process socket /var/run/.pg_unix
>
>
>Then it is stuck. Attaching with GDB, I get below backtrace
>information:
>
>(gdb) info threads
> Id Target Id Frame
> 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
>0x00007f63e2591bfd in recvmsg () at
>../sysdeps/unix/syscall-template.S:81
> 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
>pthread_barrier_wait () at
>../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>* 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
>() at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>(gdb) bt full
>#0 pthread_barrier_wait () at
>../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>No locals.
>#1 0x0000000000520c54 in rte_ctrl_thread_create
>(thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
>"rte_mp_async", attr=attr@entry=0x0,
>start_routine=start_routine@entry=0x521030 <async_reply_handle>,
>arg=arg@entry=0x0)
> at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
> params = 0x17b1e40
> lcore_id = <optimized out>
> cpuset = {__bits = {1, 0 <repeats 15 times>}}
> cpu_found = <optimized out>
> ret = 0
>#2 0x00000000005220b6 in rte_mp_channel_init () at
>/root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
> path = "/var/run\000.pg_unix_*", '\000' <repeats 1301 times>...
> dir_fd = 4
> mp_handle_tid = 140066969745152
> async_reply_handle_tid = 140066961352448
>#3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
>argv=argv@entry=0x7ffe5c896378) at
>/root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
> i = <optimized out>
> fctret = 11
> ret = <optimized out>
> thread_id = 140066989861888
> run_once = {cnt = 1}
> logid = 0x17b1e00 "testpmd"
> cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
>"\020", '\000' <repeats 116 times>...
> thread_name = "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
> __func__ = "rte_eal_init"
>#4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
>/root/src/dpdk/app/test-pmd/testpmd.c:2597
> diag = <optimized out>
> port_id = <optimized out>
> ret = <optimized out>
> __func__ = "main"
>(gdb) thread 2
>[Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
>#0 pthread_barrier_wait () at
>../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>71 cmpl %edx, (%rdi)
>(gdb) bt full
>#0 pthread_barrier_wait () at
>../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>No locals.
>#1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
>/root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
> params = <optimized out>
> start_routine = 0x521030 <async_reply_handle>
> routine_arg = 0x0
>#2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
>pthread_create.c:308
> __res = <optimized out>
> pd = 0x7f63e179e700
> now = <optimized out>
> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
>1212869169857371576, 0, 8392704, 0, 140066961352448,
>-1291626103561052744, -1291619793368703560}, mask_was_saved = 0}}, priv
>
>= {pad = {0x0, 0x0, 0x0, 0x0}, data = {
> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
> not_first_call = <optimized out>
> pagesize_m1 = <optimized out>
> sp = <optimized out>
> freesize = <optimized out>
>#3 0x00007f63e22b4b3d in clone () at
>../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>No locals.
>(gdb) thread 3
>[Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
>#0 0x00007f63e2591bfd in recvmsg () at
>../sysdeps/unix/syscall-template.S:81
>81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
>(gdb) bt full
>#0 0x00007f63e2591bfd in recvmsg () at
>../sysdeps/unix/syscall-template.S:81
>No locals.
>#1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0, m=0x7f63e1f9d5a0)
>
>at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
> msglen = <optimized out>
> control =
>"\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
><repeats 23 times>, "\360\371\033\342c\177\000"
> cmsg = <optimized out>
> iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
> msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
>0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
>msg_controllen = 48, msg_flags = 0}
>#2 mp_handle (arg=<optimized out>) at
>/root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
> msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
>len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
>'\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
> sa = {sun_family = 55104,
> sun_path =
>"\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
>
>'\000' <repeats 34 times>, "\200\037\000\000\377\377"}
>#3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
>pthread_create.c:308
> __res = <optimized out>
> pd = 0x7f63e1f9f700
> now = <optimized out>
> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
>1212869169857371576, 0, 8392704, 0, 140066969745152,
>-1291625004586295880, -1291619793368703560}, mask_was_saved = 0}}, priv
>
>= {pad = {0x0, 0x0, 0x0, 0x0}, data = {
> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
> not_first_call = <optimized out>
> pagesize_m1 = <optimized out>
> sp = <optimized out>
> freesize = <optimized out>
>#4 0x00007f63e22b4b3d in clone () at
>../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>No locals.
>
>I don't have more info for now.
>
Thanks for the feedback on this issue. I don't see obvious reason for this deadlock yet.
I'll investigate it asap (not tomorrow, but wednesday). In the worst case, we can revert the series if I cannot find the root cause rapidly.
Olivier
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1 (was: "Re: [PATCH v3 0/5] fix control thread affinities")
2018-04-30 18:46 ` Olivier Matz
@ 2018-05-01 8:59 ` Thomas Monjalon
2018-05-02 8:19 ` [dpdk-dev] pthread_barrier_deadlock in -rc1 Tan, Jianfeng
1 sibling, 0 replies; 41+ messages in thread
From: Thomas Monjalon @ 2018-05-01 8:59 UTC (permalink / raw)
To: Olivier Matz, Maxime Coquelin, Anatoly Burakov, Jianfeng Tan; +Cc: dev
30/04/2018 20:46, Olivier Matz:
> Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin <maxime.coquelin@redhat.com> a écrit :
> >I face a deadlock issue with your series, that Jianfeng patch does not
> >resolve ("eal: fix threads block on barrier"). Reverting the series and
> >Jianfeng patch makes the issue to disappear.
[...]
> >I don't have more info for now.
>
> Thanks for the feedback on this issue. I don't see obvious reason for this deadlock yet.
>
> I'll investigate it asap (not tomorrow, but wednesday). In the worst case, we can revert the series if I cannot find the root cause rapidly.
Guys, thanks for working on it.
There are so many big bugs in RC1 that we need to have an early RC2
today or tomorrow.
Maybe someone else, working today, can investigate?
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-04-30 18:46 ` Olivier Matz
2018-05-01 8:59 ` Thomas Monjalon
@ 2018-05-02 8:19 ` Tan, Jianfeng
2018-05-02 8:34 ` Maxime Coquelin
1 sibling, 1 reply; 41+ messages in thread
From: Tan, Jianfeng @ 2018-05-02 8:19 UTC (permalink / raw)
To: Olivier Matz, Maxime Coquelin, dev; +Cc: Anatoly Burakov, Thomas Monjalon
On 5/1/2018 2:46 AM, Olivier Matz wrote:
> Hi Maxime,
>
> Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin <maxime.coquelin@redhat.com> a écrit :
>> Hi Olivier,
>>
>> On 04/24/2018 04:46 PM, Olivier Matz wrote:
>>> Some parts of dpdk use their own management threads. Most of the
>> time,
>>> the affinity of the thread is not properly set: it should not be
>> scheduled
>>> on the dataplane cores, because interrupting them can cause packet
>> losses.
>>> This patchset introduces a new wrapper for thread creation that does
>>> the job automatically, avoiding code duplication.
>>>
>>> v3:
>>> * new patch: use this API in examples when relevant.
>>> * replace pthread_kill by pthread_cancel. Note that pthread_join()
>>> is still needed.
>>> * rebase: vfio and pdump do not have control pthreads anymore, and
>> eal
>>> has 2 new pthreads
>>> * remove all calls to snprintf/strlcpy that truncate the thread name:
>>> all strings lengths are already < 16.
>>>
>>> v2:
>>> * set affinity to master core if no core is off, as suggested by
>>> Anatoly
>>>
>>> Olivier Matz (5):
>>> eal: use sizeof to avoid a double use of a define
>>> eal: new function to create control threads
>>> eal: set name when creating a control thread
>>> eal: set affinity for control threads
>>> examples: use new API to create control threads
>>>
>>> drivers/net/kni/Makefile | 1 +
>>> drivers/net/kni/rte_eth_kni.c | 3 +-
>>> examples/tep_termination/main.c | 16 +++----
>>> examples/vhost/main.c | 19 +++-----
>>> lib/librte_eal/bsdapp/eal/eal.c | 4 +-
>>> lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
>>> lib/librte_eal/common/eal_common_proc.c | 15 ++----
>>> lib/librte_eal/common/eal_common_thread.c | 72
>> ++++++++++++++++++++++++++++
>>> lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
>>> lib/librte_eal/linuxapp/eal/eal.c | 4 +-
>>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
>>> lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
>>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
>>> lib/librte_eal/rte_eal_version.map | 1 +
>>> lib/librte_vhost/socket.c | 25 ++--------
>>> 15 files changed, 135 insertions(+), 84 deletions(-)
>>>
>> I face a deadlock issue with your series, that Jianfeng patch does not
>> resolve ("eal: fix threads block on barrier"). Reverting the series and
>> Jianfeng patch makes the issue to disappear.
>>
>> I face the problem in a VM (not seen on the host):
>> # ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
>> auto
>> --file-prefix pg -- --portmask=3 --forward-mode=macswap
>> --port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
>> --txd=256 --nb-cores=2 --auto-start
>> EAL: Detected 3 lcore(s)
>> EAL: Detected 1 NUMA nodes
>> EAL: Auto-detected process type: PRIMARY
>> EAL: Multi-process socket /var/run/.pg_unix
>>
>>
>> Then it is stuck. Attaching with GDB, I get below backtrace
>> information:
>>
>> (gdb) info threads
>> Id Target Id Frame
>> 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
>> 0x00007f63e2591bfd in recvmsg () at
>> ../sysdeps/unix/syscall-template.S:81
>> 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
>> pthread_barrier_wait () at
>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>> * 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
>> () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>> (gdb) bt full
>> #0 pthread_barrier_wait () at
>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>> No locals.
>> #1 0x0000000000520c54 in rte_ctrl_thread_create
>> (thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
>> "rte_mp_async", attr=attr@entry=0x0,
>> start_routine=start_routine@entry=0x521030 <async_reply_handle>,
>> arg=arg@entry=0x0)
>> at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
>> params = 0x17b1e40
>> lcore_id = <optimized out>
>> cpuset = {__bits = {1, 0 <repeats 15 times>}}
>> cpu_found = <optimized out>
>> ret = 0
>> #2 0x00000000005220b6 in rte_mp_channel_init () at
>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
>> path = "/var/run\000.pg_unix_*", '\000' <repeats 1301 times>...
>> dir_fd = 4
>> mp_handle_tid = 140066969745152
>> async_reply_handle_tid = 140066961352448
>> #3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
>> argv=argv@entry=0x7ffe5c896378) at
>> /root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
>> i = <optimized out>
>> fctret = 11
>> ret = <optimized out>
>> thread_id = 140066989861888
>> run_once = {cnt = 1}
>> logid = 0x17b1e00 "testpmd"
>> cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
>> "\020", '\000' <repeats 116 times>...
>> thread_name = "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
>> __func__ = "rte_eal_init"
>> #4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
>> /root/src/dpdk/app/test-pmd/testpmd.c:2597
>> diag = <optimized out>
>> port_id = <optimized out>
>> ret = <optimized out>
>> __func__ = "main"
>> (gdb) thread 2
>> [Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
>> #0 pthread_barrier_wait () at
>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>> 71 cmpl %edx, (%rdi)
>> (gdb) bt full
>> #0 pthread_barrier_wait () at
>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>> No locals.
>> #1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
>> /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
>> params = <optimized out>
>> start_routine = 0x521030 <async_reply_handle>
>> routine_arg = 0x0
>> #2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
>> pthread_create.c:308
>> __res = <optimized out>
>> pd = 0x7f63e179e700
>> now = <optimized out>
>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
>> 1212869169857371576, 0, 8392704, 0, 140066961352448,
>> -1291626103561052744, -1291619793368703560}, mask_was_saved = 0}}, priv
>>
>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>> not_first_call = <optimized out>
>> pagesize_m1 = <optimized out>
>> sp = <optimized out>
>> freesize = <optimized out>
>> #3 0x00007f63e22b4b3d in clone () at
>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>> No locals.
>> (gdb) thread 3
>> [Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
>> #0 0x00007f63e2591bfd in recvmsg () at
>> ../sysdeps/unix/syscall-template.S:81
>> 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
>> (gdb) bt full
>> #0 0x00007f63e2591bfd in recvmsg () at
>> ../sysdeps/unix/syscall-template.S:81
>> No locals.
>> #1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0, m=0x7f63e1f9d5a0)
>>
>> at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
>> msglen = <optimized out>
>> control =
>> "\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
>> <repeats 23 times>, "\360\371\033\342c\177\000"
>> cmsg = <optimized out>
>> iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
>> msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
>> 0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
>> msg_controllen = 48, msg_flags = 0}
>> #2 mp_handle (arg=<optimized out>) at
>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
>> msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
>> len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
>> '\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
>> sa = {sun_family = 55104,
>> sun_path =
>> "\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
>>
>> '\000' <repeats 34 times>, "\200\037\000\000\377\377"}
>> #3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
>> pthread_create.c:308
>> __res = <optimized out>
>> pd = 0x7f63e1f9f700
>> now = <optimized out>
>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
>> 1212869169857371576, 0, 8392704, 0, 140066969745152,
>> -1291625004586295880, -1291619793368703560}, mask_was_saved = 0}}, priv
>>
>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>> not_first_call = <optimized out>
>> pagesize_m1 = <optimized out>
>> sp = <optimized out>
>> freesize = <optimized out>
>> #4 0x00007f63e22b4b3d in clone () at
>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>> No locals.
>>
>> I don't have more info for now.
>>
>
> Thanks for the feedback on this issue. I don't see obvious reason for this deadlock yet.
>
> I'll investigate it asap (not tomorrow, but wednesday). In the worst case, we can revert the series if I cannot find the root cause rapidly.
I might think that the suggestion from Stephen of destroying the barrier
can help this issue. I'll try to reproduce it and test it before sending
a patch to fix it.
Thanks,
Jianfeng
>
> Olivier
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 8:19 ` [dpdk-dev] pthread_barrier_deadlock in -rc1 Tan, Jianfeng
@ 2018-05-02 8:34 ` Maxime Coquelin
2018-05-02 8:50 ` Tan, Jianfeng
0 siblings, 1 reply; 41+ messages in thread
From: Maxime Coquelin @ 2018-05-02 8:34 UTC (permalink / raw)
To: Tan, Jianfeng, Olivier Matz, dev; +Cc: Anatoly Burakov, Thomas Monjalon
Hi Jiangfeng,
On 05/02/2018 10:19 AM, Tan, Jianfeng wrote:
>
>
> On 5/1/2018 2:46 AM, Olivier Matz wrote:
>> Hi Maxime,
>>
>> Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin
>> <maxime.coquelin@redhat.com> a écrit :
>>> Hi Olivier,
>>>
>>> On 04/24/2018 04:46 PM, Olivier Matz wrote:
>>>> Some parts of dpdk use their own management threads. Most of the
>>> time,
>>>> the affinity of the thread is not properly set: it should not be
>>> scheduled
>>>> on the dataplane cores, because interrupting them can cause packet
>>> losses.
>>>> This patchset introduces a new wrapper for thread creation that does
>>>> the job automatically, avoiding code duplication.
>>>>
>>>> v3:
>>>> * new patch: use this API in examples when relevant.
>>>> * replace pthread_kill by pthread_cancel. Note that pthread_join()
>>>> is still needed.
>>>> * rebase: vfio and pdump do not have control pthreads anymore, and
>>> eal
>>>> has 2 new pthreads
>>>> * remove all calls to snprintf/strlcpy that truncate the thread name:
>>>> all strings lengths are already < 16.
>>>>
>>>> v2:
>>>> * set affinity to master core if no core is off, as suggested by
>>>> Anatoly
>>>>
>>>> Olivier Matz (5):
>>>> eal: use sizeof to avoid a double use of a define
>>>> eal: new function to create control threads
>>>> eal: set name when creating a control thread
>>>> eal: set affinity for control threads
>>>> examples: use new API to create control threads
>>>>
>>>> drivers/net/kni/Makefile | 1 +
>>>> drivers/net/kni/rte_eth_kni.c | 3 +-
>>>> examples/tep_termination/main.c | 16 +++----
>>>> examples/vhost/main.c | 19 +++-----
>>>> lib/librte_eal/bsdapp/eal/eal.c | 4 +-
>>>> lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
>>>> lib/librte_eal/common/eal_common_proc.c | 15 ++----
>>>> lib/librte_eal/common/eal_common_thread.c | 72
>>> ++++++++++++++++++++++++++++
>>>> lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
>>>> lib/librte_eal/linuxapp/eal/eal.c | 4 +-
>>>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
>>>> lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
>>>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
>>>> lib/librte_eal/rte_eal_version.map | 1 +
>>>> lib/librte_vhost/socket.c | 25 ++--------
>>>> 15 files changed, 135 insertions(+), 84 deletions(-)
>>>>
>>> I face a deadlock issue with your series, that Jianfeng patch does not
>>> resolve ("eal: fix threads block on barrier"). Reverting the series and
>>> Jianfeng patch makes the issue to disappear.
>>>
>>> I face the problem in a VM (not seen on the host):
>>> # ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
>>> auto
>>> --file-prefix pg -- --portmask=3 --forward-mode=macswap
>>> --port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
>>> --txd=256 --nb-cores=2 --auto-start
>>> EAL: Detected 3 lcore(s)
>>> EAL: Detected 1 NUMA nodes
>>> EAL: Auto-detected process type: PRIMARY
>>> EAL: Multi-process socket /var/run/.pg_unix
>>>
>>>
>>> Then it is stuck. Attaching with GDB, I get below backtrace
>>> information:
>>>
>>> (gdb) info threads
>>> Id Target Id Frame
>>> 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
>>> 0x00007f63e2591bfd in recvmsg () at
>>> ../sysdeps/unix/syscall-template.S:81
>>> 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
>>> pthread_barrier_wait () at
>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>> * 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
>>> () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>> (gdb) bt full
>>> #0 pthread_barrier_wait () at
>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>> No locals.
>>> #1 0x0000000000520c54 in rte_ctrl_thread_create
>>> (thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
>>> "rte_mp_async", attr=attr@entry=0x0,
>>> start_routine=start_routine@entry=0x521030 <async_reply_handle>,
>>> arg=arg@entry=0x0)
>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
>>> params = 0x17b1e40
>>> lcore_id = <optimized out>
>>> cpuset = {__bits = {1, 0 <repeats 15 times>}}
>>> cpu_found = <optimized out>
>>> ret = 0
>>> #2 0x00000000005220b6 in rte_mp_channel_init () at
>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
>>> path = "/var/run\000.pg_unix_*", '\000' <repeats 1301 times>...
>>> dir_fd = 4
>>> mp_handle_tid = 140066969745152
>>> async_reply_handle_tid = 140066961352448
>>> #3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
>>> argv=argv@entry=0x7ffe5c896378) at
>>> /root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
>>> i = <optimized out>
>>> fctret = 11
>>> ret = <optimized out>
>>> thread_id = 140066989861888
>>> run_once = {cnt = 1}
>>> logid = 0x17b1e00 "testpmd"
>>> cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
>>> "\020", '\000' <repeats 116 times>...
>>> thread_name = "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
>>> __func__ = "rte_eal_init"
>>> #4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
>>> /root/src/dpdk/app/test-pmd/testpmd.c:2597
>>> diag = <optimized out>
>>> port_id = <optimized out>
>>> ret = <optimized out>
>>> __func__ = "main"
>>> (gdb) thread 2
>>> [Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
>>> #0 pthread_barrier_wait () at
>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>> 71 cmpl %edx, (%rdi)
>>> (gdb) bt full
>>> #0 pthread_barrier_wait () at
>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>> No locals.
>>> #1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
>>> /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
>>> params = <optimized out>
>>> start_routine = 0x521030 <async_reply_handle>
>>> routine_arg = 0x0
>>> #2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
>>> pthread_create.c:308
>>> __res = <optimized out>
>>> pd = 0x7f63e179e700
>>> now = <optimized out>
>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
>>> 1212869169857371576, 0, 8392704, 0, 140066961352448,
>>> -1291626103561052744, -1291619793368703560}, mask_was_saved = 0}}, priv
>>>
>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>> not_first_call = <optimized out>
>>> pagesize_m1 = <optimized out>
>>> sp = <optimized out>
>>> freesize = <optimized out>
>>> #3 0x00007f63e22b4b3d in clone () at
>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>> No locals.
>>> (gdb) thread 3
>>> [Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
>>> #0 0x00007f63e2591bfd in recvmsg () at
>>> ../sysdeps/unix/syscall-template.S:81
>>> 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
>>> (gdb) bt full
>>> #0 0x00007f63e2591bfd in recvmsg () at
>>> ../sysdeps/unix/syscall-template.S:81
>>> No locals.
>>> #1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0, m=0x7f63e1f9d5a0)
>>>
>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
>>> msglen = <optimized out>
>>> control =
>>> "\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
>>> <repeats 23 times>, "\360\371\033\342c\177\000"
>>> cmsg = <optimized out>
>>> iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
>>> msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
>>> 0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
>>> msg_controllen = 48, msg_flags = 0}
>>> #2 mp_handle (arg=<optimized out>) at
>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
>>> msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
>>> len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
>>> '\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
>>> sa = {sun_family = 55104,
>>> sun_path =
>>> "\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
>>>
>>>
>>> '\000' <repeats 34 times>, "\200\037\000\000\377\377"}
>>> #3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
>>> pthread_create.c:308
>>> __res = <optimized out>
>>> pd = 0x7f63e1f9f700
>>> now = <optimized out>
>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
>>> 1212869169857371576, 0, 8392704, 0, 140066969745152,
>>> -1291625004586295880, -1291619793368703560}, mask_was_saved = 0}}, priv
>>>
>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>> not_first_call = <optimized out>
>>> pagesize_m1 = <optimized out>
>>> sp = <optimized out>
>>> freesize = <optimized out>
>>> #4 0x00007f63e22b4b3d in clone () at
>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>> No locals.
>>>
>>> I don't have more info for now.
>>>
>>
>> Thanks for the feedback on this issue. I don't see obvious reason for
>> this deadlock yet.
>>
>> I'll investigate it asap (not tomorrow, but wednesday). In the worst
>> case, we can revert the series if I cannot find the root cause rapidly.
>
> I might think that the suggestion from Stephen of destroying the barrier
> can help this issue. I'll try to reproduce it and test it before sending
> a patch to fix it.
In case you don't reproduce, feel free to send me the patch to test it.
Thanks,
Maxime
> Thanks,
> Jianfeng
>
>>
>> Olivier
>>
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 8:34 ` Maxime Coquelin
@ 2018-05-02 8:50 ` Tan, Jianfeng
2018-05-02 9:05 ` Maxime Coquelin
2018-05-02 9:30 ` Burakov, Anatoly
0 siblings, 2 replies; 41+ messages in thread
From: Tan, Jianfeng @ 2018-05-02 8:50 UTC (permalink / raw)
To: Maxime Coquelin, Olivier Matz, dev; +Cc: Anatoly Burakov, Thomas Monjalon
Hi Maxime,
On 5/2/2018 4:34 PM, Maxime Coquelin wrote:
> Hi Jiangfeng,
>
> On 05/02/2018 10:19 AM, Tan, Jianfeng wrote:
>>
>>
>> On 5/1/2018 2:46 AM, Olivier Matz wrote:
>>> Hi Maxime,
>>>
>>> Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin
>>> <maxime.coquelin@redhat.com> a écrit :
>>>> Hi Olivier,
>>>>
>>>> On 04/24/2018 04:46 PM, Olivier Matz wrote:
>>>>> Some parts of dpdk use their own management threads. Most of the
>>>> time,
>>>>> the affinity of the thread is not properly set: it should not be
>>>> scheduled
>>>>> on the dataplane cores, because interrupting them can cause packet
>>>> losses.
>>>>> This patchset introduces a new wrapper for thread creation that does
>>>>> the job automatically, avoiding code duplication.
>>>>>
>>>>> v3:
>>>>> * new patch: use this API in examples when relevant.
>>>>> * replace pthread_kill by pthread_cancel. Note that pthread_join()
>>>>> is still needed.
>>>>> * rebase: vfio and pdump do not have control pthreads anymore, and
>>>> eal
>>>>> has 2 new pthreads
>>>>> * remove all calls to snprintf/strlcpy that truncate the thread name:
>>>>> all strings lengths are already < 16.
>>>>>
>>>>> v2:
>>>>> * set affinity to master core if no core is off, as suggested by
>>>>> Anatoly
>>>>>
>>>>> Olivier Matz (5):
>>>>> eal: use sizeof to avoid a double use of a define
>>>>> eal: new function to create control threads
>>>>> eal: set name when creating a control thread
>>>>> eal: set affinity for control threads
>>>>> examples: use new API to create control threads
>>>>>
>>>>> drivers/net/kni/Makefile | 1 +
>>>>> drivers/net/kni/rte_eth_kni.c | 3 +-
>>>>> examples/tep_termination/main.c | 16 +++----
>>>>> examples/vhost/main.c | 19 +++-----
>>>>> lib/librte_eal/bsdapp/eal/eal.c | 4 +-
>>>>> lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
>>>>> lib/librte_eal/common/eal_common_proc.c | 15 ++----
>>>>> lib/librte_eal/common/eal_common_thread.c | 72
>>>> ++++++++++++++++++++++++++++
>>>>> lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
>>>>> lib/librte_eal/linuxapp/eal/eal.c | 4 +-
>>>>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
>>>>> lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
>>>>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
>>>>> lib/librte_eal/rte_eal_version.map | 1 +
>>>>> lib/librte_vhost/socket.c | 25 ++--------
>>>>> 15 files changed, 135 insertions(+), 84 deletions(-)
>>>>>
>>>> I face a deadlock issue with your series, that Jianfeng patch does not
>>>> resolve ("eal: fix threads block on barrier"). Reverting the series
>>>> and
>>>> Jianfeng patch makes the issue to disappear.
>>>>
>>>> I face the problem in a VM (not seen on the host):
>>>> # ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
>>>> auto
>>>> --file-prefix pg -- --portmask=3 --forward-mode=macswap
>>>> --port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
>>>> --txd=256 --nb-cores=2 --auto-start
>>>> EAL: Detected 3 lcore(s)
>>>> EAL: Detected 1 NUMA nodes
>>>> EAL: Auto-detected process type: PRIMARY
>>>> EAL: Multi-process socket /var/run/.pg_unix
>>>>
>>>>
>>>> Then it is stuck. Attaching with GDB, I get below backtrace
>>>> information:
>>>>
>>>> (gdb) info threads
>>>> Id Target Id Frame
>>>> 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
>>>> 0x00007f63e2591bfd in recvmsg () at
>>>> ../sysdeps/unix/syscall-template.S:81
>>>> 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
>>>> pthread_barrier_wait () at
>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>> * 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
>>>> () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>> (gdb) bt full
>>>> #0 pthread_barrier_wait () at
>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>> No locals.
>>>> #1 0x0000000000520c54 in rte_ctrl_thread_create
>>>> (thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
>>>> "rte_mp_async", attr=attr@entry=0x0,
>>>> start_routine=start_routine@entry=0x521030 <async_reply_handle>,
>>>> arg=arg@entry=0x0)
>>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
>>>> params = 0x17b1e40
>>>> lcore_id = <optimized out>
>>>> cpuset = {__bits = {1, 0 <repeats 15 times>}}
>>>> cpu_found = <optimized out>
>>>> ret = 0
>>>> #2 0x00000000005220b6 in rte_mp_channel_init () at
>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
>>>> path = "/var/run\000.pg_unix_*", '\000' <repeats 1301
>>>> times>...
>>>> dir_fd = 4
>>>> mp_handle_tid = 140066969745152
>>>> async_reply_handle_tid = 140066961352448
>>>> #3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
>>>> argv=argv@entry=0x7ffe5c896378) at
>>>> /root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
>>>> i = <optimized out>
>>>> fctret = 11
>>>> ret = <optimized out>
>>>> thread_id = 140066989861888
>>>> run_once = {cnt = 1}
>>>> logid = 0x17b1e00 "testpmd"
>>>> cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
>>>> "\020", '\000' <repeats 116 times>...
>>>> thread_name =
>>>> "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
>>>> __func__ = "rte_eal_init"
>>>> #4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
>>>> /root/src/dpdk/app/test-pmd/testpmd.c:2597
>>>> diag = <optimized out>
>>>> port_id = <optimized out>
>>>> ret = <optimized out>
>>>> __func__ = "main"
>>>> (gdb) thread 2
>>>> [Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
>>>> #0 pthread_barrier_wait () at
>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>> 71 cmpl %edx, (%rdi)
>>>> (gdb) bt full
>>>> #0 pthread_barrier_wait () at
>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>> No locals.
>>>> #1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
>>>> params = <optimized out>
>>>> start_routine = 0x521030 <async_reply_handle>
>>>> routine_arg = 0x0
>>>> #2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
>>>> pthread_create.c:308
>>>> __res = <optimized out>
>>>> pd = 0x7f63e179e700
>>>> now = <optimized out>
>>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
>>>> 1212869169857371576, 0, 8392704, 0, 140066961352448,
>>>> -1291626103561052744, -1291619793368703560}, mask_was_saved = 0}},
>>>> priv
>>>>
>>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>>> not_first_call = <optimized out>
>>>> pagesize_m1 = <optimized out>
>>>> sp = <optimized out>
>>>> freesize = <optimized out>
>>>> #3 0x00007f63e22b4b3d in clone () at
>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>>> No locals.
>>>> (gdb) thread 3
>>>> [Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
>>>> #0 0x00007f63e2591bfd in recvmsg () at
>>>> ../sysdeps/unix/syscall-template.S:81
>>>> 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
>>>> (gdb) bt full
>>>> #0 0x00007f63e2591bfd in recvmsg () at
>>>> ../sysdeps/unix/syscall-template.S:81
>>>> No locals.
>>>> #1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0,
>>>> m=0x7f63e1f9d5a0)
>>>>
>>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
>>>> msglen = <optimized out>
>>>> control =
>>>> "\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
>>>> <repeats 23 times>, "\360\371\033\342c\177\000"
>>>> cmsg = <optimized out>
>>>> iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
>>>> msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
>>>> 0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
>>>> msg_controllen = 48, msg_flags = 0}
>>>> #2 mp_handle (arg=<optimized out>) at
>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
>>>> msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
>>>> len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
>>>> '\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
>>>> sa = {sun_family = 55104,
>>>> sun_path =
>>>> "\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
>>>>
>>>>
>>>> '\000' <repeats 34 times>, "\200\037\000\000\377\377"}
>>>> #3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
>>>> pthread_create.c:308
>>>> __res = <optimized out>
>>>> pd = 0x7f63e1f9f700
>>>> now = <optimized out>
>>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
>>>> 1212869169857371576, 0, 8392704, 0, 140066969745152,
>>>> -1291625004586295880, -1291619793368703560}, mask_was_saved = 0}},
>>>> priv
>>>>
>>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>>> not_first_call = <optimized out>
>>>> pagesize_m1 = <optimized out>
>>>> sp = <optimized out>
>>>> freesize = <optimized out>
>>>> #4 0x00007f63e22b4b3d in clone () at
>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>>> No locals.
>>>>
>>>> I don't have more info for now.
>>>>
>>>
>>> Thanks for the feedback on this issue. I don't see obvious reason
>>> for this deadlock yet.
>>>
>>> I'll investigate it asap (not tomorrow, but wednesday). In the worst
>>> case, we can revert the series if I cannot find the root cause rapidly.
>>
>> I might think that the suggestion from Stephen of destroying the
>> barrier can help this issue. I'll try to reproduce it and test it
>> before sending a patch to fix it.
>
> In case you don't reproduce, feel free to send me the patch to test it.
Below patch can fix another strange sigsegv issue in my VM. Please check
if it works for you. I doubt it's use-after-free problem which could
lead to different issues in different env. Please have a try.
diff --git a/lib/librte_eal/common/eal_common_thread.c
b/lib/librte_eal/common/eal_common_thread.c
index de69452..d91b67d 100644
--- a/lib/librte_eal/common/eal_common_thread.c
+++ b/lib/librte_eal/common/eal_common_thread.c
@@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
*name,
goto fail;
pthread_barrier_wait(¶ms->configured);
+ pthread_barrier_destroy(¶ms->configured);
free(params);
return 0;
Thanks,
Jianfeng
>
> Thanks,
> Maxime
>
>> Thanks,
>> Jianfeng
>>
>>>
>>> Olivier
>>>
>>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 8:50 ` Tan, Jianfeng
@ 2018-05-02 9:05 ` Maxime Coquelin
2018-05-02 9:20 ` Olivier Matz
2018-05-02 9:30 ` Burakov, Anatoly
1 sibling, 1 reply; 41+ messages in thread
From: Maxime Coquelin @ 2018-05-02 9:05 UTC (permalink / raw)
To: Tan, Jianfeng, Olivier Matz, dev; +Cc: Anatoly Burakov, Thomas Monjalon
On 05/02/2018 10:50 AM, Tan, Jianfeng wrote:
> Hi Maxime,
>
>
> On 5/2/2018 4:34 PM, Maxime Coquelin wrote:
>> Hi Jiangfeng,
>>
>> On 05/02/2018 10:19 AM, Tan, Jianfeng wrote:
>>>
>>>
>>> On 5/1/2018 2:46 AM, Olivier Matz wrote:
>>>> Hi Maxime,
>>>>
>>>> Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin
>>>> <maxime.coquelin@redhat.com> a écrit :
>>>>> Hi Olivier,
>>>>>
>>>>> On 04/24/2018 04:46 PM, Olivier Matz wrote:
>>>>>> Some parts of dpdk use their own management threads. Most of the
>>>>> time,
>>>>>> the affinity of the thread is not properly set: it should not be
>>>>> scheduled
>>>>>> on the dataplane cores, because interrupting them can cause packet
>>>>> losses.
>>>>>> This patchset introduces a new wrapper for thread creation that does
>>>>>> the job automatically, avoiding code duplication.
>>>>>>
>>>>>> v3:
>>>>>> * new patch: use this API in examples when relevant.
>>>>>> * replace pthread_kill by pthread_cancel. Note that pthread_join()
>>>>>> is still needed.
>>>>>> * rebase: vfio and pdump do not have control pthreads anymore, and
>>>>> eal
>>>>>> has 2 new pthreads
>>>>>> * remove all calls to snprintf/strlcpy that truncate the thread name:
>>>>>> all strings lengths are already < 16.
>>>>>>
>>>>>> v2:
>>>>>> * set affinity to master core if no core is off, as suggested by
>>>>>> Anatoly
>>>>>>
>>>>>> Olivier Matz (5):
>>>>>> eal: use sizeof to avoid a double use of a define
>>>>>> eal: new function to create control threads
>>>>>> eal: set name when creating a control thread
>>>>>> eal: set affinity for control threads
>>>>>> examples: use new API to create control threads
>>>>>>
>>>>>> drivers/net/kni/Makefile | 1 +
>>>>>> drivers/net/kni/rte_eth_kni.c | 3 +-
>>>>>> examples/tep_termination/main.c | 16 +++----
>>>>>> examples/vhost/main.c | 19 +++-----
>>>>>> lib/librte_eal/bsdapp/eal/eal.c | 4 +-
>>>>>> lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
>>>>>> lib/librte_eal/common/eal_common_proc.c | 15 ++----
>>>>>> lib/librte_eal/common/eal_common_thread.c | 72
>>>>> ++++++++++++++++++++++++++++
>>>>>> lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
>>>>>> lib/librte_eal/linuxapp/eal/eal.c | 4 +-
>>>>>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
>>>>>> lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
>>>>>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
>>>>>> lib/librte_eal/rte_eal_version.map | 1 +
>>>>>> lib/librte_vhost/socket.c | 25 ++--------
>>>>>> 15 files changed, 135 insertions(+), 84 deletions(-)
>>>>>>
>>>>> I face a deadlock issue with your series, that Jianfeng patch does not
>>>>> resolve ("eal: fix threads block on barrier"). Reverting the series
>>>>> and
>>>>> Jianfeng patch makes the issue to disappear.
>>>>>
>>>>> I face the problem in a VM (not seen on the host):
>>>>> # ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
>>>>> auto
>>>>> --file-prefix pg -- --portmask=3 --forward-mode=macswap
>>>>> --port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
>>>>> --txd=256 --nb-cores=2 --auto-start
>>>>> EAL: Detected 3 lcore(s)
>>>>> EAL: Detected 1 NUMA nodes
>>>>> EAL: Auto-detected process type: PRIMARY
>>>>> EAL: Multi-process socket /var/run/.pg_unix
>>>>>
>>>>>
>>>>> Then it is stuck. Attaching with GDB, I get below backtrace
>>>>> information:
>>>>>
>>>>> (gdb) info threads
>>>>> Id Target Id Frame
>>>>> 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
>>>>> 0x00007f63e2591bfd in recvmsg () at
>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>> 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
>>>>> pthread_barrier_wait () at
>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> * 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
>>>>> () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> (gdb) bt full
>>>>> #0 pthread_barrier_wait () at
>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> No locals.
>>>>> #1 0x0000000000520c54 in rte_ctrl_thread_create
>>>>> (thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
>>>>> "rte_mp_async", attr=attr@entry=0x0,
>>>>> start_routine=start_routine@entry=0x521030 <async_reply_handle>,
>>>>> arg=arg@entry=0x0)
>>>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
>>>>> params = 0x17b1e40
>>>>> lcore_id = <optimized out>
>>>>> cpuset = {__bits = {1, 0 <repeats 15 times>}}
>>>>> cpu_found = <optimized out>
>>>>> ret = 0
>>>>> #2 0x00000000005220b6 in rte_mp_channel_init () at
>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
>>>>> path = "/var/run\000.pg_unix_*", '\000' <repeats 1301
>>>>> times>...
>>>>> dir_fd = 4
>>>>> mp_handle_tid = 140066969745152
>>>>> async_reply_handle_tid = 140066961352448
>>>>> #3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
>>>>> argv=argv@entry=0x7ffe5c896378) at
>>>>> /root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
>>>>> i = <optimized out>
>>>>> fctret = 11
>>>>> ret = <optimized out>
>>>>> thread_id = 140066989861888
>>>>> run_once = {cnt = 1}
>>>>> logid = 0x17b1e00 "testpmd"
>>>>> cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
>>>>> "\020", '\000' <repeats 116 times>...
>>>>> thread_name =
>>>>> "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
>>>>> __func__ = "rte_eal_init"
>>>>> #4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
>>>>> /root/src/dpdk/app/test-pmd/testpmd.c:2597
>>>>> diag = <optimized out>
>>>>> port_id = <optimized out>
>>>>> ret = <optimized out>
>>>>> __func__ = "main"
>>>>> (gdb) thread 2
>>>>> [Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
>>>>> #0 pthread_barrier_wait () at
>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> 71 cmpl %edx, (%rdi)
>>>>> (gdb) bt full
>>>>> #0 pthread_barrier_wait () at
>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> No locals.
>>>>> #1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
>>>>> params = <optimized out>
>>>>> start_routine = 0x521030 <async_reply_handle>
>>>>> routine_arg = 0x0
>>>>> #2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
>>>>> pthread_create.c:308
>>>>> __res = <optimized out>
>>>>> pd = 0x7f63e179e700
>>>>> now = <optimized out>
>>>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
>>>>> 1212869169857371576, 0, 8392704, 0, 140066961352448,
>>>>> -1291626103561052744, -1291619793368703560}, mask_was_saved = 0}},
>>>>> priv
>>>>>
>>>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>>>> not_first_call = <optimized out>
>>>>> pagesize_m1 = <optimized out>
>>>>> sp = <optimized out>
>>>>> freesize = <optimized out>
>>>>> #3 0x00007f63e22b4b3d in clone () at
>>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>>>> No locals.
>>>>> (gdb) thread 3
>>>>> [Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
>>>>> #0 0x00007f63e2591bfd in recvmsg () at
>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>> 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
>>>>> (gdb) bt full
>>>>> #0 0x00007f63e2591bfd in recvmsg () at
>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>> No locals.
>>>>> #1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0,
>>>>> m=0x7f63e1f9d5a0)
>>>>>
>>>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
>>>>> msglen = <optimized out>
>>>>> control =
>>>>> "\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
>>>>> <repeats 23 times>, "\360\371\033\342c\177\000"
>>>>> cmsg = <optimized out>
>>>>> iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
>>>>> msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
>>>>> 0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
>>>>> msg_controllen = 48, msg_flags = 0}
>>>>> #2 mp_handle (arg=<optimized out>) at
>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
>>>>> msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
>>>>> len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
>>>>> '\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
>>>>> sa = {sun_family = 55104,
>>>>> sun_path =
>>>>> "\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
>>>>>
>>>>>
>>>>> '\000' <repeats 34 times>, "\200\037\000\000\377\377"}
>>>>> #3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
>>>>> pthread_create.c:308
>>>>> __res = <optimized out>
>>>>> pd = 0x7f63e1f9f700
>>>>> now = <optimized out>
>>>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
>>>>> 1212869169857371576, 0, 8392704, 0, 140066969745152,
>>>>> -1291625004586295880, -1291619793368703560}, mask_was_saved = 0}},
>>>>> priv
>>>>>
>>>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>>>> not_first_call = <optimized out>
>>>>> pagesize_m1 = <optimized out>
>>>>> sp = <optimized out>
>>>>> freesize = <optimized out>
>>>>> #4 0x00007f63e22b4b3d in clone () at
>>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>>>> No locals.
>>>>>
>>>>> I don't have more info for now.
>>>>>
>>>>
>>>> Thanks for the feedback on this issue. I don't see obvious reason
>>>> for this deadlock yet.
>>>>
>>>> I'll investigate it asap (not tomorrow, but wednesday). In the worst
>>>> case, we can revert the series if I cannot find the root cause rapidly.
>>>
>>> I might think that the suggestion from Stephen of destroying the
>>> barrier can help this issue. I'll try to reproduce it and test it
>>> before sending a patch to fix it.
>>
>> In case you don't reproduce, feel free to send me the patch to test it.
>
> Below patch can fix another strange sigsegv issue in my VM. Please check
> if it works for you. I doubt it's use-after-free problem which could
> lead to different issues in different env. Please have a try.
>
>
> diff --git a/lib/librte_eal/common/eal_common_thread.c
> b/lib/librte_eal/common/eal_common_thread.c
> index de69452..d91b67d 100644
> --- a/lib/librte_eal/common/eal_common_thread.c
> +++ b/lib/librte_eal/common/eal_common_thread.c
> @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
> *name,
> goto fail;
>
> pthread_barrier_wait(¶ms->configured);
> + pthread_barrier_destroy(¶ms->configured);
Thanks Jianfeng, that fixes my issue.
For correctness, I wonder whether we should check pthread_barrier_wait
return, and only call destroy() if PTHREAD_BARRIER_SERIAL_THREAD?
And so also do same the same thing in rte_thread_init().
What do you think?
Thanks,
Maxime
> free(params);
>
> return 0;
>
> Thanks,
> Jianfeng
>
>>
>> Thanks,
>> Maxime
>>
>>> Thanks,
>>> Jianfeng
>>>
>>>>
>>>> Olivier
>>>>
>>>
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 9:05 ` Maxime Coquelin
@ 2018-05-02 9:20 ` Olivier Matz
2018-05-02 9:32 ` Tan, Jianfeng
0 siblings, 1 reply; 41+ messages in thread
From: Olivier Matz @ 2018-05-02 9:20 UTC (permalink / raw)
To: Maxime Coquelin; +Cc: Tan, Jianfeng, dev, Anatoly Burakov, Thomas Monjalon
Hi Jianfeng,
On Wed, May 02, 2018 at 11:05:12AM +0200, Maxime Coquelin wrote:
>
>
> On 05/02/2018 10:50 AM, Tan, Jianfeng wrote:
> > Hi Maxime,
> >
> >
> > On 5/2/2018 4:34 PM, Maxime Coquelin wrote:
> > > Hi Jiangfeng,
> > >
> > > On 05/02/2018 10:19 AM, Tan, Jianfeng wrote:
> > > >
> > > >
> > > > On 5/1/2018 2:46 AM, Olivier Matz wrote:
> > > > > Hi Maxime,
> > > > >
> > > > > Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin
> > > > > <maxime.coquelin@redhat.com> a écrit :
> > > > > > Hi Olivier,
> > > > > >
> > > > > > On 04/24/2018 04:46 PM, Olivier Matz wrote:
> > > > > > > Some parts of dpdk use their own management threads. Most of the
> > > > > > time,
> > > > > > > the affinity of the thread is not properly set: it should not be
> > > > > > scheduled
> > > > > > > on the dataplane cores, because interrupting them can cause packet
> > > > > > losses.
> > > > > > > This patchset introduces a new wrapper for thread creation that does
> > > > > > > the job automatically, avoiding code duplication.
> > > > > > >
> > > > > > > v3:
> > > > > > > * new patch: use this API in examples when relevant.
> > > > > > > * replace pthread_kill by pthread_cancel. Note that pthread_join()
> > > > > > > is still needed.
> > > > > > > * rebase: vfio and pdump do not have control pthreads anymore, and
> > > > > > eal
> > > > > > > has 2 new pthreads
> > > > > > > * remove all calls to snprintf/strlcpy that truncate the thread name:
> > > > > > > all strings lengths are already < 16.
> > > > > > >
> > > > > > > v2:
> > > > > > > * set affinity to master core if no core is off, as suggested by
> > > > > > > Anatoly
> > > > > > >
> > > > > > > Olivier Matz (5):
> > > > > > > eal: use sizeof to avoid a double use of a define
> > > > > > > eal: new function to create control threads
> > > > > > > eal: set name when creating a control thread
> > > > > > > eal: set affinity for control threads
> > > > > > > examples: use new API to create control threads
> > > > > > >
> > > > > > > drivers/net/kni/Makefile | 1 +
> > > > > > > drivers/net/kni/rte_eth_kni.c | 3 +-
> > > > > > > examples/tep_termination/main.c | 16 +++----
> > > > > > > examples/vhost/main.c | 19 +++-----
> > > > > > > lib/librte_eal/bsdapp/eal/eal.c | 4 +-
> > > > > > > lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
> > > > > > > lib/librte_eal/common/eal_common_proc.c | 15 ++----
> > > > > > > lib/librte_eal/common/eal_common_thread.c | 72
> > > > > > ++++++++++++++++++++++++++++
> > > > > > > lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
> > > > > > > lib/librte_eal/linuxapp/eal/eal.c | 4 +-
> > > > > > > lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
> > > > > > > lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
> > > > > > > lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
> > > > > > > lib/librte_eal/rte_eal_version.map | 1 +
> > > > > > > lib/librte_vhost/socket.c | 25 ++--------
> > > > > > > 15 files changed, 135 insertions(+), 84 deletions(-)
> > > > > > >
> > > > > > I face a deadlock issue with your series, that Jianfeng patch does not
> > > > > > resolve ("eal: fix threads block on barrier"). Reverting
> > > > > > the series and
> > > > > > Jianfeng patch makes the issue to disappear.
> > > > > >
> > > > > > I face the problem in a VM (not seen on the host):
> > > > > > # ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
> > > > > > auto
> > > > > > --file-prefix pg -- --portmask=3 --forward-mode=macswap
> > > > > > --port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
> > > > > > --txd=256 --nb-cores=2 --auto-start
> > > > > > EAL: Detected 3 lcore(s)
> > > > > > EAL: Detected 1 NUMA nodes
> > > > > > EAL: Auto-detected process type: PRIMARY
> > > > > > EAL: Multi-process socket /var/run/.pg_unix
> > > > > >
> > > > > >
> > > > > > Then it is stuck. Attaching with GDB, I get below backtrace
> > > > > > information:
> > > > > >
> > > > > > (gdb) info threads
> > > > > > Id Target Id Frame
> > > > > > 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
> > > > > > 0x00007f63e2591bfd in recvmsg () at
> > > > > > ../sysdeps/unix/syscall-template.S:81
> > > > > > 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
> > > > > > pthread_barrier_wait () at
> > > > > > ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > * 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
> > > > > > () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > (gdb) bt full
> > > > > > #0 pthread_barrier_wait () at
> > > > > > ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > No locals.
> > > > > > #1 0x0000000000520c54 in rte_ctrl_thread_create
> > > > > > (thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
> > > > > > "rte_mp_async", attr=attr@entry=0x0,
> > > > > > start_routine=start_routine@entry=0x521030 <async_reply_handle>,
> > > > > > arg=arg@entry=0x0)
> > > > > > at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
> > > > > > params = 0x17b1e40
> > > > > > lcore_id = <optimized out>
> > > > > > cpuset = {__bits = {1, 0 <repeats 15 times>}}
> > > > > > cpu_found = <optimized out>
> > > > > > ret = 0
> > > > > > #2 0x00000000005220b6 in rte_mp_channel_init () at
> > > > > > /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
> > > > > > path = "/var/run\000.pg_unix_*", '\000' <repeats
> > > > > > 1301 times>...
> > > > > > dir_fd = 4
> > > > > > mp_handle_tid = 140066969745152
> > > > > > async_reply_handle_tid = 140066961352448
> > > > > > #3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
> > > > > > argv=argv@entry=0x7ffe5c896378) at
> > > > > > /root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
> > > > > > i = <optimized out>
> > > > > > fctret = 11
> > > > > > ret = <optimized out>
> > > > > > thread_id = 140066989861888
> > > > > > run_once = {cnt = 1}
> > > > > > logid = 0x17b1e00 "testpmd"
> > > > > > cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
> > > > > > "\020", '\000' <repeats 116 times>...
> > > > > > thread_name =
> > > > > > "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
> > > > > > __func__ = "rte_eal_init"
> > > > > > #4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
> > > > > > /root/src/dpdk/app/test-pmd/testpmd.c:2597
> > > > > > diag = <optimized out>
> > > > > > port_id = <optimized out>
> > > > > > ret = <optimized out>
> > > > > > __func__ = "main"
> > > > > > (gdb) thread 2
> > > > > > [Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
> > > > > > #0 pthread_barrier_wait () at
> > > > > > ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > 71 cmpl %edx, (%rdi)
> > > > > > (gdb) bt full
> > > > > > #0 pthread_barrier_wait () at
> > > > > > ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > No locals.
> > > > > > #1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
> > > > > > /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
> > > > > > params = <optimized out>
> > > > > > start_routine = 0x521030 <async_reply_handle>
> > > > > > routine_arg = 0x0
> > > > > > #2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
> > > > > > pthread_create.c:308
> > > > > > __res = <optimized out>
> > > > > > pd = 0x7f63e179e700
> > > > > > now = <optimized out>
> > > > > > unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
> > > > > > 1212869169857371576, 0, 8392704, 0, 140066961352448,
> > > > > > -1291626103561052744, -1291619793368703560},
> > > > > > mask_was_saved = 0}}, priv
> > > > > >
> > > > > > = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
> > > > > > prev = 0x0, cleanup = 0x0, canceltype = 0}}}
> > > > > > not_first_call = <optimized out>
> > > > > > pagesize_m1 = <optimized out>
> > > > > > sp = <optimized out>
> > > > > > freesize = <optimized out>
> > > > > > #3 0x00007f63e22b4b3d in clone () at
> > > > > > ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
> > > > > > No locals.
> > > > > > (gdb) thread 3
> > > > > > [Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
> > > > > > #0 0x00007f63e2591bfd in recvmsg () at
> > > > > > ../sysdeps/unix/syscall-template.S:81
> > > > > > 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
> > > > > > (gdb) bt full
> > > > > > #0 0x00007f63e2591bfd in recvmsg () at
> > > > > > ../sysdeps/unix/syscall-template.S:81
> > > > > > No locals.
> > > > > > #1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0,
> > > > > > m=0x7f63e1f9d5a0)
> > > > > >
> > > > > > at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
> > > > > > msglen = <optimized out>
> > > > > > control =
> > > > > > "\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
> > > > > > <repeats 23 times>, "\360\371\033\342c\177\000"
> > > > > > cmsg = <optimized out>
> > > > > > iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
> > > > > > msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
> > > > > > 0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
> > > > > > msg_controllen = 48, msg_flags = 0}
> > > > > > #2 mp_handle (arg=<optimized out>) at
> > > > > > /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
> > > > > > msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
> > > > > > len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
> > > > > > '\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
> > > > > > sa = {sun_family = 55104,
> > > > > > sun_path =
> > > > > > "\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
> > > > > >
> > > > > >
> > > > > > '\000' <repeats 34 times>, "\200\037\000\000\377\377"}
> > > > > > #3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
> > > > > > pthread_create.c:308
> > > > > > __res = <optimized out>
> > > > > > pd = 0x7f63e1f9f700
> > > > > > now = <optimized out>
> > > > > > unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
> > > > > > 1212869169857371576, 0, 8392704, 0, 140066969745152,
> > > > > > -1291625004586295880, -1291619793368703560},
> > > > > > mask_was_saved = 0}}, priv
> > > > > >
> > > > > > = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
> > > > > > prev = 0x0, cleanup = 0x0, canceltype = 0}}}
> > > > > > not_first_call = <optimized out>
> > > > > > pagesize_m1 = <optimized out>
> > > > > > sp = <optimized out>
> > > > > > freesize = <optimized out>
> > > > > > #4 0x00007f63e22b4b3d in clone () at
> > > > > > ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
> > > > > > No locals.
> > > > > >
> > > > > > I don't have more info for now.
> > > > > >
> > > > >
> > > > > Thanks for the feedback on this issue. I don't see obvious
> > > > > reason for this deadlock yet.
> > > > >
> > > > > I'll investigate it asap (not tomorrow, but wednesday). In
> > > > > the worst case, we can revert the series if I cannot find
> > > > > the root cause rapidly.
> > > >
> > > > I might think that the suggestion from Stephen of destroying the
> > > > barrier can help this issue. I'll try to reproduce it and test
> > > > it before sending a patch to fix it.
> > >
> > > In case you don't reproduce, feel free to send me the patch to test it.
> >
> > Below patch can fix another strange sigsegv issue in my VM. Please check
> > if it works for you. I doubt it's use-after-free problem which could
> > lead to different issues in different env. Please have a try.
> >
> >
> > diff --git a/lib/librte_eal/common/eal_common_thread.c
> > b/lib/librte_eal/common/eal_common_thread.c
> > index de69452..d91b67d 100644
> > --- a/lib/librte_eal/common/eal_common_thread.c
> > +++ b/lib/librte_eal/common/eal_common_thread.c
> > @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
> > *name,
> > goto fail;
> >
> > pthread_barrier_wait(¶ms->configured);
> > + pthread_barrier_destroy(¶ms->configured);
>
> Thanks Jianfeng, that fixes my issue.
> For correctness, I wonder whether we should check pthread_barrier_wait
> return, and only call destroy() if PTHREAD_BARRIER_SERIAL_THREAD?
> And so also do same the same thing in rte_thread_init().
>
> What do you think?
> Thanks,
> Maxime
Thanks for the update. I also have a patch that replaces the barrier by
a lock which could also work, but if Jianfeng's one fixes the issue, I
think it is better.
About the PTHREAD_BARRIER_SERIAL_THREAD, not sure it will change
something:
Upon successful completion, the pthread_barrier_wait() function
shall return PTHREAD_BARRIER_SERIAL_THREAD for a single
(arbitrary) thread synchronized at the barrier and zero for each
of the other threads. Otherwise, an error number shall be
returned to indicate the error.
I understand that it will ensure that only one barrier will return
PTHREAD_BARRIER_SERIAL_THREAD, but not necessarily the last one. So
if destroy() is called in the parent thread, it should be the same, no?
By the way, there is also a small memory leak that was introduced by
the previous patch, maybe you can add the fix too:
- if (ret != 0)
+ if (ret != 0) {
+ free(params);
return ret;
+ }
Olivier
>
> > free(params);
> >
> > return 0;
> >
> > Thanks,
> > Jianfeng
> >
> > >
> > > Thanks,
> > > Maxime
> > >
> > > > Thanks,
> > > > Jianfeng
> > > >
> > > > >
> > > > > Olivier
> > > > >
> > > >
> >
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 9:20 ` Olivier Matz
@ 2018-05-02 9:32 ` Tan, Jianfeng
2018-05-02 9:41 ` Maxime Coquelin
0 siblings, 1 reply; 41+ messages in thread
From: Tan, Jianfeng @ 2018-05-02 9:32 UTC (permalink / raw)
To: Olivier Matz, Maxime Coquelin; +Cc: dev, Anatoly Burakov, Thomas Monjalon
Hi Maxime and Olivier,
[...]
>>> Below patch can fix another strange sigsegv issue in my VM. Please check
>>> if it works for you. I doubt it's use-after-free problem which could
>>> lead to different issues in different env. Please have a try.
>>>
>>>
>>> diff --git a/lib/librte_eal/common/eal_common_thread.c
>>> b/lib/librte_eal/common/eal_common_thread.c
>>> index de69452..d91b67d 100644
>>> --- a/lib/librte_eal/common/eal_common_thread.c
>>> +++ b/lib/librte_eal/common/eal_common_thread.c
>>> @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
>>> *name,
>>> goto fail;
>>>
>>> pthread_barrier_wait(¶ms->configured);
>>> + pthread_barrier_destroy(¶ms->configured);
>> Thanks Jianfeng, that fixes my issue.
>> For correctness, I wonder whether we should check pthread_barrier_wait
>> return, and only call destroy() if PTHREAD_BARRIER_SERIAL_THREAD?
>> And so also do same the same thing in rte_thread_init().
>>
>> What do you think?
>> Thanks,
>> Maxime
>
> Thanks for the update. I also have a patch that replaces the barrier by
> a lock which could also work, but if Jianfeng's one fixes the issue, I
> think it is better.
>
> About the PTHREAD_BARRIER_SERIAL_THREAD, not sure it will change
> something:
>
> Upon successful completion, the pthread_barrier_wait() function
> shall return PTHREAD_BARRIER_SERIAL_THREAD for a single
> (arbitrary) thread synchronized at the barrier and zero for each
> of the other threads. Otherwise, an error number shall be
> returned to indicate the error.
>
> I understand that it will ensure that only one barrier will return
> PTHREAD_BARRIER_SERIAL_THREAD, but not necessarily the last one. So
> if destroy() is called in the parent thread, it should be the same, no?
>
> By the way, there is also a small memory leak that was introduced by
> the previous patch, maybe you can add the fix too:
>
> - if (ret != 0)
> + if (ret != 0) {
> + free(params);
> return ret;
> + }
How about: the thread who gets PTHREAD_BARRIER_SERIAL_THREAD returned,
is responsible for the destroy and free(params)?
Thanks,
Jianfeng
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 9:32 ` Tan, Jianfeng
@ 2018-05-02 9:41 ` Maxime Coquelin
0 siblings, 0 replies; 41+ messages in thread
From: Maxime Coquelin @ 2018-05-02 9:41 UTC (permalink / raw)
To: Tan, Jianfeng, Olivier Matz; +Cc: dev, Anatoly Burakov, Thomas Monjalon
On 05/02/2018 11:32 AM, Tan, Jianfeng wrote:
> Hi Maxime and Olivier,
>
> [...]
>>>> Below patch can fix another strange sigsegv issue in my VM. Please
>>>> check
>>>> if it works for you. I doubt it's use-after-free problem which could
>>>> lead to different issues in different env. Please have a try.
>>>>
>>>>
>>>> diff --git a/lib/librte_eal/common/eal_common_thread.c
>>>> b/lib/librte_eal/common/eal_common_thread.c
>>>> index de69452..d91b67d 100644
>>>> --- a/lib/librte_eal/common/eal_common_thread.c
>>>> +++ b/lib/librte_eal/common/eal_common_thread.c
>>>> @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const
>>>> char
>>>> *name,
>>>> goto fail;
>>>>
>>>> pthread_barrier_wait(¶ms->configured);
>>>> + pthread_barrier_destroy(¶ms->configured);
>>> Thanks Jianfeng, that fixes my issue.
>>> For correctness, I wonder whether we should check pthread_barrier_wait
>>> return, and only call destroy() if PTHREAD_BARRIER_SERIAL_THREAD?
>>> And so also do same the same thing in rte_thread_init().
>>>
>>> What do you think?
>>> Thanks,
>>> Maxime
>>
>> Thanks for the update. I also have a patch that replaces the barrier by
>> a lock which could also work, but if Jianfeng's one fixes the issue, I
>> think it is better.
>>
>> About the PTHREAD_BARRIER_SERIAL_THREAD, not sure it will change
>> something:
>>
>> Upon successful completion, the pthread_barrier_wait() function
>> shall return PTHREAD_BARRIER_SERIAL_THREAD for a single
>> (arbitrary) thread synchronized at the barrier and zero for each
>> of the other threads. Otherwise, an error number shall be
>> returned to indicate the error.
>>
>> I understand that it will ensure that only one barrier will return
>> PTHREAD_BARRIER_SERIAL_THREAD, but not necessarily the last one. So
>> if destroy() is called in the parent thread, it should be the same, no?
>>
>> By the way, there is also a small memory leak that was introduced by
>> the previous patch, maybe you can add the fix too:
>>
>> - if (ret != 0)
>> + if (ret != 0) {
>> + free(params);
>> return ret;
>> + }
>
> How about: the thread who gets PTHREAD_BARRIER_SERIAL_THREAD returned,
> is responsible for the destroy and free(params)?
I agree with your suggestion.
Thanks,
Maxime
> Thanks,
> Jianfeng
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 8:50 ` Tan, Jianfeng
2018-05-02 9:05 ` Maxime Coquelin
@ 2018-05-02 9:30 ` Burakov, Anatoly
2018-05-02 9:38 ` Tan, Jianfeng
2018-05-02 9:57 ` Olivier Matz
1 sibling, 2 replies; 41+ messages in thread
From: Burakov, Anatoly @ 2018-05-02 9:30 UTC (permalink / raw)
To: Tan, Jianfeng, Maxime Coquelin, Olivier Matz, dev; +Cc: Thomas Monjalon
On 02-May-18 9:50 AM, Tan, Jianfeng wrote:
> Hi Maxime,
>
>
> On 5/2/2018 4:34 PM, Maxime Coquelin wrote:
>> Hi Jiangfeng,
>>
>> On 05/02/2018 10:19 AM, Tan, Jianfeng wrote:
>>>
>>>
>>> On 5/1/2018 2:46 AM, Olivier Matz wrote:
>>>> Hi Maxime,
>>>>
>>>> Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin
>>>> <maxime.coquelin@redhat.com> a écrit :
>>>>> Hi Olivier,
>>>>>
>>>>> On 04/24/2018 04:46 PM, Olivier Matz wrote:
>>>>>> Some parts of dpdk use their own management threads. Most of the
>>>>> time,
>>>>>> the affinity of the thread is not properly set: it should not be
>>>>> scheduled
>>>>>> on the dataplane cores, because interrupting them can cause packet
>>>>> losses.
>>>>>> This patchset introduces a new wrapper for thread creation that does
>>>>>> the job automatically, avoiding code duplication.
>>>>>>
>>>>>> v3:
>>>>>> * new patch: use this API in examples when relevant.
>>>>>> * replace pthread_kill by pthread_cancel. Note that pthread_join()
>>>>>> is still needed.
>>>>>> * rebase: vfio and pdump do not have control pthreads anymore, and
>>>>> eal
>>>>>> has 2 new pthreads
>>>>>> * remove all calls to snprintf/strlcpy that truncate the thread name:
>>>>>> all strings lengths are already < 16.
>>>>>>
>>>>>> v2:
>>>>>> * set affinity to master core if no core is off, as suggested by
>>>>>> Anatoly
>>>>>>
>>>>>> Olivier Matz (5):
>>>>>> eal: use sizeof to avoid a double use of a define
>>>>>> eal: new function to create control threads
>>>>>> eal: set name when creating a control thread
>>>>>> eal: set affinity for control threads
>>>>>> examples: use new API to create control threads
>>>>>>
>>>>>> drivers/net/kni/Makefile | 1 +
>>>>>> drivers/net/kni/rte_eth_kni.c | 3 +-
>>>>>> examples/tep_termination/main.c | 16 +++----
>>>>>> examples/vhost/main.c | 19 +++-----
>>>>>> lib/librte_eal/bsdapp/eal/eal.c | 4 +-
>>>>>> lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
>>>>>> lib/librte_eal/common/eal_common_proc.c | 15 ++----
>>>>>> lib/librte_eal/common/eal_common_thread.c | 72
>>>>> ++++++++++++++++++++++++++++
>>>>>> lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
>>>>>> lib/librte_eal/linuxapp/eal/eal.c | 4 +-
>>>>>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
>>>>>> lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
>>>>>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
>>>>>> lib/librte_eal/rte_eal_version.map | 1 +
>>>>>> lib/librte_vhost/socket.c | 25 ++--------
>>>>>> 15 files changed, 135 insertions(+), 84 deletions(-)
>>>>>>
>>>>> I face a deadlock issue with your series, that Jianfeng patch does not
>>>>> resolve ("eal: fix threads block on barrier"). Reverting the series
>>>>> and
>>>>> Jianfeng patch makes the issue to disappear.
>>>>>
>>>>> I face the problem in a VM (not seen on the host):
>>>>> # ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
>>>>> auto
>>>>> --file-prefix pg -- --portmask=3 --forward-mode=macswap
>>>>> --port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
>>>>> --txd=256 --nb-cores=2 --auto-start
>>>>> EAL: Detected 3 lcore(s)
>>>>> EAL: Detected 1 NUMA nodes
>>>>> EAL: Auto-detected process type: PRIMARY
>>>>> EAL: Multi-process socket /var/run/.pg_unix
>>>>>
>>>>>
>>>>> Then it is stuck. Attaching with GDB, I get below backtrace
>>>>> information:
>>>>>
>>>>> (gdb) info threads
>>>>> Id Target Id Frame
>>>>> 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
>>>>> 0x00007f63e2591bfd in recvmsg () at
>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>> 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
>>>>> pthread_barrier_wait () at
>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> * 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
>>>>> () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> (gdb) bt full
>>>>> #0 pthread_barrier_wait () at
>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> No locals.
>>>>> #1 0x0000000000520c54 in rte_ctrl_thread_create
>>>>> (thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
>>>>> "rte_mp_async", attr=attr@entry=0x0,
>>>>> start_routine=start_routine@entry=0x521030 <async_reply_handle>,
>>>>> arg=arg@entry=0x0)
>>>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
>>>>> params = 0x17b1e40
>>>>> lcore_id = <optimized out>
>>>>> cpuset = {__bits = {1, 0 <repeats 15 times>}}
>>>>> cpu_found = <optimized out>
>>>>> ret = 0
>>>>> #2 0x00000000005220b6 in rte_mp_channel_init () at
>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
>>>>> path = "/var/run\000.pg_unix_*", '\000' <repeats 1301
>>>>> times>...
>>>>> dir_fd = 4
>>>>> mp_handle_tid = 140066969745152
>>>>> async_reply_handle_tid = 140066961352448
>>>>> #3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
>>>>> argv=argv@entry=0x7ffe5c896378) at
>>>>> /root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
>>>>> i = <optimized out>
>>>>> fctret = 11
>>>>> ret = <optimized out>
>>>>> thread_id = 140066989861888
>>>>> run_once = {cnt = 1}
>>>>> logid = 0x17b1e00 "testpmd"
>>>>> cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
>>>>> "\020", '\000' <repeats 116 times>...
>>>>> thread_name =
>>>>> "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
>>>>> __func__ = "rte_eal_init"
>>>>> #4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
>>>>> /root/src/dpdk/app/test-pmd/testpmd.c:2597
>>>>> diag = <optimized out>
>>>>> port_id = <optimized out>
>>>>> ret = <optimized out>
>>>>> __func__ = "main"
>>>>> (gdb) thread 2
>>>>> [Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
>>>>> #0 pthread_barrier_wait () at
>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> 71 cmpl %edx, (%rdi)
>>>>> (gdb) bt full
>>>>> #0 pthread_barrier_wait () at
>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>> No locals.
>>>>> #1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
>>>>> params = <optimized out>
>>>>> start_routine = 0x521030 <async_reply_handle>
>>>>> routine_arg = 0x0
>>>>> #2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
>>>>> pthread_create.c:308
>>>>> __res = <optimized out>
>>>>> pd = 0x7f63e179e700
>>>>> now = <optimized out>
>>>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
>>>>> 1212869169857371576, 0, 8392704, 0, 140066961352448,
>>>>> -1291626103561052744, -1291619793368703560}, mask_was_saved = 0}},
>>>>> priv
>>>>>
>>>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>>>> not_first_call = <optimized out>
>>>>> pagesize_m1 = <optimized out>
>>>>> sp = <optimized out>
>>>>> freesize = <optimized out>
>>>>> #3 0x00007f63e22b4b3d in clone () at
>>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>>>> No locals.
>>>>> (gdb) thread 3
>>>>> [Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
>>>>> #0 0x00007f63e2591bfd in recvmsg () at
>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>> 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
>>>>> (gdb) bt full
>>>>> #0 0x00007f63e2591bfd in recvmsg () at
>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>> No locals.
>>>>> #1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0,
>>>>> m=0x7f63e1f9d5a0)
>>>>>
>>>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
>>>>> msglen = <optimized out>
>>>>> control =
>>>>> "\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
>>>>> <repeats 23 times>, "\360\371\033\342c\177\000"
>>>>> cmsg = <optimized out>
>>>>> iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
>>>>> msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
>>>>> 0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
>>>>> msg_controllen = 48, msg_flags = 0}
>>>>> #2 mp_handle (arg=<optimized out>) at
>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
>>>>> msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
>>>>> len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
>>>>> '\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
>>>>> sa = {sun_family = 55104,
>>>>> sun_path =
>>>>> "\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
>>>>>
>>>>>
>>>>> '\000' <repeats 34 times>, "\200\037\000\000\377\377"}
>>>>> #3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
>>>>> pthread_create.c:308
>>>>> __res = <optimized out>
>>>>> pd = 0x7f63e1f9f700
>>>>> now = <optimized out>
>>>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
>>>>> 1212869169857371576, 0, 8392704, 0, 140066969745152,
>>>>> -1291625004586295880, -1291619793368703560}, mask_was_saved = 0}},
>>>>> priv
>>>>>
>>>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>>>> not_first_call = <optimized out>
>>>>> pagesize_m1 = <optimized out>
>>>>> sp = <optimized out>
>>>>> freesize = <optimized out>
>>>>> #4 0x00007f63e22b4b3d in clone () at
>>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>>>> No locals.
>>>>>
>>>>> I don't have more info for now.
>>>>>
>>>>
>>>> Thanks for the feedback on this issue. I don't see obvious reason
>>>> for this deadlock yet.
>>>>
>>>> I'll investigate it asap (not tomorrow, but wednesday). In the worst
>>>> case, we can revert the series if I cannot find the root cause rapidly.
>>>
>>> I might think that the suggestion from Stephen of destroying the
>>> barrier can help this issue. I'll try to reproduce it and test it
>>> before sending a patch to fix it.
>>
>> In case you don't reproduce, feel free to send me the patch to test it.
>
> Below patch can fix another strange sigsegv issue in my VM. Please check
> if it works for you. I doubt it's use-after-free problem which could
> lead to different issues in different env. Please have a try.
>
>
> diff --git a/lib/librte_eal/common/eal_common_thread.c
> b/lib/librte_eal/common/eal_common_thread.c
> index de69452..d91b67d 100644
> --- a/lib/librte_eal/common/eal_common_thread.c
> +++ b/lib/librte_eal/common/eal_common_thread.c
> @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
> *name,
> goto fail;
>
> pthread_barrier_wait(¶ms->configured);
> + pthread_barrier_destroy(¶ms->configured);
> free(params);
Should destroy also be called in fail case?
>
> return 0;
>
> Thanks,
> Jianfeng
>
>>
>> Thanks,
>> Maxime
>>
>>> Thanks,
>>> Jianfeng
>>>
>>>>
>>>> Olivier
>>>>
>>>
>
>
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 9:30 ` Burakov, Anatoly
@ 2018-05-02 9:38 ` Tan, Jianfeng
2018-05-02 9:57 ` Olivier Matz
1 sibling, 0 replies; 41+ messages in thread
From: Tan, Jianfeng @ 2018-05-02 9:38 UTC (permalink / raw)
To: Burakov, Anatoly, Maxime Coquelin, Olivier Matz, dev; +Cc: Thomas Monjalon
On 5/2/2018 5:30 PM, Burakov, Anatoly wrote:
> On 02-May-18 9:50 AM, Tan, Jianfeng wrote:
>> Hi Maxime,
>>
>>
>> On 5/2/2018 4:34 PM, Maxime Coquelin wrote:
>>> Hi Jiangfeng,
>>>
>>> On 05/02/2018 10:19 AM, Tan, Jianfeng wrote:
>>>>
>>>>
>>>> On 5/1/2018 2:46 AM, Olivier Matz wrote:
>>>>> Hi Maxime,
>>>>>
>>>>> Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin
>>>>> <maxime.coquelin@redhat.com> a écrit :
>>>>>> Hi Olivier,
>>>>>>
>>>>>> On 04/24/2018 04:46 PM, Olivier Matz wrote:
>>>>>>> Some parts of dpdk use their own management threads. Most of the
>>>>>> time,
>>>>>>> the affinity of the thread is not properly set: it should not be
>>>>>> scheduled
>>>>>>> on the dataplane cores, because interrupting them can cause packet
>>>>>> losses.
>>>>>>> This patchset introduces a new wrapper for thread creation that
>>>>>>> does
>>>>>>> the job automatically, avoiding code duplication.
>>>>>>>
>>>>>>> v3:
>>>>>>> * new patch: use this API in examples when relevant.
>>>>>>> * replace pthread_kill by pthread_cancel. Note that pthread_join()
>>>>>>> is still needed.
>>>>>>> * rebase: vfio and pdump do not have control pthreads anymore, and
>>>>>> eal
>>>>>>> has 2 new pthreads
>>>>>>> * remove all calls to snprintf/strlcpy that truncate the thread
>>>>>>> name:
>>>>>>> all strings lengths are already < 16.
>>>>>>>
>>>>>>> v2:
>>>>>>> * set affinity to master core if no core is off, as suggested by
>>>>>>> Anatoly
>>>>>>>
>>>>>>> Olivier Matz (5):
>>>>>>> eal: use sizeof to avoid a double use of a define
>>>>>>> eal: new function to create control threads
>>>>>>> eal: set name when creating a control thread
>>>>>>> eal: set affinity for control threads
>>>>>>> examples: use new API to create control threads
>>>>>>>
>>>>>>> drivers/net/kni/Makefile | 1 +
>>>>>>> drivers/net/kni/rte_eth_kni.c | 3 +-
>>>>>>> examples/tep_termination/main.c | 16 +++----
>>>>>>> examples/vhost/main.c | 19 +++-----
>>>>>>> lib/librte_eal/bsdapp/eal/eal.c | 4 +-
>>>>>>> lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
>>>>>>> lib/librte_eal/common/eal_common_proc.c | 15 ++----
>>>>>>> lib/librte_eal/common/eal_common_thread.c | 72
>>>>>> ++++++++++++++++++++++++++++
>>>>>>> lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
>>>>>>> lib/librte_eal/linuxapp/eal/eal.c | 4 +-
>>>>>>> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
>>>>>>> lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
>>>>>>> lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
>>>>>>> lib/librte_eal/rte_eal_version.map | 1 +
>>>>>>> lib/librte_vhost/socket.c | 25 ++--------
>>>>>>> 15 files changed, 135 insertions(+), 84 deletions(-)
>>>>>>>
>>>>>> I face a deadlock issue with your series, that Jianfeng patch
>>>>>> does not
>>>>>> resolve ("eal: fix threads block on barrier"). Reverting the
>>>>>> series and
>>>>>> Jianfeng patch makes the issue to disappear.
>>>>>>
>>>>>> I face the problem in a VM (not seen on the host):
>>>>>> # ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
>>>>>> auto
>>>>>> --file-prefix pg -- --portmask=3 --forward-mode=macswap
>>>>>> --port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
>>>>>> --txd=256 --nb-cores=2 --auto-start
>>>>>> EAL: Detected 3 lcore(s)
>>>>>> EAL: Detected 1 NUMA nodes
>>>>>> EAL: Auto-detected process type: PRIMARY
>>>>>> EAL: Multi-process socket /var/run/.pg_unix
>>>>>>
>>>>>>
>>>>>> Then it is stuck. Attaching with GDB, I get below backtrace
>>>>>> information:
>>>>>>
>>>>>> (gdb) info threads
>>>>>> Id Target Id Frame
>>>>>> 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
>>>>>> 0x00007f63e2591bfd in recvmsg () at
>>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>>> 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
>>>>>> pthread_barrier_wait () at
>>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>>> * 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd"
>>>>>> pthread_barrier_wait
>>>>>> () at
>>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>>> (gdb) bt full
>>>>>> #0 pthread_barrier_wait () at
>>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>>> No locals.
>>>>>> #1 0x0000000000520c54 in rte_ctrl_thread_create
>>>>>> (thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
>>>>>> "rte_mp_async", attr=attr@entry=0x0,
>>>>>> start_routine=start_routine@entry=0x521030 <async_reply_handle>,
>>>>>> arg=arg@entry=0x0)
>>>>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
>>>>>> params = 0x17b1e40
>>>>>> lcore_id = <optimized out>
>>>>>> cpuset = {__bits = {1, 0 <repeats 15 times>}}
>>>>>> cpu_found = <optimized out>
>>>>>> ret = 0
>>>>>> #2 0x00000000005220b6 in rte_mp_channel_init () at
>>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
>>>>>> path = "/var/run\000.pg_unix_*", '\000' <repeats 1301
>>>>>> times>...
>>>>>> dir_fd = 4
>>>>>> mp_handle_tid = 140066969745152
>>>>>> async_reply_handle_tid = 140066961352448
>>>>>> #3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
>>>>>> argv=argv@entry=0x7ffe5c896378) at
>>>>>> /root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
>>>>>> i = <optimized out>
>>>>>> fctret = 11
>>>>>> ret = <optimized out>
>>>>>> thread_id = 140066989861888
>>>>>> run_once = {cnt = 1}
>>>>>> logid = 0x17b1e00 "testpmd"
>>>>>> cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
>>>>>> "\020", '\000' <repeats 116 times>...
>>>>>> thread_name =
>>>>>> "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
>>>>>> __func__ = "rte_eal_init"
>>>>>> #4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
>>>>>> /root/src/dpdk/app/test-pmd/testpmd.c:2597
>>>>>> diag = <optimized out>
>>>>>> port_id = <optimized out>
>>>>>> ret = <optimized out>
>>>>>> __func__ = "main"
>>>>>> (gdb) thread 2
>>>>>> [Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
>>>>>> #0 pthread_barrier_wait () at
>>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>>> 71 cmpl %edx, (%rdi)
>>>>>> (gdb) bt full
>>>>>> #0 pthread_barrier_wait () at
>>>>>> ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
>>>>>> No locals.
>>>>>> #1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
>>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
>>>>>> params = <optimized out>
>>>>>> start_routine = 0x521030 <async_reply_handle>
>>>>>> routine_arg = 0x0
>>>>>> #2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
>>>>>> pthread_create.c:308
>>>>>> __res = <optimized out>
>>>>>> pd = 0x7f63e179e700
>>>>>> now = <optimized out>
>>>>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf =
>>>>>> {140066961352448,
>>>>>> 1212869169857371576, 0, 8392704, 0, 140066961352448,
>>>>>> -1291626103561052744, -1291619793368703560}, mask_was_saved =
>>>>>> 0}}, priv
>>>>>>
>>>>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>>>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>>>>> not_first_call = <optimized out>
>>>>>> pagesize_m1 = <optimized out>
>>>>>> sp = <optimized out>
>>>>>> freesize = <optimized out>
>>>>>> #3 0x00007f63e22b4b3d in clone () at
>>>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>>>>> No locals.
>>>>>> (gdb) thread 3
>>>>>> [Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
>>>>>> #0 0x00007f63e2591bfd in recvmsg () at
>>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>>> 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
>>>>>> (gdb) bt full
>>>>>> #0 0x00007f63e2591bfd in recvmsg () at
>>>>>> ../sysdeps/unix/syscall-template.S:81
>>>>>> No locals.
>>>>>> #1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0,
>>>>>> m=0x7f63e1f9d5a0)
>>>>>>
>>>>>> at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
>>>>>> msglen = <optimized out>
>>>>>> control =
>>>>>> "\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005",
>>>>>> '\000'
>>>>>> <repeats 23 times>, "\360\371\033\342c\177\000"
>>>>>> cmsg = <optimized out>
>>>>>> iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
>>>>>> msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110,
>>>>>> msg_iov =
>>>>>> 0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
>>>>>> msg_controllen = 48, msg_flags = 0}
>>>>>> #2 mp_handle (arg=<optimized out>) at
>>>>>> /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
>>>>>> msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
>>>>>> len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>,
>>>>>> "\002",
>>>>>> '\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
>>>>>> sa = {sun_family = 55104,
>>>>>> sun_path =
>>>>>> "\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
>>>>>>
>>>>>>
>>>>>> '\000' <repeats 34 times>, "\200\037\000\000\377\377"}
>>>>>> #3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
>>>>>> pthread_create.c:308
>>>>>> __res = <optimized out>
>>>>>> pd = 0x7f63e1f9f700
>>>>>> now = <optimized out>
>>>>>> unwind_buf = {cancel_jmp_buf = {{jmp_buf =
>>>>>> {140066969745152,
>>>>>> 1212869169857371576, 0, 8392704, 0, 140066969745152,
>>>>>> -1291625004586295880, -1291619793368703560}, mask_was_saved =
>>>>>> 0}}, priv
>>>>>>
>>>>>> = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
>>>>>> prev = 0x0, cleanup = 0x0, canceltype = 0}}}
>>>>>> not_first_call = <optimized out>
>>>>>> pagesize_m1 = <optimized out>
>>>>>> sp = <optimized out>
>>>>>> freesize = <optimized out>
>>>>>> #4 0x00007f63e22b4b3d in clone () at
>>>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
>>>>>> No locals.
>>>>>>
>>>>>> I don't have more info for now.
>>>>>>
>>>>>
>>>>> Thanks for the feedback on this issue. I don't see obvious reason
>>>>> for this deadlock yet.
>>>>>
>>>>> I'll investigate it asap (not tomorrow, but wednesday). In the
>>>>> worst case, we can revert the series if I cannot find the root
>>>>> cause rapidly.
>>>>
>>>> I might think that the suggestion from Stephen of destroying the
>>>> barrier can help this issue. I'll try to reproduce it and test it
>>>> before sending a patch to fix it.
>>>
>>> In case you don't reproduce, feel free to send me the patch to test it.
>>
>> Below patch can fix another strange sigsegv issue in my VM. Please
>> check if it works for you. I doubt it's use-after-free problem which
>> could lead to different issues in different env. Please have a try.
>>
>>
>> diff --git a/lib/librte_eal/common/eal_common_thread.c
>> b/lib/librte_eal/common/eal_common_thread.c
>> index de69452..d91b67d 100644
>> --- a/lib/librte_eal/common/eal_common_thread.c
>> +++ b/lib/librte_eal/common/eal_common_thread.c
>> @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const
>> char *name,
>> goto fail;
>>
>> pthread_barrier_wait(¶ms->configured);
>> + pthread_barrier_destroy(¶ms->configured);
>> free(params);
>
> Should destroy also be called in fail case?
After checking https://linux.die.net/man/3/pthread_barrier_wait, it
seems that there is only one possible fail case, which will not happen here.
EINVAL The value specified by barrier does not refer to an
initialized barrier object.
Do we need to take care of that?
Thanks,
Jianfeng
>
>>
>> return 0;
>>
>> Thanks,
>> Jianfeng
>>
>>>
>>> Thanks,
>>> Maxime
>>>
>>>> Thanks,
>>>> Jianfeng
>>>>
>>>>>
>>>>> Olivier
>>>>>
>>>>
>>
>>
>
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 9:30 ` Burakov, Anatoly
2018-05-02 9:38 ` Tan, Jianfeng
@ 2018-05-02 9:57 ` Olivier Matz
2018-05-02 10:01 ` Tan, Jianfeng
1 sibling, 1 reply; 41+ messages in thread
From: Olivier Matz @ 2018-05-02 9:57 UTC (permalink / raw)
To: Burakov, Anatoly; +Cc: Tan, Jianfeng, Maxime Coquelin, dev, Thomas Monjalon
On Wed, May 02, 2018 at 10:30:56AM +0100, Burakov, Anatoly wrote:
> On 02-May-18 9:50 AM, Tan, Jianfeng wrote:
> > Hi Maxime,
> >
> >
> > On 5/2/2018 4:34 PM, Maxime Coquelin wrote:
> > > Hi Jiangfeng,
> > >
> > > On 05/02/2018 10:19 AM, Tan, Jianfeng wrote:
> > > >
> > > >
> > > > On 5/1/2018 2:46 AM, Olivier Matz wrote:
> > > > > Hi Maxime,
> > > > >
> > > > > Le 30 avril 2018 17:45:52 GMT+02:00, Maxime Coquelin
> > > > > <maxime.coquelin@redhat.com> a écrit :
> > > > > > Hi Olivier,
> > > > > >
> > > > > > On 04/24/2018 04:46 PM, Olivier Matz wrote:
> > > > > > > Some parts of dpdk use their own management threads. Most of the
> > > > > > time,
> > > > > > > the affinity of the thread is not properly set: it should not be
> > > > > > scheduled
> > > > > > > on the dataplane cores, because interrupting them can cause packet
> > > > > > losses.
> > > > > > > This patchset introduces a new wrapper for thread creation that does
> > > > > > > the job automatically, avoiding code duplication.
> > > > > > >
> > > > > > > v3:
> > > > > > > * new patch: use this API in examples when relevant.
> > > > > > > * replace pthread_kill by pthread_cancel. Note that pthread_join()
> > > > > > > is still needed.
> > > > > > > * rebase: vfio and pdump do not have control pthreads anymore, and
> > > > > > eal
> > > > > > > has 2 new pthreads
> > > > > > > * remove all calls to snprintf/strlcpy that truncate the thread name:
> > > > > > > all strings lengths are already < 16.
> > > > > > >
> > > > > > > v2:
> > > > > > > * set affinity to master core if no core is off, as suggested by
> > > > > > > Anatoly
> > > > > > >
> > > > > > > Olivier Matz (5):
> > > > > > > eal: use sizeof to avoid a double use of a define
> > > > > > > eal: new function to create control threads
> > > > > > > eal: set name when creating a control thread
> > > > > > > eal: set affinity for control threads
> > > > > > > examples: use new API to create control threads
> > > > > > >
> > > > > > > drivers/net/kni/Makefile | 1 +
> > > > > > > drivers/net/kni/rte_eth_kni.c | 3 +-
> > > > > > > examples/tep_termination/main.c | 16 +++----
> > > > > > > examples/vhost/main.c | 19 +++-----
> > > > > > > lib/librte_eal/bsdapp/eal/eal.c | 4 +-
> > > > > > > lib/librte_eal/bsdapp/eal/eal_thread.c | 2 +-
> > > > > > > lib/librte_eal/common/eal_common_proc.c | 15 ++----
> > > > > > > lib/librte_eal/common/eal_common_thread.c | 72
> > > > > > ++++++++++++++++++++++++++++
> > > > > > > lib/librte_eal/common/include/rte_lcore.h | 26 ++++++++++
> > > > > > > lib/librte_eal/linuxapp/eal/eal.c | 4 +-
> > > > > > > lib/librte_eal/linuxapp/eal/eal_interrupts.c | 17 ++-----
> > > > > > > lib/librte_eal/linuxapp/eal/eal_thread.c | 2 +-
> > > > > > > lib/librte_eal/linuxapp/eal/eal_timer.c | 12 +----
> > > > > > > lib/librte_eal/rte_eal_version.map | 1 +
> > > > > > > lib/librte_vhost/socket.c | 25 ++--------
> > > > > > > 15 files changed, 135 insertions(+), 84 deletions(-)
> > > > > > >
> > > > > > I face a deadlock issue with your series, that Jianfeng patch does not
> > > > > > resolve ("eal: fix threads block on barrier"). Reverting
> > > > > > the series and
> > > > > > Jianfeng patch makes the issue to disappear.
> > > > > >
> > > > > > I face the problem in a VM (not seen on the host):
> > > > > > # ./install/bin/testpmd -l 0,1,2 --socket-mem 1024 -n 4 --proc-type
> > > > > > auto
> > > > > > --file-prefix pg -- --portmask=3 --forward-mode=macswap
> > > > > > --port-topology=chained --disable-rss -i --rxq=1 --txq=1 --rxd=256
> > > > > > --txd=256 --nb-cores=2 --auto-start
> > > > > > EAL: Detected 3 lcore(s)
> > > > > > EAL: Detected 1 NUMA nodes
> > > > > > EAL: Auto-detected process type: PRIMARY
> > > > > > EAL: Multi-process socket /var/run/.pg_unix
> > > > > >
> > > > > >
> > > > > > Then it is stuck. Attaching with GDB, I get below backtrace
> > > > > > information:
> > > > > >
> > > > > > (gdb) info threads
> > > > > > Id Target Id Frame
> > > > > > 3 Thread 0x7f63e1f9f700 (LWP 8808) "rte_mp_handle"
> > > > > > 0x00007f63e2591bfd in recvmsg () at
> > > > > > ../sysdeps/unix/syscall-template.S:81
> > > > > > 2 Thread 0x7f63e179e700 (LWP 8809) "rte_mp_async"
> > > > > > pthread_barrier_wait () at
> > > > > > ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > * 1 Thread 0x7f63e32cec00 (LWP 8807) "testpmd" pthread_barrier_wait
> > > > > > () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > (gdb) bt full
> > > > > > #0 pthread_barrier_wait () at
> > > > > > ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > No locals.
> > > > > > #1 0x0000000000520c54 in rte_ctrl_thread_create
> > > > > > (thread=thread@entry=0x7ffe5c895020, name=name@entry=0x869d86
> > > > > > "rte_mp_async", attr=attr@entry=0x0,
> > > > > > start_routine=start_routine@entry=0x521030 <async_reply_handle>,
> > > > > > arg=arg@entry=0x0)
> > > > > > at /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:207
> > > > > > params = 0x17b1e40
> > > > > > lcore_id = <optimized out>
> > > > > > cpuset = {__bits = {1, 0 <repeats 15 times>}}
> > > > > > cpu_found = <optimized out>
> > > > > > ret = 0
> > > > > > #2 0x00000000005220b6 in rte_mp_channel_init () at
> > > > > > /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:674
> > > > > > path = "/var/run\000.pg_unix_*", '\000' <repeats
> > > > > > 1301 times>...
> > > > > > dir_fd = 4
> > > > > > mp_handle_tid = 140066969745152
> > > > > > async_reply_handle_tid = 140066961352448
> > > > > > #3 0x000000000050c227 in rte_eal_init (argc=argc@entry=23,
> > > > > > argv=argv@entry=0x7ffe5c896378) at
> > > > > > /root/src/dpdk/lib/librte_eal/linuxapp/eal/eal.c:775
> > > > > > i = <optimized out>
> > > > > > fctret = 11
> > > > > > ret = <optimized out>
> > > > > > thread_id = 140066989861888
> > > > > > run_once = {cnt = 1}
> > > > > > logid = 0x17b1e00 "testpmd"
> > > > > > cpuset = "T}\211\\\376\177", '\000' <repeats 117 times>,
> > > > > > "\020", '\000' <repeats 116 times>...
> > > > > > thread_name =
> > > > > > "X}\211\\\376\177\000\000\226\301\036\342c\177\000"
> > > > > > __func__ = "rte_eal_init"
> > > > > > #4 0x0000000000473214 in main (argc=23, argv=0x7ffe5c896378) at
> > > > > > /root/src/dpdk/app/test-pmd/testpmd.c:2597
> > > > > > diag = <optimized out>
> > > > > > port_id = <optimized out>
> > > > > > ret = <optimized out>
> > > > > > __func__ = "main"
> > > > > > (gdb) thread 2
> > > > > > [Switching to thread 2 (Thread 0x7f63e179e700 (LWP 8809))]
> > > > > > #0 pthread_barrier_wait () at
> > > > > > ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > 71 cmpl %edx, (%rdi)
> > > > > > (gdb) bt full
> > > > > > #0 pthread_barrier_wait () at
> > > > > > ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
> > > > > > No locals.
> > > > > > #1 0x0000000000520777 in rte_thread_init (arg=<optimized out>) at
> > > > > > /root/src/dpdk/lib/librte_eal/common/eal_common_thread.c:156
> > > > > > params = <optimized out>
> > > > > > start_routine = 0x521030 <async_reply_handle>
> > > > > > routine_arg = 0x0
> > > > > > #2 0x00007f63e258add5 in start_thread (arg=0x7f63e179e700) at
> > > > > > pthread_create.c:308
> > > > > > __res = <optimized out>
> > > > > > pd = 0x7f63e179e700
> > > > > > now = <optimized out>
> > > > > > unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066961352448,
> > > > > > 1212869169857371576, 0, 8392704, 0, 140066961352448,
> > > > > > -1291626103561052744, -1291619793368703560},
> > > > > > mask_was_saved = 0}}, priv
> > > > > >
> > > > > > = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
> > > > > > prev = 0x0, cleanup = 0x0, canceltype = 0}}}
> > > > > > not_first_call = <optimized out>
> > > > > > pagesize_m1 = <optimized out>
> > > > > > sp = <optimized out>
> > > > > > freesize = <optimized out>
> > > > > > #3 0x00007f63e22b4b3d in clone () at
> > > > > > ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
> > > > > > No locals.
> > > > > > (gdb) thread 3
> > > > > > [Switching to thread 3 (Thread 0x7f63e1f9f700 (LWP 8808))]
> > > > > > #0 0x00007f63e2591bfd in recvmsg () at
> > > > > > ../sysdeps/unix/syscall-template.S:81
> > > > > > 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
> > > > > > (gdb) bt full
> > > > > > #0 0x00007f63e2591bfd in recvmsg () at
> > > > > > ../sysdeps/unix/syscall-template.S:81
> > > > > > No locals.
> > > > > > #1 0x000000000052194e in read_msg (s=0x7f63e1f9d3b0,
> > > > > > m=0x7f63e1f9d5a0)
> > > > > >
> > > > > > at /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:258
> > > > > > msglen = <optimized out>
> > > > > > control =
> > > > > > "\000\000\000\000\000\000\000\000\336~\f\343c\177\000\000\005", '\000'
> > > > > > <repeats 23 times>, "\360\371\033\342c\177\000"
> > > > > > cmsg = <optimized out>
> > > > > > iov = {iov_base = 0x7f63e1f9d5a0, iov_len = 332}
> > > > > > msgh = {msg_name = 0x7f63e1f9d3b0, msg_namelen = 110, msg_iov =
> > > > > > 0x7f63e1f9d370, msg_iovlen = 1, msg_control = 0x7f63e1f9d380,
> > > > > > msg_controllen = 48, msg_flags = 0}
> > > > > > #2 mp_handle (arg=<optimized out>) at
> > > > > > /root/src/dpdk/lib/librte_eal/common/eal_common_proc.c:346
> > > > > > msg = {type = 0, msg = {name = '\000' <repeats 63 times>,
> > > > > > len_param = 0, num_fds = 0, param = '\000' <repeats 20 times>, "\002",
> > > > > > '\000' <repeats 234 times>, fds = {0, 0, 0, 0, 0, 0, 0, 0}}}
> > > > > > sa = {sun_family = 55104,
> > > > > > sun_path =
> > > > > > "\371\341c\177\000\000\352\372\f\343c\177\000\000\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\367\371\341c\177\000\000\030\000\000\000\000\000\000\000p\327\371\341c\177\000\000\000\367\371\341c\177\000\000\000\367\371\341c\177",
> > > > > >
> > > > > >
> > > > > > '\000' <repeats 34 times>, "\200\037\000\000\377\377"}
> > > > > > #3 0x00007f63e258add5 in start_thread (arg=0x7f63e1f9f700) at
> > > > > > pthread_create.c:308
> > > > > > __res = <optimized out>
> > > > > > pd = 0x7f63e1f9f700
> > > > > > now = <optimized out>
> > > > > > unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140066969745152,
> > > > > > 1212869169857371576, 0, 8392704, 0, 140066969745152,
> > > > > > -1291625004586295880, -1291619793368703560},
> > > > > > mask_was_saved = 0}}, priv
> > > > > >
> > > > > > = {pad = {0x0, 0x0, 0x0, 0x0}, data = {
> > > > > > prev = 0x0, cleanup = 0x0, canceltype = 0}}}
> > > > > > not_first_call = <optimized out>
> > > > > > pagesize_m1 = <optimized out>
> > > > > > sp = <optimized out>
> > > > > > freesize = <optimized out>
> > > > > > #4 0x00007f63e22b4b3d in clone () at
> > > > > > ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
> > > > > > No locals.
> > > > > >
> > > > > > I don't have more info for now.
> > > > > >
> > > > >
> > > > > Thanks for the feedback on this issue. I don't see obvious
> > > > > reason for this deadlock yet.
> > > > >
> > > > > I'll investigate it asap (not tomorrow, but wednesday). In
> > > > > the worst case, we can revert the series if I cannot find
> > > > > the root cause rapidly.
> > > >
> > > > I might think that the suggestion from Stephen of destroying the
> > > > barrier can help this issue. I'll try to reproduce it and test
> > > > it before sending a patch to fix it.
> > >
> > > In case you don't reproduce, feel free to send me the patch to test it.
> >
> > Below patch can fix another strange sigsegv issue in my VM. Please check
> > if it works for you. I doubt it's use-after-free problem which could
> > lead to different issues in different env. Please have a try.
> >
> >
> > diff --git a/lib/librte_eal/common/eal_common_thread.c
> > b/lib/librte_eal/common/eal_common_thread.c
> > index de69452..d91b67d 100644
> > --- a/lib/librte_eal/common/eal_common_thread.c
> > +++ b/lib/librte_eal/common/eal_common_thread.c
> > @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
> > *name,
> > goto fail;
> >
> > pthread_barrier_wait(¶ms->configured);
> > + pthread_barrier_destroy(¶ms->configured);
> > free(params);
>
> Should destroy also be called in fail case?
Yes, and also pthread_barrier_wait().
I did a quick test simulating a failure of rte_thread_setname(), and
without the wait, the barrier in the child thread blocks forever.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 9:57 ` Olivier Matz
@ 2018-05-02 10:01 ` Tan, Jianfeng
2018-05-02 10:08 ` Olivier Matz
0 siblings, 1 reply; 41+ messages in thread
From: Tan, Jianfeng @ 2018-05-02 10:01 UTC (permalink / raw)
To: Olivier Matz, Burakov, Anatoly; +Cc: Maxime Coquelin, dev, Thomas Monjalon
Hi Olivier and Anatoly,
[...]
>>> Below patch can fix another strange sigsegv issue in my VM. Please check
>>> if it works for you. I doubt it's use-after-free problem which could
>>> lead to different issues in different env. Please have a try.
>>>
>>>
>>> diff --git a/lib/librte_eal/common/eal_common_thread.c
>>> b/lib/librte_eal/common/eal_common_thread.c
>>> index de69452..d91b67d 100644
>>> --- a/lib/librte_eal/common/eal_common_thread.c
>>> +++ b/lib/librte_eal/common/eal_common_thread.c
>>> @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
>>> *name,
>>> goto fail;
>>>
>>> pthread_barrier_wait(¶ms->configured);
>>> + pthread_barrier_destroy(¶ms->configured);
>>> free(params);
>> Should destroy also be called in fail case?
> Yes, and also pthread_barrier_wait().
>
> I did a quick test simulating a failure of rte_thread_setname(), and
> without the wait, the barrier in the child thread blocks forever.
>
v1 has been sent without seeing this email. You are right, I
misunderstood it. Will send a v2 based on this result.
Thanks,
Jianfeng
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 10:01 ` Tan, Jianfeng
@ 2018-05-02 10:08 ` Olivier Matz
2018-05-02 10:16 ` Tan, Jianfeng
0 siblings, 1 reply; 41+ messages in thread
From: Olivier Matz @ 2018-05-02 10:08 UTC (permalink / raw)
To: Tan, Jianfeng; +Cc: Burakov, Anatoly, Maxime Coquelin, dev, Thomas Monjalon
On Wed, May 02, 2018 at 06:01:20PM +0800, Tan, Jianfeng wrote:
> Hi Olivier and Anatoly,
>
> [...]
>
> > > > Below patch can fix another strange sigsegv issue in my VM. Please check
> > > > if it works for you. I doubt it's use-after-free problem which could
> > > > lead to different issues in different env. Please have a try.
> > > >
> > > >
> > > > diff --git a/lib/librte_eal/common/eal_common_thread.c
> > > > b/lib/librte_eal/common/eal_common_thread.c
> > > > index de69452..d91b67d 100644
> > > > --- a/lib/librte_eal/common/eal_common_thread.c
> > > > +++ b/lib/librte_eal/common/eal_common_thread.c
> > > > @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
> > > > *name,
> > > > goto fail;
> > > >
> > > > pthread_barrier_wait(¶ms->configured);
> > > > + pthread_barrier_destroy(¶ms->configured);
> > > > free(params);
> > > Should destroy also be called in fail case?
> > Yes, and also pthread_barrier_wait().
> >
> > I did a quick test simulating a failure of rte_thread_setname(), and
> > without the wait, the barrier in the child thread blocks forever.
> >
>
> v1 has been sent without seeing this email. You are right, I misunderstood
> it. Will send a v2 based on this result.
>
> Thanks,
> Jianfeng
Thanks.
Not sure it should be in the same patch, however there is also
a memory leak here:
ret = pthread_create(thread, attr, rte_thread_init, (void *)params);
if (ret != 0)
return ret;
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [dpdk-dev] pthread_barrier_deadlock in -rc1
2018-05-02 10:08 ` Olivier Matz
@ 2018-05-02 10:16 ` Tan, Jianfeng
0 siblings, 0 replies; 41+ messages in thread
From: Tan, Jianfeng @ 2018-05-02 10:16 UTC (permalink / raw)
To: Olivier Matz; +Cc: Burakov, Anatoly, Maxime Coquelin, dev, Thomas Monjalon
On 5/2/2018 6:08 PM, Olivier Matz wrote:
> On Wed, May 02, 2018 at 06:01:20PM +0800, Tan, Jianfeng wrote:
>> Hi Olivier and Anatoly,
>>
>> [...]
>>
>>>>> Below patch can fix another strange sigsegv issue in my VM. Please check
>>>>> if it works for you. I doubt it's use-after-free problem which could
>>>>> lead to different issues in different env. Please have a try.
>>>>>
>>>>>
>>>>> diff --git a/lib/librte_eal/common/eal_common_thread.c
>>>>> b/lib/librte_eal/common/eal_common_thread.c
>>>>> index de69452..d91b67d 100644
>>>>> --- a/lib/librte_eal/common/eal_common_thread.c
>>>>> +++ b/lib/librte_eal/common/eal_common_thread.c
>>>>> @@ -205,6 +205,7 @@ rte_ctrl_thread_create(pthread_t *thread, const char
>>>>> *name,
>>>>> goto fail;
>>>>>
>>>>> pthread_barrier_wait(¶ms->configured);
>>>>> + pthread_barrier_destroy(¶ms->configured);
>>>>> free(params);
>>>> Should destroy also be called in fail case?
>>> Yes, and also pthread_barrier_wait().
>>>
>>> I did a quick test simulating a failure of rte_thread_setname(), and
>>> without the wait, the barrier in the child thread blocks forever.
>>>
>> v1 has been sent without seeing this email. You are right, I misunderstood
>> it. Will send a v2 based on this result.
>>
>> Thanks,
>> Jianfeng
> Thanks.
>
> Not sure it should be in the same patch, however there is also
> a memory leak here:
>
> ret = pthread_create(thread, attr, rte_thread_init, (void *)params);
> if (ret != 0)
> return ret;
My bad, will fix it in a separate patch.
Thanks,
Jianfeng
^ permalink raw reply [flat|nested] 41+ messages in thread