The issue indeed lies with the ixgbe driver. After making the following modifications, dpdk-dumpcap is now functioning properly. diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index d6cf00317e77b64f9822c155115f388ae62241eb..99b26f3c758b3c7ced5d59c6b27f305efe6cc33c 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -4301,48 +4301,50 @@ ixgbe_dev_link_update_share(struct rte_eth_dev *dev, wait = 1; #endif - if (vf) - diag = ixgbevf_check_link(hw, &link_speed, &link_up, wait); - else - diag = ixgbe_check_link(hw, &link_speed, &link_up, wait); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + if (vf) + diag = ixgbevf_check_link(hw, &link_speed, &link_up, wait); + else + diag = ixgbe_check_link(hw, &link_speed, &link_up, wait); - if (diag != 0) { - link.link_speed = RTE_ETH_SPEED_NUM_100M; - link.link_duplex = RTE_ETH_LINK_FULL_DUPLEX; - return rte_eth_linkstatus_set(dev, &link); - } + if (diag != 0) { + link.link_speed = RTE_ETH_SPEED_NUM_100M; + link.link_duplex = RTE_ETH_LINK_FULL_DUPLEX; + return rte_eth_linkstatus_set(dev, &link); + } + + if (ixgbe_get_media_type(hw) == ixgbe_media_type_fiber && + !ad->sdp3_no_tx_disable) { + esdp_reg = IXGBE_READ_REG(hw, IXGBE_ESDP); + if ((esdp_reg & IXGBE_ESDP_SDP3)) + link_up = 0; + } - if (ixgbe_get_media_type(hw) == ixgbe_media_type_fiber && - !ad->sdp3_no_tx_disable) { - esdp_reg = IXGBE_READ_REG(hw, IXGBE_ESDP); - if ((esdp_reg & IXGBE_ESDP_SDP3)) - link_up = 0; - } - - if (link_up == 0) { - if (ixgbe_get_media_type(hw) == ixgbe_media_type_fiber) { - ixgbe_dev_wait_setup_link_complete(dev, 0); - /* NOTE: review for potential ordering optimization */ - if (!__atomic_test_and_set(&ad->link_thread_running, __ATOMIC_SEQ_CST)) { - /* To avoid race condition between threads, set - * the IXGBE_FLAG_NEED_LINK_CONFIG flag only - * when there is no link thread running. 
- */ - intr->flags |= IXGBE_FLAG_NEED_LINK_CONFIG; - if (rte_thread_create_internal_control(&ad->link_thread_tid, - "ixgbe-link", - ixgbe_dev_setup_link_thread_handler, dev) < 0) { + if (link_up == 0) { + if (ixgbe_get_media_type(hw) == ixgbe_media_type_fiber) { + ixgbe_dev_wait_setup_link_complete(dev, 0); + /* NOTE: review for potential ordering optimization */ + if (!__atomic_test_and_set(&ad->link_thread_running, __ATOMIC_SEQ_CST)) { + /* To avoid race condition between threads, set + * the IXGBE_FLAG_NEED_LINK_CONFIG flag only + * when there is no link thread running. + */ + intr->flags |= IXGBE_FLAG_NEED_LINK_CONFIG; + if (rte_thread_create_internal_control(&ad->link_thread_tid, + "ixgbe-link", + ixgbe_dev_setup_link_thread_handler, dev) < 0) { + PMD_DRV_LOG(ERR, + "Create link thread failed!"); + /* NOTE: review for potential ordering optimization */ + __atomic_clear(&ad->link_thread_running, __ATOMIC_SEQ_CST); + } + } else { PMD_DRV_LOG(ERR, - "Create link thread failed!"); - /* NOTE: review for potential ordering optimization */ - __atomic_clear(&ad->link_thread_running, __ATOMIC_SEQ_CST); + "Other link thread is running now!"); } - } else { - PMD_DRV_LOG(ERR, - "Other link thread is running now!"); } + return rte_eth_linkstatus_set(dev, &link); } - return rte_eth_linkstatus_set(dev, &link); } link.link_status = RTE_ETH_LINK_UP; junwang01@cestc.cn From: junwang01@cestc.cn Date: 2024-03-14 17:22 To: Stephen Hemminger CC: dev Subject: Re: Re: dumpcap coredump for 82599 NIC Yes, I think you are right. After adding some debug information, I can confirm that it's probably an initialization issue with the ixgbe driver. Secondary processes should initialize some callback functions, but they seem to be missing. I made some minor modifications by moving the ixgbe_init_shared_code(hw) position before the secondary processes. While this brought about some changes, there still occurred a core dump. 
I suspect there might be other issues or that such modification might not be appropriate. [root@xc03-compute3 /]# /dpdk/app/dpdk-dumpcap -i 0000:18:00.0 mlx5_net: Cannot attach mlx5 shared data mlx5_net: Unable to init PMD global data: No such file or directory mlx5_common: Failed to load driver mlx5_eth EAL: Requested device 0000:3b:00.0 cannot be used mlx5_net: Cannot attach mlx5 shared data mlx5_net: Unable to init PMD global data: No such file or directory mlx5_common: Failed to load driver mlx5_eth EAL: Requested device 0000:3b:00.1 cannot be used File: /tmp/dpdk-dumpcap_0_0000:18:00.0_20240314091910.pcapng Capturing on '0000:18:00.0' Packets captured: 2 Primary process is no longer active, exiting... EAL: Fail to recv reply for request /var/run/dpdk/rte/mp_socket:mp_pdump pdump_prepare_client_request(): client request for pdump enable/disable failed Floating point exception (core dumped) diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index d6cf00317e77b64f9822c155115f388ae62241eb..0bf885d7eaba3689fb9b98cdcaa6a928aa787985 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -1104,6 +1104,24 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev, void *init_params __rte_unused) eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts; eth_dev->tx_pkt_prepare = &ixgbe_prep_pkts; + /* Vendor and Device ID need to be set before init of shared code */ + hw->device_id = pci_dev->id.device_id; + hw->vendor_id = pci_dev->id.vendor_id; + hw->hw_addr = (void *)pci_dev->mem_resource[0].addr; + hw->allow_unsupported_sfp = 1; + + /* Initialize the shared code (base driver) */ +#ifdef RTE_LIBRTE_IXGBE_BYPASS + diag = ixgbe_bypass_init_shared_code(hw); +#else + diag = ixgbe_init_shared_code(hw); +#endif /* RTE_LIBRTE_IXGBE_BYPASS */ + + if (diag != IXGBE_SUCCESS) { + PMD_INIT_LOG(ERR, "Shared code init failed: %d", diag); + return -EIO; + } + /* * For secondary processes, we don't initialise any further as primary * has already 
done this work. Only check we don't need a different @@ -1135,24 +1153,6 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev, void *init_params __rte_unused) rte_eth_copy_pci_info(eth_dev, pci_dev); eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS; - /* Vendor and Device ID need to be set before init of shared code */ - hw->device_id = pci_dev->id.device_id; - hw->vendor_id = pci_dev->id.vendor_id; - hw->hw_addr = (void *)pci_dev->mem_resource[0].addr; - hw->allow_unsupported_sfp = 1; - - /* Initialize the shared code (base driver) */ -#ifdef RTE_LIBRTE_IXGBE_BYPASS - diag = ixgbe_bypass_init_shared_code(hw); -#else - diag = ixgbe_init_shared_code(hw); -#endif /* RTE_LIBRTE_IXGBE_BYPASS */ - - if (diag != IXGBE_SUCCESS) { - PMD_INIT_LOG(ERR, "Shared code init failed: %d", diag); - return -EIO; - } - if (hw->mac.ops.fw_recovery_mode && hw->mac.ops.fw_recovery_mode(hw)) { PMD_INIT_LOG(ERR, "\nERROR: " "Firmware recovery mode detected. Limiting functionality.\n" Additionally, I'm using a debug build, but the printed call stack still doesn't feel clear enough, which is quite strange. meson -Dc_args="-mno-avx512f" -Ddisable_drivers=net/ark,net/atlantic,net/avp,net/axgbe,net/pfe,net/netvsc -Dmax_numa_nodes=8 -Dmax_ethports=128 --buildtype=debug --optimization=0 build ninja -C build install junwang01@cestc.cn From: Stephen Hemminger Date: 2024-03-14 00:29 To: junwang01@cestc.cn CC: dev Subject: Re: dumpcap coredump for 82599 NIC On Wed, 13 Mar 2024 10:00:17 +0800 "junwang01@cestc.cn" wrote: > Hi, when I use dumpcap to capture packets on the 82599 network card, a core dump occurs. > The network card bound to ovs-dpdk is an 82599, but when I capture packets on other, non-82599 network cards (Mellanox CX5/C6 or Intel's E810), it works normally. > The DPDK version I am using is 22.11.1, but the call stack I see looks strange, so I am asking you for help.
> > > > > > I thought a newer version of DPDK might solve it, so I upgraded to DPDK 23.11, but the problem is still the same; the call stack is different and even weirder. > > > > > > > junwang01@cestc.cn This is not an issue with dumpcap. The problem is in the ixgbe driver. Some part of the code for checking link status is not safe to call from a secondary process. The backtrace looks a bit messed up, since the ixgbe driver should not be calling i40e code. Maybe do a debug build (so that more complete symbols are available).