We are using two Mellanox VFs with DPDK v22.11 and are seeing an issue when the DPDK secondary process (RTE_PROC_SECONDARY) tries to transmit packets. Note that the DPDK primary process (RTE_PROC_PRIMARY) is able to transmit packets successfully. The issue seems to be in check_cqe(), which always returns MLX5_CQE_STATUS_HW_OWN in the secondary process.
admin@10-50-54-244:~$ lspci | grep "Mellanox"
00:07.0 Ethernet controller: Mellanox Technologies MT27700 Family [ConnectX-4 Virtual Function]
00:08.0 Ethernet controller: Mellanox Technologies MT27700 Family [ConnectX-4 Virtual Function]
In our application:
proc0 -> is DPDK rte_proc_primary which initializes the necessary shared memory data structures.
proc1 -> is DPDK rte_proc_secondary which attaches to pre-initialized shared memory.
proc0(rte_proc_primary) uses port0(00:07.0) to xmit packets out - works fine as expected.
But proc1(rte_proc_secondary) uses port1(00:08.0) to xmit packets out - doesn't work as the packet is not seen on the wire.
Code snippets referenced by the gdb output below:
mlx5_tx.c:
180 */
181 void
182 mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
183 unsigned int olx __rte_unused)
184 {
185 unsigned int count = MLX5_TX_COMP_MAX_CQE;
186 volatile struct mlx5_cqe *last_cqe = NULL;
187 bool ring_doorbell = false;
188 int ret;
189
190 do {
191 volatile struct mlx5_cqe *cqe;
192
193 cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
194 ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
195 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
196 if (likely(ret != MLX5_CQE_STATUS_ERR)) {
197 /* No new CQEs in completion queue. */
198 MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN);
199 break;
200 }
mlx5_common.h:
195 static __rte_always_inline enum mlx5_cqe_status
196 check_cqe(volatile struct mlx5_cqe *cqe, const uint16_t cqes_n,
197 const uint16_t ci)
198 {
199 const uint16_t idx = ci & cqes_n;
200 const uint8_t op_own = cqe->op_own;
201 const uint8_t op_owner = MLX5_CQE_OWNER(op_own);
202 const uint8_t op_code = MLX5_CQE_OPCODE(op_own);
203
204 if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
205 return MLX5_CQE_STATUS_HW_OWN;
206 rte_io_rmb();
207 if (unlikely(op_code == MLX5_CQE_RESP_ERR ||
208 op_code == MLX5_CQE_REQ_ERR))
209 return MLX5_CQE_STATUS_ERR;
210 return MLX5_CQE_STATUS_SW_OWN;
211 }
proc1(non-working process): we have noticed the cq_ci remains 0 and doesn't increase.
Thread 1 "se_dp" hit Breakpoint 1, mlx5_tx_handle_completion (txq=0x6000496c72c0, olx=127)
at ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c:184
184 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
185 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
186 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
187 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
193 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
194 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
195 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) info locals
cqe = 0x60004962b000
count = 2
last_cqe = 0x0
ring_doorbell = false
ret = -2
(gdb) p *txq
$1 = {elts_head = 35, elts_tail = 0, elts_comp = 32, elts_s = 1024, elts_m = 1023, wqe_ci = 35,
wqe_pi = 0, wqe_s = 4096, wqe_m = 4095, wqe_comp = 32, wqe_thres = 512, cq_ci = 0, cq_pi = 1,
cqe_s = 64, cqe_m = 63, elts_n = 10, cqe_n = 6, wqe_n = 12, tso_en = 1, tunnel_en = 0, swp_en = 0,
vlan_en = 0, db_nc = 0, db_heu = 0, rt_timestamp = 0, wait_on_time = 0, fast_free = 0,
inlen_send = 18, inlen_empw = 0, inlen_mode = 18, qp_num_8s = 340992, offloads = 32815, mr_ctrl = {
dev_gen_ptr = 0x60004c2d62b4, cur_gen = 0, mru = 0, head = 0, cache = {{start = 0, end = 0,
lkey = 0}, {start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {start = 0,
end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {
start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}}, cache_bh = {len = 1,
size = 256, table = 0x6000496c5d40}}, wqes = 0x60004c255000, wqes_end = 0x60004c295000,
fcqs = 0x60004c295dc0, cqes = 0x60004962b000, qp_db = 0x60004c295004, cq_db = 0x60004962c000,
port_id = 1, idx = 0, rt_timemask = 0, ts_mask = 0, ts_offset = -1, sh = 0x60004b865880, stats = {
opackets = 35, obytes = 2228, oerrors = 0}, stats_reset = {opackets = 0, obytes = 0, oerrors = 0},
uar_data = {db = 0x0}, elts = 0x6000496c7448}
and check_cqe always returns MLX5_CQE_STATUS_HW_OWN
(gdb)
194 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) s
check_cqe (ci=0, cqes_n=64, cqe=0x60004962b000) at ../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h:199
199 ../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h: No such file or directory.
(gdb) n
200 in ../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb)
201 in ../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb)
202 in ../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb)
204 in ../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb) n
205 in ../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb) info locals
idx = 0
op_own = 241 '\361'
op_owner = 1 '\001'
op_code = 15 '\017'
Because check_cqe returns MLX5_CQE_STATUS_HW_OWN, we break at line 199 in mlx5_tx_handle_completion, and ring_doorbell remains false forever.
Below are the logs from mlx5_txq_devx_obj_new, which is called by proc0 (rte_proc_primary) for port 1:
ppriv: 0x60004b8316c0, ppriv->uar_table: 0x60004b8316c8, txq_ctrl->uar_mmap_offset:0, ppriv->uar_table[txq_data->idx]:0x7f6b2d211800, txq_data->idx: 0, txq_data->db_nc:0
And these are the logs from txq_uar_init_secondary, which is called by proc1 (rte_proc_secondary) for port 1:
priv: 0x60004b8352c0, priv->sh: 0x60004b865880, priv->sh->pppriv: 0x60004b8316c0
txq_ctrl:0x6000496c71c0 priv:0x60004b8352c0
primary_ppriv->uar_table: 0x60004b8316c8, uar_va:7f6b2d211800 offset:800 addr:0x7f6b3fe47800
ppriv:0x60004962a180 ppriv->uar_table[txq->idx]:0x7f6b3fe47800, txq->idx:0
Now for the working cases all the counters are incrementing as expected.
proc0(rte_proc_primary - working case): cq_ci, cq_pi and other counters are as expected.
Thread 1 "se_dp" hit Breakpoint 1, mlx5_tx_handle_completion (txq=0x60004b898940, olx=127) at ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c:184
184 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
185 in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) p *txq
$2 = {elts_head = 960, elts_tail = 931, elts_comp = 931, elts_s = 1024, elts_m = 1023, wqe_ci = 960, wqe_pi = 930, wqe_s = 4096, wqe_m = 4095, wqe_comp = 931, wqe_thres = 512, cq_ci = 28, cq_pi = 28, cqe_s = 64,
cqe_m = 63, elts_n = 10, cqe_n = 6, wqe_n = 12, tso_en = 1, tunnel_en = 0, swp_en = 0, vlan_en = 0, db_nc = 0, db_heu = 0, rt_timestamp = 0, wait_on_time = 0, fast_free = 0, inlen_send = 18, inlen_empw = 0,
inlen_mode = 18, qp_num_8s = 865280, offloads = 32815, mr_ctrl = {dev_gen_ptr = 0x600049a000f4, cur_gen = 0, mru = 0, head = 0, cache = {{start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {
start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}}, cache_bh = {
len = 1, size = 256, table = 0x60004b8973c0}}, wqes = 0x600049655000, wqes_end = 0x600049695000, fcqs = 0x600049697100, cqes = 0x600049696000, qp_db = 0x600049695004, cq_db = 0x600049697000, port_id = 0,
idx = 0, rt_timemask = 0, ts_mask = 0, ts_offset = -1, sh = 0x60004be00c40, stats = {opackets = 960, obytes = 73222, oerrors = 0}, stats_reset = {opackets = 0, obytes = 0, oerrors = 0}, uar_data = {db = 0x0},
elts = 0x60004b898ac8}
(gdb)
Few questions:
1. Why isn't the cq_ci counter increasing in proc1 (rte_proc_secondary)? Does it mean the Mellanox hardware is not consuming the packets?
2. Why is check_cqe stuck at MLX5_CQE_STATUS_HW_OWN in proc1 (rte_proc_secondary)?
Thanks,
Samar