* [PATCH 21.11] common/mlx5: fix error CQE handling for 128 bytes CQE
@ 2024-11-27 17:48 Alexander Kozyrev
2024-12-03 17:43 ` Kevin Traynor
0 siblings, 1 reply; 6+ messages in thread
From: Alexander Kozyrev @ 2024-11-27 17:48 UTC (permalink / raw)
To: stable; +Cc: ktraynor, Viacheslav Ovsiienko
[ upstream commit 3cddeba0ca38b00c7dc646277484d08a4cb2d862 ]
The completion queue element size can be independently configured
to report either 64 or 128 bytes CQEs by programming cqe_sz parameter
at CQ creation. This parameter depends on the cache line size and
affects both regular CQEs and error CQEs. But the error handling
assumes that an error CQE is 64 bytes and doesn't take the padding
into consideration on platforms with 128-byte cache lines.
Fix the error CQE size in all error handling routines in mlx5.
Fixes: 957e45fb7bcb ("net/mlx5: handle Tx completion with error")
Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
drivers/common/mlx5/mlx5_prm.h | 29 ++++++++++++++++++++-
drivers/common/mlx5/windows/mlx5_win_defs.h | 12 ---------
drivers/compress/mlx5/mlx5_compress.c | 4 +--
drivers/net/mlx5/mlx5_flow_aso.c | 6 ++---
drivers/net/mlx5/mlx5_rx.c | 2 +-
drivers/net/mlx5/mlx5_tx.c | 8 +++---
6 files changed, 38 insertions(+), 23 deletions(-)
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index 58aa72df64..8ce1e66888 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -249,8 +249,12 @@
/* Maximum number of DS in WQE. Limited by 6-bit field. */
#define MLX5_DSEG_MAX 63
-/* The 32 bit syndrome offset in struct mlx5_err_cqe. */
+/* The 32 bit syndrome offset in struct mlx5_error_cqe. */
+#if (RTE_CACHE_LINE_SIZE == 128)
+#define MLX5_ERROR_CQE_SYNDROME_OFFSET 116
+#else
#define MLX5_ERROR_CQE_SYNDROME_OFFSET 52
+#endif
/* The completion mode offset in the WQE control segment line 2. */
#define MLX5_COMP_MODE_OFFSET 2
@@ -379,6 +383,29 @@ struct mlx5_wqe_mprq {
#define MLX5_MPRQ_STRIDE_SHIFT_BYTE 2
+struct mlx5_error_cqe {
+#if (RTE_CACHE_LINE_SIZE == 128)
+ uint8_t padding[64];
+#endif
+ uint8_t rsvd0[2];
+ uint16_t eth_wqe_id;
+ uint8_t rsvd1[16];
+ uint16_t ib_stride_index;
+ uint8_t rsvd2[10];
+ uint32_t srqn;
+ uint8_t rsvd3[8];
+ uint32_t byte_cnt;
+ uint8_t rsvd4[4];
+ uint8_t hw_err_synd;
+ uint8_t hw_synd_type;
+ uint8_t vendor_err_synd;
+ uint8_t syndrome;
+ uint32_t s_wqe_opcode_qpn;
+ uint16_t wqe_counter;
+ uint8_t signature;
+ uint8_t op_own;
+};
+
/* CQ element structure - should be equal to the cache line size */
struct mlx5_cqe {
#if (RTE_CACHE_LINE_SIZE == 128)
diff --git a/drivers/common/mlx5/windows/mlx5_win_defs.h b/drivers/common/mlx5/windows/mlx5_win_defs.h
index 0c09325444..d79728bcda 100644
--- a/drivers/common/mlx5/windows/mlx5_win_defs.h
+++ b/drivers/common/mlx5/windows/mlx5_win_defs.h
@@ -223,18 +223,6 @@ struct mlx5_action {
} dest_tir;
};
-struct mlx5_err_cqe {
- uint8_t rsvd0[32];
- uint32_t srqn;
- uint8_t rsvd1[18];
- uint8_t vendor_err_synd;
- uint8_t syndrome;
- uint32_t s_wqe_opcode_qpn;
- uint16_t wqe_counter;
- uint8_t signature;
- uint8_t op_own;
-};
-
struct mlx5_wqe_srq_next_seg {
uint8_t rsvd0[2];
rte_be16_t next_wqe_index;
diff --git a/drivers/compress/mlx5/mlx5_compress.c b/drivers/compress/mlx5/mlx5_compress.c
index 515b5dfa67..e1ca77795c 100644
--- a/drivers/compress/mlx5/mlx5_compress.c
+++ b/drivers/compress/mlx5/mlx5_compress.c
@@ -537,7 +537,7 @@ mlx5_compress_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe,
size_t i;
DRV_LOG(ERR, "Error cqe:");
- for (i = 0; i < sizeof(struct mlx5_err_cqe) >> 2; i += 4)
+ for (i = 0; i < sizeof(struct mlx5_error_cqe) >> 2; i += 4)
DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
cqe[i + 2], cqe[i + 3]);
DRV_LOG(ERR, "\nError wqe:");
@@ -555,7 +555,7 @@ mlx5_compress_cqe_err_handle(struct mlx5_compress_qp *qp,
struct rte_comp_op *op)
{
const uint32_t idx = qp->ci & (qp->entries_n - 1);
- volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
+ volatile struct mlx5_error_cqe *cqe = (volatile struct mlx5_error_cqe *)
&qp->cq.cqes[idx];
volatile struct mlx5_gga_wqe *wqes = (volatile struct mlx5_gga_wqe *)
qp->qp.wqes;
diff --git a/drivers/net/mlx5/mlx5_flow_aso.c b/drivers/net/mlx5/mlx5_flow_aso.c
index eb7fc43da3..0d01ca1310 100644
--- a/drivers/net/mlx5/mlx5_flow_aso.c
+++ b/drivers/net/mlx5/mlx5_flow_aso.c
@@ -406,7 +406,7 @@ mlx5_aso_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe)
int i;
DRV_LOG(ERR, "Error cqe:");
- for (i = 0; i < 16; i += 4)
+ for (i = 0; i < (int)sizeof(struct mlx5_error_cqe) / 4; i += 4)
DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
cqe[i + 2], cqe[i + 3]);
DRV_LOG(ERR, "\nError wqe:");
@@ -426,8 +426,8 @@ mlx5_aso_cqe_err_handle(struct mlx5_aso_sq *sq)
{
struct mlx5_aso_cq *cq = &sq->cq;
uint32_t idx = cq->cq_ci & ((1 << cq->log_desc_n) - 1);
- volatile struct mlx5_err_cqe *cqe =
- (volatile struct mlx5_err_cqe *)&cq->cq_obj.cqes[idx];
+ volatile struct mlx5_error_cqe *cqe =
+ (volatile struct mlx5_error_cqe *)&cq->cq_obj.cqes[idx];
cq->errors++;
idx = rte_be_to_cpu_16(cqe->wqe_counter) & (1u << sq->log_desc_n);
diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
index eea6a5c6c7..78d3949276 100644
--- a/drivers/net/mlx5/mlx5_rx.c
+++ b/drivers/net/mlx5/mlx5_rx.c
@@ -433,7 +433,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec,
container_of(rxq, struct mlx5_rxq_ctrl, rxq);
union {
volatile struct mlx5_cqe *cqe;
- volatile struct mlx5_err_cqe *err_cqe;
+ volatile struct mlx5_error_cqe *err_cqe;
} u = {
.cqe = &(*rxq->cqes)[(rxq->cq_ci - vec) & cqe_mask],
};
diff --git a/drivers/net/mlx5/mlx5_tx.c b/drivers/net/mlx5/mlx5_tx.c
index 8f03743986..edb1e0265f 100644
--- a/drivers/net/mlx5/mlx5_tx.c
+++ b/drivers/net/mlx5/mlx5_tx.c
@@ -55,7 +55,7 @@ tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
/* Return 1 if the error CQE is signed otherwise, sign it and return 0. */
static int
-check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
+check_err_cqe_seen(volatile struct mlx5_error_cqe *err_cqe)
{
static const uint8_t magic[] = "seen";
int ret = 1;
@@ -83,7 +83,7 @@ check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
*/
static int
mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
- volatile struct mlx5_err_cqe *err_cqe)
+ volatile struct mlx5_error_cqe *err_cqe)
{
if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
@@ -107,7 +107,7 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
mlx5_dump_debug_information(name, "MLX5 Error CQ:",
(const void *)((uintptr_t)
txq->cqes),
- sizeof(struct mlx5_cqe) *
+ sizeof(struct mlx5_error_cqe) *
(1 << txq->cqe_n));
mlx5_dump_debug_information(name, "MLX5 Error SQ:",
(const void *)((uintptr_t)
@@ -231,7 +231,7 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
*/
rte_wmb();
ret = mlx5_tx_error_cqe_handle
- (txq, (volatile struct mlx5_err_cqe *)cqe);
+ (txq, (volatile struct mlx5_error_cqe *)cqe);
if (unlikely(ret < 0)) {
/*
* Some error occurred on queue error
--
2.43.5
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 21.11] common/mlx5: fix error CQE handling for 128 bytes CQE
2024-11-27 17:48 [PATCH 21.11] common/mlx5: fix error CQE handling for 128 bytes CQE Alexander Kozyrev
@ 2024-12-03 17:43 ` Kevin Traynor
2024-12-03 17:54 ` Kevin Traynor
0 siblings, 1 reply; 6+ messages in thread
From: Kevin Traynor @ 2024-12-03 17:43 UTC (permalink / raw)
To: Alexander Kozyrev, stable; +Cc: Viacheslav Ovsiienko
On 27/11/2024 18:48, Alexander Kozyrev wrote:
> [ upstream commit 3cddeba0ca38b00c7dc646277484d08a4cb2d862 ]
>
> The completion queue element size can be independently configured
> to report either 64 or 128 bytes CQEs by programming cqe_sz parameter
> at CQ creation. This parameter depends on the cache line size and
> affects both regular CQEs and error CQEs. But the error handling
> assumes that an error CQE is 64 bytes and doesn't take the padding
> into consideration on platforms with 128-byte cache lines.
> Fix the error CQE size in all error handling routines in mlx5.
>
Hi Alexander,
This is causing a build error [1] on windows in the crypto code [2].
ref
https://dpdkdashboard.iol.unh.edu/results/dashboard/testruns/logs/1475543/
I didn't spend time to analyse. If you have a fix I can apply, or else
we can drop the patch.
thanks,
Kevin.
[1]
../drivers/crypto/mlx5/mlx5_crypto.c
../drivers/crypto/mlx5/mlx5_crypto.c:480:52: error: incomplete
definition of type 'struct mlx5_err_cqe'
DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
~~~^
[2]
mlx5_crypto_cqe_err_handle(struct mlx5_crypto_qp *qp, struct
rte_crypto_op *op)
{
const uint32_t idx = qp->ci & (qp->entries_n - 1);
volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
&qp->cq_obj.cqes[idx];
op->status = RTE_CRYPTO_OP_STATUS_ERROR;
qp->stats.dequeue_err_count++;
DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
}
> Fixes: 957e45fb7bcb ("net/mlx5: handle Tx completion with error")
>
> Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
> Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
> ---
> drivers/common/mlx5/mlx5_prm.h | 29 ++++++++++++++++++++-
> drivers/common/mlx5/windows/mlx5_win_defs.h | 12 ---------
> drivers/compress/mlx5/mlx5_compress.c | 4 +--
> drivers/net/mlx5/mlx5_flow_aso.c | 6 ++---
> drivers/net/mlx5/mlx5_rx.c | 2 +-
> drivers/net/mlx5/mlx5_tx.c | 8 +++---
> 6 files changed, 38 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
> index 58aa72df64..8ce1e66888 100644
> --- a/drivers/common/mlx5/mlx5_prm.h
> +++ b/drivers/common/mlx5/mlx5_prm.h
> @@ -249,8 +249,12 @@
> /* Maximum number of DS in WQE. Limited by 6-bit field. */
> #define MLX5_DSEG_MAX 63
>
> -/* The 32 bit syndrome offset in struct mlx5_err_cqe. */
> +/* The 32 bit syndrome offset in struct mlx5_error_cqe. */
> +#if (RTE_CACHE_LINE_SIZE == 128)
> +#define MLX5_ERROR_CQE_SYNDROME_OFFSET 116
> +#else
> #define MLX5_ERROR_CQE_SYNDROME_OFFSET 52
> +#endif
>
> /* The completion mode offset in the WQE control segment line 2. */
> #define MLX5_COMP_MODE_OFFSET 2
> @@ -379,6 +383,29 @@ struct mlx5_wqe_mprq {
>
> #define MLX5_MPRQ_STRIDE_SHIFT_BYTE 2
>
> +struct mlx5_error_cqe {
> +#if (RTE_CACHE_LINE_SIZE == 128)
> + uint8_t padding[64];
> +#endif
> + uint8_t rsvd0[2];
> + uint16_t eth_wqe_id;
> + uint8_t rsvd1[16];
> + uint16_t ib_stride_index;
> + uint8_t rsvd2[10];
> + uint32_t srqn;
> + uint8_t rsvd3[8];
> + uint32_t byte_cnt;
> + uint8_t rsvd4[4];
> + uint8_t hw_err_synd;
> + uint8_t hw_synd_type;
> + uint8_t vendor_err_synd;
> + uint8_t syndrome;
> + uint32_t s_wqe_opcode_qpn;
> + uint16_t wqe_counter;
> + uint8_t signature;
> + uint8_t op_own;
> +};
> +
> /* CQ element structure - should be equal to the cache line size */
> struct mlx5_cqe {
> #if (RTE_CACHE_LINE_SIZE == 128)
> diff --git a/drivers/common/mlx5/windows/mlx5_win_defs.h b/drivers/common/mlx5/windows/mlx5_win_defs.h
> index 0c09325444..d79728bcda 100644
> --- a/drivers/common/mlx5/windows/mlx5_win_defs.h
> +++ b/drivers/common/mlx5/windows/mlx5_win_defs.h
> @@ -223,18 +223,6 @@ struct mlx5_action {
> } dest_tir;
> };
>
> -struct mlx5_err_cqe {
> - uint8_t rsvd0[32];
> - uint32_t srqn;
> - uint8_t rsvd1[18];
> - uint8_t vendor_err_synd;
> - uint8_t syndrome;
> - uint32_t s_wqe_opcode_qpn;
> - uint16_t wqe_counter;
> - uint8_t signature;
> - uint8_t op_own;
> -};
> -
> struct mlx5_wqe_srq_next_seg {
> uint8_t rsvd0[2];
> rte_be16_t next_wqe_index;
> diff --git a/drivers/compress/mlx5/mlx5_compress.c b/drivers/compress/mlx5/mlx5_compress.c
> index 515b5dfa67..e1ca77795c 100644
> --- a/drivers/compress/mlx5/mlx5_compress.c
> +++ b/drivers/compress/mlx5/mlx5_compress.c
> @@ -537,7 +537,7 @@ mlx5_compress_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe,
> size_t i;
>
> DRV_LOG(ERR, "Error cqe:");
> - for (i = 0; i < sizeof(struct mlx5_err_cqe) >> 2; i += 4)
> + for (i = 0; i < sizeof(struct mlx5_error_cqe) >> 2; i += 4)
> DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
> cqe[i + 2], cqe[i + 3]);
> DRV_LOG(ERR, "\nError wqe:");
> @@ -555,7 +555,7 @@ mlx5_compress_cqe_err_handle(struct mlx5_compress_qp *qp,
> struct rte_comp_op *op)
> {
> const uint32_t idx = qp->ci & (qp->entries_n - 1);
> - volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
> + volatile struct mlx5_error_cqe *cqe = (volatile struct mlx5_error_cqe *)
> &qp->cq.cqes[idx];
> volatile struct mlx5_gga_wqe *wqes = (volatile struct mlx5_gga_wqe *)
> qp->qp.wqes;
> diff --git a/drivers/net/mlx5/mlx5_flow_aso.c b/drivers/net/mlx5/mlx5_flow_aso.c
> index eb7fc43da3..0d01ca1310 100644
> --- a/drivers/net/mlx5/mlx5_flow_aso.c
> +++ b/drivers/net/mlx5/mlx5_flow_aso.c
> @@ -406,7 +406,7 @@ mlx5_aso_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe)
> int i;
>
> DRV_LOG(ERR, "Error cqe:");
> - for (i = 0; i < 16; i += 4)
> + for (i = 0; i < (int)sizeof(struct mlx5_error_cqe) / 4; i += 4)
> DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
> cqe[i + 2], cqe[i + 3]);
> DRV_LOG(ERR, "\nError wqe:");
> @@ -426,8 +426,8 @@ mlx5_aso_cqe_err_handle(struct mlx5_aso_sq *sq)
> {
> struct mlx5_aso_cq *cq = &sq->cq;
> uint32_t idx = cq->cq_ci & ((1 << cq->log_desc_n) - 1);
> - volatile struct mlx5_err_cqe *cqe =
> - (volatile struct mlx5_err_cqe *)&cq->cq_obj.cqes[idx];
> + volatile struct mlx5_error_cqe *cqe =
> + (volatile struct mlx5_error_cqe *)&cq->cq_obj.cqes[idx];
>
> cq->errors++;
> idx = rte_be_to_cpu_16(cqe->wqe_counter) & (1u << sq->log_desc_n);
> diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
> index eea6a5c6c7..78d3949276 100644
> --- a/drivers/net/mlx5/mlx5_rx.c
> +++ b/drivers/net/mlx5/mlx5_rx.c
> @@ -433,7 +433,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec,
> container_of(rxq, struct mlx5_rxq_ctrl, rxq);
> union {
> volatile struct mlx5_cqe *cqe;
> - volatile struct mlx5_err_cqe *err_cqe;
> + volatile struct mlx5_error_cqe *err_cqe;
> } u = {
> .cqe = &(*rxq->cqes)[(rxq->cq_ci - vec) & cqe_mask],
> };
> diff --git a/drivers/net/mlx5/mlx5_tx.c b/drivers/net/mlx5/mlx5_tx.c
> index 8f03743986..edb1e0265f 100644
> --- a/drivers/net/mlx5/mlx5_tx.c
> +++ b/drivers/net/mlx5/mlx5_tx.c
> @@ -55,7 +55,7 @@ tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
>
> /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */
> static int
> -check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
> +check_err_cqe_seen(volatile struct mlx5_error_cqe *err_cqe)
> {
> static const uint8_t magic[] = "seen";
> int ret = 1;
> @@ -83,7 +83,7 @@ check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
> */
> static int
> mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
> - volatile struct mlx5_err_cqe *err_cqe)
> + volatile struct mlx5_error_cqe *err_cqe)
> {
> if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
> const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
> @@ -107,7 +107,7 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
> mlx5_dump_debug_information(name, "MLX5 Error CQ:",
> (const void *)((uintptr_t)
> txq->cqes),
> - sizeof(struct mlx5_cqe) *
> + sizeof(struct mlx5_error_cqe) *
> (1 << txq->cqe_n));
> mlx5_dump_debug_information(name, "MLX5 Error SQ:",
> (const void *)((uintptr_t)
> @@ -231,7 +231,7 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
> */
> rte_wmb();
> ret = mlx5_tx_error_cqe_handle
> - (txq, (volatile struct mlx5_err_cqe *)cqe);
> + (txq, (volatile struct mlx5_error_cqe *)cqe);
> if (unlikely(ret < 0)) {
> /*
> * Some error occurred on queue error
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 21.11] common/mlx5: fix error CQE handling for 128 bytes CQE
2024-12-03 17:43 ` Kevin Traynor
@ 2024-12-03 17:54 ` Kevin Traynor
2024-12-03 18:08 ` Alexander Kozyrev
0 siblings, 1 reply; 6+ messages in thread
From: Kevin Traynor @ 2024-12-03 17:54 UTC (permalink / raw)
To: Alexander Kozyrev, stable; +Cc: Viacheslav Ovsiienko
On 03/12/2024 18:43, Kevin Traynor wrote:
> On 27/11/2024 18:48, Alexander Kozyrev wrote:
>> [ upstream commit 3cddeba0ca38b00c7dc646277484d08a4cb2d862 ]
>>
>> The completion queue element size can be independently configured
>> to report either 64 or 128 bytes CQEs by programming cqe_sz parameter
>> at CQ creation. This parameter depends on the cache line size and
>> affects both regular CQEs and error CQEs. But the error handling
>> assumes that an error CQE is 64 bytes and doesn't take the padding
>> into consideration on platforms with 128-byte cache lines.
>> Fix the error CQE size in all error handling routines in mlx5.
>>
> Hi Alexander,
>
> This is causing a build error [1] on windows in the crypto code [2].
>
> ref
> https://dpdkdashboard.iol.unh.edu/results/dashboard/testruns/logs/1475543/
>
> I didn't spend time to analyse. If you have a fix I can apply, or else
> we can drop the patch.
>
> thanks,
> Kevin.
>
> [1]
> ../drivers/crypto/mlx5/mlx5_crypto.c
> ../drivers/crypto/mlx5/mlx5_crypto.c:480:52: error: incomplete
> definition of type 'struct mlx5_err_cqe'
> DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
> ~~~^
> [2]
> mlx5_crypto_cqe_err_handle(struct mlx5_crypto_qp *qp, struct
> rte_crypto_op *op)
> {
> const uint32_t idx = qp->ci & (qp->entries_n - 1);
> volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
> &qp->cq_obj.cqes[idx];
>
> op->status = RTE_CRYPTO_OP_STATUS_ERROR;
> qp->stats.dequeue_err_count++;
> DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
> }
>
btw, the code that was run is ahead of the 21.11 branch.
It is at: https://dpdk.org/git/dpdk-stable on branch 21.11-staging
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 21.11] common/mlx5: fix error CQE handling for 128 bytes CQE
2024-12-03 17:54 ` Kevin Traynor
@ 2024-12-03 18:08 ` Alexander Kozyrev
2024-12-03 18:17 ` Kevin Traynor
0 siblings, 1 reply; 6+ messages in thread
From: Alexander Kozyrev @ 2024-12-03 18:08 UTC (permalink / raw)
To: Kevin Traynor, stable; +Cc: Slava Ovsiienko
[-- Attachment #1.1: Type: text/plain, Size: 2014 bytes --]
> On 03/12/2024 18:43, Kevin Traynor wrote:
>> On 27/11/2024 18:48, Alexander Kozyrev wrote:
>>> [ upstream commit 3cddeba0ca38b00c7dc646277484d08a4cb2d862 ]
>>>
>>> The completion queue element size can be independently configured
>>> to report either 64 or 128 bytes CQEs by programming cqe_sz parameter
>>> at CQ creation. This parameter depends on the cache line size and
>>> affects both regular CQEs and error CQEs. But the error handling
>>> assumes that an error CQE is 64 bytes and doesn't take the padding
>>> into consideration on platforms with 128-byte cache lines.
>>> Fix the error CQE size in all error handling routines in mlx5.
>>>
>> Hi Alexander,
>>
>> This is causing a build error [1] on windows in the crypto code [2].
>>
>> ref
>> https://dpdkdashboard.iol.unh.edu/results/dashboard/testruns/logs/1475543/
>>
>> I didn't spend time to analyse. If you have a fix I can apply, or else
>> we can drop the patch.
>>
>> thanks,
>> Kevin.
>>
>> [1]
>> ../drivers/crypto/mlx5/mlx5_crypto.c
>> ../drivers/crypto/mlx5/mlx5_crypto.c:480:52: error: incomplete
>> definition of type 'struct mlx5_err_cqe'
>> DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
>> ~~~^
>> [2]
>> mlx5_crypto_cqe_err_handle(struct mlx5_crypto_qp *qp, struct
>> rte_crypto_op *op)
>> {
>> const uint32_t idx = qp->ci & (qp->entries_n - 1);
>> volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
>> &qp->cq_obj.cqes[idx];
>>
>> op->status = RTE_CRYPTO_OP_STATUS_ERROR;
>> qp->stats.dequeue_err_count++;
>> DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
>> }
>>
>
> btw, the code that was run is ahead of the 21.11 branch.
>
> It is at: https://dpdk.org/git/dpdk-stable on branch 21.11-staging
Sorry, somehow missed the crypto driver on 21.11. Attaching the proper patch.
[-- Attachment #1.2: Type: text/html, Size: 4311 bytes --]
[-- Attachment #2: 0001-common-mlx5-fix-error-CQE-handling-for-128-bytes-CQE.patch --]
[-- Type: application/octet-stream, Size: 8097 bytes --]
From ef1696ad3dc30c73f175a13e2919eadff1490ff9 Mon Sep 17 00:00:00 2001
From: Alexander Kozyrev <akozyrev@nvidia.com>
Date: Wed, 27 Nov 2024 19:48:23 +0200
Subject: [PATCH] common/mlx5: fix error CQE handling for 128 bytes CQE
[ upstream commit 3cddeba0ca38b00c7dc646277484d08a4cb2d862 ]
The completion queue element size can be independently configured
to report either 64 or 128 bytes CQEs by programming cqe_sz parameter
at CQ creation. This parameter depends on the cache line size and
affects both regular CQEs and error CQEs. But the error handling
assumes that an error CQE is 64 bytes and doesn't take the padding
into consideration on platforms with 128-byte cache lines.
Fix the error CQE size in all error handling routines in mlx5.
Fixes: 957e45fb7bcb ("net/mlx5: handle Tx completion with error")
Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
drivers/common/mlx5/mlx5_prm.h | 29 ++++++++++++++++++++-
drivers/common/mlx5/windows/mlx5_win_defs.h | 12 ---------
drivers/compress/mlx5/mlx5_compress.c | 4 +--
drivers/crypto/mlx5/mlx5_crypto.c | 2 +-
drivers/net/mlx5/mlx5_flow_aso.c | 6 ++---
drivers/net/mlx5/mlx5_rx.c | 2 +-
drivers/net/mlx5/mlx5_tx.c | 8 +++---
7 files changed, 39 insertions(+), 24 deletions(-)
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index fd5c8b5c72..f561e53f6b 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -249,8 +249,12 @@
/* Maximum number of DS in WQE. Limited by 6-bit field. */
#define MLX5_DSEG_MAX 63
-/* The 32 bit syndrome offset in struct mlx5_err_cqe. */
+/* The 32 bit syndrome offset in struct mlx5_error_cqe. */
+#if (RTE_CACHE_LINE_SIZE == 128)
+#define MLX5_ERROR_CQE_SYNDROME_OFFSET 116
+#else
#define MLX5_ERROR_CQE_SYNDROME_OFFSET 52
+#endif
/* The completion mode offset in the WQE control segment line 2. */
#define MLX5_COMP_MODE_OFFSET 2
@@ -379,6 +383,29 @@ struct mlx5_wqe_mprq {
#define MLX5_MPRQ_STRIDE_SHIFT_BYTE 2
+struct mlx5_error_cqe {
+#if (RTE_CACHE_LINE_SIZE == 128)
+ uint8_t padding[64];
+#endif
+ uint8_t rsvd0[2];
+ uint16_t eth_wqe_id;
+ uint8_t rsvd1[16];
+ uint16_t ib_stride_index;
+ uint8_t rsvd2[10];
+ uint32_t srqn;
+ uint8_t rsvd3[8];
+ uint32_t byte_cnt;
+ uint8_t rsvd4[4];
+ uint8_t hw_err_synd;
+ uint8_t hw_synd_type;
+ uint8_t vendor_err_synd;
+ uint8_t syndrome;
+ uint32_t s_wqe_opcode_qpn;
+ uint16_t wqe_counter;
+ uint8_t signature;
+ uint8_t op_own;
+};
+
/* CQ element structure - should be equal to the cache line size */
struct mlx5_cqe {
#if (RTE_CACHE_LINE_SIZE == 128)
diff --git a/drivers/common/mlx5/windows/mlx5_win_defs.h b/drivers/common/mlx5/windows/mlx5_win_defs.h
index 0c09325444..d79728bcda 100644
--- a/drivers/common/mlx5/windows/mlx5_win_defs.h
+++ b/drivers/common/mlx5/windows/mlx5_win_defs.h
@@ -223,18 +223,6 @@ struct mlx5_action {
} dest_tir;
};
-struct mlx5_err_cqe {
- uint8_t rsvd0[32];
- uint32_t srqn;
- uint8_t rsvd1[18];
- uint8_t vendor_err_synd;
- uint8_t syndrome;
- uint32_t s_wqe_opcode_qpn;
- uint16_t wqe_counter;
- uint8_t signature;
- uint8_t op_own;
-};
-
struct mlx5_wqe_srq_next_seg {
uint8_t rsvd0[2];
rte_be16_t next_wqe_index;
diff --git a/drivers/compress/mlx5/mlx5_compress.c b/drivers/compress/mlx5/mlx5_compress.c
index 515b5dfa67..e1ca77795c 100644
--- a/drivers/compress/mlx5/mlx5_compress.c
+++ b/drivers/compress/mlx5/mlx5_compress.c
@@ -537,7 +537,7 @@ mlx5_compress_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe,
size_t i;
DRV_LOG(ERR, "Error cqe:");
- for (i = 0; i < sizeof(struct mlx5_err_cqe) >> 2; i += 4)
+ for (i = 0; i < sizeof(struct mlx5_error_cqe) >> 2; i += 4)
DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
cqe[i + 2], cqe[i + 3]);
DRV_LOG(ERR, "\nError wqe:");
@@ -555,7 +555,7 @@ mlx5_compress_cqe_err_handle(struct mlx5_compress_qp *qp,
struct rte_comp_op *op)
{
const uint32_t idx = qp->ci & (qp->entries_n - 1);
- volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
+ volatile struct mlx5_error_cqe *cqe = (volatile struct mlx5_error_cqe *)
&qp->cq.cqes[idx];
volatile struct mlx5_gga_wqe *wqes = (volatile struct mlx5_gga_wqe *)
qp->qp.wqes;
diff --git a/drivers/crypto/mlx5/mlx5_crypto.c b/drivers/crypto/mlx5/mlx5_crypto.c
index 36db31aae5..8a53af8aef 100644
--- a/drivers/crypto/mlx5/mlx5_crypto.c
+++ b/drivers/crypto/mlx5/mlx5_crypto.c
@@ -472,7 +472,7 @@ static __rte_noinline void
mlx5_crypto_cqe_err_handle(struct mlx5_crypto_qp *qp, struct rte_crypto_op *op)
{
const uint32_t idx = qp->ci & (qp->entries_n - 1);
- volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
+ volatile struct mlx5_error_cqe *cqe = (volatile struct mlx5_error_cqe *)
&qp->cq_obj.cqes[idx];
op->status = RTE_CRYPTO_OP_STATUS_ERROR;
diff --git a/drivers/net/mlx5/mlx5_flow_aso.c b/drivers/net/mlx5/mlx5_flow_aso.c
index eb7fc43da3..0d01ca1310 100644
--- a/drivers/net/mlx5/mlx5_flow_aso.c
+++ b/drivers/net/mlx5/mlx5_flow_aso.c
@@ -406,7 +406,7 @@ mlx5_aso_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe)
int i;
DRV_LOG(ERR, "Error cqe:");
- for (i = 0; i < 16; i += 4)
+ for (i = 0; i < (int)sizeof(struct mlx5_error_cqe) / 4; i += 4)
DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
cqe[i + 2], cqe[i + 3]);
DRV_LOG(ERR, "\nError wqe:");
@@ -426,8 +426,8 @@ mlx5_aso_cqe_err_handle(struct mlx5_aso_sq *sq)
{
struct mlx5_aso_cq *cq = &sq->cq;
uint32_t idx = cq->cq_ci & ((1 << cq->log_desc_n) - 1);
- volatile struct mlx5_err_cqe *cqe =
- (volatile struct mlx5_err_cqe *)&cq->cq_obj.cqes[idx];
+ volatile struct mlx5_error_cqe *cqe =
+ (volatile struct mlx5_error_cqe *)&cq->cq_obj.cqes[idx];
cq->errors++;
idx = rte_be_to_cpu_16(cqe->wqe_counter) & (1u << sq->log_desc_n);
diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
index eea6a5c6c7..78d3949276 100644
--- a/drivers/net/mlx5/mlx5_rx.c
+++ b/drivers/net/mlx5/mlx5_rx.c
@@ -433,7 +433,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec,
container_of(rxq, struct mlx5_rxq_ctrl, rxq);
union {
volatile struct mlx5_cqe *cqe;
- volatile struct mlx5_err_cqe *err_cqe;
+ volatile struct mlx5_error_cqe *err_cqe;
} u = {
.cqe = &(*rxq->cqes)[(rxq->cq_ci - vec) & cqe_mask],
};
diff --git a/drivers/net/mlx5/mlx5_tx.c b/drivers/net/mlx5/mlx5_tx.c
index 8f03743986..edb1e0265f 100644
--- a/drivers/net/mlx5/mlx5_tx.c
+++ b/drivers/net/mlx5/mlx5_tx.c
@@ -55,7 +55,7 @@ tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
/* Return 1 if the error CQE is signed otherwise, sign it and return 0. */
static int
-check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
+check_err_cqe_seen(volatile struct mlx5_error_cqe *err_cqe)
{
static const uint8_t magic[] = "seen";
int ret = 1;
@@ -83,7 +83,7 @@ check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
*/
static int
mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
- volatile struct mlx5_err_cqe *err_cqe)
+ volatile struct mlx5_error_cqe *err_cqe)
{
if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
@@ -107,7 +107,7 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
mlx5_dump_debug_information(name, "MLX5 Error CQ:",
(const void *)((uintptr_t)
txq->cqes),
- sizeof(struct mlx5_cqe) *
+ sizeof(struct mlx5_error_cqe) *
(1 << txq->cqe_n));
mlx5_dump_debug_information(name, "MLX5 Error SQ:",
(const void *)((uintptr_t)
@@ -231,7 +231,7 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
*/
rte_wmb();
ret = mlx5_tx_error_cqe_handle
- (txq, (volatile struct mlx5_err_cqe *)cqe);
+ (txq, (volatile struct mlx5_error_cqe *)cqe);
if (unlikely(ret < 0)) {
/*
* Some error occurred on queue error
--
2.43.5
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 21.11] common/mlx5: fix error CQE handling for 128 bytes CQE
2024-12-03 18:08 ` Alexander Kozyrev
@ 2024-12-03 18:17 ` Kevin Traynor
2024-12-04 11:33 ` Kevin Traynor
0 siblings, 1 reply; 6+ messages in thread
From: Kevin Traynor @ 2024-12-03 18:17 UTC (permalink / raw)
To: Alexander Kozyrev, stable; +Cc: Slava Ovsiienko
On 03/12/2024 19:08, Alexander Kozyrev wrote:
>> On 03/12/2024 18:43, Kevin Traynor wrote:
>>> On 27/11/2024 18:48, Alexander Kozyrev wrote:
>>>> [ upstream commit 3cddeba0ca38b00c7dc646277484d08a4cb2d862 ]
>>>>
>>>> The completion queue element size can be independently configured
>>>> to report either 64 or 128 bytes CQEs by programming cqe_sz parameter
>>>> at CQ creation. This parameter depends on the cache line size and
>>>> affects both regular CQEs and error CQEs. But the error handling
>>>> assumes that an error CQE is 64 bytes and doesn't take the padding
>>>> into consideration on platforms with 128-byte cache lines.
>>>> Fix the error CQE size in all error handling routines in mlx5.
>>>>
>>> Hi Alexander,
>>>
>>> This is causing a build error [1] on windows in the crypto code [2].
>>>
>>> ref
>>> https://dpdkdashboard.iol.unh.edu/results/dashboard/testruns/logs/1475543/
>>>
>>> I didn't spend time to analyse. If you have a fix I can apply, or else
>>> we can drop the patch.
>>>
>>> thanks,
>>> Kevin.
>>>
>>> [1]
>>> ../drivers/crypto/mlx5/mlx5_crypto.c
>>> ../drivers/crypto/mlx5/mlx5_crypto.c:480:52: error: incomplete
>>> definition of type 'struct mlx5_err_cqe'
>>> DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
>>> ~~~^
>>> [2]
>>> mlx5_crypto_cqe_err_handle(struct mlx5_crypto_qp *qp, struct
>>> rte_crypto_op *op)
>>> {
>>> const uint32_t idx = qp->ci & (qp->entries_n - 1);
>>> volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
>>> &qp->cq_obj.cqes[idx];
>>>
>>> op->status = RTE_CRYPTO_OP_STATUS_ERROR;
>>> qp->stats.dequeue_err_count++;
>>> DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
>>> }
>>>
>>
>> btw, the code that was run is ahead of the 21.11 branch.
>>
>> It is at: https://dpdk.org/git/dpdk-stable on branch 21.11-staging
>
> Sorry, somehow missed the crypto driver on 21.11. Attaching the proper patch.
>
>
Thanks for the quick fix! I've pushed to the 21.11-staging branch and
the ci will be re-run overnight.
thanks,
Kevin.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 21.11] common/mlx5: fix error CQE handling for 128 bytes CQE
2024-12-03 18:17 ` Kevin Traynor
@ 2024-12-04 11:33 ` Kevin Traynor
0 siblings, 0 replies; 6+ messages in thread
From: Kevin Traynor @ 2024-12-04 11:33 UTC (permalink / raw)
To: Alexander Kozyrev, stable; +Cc: Slava Ovsiienko
On 03/12/2024 19:17, Kevin Traynor wrote:
> On 03/12/2024 19:08, Alexander Kozyrev wrote:
>>> On 03/12/2024 18:43, Kevin Traynor wrote:
>>>> On 27/11/2024 18:48, Alexander Kozyrev wrote:
>>>>> [ upstream commit 3cddeba0ca38b00c7dc646277484d08a4cb2d862 ]
>>>>>
>>>>> The completion queue element size can be independently configured
>>>>> to report either 64 or 128 bytes CQEs by programming cqe_sz parameter
>>>>> at CQ creation. This parameter depends on the cache line size and
>>>>> affects both regular CQEs and error CQEs. But the error handling
>>>>> assumes that an error CQE is 64 bytes and doesn't take the padding
>>>>> into consideration on platforms with 128-byte cache lines.
>>>>> Fix the error CQE size in all error handling routines in mlx5.
>>>>>
>>>> Hi Alexander,
>>>>
>>>> This is causing a build error [1] on windows in the crypto code [2].
>>>>
>>>> ref
>>>> https://dpdkdashboard.iol.unh.edu/results/dashboard/testruns/logs/1475543/
>>>>
>>>> I didn't spend time to analyse. If you have a fix I can apply, or else
>>>> we can drop the patch.
>>>>
>>>> thanks,
>>>> Kevin.
>>>>
>>>> [1]
>>>> ../drivers/crypto/mlx5/mlx5_crypto.c
>>>> ../drivers/crypto/mlx5/mlx5_crypto.c:480:52: error: incomplete
>>>> definition of type 'struct mlx5_err_cqe'
>>>> DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
>>>> ~~~^
>>>> [2]
>>>> mlx5_crypto_cqe_err_handle(struct mlx5_crypto_qp *qp, struct
>>>> rte_crypto_op *op)
>>>> {
>>>> const uint32_t idx = qp->ci & (qp->entries_n - 1);
>>>> volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
>>>> &qp->cq_obj.cqes[idx];
>>>>
>>>> op->status = RTE_CRYPTO_OP_STATUS_ERROR;
>>>> qp->stats.dequeue_err_count++;
>>>> DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
>>>> }
>>>>
>>>
>>> btw, the code that was run is ahead of the 21.11 branch.
>>>
>>> It is at: https://dpdk.org/git/dpdk-stable on branch 21.11-staging
>>
>> Sorry, somehow missed the crypto driver on 21.11. Attaching the proper patch.
>>
>>
>
> Thanks for the quick fix! I've pushed to the 21.11-staging branch and
> the ci will be re-run overnight.
>
CI passed. Thanks for rebasing. Applied and pushed to 21.11 branch.
Kevin.
> thanks,
> Kevin.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-12-04 11:33 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-11-27 17:48 [PATCH 21.11] common/mlx5: fix error CQE handling for 128 bytes CQE Alexander Kozyrev
2024-12-03 17:43 ` Kevin Traynor
2024-12-03 17:54 ` Kevin Traynor
2024-12-03 18:08 ` Alexander Kozyrev
2024-12-03 18:17 ` Kevin Traynor
2024-12-04 11:33 ` Kevin Traynor
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).