* [PATCH 3/5] net/mlx5: support enhanced CQE compression in Rx burst
2023-02-28 16:43 [PATCH 0/5] net/mlx5: enhanced CQE compression layout Alexander Kozyrev
2023-02-28 16:43 ` [PATCH 1/5] common/mlx5: detect enhanced CQE compression capability Alexander Kozyrev
2023-02-28 16:43 ` [PATCH 2/5] common/mlx5: add CQE validity iteration count Alexander Kozyrev
@ 2023-02-28 16:43 ` Alexander Kozyrev
2023-03-06 13:01 ` Slava Ovsiienko
2023-02-28 16:43 ` [PATCH 4/5] net/mlx5: support enhanced CQE zipping in vector " Alexander Kozyrev
` (2 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Alexander Kozyrev @ 2023-02-28 16:43 UTC (permalink / raw)
To: dev; +Cc: rasland, viacheslavo, matan
net/mlx5: support enhanced CQE compression
Enhanced CQE compression changes the structure of the compression block
and the number of miniCQEs per miniCQE array. Adapt to these changes in
the datapath by defining a new parsing mechanism of a miniCQE array:
1. The title CQE is no longer marked as the compressed one.
Need to copy it for the future miniCQE arrays parsing.
2. Mini CQE arrays now consist of up to 7 miniCQEs and a control block.
The control block contains the number of miniCQEs in the array
as well as an indication that this CQE is compressed.
3. The invalidation of reserved CQEs between miniCQE arrays is not needed.
4. The owner_bit is replaced by the validity_iteration_count for all CQEs.
Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
---
drivers/net/mlx5/mlx5_rx.c | 175 +++++++++++++++++++++++-------------
drivers/net/mlx5/mlx5_rx.h | 12 +--
drivers/net/mlx5/mlx5_rxq.c | 5 +-
3 files changed, 123 insertions(+), 69 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
index 99a08ef5f1..d2eb732cf1 100644
--- a/drivers/net/mlx5/mlx5_rx.c
+++ b/drivers/net/mlx5/mlx5_rx.c
@@ -39,7 +39,8 @@ rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
static __rte_always_inline int
mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
- uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe,
+ uint16_t cqe_n, uint16_t cqe_mask,
+ volatile struct mlx5_mini_cqe8 **mcqe,
uint16_t *skip_cnt, bool mprq);
static __rte_always_inline uint32_t
@@ -297,15 +298,22 @@ int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
const unsigned int cqe_num = 1 << rxq->cqe_n;
const unsigned int cqe_mask = cqe_num - 1;
const uint16_t idx = rxq->cq_ci & cqe_num;
+ const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
if (unlikely(rxq->cqes == NULL)) {
rte_errno = EINVAL;
return -rte_errno;
}
- pmc->addr = &cqe->op_own;
- pmc->opaque[CLB_VAL_IDX] = !!idx;
- pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK;
+ if (rxq->cqe_comp_layout) {
+ pmc->addr = &cqe->validity_iteration_count;
+ pmc->opaque[CLB_VAL_IDX] = vic;
+ pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_VIC_INIT;
+ } else {
+ pmc->addr = &cqe->op_own;
+ pmc->opaque[CLB_VAL_IDX] = !!idx;
+ pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK;
+ }
pmc->fn = mlx5_monitor_callback;
pmc->size = sizeof(uint8_t);
return 0;
@@ -593,6 +601,10 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec,
* Pointer to RX queue.
* @param cqe
* CQE to process.
+ * @param cqe_n
+ * Completion queue count.
+ * @param cqe_mask
+ * Completion queue mask.
* @param[out] mcqe
* Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
* written.
@@ -608,13 +620,13 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec,
*/
static inline int
mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
- uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe,
+ uint16_t cqe_n, uint16_t cqe_mask,
+ volatile struct mlx5_mini_cqe8 **mcqe,
uint16_t *skip_cnt, bool mprq)
{
struct rxq_zip *zip = &rxq->zip;
- uint16_t cqe_n = cqe_cnt + 1;
int len = 0, ret = 0;
- uint16_t idx, end;
+ uint32_t idx, end;
do {
len = 0;
@@ -623,39 +635,47 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
(uintptr_t)(&(*rxq->cqes)[zip->ca &
- cqe_cnt].pkt_info);
+ cqe_mask].pkt_info);
len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
- rxq->byte_mask);
+ rxq->byte_mask);
*mcqe = &(*mc)[zip->ai & 7];
- if ((++zip->ai & 7) == 0) {
- /* Invalidate consumed CQEs */
- idx = zip->ca;
- end = zip->na;
- while (idx != end) {
- (*rxq->cqes)[idx & cqe_cnt].op_own =
- MLX5_CQE_INVALIDATE;
- ++idx;
+ if (rxq->cqe_comp_layout) {
+ zip->ai++;
+ if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+ rxq->cq_ci = zip->cq_ci;
+ zip->ai = 0;
}
- /*
- * Increment consumer index to skip the number
- * of CQEs consumed. Hardware leaves holes in
- * the CQ ring for software use.
- */
- zip->ca = zip->na;
- zip->na += 8;
- }
- if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
- /* Invalidate the rest */
- idx = zip->ca;
- end = zip->cq_ci;
-
- while (idx != end) {
- (*rxq->cqes)[idx & cqe_cnt].op_own =
- MLX5_CQE_INVALIDATE;
- ++idx;
+ } else {
+ if ((++zip->ai & 7) == 0) {
+ /* Invalidate consumed CQEs */
+ idx = zip->ca;
+ end = zip->na;
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_mask].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ /*
+ * Increment consumer index to skip the number
+ * of CQEs consumed. Hardware leaves holes in
+ * the CQ ring for software use.
+ */
+ zip->ca = zip->na;
+ zip->na += 8;
+ }
+ if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+ /* Invalidate the rest */
+ idx = zip->ca;
+ end = zip->cq_ci;
+
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_mask].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ rxq->cq_ci = zip->cq_ci;
+ zip->ai = 0;
}
- rxq->cq_ci = zip->cq_ci;
- zip->ai = 0;
}
/*
* No compressed data, get next CQE and verify if it is
@@ -665,7 +685,9 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
int8_t op_own;
uint32_t cq_ci;
- ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
+ ret = (rxq->cqe_comp_layout) ?
+ check_cqe_iteration(cqe, rxq->cqe_n, rxq->cq_ci) :
+ check_cqe(cqe, cqe_n, rxq->cq_ci);
if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
rxq->err_state)) {
@@ -685,16 +707,18 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
* actual CQE boundary (not pointing to the middle
* of compressed CQE session).
*/
- cq_ci = rxq->cq_ci + 1;
+ cq_ci = rxq->cq_ci + !rxq->cqe_comp_layout;
op_own = cqe->op_own;
if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
(uintptr_t)(&(*rxq->cqes)
- [cq_ci & cqe_cnt].pkt_info);
+ [cq_ci & cqe_mask].pkt_info);
/* Fix endianness. */
- zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
+ zip->cqe_cnt = rxq->cqe_comp_layout ?
+ (MLX5_CQE_NUM_MINIS(op_own) + 1U) :
+ rte_be_to_cpu_32(cqe->byte_cnt);
/*
* Current mini array position is the one
* returned by check_cqe64().
@@ -703,27 +727,44 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
* as a special case the second one is located
* 7 CQEs after the initial CQE instead of 8
* for subsequent ones.
- */
+ */
zip->ca = cq_ci;
zip->na = zip->ca + 7;
/* Compute the next non compressed CQE. */
zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
/* Get packet size to return. */
len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
- rxq->byte_mask);
+ rxq->byte_mask);
*mcqe = &(*mc)[0];
- zip->ai = 1;
- /* Prefetch all to be invalidated */
- idx = zip->ca;
- end = zip->cq_ci;
- while (idx != end) {
- rte_prefetch0(&(*rxq->cqes)[(idx) &
- cqe_cnt]);
- ++idx;
+ if (rxq->cqe_comp_layout) {
+ if (MLX5_CQE_NUM_MINIS(op_own))
+ zip->ai = 1;
+ else
+ rxq->cq_ci = zip->cq_ci;
+ } else {
+ zip->ai = 1;
+ /* Prefetch all to be invalidated */
+ idx = zip->ca;
+ end = zip->cq_ci;
+ while (idx != end) {
+ rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_mask]);
+ ++idx;
+ }
}
} else {
- rxq->cq_ci = cq_ci;
+ ++rxq->cq_ci;
len = rte_be_to_cpu_32(cqe->byte_cnt);
+ if (rxq->cqe_comp_layout) {
+ volatile struct mlx5_cqe *next;
+
+ next = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
+ ret = check_cqe_iteration(next, rxq->cqe_n, rxq->cq_ci);
+ if (ret != MLX5_CQE_STATUS_SW_OWN ||
+ MLX5_CQE_FORMAT(next->op_own) == MLX5_COMPRESSED)
+ rte_memcpy(&rxq->title_cqe,
+ (const void *)(uintptr_t)cqe,
+ sizeof(struct mlx5_cqe));
+ }
}
}
if (unlikely(rxq->err_state)) {
@@ -732,7 +773,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
return len & MLX5_ERROR_CQE_MASK;
}
- cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
++rxq->stats.idropped;
(*skip_cnt) += mprq ? (len & MLX5_MPRQ_STRIDE_NUM_MASK) >>
MLX5_MPRQ_STRIDE_NUM_SHIFT : 1;
@@ -875,20 +916,22 @@ uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct mlx5_rxq_data *rxq = dpdk_rxq;
- const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
- const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
+ const uint32_t wqe_n = 1 << rxq->elts_n;
+ const uint32_t wqe_mask = wqe_n - 1;
+ const uint32_t cqe_n = 1 << rxq->cqe_n;
+ const uint32_t cqe_mask = cqe_n - 1;
const unsigned int sges_n = rxq->sges_n;
struct rte_mbuf *pkt = NULL;
struct rte_mbuf *seg = NULL;
volatile struct mlx5_cqe *cqe =
- &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
unsigned int i = 0;
unsigned int rq_ci = rxq->rq_ci << sges_n;
int len = 0; /* keep its value across iterations. */
while (pkts_n) {
uint16_t skip_cnt;
- unsigned int idx = rq_ci & wqe_cnt;
+ unsigned int idx = rq_ci & wqe_mask;
volatile struct mlx5_wqe_data_seg *wqe =
&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
struct rte_mbuf *rep = (*rxq->elts)[idx];
@@ -925,8 +968,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
break;
}
if (!pkt) {
- cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
- len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe, &skip_cnt, false);
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_n, cqe_mask, &mcqe, &skip_cnt, false);
if (unlikely(len & MLX5_ERROR_CQE_MASK)) {
if (len == MLX5_CRITICAL_ERROR_CQE_RET) {
rte_mbuf_raw_free(rep);
@@ -936,10 +979,10 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
rq_ci >>= sges_n;
rq_ci += skip_cnt;
rq_ci <<= sges_n;
- idx = rq_ci & wqe_cnt;
+ idx = rq_ci & wqe_mask;
wqe = &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
seg = (*rxq->elts)[idx];
- cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
len = len & ~MLX5_ERROR_CQE_MASK;
}
if (len == 0) {
@@ -949,6 +992,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
pkt = seg;
MLX5_ASSERT(len >= (rxq->crc_present << 2));
pkt->ol_flags &= RTE_MBUF_F_EXTERNAL;
+ if (rxq->cqe_comp_layout && mcqe)
+ cqe = &rxq->title_cqe;
rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
if (rxq->crc_present)
len -= RTE_ETHER_CRC_LEN;
@@ -1138,8 +1183,10 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct mlx5_rxq_data *rxq = dpdk_rxq;
const uint32_t strd_n = RTE_BIT32(rxq->log_strd_num);
const uint32_t strd_sz = RTE_BIT32(rxq->log_strd_sz);
- const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
- const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
+ const uint32_t cqe_n = 1 << rxq->cqe_n;
+ const uint32_t cq_mask = cqe_n - 1;
+ const uint32_t wqe_n = 1 << rxq->elts_n;
+ const uint32_t wq_mask = wqe_n - 1;
volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
unsigned int i = 0;
uint32_t rq_ci = rxq->rq_ci;
@@ -1166,7 +1213,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
}
cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
- ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe, &skip_cnt, true);
+ ret = mlx5_rx_poll_len(rxq, cqe, cqe_n, cq_mask, &mcqe, &skip_cnt, true);
if (unlikely(ret & MLX5_ERROR_CQE_MASK)) {
if (ret == MLX5_CRITICAL_ERROR_CQE_RET) {
rq_ci = rxq->rq_ci;
@@ -1201,6 +1248,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
consumed_strd += strd_cnt;
if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
continue;
+ if (rxq->cqe_comp_layout && mcqe)
+ cqe = &rxq->title_cqe;
strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
cqe->wqe_counter :
mcqe->stride_idx);
diff --git a/drivers/net/mlx5/mlx5_rx.h b/drivers/net/mlx5/mlx5_rx.h
index 6b42e27c89..143685c6ab 100644
--- a/drivers/net/mlx5/mlx5_rx.h
+++ b/drivers/net/mlx5/mlx5_rx.h
@@ -41,11 +41,11 @@ struct mlx5_rxq_stats {
/* Compressed CQE context. */
struct rxq_zip {
+ uint16_t cqe_cnt; /* Number of CQEs. */
uint16_t ai; /* Array index. */
- uint16_t ca; /* Current array index. */
- uint16_t na; /* Next array index. */
- uint16_t cq_ci; /* The next CQE. */
- uint32_t cqe_cnt; /* Number of CQEs. */
+ uint32_t ca; /* Current array index. */
+ uint32_t na; /* Next array index. */
+ uint32_t cq_ci; /* The next CQE. */
};
/* Get pointer to the first stride. */
@@ -100,6 +100,8 @@ struct mlx5_rxq_data {
unsigned int mcqe_format:3; /* CQE compression format. */
unsigned int shared:1; /* Shared RXQ. */
unsigned int delay_drop:1; /* Enable delay drop. */
+ unsigned int cqe_comp_layout:1; /* CQE Compression Layout*/
+ unsigned int cq_ci:24;
volatile uint32_t *rq_db;
volatile uint32_t *cq_db;
uint16_t port_id;
@@ -107,7 +109,6 @@ struct mlx5_rxq_data {
uint32_t rq_ci;
uint16_t consumed_strd; /* Number of consumed strides in WQE. */
uint32_t rq_pi;
- uint32_t cq_ci;
uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */
uint32_t byte_mask;
union {
@@ -119,6 +120,7 @@ struct mlx5_rxq_data {
uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
volatile void *wqes;
volatile struct mlx5_cqe(*cqes)[];
+ struct mlx5_cqe title_cqe; /* Title CQE for CQE compression. */
struct rte_mbuf *(*elts)[];
struct mlx5_mprq_buf *(*mprq_bufs)[];
struct rte_mempool *mp;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 81aa3f074a..6e99c4dde4 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -444,12 +444,15 @@ rxq_sync_cq(struct mlx5_rxq_data *rxq)
continue;
}
/* Compute the next non compressed CQE. */
- rxq->cq_ci += rte_be_to_cpu_32(cqe->byte_cnt);
+ rxq->cq_ci += rxq->cqe_comp_layout ?
+ (MLX5_CQE_NUM_MINIS(cqe->op_own) + 1U) :
+ rte_be_to_cpu_32(cqe->byte_cnt);
} while (--i);
/* Move all CQEs to HW ownership, including possible MiniCQEs. */
for (i = 0; i < cqe_n; i++) {
cqe = &(*rxq->cqes)[i];
+ cqe->validity_iteration_count = MLX5_CQE_VIC_INIT;
cqe->op_own = MLX5_CQE_INVALIDATE;
}
/* Resync CQE and WQE (WQ in RESET state). */
--
2.18.2
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 4/5] net/mlx5: support enhanced CQE zipping in vector Rx burst
2023-02-28 16:43 [PATCH 0/5] net/mlx5: enhanced CQE compression layout Alexander Kozyrev
` (2 preceding siblings ...)
2023-02-28 16:43 ` [PATCH 3/5] net/mlx5: support enhanced CQE compression in Rx burst Alexander Kozyrev
@ 2023-02-28 16:43 ` Alexander Kozyrev
2023-03-06 13:13 ` Slava Ovsiienko
2023-02-28 16:43 ` [PATCH 5/5] net/mlx5: enable enhanced CQE compression Alexander Kozyrev
2023-03-07 9:03 ` [PATCH 0/5] net/mlx5: enhanced CQE compression layout Raslan Darawsheh
5 siblings, 1 reply; 12+ messages in thread
From: Alexander Kozyrev @ 2023-02-28 16:43 UTC (permalink / raw)
To: dev; +Cc: rasland, viacheslavo, matan
Add Enhanced CQE compression support to vectorized Rx burst routines.
Adopt the same algorithm as scalar Rx burst routines have today.
1. Retrieve the validity_iteration_count from CQEs and use it
to check if the CQE is ready to be processed instead of the owner_bit.
2. Do not invalidate reserved CQEs between miniCQE arrays.
3. Copy the title packet from the last processed uncompressed CQE
since we will need it later to build packets from zipped CQEs.
4. Skip the regular CQE processing and go straight to the CQE unzip
function in case the very first CQE is compressed to save CPU time.
Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
---
drivers/net/mlx5/mlx5_rx.h | 1 +
drivers/net/mlx5/mlx5_rxtx_vec.c | 24 ++++-
drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 108 ++++++++++++++++-------
drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 91 +++++++++++++------
drivers/net/mlx5/mlx5_rxtx_vec_sse.h | 94 ++++++++++++++------
5 files changed, 232 insertions(+), 86 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rx.h b/drivers/net/mlx5/mlx5_rx.h
index 143685c6ab..8b87adad36 100644
--- a/drivers/net/mlx5/mlx5_rx.h
+++ b/drivers/net/mlx5/mlx5_rx.h
@@ -122,6 +122,7 @@ struct mlx5_rxq_data {
volatile struct mlx5_cqe(*cqes)[];
struct mlx5_cqe title_cqe; /* Title CQE for CQE compression. */
struct rte_mbuf *(*elts)[];
+ struct rte_mbuf title_pkt; /* Title packet for CQE compression. */
struct mlx5_mprq_buf *(*mprq_bufs)[];
struct rte_mempool *mp;
struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 667475a93e..2363d7ed27 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -290,13 +290,14 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
const uint16_t q_mask = q_n - 1;
const uint16_t e_n = 1 << rxq->elts_n;
const uint16_t e_mask = e_n - 1;
- volatile struct mlx5_cqe *cq;
+ volatile struct mlx5_cqe *cq, *next;
struct rte_mbuf **elts;
uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
uint16_t nocmp_n = 0;
uint16_t rcvd_pkt = 0;
unsigned int cq_idx = rxq->cq_ci & q_mask;
unsigned int elts_idx;
+ int ret;
MLX5_ASSERT(rxq->sges_n == 0);
MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
@@ -342,6 +343,15 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
rxq->cq_ci += nocmp_n;
rxq->rq_pi += nocmp_n;
rcvd_pkt += nocmp_n;
+ /* Copy title packet for future compressed sessions. */
+ if (rxq->cqe_comp_layout) {
+ next = &(*rxq->cqes)[rxq->cq_ci & q_mask];
+ ret = check_cqe_iteration(next, rxq->cqe_n, rxq->cq_ci);
+ if (ret != MLX5_CQE_STATUS_SW_OWN ||
+ MLX5_CQE_FORMAT(next->op_own) == MLX5_COMPRESSED)
+ rte_memcpy(&rxq->title_pkt, elts[nocmp_n - 1],
+ sizeof(struct rte_mbuf));
+ }
/* Decompress the last CQE if compressed. */
if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {
MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
@@ -431,7 +441,7 @@ rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
const uint32_t strd_n = RTE_BIT32(rxq->log_strd_num);
const uint32_t elts_n = wqe_n * strd_n;
const uint32_t elts_mask = elts_n - 1;
- volatile struct mlx5_cqe *cq;
+ volatile struct mlx5_cqe *cq, *next;
struct rte_mbuf **elts;
uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
uint16_t nocmp_n = 0;
@@ -439,6 +449,7 @@ rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
uint16_t cp_pkt = 0;
unsigned int cq_idx = rxq->cq_ci & q_mask;
unsigned int elts_idx;
+ int ret;
MLX5_ASSERT(rxq->sges_n == 0);
cq = &(*rxq->cqes)[cq_idx];
@@ -482,6 +493,15 @@ rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
MLX5_ASSERT(nocmp_n <= pkts_n);
cp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n);
rcvd_pkt += cp_pkt;
+ /* Copy title packet for future compressed sessions. */
+ if (rxq->cqe_comp_layout) {
+ next = &(*rxq->cqes)[rxq->cq_ci & q_mask];
+ ret = check_cqe_iteration(next, rxq->cqe_n, rxq->cq_ci);
+ if (ret != MLX5_CQE_STATUS_SW_OWN ||
+ MLX5_CQE_FORMAT(next->op_own) == MLX5_COMPRESSED)
+ rte_memcpy(&rxq->title_pkt, elts[nocmp_n - 1],
+ sizeof(struct rte_mbuf));
+ }
/* Decompress the last CQE if compressed. */
if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {
MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index 204d17a8f2..14ffff26f4 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -76,8 +76,10 @@ static inline uint16_t
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
struct rte_mbuf **elts)
{
- volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info;
- struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+ volatile struct mlx5_mini_cqe8 *mcq =
+ (void *)&(cq + !rxq->cqe_comp_layout)->pkt_info;
+ /* Title packet is pre-built. */
+ struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
const __vector unsigned char zero = (__vector unsigned char){0};
/* Mask to shuffle from extracted mini CQE to mbuf. */
const __vector unsigned char shuf_mask1 = (__vector unsigned char){
@@ -93,8 +95,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
-1, -1, /* skip vlan_tci */
11, 10, 9, 8}; /* bswap32, rss */
/* Restore the compressed count. Must be 16 bits. */
- const uint16_t mcqe_n = t_pkt->data_len +
- (rxq->crc_present * RTE_ETHER_CRC_LEN);
+ uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
+ (MLX5_CQE_NUM_MINIS(cq->op_own) + 1) :
+ t_pkt->data_len + (rxq->crc_present * RTE_ETHER_CRC_LEN);
+ uint16_t pkts_n = mcqe_n;
const __vector unsigned char rearm =
(__vector unsigned char)vec_vsx_ld(0,
(signed int const *)&t_pkt->rearm_data);
@@ -132,6 +136,9 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
* D. store rx_descriptor_fields1.
* E. store flow tag (rte_flow mark).
*/
+cycle:
+ if (rxq->cqe_comp_layout)
+ rte_prefetch0((void *)(cq + mcqe_n));
for (pos = 0; pos < mcqe_n; ) {
__vector unsigned char mcqe1, mcqe2;
__vector unsigned char rxdf1, rxdf2;
@@ -154,9 +161,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
const __vector unsigned long shmax = {64, 64};
#endif
- for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
- if (likely(pos + i < mcqe_n))
- rte_prefetch0((void *)(cq + pos + i));
+ if (!rxq->cqe_comp_layout)
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ if (likely(pos + i < mcqe_n))
+ rte_prefetch0((void *)(cq + pos + i));
/* A.1 load mCQEs into a 128bit register. */
mcqe1 = (__vector unsigned char)vec_vsx_ld(0,
(signed int const *)&mcq[pos % 8]);
@@ -488,25 +496,43 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
pos += MLX5_VPMD_DESCS_PER_LOOP;
/* Move to next CQE and invalidate consumed CQEs. */
- if (!(pos & 0x7) && pos < mcqe_n) {
- if (pos + 8 < mcqe_n)
- rte_prefetch0((void *)(cq + pos + 8));
- mcq = (void *)&(cq + pos)->pkt_info;
- for (i = 0; i < 8; ++i)
- cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ if (!rxq->cqe_comp_layout) {
+ if (!(pos & 0x7) && pos < mcqe_n) {
+ if (pos + 8 < mcqe_n)
+ rte_prefetch0((void *)(cq + pos + 8));
+ mcq = (void *)&(cq + pos)->pkt_info;
+ for (i = 0; i < 8; ++i)
+ cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ }
}
}
- /* Invalidate the rest of CQEs. */
- for (; inv < mcqe_n; ++inv)
- cq[inv].op_own = MLX5_CQE_INVALIDATE;
+ if (rxq->cqe_comp_layout) {
+ int ret;
+ /* Keep unzipping if the next CQE is the miniCQE array. */
+ cq = &cq[mcqe_n];
+ ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
+ if (ret == MLX5_CQE_STATUS_SW_OWN &&
+ MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
+ pos = 0;
+ elts = &elts[mcqe_n];
+ mcq = (void *)cq;
+ mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
+ pkts_n += mcqe_n;
+ goto cycle;
+ }
+ } else {
+ /* Invalidate the rest of CQEs. */
+ for (; inv < pkts_n; ++inv)
+ cq[inv].op_own = MLX5_CQE_INVALIDATE;
+ }
#ifdef MLX5_PMD_SOFT_COUNTERS
- rxq->stats.ipackets += mcqe_n;
+ rxq->stats.ipackets += pkts_n;
rxq->stats.ibytes += rcvd_byte;
#endif
- return mcqe_n;
+ return pkts_n;
}
/**
@@ -787,9 +813,13 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
uint64_t n = 0;
uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
uint16_t nocmp_n = 0;
- unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+ const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
+ unsigned int own = !(rxq->cq_ci & (q_mask + 1));
const __vector unsigned char zero = (__vector unsigned char){0};
const __vector unsigned char ones = vec_splat_u8(-1);
+ const __vector unsigned char vic_check =
+ (__vector unsigned char)(__vector unsigned long){
+ 0x00ff000000ff0000LL, 0x00ff000000ff0000LL};
const __vector unsigned char owner_check =
(__vector unsigned char)(__vector unsigned long){
0x0100000001000000LL, 0x0100000001000000LL};
@@ -837,7 +867,16 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
(__vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
const __vector unsigned short cqe_sel_mask2 =
(__vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0};
-
+ const __vector unsigned char validity = (__vector unsigned char){
+ 0, 0, vic, 0,
+ 0, 0, vic, 0,
+ 0, 0, vic, 0,
+ 0, 0, vic, 0};
+ const __vector unsigned char ownership = (__vector unsigned char){
+ 0, 0, 0, own,
+ 0, 0, 0, own,
+ 0, 0, 0, own,
+ 0, 0, 0, own};
/*
* A. load first Qword (8bytes) in one loop.
* B. copy 4 mbuf pointers from elts ring to returning pkts.
@@ -848,7 +887,7 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
* uint8_t pkt_info;
* uint8_t flow_tag[3];
* uint16_t byte_cnt;
- * uint8_t rsvd4;
+ * uint8_t validity_iteration_count;
* uint8_t op_own;
* uint16_t hdr_type_etc;
* uint16_t vlan_info;
@@ -1082,17 +1121,25 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
*(__vector unsigned char *)
&pkts[pos]->pkt_len = pkt_mb0;
- /* E.2 flip owner bit to mark CQEs from last round. */
- owner_mask = (__vector unsigned char)
- vec_and((__vector unsigned long)op_own,
- (__vector unsigned long)owner_check);
- if (ownership)
+ /* E.2 mask out CQEs belonging to HW. */
+ if (rxq->cqe_comp_layout) {
+ owner_mask = (__vector unsigned char)
+ vec_and((__vector unsigned long)op_own,
+ (__vector unsigned long)vic_check);
+ owner_mask = (__vector unsigned char)
+ vec_cmpeq((__vector unsigned int)owner_mask,
+ (__vector unsigned int)validity);
owner_mask = (__vector unsigned char)
vec_xor((__vector unsigned long)owner_mask,
+ (__vector unsigned long)ones);
+ } else {
+ owner_mask = (__vector unsigned char)
+ vec_and((__vector unsigned long)op_own,
(__vector unsigned long)owner_check);
- owner_mask = (__vector unsigned char)
- vec_cmpeq((__vector unsigned int)owner_mask,
- (__vector unsigned int)owner_check);
+ owner_mask = (__vector unsigned char)
+ vec_cmpeq((__vector unsigned int)owner_mask,
+ (__vector unsigned int)ownership);
+ }
owner_mask = (__vector unsigned char)
vec_packs((__vector unsigned int)owner_mask,
(__vector unsigned int)zero);
@@ -1174,7 +1221,8 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
(__vector unsigned long)mask);
/* D.3 check error in opcode. */
- adj = (comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
+ adj = (!rxq->cqe_comp_layout &&
+ comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
mask = (__vector unsigned char)(__vector unsigned long){
(adj * sizeof(uint16_t) * 8), 0};
lshift = vec_splat((__vector unsigned long)mask, 0);
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 41b9cf5444..75e8ed7e5a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -71,8 +71,10 @@ static inline uint16_t
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
struct rte_mbuf **elts)
{
- volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info;
- struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+ volatile struct mlx5_mini_cqe8 *mcq =
+ (void *)&(cq + !rxq->cqe_comp_layout)->pkt_info;
+ /* Title packet is pre-built. */
+ struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
unsigned int pos;
unsigned int i;
unsigned int inv = 0;
@@ -92,8 +94,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
11, 10, 9, 8 /* hash.rss, bswap32 */
};
/* Restore the compressed count. Must be 16 bits. */
- const uint16_t mcqe_n = t_pkt->data_len +
- (rxq->crc_present * RTE_ETHER_CRC_LEN);
+ uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
+ (MLX5_CQE_NUM_MINIS(cq->op_own) + 1) :
+ t_pkt->data_len + (rxq->crc_present * RTE_ETHER_CRC_LEN);
+ uint16_t pkts_n = mcqe_n;
const uint64x2_t rearm =
vld1q_u64((void *)&t_pkt->rearm_data);
const uint32x4_t rxdf_mask = {
@@ -131,6 +135,9 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
* D. store rx_descriptor_fields1.
* E. store flow tag (rte_flow mark).
*/
+cycle:
+ if (rxq->cqe_comp_layout)
+ rte_prefetch0((void *)(cq + mcqe_n));
for (pos = 0; pos < mcqe_n; ) {
uint8_t *p = (void *)&mcq[pos % 8];
uint8_t *e0 = (void *)&elts[pos]->rearm_data;
@@ -145,9 +152,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
sizeof(uint16_t) * 8) : 0);
#endif
- for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
- if (likely(pos + i < mcqe_n))
- rte_prefetch0((void *)(cq + pos + i));
+ if (!rxq->cqe_comp_layout)
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ if (likely(pos + i < mcqe_n))
+ rte_prefetch0((void *)(cq + pos + i));
__asm__ volatile (
/* A.1 load mCQEs into a 128bit register. */
"ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t"
@@ -354,22 +362,40 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
}
pos += MLX5_VPMD_DESCS_PER_LOOP;
/* Move to next CQE and invalidate consumed CQEs. */
- if (!(pos & 0x7) && pos < mcqe_n) {
- if (pos + 8 < mcqe_n)
- rte_prefetch0((void *)(cq + pos + 8));
- mcq = (void *)&(cq + pos)->pkt_info;
- for (i = 0; i < 8; ++i)
- cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ if (!rxq->cqe_comp_layout) {
+ if (!(pos & 0x7) && pos < mcqe_n) {
+ if (pos + 8 < mcqe_n)
+ rte_prefetch0((void *)(cq + pos + 8));
+ mcq = (void *)&(cq + pos)->pkt_info;
+ for (i = 0; i < 8; ++i)
+ cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ }
+ }
+ }
+ if (rxq->cqe_comp_layout) {
+ int ret;
+ /* Keep unzipping if the next CQE is the miniCQE array. */
+ cq = &cq[mcqe_n];
+ ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
+ if (ret == MLX5_CQE_STATUS_SW_OWN &&
+ MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
+ pos = 0;
+ elts = &elts[mcqe_n];
+ mcq = (void *)cq;
+ mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
+ pkts_n += mcqe_n;
+ goto cycle;
}
+ } else {
+ /* Invalidate the rest of CQEs. */
+ for (; inv < pkts_n; ++inv)
+ cq[inv].op_own = MLX5_CQE_INVALIDATE;
}
- /* Invalidate the rest of CQEs. */
- for (; inv < mcqe_n; ++inv)
- cq[inv].op_own = MLX5_CQE_INVALIDATE;
#ifdef MLX5_PMD_SOFT_COUNTERS
- rxq->stats.ipackets += mcqe_n;
+ rxq->stats.ipackets += pkts_n;
rxq->stats.ibytes += rcvd_byte;
#endif
- return mcqe_n;
+ return pkts_n;
}
/**
@@ -528,7 +554,9 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
uint64_t n = 0;
uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
uint16_t nocmp_n = 0;
+ const uint16x4_t validity = vdup_n_u16((rxq->cq_ci >> rxq->cqe_n) << 8);
const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));
+ const uint16x4_t vic_check = vcreate_u16(0xff00ff00ff00ff00);
const uint16x4_t owner_check = vcreate_u16(0x0001000100010001);
const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);
const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c);
@@ -547,7 +575,7 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
const uint8x16_t cqe_shuf_m = {
28, 29, /* hdr_type_etc */
0, /* pkt_info */
- -1, /* null */
+ 62, /* validity_iteration_count */
47, 46, /* byte_cnt, bswap16 */
31, 30, /* vlan_info, bswap16 */
15, 14, 13, 12, /* rx_hash_res, bswap32 */
@@ -564,10 +592,10 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
};
/* Mask to generate 16B owner vector. */
const uint8x8_t owner_shuf_m = {
- 63, -1, /* 4th CQE */
- 47, -1, /* 3rd CQE */
- 31, -1, /* 2nd CQE */
- 15, -1 /* 1st CQE */
+ 63, 51, /* 4th CQE */
+ 47, 35, /* 3rd CQE */
+ 31, 19, /* 2nd CQE */
+ 15, 3 /* 1st CQE */
};
/* Mask to generate a vector having packet_type/ol_flags. */
const uint8x16_t ptype_shuf_m = {
@@ -600,7 +628,7 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
* struct {
* uint16_t hdr_type_etc;
* uint8_t pkt_info;
- * uint8_t rsvd;
+ * uint8_t validity_iteration_count;
* uint16_t byte_cnt;
* uint16_t vlan_info;
* uint32_t rx_has_res;
@@ -748,9 +776,15 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
"v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23",
"v24", "v25");
- /* D.2 flip owner bit to mark CQEs from last round. */
- owner_mask = vand_u16(op_own, owner_check);
- owner_mask = vceq_u16(owner_mask, ownership);
+ /* D.2 mask out CQEs belonging to HW. */
+ if (rxq->cqe_comp_layout) {
+ owner_mask = vand_u16(op_own, vic_check);
+ owner_mask = vceq_u16(owner_mask, validity);
+ owner_mask = vmvn_u16(owner_mask);
+ } else {
+ owner_mask = vand_u16(op_own, owner_check);
+ owner_mask = vceq_u16(owner_mask, ownership);
+ }
/* D.3 get mask for invalidated CQEs. */
opcode = vand_u16(op_own, opcode_check);
invalid_mask = vceq_u16(opcode_check, opcode);
@@ -780,7 +814,8 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
-1UL >> (n * sizeof(uint16_t) * 8) : 0);
invalid_mask = vorr_u16(invalid_mask, mask);
/* D.3 check error in opcode. */
- adj = (comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
+ adj = (!rxq->cqe_comp_layout &&
+ comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
mask = vcreate_u16(adj ?
-1UL >> ((n + 1) * sizeof(uint16_t) * 8) : -1UL);
mini_mask = vand_u16(invalid_mask, mask);
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index ab69af0c55..b282f8b8e6 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -73,8 +73,9 @@ static inline uint16_t
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
struct rte_mbuf **elts)
{
- volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
- struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+ volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + !rxq->cqe_comp_layout);
+ /* Title packet is pre-built. */
+ struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
unsigned int pos;
unsigned int i;
unsigned int inv = 0;
@@ -92,8 +93,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
-1, -1, 14, 15, /* pkt_len, bswap16 */
-1, -1, -1, -1 /* skip packet_type */);
/* Restore the compressed count. Must be 16 bits. */
- const uint16_t mcqe_n = t_pkt->data_len +
- (rxq->crc_present * RTE_ETHER_CRC_LEN);
+ uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
+ (MLX5_CQE_NUM_MINIS(cq->op_own) + 1) :
+ t_pkt->data_len + (rxq->crc_present * RTE_ETHER_CRC_LEN);
+ uint16_t pkts_n = mcqe_n;
const __m128i rearm =
_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
const __m128i rxdf =
@@ -124,6 +127,9 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
* D. store rx_descriptor_fields1.
* E. store flow tag (rte_flow mark).
*/
+cycle:
+ if (rxq->cqe_comp_layout)
+ rte_prefetch0((void *)(cq + mcqe_n));
for (pos = 0; pos < mcqe_n; ) {
__m128i mcqe1, mcqe2;
__m128i rxdf1, rxdf2;
@@ -131,9 +137,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
__m128i byte_cnt, invalid_mask;
#endif
- for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
- if (likely(pos + i < mcqe_n))
- rte_prefetch0((void *)(cq + pos + i));
+ if (!rxq->cqe_comp_layout)
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ if (likely(pos + i < mcqe_n))
+ rte_prefetch0((void *)(cq + pos + i));
/* A.1 load mCQEs into a 128bit register. */
mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
@@ -344,22 +351,40 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
}
pos += MLX5_VPMD_DESCS_PER_LOOP;
/* Move to next CQE and invalidate consumed CQEs. */
- if (!(pos & 0x7) && pos < mcqe_n) {
- if (pos + 8 < mcqe_n)
- rte_prefetch0((void *)(cq + pos + 8));
- mcq = (void *)(cq + pos);
- for (i = 0; i < 8; ++i)
- cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ if (!rxq->cqe_comp_layout) {
+ if (!(pos & 0x7) && pos < mcqe_n) {
+ if (pos + 8 < mcqe_n)
+ rte_prefetch0((void *)(cq + pos + 8));
+ mcq = (void *)(cq + pos);
+ for (i = 0; i < 8; ++i)
+ cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ }
+ }
+ }
+ if (rxq->cqe_comp_layout) {
+ int ret;
+ /* Keep unzipping if the next CQE is the miniCQE array. */
+ cq = &cq[mcqe_n];
+ ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
+ if (ret == MLX5_CQE_STATUS_SW_OWN &&
+ MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
+ pos = 0;
+ elts = &elts[mcqe_n];
+ mcq = (void *)cq;
+ mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
+ pkts_n += mcqe_n;
+ goto cycle;
}
+ } else {
+ /* Invalidate the rest of CQEs. */
+ for (; inv < pkts_n; ++inv)
+ cq[inv].op_own = MLX5_CQE_INVALIDATE;
}
- /* Invalidate the rest of CQEs. */
- for (; inv < mcqe_n; ++inv)
- cq[inv].op_own = MLX5_CQE_INVALIDATE;
#ifdef MLX5_PMD_SOFT_COUNTERS
- rxq->stats.ipackets += mcqe_n;
+ rxq->stats.ipackets += pkts_n;
rxq->stats.ibytes += rcvd_byte;
#endif
- return mcqe_n;
+ return pkts_n;
}
/**
@@ -527,7 +552,9 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
uint64_t n = 0;
uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
uint16_t nocmp_n = 0;
- unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+ const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
+ const uint8_t own = !(rxq->cq_ci & (q_mask + 1));
+ const __m128i vic_check = _mm_set1_epi64x(0x00ff000000ff0000LL);
const __m128i owner_check = _mm_set1_epi64x(0x0100000001000000LL);
const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL);
const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL);
@@ -541,6 +568,16 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
12, 13, 8, 9,
4, 5, 0, 1);
#endif
+ const __m128i validity =
+ _mm_set_epi8(0, vic, 0, 0,
+ 0, vic, 0, 0,
+ 0, vic, 0, 0,
+ 0, vic, 0, 0);
+ const __m128i ownership =
+ _mm_set_epi8(own, 0, 0, 0,
+ own, 0, 0, 0,
+ own, 0, 0, 0,
+ own, 0, 0, 0);
/* Mask to shuffle from extracted CQE to mbuf. */
const __m128i shuf_mask =
_mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */
@@ -573,7 +610,7 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
* uint8_t pkt_info;
* uint8_t flow_tag[3];
* uint16_t byte_cnt;
- * uint8_t rsvd4;
+ * uint8_t validity_iteration_count;
* uint8_t op_own;
* uint16_t hdr_type_etc;
* uint16_t vlan_info;
@@ -689,11 +726,15 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
/* D.1 fill in mbuf - rx_descriptor_fields1. */
_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
- /* E.2 flip owner bit to mark CQEs from last round. */
- owner_mask = _mm_and_si128(op_own, owner_check);
- if (ownership)
- owner_mask = _mm_xor_si128(owner_mask, owner_check);
- owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
+ /* E.2 mask out CQEs belonging to HW. */
+ if (rxq->cqe_comp_layout) {
+ owner_mask = _mm_and_si128(op_own, vic_check);
+ owner_mask = _mm_cmpeq_epi32(owner_mask, validity);
+ owner_mask = _mm_xor_si128(owner_mask, ones);
+ } else {
+ owner_mask = _mm_and_si128(op_own, owner_check);
+ owner_mask = _mm_cmpeq_epi32(owner_mask, ownership);
+ }
owner_mask = _mm_packs_epi32(owner_mask, zero);
/* E.3 get mask for invalidated CQEs. */
opcode = _mm_and_si128(op_own, opcode_check);
@@ -729,7 +770,8 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
mask = _mm_sll_epi64(ones, mask);
invalid_mask = _mm_or_si128(invalid_mask, mask);
/* D.3 check error in opcode. */
- adj = (comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
+ adj = (!rxq->cqe_comp_layout &&
+ comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
mask = _mm_set_epi64x(0, adj * sizeof(uint16_t) * 8);
mini_mask = _mm_sll_epi64(invalid_mask, mask);
opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
--
2.18.2
^ permalink raw reply [flat|nested] 12+ messages in thread