From: Ajit Khaparde <ajit.khaparde@broadcom.com>
To: dev@dpdk.org
Cc: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
Subject: [PATCH v3 18/18] net/bnxt: enable SSE mode for compressed CQE
Date: Tue, 26 Dec 2023 20:21:19 -0800
Message-ID: <20231227042119.72469-19-ajit.khaparde@broadcom.com>
In-Reply-To: <20231227042119.72469-1-ajit.khaparde@broadcom.com>
The P7 device family supports 16-byte Rx completions.
Enable SSE vector mode for compressed Rx CQE processing.
Signed-off-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
---
drivers/net/bnxt/bnxt_ethdev.c | 16 ++-
drivers/net/bnxt/bnxt_rxr.h | 2 +
drivers/net/bnxt/bnxt_rxtx_vec_sse.c | 167 +++++++++++++++++++++++++--
3 files changed, 173 insertions(+), 12 deletions(-)
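
Two illustrative sketches for reviewers; neither is part of the patch.

First, a minimal standalone program (field offsets taken from the
compressed-CQE shuffle mask added below; build with -mssse3) showing how
a single _mm_shuffle_epi8 gathers pkt_len, data_len and the RSS hash
from a 16-byte completion into the mbuf rx_descriptor_fields1 word
order. Note the hash moves from bytes 12-15 of the 32-byte CQE to
bytes 4-7 of the compressed one, which is the only difference between
the two masks:

	#include <stdint.h>
	#include <stdio.h>
	#include <tmmintrin.h>	/* SSSE3: _mm_shuffle_epi8 */

	int main(void)
	{
		/* Fake compressed Rx completion: per the mask in this
		 * patch, bytes 2-3 carry the packet length and bytes
		 * 4-7 the RSS hash.
		 */
		uint8_t cqe[16] = { 0 };

		cqe[2] = 0x40;				/* len = 64 */
		cqe[4] = 0x78; cqe[5] = 0x56;		/* rss hash */
		cqe[6] = 0x34; cqe[7] = 0x12;		/* 0x12345678 */

		const __m128i shuf_msk =
			_mm_set_epi8(7, 6, 5, 4,	/* rss */
				     0xFF, 0xFF,	/* vlan_tci (zeroes) */
				     3, 2,		/* data_len */
				     0xFF, 0xFF, 3, 2,	/* pkt_len */
				     0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type */
		__m128i desc = _mm_loadu_si128((const __m128i *)cqe);
		__m128i fields = _mm_shuffle_epi8(desc, shuf_msk);
		uint32_t w[4];

		_mm_storeu_si128((__m128i *)w, fields);
		/* w[0] = pkt_type, w[1] = pkt_len,
		 * w[2] = data_len | vlan_tci << 16, w[3] = rss
		 */
		printf("pkt_len=%u data_len=%u rss=0x%08x\n",
		       w[1], w[2] & 0xffff, w[3]);
		return 0;
	}

Second, a rough scalar rendering of the valid-bit test that the
XOR/popcount sequence in crx_burst_vec_sse() vectorizes four lanes at a
time (CMPL_BASE_V is the V bit from the HSI header; the helper name is
made up for the sketch):

	/* The producer flips the V bit on every pass around the
	 * completion ring, so an entry is ready when its V bit differs
	 * from the consumer's current phase bit.
	 */
	static inline int crx_cqe_ready(uint32_t info3_v, uint32_t raw_cons,
					uint32_t cp_ring_size)
	{
		uint32_t phase = !!(raw_cons & cp_ring_size);

		return !!(info3_v & CMPL_BASE_V) ^ phase;
	}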
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index bd8c7557dd..f9cd234bb6 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -1377,7 +1377,8 @@ bnxt_receive_function(struct rte_eth_dev *eth_dev)
* asynchronous completions and receive completions can be placed in
* the same completion ring.
*/
- if (BNXT_TRUFLOW_EN(bp) || !BNXT_NUM_ASYNC_CPR(bp))
+ if ((BNXT_TRUFLOW_EN(bp) && !BNXT_CHIP_P7(bp)) ||
+ !BNXT_NUM_ASYNC_CPR(bp))
goto use_scalar_rx;
/*
@@ -1410,12 +1411,19 @@ bnxt_receive_function(struct rte_eth_dev *eth_dev)
return bnxt_crx_pkts_vec_avx2;
return bnxt_recv_pkts_vec_avx2;
}
- #endif
+#endif
if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
PMD_DRV_LOG(INFO,
"Using SSE vector mode receive for port %d\n",
eth_dev->data->port_id);
bp->flags |= BNXT_FLAG_RX_VECTOR_PKT_MODE;
+ if (bnxt_compressed_rx_cqe_mode_enabled(bp)) {
+#if defined(RTE_ARCH_ARM64)
+ goto use_scalar_rx;
+#else
+ return bnxt_crx_pkts_vec;
+#endif
+ }
return bnxt_recv_pkts_vec;
}
@@ -1445,7 +1453,8 @@ bnxt_transmit_function(__rte_unused struct rte_eth_dev *eth_dev)
*/
if (eth_dev->data->scattered_rx ||
(offloads & ~RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) ||
- BNXT_TRUFLOW_EN(bp) || bp->ieee_1588)
+ (BNXT_TRUFLOW_EN(bp) && !BNXT_CHIP_P7(bp)) ||
+ bp->ieee_1588)
goto use_scalar_tx;
#if defined(RTE_ARCH_X86)
@@ -3125,6 +3134,7 @@ static const struct {
} bnxt_rx_burst_info[] = {
{bnxt_recv_pkts, "Scalar"},
#if defined(RTE_ARCH_X86)
+ {bnxt_crx_pkts_vec, "Vector SSE"},
{bnxt_recv_pkts_vec, "Vector SSE"},
#endif
#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
diff --git a/drivers/net/bnxt/bnxt_rxr.h b/drivers/net/bnxt/bnxt_rxr.h
index a474a69ae3..d36cbded1d 100644
--- a/drivers/net/bnxt/bnxt_rxr.h
+++ b/drivers/net/bnxt/bnxt_rxr.h
@@ -156,6 +156,8 @@ int bnxt_flush_rx_cmp(struct bnxt_cp_ring_info *cpr);
#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
uint16_t bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t bnxt_crx_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
int bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq);
#endif
diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c
index e99a547f58..220aa82073 100644
--- a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c
+++ b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c
@@ -54,15 +54,9 @@
static inline void
descs_to_mbufs(__m128i mm_rxcmp[4], __m128i mm_rxcmp1[4],
- __m128i mbuf_init, struct rte_mbuf **mbuf,
- struct bnxt_rx_ring_info *rxr)
+ __m128i mbuf_init, const __m128i shuf_msk,
+ struct rte_mbuf **mbuf, struct bnxt_rx_ring_info *rxr)
{
- const __m128i shuf_msk =
- _mm_set_epi8(15, 14, 13, 12, /* rss */
- 0xFF, 0xFF, /* vlan_tci (zeroes) */
- 3, 2, /* data_len */
- 0xFF, 0xFF, 3, 2, /* pkt_len */
- 0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */
const __m128i flags_type_mask =
_mm_set1_epi32(RX_PKT_CMPL_FLAGS_ITYPE_MASK);
const __m128i flags2_mask1 =
@@ -166,6 +160,12 @@ recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
int nb_rx_pkts = 0;
const __m128i valid_target =
_mm_set1_epi32(!!(raw_cons & cp_ring_size));
+ const __m128i shuf_msk =
+ _mm_set_epi8(15, 14, 13, 12, /* rss */
+ 0xFF, 0xFF, /* vlan_tci (zeroes) */
+ 3, 2, /* data_len */
+ 0xFF, 0xFF, 3, 2, /* pkt_len */
+ 0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */
int i;
/* If Rx Q was stopped return */
@@ -264,7 +264,7 @@ recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if (num_valid == 0)
break;
- descs_to_mbufs(rxcmp, rxcmp1, mbuf_init, &rx_pkts[nb_rx_pkts],
+ descs_to_mbufs(rxcmp, rxcmp1, mbuf_init, shuf_msk, &rx_pkts[nb_rx_pkts],
rxr);
nb_rx_pkts += num_valid;
@@ -283,6 +283,134 @@ recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
return nb_rx_pkts;
}
+static uint16_t
+crx_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ struct bnxt_rx_queue *rxq = rx_queue;
+ const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
+ struct bnxt_cp_ring_info *cpr = rxq->cp_ring;
+ struct bnxt_rx_ring_info *rxr = rxq->rx_ring;
+ uint16_t cp_ring_size = cpr->cp_ring_struct->ring_size;
+ uint16_t rx_ring_size = rxr->rx_ring_struct->ring_size;
+ struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring;
+ uint64_t valid, desc_valid_mask = ~0ULL;
+ const __m128i info3_v_mask = _mm_set1_epi32(CMPL_BASE_V);
+ uint32_t raw_cons = cpr->cp_raw_cons;
+ uint32_t cons, mbcons;
+ int nb_rx_pkts = 0;
+ const __m128i valid_target =
+ _mm_set1_epi32(!!(raw_cons & cp_ring_size));
+ const __m128i shuf_msk =
+ _mm_set_epi8(7, 6, 5, 4, /* rss */
+ 0xFF, 0xFF, /* vlan_tci (zeroes) */
+ 3, 2, /* data_len */
+ 0xFF, 0xFF, 3, 2, /* pkt_len */
+ 0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */
+ int i;
+
+ /* If Rx Q was stopped return */
+ if (unlikely(!rxq->rx_started))
+ return 0;
+
+ if (rxq->rxrearm_nb >= rxq->rx_free_thresh)
+ bnxt_rxq_rearm(rxq, rxr);
+
+ cons = raw_cons & (cp_ring_size - 1);
+ mbcons = raw_cons & (rx_ring_size - 1);
+
+ /* Prefetch first four descriptors. */
+ rte_prefetch0(&cp_desc_ring[cons]);
+
+ /* Ensure that we do not go past the ends of the rings. */
+ nb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons,
+ cp_ring_size - cons));
+ /*
+ * If we are at the end of the ring, ensure that descriptors after the
+ * last valid entry are not treated as valid. Otherwise, force the
+ * maximum number of packets to receive to be a multiple of the per-
+ * loop count.
+ */
+ if (nb_pkts < BNXT_RX_DESCS_PER_LOOP_VEC128) {
+ desc_valid_mask >>=
+ 16 * (BNXT_RX_DESCS_PER_LOOP_VEC128 - nb_pkts);
+ } else {
+ nb_pkts =
+ RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC128);
+ }
+
+ /* Handle RX burst request */
+ for (i = 0; i < nb_pkts; i += BNXT_RX_DESCS_PER_LOOP_VEC128,
+ cons += BNXT_RX_DESCS_PER_LOOP_VEC128,
+ mbcons += BNXT_RX_DESCS_PER_LOOP_VEC128) {
+ __m128i rxcmp1[BNXT_RX_DESCS_PER_LOOP_VEC128];
+ __m128i rxcmp[BNXT_RX_DESCS_PER_LOOP_VEC128];
+ __m128i tmp0, tmp1, info3_v;
+ uint32_t num_valid;
+
+ /* Copy four mbuf pointers to output array. */
+ tmp0 = _mm_loadu_si128((void *)&rxr->rx_buf_ring[mbcons]);
+#ifdef RTE_ARCH_X86_64
+ tmp1 = _mm_loadu_si128((void *)&rxr->rx_buf_ring[mbcons + 2]);
+#endif
+ _mm_storeu_si128((void *)&rx_pkts[i], tmp0);
+#ifdef RTE_ARCH_X86_64
+ _mm_storeu_si128((void *)&rx_pkts[i + 2], tmp1);
+#endif
+
+ /* Prefetch four descriptors for next iteration. */
+ if (i + BNXT_RX_DESCS_PER_LOOP_VEC128 < nb_pkts)
+ rte_prefetch0(&cp_desc_ring[cons + 4]);
+
+ /*
+ * Load the four current descriptors into SSE registers in
+ * reverse order to ensure consistent state.
+ */
+ rxcmp[3] = _mm_load_si128((void *)&cp_desc_ring[cons + 3]);
+ rte_compiler_barrier();
+ rxcmp[2] = _mm_load_si128((void *)&cp_desc_ring[cons + 2]);
+ rte_compiler_barrier();
+ rxcmp[1] = _mm_load_si128((void *)&cp_desc_ring[cons + 1]);
+ rte_compiler_barrier();
+ rxcmp[0] = _mm_load_si128((void *)&cp_desc_ring[cons + 0]);
+
+ tmp1 = _mm_unpackhi_epi32(rxcmp[2], rxcmp[3]);
+ tmp0 = _mm_unpackhi_epi32(rxcmp[0], rxcmp[1]);
+
+ /* Isolate descriptor valid flags. */
+ info3_v = _mm_and_si128(_mm_unpacklo_epi64(tmp0, tmp1),
+ info3_v_mask);
+ info3_v = _mm_xor_si128(info3_v, valid_target);
+
+ /*
+ * Pack the 128-bit array of valid descriptor flags into 64
+ * bits and count the number of set bits in order to determine
+ * the number of valid descriptors.
+ */
+ valid = _mm_cvtsi128_si64(_mm_packs_epi32(info3_v, info3_v));
+ num_valid = rte_popcount64(valid & desc_valid_mask);
+
+ if (num_valid == 0)
+ break;
+
+ descs_to_mbufs(rxcmp, rxcmp1, mbuf_init, shuf_msk, &rx_pkts[nb_rx_pkts],
+ rxr);
+ nb_rx_pkts += num_valid;
+
+ if (num_valid < BNXT_RX_DESCS_PER_LOOP_VEC128)
+ break;
+ }
+
+ if (nb_rx_pkts) {
+ rxr->rx_raw_prod = RING_ADV(rxr->rx_raw_prod, nb_rx_pkts);
+
+ rxq->rxrearm_nb += nb_rx_pkts;
+ cpr->cp_raw_cons += nb_rx_pkts;
+ bnxt_db_cq(cpr);
+ }
+
+ return nb_rx_pkts;
+}
+
uint16_t
bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
@@ -304,6 +432,27 @@ bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
return cnt + recv_burst_vec_sse(rx_queue, rx_pkts + cnt, nb_pkts);
}
+uint16_t
+bnxt_crx_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+ uint16_t cnt = 0;
+
+ while (nb_pkts > RTE_BNXT_MAX_RX_BURST) {
+ uint16_t burst;
+
+ burst = crx_burst_vec_sse(rx_queue, rx_pkts + cnt,
+ RTE_BNXT_MAX_RX_BURST);
+
+ cnt += burst;
+ nb_pkts -= burst;
+
+ if (burst < RTE_BNXT_MAX_RX_BURST)
+ return cnt;
+ }
+
+ return cnt + crx_burst_vec_sse(rx_queue, rx_pkts + cnt, nb_pkts);
+}
+
static void
bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq)
{
--
2.39.2 (Apple Git-143)