From: Ajit Khaparde <ajit.khaparde@broadcom.com>
To: dev@dpdk.org
Cc: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
Subject: [PATCH 17/18] net/bnxt: add AVX2 support for compressed CQE
Date: Thu, 21 Dec 2023 10:05:28 -0800
Message-ID: <20231221180529.18687-18-ajit.khaparde@broadcom.com>
In-Reply-To: <20231221180529.18687-1-ajit.khaparde@broadcom.com>
The P7 device family supports 16-byte Rx completions.
Add an AVX2 vector mode receive path for compressed Rx CQEs.
Signed-off-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
---
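Notes:

A compressed Rx completion occupies a single 16-byte cmpl_base slot
rather than the usual 32-byte completion pair, so each 256-bit load in
the burst loop below picks up two completions and eight packets are
handled per loop iteration. A rough sketch of the layout implied by
the shuffle mask and field extractions (field names are illustrative,
not the exact HWRM structure definitions):

    struct crx_cmpl_sketch {     /* one 16-byte cmpl_base slot */
        uint16_t flags_type;     /* bytes  0-1:  completion type/itype */
        uint16_t len;            /* bytes  2-3:  packet length */
        uint16_t flags2;         /* bytes  4-5:  vlan/tunnel/csum flags */
        uint16_t vlan_tci;       /* bytes  6-7:  VLAN TCI */
        uint32_t errors_v;       /* bytes  8-11: error bits, valid bit */
        uint32_t rss_hash;       /* bytes 12-15: RSS hash */
    };
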
 drivers/net/bnxt/bnxt_ethdev.c        |   5 +
 drivers/net/bnxt/bnxt_rxr.h           |   2 +
 drivers/net/bnxt/bnxt_rxtx_vec_avx2.c | 309 ++++++++++++++++++++++++++
 3 files changed, 316 insertions(+)
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 031028eda1..bd8c7557dd 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -1406,6 +1406,8 @@ bnxt_receive_function(struct rte_eth_dev *eth_dev)
 			    "Using AVX2 vector mode receive for port %d\n",
 			    eth_dev->data->port_id);
 		bp->flags |= BNXT_FLAG_RX_VECTOR_PKT_MODE;
+		if (bnxt_compressed_rx_cqe_mode_enabled(bp))
+			return bnxt_crx_pkts_vec_avx2;
 		return bnxt_recv_pkts_vec_avx2;
 	}
 #endif
@@ -3124,6 +3126,9 @@ static const struct {
 	{bnxt_recv_pkts,		"Scalar"},
 #if defined(RTE_ARCH_X86)
 	{bnxt_recv_pkts_vec,		"Vector SSE"},
+#endif
+#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
+	{bnxt_crx_pkts_vec_avx2,	"Vector AVX2"},
 	{bnxt_recv_pkts_vec_avx2,	"Vector AVX2"},
 #endif
 #if defined(RTE_ARCH_ARM64)
diff --git a/drivers/net/bnxt/bnxt_rxr.h b/drivers/net/bnxt/bnxt_rxr.h
index c51bb2d62c..a474a69ae3 100644
--- a/drivers/net/bnxt/bnxt_rxr.h
+++ b/drivers/net/bnxt/bnxt_rxr.h
@@ -162,6 +162,8 @@ int bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq);
 #if defined(RTE_ARCH_X86)
 uint16_t bnxt_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts);
+uint16_t bnxt_crx_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
+				uint16_t nb_pkts);
 #endif
 void bnxt_set_mark_in_mbuf(struct bnxt *bp,
 			   struct rx_pkt_cmpl_hi *rxcmp1,
diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c b/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c
index ea8dbaffba..e4d84bc9b6 100644
--- a/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c
+++ b/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c
@@ -361,6 +361,294 @@ recv_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	return nb_rx_pkts;
 }
 
+static uint16_t
+crx_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct bnxt_rx_queue *rxq = rx_queue;
+	const __m256i mbuf_init =
+		_mm256_set_epi64x(0, 0, 0, rxq->mbuf_initializer);
+	struct bnxt_cp_ring_info *cpr = rxq->cp_ring;
+	struct bnxt_rx_ring_info *rxr = rxq->rx_ring;
+	uint16_t cp_ring_size = cpr->cp_ring_struct->ring_size;
+	uint16_t rx_ring_size = rxr->rx_ring_struct->ring_size;
+	struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring;
+	uint64_t valid, desc_valid_mask = ~0ULL;
+	const __m256i info3_v_mask = _mm256_set1_epi32(CMPL_BASE_V);
+	uint32_t raw_cons = cpr->cp_raw_cons;
+	uint32_t cons, mbcons;
+	int nb_rx_pkts = 0;
+	int i;
+	const __m256i valid_target =
+		_mm256_set1_epi32(!!(raw_cons & cp_ring_size));
+	const __m256i shuf_msk =
+		_mm256_set_epi8(15, 14, 13, 12,          /* rss */
+				7, 6,                    /* vlan_tci */
+				3, 2,                    /* data_len */
+				0xFF, 0xFF, 3, 2,        /* pkt_len */
+				0xFF, 0xFF, 0xFF, 0xFF,  /* pkt_type (zeroes) */
+				15, 14, 13, 12,          /* rss */
+				7, 6,                    /* vlan_tci */
+				3, 2,                    /* data_len */
+				0xFF, 0xFF, 3, 2,        /* pkt_len */
+				0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */
+	const __m256i flags_type_mask =
+		_mm256_set1_epi32(RX_PKT_CMPL_FLAGS_ITYPE_MASK);
+	const __m256i flags2_mask1 =
+		_mm256_set1_epi32(CMPL_FLAGS2_VLAN_TUN_MSK);
+	const __m256i flags2_mask2 =
+		_mm256_set1_epi32(RX_PKT_CMPL_FLAGS2_IP_TYPE);
+	const __m256i rss_mask =
+		_mm256_set1_epi32(RX_PKT_CMPL_FLAGS_RSS_VALID);
+	__m256i t0, t1, flags_type, flags2, index, errors;
+	__m256i ptype_idx, ptypes, is_tunnel;
+	__m256i mbuf01, mbuf23, mbuf45, mbuf67;
+	__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, rearm6, rearm7;
+	__m256i ol_flags, ol_flags_hi;
+	__m256i rss_flags;
+
+	/* Validate ptype table indexing at build time. */
+	bnxt_check_ptype_constants();
+
+	/* Return if this Rx queue has been stopped. */
+	if (unlikely(!rxq->rx_started))
+		return 0;
+
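+	/* Replenish Rx ring mbufs if the rearm threshold has been reached. */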
+	if (rxq->rxrearm_nb >= rxq->rx_free_thresh)
+		bnxt_rxq_rearm(rxq, rxr);
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC256);
+
+	cons = raw_cons & (cp_ring_size - 1);
+	mbcons = raw_cons & (rx_ring_size - 1);
+
+	/* Return immediately if there is not at least one completed packet. */
+	if (!bnxt_cpr_cmp_valid(&cp_desc_ring[cons], raw_cons, cp_ring_size))
+		return 0;
+
+	/* Ensure that we do not go past the ends of the rings. */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons,
+					   cp_ring_size - cons));
+	/*
+	 * If we are at the end of the ring, ensure that descriptors after the
+	 * last valid entry are not treated as valid. Otherwise, force the
+	 * maximum number of packets to receive to be a multiple of the
+	 * per-loop count.
+	 */
+	if (nb_pkts < BNXT_RX_DESCS_PER_LOOP_VEC256) {
+		desc_valid_mask >>=
+			CHAR_BIT * (BNXT_RX_DESCS_PER_LOOP_VEC256 - nb_pkts);
+	} else {
+		nb_pkts =
+			RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC256);
+	}
+
+	/* Handle RX burst request */
+	for (i = 0; i < nb_pkts; i += BNXT_RX_DESCS_PER_LOOP_VEC256,
+				 cons += BNXT_RX_DESCS_PER_LOOP_VEC256,
+				 mbcons += BNXT_RX_DESCS_PER_LOOP_VEC256) {
+		__m256i rxcmp0_1, rxcmp2_3, rxcmp4_5, rxcmp6_7, info3_v;
+		__m256i errors_v2;
+		uint32_t num_valid;
+
+		/* Copy eight mbuf pointers to output array. */
+		t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons]);
+		_mm256_storeu_si256((void *)&rx_pkts[i], t0);
+#ifdef RTE_ARCH_X86_64
+		t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons + 4]);
+		_mm256_storeu_si256((void *)&rx_pkts[i + 4], t0);
+#endif
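+		/*
+		 * On 32-bit builds the eight 4-byte mbuf pointers fit in the
+		 * single 256-bit copy above; the second copy is only needed
+		 * when pointers are 8 bytes wide.
+		 */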
+
+		/*
+		 * Load eight receive completion descriptors into 256-bit
+		 * registers. Loads are issued in reverse order to ensure
+		 * consistent state.
+		 */
+		rxcmp6_7 = _mm256_loadu_si256((void *)&cp_desc_ring[cons + 6]);
+		rte_compiler_barrier();
+		rxcmp4_5 = _mm256_loadu_si256((void *)&cp_desc_ring[cons + 4]);
+		rte_compiler_barrier();
+		rxcmp2_3 = _mm256_loadu_si256((void *)&cp_desc_ring[cons + 2]);
+		rte_compiler_barrier();
+		rxcmp0_1 = _mm256_loadu_si256((void *)&cp_desc_ring[cons + 0]);
+
+		/* Compute packet type table indices for eight packets. */
+		t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3);
+		t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7);
+		flags_type = _mm256_unpacklo_epi64(t0, t1);
+		ptype_idx = _mm256_and_si256(flags_type, flags_type_mask);
+		ptype_idx = _mm256_srli_epi32(ptype_idx,
+					      RX_PKT_CMPL_FLAGS_ITYPE_SFT -
+					      BNXT_PTYPE_TBL_TYPE_SFT);
+
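+		/*
+		 * Fold the VLAN/tunnel metadata and IP version bits from
+		 * flags2 into the packet type table index as well.
+		 */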
+		t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3);
+		t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7);
+		flags2 = _mm256_unpackhi_epi64(t0, t1);
+
+		t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, flags2_mask1),
+				       RX_PKT_CMPL_FLAGS2_META_FORMAT_SFT -
+				       BNXT_PTYPE_TBL_VLAN_SFT);
+		ptype_idx = _mm256_or_si256(ptype_idx, t0);
+
+		t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, flags2_mask2),
+				       RX_PKT_CMPL_FLAGS2_IP_TYPE_SFT -
+				       BNXT_PTYPE_TBL_IP_VER_SFT);
+		ptype_idx = _mm256_or_si256(ptype_idx, t0);
+
+		/*
+		 * Load ptypes for eight packets using gather. Gather
+		 * operations have extremely high latency (~19 cycles);
+		 * execution and use of the result should be separated as
+		 * much as possible.
+		 */
+		ptypes = _mm256_i32gather_epi32((int *)bnxt_ptype_table,
+						ptype_idx, sizeof(uint32_t));
+		/*
+		 * Compute ol_flags and checksum error table indices for eight
+		 * packets.
+		 */
+		is_tunnel = _mm256_and_si256(flags2, _mm256_set1_epi32(4));
+		is_tunnel = _mm256_slli_epi32(is_tunnel, 3);
+		flags2 = _mm256_and_si256(flags2, _mm256_set1_epi32(0x1F));
+
+		/* Extract errors_v2 fields for eight packets. */
+		t0 = _mm256_unpackhi_epi32(rxcmp0_1, rxcmp2_3);
+		t1 = _mm256_unpackhi_epi32(rxcmp4_5, rxcmp6_7);
+		errors_v2 = _mm256_unpacklo_epi64(t0, t1);
+
+		errors = _mm256_srli_epi32(errors_v2, 4);
+		errors = _mm256_and_si256(errors, _mm256_set1_epi32(0xF));
+		errors = _mm256_and_si256(errors, flags2);
+
+		index = _mm256_andnot_si256(errors, flags2);
+		errors = _mm256_or_si256(errors,
+					 _mm256_srli_epi32(is_tunnel, 1));
+		index = _mm256_or_si256(index, is_tunnel);
+
+		/*
+		 * Load ol_flags for eight packets using gather. Gather
+		 * operations have extremely high latency (~19 cycles);
+		 * execution and use of the result should be separated as
+		 * much as possible.
+		 */
+		ol_flags = _mm256_i32gather_epi32((int *)rxr->ol_flags_table,
+						  index, sizeof(uint32_t));
+		errors = _mm256_i32gather_epi32((int *)rxr->ol_flags_err_table,
+						errors, sizeof(uint32_t));
+
+		/*
+		 * Pack the 128-bit array of valid descriptor flags into 64
+		 * bits and count the number of set bits in order to determine
+		 * the number of valid descriptors.
+		 */
+		info3_v = _mm256_and_si256(errors_v2, info3_v_mask);
+		info3_v = _mm256_xor_si256(info3_v, valid_target);
+
+		info3_v = _mm256_packs_epi32(info3_v, _mm256_setzero_si256());
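+		/*
+		 * Each descriptor contributes a single valid flag at an
+		 * 8-bit stride within 'valid' (descriptor 0 at bit 0,
+		 * descriptor 1 at bit 8, ...), matching the CHAR_BIT stride
+		 * used when narrowing desc_valid_mask for partial bursts.
+		 */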
+		valid = _mm_cvtsi128_si64(_mm256_extracti128_si256(info3_v, 1));
+		valid = (valid << CHAR_BIT) |
+			_mm_cvtsi128_si64(_mm256_castsi256_si128(info3_v));
+		num_valid = __builtin_popcountll(valid & desc_valid_mask);
+
+		if (num_valid == 0)
+			break;
+
+		/* Update mbuf rearm_data for eight packets. */
+		mbuf01 = _mm256_shuffle_epi8(rxcmp0_1, shuf_msk);
+		mbuf23 = _mm256_shuffle_epi8(rxcmp2_3, shuf_msk);
+		mbuf45 = _mm256_shuffle_epi8(rxcmp4_5, shuf_msk);
+		mbuf67 = _mm256_shuffle_epi8(rxcmp6_7, shuf_msk);
+
+		/* Blend in ptype field for two mbufs at a time. */
+		mbuf01 = _mm256_blend_epi32(mbuf01, ptypes, 0x11);
+		mbuf23 = _mm256_blend_epi32(mbuf23,
+					    _mm256_srli_si256(ptypes, 4), 0x11);
+		mbuf45 = _mm256_blend_epi32(mbuf45,
+					    _mm256_srli_si256(ptypes, 8), 0x11);
+		mbuf67 = _mm256_blend_epi32(mbuf67,
+					    _mm256_srli_si256(ptypes, 12), 0x11);
+
+		/* Unpack rearm data, set fixed fields for first four mbufs. */
+		rearm0 = _mm256_permute2f128_si256(mbuf_init, mbuf01, 0x20);
+		rearm1 = _mm256_blend_epi32(mbuf_init, mbuf01, 0xF0);
+		rearm2 = _mm256_permute2f128_si256(mbuf_init, mbuf23, 0x20);
+		rearm3 = _mm256_blend_epi32(mbuf_init, mbuf23, 0xF0);
+
+		/* Compute final ol_flags values for eight packets. */
+		rss_flags = _mm256_and_si256(flags_type, rss_mask);
+		rss_flags = _mm256_srli_epi32(rss_flags, 9);
+		ol_flags = _mm256_or_si256(ol_flags, errors);
+		ol_flags = _mm256_or_si256(ol_flags, rss_flags);
+		ol_flags_hi = _mm256_permute2f128_si256(ol_flags,
+							ol_flags, 0x11);
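+		/*
+		 * Duplicate the upper 128 bits of ol_flags into both lanes
+		 * so the per-lane blends below can reach the flags of the
+		 * mbufs held in the upper half.
+		 */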
+
+		/* Set ol_flags fields for first four packets. */
+		rearm0 = _mm256_blend_epi32(rearm0,
+					    _mm256_slli_si256(ol_flags, 8),
+					    0x04);
+		rearm1 = _mm256_blend_epi32(rearm1,
+					    _mm256_slli_si256(ol_flags_hi, 8),
+					    0x04);
+		rearm2 = _mm256_blend_epi32(rearm2,
+					    _mm256_slli_si256(ol_flags, 4),
+					    0x04);
+		rearm3 = _mm256_blend_epi32(rearm3,
+					    _mm256_slli_si256(ol_flags_hi, 4),
+					    0x04);
+
+		/* Store all mbuf fields for first four packets. */
+		_mm256_storeu_si256((void *)&rx_pkts[i + 0]->rearm_data,
+				    rearm0);
+		_mm256_storeu_si256((void *)&rx_pkts[i + 1]->rearm_data,
+				    rearm1);
+		_mm256_storeu_si256((void *)&rx_pkts[i + 2]->rearm_data,
+				    rearm2);
+		_mm256_storeu_si256((void *)&rx_pkts[i + 3]->rearm_data,
+				    rearm3);
+
+		/* Unpack rearm data, set fixed fields for final four mbufs. */
+		rearm4 = _mm256_permute2f128_si256(mbuf_init, mbuf45, 0x20);
+		rearm5 = _mm256_blend_epi32(mbuf_init, mbuf45, 0xF0);
+		rearm6 = _mm256_permute2f128_si256(mbuf_init, mbuf67, 0x20);
+		rearm7 = _mm256_blend_epi32(mbuf_init, mbuf67, 0xF0);
+
+		/* Set ol_flags fields for final four packets. */
+		rearm4 = _mm256_blend_epi32(rearm4, ol_flags, 0x04);
+		rearm5 = _mm256_blend_epi32(rearm5, ol_flags_hi, 0x04);
+		rearm6 = _mm256_blend_epi32(rearm6,
+					    _mm256_srli_si256(ol_flags, 4),
+					    0x04);
+		rearm7 = _mm256_blend_epi32(rearm7,
+					    _mm256_srli_si256(ol_flags_hi, 4),
+					    0x04);
+
+		/* Store all mbuf fields for final four packets. */
+		_mm256_storeu_si256((void *)&rx_pkts[i + 4]->rearm_data,
+				    rearm4);
+		_mm256_storeu_si256((void *)&rx_pkts[i + 5]->rearm_data,
+				    rearm5);
+		_mm256_storeu_si256((void *)&rx_pkts[i + 6]->rearm_data,
+				    rearm6);
+		_mm256_storeu_si256((void *)&rx_pkts[i + 7]->rearm_data,
+				    rearm7);
+
+		nb_rx_pkts += num_valid;
+		if (num_valid < BNXT_RX_DESCS_PER_LOOP_VEC256)
+			break;
+	}
+
+	if (nb_rx_pkts) {
+		rxr->rx_raw_prod = RING_ADV(rxr->rx_raw_prod, nb_rx_pkts);
+
+		rxq->rxrearm_nb += nb_rx_pkts;
+		cpr->cp_raw_cons += nb_rx_pkts;
+		bnxt_db_cq(cpr);
+	}
+
+	return nb_rx_pkts;
+}
+
 uint16_t
 bnxt_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 			uint16_t nb_pkts)
@@ -382,6 +670,27 @@ bnxt_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return cnt + recv_burst_vec_avx2(rx_queue, rx_pkts + cnt, nb_pkts);
 }
 
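+/*
+ * Burst-split wrapper, mirroring bnxt_recv_pkts_vec_avx2(): each call into
+ * the burst loop handles at most RTE_BNXT_MAX_RX_BURST packets.
+ */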
+uint16_t
+bnxt_crx_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
+		       uint16_t nb_pkts)
+{
+	uint16_t cnt = 0;
+
+	while (nb_pkts > RTE_BNXT_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = crx_burst_vec_avx2(rx_queue, rx_pkts + cnt,
+					   RTE_BNXT_MAX_RX_BURST);
+
+		cnt += burst;
+		nb_pkts -= burst;
+
+		if (burst < RTE_BNXT_MAX_RX_BURST)
+			return cnt;
+	}
+	return cnt + crx_burst_vec_avx2(rx_queue, rx_pkts + cnt, nb_pkts);
+}
+
 static void
 bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq)
 {
--
2.39.2 (Apple Git-143)