From: Feifei Wang
To: Beilei Xing, Bruce Richardson, Konstantin Ananyev, Ruifeng Wang
Cc: dev@dpdk.org, nd@arm.com, Feifei Wang, Honnappa Nagarahalli
Subject: [PATCH v1 2/5] net/i40e: enable direct rearm mode
Date: Wed, 20 Apr 2022 16:16:47 +0800
Message-Id: <20220420081650.2043183-3-feifei.wang2@arm.com>
In-Reply-To: <20220420081650.2043183-1-feifei.wang2@arm.com>
References: <20220420081650.2043183-1-feifei.wang2@arm.com>

Enable direct re-arm mode for the i40e driver. This patch supports the
case where the Rx and Tx queues are mapped on the same single lcore.

Suggested-by: Honnappa Nagarahalli
Signed-off-by: Feifei Wang
Reviewed-by: Ruifeng Wang
Reviewed-by: Honnappa Nagarahalli
---
 drivers/net/i40e/i40e_rxtx.h            |   4 +
 drivers/net/i40e/i40e_rxtx_common_avx.h | 269 ++++++++++++++++++++++++
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  14 +-
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 249 +++++++++++++++++++++-
 drivers/net/i40e/i40e_rxtx_vec_neon.c   | 141 ++++++++++++-
 drivers/net/i40e/i40e_rxtx_vec_sse.c    | 170 ++++++++++++++-
 6 files changed, 839 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 5e6eecc501..1fdf4305f4 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -102,6 +102,8 @@ struct i40e_rx_queue {
 	uint16_t rxrearm_nb;	/**< number of remaining to be re-armed */
 	uint16_t rxrearm_start;	/**< the idx we start the re-arming from */
+	uint16_t direct_rxrearm_port; /** device TX port ID for direct re-arm mode */
+	uint16_t direct_rxrearm_queue; /** TX queue index for direct re-arm mode */
 	uint64_t mbuf_initializer; /**< value to init mbufs */
 	uint16_t port_id; /**< device port ID */
@@ -121,6 +123,8 @@ struct i40e_rx_queue {
 	uint16_t rx_using_sse; /**qrx_tail, rx_id); }
+
+static __rte_always_inline void
+i40e_rxq_direct_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
+{
+	struct rte_eth_dev *dev;
+	struct i40e_tx_queue *txq;
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_tx_entry *txep;
+	struct i40e_rx_entry *rxep;
+	struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
+	uint16_t tx_port_id, tx_queue_id;
+	uint16_t rx_id;
+	uint16_t i, n;
+	uint16_t nb_rearm = 0;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+	rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+	tx_port_id = rxq->direct_rxrearm_port;
+	tx_queue_id =
rxq->direct_rxrearm_queue; + dev = &rte_eth_devices[tx_port_id]; + txq = dev->data->tx_queues[tx_queue_id]; + + /* check Rx queue is able to take in the whole + * batch of free mbufs from Tx queue + */ + if (rxq->rxrearm_nb > txq->tx_rs_thresh) { + /* check DD bits on threshold descriptor */ + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) { + goto mempool_bulk; + } + + if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH) + goto mempool_bulk; + + n = txq->tx_rs_thresh; + + /* first buffer to free from S/W ring is at index + * tx_next_dd - (tx_rs_thresh-1) + */ + txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)]; + + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { + /* directly put mbufs from Tx to Rx, + * and initialize the mbufs in vector + */ + for (i = 0; i < n; i++) + rxep[i].mbuf = txep[i].mbuf; + } else { + for (i = 0; i < n; i++) { + m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf); + /* ensure each Tx freed buffer is valid */ + if (m[i] != NULL) + nb_rearm++; + } + + if (nb_rearm != n) { + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); + if (txq->tx_next_dd >= txq->nb_tx_desc) + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); + + goto mempool_bulk; + } else { + for (i = 0; i < n; i++) + rxep[i].mbuf = m[i]; + } + } + + /* update counters for Tx */ + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); + if (txq->tx_next_dd >= txq->nb_tx_desc) + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); + } else { +mempool_bulk: + /* if TX did not free bufs into Rx sw-ring, + * get new bufs from mempool + */ + n = RTE_I40E_RXQ_REARM_THRESH; + + /* Pull 'n' more MBUFs into the software ring */ + if (rte_mempool_get_bulk(rxq->mp, + (void *)rxep, + RTE_I40E_RXQ_REARM_THRESH) < 0) { + if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >= + rxq->nb_rx_desc) { + __m128i dma_addr0; + dma_addr0 = _mm_setzero_si128(); + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { + rxep[i].mbuf = &rxq->fake_mbuf; + _mm_store_si128((__m128i *)&rxdp[i].read, + dma_addr0); + } + } + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += + RTE_I40E_RXQ_REARM_THRESH; + return; + } + } + +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC + struct rte_mbuf *mb0, *mb1; + __m128i dma_addr0, dma_addr1; + __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, + RTE_PKTMBUF_HEADROOM); + /* Initialize the mbufs in vector, process 2 mbufs in one loop */ + for (i = 0; i < n; i += 2, rxep += 2) { + __m128i vaddr0, vaddr1; + + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + + /* convert pa to dma_addr hdr/data */ + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); + dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); + + /* add headroom to pa values */ + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); + dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); + + /* flush desc with pa dma_addr */ + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1); + } +#else +#ifdef __AVX512VL__ + if (avx512) { + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + struct 
rte_mbuf *mb4, *mb5, *mb6, *mb7; + __m512i dma_addr0_3, dma_addr4_7; + __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); + /* Initialize the mbufs in vector, process 8 mbufs in one loop */ + for (i = 0; i < n; i += 8, rxep += 8, rxdp += 8) { + __m128i vaddr0, vaddr1, vaddr2, vaddr3; + __m128i vaddr4, vaddr5, vaddr6, vaddr7; + __m256i vaddr0_1, vaddr2_3; + __m256i vaddr4_5, vaddr6_7; + __m512i vaddr0_3, vaddr4_7; + + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + mb2 = rxep[2].mbuf; + mb3 = rxep[3].mbuf; + mb4 = rxep[4].mbuf; + mb5 = rxep[5].mbuf; + mb6 = rxep[6].mbuf; + mb7 = rxep[7].mbuf; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); + vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr); + vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr); + vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr); + vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr); + + /** + * merge 0 & 1, by casting 0 to 256-bit and inserting 1 + * into the high lanes. Similarly for 2 & 3, and so on. + */ + vaddr0_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0), + vaddr1, 1); + vaddr2_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2), + vaddr3, 1); + vaddr4_5 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4), + vaddr5, 1); + vaddr6_7 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6), + vaddr7, 1); + vaddr0_3 = + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1), + vaddr2_3, 1); + vaddr4_7 = + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5), + vaddr6_7, 1); + + /* convert pa to dma_addr hdr/data */ + dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3); + dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7); + + /* add headroom to pa values */ + dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room); + dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room); + + /* flush desc with pa dma_addr */ + _mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3); + _mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7); + } + } else { +#endif /* __AVX512VL__*/ + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + __m256i dma_addr0_1, dma_addr2_3; + __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM); + /* Initialize the mbufs in vector, process 4 mbufs in one loop */ + for (i = 0; i < n; i += 4, rxep += 4, rxdp += 4) { + __m128i vaddr0, vaddr1, vaddr2, vaddr3; + __m256i vaddr0_1, vaddr2_3; + + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + mb2 = rxep[2].mbuf; + mb3 = rxep[3].mbuf; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); + + /** + * merge 0 & 1, by casting 0 to 256-bit and inserting 1 + * into the high lanes. 
Similarly for 2 & 3 + */ + vaddr0_1 = _mm256_inserti128_si256 + (_mm256_castsi128_si256(vaddr0), vaddr1, 1); + vaddr2_3 = _mm256_inserti128_si256 + (_mm256_castsi128_si256(vaddr2), vaddr3, 1); + + /* convert pa to dma_addr hdr/data */ + dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1); + dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3); + + /* add headroom to pa values */ + dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room); + dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room); + + /* flush desc with pa dma_addr */ + _mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1); + _mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3); + } + } + +#endif + + /* Update the descriptor initializer index */ + rxq->rxrearm_start += n; + rx_id = rxq->rxrearm_start - 1; + + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc; + if (!rxq->rxrearm_start) + rx_id = rxq->nb_rx_desc - 1; + else + rx_id = rxq->rxrearm_start - 1; + } + + rxq->rxrearm_nb -= n; + + /* Update the tail pointer on the NIC */ + I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); +} #endif /* __AVX2__*/ #endif /*_I40E_RXTX_COMMON_AVX_H_*/ diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c index c73b2a321b..fcb7ba0273 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c +++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c @@ -25,6 +25,12 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) return i40e_rxq_rearm_common(rxq, false); } +static __rte_always_inline void +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq) +{ + return i40e_rxq_direct_rearm_common(rxq, false); +} + #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC /* Handles 32B descriptor FDIR ID processing: * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc @@ -128,8 +134,12 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) - i40e_rxq_rearm(rxq); + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) { + if (rxq->direct_rxrearm_enable) + i40e_rxq_direct_rearm(rxq); + else + i40e_rxq_rearm(rxq); + } /* Before we start moving massive data around, check to see if * there is actually a packet available diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c index 2e8a3f0df6..d967095edc 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c @@ -21,6 +21,12 @@ #define RTE_I40E_DESCS_PER_LOOP_AVX 8 +enum i40e_direct_rearm_type_value { + I40E_DIRECT_REARM_TYPE_NORMAL = 0x0, + I40E_DIRECT_REARM_TYPE_FAST_FREE = 0x1, + I40E_DIRECT_REARM_TYPE_PRE_FREE = 0x2, +}; + static __rte_always_inline void i40e_rxq_rearm(struct i40e_rx_queue *rxq) { @@ -150,6 +156,241 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); } +static __rte_always_inline void +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq) +{ + struct rte_eth_dev *dev; + struct i40e_tx_queue *txq; + volatile union i40e_rx_desc *rxdp; + struct i40e_vec_tx_entry *txep; + struct i40e_rx_entry *rxep; + struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH]; + uint16_t tx_port_id, tx_queue_id; + uint16_t rx_id; + uint16_t i, n; + uint16_t j = 0; + uint16_t nb_rearm = 0; + enum i40e_direct_rearm_type_value type; + struct rte_mempool_cache *cache = NULL; + + rxdp = rxq->rx_ring + rxq->rxrearm_start; + rxep = &rxq->sw_ring[rxq->rxrearm_start]; + + 
tx_port_id = rxq->direct_rxrearm_port; + tx_queue_id = rxq->direct_rxrearm_queue; + dev = &rte_eth_devices[tx_port_id]; + txq = dev->data->tx_queues[tx_queue_id]; + + /* check Rx queue is able to take in the whole + * batch of free mbufs from Tx queue + */ + if (rxq->rxrearm_nb > txq->tx_rs_thresh) { + /* check DD bits on threshold descriptor */ + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) { + goto mempool_bulk; + } + + if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH) + goto mempool_bulk; + + n = txq->tx_rs_thresh; + + /* first buffer to free from S/W ring is at index + * tx_next_dd - (tx_rs_thresh-1) + */ + txep = (void *)txq->sw_ring; + txep += txq->tx_next_dd - (n - 1); + + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { + /* directly put mbufs from Tx to Rx */ + uint32_t copied = 0; + /* n is multiple of 32 */ + while (copied < n) { + const __m512i a = _mm512_load_si512(&txep[copied]); + const __m512i b = _mm512_load_si512(&txep[copied + 8]); + const __m512i c = _mm512_load_si512(&txep[copied + 16]); + const __m512i d = _mm512_load_si512(&txep[copied + 24]); + + _mm512_storeu_si512(&rxep[copied], a); + _mm512_storeu_si512(&rxep[copied + 8], b); + _mm512_storeu_si512(&rxep[copied + 16], c); + _mm512_storeu_si512(&rxep[copied + 24], d); + copied += 32; + } + type = I40E_DIRECT_REARM_TYPE_FAST_FREE; + } else { + for (i = 0; i < n; i++) { + m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf); + /* ensure each Tx freed buffer is valid */ + if (m[i] != NULL) + nb_rearm++; + } + + if (nb_rearm != n) { + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); + if (txq->tx_next_dd >= txq->nb_tx_desc) + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); + + goto mempool_bulk; + } else { + type = I40E_DIRECT_REARM_TYPE_PRE_FREE; + } + } + + /* update counters for Tx */ + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); + if (txq->tx_next_dd >= txq->nb_tx_desc) + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); + } else { +mempool_bulk: + cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id()); + + if (unlikely(!cache)) + return i40e_rxq_rearm_common(rxq, true); + + n = RTE_I40E_RXQ_REARM_THRESH; + + /* We need to pull 'n' more MBUFs into the software ring from mempool + * We inline the mempool function here, so we can vectorize the copy + * from the cache into the shadow ring. + */ + + if (cache->len < RTE_I40E_RXQ_REARM_THRESH) { + /* No. Backfill the cache first, and then fill from it */ + uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size - + cache->len); + + /* How many do we require + * i.e. 
number to fill the cache + the request + */ + int ret = rte_mempool_ops_dequeue_bulk(rxq->mp, + &cache->objs[cache->len], req); + if (ret == 0) { + cache->len += req; + } else { + if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >= + rxq->nb_rx_desc) { + __m128i dma_addr0; + + dma_addr0 = _mm_setzero_si128(); + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { + rxep[i].mbuf = &rxq->fake_mbuf; + _mm_store_si128 + ((__m128i *)&rxdp[i].read, + dma_addr0); + } + } + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += + RTE_I40E_RXQ_REARM_THRESH; + return; + } + } + + type = I40E_DIRECT_REARM_TYPE_NORMAL; + } + + const __m512i iova_offsets = _mm512_set1_epi64 + (offsetof(struct rte_mbuf, buf_iova)); + const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); + +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC + /* to shuffle the addresses to correct slots. Values 4-7 will contain + * zeros, so use 7 for a zero-value. + */ + const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0); +#else + const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0); +#endif + + __m512i mbuf_ptrs; + + /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking + * from mempool cache and populating both shadow and HW rings + */ + for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) { + switch (type) { + case I40E_DIRECT_REARM_TYPE_FAST_FREE: + mbuf_ptrs = _mm512_loadu_si512(rxep); + break; + case I40E_DIRECT_REARM_TYPE_PRE_FREE: + mbuf_ptrs = _mm512_loadu_si512(&m[j]); + _mm512_store_si512(rxep, mbuf_ptrs); + j += 8; + break; + case I40E_DIRECT_REARM_TYPE_NORMAL: + mbuf_ptrs = _mm512_loadu_si512 + (&cache->objs[cache->len - 8]); + _mm512_store_si512(rxep, mbuf_ptrs); + cache->len -= 8; + break; + } + + /* gather iova of mbuf0-7 into one zmm reg */ + const __m512i iova_base_addrs = _mm512_i64gather_epi64 + (_mm512_add_epi64(mbuf_ptrs, iova_offsets), + 0, /* base */ + 1 /* scale */); + const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs, + headroom); +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC + const __m512i iovas0 = _mm512_castsi256_si512 + (_mm512_extracti64x4_epi64(iova_addrs, 0)); + const __m512i iovas1 = _mm512_castsi256_si512 + (_mm512_extracti64x4_epi64(iova_addrs, 1)); + + /* permute leaves desc 2-3 addresses in header address slots 0-1 + * but these are ignored by driver since header split not + * enabled. Similarly for desc 4 & 5. + */ + const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64 + (permute_idx, iovas0); + const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8); + + const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64 + (permute_idx, iovas1); + const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8); + + _mm512_store_si512((void *)rxdp, desc_rd_0_1); + _mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3); + _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5); + _mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7); +#else + /* permute leaves desc 4-7 addresses in header address slots 0-3 + * but these are ignored by driver since header split not + * enabled. 
+ */ + const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64 + (permute_idx, iova_addrs); + const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8); + + _mm512_store_si512((void *)rxdp, desc_rd_0_3); + _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7); +#endif + rxdp += 8, rxep += 8; + } + + /* Update the descriptor initializer index */ + rxq->rxrearm_start += n; + rx_id = rxq->rxrearm_start - 1; + + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc; + if (!rxq->rxrearm_start) + rx_id = rxq->nb_rx_desc - 1; + else + rx_id = rxq->rxrearm_start - 1; + } + + rxq->rxrearm_nb -= n; + + /* Update the tail pointer on the NIC */ + I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); +} + #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC /* Handles 32B descriptor FDIR ID processing: * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc @@ -252,8 +493,12 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) - i40e_rxq_rearm(rxq); + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) { + if (rxq->direct_rxrearm_enable) + i40e_rxq_direct_rearm(rxq); + else + i40e_rxq_rearm(rxq); + } /* Before we start moving massive data around, check to see if * there is actually a packet available diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index fa9e6582c5..dc78e3c90b 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -77,6 +77,139 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); } +static inline void +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq) +{ + struct rte_eth_dev *dev; + struct i40e_tx_queue *txq; + volatile union i40e_rx_desc *rxdp; + struct i40e_tx_entry *txep; + struct i40e_rx_entry *rxep; + uint16_t tx_port_id, tx_queue_id; + uint16_t rx_id; + struct rte_mbuf *mb0, *mb1, *m; + uint64x2_t dma_addr0, dma_addr1; + uint64x2_t zero = vdupq_n_u64(0); + uint64_t paddr; + uint16_t i, n; + uint16_t nb_rearm = 0; + + rxdp = rxq->rx_ring + rxq->rxrearm_start; + rxep = &rxq->sw_ring[rxq->rxrearm_start]; + + tx_port_id = rxq->direct_rxrearm_port; + tx_queue_id = rxq->direct_rxrearm_queue; + dev = &rte_eth_devices[tx_port_id]; + txq = dev->data->tx_queues[tx_queue_id]; + + /* check Rx queue is able to take in the whole + * batch of free mbufs from Tx queue + */ + if (rxq->rxrearm_nb > txq->tx_rs_thresh) { + /* check DD bits on threshold descriptor */ + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) { + goto mempool_bulk; + } + + n = txq->tx_rs_thresh; + + /* first buffer to free from S/W ring is at index + * tx_next_dd - (tx_rs_thresh-1) + */ + txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)]; + + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { + /* directly put mbufs from Tx to Rx, + * and initialize the mbufs in vector + */ + for (i = 0; i < n; i++, rxep++, txep++) { + rxep[0].mbuf = txep[0].mbuf; + + /* Initialize rxdp descs */ + mb0 = txep[0].mbuf; + + paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr0 = vdupq_n_u64(paddr); + /* flush desc with pa dma_addr */ + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0); + } + } else { + for (i = 0; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (m != NULL) { + 
rxep[i].mbuf = m; + + /* Initialize rxdp descs */ + paddr = m->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr0 = vdupq_n_u64(paddr); + /* flush desc with pa dma_addr */ + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0); + nb_rearm++; + } + } + n = nb_rearm; + } + + /* update counters for Tx */ + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); + if (txq->tx_next_dd >= txq->nb_tx_desc) + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); + } else { +mempool_bulk: + /* if TX did not free bufs into Rx sw-ring, + * get new bufs from mempool + */ + n = RTE_I40E_RXQ_REARM_THRESH; + if (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0)) { + if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) { + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { + rxep[i].mbuf = &rxq->fake_mbuf; + vst1q_u64((uint64_t *)&rxdp[i].read, zero); + } + } + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += n; + return; + } + + /* Initialize the mbufs in vector, process 2 mbufs in one loop */ + for (i = 0; i < n; i += 2, rxep += 2) { + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + + paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr0 = vdupq_n_u64(paddr); + /* flush desc with pa dma_addr */ + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0); + + paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr1 = vdupq_n_u64(paddr); + /* flush desc with pa dma_addr */ + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1); + } + } + + /* Update the descriptor initializer index */ + rxq->rxrearm_start += n; + rx_id = rxq->rxrearm_start - 1; + + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc; + if (!rxq->rxrearm_start) + rx_id = rxq->nb_rx_desc - 1; + else + rx_id = rxq->rxrearm_start - 1; + } + + rxq->rxrearm_nb -= n; + + rte_io_wmb(); + /* Update the tail pointer on the NIC */ + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); +} + #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC /* NEON version of FDIR mark extraction for 4 32B descriptors at a time */ static inline uint32x4_t @@ -381,8 +514,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq, /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) - i40e_rxq_rearm(rxq); + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) { + if (rxq->direct_rxrearm_enable) + i40e_rxq_direct_rearm(rxq); + else + i40e_rxq_rearm(rxq); + } /* Before we start moving massive data around, check to see if * there is actually a packet available diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c index 3782e8052f..b2f1ab2c8d 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_sse.c +++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c @@ -89,6 +89,168 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); } +static inline void +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq) +{ + struct rte_eth_dev *dev; + struct i40e_tx_queue *txq; + volatile union i40e_rx_desc *rxdp; + struct i40e_tx_entry *txep; + struct i40e_rx_entry *rxep; + uint16_t tx_port_id, tx_queue_id; + uint16_t rx_id; + struct rte_mbuf *mb0, *mb1, *m; + __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, + RTE_PKTMBUF_HEADROOM); + __m128i dma_addr0, dma_addr1; + __m128i vaddr0, vaddr1; + uint16_t i, n; + uint16_t nb_rearm = 0; + + rxdp = rxq->rx_ring + rxq->rxrearm_start; + rxep = &rxq->sw_ring[rxq->rxrearm_start]; + + tx_port_id = 
rxq->direct_rxrearm_port; + tx_queue_id = rxq->direct_rxrearm_queue; + dev = &rte_eth_devices[tx_port_id]; + txq = dev->data->tx_queues[tx_queue_id]; + + /* check Rx queue is able to take in the whole + * batch of free mbufs from Tx queue + */ + if (rxq->rxrearm_nb > txq->tx_rs_thresh) { + /* check DD bits on threshold descriptor */ + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) { + goto mempool_bulk; + } + + n = txq->tx_rs_thresh; + + /* first buffer to free from S/W ring is at index + * tx_next_dd - (tx_rs_thresh-1) + */ + txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)]; + + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { + /* directly put mbufs from Tx to Rx, + * and initialize the mbufs in vector + */ + for (i = 0; i < n; i++, rxep++, txep++) { + rxep[0].mbuf = txep[0].mbuf; + + /* Initialize rxdp descs */ + mb0 = txep[0].mbuf; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + + /* convert pa to dma_addr hdr/data */ + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); + + /* add headroom to pa values */ + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); + + /* flush desc with pa dma_addr */ + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); + } + } else { + for (i = 0; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (m != NULL) { + rxep[i].mbuf = m; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&m->buf_addr); + + /* convert pa to dma_addr hdr/data */ + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); + + /* add headroom to pa values */ + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); + + /* flush desc with pa dma_addr */ + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); + nb_rearm++; + } + } + n = nb_rearm; + } + + /* update counters for Tx */ + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); + if (txq->tx_next_dd >= txq->nb_tx_desc) + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); + } else { +mempool_bulk: + /* if TX did not free bufs into Rx sw-ring, + * get new bufs from mempool + */ + n = RTE_I40E_RXQ_REARM_THRESH; + /* Pull 'n' more MBUFs into the software ring */ + if (rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0) { + if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) { + dma_addr0 = _mm_setzero_si128(); + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { + rxep[i].mbuf = &rxq->fake_mbuf; + _mm_store_si128((__m128i *)&rxdp[i].read, + dma_addr0); + } + } + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += + RTE_I40E_RXQ_REARM_THRESH; + return; + } + + /* Initialize the mbufs in vector, process 2 mbufs in one loop */ + for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) { + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + + /* convert pa to dma_addr hdr/data */ + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); + dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); + + 
/* add headroom to pa values */ + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); + dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); + + /* flush desc with pa dma_addr */ + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1); + } + } + + /* Update the descriptor initializer index */ + rxq->rxrearm_start += n; + rx_id = rxq->rxrearm_start - 1; + + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc; + if (!rxq->rxrearm_start) + rx_id = rxq->nb_rx_desc - 1; + else + rx_id = rxq->rxrearm_start - 1; + } + + rxq->rxrearm_nb -= n; + + /* Update the tail pointer on the NIC */ + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); +} + #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC /* SSE version of FDIR mark extraction for 4 32B descriptors at a time */ static inline __m128i @@ -394,8 +556,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) - i40e_rxq_rearm(rxq); + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) { + if (rxq->direct_rxrearm_enable) + i40e_rxq_direct_rearm(rxq); + else + i40e_rxq_rearm(rxq); + } /* Before we start moving massive data around, check to see if * there is actually a packet available -- 2.25.1
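The SSE, AVX2, AVX512 and NEON paths in this patch all make the same recycling
decision before they diverge into vectorized descriptor writes. The condensed C
sketch below summarizes that decision only. It is illustrative, not driver or
ethdev API: the sw_entry/rx_queue_sketch/tx_queue_sketch types, the REARM_THRESH
constant and the dd_done/mempool_get_bulk callbacks are invented stand-ins, only
the fast-free branch is spelled out, and the descriptor writes and ring
wrap-around handling are reduced to comments.

#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#define REARM_THRESH 32 /* stand-in for RTE_I40E_RXQ_REARM_THRESH */

struct sw_entry { void *mbuf; }; /* stand-in for i40e_rx_entry/i40e_tx_entry */

struct tx_queue_sketch {
	struct sw_entry *sw_ring;
	uint16_t tx_rs_thresh;
	uint16_t tx_next_dd;
	uint16_t nb_tx_desc;
	uint16_t nb_tx_free;
	bool fast_free;                 /* RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE set */
	bool (*dd_done)(const struct tx_queue_sketch *txq); /* DD bit on tx_next_dd */
};

struct rx_queue_sketch {
	struct sw_entry *sw_ring;
	uint16_t rxrearm_start;
	uint16_t rxrearm_nb;
	int (*mempool_get_bulk)(void **objs, unsigned int n); /* mempool fallback */
};

/* Return the number of Rx slots re-armed; 0 if no buffers were available. */
static uint16_t
direct_rearm_sketch(struct rx_queue_sketch *rxq, struct tx_queue_sketch *txq)
{
	struct sw_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
	uint16_t n;

	/*
	 * Fast path: recycle buffers straight from the paired Tx queue when a
	 * full burst of tx_rs_thresh descriptors has completed (DD bit set),
	 * that burst size equals the Rx re-arm threshold, and fast free is on.
	 * (The patch also handles the non-fast-free case by first running
	 * rte_pktmbuf_prefree_seg() on each mbuf and bailing out if any
	 * buffer is still referenced.)
	 */
	if (rxq->rxrearm_nb > txq->tx_rs_thresh &&
	    txq->tx_rs_thresh == REARM_THRESH &&
	    txq->fast_free && txq->dd_done(txq)) {
		struct sw_entry *txep =
			&txq->sw_ring[txq->tx_next_dd - (txq->tx_rs_thresh - 1)];

		n = txq->tx_rs_thresh;
		/* Move the mbuf pointers verbatim from Tx sw_ring to Rx sw_ring */
		memcpy(rxep, txep, n * sizeof(*rxep));

		/* Tx bookkeeping: this burst now belongs to the Rx ring */
		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
		if (txq->tx_next_dd >= txq->nb_tx_desc)
			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
	} else {
		/* Slow path: behave like the normal re-arm and pull fresh
		 * buffers from the mempool.
		 */
		n = REARM_THRESH;
		if (rxq->mempool_get_bulk((void **)rxep, n) < 0)
			return 0;
	}

	/*
	 * The real driver then writes each mbuf's IOVA plus headroom into the
	 * Rx descriptors (the SSE/AVX2/AVX512/NEON loops above), handles
	 * wrap-around of rxrearm_start, and updates the queue tail register.
	 */
	rxq->rxrearm_start = (uint16_t)(rxq->rxrearm_start + n);
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb - n);
	return n;
}

The sketch makes the design constraint visible: the Rx path reads and updates
the paired Tx queue's sw_ring and counters directly, with no locking, so the
mapping is only safe when both queues are polled from the same lcore, which
matches the restriction stated in the commit message; whenever the completed
Tx burst does not line up with the re-arm threshold, the code simply falls
back to the regular mempool path.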