Subject: Re: [PATCH v1 2/5] net/i40e: enable direct rearm mode
Date: Wed, 11 May 2022 23:28:37 +0100
From: Konstantin Ananyev
To: Feifei Wang, Beilei Xing, Bruce Richardson, Konstantin Ananyev, Ruifeng Wang
Cc: dev@dpdk.org, nd@arm.com, Honnappa Nagarahalli
Message-ID: <2ca1c24f-f196-dd3e-d582-f141177bc006@yandex.ru>
In-Reply-To: <20220420081650.2043183-3-feifei.wang2@arm.com>
References: <20220420081650.2043183-1-feifei.wang2@arm.com> <20220420081650.2043183-3-feifei.wang2@arm.com>

> For i40e driver, enable direct re-arm mode. This patch supports the case
> of mapping Rx/Tx queues from the same single lcore.
>
> Suggested-by: Honnappa Nagarahalli
> Signed-off-by: Feifei Wang
> Reviewed-by: Ruifeng Wang
> Reviewed-by: Honnappa Nagarahalli
> ---
>  drivers/net/i40e/i40e_rxtx.h            |   4 +
>  drivers/net/i40e/i40e_rxtx_common_avx.h | 269 ++++++++++++++++++++++++
>  drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  14 +-
>  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 249 +++++++++++++++++++++-
>  drivers/net/i40e/i40e_rxtx_vec_neon.c   | 141 ++++++++++++-
>  drivers/net/i40e/i40e_rxtx_vec_sse.c    | 170 ++++++++++++++-
>  6 files changed, 839 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
> index 5e6eecc501..1fdf4305f4 100644
> --- a/drivers/net/i40e/i40e_rxtx.h
> +++ b/drivers/net/i40e/i40e_rxtx.h
> @@ -102,6 +102,8 @@ struct i40e_rx_queue {
>
>  	uint16_t rxrearm_nb;	/**< number of remaining to be re-armed */
>  	uint16_t rxrearm_start;	/**< the idx we start the re-arming from */
> +	uint16_t direct_rxrearm_port; /** device TX port ID for direct re-arm mode */
> +	uint16_t direct_rxrearm_queue; /** TX queue index for direct re-arm mode */
>  	uint64_t mbuf_initializer; /**< value to init mbufs */
>
>  	uint16_t port_id; /**< device port ID */
> @@ -121,6 +123,8 @@ struct i40e_rx_queue {
>  	uint16_t rx_using_sse; /**<flag indicate the usage of vPMD for rx */
>  	uint8_t dcb_tc;         /**< Traffic class of rx queue */
>  	uint64_t offloads; /**< Rx offload flags of RTE_ETH_RX_OFFLOAD_* */
> +	/**< 0 if direct re-arm mode disabled, 1 when enabled */
> +	bool direct_rxrearm_enable;
>  	const struct rte_memzone *mz;
>  };
>
> diff --git a/drivers/net/i40e/i40e_rxtx_common_avx.h b/drivers/net/i40e/i40e_rxtx_common_avx.h
> index cfc1e63173..a742723e07 100644
> --- a/drivers/net/i40e/i40e_rxtx_common_avx.h
> +++ b/drivers/net/i40e/i40e_rxtx_common_avx.h
> @@ -209,6 +209,275 @@ i40e_rxq_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
>  	/* Update the tail pointer on the NIC */
>  	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
>  }
> +
> +static __rte_always_inline void
> +i40e_rxq_direct_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
> +{
> +	struct rte_eth_dev *dev;
> +	struct i40e_tx_queue *txq;
> +	volatile union i40e_rx_desc *rxdp;
> +	struct i40e_tx_entry *txep;
> +	struct i40e_rx_entry *rxep;
> +	struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
> +	uint16_t tx_port_id, tx_queue_id;
> +	uint16_t rx_id;
> +	uint16_t i, n;
> +	uint16_t nb_rearm = 0;
> +
> +	rxdp = rxq->rx_ring + rxq->rxrearm_start;
> +	rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> +	tx_port_id = rxq->direct_rxrearm_port;
> +	tx_queue_id = rxq->direct_rxrearm_queue;
> +	dev = &rte_eth_devices[tx_port_id];
> +	txq = dev->data->tx_queues[tx_queue_id];
> +
> +	/* check Rx queue is able to take in the whole
> +	 * batch of free mbufs from Tx queue
> +	 */
> +	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> +		/* check DD bits on threshold descriptor */
> +		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> +				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> +				rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> +			goto mempool_bulk;
> +		}
> +
> +		if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
> +			goto mempool_bulk;

I think all these checks (whether this mode can be enabled) should be done at
config phase, not at data-path.

> +
> +		n = txq->tx_rs_thresh;
> +
> +		/* first buffer to free from S/W ring is at index
> +		 * tx_next_dd - (tx_rs_thresh-1)
> +		 */
> +		txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];

It really looks bad that RX function accesses and modifies TXQ data directly.
Would be much better to hide TXD checking/manipulation into a separate TXQ function (txq_mbuf() or so) that RX path can invoke. > + > + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { > + /* directly put mbufs from Tx to Rx, > + * and initialize the mbufs in vector > + */ > + for (i = 0; i < n; i++) > + rxep[i].mbuf = txep[i].mbuf; > + } else { > + for (i = 0; i < n; i++) { > + m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf); > + /* ensure each Tx freed buffer is valid */ > + if (m[i] != NULL) > + nb_rearm++; > + } > + > + if (nb_rearm != n) { > + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); > + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); > + if (txq->tx_next_dd >= txq->nb_tx_desc) > + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); So if nb_rearm != 0 what would happen with mbufs collected in m[]? Are you just dropping/forgetting them? > + > + goto mempool_bulk; > + } else { > + for (i = 0; i < n; i++) > + rxep[i].mbuf = m[i]; > + } > + } > + > + /* update counters for Tx */ > + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); > + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); > + if (txq->tx_next_dd >= txq->nb_tx_desc) > + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); > + } else { I suppose the chunk of code below is just a copy&paste of exising i40e_rxq_direct_rearm_common()? If so, no point to duplicate it, better to just invoke it here (I presume a bit of re-factoring) would be need for that. Pretty much same thoughts for other rearm functions below. > +mempool_bulk: > + /* if TX did not free bufs into Rx sw-ring, > + * get new bufs from mempool > + */ > + n = RTE_I40E_RXQ_REARM_THRESH; > + > + /* Pull 'n' more MBUFs into the software ring */ > + if (rte_mempool_get_bulk(rxq->mp, > + (void *)rxep, > + RTE_I40E_RXQ_REARM_THRESH) < 0) { > + if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >= > + rxq->nb_rx_desc) { > + __m128i dma_addr0; > + dma_addr0 = _mm_setzero_si128(); > + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { > + rxep[i].mbuf = &rxq->fake_mbuf; > + _mm_store_si128((__m128i *)&rxdp[i].read, > + dma_addr0); > + } > + } > + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += > + RTE_I40E_RXQ_REARM_THRESH; > + return; > + } > + } > + > +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC > + struct rte_mbuf *mb0, *mb1; > + __m128i dma_addr0, dma_addr1; > + __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, > + RTE_PKTMBUF_HEADROOM); > + /* Initialize the mbufs in vector, process 2 mbufs in one loop */ > + for (i = 0; i < n; i += 2, rxep += 2) { > + __m128i vaddr0, vaddr1; > + > + mb0 = rxep[0].mbuf; > + mb1 = rxep[1].mbuf; > + > + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ > + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != > + offsetof(struct rte_mbuf, buf_addr) + 8); > + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); > + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); > + > + /* convert pa to dma_addr hdr/data */ > + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); > + dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); > + > + /* add headroom to pa values */ > + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); > + dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); > + > + /* flush desc with pa dma_addr */ > + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); > + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1); > + } > +#else > +#ifdef __AVX512VL__ > + if (avx512) { > + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; > + struct rte_mbuf *mb4, *mb5, *mb6, 
*mb7; > + __m512i dma_addr0_3, dma_addr4_7; > + __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); > + /* Initialize the mbufs in vector, process 8 mbufs in one loop */ > + for (i = 0; i < n; i += 8, rxep += 8, rxdp += 8) { > + __m128i vaddr0, vaddr1, vaddr2, vaddr3; > + __m128i vaddr4, vaddr5, vaddr6, vaddr7; > + __m256i vaddr0_1, vaddr2_3; > + __m256i vaddr4_5, vaddr6_7; > + __m512i vaddr0_3, vaddr4_7; > + > + mb0 = rxep[0].mbuf; > + mb1 = rxep[1].mbuf; > + mb2 = rxep[2].mbuf; > + mb3 = rxep[3].mbuf; > + mb4 = rxep[4].mbuf; > + mb5 = rxep[5].mbuf; > + mb6 = rxep[6].mbuf; > + mb7 = rxep[7].mbuf; > + > + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ > + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != > + offsetof(struct rte_mbuf, buf_addr) + 8); > + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); > + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); > + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); > + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); > + vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr); > + vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr); > + vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr); > + vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr); > + > + /** > + * merge 0 & 1, by casting 0 to 256-bit and inserting 1 > + * into the high lanes. Similarly for 2 & 3, and so on. > + */ > + vaddr0_1 = > + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0), > + vaddr1, 1); > + vaddr2_3 = > + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2), > + vaddr3, 1); > + vaddr4_5 = > + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4), > + vaddr5, 1); > + vaddr6_7 = > + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6), > + vaddr7, 1); > + vaddr0_3 = > + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1), > + vaddr2_3, 1); > + vaddr4_7 = > + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5), > + vaddr6_7, 1); > + > + /* convert pa to dma_addr hdr/data */ > + dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3); > + dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7); > + > + /* add headroom to pa values */ > + dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room); > + dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room); > + > + /* flush desc with pa dma_addr */ > + _mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3); > + _mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7); > + } > + } else { > +#endif /* __AVX512VL__*/ > + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; > + __m256i dma_addr0_1, dma_addr2_3; > + __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM); > + /* Initialize the mbufs in vector, process 4 mbufs in one loop */ > + for (i = 0; i < n; i += 4, rxep += 4, rxdp += 4) { > + __m128i vaddr0, vaddr1, vaddr2, vaddr3; > + __m256i vaddr0_1, vaddr2_3; > + > + mb0 = rxep[0].mbuf; > + mb1 = rxep[1].mbuf; > + mb2 = rxep[2].mbuf; > + mb3 = rxep[3].mbuf; > + > + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ > + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != > + offsetof(struct rte_mbuf, buf_addr) + 8); > + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); > + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); > + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); > + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); > + > + /** > + * merge 0 & 1, by casting 0 to 256-bit and inserting 1 > + * into the high lanes. 
Similarly for 2 & 3 > + */ > + vaddr0_1 = _mm256_inserti128_si256 > + (_mm256_castsi128_si256(vaddr0), vaddr1, 1); > + vaddr2_3 = _mm256_inserti128_si256 > + (_mm256_castsi128_si256(vaddr2), vaddr3, 1); > + > + /* convert pa to dma_addr hdr/data */ > + dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1); > + dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3); > + > + /* add headroom to pa values */ > + dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room); > + dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room); > + > + /* flush desc with pa dma_addr */ > + _mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1); > + _mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3); > + } > + } > + > +#endif > + > + /* Update the descriptor initializer index */ > + rxq->rxrearm_start += n; > + rx_id = rxq->rxrearm_start - 1; > + > + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { > + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc; > + if (!rxq->rxrearm_start) > + rx_id = rxq->nb_rx_desc - 1; > + else > + rx_id = rxq->rxrearm_start - 1; > + } > + > + rxq->rxrearm_nb -= n; > + > + /* Update the tail pointer on the NIC */ > + I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); > +} > #endif /* __AVX2__*/ > > #endif /*_I40E_RXTX_COMMON_AVX_H_*/ > diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c > index c73b2a321b..fcb7ba0273 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c > +++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c > @@ -25,6 +25,12 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) > return i40e_rxq_rearm_common(rxq, false); > } > > +static __rte_always_inline void > +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq) > +{ > + return i40e_rxq_direct_rearm_common(rxq, false); > +} > + > #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC > /* Handles 32B descriptor FDIR ID processing: > * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc > @@ -128,8 +134,12 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, > /* See if we need to rearm the RX queue - gives the prefetch a bit > * of time to act > */ > - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) > - i40e_rxq_rearm(rxq); > + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) { > + if (rxq->direct_rxrearm_enable) > + i40e_rxq_direct_rearm(rxq); > + else > + i40e_rxq_rearm(rxq); > + } > > /* Before we start moving massive data around, check to see if > * there is actually a packet available > diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c > index 2e8a3f0df6..d967095edc 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c > +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c > @@ -21,6 +21,12 @@ > > #define RTE_I40E_DESCS_PER_LOOP_AVX 8 > > +enum i40e_direct_rearm_type_value { > + I40E_DIRECT_REARM_TYPE_NORMAL = 0x0, > + I40E_DIRECT_REARM_TYPE_FAST_FREE = 0x1, > + I40E_DIRECT_REARM_TYPE_PRE_FREE = 0x2, > +}; > + > static __rte_always_inline void > i40e_rxq_rearm(struct i40e_rx_queue *rxq) > { > @@ -150,6 +156,241 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) > I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); > } > > +static __rte_always_inline void > +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq) > +{ > + struct rte_eth_dev *dev; > + struct i40e_tx_queue *txq; > + volatile union i40e_rx_desc *rxdp; > + struct i40e_vec_tx_entry *txep; > + struct i40e_rx_entry *rxep; > + struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH]; > + uint16_t tx_port_id, tx_queue_id; > + uint16_t rx_id; > + uint16_t i, n; > + uint16_t j 
= 0; > + uint16_t nb_rearm = 0; > + enum i40e_direct_rearm_type_value type; > + struct rte_mempool_cache *cache = NULL; > + > + rxdp = rxq->rx_ring + rxq->rxrearm_start; > + rxep = &rxq->sw_ring[rxq->rxrearm_start]; > + > + tx_port_id = rxq->direct_rxrearm_port; > + tx_queue_id = rxq->direct_rxrearm_queue; > + dev = &rte_eth_devices[tx_port_id]; > + txq = dev->data->tx_queues[tx_queue_id]; > + > + /* check Rx queue is able to take in the whole > + * batch of free mbufs from Tx queue > + */ > + if (rxq->rxrearm_nb > txq->tx_rs_thresh) { > + /* check DD bits on threshold descriptor */ > + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & > + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != > + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) { > + goto mempool_bulk; > + } > + > + if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH) > + goto mempool_bulk; > + > + n = txq->tx_rs_thresh; > + > + /* first buffer to free from S/W ring is at index > + * tx_next_dd - (tx_rs_thresh-1) > + */ > + txep = (void *)txq->sw_ring; > + txep += txq->tx_next_dd - (n - 1); > + > + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { > + /* directly put mbufs from Tx to Rx */ > + uint32_t copied = 0; > + /* n is multiple of 32 */ > + while (copied < n) { > + const __m512i a = _mm512_load_si512(&txep[copied]); > + const __m512i b = _mm512_load_si512(&txep[copied + 8]); > + const __m512i c = _mm512_load_si512(&txep[copied + 16]); > + const __m512i d = _mm512_load_si512(&txep[copied + 24]); > + > + _mm512_storeu_si512(&rxep[copied], a); > + _mm512_storeu_si512(&rxep[copied + 8], b); > + _mm512_storeu_si512(&rxep[copied + 16], c); > + _mm512_storeu_si512(&rxep[copied + 24], d); > + copied += 32; > + } > + type = I40E_DIRECT_REARM_TYPE_FAST_FREE; > + } else { > + for (i = 0; i < n; i++) { > + m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf); > + /* ensure each Tx freed buffer is valid */ > + if (m[i] != NULL) > + nb_rearm++; > + } > + > + if (nb_rearm != n) { > + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); > + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); > + if (txq->tx_next_dd >= txq->nb_tx_desc) > + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); > + > + goto mempool_bulk; > + } else { > + type = I40E_DIRECT_REARM_TYPE_PRE_FREE; > + } > + } > + > + /* update counters for Tx */ > + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); > + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); > + if (txq->tx_next_dd >= txq->nb_tx_desc) > + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); > + } else { > +mempool_bulk: > + cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id()); > + > + if (unlikely(!cache)) > + return i40e_rxq_rearm_common(rxq, true); > + > + n = RTE_I40E_RXQ_REARM_THRESH; > + > + /* We need to pull 'n' more MBUFs into the software ring from mempool > + * We inline the mempool function here, so we can vectorize the copy > + * from the cache into the shadow ring. > + */ > + > + if (cache->len < RTE_I40E_RXQ_REARM_THRESH) { > + /* No. Backfill the cache first, and then fill from it */ > + uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size - > + cache->len); > + > + /* How many do we require > + * i.e. 
number to fill the cache + the request > + */ > + int ret = rte_mempool_ops_dequeue_bulk(rxq->mp, > + &cache->objs[cache->len], req); > + if (ret == 0) { > + cache->len += req; > + } else { > + if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >= > + rxq->nb_rx_desc) { > + __m128i dma_addr0; > + > + dma_addr0 = _mm_setzero_si128(); > + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { > + rxep[i].mbuf = &rxq->fake_mbuf; > + _mm_store_si128 > + ((__m128i *)&rxdp[i].read, > + dma_addr0); > + } > + } > + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += > + RTE_I40E_RXQ_REARM_THRESH; > + return; > + } > + } > + > + type = I40E_DIRECT_REARM_TYPE_NORMAL; > + } > + > + const __m512i iova_offsets = _mm512_set1_epi64 > + (offsetof(struct rte_mbuf, buf_iova)); > + const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); > + > +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC > + /* to shuffle the addresses to correct slots. Values 4-7 will contain > + * zeros, so use 7 for a zero-value. > + */ > + const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0); > +#else > + const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0); > +#endif > + > + __m512i mbuf_ptrs; > + > + /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking > + * from mempool cache and populating both shadow and HW rings > + */ > + for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) { > + switch (type) { > + case I40E_DIRECT_REARM_TYPE_FAST_FREE: > + mbuf_ptrs = _mm512_loadu_si512(rxep); > + break; > + case I40E_DIRECT_REARM_TYPE_PRE_FREE: > + mbuf_ptrs = _mm512_loadu_si512(&m[j]); > + _mm512_store_si512(rxep, mbuf_ptrs); > + j += 8; > + break; > + case I40E_DIRECT_REARM_TYPE_NORMAL: > + mbuf_ptrs = _mm512_loadu_si512 > + (&cache->objs[cache->len - 8]); > + _mm512_store_si512(rxep, mbuf_ptrs); > + cache->len -= 8; > + break; > + } > + > + /* gather iova of mbuf0-7 into one zmm reg */ > + const __m512i iova_base_addrs = _mm512_i64gather_epi64 > + (_mm512_add_epi64(mbuf_ptrs, iova_offsets), > + 0, /* base */ > + 1 /* scale */); > + const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs, > + headroom); > +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC > + const __m512i iovas0 = _mm512_castsi256_si512 > + (_mm512_extracti64x4_epi64(iova_addrs, 0)); > + const __m512i iovas1 = _mm512_castsi256_si512 > + (_mm512_extracti64x4_epi64(iova_addrs, 1)); > + > + /* permute leaves desc 2-3 addresses in header address slots 0-1 > + * but these are ignored by driver since header split not > + * enabled. Similarly for desc 4 & 5. > + */ > + const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64 > + (permute_idx, iovas0); > + const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8); > + > + const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64 > + (permute_idx, iovas1); > + const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8); > + > + _mm512_store_si512((void *)rxdp, desc_rd_0_1); > + _mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3); > + _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5); > + _mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7); > +#else > + /* permute leaves desc 4-7 addresses in header address slots 0-3 > + * but these are ignored by driver since header split not > + * enabled. 
> + */ > + const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64 > + (permute_idx, iova_addrs); > + const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8); > + > + _mm512_store_si512((void *)rxdp, desc_rd_0_3); > + _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7); > +#endif > + rxdp += 8, rxep += 8; > + } > + > + /* Update the descriptor initializer index */ > + rxq->rxrearm_start += n; > + rx_id = rxq->rxrearm_start - 1; > + > + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { > + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc; > + if (!rxq->rxrearm_start) > + rx_id = rxq->nb_rx_desc - 1; > + else > + rx_id = rxq->rxrearm_start - 1; > + } > + > + rxq->rxrearm_nb -= n; > + > + /* Update the tail pointer on the NIC */ > + I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); > +} > + > #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC > /* Handles 32B descriptor FDIR ID processing: > * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc > @@ -252,8 +493,12 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, > /* See if we need to rearm the RX queue - gives the prefetch a bit > * of time to act > */ > - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) > - i40e_rxq_rearm(rxq); > + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) { > + if (rxq->direct_rxrearm_enable) > + i40e_rxq_direct_rearm(rxq); > + else > + i40e_rxq_rearm(rxq); > + } > > /* Before we start moving massive data around, check to see if > * there is actually a packet available > diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c > index fa9e6582c5..dc78e3c90b 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c > +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c > @@ -77,6 +77,139 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) > I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); > } > > +static inline void > +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq) > +{ > + struct rte_eth_dev *dev; > + struct i40e_tx_queue *txq; > + volatile union i40e_rx_desc *rxdp; > + struct i40e_tx_entry *txep; > + struct i40e_rx_entry *rxep; > + uint16_t tx_port_id, tx_queue_id; > + uint16_t rx_id; > + struct rte_mbuf *mb0, *mb1, *m; > + uint64x2_t dma_addr0, dma_addr1; > + uint64x2_t zero = vdupq_n_u64(0); > + uint64_t paddr; > + uint16_t i, n; > + uint16_t nb_rearm = 0; > + > + rxdp = rxq->rx_ring + rxq->rxrearm_start; > + rxep = &rxq->sw_ring[rxq->rxrearm_start]; > + > + tx_port_id = rxq->direct_rxrearm_port; > + tx_queue_id = rxq->direct_rxrearm_queue; > + dev = &rte_eth_devices[tx_port_id]; > + txq = dev->data->tx_queues[tx_queue_id]; > + > + /* check Rx queue is able to take in the whole > + * batch of free mbufs from Tx queue > + */ > + if (rxq->rxrearm_nb > txq->tx_rs_thresh) { > + /* check DD bits on threshold descriptor */ > + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & > + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != > + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) { > + goto mempool_bulk; > + } > + > + n = txq->tx_rs_thresh; > + > + /* first buffer to free from S/W ring is at index > + * tx_next_dd - (tx_rs_thresh-1) > + */ > + txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)]; > + > + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { > + /* directly put mbufs from Tx to Rx, > + * and initialize the mbufs in vector > + */ > + for (i = 0; i < n; i++, rxep++, txep++) { > + rxep[0].mbuf = txep[0].mbuf; > + > + /* Initialize rxdp descs */ > + mb0 = txep[0].mbuf; > + > + paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM; > + 
dma_addr0 = vdupq_n_u64(paddr); > + /* flush desc with pa dma_addr */ > + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0); > + } > + } else { > + for (i = 0; i < n; i++) { > + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); > + if (m != NULL) { > + rxep[i].mbuf = m; > + > + /* Initialize rxdp descs */ > + paddr = m->buf_iova + RTE_PKTMBUF_HEADROOM; > + dma_addr0 = vdupq_n_u64(paddr); > + /* flush desc with pa dma_addr */ > + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0); > + nb_rearm++; > + } > + } > + n = nb_rearm; > + } > + > + /* update counters for Tx */ > + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); > + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); > + if (txq->tx_next_dd >= txq->nb_tx_desc) > + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); > + } else { > +mempool_bulk: > + /* if TX did not free bufs into Rx sw-ring, > + * get new bufs from mempool > + */ > + n = RTE_I40E_RXQ_REARM_THRESH; > + if (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0)) { > + if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) { > + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { > + rxep[i].mbuf = &rxq->fake_mbuf; > + vst1q_u64((uint64_t *)&rxdp[i].read, zero); > + } > + } > + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += n; > + return; > + } > + > + /* Initialize the mbufs in vector, process 2 mbufs in one loop */ > + for (i = 0; i < n; i += 2, rxep += 2) { > + mb0 = rxep[0].mbuf; > + mb1 = rxep[1].mbuf; > + > + paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM; > + dma_addr0 = vdupq_n_u64(paddr); > + /* flush desc with pa dma_addr */ > + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0); > + > + paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM; > + dma_addr1 = vdupq_n_u64(paddr); > + /* flush desc with pa dma_addr */ > + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1); > + } > + } > + > + /* Update the descriptor initializer index */ > + rxq->rxrearm_start += n; > + rx_id = rxq->rxrearm_start - 1; > + > + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { > + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc; > + if (!rxq->rxrearm_start) > + rx_id = rxq->nb_rx_desc - 1; > + else > + rx_id = rxq->rxrearm_start - 1; > + } > + > + rxq->rxrearm_nb -= n; > + > + rte_io_wmb(); > + /* Update the tail pointer on the NIC */ > + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); > +} > + > #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC > /* NEON version of FDIR mark extraction for 4 32B descriptors at a time */ > static inline uint32x4_t > @@ -381,8 +514,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq, > /* See if we need to rearm the RX queue - gives the prefetch a bit > * of time to act > */ > - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) > - i40e_rxq_rearm(rxq); > + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) { > + if (rxq->direct_rxrearm_enable) > + i40e_rxq_direct_rearm(rxq); > + else > + i40e_rxq_rearm(rxq); > + } > > /* Before we start moving massive data around, check to see if > * there is actually a packet available > diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c > index 3782e8052f..b2f1ab2c8d 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_sse.c > +++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c > @@ -89,6 +89,168 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) > I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); > } > > +static inline void > +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq) > +{ > + struct rte_eth_dev *dev; > + struct i40e_tx_queue *txq; > + volatile union i40e_rx_desc *rxdp; 
> + struct i40e_tx_entry *txep; > + struct i40e_rx_entry *rxep; > + uint16_t tx_port_id, tx_queue_id; > + uint16_t rx_id; > + struct rte_mbuf *mb0, *mb1, *m; > + __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, > + RTE_PKTMBUF_HEADROOM); > + __m128i dma_addr0, dma_addr1; > + __m128i vaddr0, vaddr1; > + uint16_t i, n; > + uint16_t nb_rearm = 0; > + > + rxdp = rxq->rx_ring + rxq->rxrearm_start; > + rxep = &rxq->sw_ring[rxq->rxrearm_start]; > + > + tx_port_id = rxq->direct_rxrearm_port; > + tx_queue_id = rxq->direct_rxrearm_queue; > + dev = &rte_eth_devices[tx_port_id]; > + txq = dev->data->tx_queues[tx_queue_id]; > + > + /* check Rx queue is able to take in the whole > + * batch of free mbufs from Tx queue > + */ > + if (rxq->rxrearm_nb > txq->tx_rs_thresh) { > + /* check DD bits on threshold descriptor */ > + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & > + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != > + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) { > + goto mempool_bulk; > + } > + > + n = txq->tx_rs_thresh; > + > + /* first buffer to free from S/W ring is at index > + * tx_next_dd - (tx_rs_thresh-1) > + */ > + txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)]; > + > + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { > + /* directly put mbufs from Tx to Rx, > + * and initialize the mbufs in vector > + */ > + for (i = 0; i < n; i++, rxep++, txep++) { > + rxep[0].mbuf = txep[0].mbuf; > + > + /* Initialize rxdp descs */ > + mb0 = txep[0].mbuf; > + > + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ > + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != > + offsetof(struct rte_mbuf, buf_addr) + 8); > + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); > + > + /* convert pa to dma_addr hdr/data */ > + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); > + > + /* add headroom to pa values */ > + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); > + > + /* flush desc with pa dma_addr */ > + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); > + } > + } else { > + for (i = 0; i < n; i++) { > + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); > + if (m != NULL) { > + rxep[i].mbuf = m; > + > + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ > + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != > + offsetof(struct rte_mbuf, buf_addr) + 8); > + vaddr0 = _mm_loadu_si128((__m128i *)&m->buf_addr); > + > + /* convert pa to dma_addr hdr/data */ > + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); > + > + /* add headroom to pa values */ > + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); > + > + /* flush desc with pa dma_addr */ > + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); > + nb_rearm++; > + } > + } > + n = nb_rearm; > + } > + > + /* update counters for Tx */ > + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); > + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); > + if (txq->tx_next_dd >= txq->nb_tx_desc) > + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); > + } else { > +mempool_bulk: > + /* if TX did not free bufs into Rx sw-ring, > + * get new bufs from mempool > + */ > + n = RTE_I40E_RXQ_REARM_THRESH; > + /* Pull 'n' more MBUFs into the software ring */ > + if (rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0) { > + if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) { > + dma_addr0 = _mm_setzero_si128(); > + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { > + rxep[i].mbuf = &rxq->fake_mbuf; > + _mm_store_si128((__m128i *)&rxdp[i].read, > + dma_addr0); > + } > + } > + 
rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += > + RTE_I40E_RXQ_REARM_THRESH; > + return; > + } > + > + /* Initialize the mbufs in vector, process 2 mbufs in one loop */ > + for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) { > + mb0 = rxep[0].mbuf; > + mb1 = rxep[1].mbuf; > + > + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ > + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != > + offsetof(struct rte_mbuf, buf_addr) + 8); > + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); > + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); > + > + /* convert pa to dma_addr hdr/data */ > + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); > + dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); > + > + /* add headroom to pa values */ > + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); > + dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); > + > + /* flush desc with pa dma_addr */ > + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); > + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1); > + } > + } > + > + /* Update the descriptor initializer index */ > + rxq->rxrearm_start += n; > + rx_id = rxq->rxrearm_start - 1; > + > + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { > + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc; > + if (!rxq->rxrearm_start) > + rx_id = rxq->nb_rx_desc - 1; > + else > + rx_id = rxq->rxrearm_start - 1; > + } > + > + rxq->rxrearm_nb -= n; > + > + /* Update the tail pointer on the NIC */ > + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); > +} > + > #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC > /* SSE version of FDIR mark extraction for 4 32B descriptors at a time */ > static inline __m128i > @@ -394,8 +556,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, > /* See if we need to rearm the RX queue - gives the prefetch a bit > * of time to act > */ > - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) > - i40e_rxq_rearm(rxq); > + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) { > + if (rxq->direct_rxrearm_enable) > + i40e_rxq_direct_rearm(rxq); > + else > + i40e_rxq_rearm(rxq); > + } > > /* Before we start moving massive data around, check to see if > * there is actually a packet available
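
To illustrate the "separate TXQ function" idea above - a rough sketch only:
the helper name and signature are made up, it just reuses the fields and
macros already present in the patch, and the partial-free case still needs a
decision about what to do with mbufs that could not be handed over:

/* TX-side helper: hand up to txq->tx_rs_thresh freed mbufs to the caller.
 * Returns the number of mbufs written into pkts[]. All TXQ state is read
 * and updated here, so the RX re-arm path never touches TXQ internals.
 */
static inline uint16_t
i40e_tx_fill_sw_ring(struct i40e_tx_queue *txq, struct rte_mbuf **pkts)
{
	struct i40e_tx_entry *txep;
	uint16_t i, n = 0;

	/* threshold descriptor not done yet - nothing to give back */
	if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
		return 0;

	/* first buffer to free from S/W ring is at index
	 * tx_next_dd - (tx_rs_thresh - 1)
	 */
	txep = &txq->sw_ring[txq->tx_next_dd - (txq->tx_rs_thresh - 1)];

	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
		for (i = 0; i < txq->tx_rs_thresh; i++)
			pkts[n++] = txep[i].mbuf;
	} else {
		for (i = 0; i < txq->tx_rs_thresh; i++) {
			struct rte_mbuf *mb = rte_pktmbuf_prefree_seg(txep[i].mbuf);

			if (mb != NULL)
				pkts[n++] = mb;
		}
	}

	/* TXQ counters are updated by the TX code, not by the RX path */
	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
	if (txq->tx_next_dd >= txq->nb_tx_desc)
		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);

	return n;
}

Then the RX re-arm code would only need something like:
	n = i40e_tx_fill_sw_ring(txq, m);
and fall back to the existing mempool path whenever n doesn't cover the whole
re-arm burst.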