From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 2C72545E03; Mon, 2 Dec 2024 12:27:31 +0100 (CET) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 04ED640E36; Mon, 2 Dec 2024 12:26:22 +0100 (CET) Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) by mails.dpdk.org (Postfix) with ESMTP id 843FD40DDE for ; Mon, 2 Dec 2024 12:26:13 +0100 (CET) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1733138774; x=1764674774; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=VJUEg0zRcehPuVN8MZjX7LB35mYdT0ylY8UTTnjKoM4=; b=WD6GdifZSZfF7oDfcaSdbIhUJ2a+K3muMefoBr9hmMw4ynecfi7PGlSq +N19XmmTjVynuqcJLN6WCPBctP16ZFcP7Uq6j7oM69VKFNVWhTHwcTDCx QD8qaQXEBynrGxOi5f3N9P6Bd3r7keGr5aTugdjzR0JAlmR+RfI3u6vph YtQD6TSmGNl/K8nj9RL49U9epPCWqyQvdb716v4fLD5GM8AOtW9j305An YC+gnGXmkzLmWx0Q/9yUejOeq46NVBI7ixXOBA6ItWbc728wXMXaTJ6zr sAPfczAGAIsplOZgLDZAvVC4+G9YFmAy17kbcP0iFQUY/alaVTVlQT/6W Q==; X-CSE-ConnectionGUID: 4w0zL2RqS/KaFiIN2pzT5g== X-CSE-MsgGUID: BHDNND1wQrWNewLCdRzscw== X-IronPort-AV: E=McAfee;i="6700,10204,11273"; a="43786200" X-IronPort-AV: E=Sophos;i="6.12,202,1728975600"; d="scan'208";a="43786200" Received: from orviesa005.jf.intel.com ([10.64.159.145]) by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 02 Dec 2024 03:26:14 -0800 X-CSE-ConnectionGUID: xDe/7PUQTPamyTtolqP9WA== X-CSE-MsgGUID: KcOyGOYiQTCgUGhRshfHkQ== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="6.12,202,1728975600"; d="scan'208";a="98040317" Received: from silpixa00401197coob.ir.intel.com (HELO silpixa00401385.ir.intel.com) ([10.237.214.45]) by orviesa005.jf.intel.com with ESMTP; 02 Dec 2024 03:26:12 -0800 From: Bruce Richardson To: dev@dpdk.org Cc: Bruce Richardson , Konstantin Ananyev , Ian Stokes , Anatoly Burakov Subject: [PATCH v1 12/21] net/_common_intel: add Tx buffer free fn for AVX-512 Date: Mon, 2 Dec 2024 11:24:32 +0000 Message-ID: <20241202112444.1517416-13-bruce.richardson@intel.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20241202112444.1517416-1-bruce.richardson@intel.com> References: <20241122125418.2857301-1-bruce.richardson@intel.com> <20241202112444.1517416-1-bruce.richardson@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org AVX-512 code paths for ice and i40e drivers are common, and differ from the regular post-Tx free function in that the SW ring from which the buffers are freed does not contain anything other than the mbuf pointer. Merge these into a common function in intel_common to reduce duplication. Signed-off-by: Bruce Richardson --- drivers/net/_common_intel/tx.h | 93 +++++++++++++++++++ drivers/net/i40e/i40e_rxtx_vec_avx512.c | 114 +---------------------- drivers/net/ice/ice_rxtx_vec_avx512.c | 117 +----------------------- 3 files changed, 95 insertions(+), 229 deletions(-) diff --git a/drivers/net/_common_intel/tx.h b/drivers/net/_common_intel/tx.h index a930309c05..145501834a 100644 --- a/drivers/net/_common_intel/tx.h +++ b/drivers/net/_common_intel/tx.h @@ -178,4 +178,97 @@ ci_tx_free_bufs(struct ci_tx_queue *txq, ci_desc_done_fn desc_done) return txq->tx_rs_thresh; } +static __rte_always_inline int +ci_tx_free_bufs_vec(struct ci_tx_queue *txq, ci_desc_done_fn desc_done) +{ + int nb_free = 0; + struct rte_mbuf *free[IETH_VPMD_TX_MAX_FREE_BUF]; + struct rte_mbuf *m; + + /* check DD bits on threshold descriptor */ + if (!desc_done(txq, txq->tx_next_dd)) + return 0; + + const uint32_t n = txq->tx_rs_thresh; + + /* first buffer to free from S/W ring is at index + * tx_next_dd - (tx_rs_thresh - 1) + */ + struct ci_tx_entry_vec *txep = txq->sw_ring_vec; + txep += txq->tx_next_dd - (n - 1); + + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { + struct rte_mempool *mp = txep[0].mbuf->pool; + void **cache_objs; + struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, + rte_lcore_id()); + + if (!cache || cache->len == 0) + goto normal; + + cache_objs = &cache->objs[cache->len]; + + if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { + rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); + goto done; + } + + /* The cache follows the following algorithm + * 1. Add the objects to the cache + * 2. Anything greater than the cache min value (if it + * crosses the cache flush threshold) is flushed to the ring. + */ + /* Add elements back into the cache */ + uint32_t copied = 0; + /* n is multiple of 32 */ + while (copied < n) { + memcpy(&cache_objs[copied], &txep[copied], 32 * sizeof(void *)); + copied += 32; + } + cache->len += n; + + if (cache->len >= cache->flushthresh) { + rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size], + cache->len - cache->size); + cache->len = cache->size; + } + goto done; + } + +normal: + m = rte_pktmbuf_prefree_seg(txep[0].mbuf); + if (likely(m)) { + free[0] = m; + nb_free = 1; + for (uint32_t i = 1; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (likely(m)) { + if (likely(m->pool == free[0]->pool)) { + free[nb_free++] = m; + } else { + rte_mempool_put_bulk(free[0]->pool, (void *)free, nb_free); + free[0] = m; + nb_free = 1; + } + } + } + rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); + } else { + for (uint32_t i = 1; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (m) + rte_mempool_put(m->pool, m); + } + } + +done: + /* buffers were freed, update counters */ + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); + if (txq->tx_next_dd >= txq->nb_tx_desc) + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); + + return txq->tx_rs_thresh; +} + #endif /* _COMMON_INTEL_TX_H_ */ diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c index a3f6d1667f..9bb2a44231 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c @@ -754,118 +754,6 @@ i40e_recv_scattered_pkts_vec_avx512(void *rx_queue, rx_pkts + retval, nb_pkts); } -static __rte_always_inline int -i40e_tx_free_bufs_avx512(struct ci_tx_queue *txq) -{ - struct ci_tx_entry_vec *txep; - uint32_t n; - uint32_t i; - int nb_free = 0; - struct rte_mbuf *m, *free[RTE_I40E_TX_MAX_FREE_BUF_SZ]; - - /* check DD bits on threshold descriptor */ - if ((txq->i40e_tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & - rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != - rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) - return 0; - - n = txq->tx_rs_thresh; - - /* first buffer to free from S/W ring is at index - * tx_next_dd - (tx_rs_thresh-1) - */ - txep = (void *)txq->sw_ring; - txep += txq->tx_next_dd - (n - 1); - - if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { - struct rte_mempool *mp = txep[0].mbuf->pool; - void **cache_objs; - struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, - rte_lcore_id()); - - if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) { - rte_mempool_generic_put(mp, (void *)txep, n, cache); - goto done; - } - - cache_objs = &cache->objs[cache->len]; - - /* The cache follows the following algorithm - * 1. Add the objects to the cache - * 2. Anything greater than the cache min value (if it - * crosses the cache flush threshold) is flushed to the ring. - */ - /* Add elements back into the cache */ - uint32_t copied = 0; - /* n is multiple of 32 */ - while (copied < n) { -#ifdef RTE_ARCH_64 - const __m512i a = _mm512_load_si512(&txep[copied]); - const __m512i b = _mm512_load_si512(&txep[copied + 8]); - const __m512i c = _mm512_load_si512(&txep[copied + 16]); - const __m512i d = _mm512_load_si512(&txep[copied + 24]); - - _mm512_storeu_si512(&cache_objs[copied], a); - _mm512_storeu_si512(&cache_objs[copied + 8], b); - _mm512_storeu_si512(&cache_objs[copied + 16], c); - _mm512_storeu_si512(&cache_objs[copied + 24], d); -#else - const __m512i a = _mm512_load_si512(&txep[copied]); - const __m512i b = _mm512_load_si512(&txep[copied + 16]); - _mm512_storeu_si512(&cache_objs[copied], a); - _mm512_storeu_si512(&cache_objs[copied + 16], b); -#endif - copied += 32; - } - cache->len += n; - - if (cache->len >= cache->flushthresh) { - rte_mempool_ops_enqueue_bulk - (mp, &cache->objs[cache->size], - cache->len - cache->size); - cache->len = cache->size; - } - goto done; - } - - m = rte_pktmbuf_prefree_seg(txep[0].mbuf); - if (likely(m)) { - free[0] = m; - nb_free = 1; - for (i = 1; i < n; i++) { - rte_mbuf_prefetch_part2(txep[i + 3].mbuf); - m = rte_pktmbuf_prefree_seg(txep[i].mbuf); - if (likely(m)) { - if (likely(m->pool == free[0]->pool)) { - free[nb_free++] = m; - } else { - rte_mempool_put_bulk(free[0]->pool, - (void *)free, - nb_free); - free[0] = m; - nb_free = 1; - } - } - } - rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); - } else { - for (i = 1; i < n; i++) { - m = rte_pktmbuf_prefree_seg(txep[i].mbuf); - if (m) - rte_mempool_put(m->pool, m); - } - } - -done: - /* buffers were freed, update counters */ - txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); - txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); - if (txq->tx_next_dd >= txq->nb_tx_desc) - txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); - - return txq->tx_rs_thresh; -} - static inline void vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags) { @@ -941,7 +829,7 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD; if (txq->nb_tx_free < txq->tx_free_thresh) - i40e_tx_free_bufs_avx512(txq); + ci_tx_free_bufs_vec(txq, i40e_tx_desc_done); nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts); if (unlikely(nb_pkts == 0)) diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c index eabd8b04a0..538be707ef 100644 --- a/drivers/net/ice/ice_rxtx_vec_avx512.c +++ b/drivers/net/ice/ice_rxtx_vec_avx512.c @@ -859,121 +859,6 @@ ice_recv_scattered_pkts_vec_avx512_offload(void *rx_queue, rx_pkts + retval, nb_pkts); } -static __rte_always_inline int -ice_tx_free_bufs_avx512(struct ci_tx_queue *txq) -{ - struct ci_tx_entry_vec *txep; - uint32_t n; - uint32_t i; - int nb_free = 0; - struct rte_mbuf *m, *free[ICE_TX_MAX_FREE_BUF_SZ]; - - /* check DD bits on threshold descriptor */ - if ((txq->ice_tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & - rte_cpu_to_le_64(ICE_TXD_QW1_DTYPE_M)) != - rte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DESC_DONE)) - return 0; - - n = txq->tx_rs_thresh; - - /* first buffer to free from S/W ring is at index - * tx_next_dd - (tx_rs_thresh - 1) - */ - txep = (void *)txq->sw_ring; - txep += txq->tx_next_dd - (n - 1); - - if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { - struct rte_mempool *mp = txep[0].mbuf->pool; - void **cache_objs; - struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, - rte_lcore_id()); - - if (!cache || cache->len == 0) - goto normal; - - cache_objs = &cache->objs[cache->len]; - - if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { - rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); - goto done; - } - - /* The cache follows the following algorithm - * 1. Add the objects to the cache - * 2. Anything greater than the cache min value (if it - * crosses the cache flush threshold) is flushed to the ring. - */ - /* Add elements back into the cache */ - uint32_t copied = 0; - /* n is multiple of 32 */ - while (copied < n) { -#ifdef RTE_ARCH_64 - const __m512i a = _mm512_loadu_si512(&txep[copied]); - const __m512i b = _mm512_loadu_si512(&txep[copied + 8]); - const __m512i c = _mm512_loadu_si512(&txep[copied + 16]); - const __m512i d = _mm512_loadu_si512(&txep[copied + 24]); - - _mm512_storeu_si512(&cache_objs[copied], a); - _mm512_storeu_si512(&cache_objs[copied + 8], b); - _mm512_storeu_si512(&cache_objs[copied + 16], c); - _mm512_storeu_si512(&cache_objs[copied + 24], d); -#else - const __m512i a = _mm512_loadu_si512(&txep[copied]); - const __m512i b = _mm512_loadu_si512(&txep[copied + 16]); - _mm512_storeu_si512(&cache_objs[copied], a); - _mm512_storeu_si512(&cache_objs[copied + 16], b); -#endif - copied += 32; - } - cache->len += n; - - if (cache->len >= cache->flushthresh) { - rte_mempool_ops_enqueue_bulk - (mp, &cache->objs[cache->size], - cache->len - cache->size); - cache->len = cache->size; - } - goto done; - } - -normal: - m = rte_pktmbuf_prefree_seg(txep[0].mbuf); - if (likely(m)) { - free[0] = m; - nb_free = 1; - for (i = 1; i < n; i++) { - m = rte_pktmbuf_prefree_seg(txep[i].mbuf); - if (likely(m)) { - if (likely(m->pool == free[0]->pool)) { - free[nb_free++] = m; - } else { - rte_mempool_put_bulk(free[0]->pool, - (void *)free, - nb_free); - free[0] = m; - nb_free = 1; - } - } - } - rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); - } else { - for (i = 1; i < n; i++) { - m = rte_pktmbuf_prefree_seg(txep[i].mbuf); - if (m) - rte_mempool_put(m->pool, m); - } - } - -done: - /* buffers were freed, update counters */ - txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); - txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); - if (txq->tx_next_dd >= txq->nb_tx_desc) - txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); - - return txq->tx_rs_thresh; -} - static __rte_always_inline void ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags, bool do_offload) @@ -1064,7 +949,7 @@ ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh); if (txq->nb_tx_free < txq->tx_free_thresh) - ice_tx_free_bufs_avx512(txq); + ci_tx_free_bufs_vec(txq, ice_tx_desc_done); nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts); if (unlikely(nb_pkts == 0)) -- 2.43.0