From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id DFB3445EFA; Fri, 20 Dec 2024 15:41:22 +0100 (CET) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 13DBF40654; Fri, 20 Dec 2024 15:40:21 +0100 (CET) Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.14]) by mails.dpdk.org (Postfix) with ESMTP id D8223402A3 for ; Fri, 20 Dec 2024 15:40:19 +0100 (CET) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1734705620; x=1766241620; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=slIddy8394MDjMPW1OmsVqb+W+d3D2m7+4bX0XmJrJ4=; b=V6SYN/l5/UnJlF3YvI72JGRZHpt2CtuR8sMXPfM+iW5tLpe0AkMnVeCm 7Fm0mBcb1uQJYXum5QZVdCdO3QojvD4PWul9DfvP448XG4OaUCHmu+2b5 Og+BKrvx8OhWmNJTcB03V8N4hU6O/FPffiw38EatnKgYs03X8DjpRSAL2 5pHRX+CwPt2qPyKkLjSvvY/TIVaObk7QV7VgtcFEc38dqoVLd8/TAQEbp qmjwXHOvTxAoj8FshsmwPRX6YtbNFvQaULfkDX5QvNvnD437Nb475PY6v gnc/EdawfbRw5FmD0w6Uuytn3Lh6ltSTz8lxTzrVxqvqN1VE2/GT+uG6v A==; X-CSE-ConnectionGUID: KbuU+0d4THu+ipPvWvil7Q== X-CSE-MsgGUID: VoTd6+9NQVmSy3ahjy83GA== X-IronPort-AV: E=McAfee;i="6700,10204,11292"; a="39040208" X-IronPort-AV: E=Sophos;i="6.12,251,1728975600"; d="scan'208";a="39040208" Received: from fmviesa006.fm.intel.com ([10.60.135.146]) by orvoesa106.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Dec 2024 06:40:20 -0800 X-CSE-ConnectionGUID: g6eEX14aTN+Wow34J9FS+A== X-CSE-MsgGUID: Ac5s0tYbTD2obUWCfQTjVA== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="6.12,251,1728975600"; d="scan'208";a="98310706" Received: from silpixa00401197coob.ir.intel.com (HELO silpixa00401385.ir.intel.com) ([10.237.214.45]) by fmviesa006.fm.intel.com with ESMTP; 20 Dec 2024 06:40:16 -0800 From: Bruce Richardson To: dev@dpdk.org Cc: Bruce Richardson , Konstantin Ananyev , Ian Stokes , Anatoly Burakov Subject: [PATCH v4 12/24] net/_common_intel: add Tx buffer free fn for AVX-512 Date: Fri, 20 Dec 2024 14:39:09 +0000 Message-ID: <20241220143925.609044-13-bruce.richardson@intel.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20241220143925.609044-1-bruce.richardson@intel.com> References: <20241122125418.2857301-1-bruce.richardson@intel.com> <20241220143925.609044-1-bruce.richardson@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org AVX-512 code paths for ice and i40e drivers are common, and differ from the regular post-Tx free function in that the SW ring from which the buffers are freed does not contain anything other than the mbuf pointer. Merge these into a common function in intel_common to reduce duplication. Signed-off-by: Bruce Richardson --- drivers/net/_common_intel/tx.h | 92 +++++++++++++++++++ drivers/net/i40e/i40e_rxtx_vec_avx512.c | 114 +---------------------- drivers/net/ice/ice_rxtx_vec_avx512.c | 117 +----------------------- 3 files changed, 94 insertions(+), 229 deletions(-) diff --git a/drivers/net/_common_intel/tx.h b/drivers/net/_common_intel/tx.h index a930309c05..84ff839672 100644 --- a/drivers/net/_common_intel/tx.h +++ b/drivers/net/_common_intel/tx.h @@ -178,4 +178,96 @@ ci_tx_free_bufs(struct ci_tx_queue *txq, ci_desc_done_fn desc_done) return txq->tx_rs_thresh; } +static __rte_always_inline int +ci_tx_free_bufs_vec(struct ci_tx_queue *txq, ci_desc_done_fn desc_done) +{ + int nb_free = 0; + struct rte_mbuf *free[IETH_VPMD_TX_MAX_FREE_BUF]; + struct rte_mbuf *m; + + /* check DD bits on threshold descriptor */ + if (!desc_done(txq, txq->tx_next_dd)) + return 0; + + const uint32_t n = txq->tx_rs_thresh; + + /* first buffer to free from S/W ring is at index + * tx_next_dd - (tx_rs_thresh - 1) + */ + struct ci_tx_entry_vec *txep = txq->sw_ring_vec; + txep += txq->tx_next_dd - (n - 1); + + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { + struct rte_mempool *mp = txep[0].mbuf->pool; + void **cache_objs; + struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, rte_lcore_id()); + + if (!cache || cache->len == 0) + goto normal; + + cache_objs = &cache->objs[cache->len]; + + if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { + rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); + goto done; + } + + /* The cache follows the following algorithm + * 1. Add the objects to the cache + * 2. Anything greater than the cache min value (if it + * crosses the cache flush threshold) is flushed to the ring. + */ + /* Add elements back into the cache */ + uint32_t copied = 0; + /* n is multiple of 32 */ + while (copied < n) { + memcpy(&cache_objs[copied], &txep[copied], 32 * sizeof(void *)); + copied += 32; + } + cache->len += n; + + if (cache->len >= cache->flushthresh) { + rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size], + cache->len - cache->size); + cache->len = cache->size; + } + goto done; + } + +normal: + m = rte_pktmbuf_prefree_seg(txep[0].mbuf); + if (likely(m)) { + free[0] = m; + nb_free = 1; + for (uint32_t i = 1; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (likely(m)) { + if (likely(m->pool == free[0]->pool)) { + free[nb_free++] = m; + } else { + rte_mempool_put_bulk(free[0]->pool, (void *)free, nb_free); + free[0] = m; + nb_free = 1; + } + } + } + rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); + } else { + for (uint32_t i = 1; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (m) + rte_mempool_put(m->pool, m); + } + } + +done: + /* buffers were freed, update counters */ + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); + if (txq->tx_next_dd >= txq->nb_tx_desc) + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); + + return txq->tx_rs_thresh; +} + #endif /* _COMMON_INTEL_TX_H_ */ diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c index a3f6d1667f..9bb2a44231 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c @@ -754,118 +754,6 @@ i40e_recv_scattered_pkts_vec_avx512(void *rx_queue, rx_pkts + retval, nb_pkts); } -static __rte_always_inline int -i40e_tx_free_bufs_avx512(struct ci_tx_queue *txq) -{ - struct ci_tx_entry_vec *txep; - uint32_t n; - uint32_t i; - int nb_free = 0; - struct rte_mbuf *m, *free[RTE_I40E_TX_MAX_FREE_BUF_SZ]; - - /* check DD bits on threshold descriptor */ - if ((txq->i40e_tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & - rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != - rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) - return 0; - - n = txq->tx_rs_thresh; - - /* first buffer to free from S/W ring is at index - * tx_next_dd - (tx_rs_thresh-1) - */ - txep = (void *)txq->sw_ring; - txep += txq->tx_next_dd - (n - 1); - - if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { - struct rte_mempool *mp = txep[0].mbuf->pool; - void **cache_objs; - struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, - rte_lcore_id()); - - if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) { - rte_mempool_generic_put(mp, (void *)txep, n, cache); - goto done; - } - - cache_objs = &cache->objs[cache->len]; - - /* The cache follows the following algorithm - * 1. Add the objects to the cache - * 2. Anything greater than the cache min value (if it - * crosses the cache flush threshold) is flushed to the ring. - */ - /* Add elements back into the cache */ - uint32_t copied = 0; - /* n is multiple of 32 */ - while (copied < n) { -#ifdef RTE_ARCH_64 - const __m512i a = _mm512_load_si512(&txep[copied]); - const __m512i b = _mm512_load_si512(&txep[copied + 8]); - const __m512i c = _mm512_load_si512(&txep[copied + 16]); - const __m512i d = _mm512_load_si512(&txep[copied + 24]); - - _mm512_storeu_si512(&cache_objs[copied], a); - _mm512_storeu_si512(&cache_objs[copied + 8], b); - _mm512_storeu_si512(&cache_objs[copied + 16], c); - _mm512_storeu_si512(&cache_objs[copied + 24], d); -#else - const __m512i a = _mm512_load_si512(&txep[copied]); - const __m512i b = _mm512_load_si512(&txep[copied + 16]); - _mm512_storeu_si512(&cache_objs[copied], a); - _mm512_storeu_si512(&cache_objs[copied + 16], b); -#endif - copied += 32; - } - cache->len += n; - - if (cache->len >= cache->flushthresh) { - rte_mempool_ops_enqueue_bulk - (mp, &cache->objs[cache->size], - cache->len - cache->size); - cache->len = cache->size; - } - goto done; - } - - m = rte_pktmbuf_prefree_seg(txep[0].mbuf); - if (likely(m)) { - free[0] = m; - nb_free = 1; - for (i = 1; i < n; i++) { - rte_mbuf_prefetch_part2(txep[i + 3].mbuf); - m = rte_pktmbuf_prefree_seg(txep[i].mbuf); - if (likely(m)) { - if (likely(m->pool == free[0]->pool)) { - free[nb_free++] = m; - } else { - rte_mempool_put_bulk(free[0]->pool, - (void *)free, - nb_free); - free[0] = m; - nb_free = 1; - } - } - } - rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); - } else { - for (i = 1; i < n; i++) { - m = rte_pktmbuf_prefree_seg(txep[i].mbuf); - if (m) - rte_mempool_put(m->pool, m); - } - } - -done: - /* buffers were freed, update counters */ - txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); - txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); - if (txq->tx_next_dd >= txq->nb_tx_desc) - txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); - - return txq->tx_rs_thresh; -} - static inline void vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags) { @@ -941,7 +829,7 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD; if (txq->nb_tx_free < txq->tx_free_thresh) - i40e_tx_free_bufs_avx512(txq); + ci_tx_free_bufs_vec(txq, i40e_tx_desc_done); nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts); if (unlikely(nb_pkts == 0)) diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c index eabd8b04a0..538be707ef 100644 --- a/drivers/net/ice/ice_rxtx_vec_avx512.c +++ b/drivers/net/ice/ice_rxtx_vec_avx512.c @@ -859,121 +859,6 @@ ice_recv_scattered_pkts_vec_avx512_offload(void *rx_queue, rx_pkts + retval, nb_pkts); } -static __rte_always_inline int -ice_tx_free_bufs_avx512(struct ci_tx_queue *txq) -{ - struct ci_tx_entry_vec *txep; - uint32_t n; - uint32_t i; - int nb_free = 0; - struct rte_mbuf *m, *free[ICE_TX_MAX_FREE_BUF_SZ]; - - /* check DD bits on threshold descriptor */ - if ((txq->ice_tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & - rte_cpu_to_le_64(ICE_TXD_QW1_DTYPE_M)) != - rte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DESC_DONE)) - return 0; - - n = txq->tx_rs_thresh; - - /* first buffer to free from S/W ring is at index - * tx_next_dd - (tx_rs_thresh - 1) - */ - txep = (void *)txq->sw_ring; - txep += txq->tx_next_dd - (n - 1); - - if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { - struct rte_mempool *mp = txep[0].mbuf->pool; - void **cache_objs; - struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, - rte_lcore_id()); - - if (!cache || cache->len == 0) - goto normal; - - cache_objs = &cache->objs[cache->len]; - - if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { - rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); - goto done; - } - - /* The cache follows the following algorithm - * 1. Add the objects to the cache - * 2. Anything greater than the cache min value (if it - * crosses the cache flush threshold) is flushed to the ring. - */ - /* Add elements back into the cache */ - uint32_t copied = 0; - /* n is multiple of 32 */ - while (copied < n) { -#ifdef RTE_ARCH_64 - const __m512i a = _mm512_loadu_si512(&txep[copied]); - const __m512i b = _mm512_loadu_si512(&txep[copied + 8]); - const __m512i c = _mm512_loadu_si512(&txep[copied + 16]); - const __m512i d = _mm512_loadu_si512(&txep[copied + 24]); - - _mm512_storeu_si512(&cache_objs[copied], a); - _mm512_storeu_si512(&cache_objs[copied + 8], b); - _mm512_storeu_si512(&cache_objs[copied + 16], c); - _mm512_storeu_si512(&cache_objs[copied + 24], d); -#else - const __m512i a = _mm512_loadu_si512(&txep[copied]); - const __m512i b = _mm512_loadu_si512(&txep[copied + 16]); - _mm512_storeu_si512(&cache_objs[copied], a); - _mm512_storeu_si512(&cache_objs[copied + 16], b); -#endif - copied += 32; - } - cache->len += n; - - if (cache->len >= cache->flushthresh) { - rte_mempool_ops_enqueue_bulk - (mp, &cache->objs[cache->size], - cache->len - cache->size); - cache->len = cache->size; - } - goto done; - } - -normal: - m = rte_pktmbuf_prefree_seg(txep[0].mbuf); - if (likely(m)) { - free[0] = m; - nb_free = 1; - for (i = 1; i < n; i++) { - m = rte_pktmbuf_prefree_seg(txep[i].mbuf); - if (likely(m)) { - if (likely(m->pool == free[0]->pool)) { - free[nb_free++] = m; - } else { - rte_mempool_put_bulk(free[0]->pool, - (void *)free, - nb_free); - free[0] = m; - nb_free = 1; - } - } - } - rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); - } else { - for (i = 1; i < n; i++) { - m = rte_pktmbuf_prefree_seg(txep[i].mbuf); - if (m) - rte_mempool_put(m->pool, m); - } - } - -done: - /* buffers were freed, update counters */ - txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh); - txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh); - if (txq->tx_next_dd >= txq->nb_tx_desc) - txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1); - - return txq->tx_rs_thresh; -} - static __rte_always_inline void ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags, bool do_offload) @@ -1064,7 +949,7 @@ ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh); if (txq->nb_tx_free < txq->tx_free_thresh) - ice_tx_free_bufs_avx512(txq); + ci_tx_free_bufs_vec(txq, ice_tx_desc_done); nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts); if (unlikely(nb_pkts == 0)) -- 2.43.0