From mboxrd@z Thu Jan 1 00:00:00 1970
From: Bruce Richardson
To: dev@dpdk.org
Cc: Bruce Richardson, Konstantin Ananyev, Ian Stokes, Anatoly Burakov
Subject: [RFC PATCH 12/21] common/intel_eth: add Tx buffer free fn for AVX-512
Date: Fri, 22 Nov 2024 12:54:05 +0000
Message-ID: <20241122125418.2857301-13-bruce.richardson@intel.com>
X-Mailer: git-send-email 2.43.0
In-Reply-To: <20241122125418.2857301-1-bruce.richardson@intel.com>
References: <20241122125418.2857301-1-bruce.richardson@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
List-Id: DPDK patches and discussions

The AVX-512 code paths for the ice and i40e drivers are common, differing
from the regular post-Tx free function only in that the SW ring from which
the buffers are freed contains nothing other than the mbuf pointer. Merge
them into a common function in common/intel_eth, saving duplication.

Signed-off-by: Bruce Richardson
---
 .../common/intel_eth/ieth_rxtx_vec_common.h |  93 ++++++++++++++
 drivers/net/i40e/i40e_rxtx_vec_avx512.c     | 114 +----------------
 drivers/net/ice/ice_rxtx_vec_avx512.c       | 117 +-----------------
 3 files changed, 95 insertions(+), 229 deletions(-)

diff --git a/drivers/common/intel_eth/ieth_rxtx_vec_common.h b/drivers/common/intel_eth/ieth_rxtx_vec_common.h
index aadc3dcfac..61b48c88da 100644
--- a/drivers/common/intel_eth/ieth_rxtx_vec_common.h
+++ b/drivers/common/intel_eth/ieth_rxtx_vec_common.h
@@ -157,4 +157,97 @@ ieth_tx_free_bufs(struct ieth_tx_queue *txq, ieth_desc_done_fn desc_done)
 	return txq->tx_rs_thresh;
 }
 
+static __rte_always_inline int
+ieth_tx_free_bufs_vector(struct ieth_tx_queue *txq, ieth_desc_done_fn desc_done)
+{
+	int nb_free = 0;
+	struct rte_mbuf *free[IETH_VPMD_TX_MAX_FREE_BUF];
+	struct rte_mbuf *m;
+
+	/* check DD bits on threshold descriptor */
+	if (!desc_done(txq, txq->tx_next_dd))
+		return 0;
+
+	const uint32_t n = txq->tx_rs_thresh;
+
+	/* first buffer to free from S/W ring is at index
+	 * tx_next_dd - (tx_rs_thresh - 1)
+	 */
+	struct ieth_vec_tx_entry *txep = txq->sw_ring_v;
+	txep += txq->tx_next_dd - (n - 1);
+
+	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
+		struct rte_mempool *mp = txep[0].mbuf->pool;
+		void **cache_objs;
+		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
+				rte_lcore_id());
+
+		if (!cache || cache->len == 0)
+			goto normal;
+
+		cache_objs = &cache->objs[cache->len];
+
+		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+			goto done;
+		}
+
+		/* The cache follows the following algorithm
+		 *   1. Add the objects to the cache
+		 *   2. Anything greater than the cache min value (if it
+		 *   crosses the cache flush threshold) is flushed to the ring.
+		 */
+		/* Add elements back into the cache */
+		uint32_t copied = 0;
+		/* n is multiple of 32 */
+		while (copied < n) {
+			memcpy(&cache_objs[copied], &txep[copied], 32 * sizeof(void *));
+			copied += 32;
+		}
+		cache->len += n;
+
+		if (cache->len >= cache->flushthresh) {
+			rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
+					cache->len - cache->size);
+			cache->len = cache->size;
+		}
+		goto done;
+	}
+
+normal:
+	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
+	if (likely(m)) {
+		free[0] = m;
+		nb_free = 1;
+		for (uint32_t i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (likely(m)) {
+				if (likely(m->pool == free[0]->pool)) {
+					free[nb_free++] = m;
+				} else {
+					rte_mempool_put_bulk(free[0]->pool, (void *)free, nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else {
+		for (uint32_t i = 1; i < n; i++) {
+			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (m)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+done:
+	/* buffers were freed, update counters */
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+	if (txq->tx_next_dd >= txq->nb_tx_desc)
+		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	return txq->tx_rs_thresh;
+}
+
 #endif /* IETH_RXTX_VEC_COMMON_H_ */
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index b4b38d7db6..23415c4949 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -754,118 +754,6 @@ i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					rx_pkts + retval, nb_pkts);
 }
 
-static __rte_always_inline int
-i40e_tx_free_bufs_avx512(struct ieth_tx_queue *txq)
-{
-	struct ieth_vec_tx_entry *txep;
-	uint32_t n;
-	uint32_t i;
-	int nb_free = 0;
-	struct rte_mbuf *m, *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
-
-	/* check DD bits on threshold descriptor */
-	if ((txq->i40e_tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
-			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
-			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
-		return 0;
-
-	n = txq->tx_rs_thresh;
-
-	/* first buffer to free from S/W ring is at index
-	 * tx_next_dd - (tx_rs_thresh-1)
-	 */
-	txep = (void *)txq->sw_ring;
-	txep += txq->tx_next_dd - (n - 1);
-
-	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
-		struct rte_mempool *mp = txep[0].mbuf->pool;
-		void **cache_objs;
-		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
-				rte_lcore_id());
-
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_generic_put(mp, (void *)txep, n, cache);
-			goto done;
-		}
-
-		cache_objs = &cache->objs[cache->len];
-
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
-		/* Add elements back into the cache */
-		uint32_t copied = 0;
-		/* n is multiple of 32 */
-		while (copied < n) {
-#ifdef RTE_ARCH_64
-			const __m512i a = _mm512_load_si512(&txep[copied]);
-			const __m512i b = _mm512_load_si512(&txep[copied + 8]);
-			const __m512i c = _mm512_load_si512(&txep[copied + 16]);
-			const __m512i d = _mm512_load_si512(&txep[copied + 24]);
-
-			_mm512_storeu_si512(&cache_objs[copied], a);
-			_mm512_storeu_si512(&cache_objs[copied + 8], b);
-			_mm512_storeu_si512(&cache_objs[copied + 16], c);
-			_mm512_storeu_si512(&cache_objs[copied + 24], d);
-#else
-			const __m512i a = _mm512_load_si512(&txep[copied]);
-			const __m512i b = _mm512_load_si512(&txep[copied + 16]);
-			_mm512_storeu_si512(&cache_objs[copied], a);
-			_mm512_storeu_si512(&cache_objs[copied + 16], b);
-#endif
-			copied += 32;
-		}
-		cache->len += n;
-
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
-		goto done;
-	}
-
-	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
-	if (likely(m)) {
-		free[0] = m;
-		nb_free = 1;
-		for (i = 1; i < n; i++) {
-			rte_mbuf_prefetch_part2(txep[i + 3].mbuf);
-			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (likely(m)) {
-				if (likely(m->pool == free[0]->pool)) {
-					free[nb_free++] = m;
-				} else {
-					rte_mempool_put_bulk(free[0]->pool,
-							     (void *)free,
-							     nb_free);
-					free[0] = m;
-					nb_free = 1;
-				}
-			}
-		}
-		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
-	} else {
-		for (i = 1; i < n; i++) {
-			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (m)
-				rte_mempool_put(m->pool, m);
-		}
-	}
-
-done:
-	/* buffers were freed, update counters */
-	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
-	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
-	if (txq->tx_next_dd >= txq->nb_tx_desc)
-		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
-
-	return txq->tx_rs_thresh;
-}
-
 static inline void
 vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
@@ -941,7 +829,7 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
 
 	if (txq->nb_tx_free < txq->tx_free_thresh)
-		i40e_tx_free_bufs_avx512(txq);
+		ieth_tx_free_bufs_vector(txq, i40e_tx_desc_done);
 
 	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
 	if (unlikely(nb_pkts == 0))
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 4d95561f8c..fc8f9ad34a 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -859,121 +859,6 @@ ice_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
 					rx_pkts + retval, nb_pkts);
 }
 
-static __rte_always_inline int
-ice_tx_free_bufs_avx512(struct ieth_tx_queue *txq)
-{
-	struct ieth_vec_tx_entry *txep;
-	uint32_t n;
-	uint32_t i;
-	int nb_free = 0;
-	struct rte_mbuf *m, *free[ICE_TX_MAX_FREE_BUF_SZ];
-
-	/* check DD bits on threshold descriptor */
-	if ((txq->ice_tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
-			rte_cpu_to_le_64(ICE_TXD_QW1_DTYPE_M)) !=
-			rte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DESC_DONE))
-		return 0;
-
-	n = txq->tx_rs_thresh;
-
-	/* first buffer to free from S/W ring is at index
-	 * tx_next_dd - (tx_rs_thresh - 1)
-	 */
-	txep = (void *)txq->sw_ring;
-	txep += txq->tx_next_dd - (n - 1);
-
-	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
-		struct rte_mempool *mp = txep[0].mbuf->pool;
-		void **cache_objs;
-		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
-				rte_lcore_id());
-
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
-			goto done;
-		}
-
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
-		/* Add elements back into the cache */
-		uint32_t copied = 0;
-		/* n is multiple of 32 */
-		while (copied < n) {
-#ifdef RTE_ARCH_64
-			const __m512i a = _mm512_loadu_si512(&txep[copied]);
-			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
-			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
-			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
-
-			_mm512_storeu_si512(&cache_objs[copied], a);
-			_mm512_storeu_si512(&cache_objs[copied + 8], b);
-			_mm512_storeu_si512(&cache_objs[copied + 16], c);
-			_mm512_storeu_si512(&cache_objs[copied + 24], d);
-#else
-			const __m512i a = _mm512_loadu_si512(&txep[copied]);
-			const __m512i b = _mm512_loadu_si512(&txep[copied + 16]);
-			_mm512_storeu_si512(&cache_objs[copied], a);
-			_mm512_storeu_si512(&cache_objs[copied + 16], b);
-#endif
-			copied += 32;
-		}
-		cache->len += n;
-
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
-		goto done;
-	}
-
-normal:
-	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
-	if (likely(m)) {
-		free[0] = m;
-		nb_free = 1;
-		for (i = 1; i < n; i++) {
-			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (likely(m)) {
-				if (likely(m->pool == free[0]->pool)) {
-					free[nb_free++] = m;
-				} else {
-					rte_mempool_put_bulk(free[0]->pool,
-							     (void *)free,
-							     nb_free);
-					free[0] = m;
-					nb_free = 1;
-				}
-			}
-		}
-		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
-	} else {
-		for (i = 1; i < n; i++) {
-			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (m)
-				rte_mempool_put(m->pool, m);
-		}
-	}
-
-done:
-	/* buffers were freed, update counters */
-	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
-	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
-	if (txq->tx_next_dd >= txq->nb_tx_desc)
-		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
-
-	return txq->tx_rs_thresh;
-}
-
 static __rte_always_inline void
 ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags, bool do_offload)
@@ -1064,7 +949,7 @@ ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
 
 	if (txq->nb_tx_free < txq->tx_free_thresh)
-		ice_tx_free_bufs_avx512(txq);
+		ieth_tx_free_bufs_vector(txq, ice_tx_desc_done);
 
 	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
 	if (unlikely(nb_pkts == 0))
-- 
2.43.0
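
[Editor's note] To illustrate the split of responsibilities this patch introduces, the sketch below shows how a driver is expected to plug into the shared helper: the driver keeps only its hardware-specific "descriptor done" test and passes it as the callback, while ieth_tx_free_bufs_vector() owns all of the mbuf-freeing logic. This is a minimal sketch, not part of the patch: the exact ieth_desc_done_fn typedef and the real i40e_tx_desc_done() are defined elsewhere in the series, so the callback signature and the function body shown here are assumptions reconstructed from the DD-bit check removed above, and i40e_free_example() is a hypothetical caller.

/* Assumed callback shape, inferred from the desc_done(txq, txq->tx_next_dd)
 * call in ieth_tx_free_bufs_vector(); the series defines the real typedef.
 */
static __rte_always_inline int
i40e_tx_desc_done(struct ieth_tx_queue *txq, uint16_t idx)
{
	/* descriptor is done when the qword1 descriptor-type field reports
	 * DESC_DONE, i.e. the inverse of the early-return test removed from
	 * i40e_tx_free_bufs_avx512() above
	 */
	return (txq->i40e_tx_ring[idx].cmd_type_offset_bsz &
			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) ==
			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE);
}

/* Hypothetical caller, mirroring the hunk in i40e_xmit_fixed_burst_vec_avx512():
 * the driver-specific free routine is replaced by the common helper,
 * parameterised with the driver's descriptor-done predicate.
 */
static inline uint16_t
i40e_free_example(struct ieth_tx_queue *txq)
{
	if (txq->nb_tx_free < txq->tx_free_thresh)
		ieth_tx_free_bufs_vector(txq, i40e_tx_desc_done);

	return txq->nb_tx_free;
}

Because the vector SW ring entries hold nothing but the mbuf pointer, the freeing logic itself needs no per-driver knowledge; only the DD-bit test differs between ice and i40e, which is why a single callback parameter is enough to share the whole function.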