From mboxrd@z Thu Jan 1 00:00:00 1970
Received: from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129])
	by dpdk.org (Postfix) with ESMTP id BAD382B91;
	Fri, 8 Mar 2019 18:49:32 +0100 (CET)
Received: from Internal Mail-Server by MTLPINE1 (envelope-from yskoh@mellanox.com)
	with ESMTPS (AES256-SHA encrypted); 8 Mar 2019 19:49:29 +0200
Received: from scfae-sc-2.mti.labs.mlnx (scfae-sc-2.mti.labs.mlnx [10.101.0.96])
	by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id x28HloBF002625;
	Fri, 8 Mar 2019 19:49:28 +0200
From: Yongseok Koh
To: Shahed Shaikh
Cc: Rasesh Mody, dpdk stable
Date: Fri, 8 Mar 2019 09:47:37 -0800
Message-Id: <20190308174749.30771-59-yskoh@mellanox.com>
X-Mailer: git-send-email 2.11.0
In-Reply-To: <20190308174749.30771-1-yskoh@mellanox.com>
References: <20190308174749.30771-1-yskoh@mellanox.com>
Subject: [dpdk-stable] patch 'net/qede: fix performance bottleneck in Rx path' has been queued to LTS release 17.11.6
X-BeenThere: stable@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: patches for DPDK stable branches
X-List-Received-Date: Fri, 08 Mar 2019 17:49:33 -0000

Hi,

FYI, your patch has been queued to LTS release 17.11.6

Note it hasn't been pushed to http://dpdk.org/browse/dpdk-stable yet.
It will be pushed if I get no objection by 03/13/19, so please shout if
anyone has an objection.

Also note that after the patch there's a diff of the upstream commit vs
the patch applied to the branch. If the code is different (i.e. not only
metadata diffs), due for example to a change in context or macro names,
please double-check it.

Thanks.

Yongseok

---
From f4f2aff537e1ff13ed85a9d4e52038ca34e7e005 Mon Sep 17 00:00:00 2001
From: Shahed Shaikh
Date: Fri, 18 Jan 2019 02:29:29 -0800
Subject: [PATCH] net/qede: fix performance bottleneck in Rx path

[ upstream commit 8f2312474529ad7ff0e4b65b82efc8530e7484ce ]

Allocating replacement buffer per received packet is expensive.
Instead, process received packets first and allocate
replacement buffers in bulk later.

This improves performance by ~25% in terms of PPS on AMD
platforms.
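
For readers unfamiliar with the pattern, here is a minimal sketch of the
bulk-refill idea. The helper name refill_rx_ring_bulk and its parameters
are illustrative only, not the driver's; the real change is in the diff
below. The key point is that rte_mempool_get_bulk() reserves all 'count'
buffers in a single mempool operation, or fails as a whole, which is what
removes the per-packet allocation cost:

#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

/* Grab 'count' mbufs with one mempool operation instead of one
 * allocation per received packet. 'ring_size' is assumed to be a
 * power of two so (ring_size - 1) works as an index mask. */
static int
refill_rx_ring_bulk(struct rte_mempool *mp, struct rte_mbuf **ring,
		    uint16_t ring_size, uint16_t *prod, unsigned int count)
{
	void *objs[64];
	unsigned int i;

	if (count > 64)
		count = 64;

	/* All-or-nothing: returns 0 on success, -ENOENT when fewer
	 * than 'count' objects are available (none are taken). */
	if (rte_mempool_get_bulk(mp, objs, count) != 0)
		return -ENOMEM;

	for (i = 0; i < count; i++) {
		if (i < count - 1)
			rte_prefetch0(objs[i + 1]); /* warm the next mbuf */
		ring[(*prod)++ & (ring_size - 1)] = objs[i];
	}

	return 0;
}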
Fixes: 2ea6f76aff40 ("qede: add core driver")

Signed-off-by: Shahed Shaikh
Acked-by: Rasesh Mody
---
 drivers/net/qede/qede_rxtx.c | 97 +++++++++++++++++++++++++++++++++-----------
 drivers/net/qede/qede_rxtx.h |  2 +
 2 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index cdb85c218..b525075ca 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -37,6 +37,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
 	return 0;
 }
 
+#define QEDE_MAX_BULK_ALLOC_COUNT 512
+
+static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count)
+{
+	void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
+	struct rte_mbuf *mbuf = NULL;
+	struct eth_rx_bd *rx_bd;
+	dma_addr_t mapping;
+	int i, ret = 0;
+	uint16_t idx;
+
+	idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+
+	if (count > QEDE_MAX_BULK_ALLOC_COUNT)
+		count = QEDE_MAX_BULK_ALLOC_COUNT;
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
+	if (unlikely(ret)) {
+		PMD_RX_LOG(ERR, rxq,
+			   "Failed to allocate %d rx buffers "
+			   "sw_rx_prod %u sw_rx_cons %u mp entries %u free %u",
+			   count, idx, rxq->sw_rx_cons & NUM_RX_BDS(rxq),
+			   rte_mempool_avail_count(rxq->mb_pool),
+			   rte_mempool_in_use_count(rxq->mb_pool));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		mbuf = obj_p[i];
+		if (likely(i < count - 1))
+			rte_prefetch0(obj_p[i + 1]);
+
+		idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+		rxq->sw_rx_ring[idx].mbuf = mbuf;
+		rxq->sw_rx_ring[idx].page_offset = 0;
+		mapping = rte_mbuf_data_iova_default(mbuf);
+		rx_bd = (struct eth_rx_bd *)
+			ecore_chain_produce(&rxq->rx_bd_ring);
+		rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
+		rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
+		rxq->sw_rx_prod++;
+	}
+
+	return 0;
+}
+
 /* Criterias for calculating Rx buffer size -
  * 1) rx_buf_size should not exceed the size of mbuf
  * 2) In scattered_rx mode - minimum rx_buf_size should be
@@ -1134,7 +1180,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
 		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
 {
 	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
-	uint16_t idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
+	uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
 	struct qede_rx_entry *curr_prod;
 	dma_addr_t new_mapping;
 
@@ -1367,7 +1413,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t bitfield_val;
 #endif
 	uint8_t tunn_parse_flag;
-	uint8_t j;
 	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
 	uint64_t ol_flags;
 	uint32_t packet_type;
@@ -1376,6 +1421,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t offset, tpa_agg_idx, flags;
 	struct qede_agg_info *tpa_info = NULL;
 	uint32_t rss_hash;
+	int rx_alloc_count = 0;
 
 	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
 	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
@@ -1385,6 +1431,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (hw_comp_cons == sw_comp_cons)
 		return 0;
 
+	/* Allocate buffers that we used in previous loop */
+	if (rxq->rx_alloc_count) {
+		if (unlikely(qede_alloc_rx_bulk_mbufs(rxq,
+			     rxq->rx_alloc_count))) {
+			struct rte_eth_dev *dev;
+
+			PMD_RX_LOG(ERR, rxq,
+				   "New buffer allocation failed,"
+				   "dropping incoming packet\n");
+			dev = &rte_eth_devices[rxq->port_id];
+			dev->data->rx_mbuf_alloc_failed +=
+					rxq->rx_alloc_count;
+			rxq->rx_alloc_errors += rxq->rx_alloc_count;
+			return 0;
+		}
+		qede_update_rx_prod(qdev, rxq);
+		rxq->rx_alloc_count = 0;
+	}
+
 	while (sw_comp_cons != hw_comp_cons) {
 		ol_flags = 0;
 		packet_type = RTE_PTYPE_UNKNOWN;
@@ -1556,16 +1621,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			rx_mb->hash.rss = rss_hash;
 		}
 
-		if (unlikely(qede_alloc_rx_buffer(rxq) != 0)) {
-			PMD_RX_LOG(ERR, rxq,
-				   "New buffer allocation failed,"
-				   "dropping incoming packet\n");
-			qede_recycle_rx_bd_ring(rxq, qdev, fp_cqe->bd_num);
-			rte_eth_devices[rxq->port_id].
-				data->rx_mbuf_alloc_failed++;
-			rxq->rx_alloc_errors++;
-			break;
-		}
+		rx_alloc_count++;
 		qede_rx_bd_ring_consume(rxq);
 
 		if (!tpa_start_flg && fp_cqe->bd_num > 1) {
@@ -1577,17 +1633,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
 						 pkt_len - len))
 				goto next_cqe;
-			for (j = 0; j < num_segs; j++) {
-				if (qede_alloc_rx_buffer(rxq)) {
-					PMD_RX_LOG(ERR, rxq,
-						   "Buffer allocation failed");
-					rte_eth_devices[rxq->port_id].
-						data->rx_mbuf_alloc_failed++;
-					rxq->rx_alloc_errors++;
-					break;
-				}
-				rxq->rx_segs++;
-			}
+
+			rx_alloc_count += num_segs;
+			rxq->rx_segs += num_segs;
 		}
 		rxq->rx_segs++; /* for the first segment */
 
@@ -1629,7 +1677,8 @@ next_cqe:
 		}
 	}
 
-	qede_update_rx_prod(qdev, rxq);
+	/* Request number of bufferes to be allocated in next loop */
+	rxq->rx_alloc_count = rx_alloc_count;
 
 	rxq->rcv_pkts += rx_pkt;
 
diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
index fe80237d5..574831e61 100644
--- a/drivers/net/qede/qede_rxtx.h
+++ b/drivers/net/qede/qede_rxtx.h
@@ -196,6 +196,8 @@ struct qede_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t rx_buf_size;
+	uint16_t rx_alloc_count;
+	uint16_t unused;
 	uint64_t rcv_pkts;
 	uint64_t rx_segs;
 	uint64_t rx_hw_errors;
-- 
2.11.0

---
  Diff of the applied patch vs upstream commit (please double-check if non-empty):
---
--- -	2019-03-08 09:46:43.105876155 -0800
+++ 0059-net-qede-fix-performance-bottleneck-in-Rx-path.patch	2019-03-08 09:46:40.311403000 -0800
@@ -1,8 +1,10 @@
-From 8f2312474529ad7ff0e4b65b82efc8530e7484ce Mon Sep 17 00:00:00 2001
+From f4f2aff537e1ff13ed85a9d4e52038ca34e7e005 Mon Sep 17 00:00:00 2001
 From: Shahed Shaikh
 Date: Fri, 18 Jan 2019 02:29:29 -0800
 Subject: [PATCH] net/qede: fix performance bottleneck in Rx path
 
+[ upstream commit 8f2312474529ad7ff0e4b65b82efc8530e7484ce ]
+
 Allocating replacement buffer per received packet is expensive.
 Instead, process received packets first and allocate
 replacement buffers in bulk later.
@@ -11,7 +13,6 @@
 platforms.
 Fixes: 2ea6f76aff40 ("qede: add core driver")
-Cc: stable@dpdk.org
 
 Signed-off-by: Shahed Shaikh
 Acked-by: Rasesh Mody
 ---
@@ -21,10 +22,10 @@
  2 files changed, 75 insertions(+), 24 deletions(-)
 
 diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
-index 0e33be1a3..684c4aeef 100644
+index cdb85c218..b525075ca 100644
 --- a/drivers/net/qede/qede_rxtx.c
 +++ b/drivers/net/qede/qede_rxtx.c
-@@ -35,6 +35,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
+@@ -37,6 +37,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
  	return 0;
  }
 
@@ -77,7 +78,7 @@
  /* Criterias for calculating Rx buffer size -
   * 1) rx_buf_size should not exceed the size of mbuf
   * 2) In scattered_rx mode - minimum rx_buf_size should be
-@@ -1131,7 +1177,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
+@@ -1134,7 +1180,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
  		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
  {
  	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
@@ -86,7 +87,7 @@
  	struct qede_rx_entry *curr_prod;
  	dma_addr_t new_mapping;
 
-@@ -1364,7 +1410,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1367,7 +1413,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  	uint8_t bitfield_val;
  #endif
  	uint8_t tunn_parse_flag;
@@ -94,7 +95,7 @@
  	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
  	uint64_t ol_flags;
  	uint32_t packet_type;
-@@ -1373,6 +1418,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1376,6 +1421,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  	uint8_t offset, tpa_agg_idx, flags;
  	struct qede_agg_info *tpa_info = NULL;
  	uint32_t rss_hash;
@@ -102,7 +103,7 @@
 
  	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
  	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
-@@ -1382,6 +1428,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1385,6 +1431,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  	if (hw_comp_cons == sw_comp_cons)
  		return 0;
 
@@ -128,7 +129,7 @@
  	while (sw_comp_cons != hw_comp_cons) {
  		ol_flags = 0;
  		packet_type = RTE_PTYPE_UNKNOWN;
-@@ -1553,16 +1618,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1556,16 +1621,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  			rx_mb->hash.rss = rss_hash;
  		}
 
@@ -146,7 +147,7 @@
  	qede_rx_bd_ring_consume(rxq);
 
  	if (!tpa_start_flg && fp_cqe->bd_num > 1) {
-@@ -1574,17 +1630,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1577,17 +1633,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
  						 pkt_len - len))
  				goto next_cqe;
@@ -167,7 +168,7 @@
  		}
  		rxq->rx_segs++; /* for the first segment */
 
-@@ -1626,7 +1674,8 @@ next_cqe:
+@@ -1629,7 +1677,8 @@ next_cqe:
  		}
  	}
 
@@ -178,10 +179,10 @@
 
  	rxq->rcv_pkts += rx_pkt;
 
 diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
-index 454daa07b..5b249cbb2 100644
+index fe80237d5..574831e61 100644
 --- a/drivers/net/qede/qede_rxtx.h
 +++ b/drivers/net/qede/qede_rxtx.h
-@@ -192,6 +192,8 @@ struct qede_rx_queue {
+@@ -196,6 +196,8 @@ struct qede_rx_queue {
  	uint16_t queue_id;
  	uint16_t port_id;
  	uint16_t rx_buf_size;
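
---

One more note on the error path in the patch: the driver can account for
all rxq->rx_alloc_count buffers at once because rte_mempool_get_bulk() is
all-or-nothing — on failure it takes no objects at all. Below is a small
self-contained program to observe that behavior (test code of my own, not
part of the patch; the pool name and sizes are arbitrary):

#include <stdio.h>
#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>

int
main(int argc, char **argv)
{
	struct rte_mempool *mp;
	void *objs[40];

	if (rte_eal_init(argc, argv) < 0)
		return 1;

	/* 64 mbufs, cache_size 0 so the available count is exact. */
	mp = rte_pktmbuf_pool_create("bulk_test", 64, 0, 0,
				     RTE_MBUF_DEFAULT_BUF_SIZE,
				     rte_socket_id());
	if (mp == NULL)
		return 1;

	/* Succeeds: 40 of 64 objects taken, returns 0. */
	printf("first get 40 -> %d\n", rte_mempool_get_bulk(mp, objs, 40));

	/* Fails as a whole with -ENOENT: only 24 objects remain, and
	 * the failed call takes none of them. */
	printf("second get 40 -> %d\n", rte_mempool_get_bulk(mp, objs, 40));
	printf("still available: %u\n", rte_mempool_avail_count(mp));

	rte_mempool_put_bulk(mp, objs, 40);
	return 0;
}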