From mboxrd@z Thu Jan 1 00:00:00 1970
Received: from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129])
	by dpdk.org (Postfix) with ESMTP id BAD382B91;
	Fri, 8 Mar 2019 18:49:32 +0100 (CET)
Received: from Internal Mail-Server by MTLPINE1 (envelope-from yskoh@mellanox.com)
	with ESMTPS (AES256-SHA encrypted); 8 Mar 2019 19:49:29 +0200
Received: from scfae-sc-2.mti.labs.mlnx (scfae-sc-2.mti.labs.mlnx [10.101.0.96])
	by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id x28HloBF002625;
	Fri, 8 Mar 2019 19:49:28 +0200
From: Yongseok Koh
To: Shahed Shaikh
Cc: Rasesh Mody, dpdk stable
Date: Fri, 8 Mar 2019 09:47:37 -0800
Message-Id: <20190308174749.30771-59-yskoh@mellanox.com>
X-Mailer: git-send-email 2.11.0
In-Reply-To: <20190308174749.30771-1-yskoh@mellanox.com>
References: <20190308174749.30771-1-yskoh@mellanox.com>
Subject: [dpdk-stable] patch 'net/qede: fix performance bottleneck in Rx path' has been queued to LTS release 17.11.6
X-BeenThere: stable@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: patches for DPDK stable branches
X-List-Received-Date: Fri, 08 Mar 2019 17:49:33 -0000

Hi,

FYI, your patch has been queued to LTS release 17.11.6

Note it hasn't been pushed to http://dpdk.org/browse/dpdk-stable yet.
It will be pushed if I get no objection by 03/13/19, so please shout if
anyone has an objection.

Also note that after the patch there's a diff of the upstream commit vs
the patch applied to the branch. If the code is different (i.e. not only
metadata diffs), due for example to a change in context or macro names,
please double-check it.

Thanks.

Yongseok

---
From f4f2aff537e1ff13ed85a9d4e52038ca34e7e005 Mon Sep 17 00:00:00 2001
From: Shahed Shaikh
Date: Fri, 18 Jan 2019 02:29:29 -0800
Subject: [PATCH] net/qede: fix performance bottleneck in Rx path

[ upstream commit 8f2312474529ad7ff0e4b65b82efc8530e7484ce ]

Allocating replacement buffer per received packet is expensive.
Instead, process received packets first and allocate
replacement buffers in bulk later.

This improves performance by ~25% in terms of PPS on AMD
platforms.
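
For readers unfamiliar with the pattern, here is a minimal sketch of the
bulk-refill idea. The helper name refill_rx_ring_bulk and its parameters
are illustrative only, not the driver's; the real change is in the diff
below. The key point is that rte_mempool_get_bulk() reserves all 'count'
buffers in a single mempool operation, or fails as a whole, which is what
removes the per-packet allocation cost:

#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

/* Grab 'count' mbufs with one mempool operation instead of one
 * allocation per received packet. 'ring_size' is assumed to be a
 * power of two so (ring_size - 1) works as an index mask. */
static int
refill_rx_ring_bulk(struct rte_mempool *mp, struct rte_mbuf **ring,
		    uint16_t ring_size, uint16_t *prod, unsigned int count)
{
	void *objs[64];
	unsigned int i;

	if (count > 64)
		count = 64;

	/* All-or-nothing: returns 0 on success, -ENOENT when fewer
	 * than 'count' objects are available (none are taken). */
	if (rte_mempool_get_bulk(mp, objs, count) != 0)
		return -ENOMEM;

	for (i = 0; i < count; i++) {
		if (i < count - 1)
			rte_prefetch0(objs[i + 1]); /* warm the next mbuf */
		ring[(*prod)++ & (ring_size - 1)] = objs[i];
	}

	return 0;
}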
Fixes: 2ea6f76aff40 ("qede: add core driver")

Signed-off-by: Shahed Shaikh
Acked-by: Rasesh Mody
---
 drivers/net/qede/qede_rxtx.c | 97 +++++++++++++++++++++++++++++++++-----------
 drivers/net/qede/qede_rxtx.h |  2 +
 2 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index cdb85c218..b525075ca 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -37,6 +37,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
 	return 0;
 }
 
+#define QEDE_MAX_BULK_ALLOC_COUNT 512
+
+static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count)
+{
+	void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
+	struct rte_mbuf *mbuf = NULL;
+	struct eth_rx_bd *rx_bd;
+	dma_addr_t mapping;
+	int i, ret = 0;
+	uint16_t idx;
+
+	idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+
+	if (count > QEDE_MAX_BULK_ALLOC_COUNT)
+		count = QEDE_MAX_BULK_ALLOC_COUNT;
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
+	if (unlikely(ret)) {
+		PMD_RX_LOG(ERR, rxq,
+			   "Failed to allocate %d rx buffers "
+			   "sw_rx_prod %u sw_rx_cons %u mp entries %u free %u",
+			   count, idx, rxq->sw_rx_cons & NUM_RX_BDS(rxq),
+			   rte_mempool_avail_count(rxq->mb_pool),
+			   rte_mempool_in_use_count(rxq->mb_pool));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		mbuf = obj_p[i];
+		if (likely(i < count - 1))
+			rte_prefetch0(obj_p[i + 1]);
+
+		idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+		rxq->sw_rx_ring[idx].mbuf = mbuf;
+		rxq->sw_rx_ring[idx].page_offset = 0;
+		mapping = rte_mbuf_data_iova_default(mbuf);
+		rx_bd = (struct eth_rx_bd *)
+			ecore_chain_produce(&rxq->rx_bd_ring);
+		rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
+		rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
+		rxq->sw_rx_prod++;
+	}
+
+	return 0;
+}
+
 /* Criterias for calculating Rx buffer size -
  * 1) rx_buf_size should not exceed the size of mbuf
  * 2) In scattered_rx mode - minimum rx_buf_size should be
@@ -1134,7 +1180,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
 		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
 {
 	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
-	uint16_t idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
+	uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
 	struct qede_rx_entry *curr_prod;
 	dma_addr_t new_mapping;
 
@@ -1367,7 +1413,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t bitfield_val;
 #endif
 	uint8_t tunn_parse_flag;
-	uint8_t j;
 	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
 	uint64_t ol_flags;
 	uint32_t packet_type;
@@ -1376,6 +1421,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint8_t offset, tpa_agg_idx, flags;
 	struct qede_agg_info *tpa_info = NULL;
 	uint32_t rss_hash;
+	int rx_alloc_count = 0;
 
 	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
 	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
@@ -1385,6 +1431,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (hw_comp_cons == sw_comp_cons)
 		return 0;
 
+	/* Allocate buffers that we used in previous loop */
+	if (rxq->rx_alloc_count) {
+		if (unlikely(qede_alloc_rx_bulk_mbufs(rxq,
+			     rxq->rx_alloc_count))) {
+			struct rte_eth_dev *dev;
+
+			PMD_RX_LOG(ERR, rxq,
+				   "New buffer allocation failed,"
+				   "dropping incoming packet\n");
+			dev = &rte_eth_devices[rxq->port_id];
+			dev->data->rx_mbuf_alloc_failed +=
+					rxq->rx_alloc_count;
+			rxq->rx_alloc_errors += rxq->rx_alloc_count;
+			return 0;
+		}
+		qede_update_rx_prod(qdev, rxq);
+		rxq->rx_alloc_count = 0;
+	}
+
 	while (sw_comp_cons != hw_comp_cons) {
 		ol_flags = 0;
 		packet_type = RTE_PTYPE_UNKNOWN;
@@ -1556,16 +1621,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			rx_mb->hash.rss = rss_hash;
 		}
 
-		if (unlikely(qede_alloc_rx_buffer(rxq) != 0)) {
-			PMD_RX_LOG(ERR, rxq,
-				   "New buffer allocation failed,"
-				   "dropping incoming packet\n");
-			qede_recycle_rx_bd_ring(rxq, qdev, fp_cqe->bd_num);
-			rte_eth_devices[rxq->port_id].
-				data->rx_mbuf_alloc_failed++;
-			rxq->rx_alloc_errors++;
-			break;
-		}
+		rx_alloc_count++;
 		qede_rx_bd_ring_consume(rxq);
 
 		if (!tpa_start_flg && fp_cqe->bd_num > 1) {
@@ -1577,17 +1633,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
 						 pkt_len - len))
 				goto next_cqe;
-			for (j = 0; j < num_segs; j++) {
-				if (qede_alloc_rx_buffer(rxq)) {
-					PMD_RX_LOG(ERR, rxq,
-						   "Buffer allocation failed");
-					rte_eth_devices[rxq->port_id].
-						data->rx_mbuf_alloc_failed++;
-					rxq->rx_alloc_errors++;
-					break;
-				}
-				rxq->rx_segs++;
-			}
+
+			rx_alloc_count += num_segs;
+			rxq->rx_segs += num_segs;
 		}
 		rxq->rx_segs++; /* for the first segment */
 
@@ -1629,7 +1677,8 @@ next_cqe:
 		}
 	}
 
-	qede_update_rx_prod(qdev, rxq);
+	/* Request number of bufferes to be allocated in next loop */
+	rxq->rx_alloc_count = rx_alloc_count;
 
 	rxq->rcv_pkts += rx_pkt;
 
diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
index fe80237d5..574831e61 100644
--- a/drivers/net/qede/qede_rxtx.h
+++ b/drivers/net/qede/qede_rxtx.h
@@ -196,6 +196,8 @@ struct qede_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t rx_buf_size;
+	uint16_t rx_alloc_count;
+	uint16_t unused;
 	uint64_t rcv_pkts;
 	uint64_t rx_segs;
 	uint64_t rx_hw_errors;
-- 
2.11.0

---
  Diff of the applied patch vs upstream commit (please double-check if non-empty):
---
--- -	2019-03-08 09:46:43.105876155 -0800
+++ 0059-net-qede-fix-performance-bottleneck-in-Rx-path.patch	2019-03-08 09:46:40.311403000 -0800
@@ -1,8 +1,10 @@
-From 8f2312474529ad7ff0e4b65b82efc8530e7484ce Mon Sep 17 00:00:00 2001
+From f4f2aff537e1ff13ed85a9d4e52038ca34e7e005 Mon Sep 17 00:00:00 2001
 From: Shahed Shaikh
 Date: Fri, 18 Jan 2019 02:29:29 -0800
 Subject: [PATCH] net/qede: fix performance bottleneck in Rx path
 
+[ upstream commit 8f2312474529ad7ff0e4b65b82efc8530e7484ce ]
+
 Allocating replacement buffer per received packet is expensive.
 Instead, process received packets first and allocate
 replacement buffers in bulk later.
@@ -11,7 +13,6 @@
 platforms.
 Fixes: 2ea6f76aff40 ("qede: add core driver")
-Cc: stable@dpdk.org
 
 Signed-off-by: Shahed Shaikh
 Acked-by: Rasesh Mody
 ---
@@ -21,10 +22,10 @@
  2 files changed, 75 insertions(+), 24 deletions(-)
 
 diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
-index 0e33be1a3..684c4aeef 100644
+index cdb85c218..b525075ca 100644
 --- a/drivers/net/qede/qede_rxtx.c
 +++ b/drivers/net/qede/qede_rxtx.c
-@@ -35,6 +35,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
+@@ -37,6 +37,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
  	return 0;
  }
 
@@ -77,7 +78,7 @@
  /* Criterias for calculating Rx buffer size -
   * 1) rx_buf_size should not exceed the size of mbuf
   * 2) In scattered_rx mode - minimum rx_buf_size should be
-@@ -1131,7 +1177,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
+@@ -1134,7 +1180,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
  		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
  {
  	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
@@ -86,7 +87,7 @@
  	struct qede_rx_entry *curr_prod;
  	dma_addr_t new_mapping;
 
-@@ -1364,7 +1410,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1367,7 +1413,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  	uint8_t bitfield_val;
  #endif
  	uint8_t tunn_parse_flag;
@@ -94,7 +95,7 @@
  	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
  	uint64_t ol_flags;
  	uint32_t packet_type;
-@@ -1373,6 +1418,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1376,6 +1421,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  	uint8_t offset, tpa_agg_idx, flags;
  	struct qede_agg_info *tpa_info = NULL;
  	uint32_t rss_hash;
@@ -102,7 +103,7 @@
 
  	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
  	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
-@@ -1382,6 +1428,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1385,6 +1431,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  	if (hw_comp_cons == sw_comp_cons)
  		return 0;
 
@@ -128,7 +129,7 @@
  	while (sw_comp_cons != hw_comp_cons) {
  		ol_flags = 0;
  		packet_type = RTE_PTYPE_UNKNOWN;
-@@ -1553,16 +1618,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1556,16 +1621,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  			rx_mb->hash.rss = rss_hash;
  		}
 
@@ -146,7 +147,7 @@
  	qede_rx_bd_ring_consume(rxq);
 
  	if (!tpa_start_flg && fp_cqe->bd_num > 1) {
-@@ -1574,17 +1630,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+@@ -1577,17 +1633,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
  			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
  						 pkt_len - len))
  				goto next_cqe;
@@ -167,7 +168,7 @@
  		}
  		rxq->rx_segs++; /* for the first segment */
 
-@@ -1626,7 +1674,8 @@ next_cqe:
+@@ -1629,7 +1677,8 @@ next_cqe:
  		}
  	}
 
@@ -178,10 +179,10 @@
 
  	rxq->rcv_pkts += rx_pkt;
 
 diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
-index 454daa07b..5b249cbb2 100644
+index fe80237d5..574831e61 100644
 --- a/drivers/net/qede/qede_rxtx.h
 +++ b/drivers/net/qede/qede_rxtx.h
-@@ -192,6 +192,8 @@ struct qede_rx_queue {
+@@ -196,6 +196,8 @@ struct qede_rx_queue {
  	uint16_t queue_id;
  	uint16_t port_id;
  	uint16_t rx_buf_size;
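
---

One more note on the error path in the patch: the driver can account for
all rxq->rx_alloc_count buffers at once because rte_mempool_get_bulk() is
all-or-nothing — on failure it takes no objects at all. Below is a small
self-contained program to observe that behavior (test code of my own, not
part of the patch; the pool name and sizes are arbitrary):

#include <stdio.h>
#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>

int
main(int argc, char **argv)
{
	struct rte_mempool *mp;
	void *objs[40];

	if (rte_eal_init(argc, argv) < 0)
		return 1;

	/* 64 mbufs, cache_size 0 so the available count is exact. */
	mp = rte_pktmbuf_pool_create("bulk_test", 64, 0, 0,
				     RTE_MBUF_DEFAULT_BUF_SIZE,
				     rte_socket_id());
	if (mp == NULL)
		return 1;

	/* Succeeds: 40 of 64 objects taken, returns 0. */
	printf("first get 40 -> %d\n", rte_mempool_get_bulk(mp, objs, 40));

	/* Fails as a whole with -ENOENT: only 24 objects remain, and
	 * the failed call takes none of them. */
	printf("second get 40 -> %d\n", rte_mempool_get_bulk(mp, objs, 40));
	printf("still available: %u\n", rte_mempool_avail_count(mp));

	rte_mempool_put_bulk(mp, objs, 40);
	return 0;
}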