DPDK patches and discussions
* [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries
@ 2021-03-05 13:13 Balazs Nemeth
  2021-03-05 13:13 ` [dpdk-dev] [PATCH 1/8] net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf Balazs Nemeth
                   ` (9 more replies)
  0 siblings, 10 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:13 UTC (permalink / raw)
  To: bnemeth, dev

This patch set optimizes qede_{rx,tx}_entry and introduces
rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance
improvement depends on the use-case; in a physical-virtual-physical test
on a ThunderX2 99xx system with two SMT threads used in ovs,
and two cores used in a vm, an improvement of around 2.55% is observed
due to this patch set.

Balazs Nemeth (8):
  net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf
  net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
  net/qede: assume txq->sw_tx_ring[idx] is never null in
    qede_free_tx_pkt
  net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk
  net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
  net/qede: prefetch txq->hw_cons_ptr
  net/qede: prefetch next packet to free
  net/qede: remove page_offset from struct qede_rx_entry and simplify

 drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++----------------
 drivers/net/qede/qede_rxtx.h |  21 +----
 2 files changed, 81 insertions(+), 88 deletions(-)

--
2.29.2


* [dpdk-dev] [PATCH 1/8] net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
@ 2021-03-05 13:13 ` Balazs Nemeth
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 2/8] net/qede: avoid repeatedly calling ecore_chain_get_cons_idx Balazs Nemeth
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:13 UTC (permalink / raw)
  To: bnemeth, dev

Each sw_tx_ring entry was of type struct qede_tx_entry:

struct qede_tx_entry {
       struct rte_mbuf *mbuf;
       uint8_t flags;
};

Leaving the unused flags member here has a few performance implications.
First, each qede_tx_entry takes up more memory, which has caching
implications: fewer entries fit in a cache line, while multiple entries
are frequently handled in batches. Second, an array of qede_tx_entry
entries is incompatible with existing APIs that expect an array of
rte_mbuf pointers. Consequently, an extra array needs to be allocated
before calling such APIs and each entry needs to be copied over.

This patch removes the flags field and replaces each qede_tx_entry
with a plain rte_mbuf pointer.
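
As an illustration only (not part of the patch; the names below are
hypothetical, the real change is in the diff that follows): a minimal
sketch of why a plain array of rte_mbuf pointers composes directly with
bulk mbuf APIs, while the wrapper struct forces an extra copy.

#include <stdint.h>
#include <rte_mbuf.h>

/* Hypothetical wrapper entry, as it looked before this patch. */
struct tx_entry {
        struct rte_mbuf *mbuf;
        uint8_t flags;  /* unused, but it still pads the entry */
};

/* Old layout: gather the pointers before any bulk mbuf API can be used.
 * For brevity this sketch assumes count <= 64. */
static void
free_batch_wrapped(struct tx_entry *ring, uint16_t first, uint16_t count)
{
        struct rte_mbuf *tmp[64];
        uint16_t i;

        for (i = 0; i < count; i++)
                tmp[i] = ring[first + i].mbuf;
        rte_pktmbuf_free_bulk(tmp, count);
}

/* New layout: the software ring already is an array of mbuf pointers,
 * so a contiguous run can be handed over directly. */
static void
free_batch_plain(struct rte_mbuf **ring, uint16_t first, uint16_t count)
{
        rte_pktmbuf_free_bulk(&ring[first], count);
}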

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 drivers/net/qede/qede_rxtx.c | 20 ++++++++++----------
 drivers/net/qede/qede_rxtx.h | 10 +---------
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 75d78cebb..c91c96ece 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -426,8 +426,7 @@ qede_alloc_tx_queue_mem(struct rte_eth_dev *dev,
 
 	/* Allocate software ring */
 	txq->sw_tx_ring = rte_zmalloc_socket("txq->sw_tx_ring",
-					     (sizeof(struct qede_tx_entry) *
-					      txq->nb_tx_desc),
+					     (sizeof(txq->sw_tx_ring) * txq->nb_tx_desc),
 					     RTE_CACHE_LINE_SIZE, socket_id);
 
 	if (!txq->sw_tx_ring) {
@@ -523,9 +522,9 @@ static void qede_tx_queue_release_mbufs(struct qede_tx_queue *txq)
 
 	if (txq->sw_tx_ring) {
 		for (i = 0; i < txq->nb_tx_desc; i++) {
-			if (txq->sw_tx_ring[i].mbuf) {
-				rte_pktmbuf_free(txq->sw_tx_ring[i].mbuf);
-				txq->sw_tx_ring[i].mbuf = NULL;
+			if (txq->sw_tx_ring[i]) {
+				rte_pktmbuf_free(txq->sw_tx_ring[i]);
+				txq->sw_tx_ring[i] = NULL;
 			}
 		}
 	}
@@ -889,10 +888,11 @@ qede_free_tx_pkt(struct qede_tx_queue *txq)
 	uint16_t idx;
 
 	idx = TX_CONS(txq);
-	mbuf = txq->sw_tx_ring[idx].mbuf;
+	mbuf = txq->sw_tx_ring[idx];
 	if (mbuf) {
 		nb_segs = mbuf->nb_segs;
 		PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs);
+
 		while (nb_segs) {
 			/* It's like consuming rxbuf in recv() */
 			ecore_chain_consume(&txq->tx_pbl);
@@ -900,7 +900,7 @@ qede_free_tx_pkt(struct qede_tx_queue *txq)
 			nb_segs--;
 		}
 		rte_pktmbuf_free(mbuf);
-		txq->sw_tx_ring[idx].mbuf = NULL;
+		txq->sw_tx_ring[idx] = NULL;
 		txq->sw_tx_cons++;
 		PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n");
 	} else {
@@ -2243,7 +2243,7 @@ qede_xmit_pkts_regular(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	struct eth_tx_3rd_bd *bd3;
 	struct rte_mbuf *m_seg = NULL;
 	struct rte_mbuf *mbuf;
-	struct qede_tx_entry *sw_tx_ring;
+	struct rte_mbuf **sw_tx_ring;
 	uint16_t nb_tx_pkts;
 	uint16_t bd_prod;
 	uint16_t idx;
@@ -2306,7 +2306,7 @@ qede_xmit_pkts_regular(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 		/* Fill the entry in the SW ring and the BDs in the FW ring */
 		idx = TX_PROD(txq);
-		sw_tx_ring[idx].mbuf = mbuf;
+		sw_tx_ring[idx] = mbuf;
 
 		/* BD1 */
 		bd1 = (struct eth_tx_1st_bd *)ecore_chain_produce(&txq->tx_pbl);
@@ -2612,7 +2612,7 @@ qede_xmit_pkts(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 		/* Fill the entry in the SW ring and the BDs in the FW ring */
 		idx = TX_PROD(txq);
-		txq->sw_tx_ring[idx].mbuf = mbuf;
+		txq->sw_tx_ring[idx] = mbuf;
 
 		/* BD1 */
 		bd1 = (struct eth_tx_1st_bd *)ecore_chain_produce(&txq->tx_pbl);
diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
index fcb564a1b..335016847 100644
--- a/drivers/net/qede/qede_rxtx.h
+++ b/drivers/net/qede/qede_rxtx.h
@@ -203,14 +203,6 @@ struct qede_rx_queue {
 	void *handle;
 };
 
-/*
- * TX BD descriptor ring
- */
-struct qede_tx_entry {
-	struct rte_mbuf *mbuf;
-	uint8_t flags;
-};
-
 union db_prod {
 	struct eth_db_data data;
 	uint32_t raw;
@@ -220,7 +212,7 @@ struct qede_tx_queue {
 	/* Always keep qdev as first member */
 	struct qede_dev *qdev;
 	struct ecore_chain tx_pbl;
-	struct qede_tx_entry *sw_tx_ring;
+	struct rte_mbuf **sw_tx_ring;
 	uint16_t nb_tx_desc;
 	uint16_t nb_tx_avail;
 	uint16_t tx_free_thresh;
-- 
2.29.2


* [dpdk-dev] [PATCH 2/8] net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
  2021-03-05 13:13 ` [dpdk-dev] [PATCH 1/8] net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf Balazs Nemeth
@ 2021-03-05 13:14 ` Balazs Nemeth
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 3/8] net/qede: assume txq->sw_tx_ring[idx] is never null in qede_free_tx_pkt Balazs Nemeth
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:14 UTC (permalink / raw)
  To: bnemeth, dev

Calling ecore_chain_get_cons_idx repeatedly is slower than calling it
once and using the result for the remainder of qede_process_tx_compl.
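
Purely as an illustration of the pattern (the struct and helpers below
are hypothetical, not the driver code; ecore_chain_get_cons_idx plays
the role of the per-iteration index query): read both indices once,
derive the number of completed BDs, and count that down instead of
re-querying the chain on every loop iteration.

#include <stdint.h>

/* Hypothetical queue state: only what this sketch needs. */
struct txq_sketch {
        uint16_t hw_bd_cons;    /* completion index written back by hardware */
        uint16_t sw_bd_cons;    /* software consumer index */
};

/* One BD consumed per call in this simplified sketch; the real code
 * consumes one BD per mbuf segment. */
static uint16_t
consume_one(struct txq_sketch *q)
{
        q->sw_bd_cons++;
        return 1;
}

/* Before: the consumer index is re-evaluated on every iteration. */
static void
process_compl_before(struct txq_sketch *q)
{
        while (q->hw_bd_cons != q->sw_bd_cons)
                consume_one(q);
}

/* After: read both indices once, then work on the difference.
 * Unsigned 16-bit subtraction copes with index wrap-around. */
static void
process_compl_after(struct txq_sketch *q)
{
        uint16_t remaining = (uint16_t)(q->hw_bd_cons - q->sw_bd_cons);

        while (remaining)
                remaining -= consume_one(q);
}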

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 drivers/net/qede/qede_rxtx.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index c91c96ece..1308c71db 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -880,12 +880,13 @@ qede_tx_queue_start(struct rte_eth_dev *eth_dev, uint16_t tx_queue_id)
 	return rc;
 }
 
-static inline void
+static inline uint16_t
 qede_free_tx_pkt(struct qede_tx_queue *txq)
 {
 	struct rte_mbuf *mbuf;
 	uint16_t nb_segs;
 	uint16_t idx;
+	uint16_t ret;
 
 	idx = TX_CONS(txq);
 	mbuf = txq->sw_tx_ring[idx];
@@ -893,20 +894,22 @@ qede_free_tx_pkt(struct qede_tx_queue *txq)
 		nb_segs = mbuf->nb_segs;
 		PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs);
 
+		ret = nb_segs;
 		while (nb_segs) {
 			/* It's like consuming rxbuf in recv() */
 			ecore_chain_consume(&txq->tx_pbl);
-			txq->nb_tx_avail++;
 			nb_segs--;
 		}
+
 		rte_pktmbuf_free(mbuf);
 		txq->sw_tx_ring[idx] = NULL;
 		txq->sw_tx_cons++;
 		PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n");
 	} else {
 		ecore_chain_consume(&txq->tx_pbl);
-		txq->nb_tx_avail++;
+		ret = 1;
 	}
+	return ret;
 }
 
 static inline void
@@ -914,19 +917,23 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 		      struct qede_tx_queue *txq)
 {
 	uint16_t hw_bd_cons;
-#ifdef RTE_LIBRTE_QEDE_DEBUG_TX
 	uint16_t sw_tx_cons;
-#endif
+	uint16_t remaining;
 
 	rte_compiler_barrier();
+	sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl);
 	hw_bd_cons = rte_le_to_cpu_16(*txq->hw_cons_ptr);
 #ifdef RTE_LIBRTE_QEDE_DEBUG_TX
-	sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl);
 	PMD_TX_LOG(DEBUG, txq, "Tx Completions = %u\n",
 		   abs(hw_bd_cons - sw_tx_cons));
 #endif
-	while (hw_bd_cons !=  ecore_chain_get_cons_idx(&txq->tx_pbl))
-		qede_free_tx_pkt(txq);
+
+	remaining = hw_bd_cons - sw_tx_cons;
+	txq->nb_tx_avail += remaining;
+
+	while (remaining) {
+		remaining -= qede_free_tx_pkt(txq);
+	}
 }
 
 static int qede_drain_txq(struct qede_dev *qdev,
-- 
2.29.2


* [dpdk-dev] [PATCH 3/8] net/qede: assume txq->sw_tx_ring[idx] is never null in qede_free_tx_pkt
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
  2021-03-05 13:13 ` [dpdk-dev] [PATCH 1/8] net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf Balazs Nemeth
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 2/8] net/qede: avoid repeatedly calling ecore_chain_get_cons_idx Balazs Nemeth
@ 2021-03-05 13:14 ` Balazs Nemeth
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 4/8] net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk Balazs Nemeth
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:14 UTC (permalink / raw)
  To: bnemeth, dev

The ring txq->sw_tx_ring is managed with txq->sw_tx_cons. As long as
txq->sw_tx_cons is correct, there is no need to explicitly check
whether txq->sw_tx_ring[idx] is NULL.
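
A minimal sketch of the invariant (hypothetical helper, not the driver
code): because the producer stores a valid mbuf pointer into the slot
before publishing it, every slot between the consumer and producer
indices is non-NULL by construction, so the hot path can assert instead
of branching. RTE_ASSERT compiles to nothing unless RTE_ENABLE_ASSERT
is defined.

#include <stdint.h>
#include <rte_debug.h>
#include <rte_mbuf.h>

/* Hypothetical consumer step over a ring of mbuf pointers. */
static inline struct rte_mbuf *
ring_consume(struct rte_mbuf **ring, uint16_t idx)
{
        struct rte_mbuf *m = ring[idx];

        /* A NULL here would be a driver bug, not a runtime condition. */
        RTE_ASSERT(m != NULL);
        return m;
}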

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 drivers/net/qede/qede_rxtx.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 1308c71db..c5e12ac6b 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -890,25 +890,20 @@ qede_free_tx_pkt(struct qede_tx_queue *txq)
 
 	idx = TX_CONS(txq);
 	mbuf = txq->sw_tx_ring[idx];
-	if (mbuf) {
-		nb_segs = mbuf->nb_segs;
-		PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs);
+	RTE_ASSERT(mbuf);
+	nb_segs = mbuf->nb_segs;
+	PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs);
 
-		ret = nb_segs;
-		while (nb_segs) {
-			/* It's like consuming rxbuf in recv() */
-			ecore_chain_consume(&txq->tx_pbl);
-			nb_segs--;
-		}
-
-		rte_pktmbuf_free(mbuf);
-		txq->sw_tx_ring[idx] = NULL;
-		txq->sw_tx_cons++;
-		PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n");
-	} else {
+	ret = nb_segs;
+	while (nb_segs) {
+		/* It's like consuming rxbuf in recv() */
 		ecore_chain_consume(&txq->tx_pbl);
-		ret = 1;
+		nb_segs--;
 	}
+
+	rte_pktmbuf_free(mbuf);
+	txq->sw_tx_cons++;
+	PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n");
 	return ret;
 }
 
-- 
2.29.2


* [dpdk-dev] [PATCH 4/8] net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
                   ` (2 preceding siblings ...)
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 3/8] net/qede: assume txq->sw_tx_ring[idx] is never null in qede_free_tx_pkt Balazs Nemeth
@ 2021-03-05 13:14 ` Balazs Nemeth
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 5/8] net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free Balazs Nemeth
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:14 UTC (permalink / raw)
  To: bnemeth, dev

The next patch will introduce the use of rte_pktmbuf_free_bulk.
Inlining qede_free_tx_pkt into qede_process_tx_compl prepares for that
change.

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 drivers/net/qede/qede_rxtx.c | 51 ++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index c5e12ac6b..7ce4c00e3 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -880,33 +880,6 @@ qede_tx_queue_start(struct rte_eth_dev *eth_dev, uint16_t tx_queue_id)
 	return rc;
 }
 
-static inline uint16_t
-qede_free_tx_pkt(struct qede_tx_queue *txq)
-{
-	struct rte_mbuf *mbuf;
-	uint16_t nb_segs;
-	uint16_t idx;
-	uint16_t ret;
-
-	idx = TX_CONS(txq);
-	mbuf = txq->sw_tx_ring[idx];
-	RTE_ASSERT(mbuf);
-	nb_segs = mbuf->nb_segs;
-	PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs);
-
-	ret = nb_segs;
-	while (nb_segs) {
-		/* It's like consuming rxbuf in recv() */
-		ecore_chain_consume(&txq->tx_pbl);
-		nb_segs--;
-	}
-
-	rte_pktmbuf_free(mbuf);
-	txq->sw_tx_cons++;
-	PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n");
-	return ret;
-}
-
 static inline void
 qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 		      struct qede_tx_queue *txq)
@@ -914,6 +887,10 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 	uint16_t hw_bd_cons;
 	uint16_t sw_tx_cons;
 	uint16_t remaining;
+	uint16_t mask;
+	struct rte_mbuf *mbuf;
+	uint16_t nb_segs;
+	uint16_t idx;
 
 	rte_compiler_barrier();
 	sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl);
@@ -923,12 +900,30 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 		   abs(hw_bd_cons - sw_tx_cons));
 #endif
 
+	mask = NUM_TX_BDS(txq);
+	idx = txq->sw_tx_cons & mask;
+
 	remaining = hw_bd_cons - sw_tx_cons;
 	txq->nb_tx_avail += remaining;
 
 	while (remaining) {
-		remaining -= qede_free_tx_pkt(txq);
+		mbuf = txq->sw_tx_ring[idx];
+		RTE_ASSERT(mbuf);
+		nb_segs = mbuf->nb_segs;
+		remaining -= nb_segs;
+
+		PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs);
+
+		while (nb_segs) {
+			ecore_chain_consume(&txq->tx_pbl);
+			nb_segs--;
+		}
+
+		rte_pktmbuf_free(mbuf);
+		idx = (idx + 1) & mask;
+		PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n");
 	}
+	txq->sw_tx_cons = idx;
 }
 
 static int qede_drain_txq(struct qede_dev *qdev,
-- 
2.29.2


* [dpdk-dev] [PATCH 5/8] net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
                   ` (3 preceding siblings ...)
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 4/8] net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk Balazs Nemeth
@ 2021-03-05 13:14 ` Balazs Nemeth
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 6/8] net/qede: prefetch txq->hw_cons_ptr Balazs Nemeth
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:14 UTC (permalink / raw)
  To: bnemeth, dev

rte_pktmbuf_free_bulk calls rte_mempool_put_bulk with the number of
pending packets to return to the mempool. In contrast, rte_pktmbuf_free
calls rte_mempool_put, which calls rte_mempool_put_bulk with a single
object. An important performance-related downside of returning packets
to the mempool one at a time is that on each call the per-core cache
pointer needs to be read from thread-local storage (TLS), whereas a
single rte_mempool_put_bulk reads it from TLS only once.
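
For illustration (hypothetical ring layout, not the driver code):
because rte_pktmbuf_free_bulk takes a contiguous array of mbuf
pointers, a completed segment of a power-of-two ring is released with
at most two calls, even when it wraps, and the per-core mempool cache
is looked up once per call instead of once per packet.

#include <stdint.h>
#include <rte_mbuf.h>

/* Free ring[first .. first + count) on a ring of size mask + 1. */
static void
ring_free_bulk(struct rte_mbuf **ring, uint16_t mask,
               uint16_t first, uint16_t count)
{
        uint16_t until_end = (uint16_t)(mask - first + 1);

        if (count > until_end) {
                /* The run wraps: free the tail of the array, then the head. */
                rte_pktmbuf_free_bulk(&ring[first], until_end);
                rte_pktmbuf_free_bulk(&ring[0],
                                      (unsigned int)(count - until_end));
        } else {
                rte_pktmbuf_free_bulk(&ring[first], count);
        }
}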

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 drivers/net/qede/qede_rxtx.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 7ce4c00e3..e24a937f4 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -891,6 +891,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 	struct rte_mbuf *mbuf;
 	uint16_t nb_segs;
 	uint16_t idx;
+	uint16_t first_idx;
 
 	rte_compiler_barrier();
 	sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl);
@@ -905,6 +906,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 
 	remaining = hw_bd_cons - sw_tx_cons;
 	txq->nb_tx_avail += remaining;
+	first_idx = idx;
 
 	while (remaining) {
 		mbuf = txq->sw_tx_ring[idx];
@@ -919,11 +921,17 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 			nb_segs--;
 		}
 
-		rte_pktmbuf_free(mbuf);
 		idx = (idx + 1) & mask;
 		PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n");
 	}
 	txq->sw_tx_cons = idx;
+
+	if (first_idx > idx) {
+		rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], mask - first_idx + 1);
+		rte_pktmbuf_free_bulk(&txq->sw_tx_ring[0], idx);
+	} else {
+		rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], idx - first_idx);
+	}
 }
 
 static int qede_drain_txq(struct qede_dev *qdev,
-- 
2.29.2


* [dpdk-dev] [PATCH 6/8] net/qede: prefetch txq->hw_cons_ptr
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
                   ` (4 preceding siblings ...)
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 5/8] net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free Balazs Nemeth
@ 2021-03-05 13:14 ` Balazs Nemeth
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 7/8] net/qede: prefetch next packet to free Balazs Nemeth
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:14 UTC (permalink / raw)
  To: bnemeth, dev

Ensure that txq->hw_cons_ptr is prefetched while
ecore_chain_get_cons_idx is running. This yields a slight performance
improvement.
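
A small sketch of the pattern (hypothetical function, not the driver
code): issue the prefetch for the location the NIC writes the
completion index to, do other work while the line is in flight, and
only then load it.

#include <stdint.h>
#include <rte_byteorder.h>
#include <rte_prefetch.h>

/* hw_cons_ptr points to a little-endian index written back by the NIC. */
static inline uint16_t
pending_completions(const uint16_t *hw_cons_ptr, uint16_t sw_cons)
{
        uint16_t hw_bd_cons;

        /* Start fetching the completion index cache line early... */
        rte_prefetch0(hw_cons_ptr);

        /* ...other work goes here; in the driver, the software consumer
         * index is computed while the prefetch is in flight... */

        /* ...so this load is more likely to hit in the cache. */
        hw_bd_cons = rte_le_to_cpu_16(*hw_cons_ptr);

        return (uint16_t)(hw_bd_cons - sw_cons);
}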

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 drivers/net/qede/qede_rxtx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index e24a937f4..b74a1ec1b 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -894,6 +894,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 	uint16_t first_idx;
 
 	rte_compiler_barrier();
+	rte_prefetch0(txq->hw_cons_ptr);
 	sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl);
 	hw_bd_cons = rte_le_to_cpu_16(*txq->hw_cons_ptr);
 #ifdef RTE_LIBRTE_QEDE_DEBUG_TX
-- 
2.29.2


* [dpdk-dev] [PATCH 7/8] net/qede: prefetch next packet to free
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
                   ` (5 preceding siblings ...)
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 6/8] net/qede: prefetch txq->hw_cons_ptr Balazs Nemeth
@ 2021-03-05 13:14 ` Balazs Nemeth
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 8/8] net/qede: remove page_offset from struct qede_rx_entry and simplify Balazs Nemeth
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:14 UTC (permalink / raw)
  To: bnemeth, dev

While handling the current mbuf, pull the next next mbuf into the cache.
Note that the last four mbufs pulled into the cache are not handled, but
that doesn't matter.
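
A sketch of the idea (hypothetical helper, not the driver code): while
the current mbuf is being handled, prefetch the first two cache lines
of the mbuf a few ring slots ahead, so they are warm by the time the
loop reaches it. The last few prefetches touch entries the current
call never uses, which costs little.

#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_prefetch.h>

#define PREFETCH_AHEAD 4

/* Free 'count' mbufs starting at 'idx' on a ring of size mask + 1. */
static void
free_run_with_prefetch(struct rte_mbuf **ring, uint16_t mask,
                       uint16_t idx, uint16_t count)
{
        uint16_t i;

        for (i = 0; i < count; i++) {
                struct rte_mbuf *next =
                        ring[(uint16_t)(idx + PREFETCH_AHEAD) & mask];

                /* Warm the mbuf a few entries ahead of the one we free. */
                rte_mbuf_prefetch_part1(next);
                rte_mbuf_prefetch_part2(next);

                rte_pktmbuf_free(ring[idx & mask]);
                idx++;
        }
}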

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 drivers/net/qede/qede_rxtx.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index b74a1ec1b..e89498811 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -915,6 +915,11 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev,
 		nb_segs = mbuf->nb_segs;
 		remaining -= nb_segs;
 
+		/* Prefetch the next mbuf. Note that at least the last 4 mbufs
+		   that are prefetched will not be used in the current call. */
+		rte_mbuf_prefetch_part1(txq->sw_tx_ring[(idx + 4) & mask]);
+		rte_mbuf_prefetch_part2(txq->sw_tx_ring[(idx + 4) & mask]);
+
 		PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs);
 
 		while (nb_segs) {
-- 
2.29.2


* [dpdk-dev] [PATCH 8/8] net/qede: remove page_offset from struct qede_rx_entry and simplify
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
                   ` (6 preceding siblings ...)
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 7/8] net/qede: prefetch next packet to free Balazs Nemeth
@ 2021-03-05 13:14 ` Balazs Nemeth
  2021-03-08 18:13 ` [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Jerin Jacob
  2021-03-22 17:08 ` [dpdk-dev] [EXT] " Igor Russkikh
  9 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-05 13:14 UTC (permalink / raw)
  To: bnemeth, dev

The member page_offset is always zero. Keeping it in qede_rx_entry
makes the entry larger than it needs to be, which has cache performance
implications, so remove that field. With page_offset gone,
qede_rx_entry only holds an rte_mbuf pointer, so remove the
qede_rx_entry definition altogether.
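
For illustration only (hypothetical names; the real change is in the
diff that follows): once the RX software ring is a plain array of
struct rte_mbuf pointers, a refill can ask the mempool for a batch
directly into the ring slots, with no intermediate object array and no
per-entry page_offset bookkeeping.

#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Refill up to 'count' contiguous slots starting at 'idx' on a ring of
 * size mask + 1. Returns 0 on success, negative on mempool shortage. */
static int
rx_ring_refill(struct rte_mempool *mp, struct rte_mbuf **ring,
               uint16_t mask, uint16_t idx, uint16_t count)
{
        /* Clamp to the end of the array so the destination stays
         * contiguous; the caller refills the wrapped part next time. */
        if (count > (uint16_t)(mask - idx + 1))
                count = (uint16_t)(mask - idx + 1);

        return rte_mempool_get_bulk(mp, (void **)&ring[idx], count);
}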

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 drivers/net/qede/qede_rxtx.c | 55 ++++++++++++++++++------------------
 drivers/net/qede/qede_rxtx.h | 11 +-------
 2 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index e89498811..1e8829ea5 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -24,8 +24,7 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
 			   rte_mempool_in_use_count(rxq->mb_pool));
 		return -ENOMEM;
 	}
-	rxq->sw_rx_ring[idx].mbuf = new_mb;
-	rxq->sw_rx_ring[idx].page_offset = 0;
+	rxq->sw_rx_ring[idx] = new_mb;
 	mapping = rte_mbuf_data_iova_default(new_mb);
 	/* Advance PROD and get BD pointer */
 	rx_bd = (struct eth_rx_bd *)ecore_chain_produce(&rxq->rx_bd_ring);
@@ -39,17 +38,23 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
 
 static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count)
 {
-	void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
 	struct rte_mbuf *mbuf = NULL;
 	struct eth_rx_bd *rx_bd;
 	dma_addr_t mapping;
 	int i, ret = 0;
 	uint16_t idx;
+	uint16_t mask = NUM_RX_BDS(rxq);
 
 	if (count > QEDE_MAX_BULK_ALLOC_COUNT)
 		count = QEDE_MAX_BULK_ALLOC_COUNT;
 
-	ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
+	idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+
+	if (count > mask - idx + 1)
+		count = mask - idx + 1;
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq->sw_rx_ring[idx], count);
+
 	if (unlikely(ret)) {
 		PMD_RX_LOG(ERR, rxq,
 			   "Failed to allocate %d rx buffers "
@@ -63,20 +68,17 @@ static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count)
 	}
 
 	for (i = 0; i < count; i++) {
-		mbuf = obj_p[i];
-		if (likely(i < count - 1))
-			rte_prefetch0(obj_p[i + 1]);
+		rte_prefetch0(rxq->sw_rx_ring[(idx + 1) & NUM_RX_BDS(rxq)]);
+		mbuf = rxq->sw_rx_ring[idx & NUM_RX_BDS(rxq)];
 
-		idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
-		rxq->sw_rx_ring[idx].mbuf = mbuf;
-		rxq->sw_rx_ring[idx].page_offset = 0;
 		mapping = rte_mbuf_data_iova_default(mbuf);
 		rx_bd = (struct eth_rx_bd *)
 			ecore_chain_produce(&rxq->rx_bd_ring);
 		rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
 		rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
-		rxq->sw_rx_prod++;
+		idx++;
 	}
+	rxq->sw_rx_prod = idx;
 
 	return 0;
 }
@@ -309,9 +311,9 @@ static void qede_rx_queue_release_mbufs(struct qede_rx_queue *rxq)
 
 	if (rxq->sw_rx_ring) {
 		for (i = 0; i < rxq->nb_rx_desc; i++) {
-			if (rxq->sw_rx_ring[i].mbuf) {
-				rte_pktmbuf_free(rxq->sw_rx_ring[i].mbuf);
-				rxq->sw_rx_ring[i].mbuf = NULL;
+			if (rxq->sw_rx_ring[i]) {
+				rte_pktmbuf_free(rxq->sw_rx_ring[i]);
+				rxq->sw_rx_ring[i] = NULL;
 			}
 		}
 	}
@@ -1313,18 +1315,15 @@ static inline void qede_rx_bd_ring_consume(struct qede_rx_queue *rxq)
 
 static inline void
 qede_reuse_page(__rte_unused struct qede_dev *qdev,
-		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
+		struct qede_rx_queue *rxq, struct rte_mbuf *curr_cons)
 {
 	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
 	uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
-	struct qede_rx_entry *curr_prod;
 	dma_addr_t new_mapping;
 
-	curr_prod = &rxq->sw_rx_ring[idx];
-	*curr_prod = *curr_cons;
+	rxq->sw_rx_ring[idx] = curr_cons;
 
-	new_mapping = rte_mbuf_data_iova_default(curr_prod->mbuf) +
-		      curr_prod->page_offset;
+	new_mapping = rte_mbuf_data_iova_default(curr_cons);
 
 	rx_bd_prod->addr.hi = rte_cpu_to_le_32(U64_HI(new_mapping));
 	rx_bd_prod->addr.lo = rte_cpu_to_le_32(U64_LO(new_mapping));
@@ -1336,10 +1335,10 @@ static inline void
 qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq,
 			struct qede_dev *qdev, uint8_t count)
 {
-	struct qede_rx_entry *curr_cons;
+	struct rte_mbuf *curr_cons;
 
 	for (; count > 0; count--) {
-		curr_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS(rxq)];
+		curr_cons = rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS(rxq)];
 		qede_reuse_page(qdev, rxq, curr_cons);
 		qede_rx_bd_ring_consume(rxq);
 	}
@@ -1361,7 +1360,7 @@ qede_rx_process_tpa_cmn_cont_end_cqe(__rte_unused struct qede_dev *qdev,
 	if (rte_le_to_cpu_16(len)) {
 		tpa_info = &rxq->tpa_info[agg_index];
 		cons_idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
-		curr_frag = rxq->sw_rx_ring[cons_idx].mbuf;
+		curr_frag = rxq->sw_rx_ring[cons_idx];
 		assert(curr_frag);
 		curr_frag->nb_segs = 1;
 		curr_frag->pkt_len = rte_le_to_cpu_16(len);
@@ -1493,7 +1492,7 @@ qede_process_sg_pkts(void *p_rxq,  struct rte_mbuf *rx_mb,
 			return -EINVAL;
 		}
 		sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
-		seg2 = rxq->sw_rx_ring[sw_rx_index].mbuf;
+		seg2 = rxq->sw_rx_ring[sw_rx_index];
 		qede_rx_bd_ring_consume(rxq);
 		pkt_len -= cur_size;
 		seg2->data_len = cur_size;
@@ -1612,7 +1611,7 @@ qede_recv_pkts_regular(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		/* Get the data from the SW ring */
 		sw_rx_index = rxq->sw_rx_cons & num_rx_bds;
-		rx_mb = rxq->sw_rx_ring[sw_rx_index].mbuf;
+		rx_mb = rxq->sw_rx_ring[sw_rx_index];
 		assert(rx_mb != NULL);
 
 		parse_flag = rte_le_to_cpu_16(fp_cqe->pars_flags.flags);
@@ -1711,7 +1710,7 @@ qede_recv_pkts_regular(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		/* Prefetch next mbuf while processing current one. */
 		preload_idx = rxq->sw_rx_cons & num_rx_bds;
-		rte_prefetch0(rxq->sw_rx_ring[preload_idx].mbuf);
+		rte_prefetch0(rxq->sw_rx_ring[preload_idx]);
 
 		/* Update rest of the MBUF fields */
 		rx_mb->data_off = offset + RTE_PKTMBUF_HEADROOM;
@@ -1869,7 +1868,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		/* Get the data from the SW ring */
 		sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
-		rx_mb = rxq->sw_rx_ring[sw_rx_index].mbuf;
+		rx_mb = rxq->sw_rx_ring[sw_rx_index];
 		assert(rx_mb != NULL);
 
 		/* Handle regular CQE or TPA start CQE */
@@ -2000,7 +1999,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		/* Prefetch next mbuf while processing current one. */
 		preload_idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
-		rte_prefetch0(rxq->sw_rx_ring[preload_idx].mbuf);
+		rte_prefetch0(rxq->sw_rx_ring[preload_idx]);
 
 		/* Update rest of the MBUF fields */
 		rx_mb->data_off = offset + RTE_PKTMBUF_HEADROOM;
diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
index 335016847..c9334448c 100644
--- a/drivers/net/qede/qede_rxtx.h
+++ b/drivers/net/qede/qede_rxtx.h
@@ -159,15 +159,6 @@
 #define QEDE_TX_OFFLOAD_NOTSUP_MASK \
 	(PKT_TX_OFFLOAD_MASK ^ QEDE_TX_OFFLOAD_MASK)
 
-/*
- * RX BD descriptor ring
- */
-struct qede_rx_entry {
-	struct rte_mbuf *mbuf;
-	uint32_t page_offset;
-	/* allows expansion .. */
-};
-
 /* TPA related structures */
 struct qede_agg_info {
 	struct rte_mbuf *tpa_head; /* Pointer to first TPA segment */
@@ -185,7 +176,7 @@ struct qede_rx_queue {
 	struct ecore_chain rx_comp_ring;
 	uint16_t *hw_cons_ptr;
 	void OSAL_IOMEM *hw_rxq_prod_addr;
-	struct qede_rx_entry *sw_rx_ring;
+	struct rte_mbuf **sw_rx_ring;
 	struct ecore_sb_info *sb_info;
 	uint16_t sw_rx_cons;
 	uint16_t sw_rx_prod;
-- 
2.29.2


* Re: [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
                   ` (7 preceding siblings ...)
  2021-03-05 13:14 ` [dpdk-dev] [PATCH 8/8] net/qede: remove page_offset from struct qede_rx_entry and simplify Balazs Nemeth
@ 2021-03-08 18:13 ` Jerin Jacob
  2021-03-10  6:43   ` [dpdk-dev] [EXT] " Igor Russkikh
  2021-03-22 17:08 ` [dpdk-dev] [EXT] " Igor Russkikh
  9 siblings, 1 reply; 17+ messages in thread
From: Jerin Jacob @ 2021-03-08 18:13 UTC (permalink / raw)
  To: Balazs Nemeth, Rasesh Mody, Shahed Shaikh; +Cc: dpdk-dev

On Fri, Mar 5, 2021 at 6:44 PM Balazs Nemeth <bnemeth@redhat.com> wrote:
>
> This patch set optimizes qede_{rx,tx}_entry and introduces
> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance
> improvement depends on the use-case; in a physical-virtual-physical test
> on a ThunderX2 99xx system with two SMT threads used in ovs,
> and two cores used in a vm, an improvement of around 2.55% is observed
> due to this patch set.


Cc: rmody@marvell.com
Cc: shshaikh@marvell.com

Hi Rasesh, Shahed
Could you review this series from Balazs?


>
> Balazs Nemeth (8):
>   net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf
>   net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
>   net/qede: assume txq->sw_tx_ring[idx] is never null in
>     qede_free_tx_pkt
>   net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk
>   net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
>   net/qede: prefetch txq->hw_cons_ptr
>   net/qede: prefetch next packet to free
>   net/qede: remove page_offset from struct qede_rx_entry and simplify
>
>  drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++----------------
>  drivers/net/qede/qede_rxtx.h |  21 +----
>  2 files changed, 81 insertions(+), 88 deletions(-)
>
> --
> 2.29.2
>

* Re: [dpdk-dev] [EXT] Re: [PATCH 0/8] Optimize qede use of rx/tx_entries
  2021-03-08 18:13 ` [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Jerin Jacob
@ 2021-03-10  6:43   ` Igor Russkikh
  2021-03-10  7:51     ` Jerin Jacob
  2021-03-20 13:16     ` Jerin Jacob
  0 siblings, 2 replies; 17+ messages in thread
From: Igor Russkikh @ 2021-03-10  6:43 UTC (permalink / raw)
  To: Jerin Jacob, Balazs Nemeth, Rasesh Mody, Devendra Singh Rawat; +Cc: dpdk-dev



> On Fri, Mar 5, 2021 at 6:44 PM Balazs Nemeth <bnemeth@redhat.com> wrote:
>>
>> This patch set optimizes qede_{rx,tx}_entry and introduces
>> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance
>> improvement depends on the use-case; in a physical-virtual-physical test
>> on a ThunderX2 99xx system with two SMT threads used in ovs,
>> and two cores used in a vm, an improvement of around 2.55% is observed
>> due to this patch set.

Hi Balazs, thanks for posting, quickly reviewed it, looks reasonable.
Doing a detailed review.

> Cc: rmody@marvell.com
> Cc: shshaikh@marvell.com>
> Hi Rasesh, Shahed
> Could you review this series from Balazs?

Shahed is no longer with Marvell; adding Devendra from our DPDK support team.

Igor

* Re: [dpdk-dev] [EXT] Re: [PATCH 0/8] Optimize qede use of rx/tx_entries
  2021-03-10  6:43   ` [dpdk-dev] [EXT] " Igor Russkikh
@ 2021-03-10  7:51     ` Jerin Jacob
  2021-03-10  8:17       ` Igor Russkikh
  2021-03-20 13:16     ` Jerin Jacob
  1 sibling, 1 reply; 17+ messages in thread
From: Jerin Jacob @ 2021-03-10  7:51 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: Balazs Nemeth, Rasesh Mody, Devendra Singh Rawat, dpdk-dev

On Wed, Mar 10, 2021 at 12:13 PM Igor Russkikh <irusskikh@marvell.com> wrote:
>
>
>
> > On Fri, Mar 5, 2021 at 6:44 PM Balazs Nemeth <bnemeth@redhat.com> wrote:
> >>
> >> This patch set optimizes qede_{rx,tx}_entry and introduces
> >> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance
> >> improvement depends on the use-case; in a physical-virtual-physical test
> >> on a ThunderX2 99xx system with two SMT threads used in ovs,
> >> and two cores used in a vm, an improvement of around 2.55% is observed
> >> due to this patch set.
>
> Hi Balazs, thanks for posting, quickly reviewed it, looks reasonable.
> Doing a detailed review.
>
> > Cc: rmody@marvell.com
> > Cc: shshaikh@marvell.com>
> > Hi Rasesh, Shahed
> > Could you review this series from Balazs?
>
> Shahed is no longer with Marvell; adding Devendra from our DPDK support team.

Could you update the MAINTAINERS to reflect that status?


>
> Igor

* Re: [dpdk-dev] [EXT] Re: [PATCH 0/8] Optimize qede use of rx/tx_entries
  2021-03-10  7:51     ` Jerin Jacob
@ 2021-03-10  8:17       ` Igor Russkikh
  0 siblings, 0 replies; 17+ messages in thread
From: Igor Russkikh @ 2021-03-10  8:17 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: Balazs Nemeth, Rasesh Mody, Devendra Singh Rawat, dpdk-dev


>>> Cc: rmody@marvell.com
>>> Cc: shshaikh@marvell.com>
>>> Hi Rasesh, Shahed
>>> Could you review this series from Balazs?
>>
>> Shahed is no longer with Marvell; adding Devendra from our DPDK support team.
> 
> Could you update the MAINTAINERS to reflect that status?

Sure, done, just sent!

Igor

* Re: [dpdk-dev] [EXT] Re: [PATCH 0/8] Optimize qede use of rx/tx_entries
  2021-03-10  6:43   ` [dpdk-dev] [EXT] " Igor Russkikh
  2021-03-10  7:51     ` Jerin Jacob
@ 2021-03-20 13:16     ` Jerin Jacob
  1 sibling, 0 replies; 17+ messages in thread
From: Jerin Jacob @ 2021-03-20 13:16 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: Balazs Nemeth, Rasesh Mody, Devendra Singh Rawat, dpdk-dev

On Wed, Mar 10, 2021 at 12:13 PM Igor Russkikh <irusskikh@marvell.com> wrote:
>
>
>
> > On Fri, Mar 5, 2021 at 6:44 PM Balazs Nemeth <bnemeth@redhat.com> wrote:
> >>
> >> This patch set optimizes qede_{rx,tx}_entry and introduces
> >> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance
> >> improvement depends on the use-case; in a physical-virtual-physical test
> >> on a ThunderX2 99xx system with two SMT threads used in ovs,
> >> and two cores used in a vm, an improvement of around 2.55% is observed
> >> due to this patch set.
>
> Hi Balazs, thanks for posting, quickly reviewed it, looks reasonable.

Waiting for the review/ack to merge this series

> Doing a detailed review.
>
> > Cc: rmody@marvell.com
> > Cc: shshaikh@marvell.com>
> > Hi Rasesh, Shahed
> > Could you review this series from Balazs?
>
> Shahed is no longer with Marvell; adding Devendra from our DPDK support team.
>
> Igor

* Re: [dpdk-dev] [EXT] [PATCH 0/8] Optimize qede use of rx/tx_entries
  2021-03-05 13:13 [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Balazs Nemeth
                   ` (8 preceding siblings ...)
  2021-03-08 18:13 ` [dpdk-dev] [PATCH 0/8] Optimize qede use of rx/tx_entries Jerin Jacob
@ 2021-03-22 17:08 ` Igor Russkikh
  2021-03-24  9:18   ` Jerin Jacob
  9 siblings, 1 reply; 17+ messages in thread
From: Igor Russkikh @ 2021-03-22 17:08 UTC (permalink / raw)
  To: Balazs Nemeth, dev, Jerin Jacob; +Cc: Devendra Singh Rawat, Rasesh Mody



On 3/5/2021 2:13 PM, Balazs Nemeth wrote:
> This patch set optimizes qede_{rx,tx}_entry and introduces
> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance
> improvement depends on the use-case; in a physical-virtual-physical test
> on a ThunderX2 99xx system with two SMT threads used in ovs,
> and two cores used in a vm, an improvement of around 2.55% is observed
> due to this patch set.
> 
> Balazs Nemeth (8):
>   net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf
>   net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
>   net/qede: assume txq->sw_tx_ring[idx] is never null in
>     qede_free_tx_pkt
>   net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk
>   net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
>   net/qede: prefetch txq->hw_cons_ptr
>   net/qede: prefetch next packet to free
>   net/qede: remove page_offset from struct qede_rx_entry and simplify
> 
>  drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++----------------
>  drivers/net/qede/qede_rxtx.h |  21 +----
>  2 files changed, 81 insertions(+), 88 deletions(-)

Series reviewed, for the series

Acked-by: Igor Russkikh <irusskikh@marvell.com>

One checkpatch warning I see in patchwork output, probably worth fixing:

ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)"
#120: FILE: drivers/net/qede/qede_rxtx.c:56:
+	ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq->sw_rx_ring[idx], count);

Thanks
  Igor

* Re: [dpdk-dev] [EXT] [PATCH 0/8] Optimize qede use of rx/tx_entries
  2021-03-22 17:08 ` [dpdk-dev] [EXT] " Igor Russkikh
@ 2021-03-24  9:18   ` Jerin Jacob
  2021-03-24  9:34     ` Balazs Nemeth
  0 siblings, 1 reply; 17+ messages in thread
From: Jerin Jacob @ 2021-03-24  9:18 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: Balazs Nemeth, dpdk-dev, Devendra Singh Rawat, Rasesh Mody

On Mon, Mar 22, 2021 at 10:38 PM Igor Russkikh <irusskikh@marvell.com> wrote:
>
>
>
> On 3/5/2021 2:13 PM, Balazs Nemeth wrote:
> > This patch set optimizes qede_{rx,tx}_entry and introduces
> > rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance
> > improvement depends on the use-case; in a physical-virtual-physical test
> > on a ThunderX2 99xx system with two SMT threads used in ovs,
> > and two cores used in a vm, an improvement of around 2.55% is observed
> > due to this patch set.
> >
> > Balazs Nemeth (8):
> >   net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf
> >   net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
> >   net/qede: assume txq->sw_tx_ring[idx] is never null in
> >     qede_free_tx_pkt
> >   net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk
> >   net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
> >   net/qede: prefetch txq->hw_cons_ptr
> >   net/qede: prefetch next packet to free
> >   net/qede: remove page_offset from struct qede_rx_entry and simplify
> >
> >  drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++----------------
> >  drivers/net/qede/qede_rxtx.h |  21 +----
> >  2 files changed, 81 insertions(+), 88 deletions(-)
>
> Series reviewed, for the series
>
> Acked-by: Igor Russkikh <irusskikh@marvell.com>
>
> One checkpatch warning I see in patchwork output, probably worth fixing:
>
> ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)"
> #120: FILE: drivers/net/qede/qede_rxtx.c:56:
> +       ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq->sw_rx_ring[idx], count);


Hi @Balazs Nemeth

Please fix the following checkpatch and check-git-log.sh issues and
add Igor's Reviewed-by in the next version.
I will merge the next version with the fixes. Updated this series'
status in the patchwork as "Changes requested".


Wrong headline format:
        net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf
        net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
        net/qede: assume txq->sw_tx_ring[idx] is never null in qede_free_tx_pkt
        net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk
        net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
        net/qede: prefetch txq->hw_cons_ptr
        net/qede: remove page_offset from struct qede_rx_entry and simplify
Headline too long:
        net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf
        net/qede: assume txq->sw_tx_ring[idx] is never null in qede_free_tx_pkt
        net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk
        net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
        net/qede: remove page_offset from struct qede_rx_entry and simplify

Invalid patch(es) found - checked 8 patches
check-git-log failed



### net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf

WARNING:LONG_LINE: line length of 89 exceeds 80 columns
#37: FILE: drivers/net/qede/qede_rxtx.c:429:
+                                            (sizeof(txq->sw_tx_ring)
* txq->nb_tx_desc),

total: 0 errors, 1 warnings, 0 checks, 87 lines checked

### net/qede: avoid repeatedly calling ecore_chain_get_cons_idx

WARNING:BRACES: braces {} are not necessary for single statement blocks
#82: FILE: drivers/net/qede/qede_rxtx.c:934:
+       while (remaining) {
+               remaining -= qede_free_tx_pkt(txq);
+       }

total: 0 errors, 1 warnings, 0 checks, 67 lines checked

### net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free

WARNING:LONG_LINE: line length of 89 exceeds 80 columns
#48: FILE: drivers/net/qede/qede_rxtx.c:930:
+               rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx],
mask - first_idx + 1);

WARNING:LONG_LINE: line length of 84 exceeds 80 columns
#51: FILE: drivers/net/qede/qede_rxtx.c:933:
+               rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], idx
- first_idx);

total: 0 errors, 2 warnings, 0 checks, 32 lines checked

### net/qede: prefetch next packet to free

WARNING:REPEATED_WORD: Possible repeated word: 'next'
#6:
While handling the current mbuf, pull the next next mbuf into the cache.

WARNING:BLOCK_COMMENT_STYLE: Block comments use * on subsequent lines
#21: FILE: drivers/net/qede/qede_rxtx.c:919:
+               /* Prefetch the next mbuf. Note that at least the last 4 mbufs
+                  that are prefetched will not be used in the current call. */

WARNING:BLOCK_COMMENT_STYLE: Block comments use a trailing */ on a separate line
#21: FILE: drivers/net/qede/qede_rxtx.c:919:
+                  that are prefetched will not be used in the current call. */

total: 0 errors, 3 warnings, 0 checks, 11 lines checked

### net/qede: remove page_offset from struct qede_rx_entry and simplify

WARNING:LONG_LINE: line length of 87 exceeds 80 columns
#49: FILE: drivers/net/qede/qede_rxtx.c:56:
+       ret = rte_mempool_get_bulk(rxq->mb_pool,
(void**)&rxq->sw_rx_ring[idx], count);

ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)"
#49: FILE: drivers/net/qede/qede_rxtx.c:56:
+       ret = rte_mempool_get_bulk(rxq->mb_pool,
(void**)&rxq->sw_rx_ring[idx], count);

total: 1 errors, 1 warnings, 0 checks, 174 lines checked



>
> Thanks
>   Igor

* Re: [dpdk-dev] [EXT] [PATCH 0/8] Optimize qede use of rx/tx_entries
  2021-03-24  9:18   ` Jerin Jacob
@ 2021-03-24  9:34     ` Balazs Nemeth
  0 siblings, 0 replies; 17+ messages in thread
From: Balazs Nemeth @ 2021-03-24  9:34 UTC (permalink / raw)
  To: Jerin Jacob, Igor Russkikh; +Cc: dpdk-dev, Devendra Singh Rawat, Rasesh Mody

On Wed, 2021-03-24 at 14:48 +0530, Jerin Jacob wrote:
> On Mon, Mar 22, 2021 at 10:38 PM Igor Russkikh
> <irusskikh@marvell.com> wrote:
> > 
> > 
> > 
> > On 3/5/2021 2:13 PM, Balazs Nemeth wrote:
> > > This patch set optimizes qede_{rx,tx}_entry and introduces
> > > rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall
> > > performance
> > > improvement depends on the use-case; in a physical-virtual-
> > > physical test
> > > on a ThunderX2 99xx system with two SMT threads used in ovs,
> > > and two cores used in a vm, an improvement of around 2.55% is
> > > observed
> > > due to this patch set.
> > > 
> > > Balazs Nemeth (8):
> > >   net/qede: remove flags from qede_tx_entry and simplify to
> > > rte_mbuf
> > >   net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
> > >   net/qede: assume txq->sw_tx_ring[idx] is never null in
> > >     qede_free_tx_pkt
> > >   net/qede: inline qede_free_tx_pkt to prepare for
> > > rte_pktmbuf_free_bulk
> > >   net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
> > >   net/qede: prefetch txq->hw_cons_ptr
> > >   net/qede: prefetch next packet to free
> > >   net/qede: remove page_offset from struct qede_rx_entry and
> > > simplify
> > > 
> > >  drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++----------
> > > ------
> > >  drivers/net/qede/qede_rxtx.h |  21 +----
> > >  2 files changed, 81 insertions(+), 88 deletions(-)
> > 
> > Series reviewed, for the series
> > 
> > Acked-by: Igor Russkikh <irusskikh@marvell.com>
> > 
> > One checkpatch warn I see in patchwork output, probably worth
> > fixing:
> > 
> > ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)"
> > #120: FILE: drivers/net/qede/qede_rxtx.c:56:
> > +       ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq-
> > >sw_rx_ring[idx], count);
> 
> 
> Hi @Balazs Nemeth
> 
> Please fix the following checkpatch and check-git-log.sh issues and
> add Igor's Reviewed-by in the next version.
> I will merge the next version with the fixes. Updated this series'
> status in the patchwork as "Changes requested".
> 

Ok I will provide a new version next week. Thanks for the feedback!

> 
> Wrong headline format:
>         net/qede: remove flags from qede_tx_entry and simplify to
> rte_mbuf
>         net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
>         net/qede: assume txq->sw_tx_ring[idx] is never null in
> qede_free_tx_pkt
>         net/qede: inline qede_free_tx_pkt to prepare for
> rte_pktmbuf_free_bulk
>         net/qede: use rte_pktmbuf_free_bulk instead of
> rte_pktmbuf_free
>         net/qede: prefetch txq->hw_cons_ptr
>         net/qede: remove page_offset from struct qede_rx_entry and
> simplify
> Headline too long:
>         net/qede: remove flags from qede_tx_entry and simplify to
> rte_mbuf
>         net/qede: assume txq->sw_tx_ring[idx] is never null in
> qede_free_tx_pkt
>         net/qede: inline qede_free_tx_pkt to prepare for
> rte_pktmbuf_free_bulk
>         net/qede: use rte_pktmbuf_free_bulk instead of
> rte_pktmbuf_free
>         net/qede: remove page_offset from struct qede_rx_entry and
> simplify
> 
> Invalid patch(es) found - checked 8 patches
> check-git-log failed
> 
> 
> 
> ### net/qede: remove flags from qede_tx_entry and simplify to
> rte_mbuf
> 
> WARNING:LONG_LINE: line length of 89 exceeds 80 columns
> #37: FILE: drivers/net/qede/qede_rxtx.c:429:
> +                                            (sizeof(txq->sw_tx_ring)
> * txq->nb_tx_desc),
> 
> total: 0 errors, 1 warnings, 0 checks, 87 lines checked
> 
> ### net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
> 
> WARNING:BRACES: braces {} are not necessary for single statement
> blocks
> #82: FILE: drivers/net/qede/qede_rxtx.c:934:
> +       while (remaining) {
> +               remaining -= qede_free_tx_pkt(txq);
> +       }
> 
> total: 0 errors, 1 warnings, 0 checks, 67 lines checked
> 
> ### net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
> 
> WARNING:LONG_LINE: line length of 89 exceeds 80 columns
> #48: FILE: drivers/net/qede/qede_rxtx.c:930:
> +               rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx],
> mask - first_idx + 1);
> 
> WARNING:LONG_LINE: line length of 84 exceeds 80 columns
> #51: FILE: drivers/net/qede/qede_rxtx.c:933:
> +               rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx],
> idx
> - first_idx);
> 
> total: 0 errors, 2 warnings, 0 checks, 32 lines checked
> 
> ### net/qede: prefetch next packet to free
> 
> WARNING:REPEATED_WORD: Possible repeated word: 'next'
> #6:
> While handling the current mbuf, pull the next next mbuf into the
> cache.
> 
> WARNING:BLOCK_COMMENT_STYLE: Block comments use * on subsequent lines
> #21: FILE: drivers/net/qede/qede_rxtx.c:919:
> +               /* Prefetch the next mbuf. Note that at least the
> last 4 mbufs
> +                  that are prefetched will not be used in the
> current call. */
> 
> WARNING:BLOCK_COMMENT_STYLE: Block comments use a trailing */ on a
> separate line
> #21: FILE: drivers/net/qede/qede_rxtx.c:919:
> +                  that are prefetched will not be used in the
> current call. */
> 
> total: 0 errors, 3 warnings, 0 checks, 11 lines checked
> 
> ### net/qede: remove page_offset from struct qede_rx_entry and
> simplify
> 
> WARNING:LONG_LINE: line length of 87 exceeds 80 columns
> #49: FILE: drivers/net/qede/qede_rxtx.c:56:
> +       ret = rte_mempool_get_bulk(rxq->mb_pool,
> (void**)&rxq->sw_rx_ring[idx], count);
> 
> ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)"
> #49: FILE: drivers/net/qede/qede_rxtx.c:56:
> +       ret = rte_mempool_get_bulk(rxq->mb_pool,
> (void**)&rxq->sw_rx_ring[idx], count);
> 
> total: 1 errors, 1 warnings, 0 checks, 174 lines checked
> 
> 
> 
> > 
> > Thanks
> >   Igor
> 

Regards,
Balazs


