This patch set optimizes qede_{rx,tx}_entry and introduces rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance improvement depends on the use-case; in a physical-virtual-physical test on a ThunderX2 99xx system with two SMT threads used in ovs, and two cores used in a vm, an improvement of around 2.55% is observed due to this patch set. Balazs Nemeth (8): net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf net/qede: avoid repeatedly calling ecore_chain_get_cons_idx net/qede: assume txq->sw_tx_ring[idx] is never null in qede_free_tx_pkt net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free net/qede: prefetch txq->hw_cons_ptr net/qede: prefetch next packet to free net/qede: remove page_offset from struct qede_rx_entry and simplify drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++---------------- drivers/net/qede/qede_rxtx.h | 21 +---- 2 files changed, 81 insertions(+), 88 deletions(-) -- 2.29.2
Each sw_tx_ring entry was of type struct qede_tx_entry: struct qede_tx_entry { struct rte_mbuf *mbuf; uint8_t flags; }; Leaving the unused flags member here has a few performance implications. First, each qede_tx_entry takes up more memory which has caching implications as less entries fit in a cache line while multiple entries are frequently handled in batches. Second, an array of qede_tx_entry entries is incompatible with existing APIs that expect an array of rte_mbuf pointers. Consequently, an extra array need to be allocated before calling such APIs and each entry needs to be copied over. This patch omits the flags field and replaces the qede_tx_entry entry by a simple rte_mbuf pointer. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> --- drivers/net/qede/qede_rxtx.c | 20 ++++++++++---------- drivers/net/qede/qede_rxtx.h | 10 +--------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 75d78cebb..c91c96ece 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -426,8 +426,7 @@ qede_alloc_tx_queue_mem(struct rte_eth_dev *dev, /* Allocate software ring */ txq->sw_tx_ring = rte_zmalloc_socket("txq->sw_tx_ring", - (sizeof(struct qede_tx_entry) * - txq->nb_tx_desc), + (sizeof(txq->sw_tx_ring) * txq->nb_tx_desc), RTE_CACHE_LINE_SIZE, socket_id); if (!txq->sw_tx_ring) { @@ -523,9 +522,9 @@ static void qede_tx_queue_release_mbufs(struct qede_tx_queue *txq) if (txq->sw_tx_ring) { for (i = 0; i < txq->nb_tx_desc; i++) { - if (txq->sw_tx_ring[i].mbuf) { - rte_pktmbuf_free(txq->sw_tx_ring[i].mbuf); - txq->sw_tx_ring[i].mbuf = NULL; + if (txq->sw_tx_ring[i]) { + rte_pktmbuf_free(txq->sw_tx_ring[i]); + txq->sw_tx_ring[i] = NULL; } } } @@ -889,10 +888,11 @@ qede_free_tx_pkt(struct qede_tx_queue *txq) uint16_t idx; idx = TX_CONS(txq); - mbuf = txq->sw_tx_ring[idx].mbuf; + mbuf = txq->sw_tx_ring[idx]; if (mbuf) { nb_segs = mbuf->nb_segs; PMD_TX_LOG(DEBUG, txq, "nb_segs to 
free %u\n", nb_segs); + while (nb_segs) { /* It's like consuming rxbuf in recv() */ ecore_chain_consume(&txq->tx_pbl); @@ -900,7 +900,7 @@ qede_free_tx_pkt(struct qede_tx_queue *txq) nb_segs--; } rte_pktmbuf_free(mbuf); - txq->sw_tx_ring[idx].mbuf = NULL; + txq->sw_tx_ring[idx] = NULL; txq->sw_tx_cons++; PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } else { @@ -2243,7 +2243,7 @@ qede_xmit_pkts_regular(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) struct eth_tx_3rd_bd *bd3; struct rte_mbuf *m_seg = NULL; struct rte_mbuf *mbuf; - struct qede_tx_entry *sw_tx_ring; + struct rte_mbuf **sw_tx_ring; uint16_t nb_tx_pkts; uint16_t bd_prod; uint16_t idx; @@ -2306,7 +2306,7 @@ qede_xmit_pkts_regular(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Fill the entry in the SW ring and the BDs in the FW ring */ idx = TX_PROD(txq); - sw_tx_ring[idx].mbuf = mbuf; + sw_tx_ring[idx] = mbuf; /* BD1 */ bd1 = (struct eth_tx_1st_bd *)ecore_chain_produce(&txq->tx_pbl); @@ -2612,7 +2612,7 @@ qede_xmit_pkts(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Fill the entry in the SW ring and the BDs in the FW ring */ idx = TX_PROD(txq); - txq->sw_tx_ring[idx].mbuf = mbuf; + txq->sw_tx_ring[idx] = mbuf; /* BD1 */ bd1 = (struct eth_tx_1st_bd *)ecore_chain_produce(&txq->tx_pbl); diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h index fcb564a1b..335016847 100644 --- a/drivers/net/qede/qede_rxtx.h +++ b/drivers/net/qede/qede_rxtx.h @@ -203,14 +203,6 @@ struct qede_rx_queue { void *handle; }; -/* - * TX BD descriptor ring - */ -struct qede_tx_entry { - struct rte_mbuf *mbuf; - uint8_t flags; -}; - union db_prod { struct eth_db_data data; uint32_t raw; @@ -220,7 +212,7 @@ struct qede_tx_queue { /* Always keep qdev as first member */ struct qede_dev *qdev; struct ecore_chain tx_pbl; - struct qede_tx_entry *sw_tx_ring; + struct rte_mbuf **sw_tx_ring; uint16_t nb_tx_desc; uint16_t nb_tx_avail; uint16_t tx_free_thresh; -- 2.29.2
Calling ecore_chain_get_cons_idx repeatedly is slower than calling it once and using the result for the remainder of qede_process_tx_compl. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> --- drivers/net/qede/qede_rxtx.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index c91c96ece..1308c71db 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -880,12 +880,13 @@ qede_tx_queue_start(struct rte_eth_dev *eth_dev, uint16_t tx_queue_id) return rc; } -static inline void +static inline uint16_t qede_free_tx_pkt(struct qede_tx_queue *txq) { struct rte_mbuf *mbuf; uint16_t nb_segs; uint16_t idx; + uint16_t ret; idx = TX_CONS(txq); mbuf = txq->sw_tx_ring[idx]; @@ -893,20 +894,22 @@ qede_free_tx_pkt(struct qede_tx_queue *txq) nb_segs = mbuf->nb_segs; PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); + ret = nb_segs; while (nb_segs) { /* It's like consuming rxbuf in recv() */ ecore_chain_consume(&txq->tx_pbl); - txq->nb_tx_avail++; nb_segs--; } + rte_pktmbuf_free(mbuf); txq->sw_tx_ring[idx] = NULL; txq->sw_tx_cons++; PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } else { ecore_chain_consume(&txq->tx_pbl); - txq->nb_tx_avail++; + ret = 1; } + return ret; } static inline void @@ -914,19 +917,23 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, struct qede_tx_queue *txq) { uint16_t hw_bd_cons; -#ifdef RTE_LIBRTE_QEDE_DEBUG_TX uint16_t sw_tx_cons; -#endif + uint16_t remaining; rte_compiler_barrier(); + sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); hw_bd_cons = rte_le_to_cpu_16(*txq->hw_cons_ptr); #ifdef RTE_LIBRTE_QEDE_DEBUG_TX - sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); PMD_TX_LOG(DEBUG, txq, "Tx Completions = %u\n", abs(hw_bd_cons - sw_tx_cons)); #endif - while (hw_bd_cons != ecore_chain_get_cons_idx(&txq->tx_pbl)) - qede_free_tx_pkt(txq); + + remaining = hw_bd_cons - sw_tx_cons; + txq->nb_tx_avail += 
remaining; + + while (remaining) { + remaining -= qede_free_tx_pkt(txq); + } } static int qede_drain_txq(struct qede_dev *qdev, -- 2.29.2
The ring txq->sw_tx_ring is managed with txq->sw_tx_cons. As long as txq->sw_tx_cons is correct, there is no need to check if txq->sw_tx_ring[idx] is null explicitly. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> --- drivers/net/qede/qede_rxtx.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 1308c71db..c5e12ac6b 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -890,25 +890,20 @@ qede_free_tx_pkt(struct qede_tx_queue *txq) idx = TX_CONS(txq); mbuf = txq->sw_tx_ring[idx]; - if (mbuf) { - nb_segs = mbuf->nb_segs; - PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); + RTE_ASSERT(mbuf); + nb_segs = mbuf->nb_segs; + PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); - ret = nb_segs; - while (nb_segs) { - /* It's like consuming rxbuf in recv() */ - ecore_chain_consume(&txq->tx_pbl); - nb_segs--; - } - - rte_pktmbuf_free(mbuf); - txq->sw_tx_ring[idx] = NULL; - txq->sw_tx_cons++; - PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); - } else { + ret = nb_segs; + while (nb_segs) { + /* It's like consuming rxbuf in recv() */ ecore_chain_consume(&txq->tx_pbl); - ret = 1; + nb_segs--; } + + rte_pktmbuf_free(mbuf); + txq->sw_tx_cons++; + PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); return ret; } -- 2.29.2
The next patch will introduce the use of rte_pktmbuf_free_bulk. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> --- drivers/net/qede/qede_rxtx.c | 51 ++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index c5e12ac6b..7ce4c00e3 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -880,33 +880,6 @@ qede_tx_queue_start(struct rte_eth_dev *eth_dev, uint16_t tx_queue_id) return rc; } -static inline uint16_t -qede_free_tx_pkt(struct qede_tx_queue *txq) -{ - struct rte_mbuf *mbuf; - uint16_t nb_segs; - uint16_t idx; - uint16_t ret; - - idx = TX_CONS(txq); - mbuf = txq->sw_tx_ring[idx]; - RTE_ASSERT(mbuf); - nb_segs = mbuf->nb_segs; - PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); - - ret = nb_segs; - while (nb_segs) { - /* It's like consuming rxbuf in recv() */ - ecore_chain_consume(&txq->tx_pbl); - nb_segs--; - } - - rte_pktmbuf_free(mbuf); - txq->sw_tx_cons++; - PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); - return ret; -} - static inline void qede_process_tx_compl(__rte_unused struct ecore_dev *edev, struct qede_tx_queue *txq) @@ -914,6 +887,10 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, uint16_t hw_bd_cons; uint16_t sw_tx_cons; uint16_t remaining; + uint16_t mask; + struct rte_mbuf *mbuf; + uint16_t nb_segs; + uint16_t idx; rte_compiler_barrier(); sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); @@ -923,12 +900,30 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, abs(hw_bd_cons - sw_tx_cons)); #endif + mask = NUM_TX_BDS(txq); + idx = txq->sw_tx_cons & mask; + remaining = hw_bd_cons - sw_tx_cons; txq->nb_tx_avail += remaining; while (remaining) { - remaining -= qede_free_tx_pkt(txq); + mbuf = txq->sw_tx_ring[idx]; + RTE_ASSERT(mbuf); + nb_segs = mbuf->nb_segs; + remaining -= nb_segs; + + PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); + + while (nb_segs) { + 
ecore_chain_consume(&txq->tx_pbl); + nb_segs--; + } + + rte_pktmbuf_free(mbuf); + idx = (idx + 1) & mask; + PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } + txq->sw_tx_cons = idx; } static int qede_drain_txq(struct qede_dev *qdev, -- 2.29.2
rte_pktmbuf_free_bulk calls rte_mempool_put_bulk with the number of pending packets to return to the mempool. In contrast, rte_pktmbuf_free calls rte_mempool_put that calls rte_mempool_put_bulk with one object. An important performance related downside of adding one packet at a time to the mempool is that on each call, the per-core cache pointer needs to be read from tls while a single rte_mempool_put_bulk only reads from the tls once. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> --- drivers/net/qede/qede_rxtx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 7ce4c00e3..e24a937f4 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -891,6 +891,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, struct rte_mbuf *mbuf; uint16_t nb_segs; uint16_t idx; + uint16_t first_idx; rte_compiler_barrier(); sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); @@ -905,6 +906,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, remaining = hw_bd_cons - sw_tx_cons; txq->nb_tx_avail += remaining; + first_idx = idx; while (remaining) { mbuf = txq->sw_tx_ring[idx]; @@ -919,11 +921,17 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, nb_segs--; } - rte_pktmbuf_free(mbuf); idx = (idx + 1) & mask; PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } txq->sw_tx_cons = idx; + + if (first_idx > idx) { + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], mask - first_idx + 1); + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[0], idx); + } else { + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], idx - first_idx); + } } static int qede_drain_txq(struct qede_dev *qdev, -- 2.29.2
Ensure that, while ecore_chain_get_cons_idx is running, txq->hw_cons_ptr is prefetched. This shows a slight performance improvement. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> --- drivers/net/qede/qede_rxtx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index e24a937f4..b74a1ec1b 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -894,6 +894,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, uint16_t first_idx; rte_compiler_barrier(); + rte_prefetch0(txq->hw_cons_ptr); sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); hw_bd_cons = rte_le_to_cpu_16(*txq->hw_cons_ptr); #ifdef RTE_LIBRTE_QEDE_DEBUG_TX -- 2.29.2
While handling the current mbuf, pull the next next mbuf into the cache. Note that the last four mbufs pulled into the cache are not handled, but that doesn't matter. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> --- drivers/net/qede/qede_rxtx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index b74a1ec1b..e89498811 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -915,6 +915,11 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, nb_segs = mbuf->nb_segs; remaining -= nb_segs; + /* Prefetch the next mbuf. Note that at least the last 4 mbufs + that are prefetched will not be used in the current call. */ + rte_mbuf_prefetch_part1(txq->sw_tx_ring[(idx + 4) & mask]); + rte_mbuf_prefetch_part2(txq->sw_tx_ring[(idx + 4) & mask]); + PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); while (nb_segs) { -- 2.29.2
The member page_offset is always zero. Having this in the qede_rx_entry makes it larger than it needs to be and this has cache performance implications so remove that field. In addition, since qede_rx_entry only has an rte_mbuf*, remove the definition of qede_rx_entry. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> --- drivers/net/qede/qede_rxtx.c | 55 ++++++++++++++++++------------------ drivers/net/qede/qede_rxtx.h | 11 +------- 2 files changed, 28 insertions(+), 38 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index e89498811..1e8829ea5 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -24,8 +24,7 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq) rte_mempool_in_use_count(rxq->mb_pool)); return -ENOMEM; } - rxq->sw_rx_ring[idx].mbuf = new_mb; - rxq->sw_rx_ring[idx].page_offset = 0; + rxq->sw_rx_ring[idx] = new_mb; mapping = rte_mbuf_data_iova_default(new_mb); /* Advance PROD and get BD pointer */ rx_bd = (struct eth_rx_bd *)ecore_chain_produce(&rxq->rx_bd_ring); @@ -39,17 +38,23 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq) static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count) { - void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned; struct rte_mbuf *mbuf = NULL; struct eth_rx_bd *rx_bd; dma_addr_t mapping; int i, ret = 0; uint16_t idx; + uint16_t mask = NUM_RX_BDS(rxq); if (count > QEDE_MAX_BULK_ALLOC_COUNT) count = QEDE_MAX_BULK_ALLOC_COUNT; - ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count); + idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq); + + if (count > mask - idx + 1) + count = mask - idx + 1; + + ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq->sw_rx_ring[idx], count); + if (unlikely(ret)) { PMD_RX_LOG(ERR, rxq, "Failed to allocate %d rx buffers " @@ -63,20 +68,17 @@ static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count) } for (i = 0; i < count; i++) { - mbuf = obj_p[i]; - if 
(likely(i < count - 1)) - rte_prefetch0(obj_p[i + 1]); + rte_prefetch0(rxq->sw_rx_ring[(idx + 1) & NUM_RX_BDS(rxq)]); + mbuf = rxq->sw_rx_ring[idx & NUM_RX_BDS(rxq)]; - idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq); - rxq->sw_rx_ring[idx].mbuf = mbuf; - rxq->sw_rx_ring[idx].page_offset = 0; mapping = rte_mbuf_data_iova_default(mbuf); rx_bd = (struct eth_rx_bd *) ecore_chain_produce(&rxq->rx_bd_ring); rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping)); rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping)); - rxq->sw_rx_prod++; + idx++; } + rxq->sw_rx_prod = idx; return 0; } @@ -309,9 +311,9 @@ static void qede_rx_queue_release_mbufs(struct qede_rx_queue *rxq) if (rxq->sw_rx_ring) { for (i = 0; i < rxq->nb_rx_desc; i++) { - if (rxq->sw_rx_ring[i].mbuf) { - rte_pktmbuf_free(rxq->sw_rx_ring[i].mbuf); - rxq->sw_rx_ring[i].mbuf = NULL; + if (rxq->sw_rx_ring[i]) { + rte_pktmbuf_free(rxq->sw_rx_ring[i]); + rxq->sw_rx_ring[i] = NULL; } } } @@ -1313,18 +1315,15 @@ static inline void qede_rx_bd_ring_consume(struct qede_rx_queue *rxq) static inline void qede_reuse_page(__rte_unused struct qede_dev *qdev, - struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons) + struct qede_rx_queue *rxq, struct rte_mbuf *curr_cons) { struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring); uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq); - struct qede_rx_entry *curr_prod; dma_addr_t new_mapping; - curr_prod = &rxq->sw_rx_ring[idx]; - *curr_prod = *curr_cons; + rxq->sw_rx_ring[idx] = curr_cons; - new_mapping = rte_mbuf_data_iova_default(curr_prod->mbuf) + - curr_prod->page_offset; + new_mapping = rte_mbuf_data_iova_default(curr_cons); rx_bd_prod->addr.hi = rte_cpu_to_le_32(U64_HI(new_mapping)); rx_bd_prod->addr.lo = rte_cpu_to_le_32(U64_LO(new_mapping)); @@ -1336,10 +1335,10 @@ static inline void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, struct qede_dev *qdev, uint8_t count) { - struct qede_rx_entry *curr_cons; + struct rte_mbuf *curr_cons; for (; count > 0; count--) { - 
curr_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS(rxq)]; + curr_cons = rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS(rxq)]; qede_reuse_page(qdev, rxq, curr_cons); qede_rx_bd_ring_consume(rxq); } @@ -1361,7 +1360,7 @@ qede_rx_process_tpa_cmn_cont_end_cqe(__rte_unused struct qede_dev *qdev, if (rte_le_to_cpu_16(len)) { tpa_info = &rxq->tpa_info[agg_index]; cons_idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq); - curr_frag = rxq->sw_rx_ring[cons_idx].mbuf; + curr_frag = rxq->sw_rx_ring[cons_idx]; assert(curr_frag); curr_frag->nb_segs = 1; curr_frag->pkt_len = rte_le_to_cpu_16(len); @@ -1493,7 +1492,7 @@ qede_process_sg_pkts(void *p_rxq, struct rte_mbuf *rx_mb, return -EINVAL; } sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS(rxq); - seg2 = rxq->sw_rx_ring[sw_rx_index].mbuf; + seg2 = rxq->sw_rx_ring[sw_rx_index]; qede_rx_bd_ring_consume(rxq); pkt_len -= cur_size; seg2->data_len = cur_size; @@ -1612,7 +1611,7 @@ qede_recv_pkts_regular(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) /* Get the data from the SW ring */ sw_rx_index = rxq->sw_rx_cons & num_rx_bds; - rx_mb = rxq->sw_rx_ring[sw_rx_index].mbuf; + rx_mb = rxq->sw_rx_ring[sw_rx_index]; assert(rx_mb != NULL); parse_flag = rte_le_to_cpu_16(fp_cqe->pars_flags.flags); @@ -1711,7 +1710,7 @@ qede_recv_pkts_regular(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) /* Prefetch next mbuf while processing current one. 
*/ preload_idx = rxq->sw_rx_cons & num_rx_bds; - rte_prefetch0(rxq->sw_rx_ring[preload_idx].mbuf); + rte_prefetch0(rxq->sw_rx_ring[preload_idx]); /* Update rest of the MBUF fields */ rx_mb->data_off = offset + RTE_PKTMBUF_HEADROOM; @@ -1869,7 +1868,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) /* Get the data from the SW ring */ sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS(rxq); - rx_mb = rxq->sw_rx_ring[sw_rx_index].mbuf; + rx_mb = rxq->sw_rx_ring[sw_rx_index]; assert(rx_mb != NULL); /* Handle regular CQE or TPA start CQE */ @@ -2000,7 +1999,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) /* Prefetch next mbuf while processing current one. */ preload_idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq); - rte_prefetch0(rxq->sw_rx_ring[preload_idx].mbuf); + rte_prefetch0(rxq->sw_rx_ring[preload_idx]); /* Update rest of the MBUF fields */ rx_mb->data_off = offset + RTE_PKTMBUF_HEADROOM; diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h index 335016847..c9334448c 100644 --- a/drivers/net/qede/qede_rxtx.h +++ b/drivers/net/qede/qede_rxtx.h @@ -159,15 +159,6 @@ #define QEDE_TX_OFFLOAD_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ QEDE_TX_OFFLOAD_MASK) -/* - * RX BD descriptor ring - */ -struct qede_rx_entry { - struct rte_mbuf *mbuf; - uint32_t page_offset; - /* allows expansion .. */ -}; - /* TPA related structures */ struct qede_agg_info { struct rte_mbuf *tpa_head; /* Pointer to first TPA segment */ @@ -185,7 +176,7 @@ struct qede_rx_queue { struct ecore_chain rx_comp_ring; uint16_t *hw_cons_ptr; void OSAL_IOMEM *hw_rxq_prod_addr; - struct qede_rx_entry *sw_rx_ring; + struct rte_mbuf **sw_rx_ring; struct ecore_sb_info *sb_info; uint16_t sw_rx_cons; uint16_t sw_rx_prod; -- 2.29.2
On Fri, Mar 5, 2021 at 6:44 PM Balazs Nemeth <bnemeth@redhat.com> wrote: > > This patch set optimizes qede_{rx,tx}_entry and introduces > rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance > improvement depends on the use-case; in a physical-virtual-physical test > on a ThunderX2 99xx system with two SMT threads used in ovs, > and two cores used in a vm, an improvement of around 2.55% is observed > due to this patch set. Cc: rmody@marvell.com Cc: shshaikh@marvell.com Hi Rasesh, Shahed Could you review this series from Balazs? > > Balazs Nemeth (8): > net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf > net/qede: avoid repeatedly calling ecore_chain_get_cons_idx > net/qede: assume txq->sw_tx_ring[idx] is never null in > qede_free_tx_pkt > net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk > net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free > net/qede: prefetch txq->hw_cons_ptr > net/qede: prefetch next packet to free > net/qede: remove page_offset from struct qede_rx_entry and simplify > > drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++---------------- > drivers/net/qede/qede_rxtx.h | 21 +---- > 2 files changed, 81 insertions(+), 88 deletions(-) > > -- > 2.29.2 >
> On Fri, Mar 5, 2021 at 6:44 PM Balazs Nemeth <bnemeth@redhat.com> wrote: >> >> This patch set optimizes qede_{rx,tx}_entry and introduces >> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance >> improvement depends on the use-case; in a physical-virtual-physical test >> on a ThunderX2 99xx system with two SMT threads used in ovs, >> and two cores used in a vm, an improvement of around 2.55% is observed >> due to this patch set. Hi Balazs, thanks for posting, quickly reviewed it, looks reasonable. Doing a detailed review. > Cc: rmody@marvell.com > Cc: shshaikh@marvell.com> > Hi Rasesh, Shahed > Could you review this series from Balazs? Shahed is not within marvell, adding Devendra from our DPKD support team. Igor
On Wed, Mar 10, 2021 at 12:13 PM Igor Russkikh <irusskikh@marvell.com> wrote: > > > > > On Fri, Mar 5, 2021 at 6:44 PM Balazs Nemeth <bnemeth@redhat.com> wrote: > >> > >> This patch set optimizes qede_{rx,tx}_entry and introduces > >> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance > >> improvement depends on the use-case; in a physical-virtual-physical test > >> on a ThunderX2 99xx system with two SMT threads used in ovs, > >> and two cores used in a vm, an improvement of around 2.55% is observed > >> due to this patch set. > > Hi Balazs, thanks for posting, quickly reviewed it, looks reasonable. > Doing a detailed review. > > > Cc: rmody@marvell.com > > Cc: shshaikh@marvell.com> > > Hi Rasesh, Shahed > > Could you review this series from Balazs? > > Shahed is not within marvell, adding Devendra from our DPKD support team. Could you update the MAINTAINERS to reflect that status? > > Igor
>>> Cc: rmody@marvell.com
>>> Cc: shshaikh@marvell.com>
>>> Hi Rasesh, Shahed
>>> Could you review this series from Balazs?
>>
>> Shahed is not within marvell, adding Devendra from our DPDK support team.
>
> Could you update the MAINTAINERS to reflect that status?
Sure, done, just sent!
Igor
On Wed, Mar 10, 2021 at 12:13 PM Igor Russkikh <irusskikh@marvell.com> wrote: > > > > > On Fri, Mar 5, 2021 at 6:44 PM Balazs Nemeth <bnemeth@redhat.com> wrote: > >> > >> This patch set optimizes qede_{rx,tx}_entry and introduces > >> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance > >> improvement depends on the use-case; in a physical-virtual-physical test > >> on a ThunderX2 99xx system with two SMT threads used in ovs, > >> and two cores used in a vm, an improvement of around 2.55% is observed > >> due to this patch set. > > Hi Balazs, thanks for posting, quickly reviewed it, looks reasonable. Waiting for the review/ack to merge this series > Doing a detailed review. > > > Cc: rmody@marvell.com > > Cc: shshaikh@marvell.com> > > Hi Rasesh, Shahed > > Could you review this series from Balazs? > > Shahed is not within marvell, adding Devendra from our DPKD support team. > > Igor
On 3/5/2021 2:13 PM, Balazs Nemeth wrote:
> External Email
>
> ----------------------------------------------------------------------
> This patch set optimizes qede_{rx,tx}_entry and introduces
> rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance
> improvement depends on the use-case; in a physical-virtual-physical test
> on a ThunderX2 99xx system with two SMT threads used in ovs,
> and two cores used in a vm, an improvement of around 2.55% is observed
> due to this patch set.
>
> Balazs Nemeth (8):
> net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf
> net/qede: avoid repeatedly calling ecore_chain_get_cons_idx
> net/qede: assume txq->sw_tx_ring[idx] is never null in
> qede_free_tx_pkt
> net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk
> net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free
> net/qede: prefetch txq->hw_cons_ptr
> net/qede: prefetch next packet to free
> net/qede: remove page_offset from struct qede_rx_entry and simplify
>
> drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++----------------
> drivers/net/qede/qede_rxtx.h | 21 +----
> 2 files changed, 81 insertions(+), 88 deletions(-)
Series reviewed, for the series
Acked-by: Igor Russkikh <irusskikh@marvell.com>
One checkpatch warning I see in the patchwork output, probably worth fixing:
ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)"
#120: FILE: drivers/net/qede/qede_rxtx.c:56:
+ ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq->sw_rx_ring[idx], count);
Thanks
Igor
On Mon, Mar 22, 2021 at 10:38 PM Igor Russkikh <irusskikh@marvell.com> wrote: > > > > On 3/5/2021 2:13 PM, Balazs Nemeth wrote: > > External Email > > > > ---------------------------------------------------------------------- > > This patch set optimizes qede_{rx,tx}_entry and introduces > > rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance > > improvement depends on the use-case; in a physical-virtual-physical test > > on a ThunderX2 99xx system with two SMT threads used in ovs, > > and two cores used in a vm, an improvement of around 2.55% is observed > > due to this patch set. > > > > Balazs Nemeth (8): > > net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf > > net/qede: avoid repeatedly calling ecore_chain_get_cons_idx > > net/qede: assume txq->sw_tx_ring[idx] is never null in > > qede_free_tx_pkt > > net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk > > net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free > > net/qede: prefetch txq->hw_cons_ptr > > net/qede: prefetch next packet to free > > net/qede: remove page_offset from struct qede_rx_entry and simplify > > > > drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++---------------- > > drivers/net/qede/qede_rxtx.h | 21 +---- > > 2 files changed, 81 insertions(+), 88 deletions(-) > > Series reviewed, for the series > > Acked-by: Igor Russkikh <irusskikh@marvell.com> > > One checkpatch warn I see in patchwork output, probably worth fixing: > > ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)" > #120: FILE: drivers/net/qede/qede_rxtx.c:56: > + ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq->sw_rx_ring[idx], count); Hi @Balazs Nemeth Please fix the following checkpatches.sh and check-git-log.sh issues and add Igor reviewed by in next version. I will merge the next version with the fixes. 
Updated this series status in the patchwork as "Changes requested" Wrong headline format: net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf net/qede: avoid repeatedly calling ecore_chain_get_cons_idx net/qede: assume txq->sw_tx_ring[idx] is never null in qede_free_tx_pkt net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free net/qede: prefetch txq->hw_cons_ptr net/qede: remove page_offset from struct qede_rx_entry and simplify Headline too long: net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf net/qede: assume txq->sw_tx_ring[idx] is never null in qede_free_tx_pkt net/qede: inline qede_free_tx_pkt to prepare for rte_pktmbuf_free_bulk net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free net/qede: remove page_offset from struct qede_rx_entry and simplify Invalid patch(es) found - checked 8 patches check-git-log failed ### net/qede: remove flags from qede_tx_entry and simplify to rte_mbuf WARNING:LONG_LINE: line length of 89 exceeds 80 columns #37: FILE: drivers/net/qede/qede_rxtx.c:429: + (sizeof(txq->sw_tx_ring) * txq->nb_tx_desc), total: 0 errors, 1 warnings, 0 checks, 87 lines checked ### net/qede: avoid repeatedly calling ecore_chain_get_cons_idx WARNING:BRACES: braces {} are not necessary for single statement blocks #82: FILE: drivers/net/qede/qede_rxtx.c:934: + while (remaining) { + remaining -= qede_free_tx_pkt(txq); + } total: 0 errors, 1 warnings, 0 checks, 67 lines checked ### net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free WARNING:LONG_LINE: line length of 89 exceeds 80 columns #48: FILE: drivers/net/qede/qede_rxtx.c:930: + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], mask - first_idx + 1); WARNING:LONG_LINE: line length of 84 exceeds 80 columns #51: FILE: drivers/net/qede/qede_rxtx.c:933: + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], idx - first_idx); total: 0 errors, 2 warnings, 0 checks, 32 lines 
checked ### net/qede: prefetch next packet to free WARNING:REPEATED_WORD: Possible repeated word: 'next' #6: While handling the current mbuf, pull the next next mbuf into the cache. WARNING:BLOCK_COMMENT_STYLE: Block comments use * on subsequent lines #21: FILE: drivers/net/qede/qede_rxtx.c:919: + /* Prefetch the next mbuf. Note that at least the last 4 mbufs + that are prefetched will not be used in the current call. */ WARNING:BLOCK_COMMENT_STYLE: Block comments use a trailing */ on a separate line #21: FILE: drivers/net/qede/qede_rxtx.c:919: + that are prefetched will not be used in the current call. */ total: 0 errors, 3 warnings, 0 checks, 11 lines checked ### net/qede: remove page_offset from struct qede_rx_entry and simplify WARNING:LONG_LINE: line length of 87 exceeds 80 columns #49: FILE: drivers/net/qede/qede_rxtx.c:56: + ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq->sw_rx_ring[idx], count); ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)" #49: FILE: drivers/net/qede/qede_rxtx.c:56: + ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq->sw_rx_ring[idx], count); total: 1 errors, 1 warnings, 0 checks, 174 lines checked > > Thanks > Igor
On Wed, 2021-03-24 at 14:48 +0530, Jerin Jacob wrote: > On Mon, Mar 22, 2021 at 10:38 PM Igor Russkikh > <irusskikh@marvell.com> wrote: > > > > > > > > On 3/5/2021 2:13 PM, Balazs Nemeth wrote: > > > External Email > > > > > > ----------------------------------------------------------------- > > > ----- > > > This patch set optimizes qede_{rx,tx}_entry and introduces > > > rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall > > > performance > > > improvement depends on the use-case; in a physical-virtual- > > > physical test > > > on a ThunderX2 99xx system with two SMT threads used in ovs, > > > and two cores used in a vm, an improvement of around 2.55% is > > > observed > > > due to this patch set. > > > > > > Balazs Nemeth (8): > > > net/qede: remove flags from qede_tx_entry and simplify to > > > rte_mbuf > > > net/qede: avoid repeatedly calling ecore_chain_get_cons_idx > > > net/qede: assume txq->sw_tx_ring[idx] is never null in > > > qede_free_tx_pkt > > > net/qede: inline qede_free_tx_pkt to prepare for > > > rte_pktmbuf_free_bulk > > > net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free > > > net/qede: prefetch txq->hw_cons_ptr > > > net/qede: prefetch next packet to free > > > net/qede: remove page_offset from struct qede_rx_entry and > > > simplify > > > > > > drivers/net/qede/qede_rxtx.c | 148 +++++++++++++++++++---------- > > > ------ > > > drivers/net/qede/qede_rxtx.h | 21 +---- > > > 2 files changed, 81 insertions(+), 88 deletions(-) > > > > Series reviewed, for the series > > > > Acked-by: Igor Russkikh <irusskikh@marvell.com> > > > > One checkpatch warn I see in patchwork output, probably worth > > fixing: > > > > ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)" > > #120: FILE: drivers/net/qede/qede_rxtx.c:56: > > + ret = rte_mempool_get_bulk(rxq->mb_pool, (void**)&rxq- > > >sw_rx_ring[idx], count); > > > Hi @Balazs Nemeth > > Please fix the following checkpatc.shh and check-git-log.sh issues > and > add Igor reviewed by in 
next version. > I will merge the next version with the fixes. Updaed this series > status in the patchwork as "Changes requested" > Ok I will provide a new version next week. Thanks for the feedback! > > Wrong headline format: > net/qede: remove flags from qede_tx_entry and simplify to > rte_mbuf > net/qede: avoid repeatedly calling ecore_chain_get_cons_idx > net/qede: assume txq->sw_tx_ring[idx] is never null in > qede_free_tx_pkt > net/qede: inline qede_free_tx_pkt to prepare for > rte_pktmbuf_free_bulk > net/qede: use rte_pktmbuf_free_bulk instead of > rte_pktmbuf_free > net/qede: prefetch txq->hw_cons_ptr > net/qede: remove page_offset from struct qede_rx_entry and > simplify > Headline too long: > net/qede: remove flags from qede_tx_entry and simplify to > rte_mbuf > net/qede: assume txq->sw_tx_ring[idx] is never null in > qede_free_tx_pkt > net/qede: inline qede_free_tx_pkt to prepare for > rte_pktmbuf_free_bulk > net/qede: use rte_pktmbuf_free_bulk instead of > rte_pktmbuf_free > net/qede: remove page_offset from struct qede_rx_entry and > simplify > > Invalid patch(es) found - checked 8 patches > check-git-log failed > > > > ### net/qede: remove flags from qede_tx_entry and simplify to > rte_mbuf > > WARNING:LONG_LINE: line length of 89 exceeds 80 columns > #37: FILE: drivers/net/qede/qede_rxtx.c:429: > + (sizeof(txq->sw_tx_ring) > * txq->nb_tx_desc), > > total: 0 errors, 1 warnings, 0 checks, 87 lines checked > > ### net/qede: avoid repeatedly calling ecore_chain_get_cons_idx > > WARNING:BRACES: braces {} are not necessary for single statement > blocks > #82: FILE: drivers/net/qede/qede_rxtx.c:934: > + while (remaining) { > + remaining -= qede_free_tx_pkt(txq); > + } > > total: 0 errors, 1 warnings, 0 checks, 67 lines checked > > ### net/qede: use rte_pktmbuf_free_bulk instead of rte_pktmbuf_free > > WARNING:LONG_LINE: line length of 89 exceeds 80 columns > #48: FILE: drivers/net/qede/qede_rxtx.c:930: > + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], 
> mask - first_idx + 1); > > WARNING:LONG_LINE: line length of 84 exceeds 80 columns > #51: FILE: drivers/net/qede/qede_rxtx.c:933: > + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], > idx > - first_idx); > > total: 0 errors, 2 warnings, 0 checks, 32 lines checked > > ### net/qede: prefetch next packet to free > > WARNING:REPEATED_WORD: Possible repeated word: 'next' > #6: > While handling the current mbuf, pull the next next mbuf into the > cache. > > WARNING:BLOCK_COMMENT_STYLE: Block comments use * on subsequent lines > #21: FILE: drivers/net/qede/qede_rxtx.c:919: > + /* Prefetch the next mbuf. Note that at least the > last 4 mbufs > + that are prefetched will not be used in the > current call. */ > > WARNING:BLOCK_COMMENT_STYLE: Block comments use a trailing */ on a > separate line > #21: FILE: drivers/net/qede/qede_rxtx.c:919: > + that are prefetched will not be used in the > current call. */ > > total: 0 errors, 3 warnings, 0 checks, 11 lines checked > > ### net/qede: remove page_offset from struct qede_rx_entry and > simplify > > WARNING:LONG_LINE: line length of 87 exceeds 80 columns > #49: FILE: drivers/net/qede/qede_rxtx.c:56: > + ret = rte_mempool_get_bulk(rxq->mb_pool, > (void**)&rxq->sw_rx_ring[idx], count); > > ERROR:POINTER_LOCATION: "(foo**)" should be "(foo **)" > #49: FILE: drivers/net/qede/qede_rxtx.c:56: > + ret = rte_mempool_get_bulk(rxq->mb_pool, > (void**)&rxq->sw_rx_ring[idx], count); > > total: 1 errors, 1 warnings, 0 checks, 174 lines checked > > > > > > > Thanks > > Igor > Regards, Balazs