This patch set optimizes qede_{rx,tx}_entry and introduces rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance improvement depends on the use-case; in a physical-virtual-physical test on a ThunderX2 99xx system with two SMT threads used in ovs, and two cores used in a vm, an improvement of around 2.55% is observed due to this patch set. Changes in v2: - Fix checkpatches.sh warnings - Fix check-git-log.sh warnings - Add Reviewed-by: Igor Russkikh <irusskikh@marvell.com> Balazs Nemeth (7): net/qede: remove flags from Tx entry net/qede: get consumer index once net/qede: assume mbuf to free is never null net/qede: free packets in bulk instead of one by one net/qede: prefetch hardware consumer net/qede: prefetch next packet to free net/qede: remove unnecessary field in Rx entry and simplify drivers/net/qede/qede_rxtx.c | 154 +++++++++++++++++++---------------- drivers/net/qede/qede_rxtx.h | 21 +---- 2 files changed, 87 insertions(+), 88 deletions(-) -- 2.30.2
Each sw_tx_ring entry was of type struct qede_tx_entry: struct qede_tx_entry { struct rte_mbuf *mbuf; uint8_t flags; }; Leaving the unused flags member here has a few performance implications. First, each qede_tx_entry takes up more memory which has caching implications as less entries fit in a cache line while multiple entries are frequently handled in batches. Second, an array of qede_tx_entry entries is incompatible with existing APIs that expect an array of rte_mbuf pointers. Consequently, an extra array need to be allocated before calling such APIs and each entry needs to be copied over. This patch omits the flags field and replaces the qede_tx_entry entry by a simple rte_mbuf pointer. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> Reviewed-by: Igor Russkikh <irusskikh@marvell.com> --- drivers/net/qede/qede_rxtx.c | 22 ++++++++++++---------- drivers/net/qede/qede_rxtx.h | 10 +--------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 87f52d725..d2f77ed2e 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -393,6 +393,7 @@ qede_alloc_tx_queue_mem(struct rte_eth_dev *dev, struct ecore_dev *edev = &qdev->edev; struct qede_tx_queue *txq; int rc; + size_t sw_tx_ring_size; txq = rte_zmalloc_socket("qede_tx_queue", sizeof(struct qede_tx_queue), RTE_CACHE_LINE_SIZE, socket_id); @@ -425,9 +426,9 @@ qede_alloc_tx_queue_mem(struct rte_eth_dev *dev, } /* Allocate software ring */ + sw_tx_ring_size = sizeof(txq->sw_tx_ring) * txq->nb_tx_desc; txq->sw_tx_ring = rte_zmalloc_socket("txq->sw_tx_ring", - (sizeof(struct qede_tx_entry) * - txq->nb_tx_desc), + sw_tx_ring_size, RTE_CACHE_LINE_SIZE, socket_id); if (!txq->sw_tx_ring) { @@ -523,9 +524,9 @@ static void qede_tx_queue_release_mbufs(struct qede_tx_queue *txq) if (txq->sw_tx_ring) { for (i = 0; i < txq->nb_tx_desc; i++) { - if (txq->sw_tx_ring[i].mbuf) { - rte_pktmbuf_free(txq->sw_tx_ring[i].mbuf); - txq->sw_tx_ring[i].mbuf = NULL; + if (txq->sw_tx_ring[i]) { + rte_pktmbuf_free(txq->sw_tx_ring[i]); + txq->sw_tx_ring[i] = NULL; } } } @@ -889,10 +890,11 @@ qede_free_tx_pkt(struct qede_tx_queue *txq) uint16_t idx; idx = TX_CONS(txq); - mbuf = txq->sw_tx_ring[idx].mbuf; + mbuf = txq->sw_tx_ring[idx]; if (mbuf) { nb_segs = mbuf->nb_segs; PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); + while (nb_segs) { /* It's like consuming rxbuf in recv() */ ecore_chain_consume(&txq->tx_pbl); @@ -900,7 +902,7 @@ qede_free_tx_pkt(struct qede_tx_queue *txq) nb_segs--; } rte_pktmbuf_free(mbuf); - txq->sw_tx_ring[idx].mbuf = NULL; + txq->sw_tx_ring[idx] = NULL; txq->sw_tx_cons++; PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } else { @@ -2243,7 +2245,7 @@ qede_xmit_pkts_regular(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) struct eth_tx_3rd_bd *bd3; struct rte_mbuf *m_seg = NULL; struct rte_mbuf *mbuf; - struct qede_tx_entry *sw_tx_ring; + struct rte_mbuf **sw_tx_ring; uint16_t nb_tx_pkts; uint16_t bd_prod; uint16_t idx; @@ -2306,7 +2308,7 @@ qede_xmit_pkts_regular(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Fill the entry in the SW ring and the BDs in the FW ring */ idx = TX_PROD(txq); - sw_tx_ring[idx].mbuf = mbuf; + sw_tx_ring[idx] = mbuf; /* BD1 */ bd1 = (struct eth_tx_1st_bd *)ecore_chain_produce(&txq->tx_pbl); @@ -2612,7 +2614,7 @@ qede_xmit_pkts(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) /* Fill the entry in the SW ring and the BDs in the FW ring */ idx = TX_PROD(txq); - txq->sw_tx_ring[idx].mbuf = mbuf; + txq->sw_tx_ring[idx] = mbuf; /* BD1 */ bd1 = (struct eth_tx_1st_bd *)ecore_chain_produce(&txq->tx_pbl); diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h index fcb564a1b..335016847 100644 --- a/drivers/net/qede/qede_rxtx.h +++ b/drivers/net/qede/qede_rxtx.h @@ -203,14 +203,6 @@ struct qede_rx_queue { void *handle; }; -/* - * TX BD descriptor ring - */ -struct qede_tx_entry { - struct rte_mbuf *mbuf; - uint8_t flags; -}; - union db_prod { struct eth_db_data data; uint32_t raw; @@ -220,7 +212,7 @@ struct qede_tx_queue { /* Always keep qdev as first member */ struct qede_dev *qdev; struct ecore_chain tx_pbl; - struct qede_tx_entry *sw_tx_ring; + struct rte_mbuf **sw_tx_ring; uint16_t nb_tx_desc; uint16_t nb_tx_avail; uint16_t tx_free_thresh; -- 2.30.2
Calling ecore_chain_get_cons_idx repeatedly is slower than calling it once and using the result for the remainder of qede_process_tx_compl. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> Reviewed-by: Igor Russkikh <irusskikh@marvell.com> --- drivers/net/qede/qede_rxtx.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index d2f77ed2e..0002e2c1e 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -882,12 +882,13 @@ qede_tx_queue_start(struct rte_eth_dev *eth_dev, uint16_t tx_queue_id) return rc; } -static inline void +static inline uint16_t qede_free_tx_pkt(struct qede_tx_queue *txq) { struct rte_mbuf *mbuf; uint16_t nb_segs; uint16_t idx; + uint16_t ret; idx = TX_CONS(txq); mbuf = txq->sw_tx_ring[idx]; @@ -895,20 +896,22 @@ qede_free_tx_pkt(struct qede_tx_queue *txq) nb_segs = mbuf->nb_segs; PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); + ret = nb_segs; while (nb_segs) { /* It's like consuming rxbuf in recv() */ ecore_chain_consume(&txq->tx_pbl); - txq->nb_tx_avail++; nb_segs--; } + rte_pktmbuf_free(mbuf); txq->sw_tx_ring[idx] = NULL; txq->sw_tx_cons++; PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } else { ecore_chain_consume(&txq->tx_pbl); - txq->nb_tx_avail++; + ret = 1; } + return ret; } static inline void @@ -916,19 +919,22 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, struct qede_tx_queue *txq) { uint16_t hw_bd_cons; -#ifdef RTE_LIBRTE_QEDE_DEBUG_TX uint16_t sw_tx_cons; -#endif + uint16_t remaining; rte_compiler_barrier(); + sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); hw_bd_cons = rte_le_to_cpu_16(*txq->hw_cons_ptr); #ifdef RTE_LIBRTE_QEDE_DEBUG_TX - sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); PMD_TX_LOG(DEBUG, txq, "Tx Completions = %u\n", abs(hw_bd_cons - sw_tx_cons)); #endif - while (hw_bd_cons != ecore_chain_get_cons_idx(&txq->tx_pbl)) - qede_free_tx_pkt(txq); + + remaining = hw_bd_cons - sw_tx_cons; + txq->nb_tx_avail += remaining; + + while (remaining) + remaining -= qede_free_tx_pkt(txq); } static int qede_drain_txq(struct qede_dev *qdev, -- 2.30.2
The ring txq->sw_tx_ring is managed with txq->sw_tx_cons. As long as txq->sw_tx_cons is correct, there is no need to check if txq->sw_tx_ring[idx] is null explicitly. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> Reviewed-by: Igor Russkikh <irusskikh@marvell.com> --- drivers/net/qede/qede_rxtx.c | 59 +++++++++++++++--------------------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 0002e2c1e..9294f79eb 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -882,38 +882,6 @@ qede_tx_queue_start(struct rte_eth_dev *eth_dev, uint16_t tx_queue_id) return rc; } -static inline uint16_t -qede_free_tx_pkt(struct qede_tx_queue *txq) -{ - struct rte_mbuf *mbuf; - uint16_t nb_segs; - uint16_t idx; - uint16_t ret; - - idx = TX_CONS(txq); - mbuf = txq->sw_tx_ring[idx]; - if (mbuf) { - nb_segs = mbuf->nb_segs; - PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); - - ret = nb_segs; - while (nb_segs) { - /* It's like consuming rxbuf in recv() */ - ecore_chain_consume(&txq->tx_pbl); - nb_segs--; - } - - rte_pktmbuf_free(mbuf); - txq->sw_tx_ring[idx] = NULL; - txq->sw_tx_cons++; - PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); - } else { - ecore_chain_consume(&txq->tx_pbl); - ret = 1; - } - return ret; -} - static inline void qede_process_tx_compl(__rte_unused struct ecore_dev *edev, struct qede_tx_queue *txq) @@ -921,6 +889,10 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, uint16_t hw_bd_cons; uint16_t sw_tx_cons; uint16_t remaining; + uint16_t mask; + struct rte_mbuf *mbuf; + uint16_t nb_segs; + uint16_t idx; rte_compiler_barrier(); sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); @@ -930,11 +902,30 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, abs(hw_bd_cons - sw_tx_cons)); #endif + mask = NUM_TX_BDS(txq); + idx = txq->sw_tx_cons & mask; + remaining = hw_bd_cons - sw_tx_cons; txq->nb_tx_avail += remaining; - while (remaining) - remaining -= qede_free_tx_pkt(txq); + while (remaining) { + mbuf = txq->sw_tx_ring[idx]; + RTE_ASSERT(mbuf); + nb_segs = mbuf->nb_segs; + remaining -= nb_segs; + + PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); + + while (nb_segs) { + ecore_chain_consume(&txq->tx_pbl); + nb_segs--; + } + + rte_pktmbuf_free(mbuf); + idx = (idx + 1) & mask; + PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); + } + txq->sw_tx_cons = idx; } static int qede_drain_txq(struct qede_dev *qdev, -- 2.30.2
rte_pktmbuf_free_bulk calls rte_mempool_put_bulk with the number of pending packets to return to the mempool. In contrast, rte_pktmbuf_free calls rte_mempool_put that calls rte_mempool_put_bulk with one object. An important performance related downside of adding one packet at a time to the mempool is that on each call, the per-core cache pointer needs to be read from tls while a single rte_mempool_put_bulk only reads from the tls once. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> Reviewed-by: Igor Russkikh <irusskikh@marvell.com> --- drivers/net/qede/qede_rxtx.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 9294f79eb..f439ee056 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -893,6 +893,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, struct rte_mbuf *mbuf; uint16_t nb_segs; uint16_t idx; + uint16_t first_idx; rte_compiler_barrier(); sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); @@ -907,6 +908,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, remaining = hw_bd_cons - sw_tx_cons; txq->nb_tx_avail += remaining; + first_idx = idx; while (remaining) { mbuf = txq->sw_tx_ring[idx]; @@ -921,11 +923,19 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, nb_segs--; } - rte_pktmbuf_free(mbuf); idx = (idx + 1) & mask; PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } txq->sw_tx_cons = idx; + + if (first_idx > idx) { + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], + mask - first_idx + 1); + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[0], idx); + } else { + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], + idx - first_idx); + } } static int qede_drain_txq(struct qede_dev *qdev, -- 2.30.2
Ensure that, while ecore_chain_get_cons_idx is running, txq->hw_cons_ptr is prefetched. This shows a slight performance improvement. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> Reviewed-by: Igor Russkikh <irusskikh@marvell.com> --- drivers/net/qede/qede_rxtx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index f439ee056..4f58abfbf 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -896,6 +896,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, uint16_t first_idx; rte_compiler_barrier(); + rte_prefetch0(txq->hw_cons_ptr); sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); hw_bd_cons = rte_le_to_cpu_16(*txq->hw_cons_ptr); #ifdef RTE_LIBRTE_QEDE_DEBUG_TX -- 2.30.2
While handling the current mbuf, pull the next mbuf into the cache. Note that the last four mbufs pulled into the cache are not handled, but that doesn't matter. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> Reviewed-by: Igor Russkikh <irusskikh@marvell.com> --- drivers/net/qede/qede_rxtx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 4f58abfbf..ed3617160 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -917,6 +917,12 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, nb_segs = mbuf->nb_segs; remaining -= nb_segs; + /* Prefetch the next mbuf. Note that at least the last 4 mbufs + * that are prefetched will not be used in the current call. + */ + rte_mbuf_prefetch_part1(txq->sw_tx_ring[(idx + 4) & mask]); + rte_mbuf_prefetch_part2(txq->sw_tx_ring[(idx + 4) & mask]); + PMD_TX_LOG(DEBUG, txq, "nb_segs to free %u\n", nb_segs); while (nb_segs) { -- 2.30.2
The member page_offset is always zero. Having this in the qede_rx_entry makes it larger than it needs to be and this has cache performance implications so remove that field. In addition, since qede_rx_entry only has an rte_mbuf*, remove the definition of qede_rx_entry. Signed-off-by: Balazs Nemeth <bnemeth@redhat.com> Reviewed-by: Igor Russkikh <irusskikh@marvell.com> --- drivers/net/qede/qede_rxtx.c | 56 ++++++++++++++++++------------------ drivers/net/qede/qede_rxtx.h | 11 +------ 2 files changed, 29 insertions(+), 38 deletions(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index ed3617160..298f4e3e4 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -24,8 +24,7 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq) rte_mempool_in_use_count(rxq->mb_pool)); return -ENOMEM; } - rxq->sw_rx_ring[idx].mbuf = new_mb; - rxq->sw_rx_ring[idx].page_offset = 0; + rxq->sw_rx_ring[idx] = new_mb; mapping = rte_mbuf_data_iova_default(new_mb); /* Advance PROD and get BD pointer */ rx_bd = (struct eth_rx_bd *)ecore_chain_produce(&rxq->rx_bd_ring); @@ -39,17 +38,24 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq) static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count) { - void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned; struct rte_mbuf *mbuf = NULL; struct eth_rx_bd *rx_bd; dma_addr_t mapping; int i, ret = 0; uint16_t idx; + uint16_t mask = NUM_RX_BDS(rxq); if (count > QEDE_MAX_BULK_ALLOC_COUNT) count = QEDE_MAX_BULK_ALLOC_COUNT; - ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count); + idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq); + + if (count > mask - idx + 1) + count = mask - idx + 1; + + ret = rte_mempool_get_bulk(rxq->mb_pool, (void **)&rxq->sw_rx_ring[idx], + count); + if (unlikely(ret)) { PMD_RX_LOG(ERR, rxq, "Failed to allocate %d rx buffers " @@ -63,20 +69,17 @@ static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count) } for (i = 0; i < count; i++) { - mbuf = obj_p[i]; - if (likely(i < count - 1)) - rte_prefetch0(obj_p[i + 1]); + rte_prefetch0(rxq->sw_rx_ring[(idx + 1) & NUM_RX_BDS(rxq)]); + mbuf = rxq->sw_rx_ring[idx & NUM_RX_BDS(rxq)]; - idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq); - rxq->sw_rx_ring[idx].mbuf = mbuf; - rxq->sw_rx_ring[idx].page_offset = 0; mapping = rte_mbuf_data_iova_default(mbuf); rx_bd = (struct eth_rx_bd *) ecore_chain_produce(&rxq->rx_bd_ring); rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping)); rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping)); - rxq->sw_rx_prod++; + idx++; } + rxq->sw_rx_prod = idx; return 0; } @@ -309,9 +312,9 @@ static void qede_rx_queue_release_mbufs(struct qede_rx_queue *rxq) if (rxq->sw_rx_ring) { for (i = 0; i < rxq->nb_rx_desc; i++) { - if (rxq->sw_rx_ring[i].mbuf) { - rte_pktmbuf_free(rxq->sw_rx_ring[i].mbuf); - rxq->sw_rx_ring[i].mbuf = NULL; + if (rxq->sw_rx_ring[i]) { + rte_pktmbuf_free(rxq->sw_rx_ring[i]); + rxq->sw_rx_ring[i] = NULL; } } } @@ -1318,18 +1321,15 @@ static inline void qede_rx_bd_ring_consume(struct qede_rx_queue *rxq) static inline void qede_reuse_page(__rte_unused struct qede_dev *qdev, - struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons) + struct qede_rx_queue *rxq, struct rte_mbuf *curr_cons) { struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring); uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq); - struct qede_rx_entry *curr_prod; dma_addr_t new_mapping; - curr_prod = &rxq->sw_rx_ring[idx]; - *curr_prod = *curr_cons; + rxq->sw_rx_ring[idx] = curr_cons; - new_mapping = rte_mbuf_data_iova_default(curr_prod->mbuf) + - curr_prod->page_offset; + new_mapping = rte_mbuf_data_iova_default(curr_cons); rx_bd_prod->addr.hi = rte_cpu_to_le_32(U64_HI(new_mapping)); rx_bd_prod->addr.lo = rte_cpu_to_le_32(U64_LO(new_mapping)); @@ -1341,10 +1341,10 @@ static inline void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, struct qede_dev *qdev, uint8_t count) { - struct qede_rx_entry *curr_cons; + struct rte_mbuf *curr_cons; for (; count > 0; count--) { - curr_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS(rxq)]; + curr_cons = rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS(rxq)]; qede_reuse_page(qdev, rxq, curr_cons); qede_rx_bd_ring_consume(rxq); } @@ -1366,7 +1366,7 @@ qede_rx_process_tpa_cmn_cont_end_cqe(__rte_unused struct qede_dev *qdev, if (rte_le_to_cpu_16(len)) { tpa_info = &rxq->tpa_info[agg_index]; cons_idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq); - curr_frag = rxq->sw_rx_ring[cons_idx].mbuf; + curr_frag = rxq->sw_rx_ring[cons_idx]; assert(curr_frag); curr_frag->nb_segs = 1; curr_frag->pkt_len = rte_le_to_cpu_16(len); @@ -1498,7 +1498,7 @@ qede_process_sg_pkts(void *p_rxq, struct rte_mbuf *rx_mb, return -EINVAL; } sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS(rxq); - seg2 = rxq->sw_rx_ring[sw_rx_index].mbuf; + seg2 = rxq->sw_rx_ring[sw_rx_index]; qede_rx_bd_ring_consume(rxq); pkt_len -= cur_size; seg2->data_len = cur_size; @@ -1617,7 +1617,7 @@ qede_recv_pkts_regular(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) /* Get the data from the SW ring */ sw_rx_index = rxq->sw_rx_cons & num_rx_bds; - rx_mb = rxq->sw_rx_ring[sw_rx_index].mbuf; + rx_mb = rxq->sw_rx_ring[sw_rx_index]; assert(rx_mb != NULL); parse_flag = rte_le_to_cpu_16(fp_cqe->pars_flags.flags); @@ -1716,7 +1716,7 @@ qede_recv_pkts_regular(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) /* Prefetch next mbuf while processing current one. */ preload_idx = rxq->sw_rx_cons & num_rx_bds; - rte_prefetch0(rxq->sw_rx_ring[preload_idx].mbuf); + rte_prefetch0(rxq->sw_rx_ring[preload_idx]); /* Update rest of the MBUF fields */ rx_mb->data_off = offset + RTE_PKTMBUF_HEADROOM; @@ -1874,7 +1874,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) /* Get the data from the SW ring */ sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS(rxq); - rx_mb = rxq->sw_rx_ring[sw_rx_index].mbuf; + rx_mb = rxq->sw_rx_ring[sw_rx_index]; assert(rx_mb != NULL); /* Handle regular CQE or TPA start CQE */ @@ -2005,7 +2005,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) /* Prefetch next mbuf while processing current one. */ preload_idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq); - rte_prefetch0(rxq->sw_rx_ring[preload_idx].mbuf); + rte_prefetch0(rxq->sw_rx_ring[preload_idx]); /* Update rest of the MBUF fields */ rx_mb->data_off = offset + RTE_PKTMBUF_HEADROOM; diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h index 335016847..c9334448c 100644 --- a/drivers/net/qede/qede_rxtx.h +++ b/drivers/net/qede/qede_rxtx.h @@ -159,15 +159,6 @@ #define QEDE_TX_OFFLOAD_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ QEDE_TX_OFFLOAD_MASK) -/* - * RX BD descriptor ring - */ -struct qede_rx_entry { - struct rte_mbuf *mbuf; - uint32_t page_offset; - /* allows expansion .. */ -}; - /* TPA related structures */ struct qede_agg_info { struct rte_mbuf *tpa_head; /* Pointer to first TPA segment */ @@ -185,7 +176,7 @@ struct qede_rx_queue { struct ecore_chain rx_comp_ring; uint16_t *hw_cons_ptr; void OSAL_IOMEM *hw_rxq_prod_addr; - struct qede_rx_entry *sw_rx_ring; + struct rte_mbuf **sw_rx_ring; struct ecore_sb_info *sb_info; uint16_t sw_rx_cons; uint16_t sw_rx_prod; -- 2.30.2
On Fri, Mar 26, 2021 at 4:32 PM Balazs Nemeth <bnemeth@redhat.com> wrote: > > This patch set optimizes qede_{rx,tx}_entry and introduces > rte_pktmbuf_free_bulk in qede_process_tx_compl. The overall performance > improvement depends on the use-case; in a physical-virtual-physical test > on a ThunderX2 99xx system with two SMT threads used in ovs, > and two cores used in a vm, an improvement of around 2.55% is observed > due to this patch set. > > Changes in v2: > - Fix checkpatches.sh warnings > - Fix check-git-log.sh warnings > - Add Reviewed-by: Igor Russkikh <irusskikh@marvell.com> Series applied to dpdk-next-net-mrvl/for-main. Thanks. > > Balazs Nemeth (7): > net/qede: remove flags from Tx entry > net/qede: get consumer index once > net/qede: assume mbuf to free is never null > net/qede: free packets in bulk instead of one by one > net/qede: prefetch hardware consumer > net/qede: prefetch next packet to free > net/qede: remove unnecessary field in Rx entry and simplify > > drivers/net/qede/qede_rxtx.c | 154 +++++++++++++++++++---------------- > drivers/net/qede/qede_rxtx.h | 21 +---- > 2 files changed, 87 insertions(+), 88 deletions(-) > > -- > 2.30.2 >