From: "Min Hu (Connor)"
To: <dev@dpdk.org>
Subject: [PATCH 2/2] net/hns3: optimize Tx performance
Date: Thu, 11 Nov 2021 21:38:59 +0800
Message-ID: <20211111133859.13705-3-humin29@huawei.com>
X-Mailer: git-send-email 2.33.0
In-Reply-To: <20211111133859.13705-1-humin29@huawei.com>
References: <20211111133859.13705-1-humin29@huawei.com>

From: Chengwen Feng

The PMD should check whether a descriptor has been completed by the
hardware before freeing the corresponding mbuf. Currently, the common
xmit algorithm frees mbufs every time it is invoked; because the
hardware may not have finished sending, this leads to many wasted
queries of whether the descriptors are done.

This patch uses tx_free_thresh to decide when to free mbufs, and frees
tx_rs_thresh mbufs at a time. It also reworks the PMD's tx_done_cleanup
implementation to match the new mbuf freeing algorithm.

In the testpmd single-core MAC forwarding scenario, performance at 64B
improves by 10% on the Kunpeng 920 platform.
Cc: stable@dpdk.org
Signed-off-by: Chengwen Feng
Signed-off-by: Min Hu (Connor)
---
 drivers/net/hns3/hns3_rxtx.c | 118 ++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 64 deletions(-)

diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 78227a139f..114b2f397f 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -3077,40 +3077,51 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
         return 0;
 }

-static void
+static int
 hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 {
         uint16_t tx_next_clean = txq->next_to_clean;
-        uint16_t tx_next_use = txq->next_to_use;
-        uint16_t tx_bd_ready = txq->tx_bd_ready;
-        uint16_t tx_bd_max = txq->nb_tx_desc;
-        struct hns3_entry *tx_bak_pkt = &txq->sw_ring[tx_next_clean];
+        uint16_t tx_next_use = txq->next_to_use;
+        struct hns3_entry *tx_entry = &txq->sw_ring[tx_next_clean];
         struct hns3_desc *desc = &txq->tx_ring[tx_next_clean];
-        struct rte_mbuf *mbuf;
+        int i;

-        while ((!(desc->tx.tp_fe_sc_vld_ra_ri &
-                rte_cpu_to_le_16(BIT(HNS3_TXD_VLD_B)))) &&
-                tx_next_use != tx_next_clean) {
-                mbuf = tx_bak_pkt->mbuf;
-                if (mbuf) {
-                        rte_pktmbuf_free_seg(mbuf);
-                        tx_bak_pkt->mbuf = NULL;
-                }
+        if (tx_next_use >= tx_next_clean &&
+            tx_next_use < tx_next_clean + txq->tx_rs_thresh)
+                return -1;

-                desc++;
-                tx_bak_pkt++;
-                tx_next_clean++;
-                tx_bd_ready++;
-
-                if (tx_next_clean >= tx_bd_max) {
-                        tx_next_clean = 0;
-                        desc = txq->tx_ring;
-                        tx_bak_pkt = txq->sw_ring;
-                }
+        /*
+         * All mbufs can be released only when the VLD bits of all
+         * descriptors in a batch are cleared.
+         */
+        for (i = 0; i < txq->tx_rs_thresh; i++) {
+                if (desc[i].tx.tp_fe_sc_vld_ra_ri &
+                        rte_le_to_cpu_16(BIT(HNS3_TXD_VLD_B)))
+                        return -1;
         }

-        txq->next_to_clean = tx_next_clean;
-        txq->tx_bd_ready = tx_bd_ready;
+        for (i = 0; i < txq->tx_rs_thresh; i++) {
+                rte_pktmbuf_free_seg(tx_entry[i].mbuf);
+                tx_entry[i].mbuf = NULL;
+        }
+
+        /* Update numbers of available descriptor due to buffer freed */
+        txq->tx_bd_ready += txq->tx_rs_thresh;
+        txq->next_to_clean += txq->tx_rs_thresh;
+        if (txq->next_to_clean >= txq->nb_tx_desc)
+                txq->next_to_clean = 0;
+
+        return 0;
+}
+
+static inline int
+hns3_tx_free_required_buffer(struct hns3_tx_queue *txq, uint16_t required_bds)
+{
+        while (required_bds > txq->tx_bd_ready) {
+                if (hns3_tx_free_useless_buffer(txq) != 0)
+                        return -1;
+        }
+        return 0;
 }

 int
@@ -4147,8 +4158,8 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
         uint16_t nb_tx;
         uint16_t i;

-        /* free useless buffer */
-        hns3_tx_free_useless_buffer(txq);
+        if (txq->tx_bd_ready < txq->tx_free_thresh)
+                (void)hns3_tx_free_useless_buffer(txq);

         tx_next_use = txq->next_to_use;
         tx_bd_max = txq->nb_tx_desc;
@@ -4163,11 +4174,14 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
                 nb_buf = tx_pkt->nb_segs;

                 if (nb_buf > txq->tx_bd_ready) {
-                        txq->dfx_stats.queue_full_cnt++;
-                        if (nb_tx == 0)
-                                return 0;
-
-                        goto end_of_tx;
+                        /* Try to release the required MBUF, but avoid releasing
+                         * all MBUFs, otherwise, the MBUFs will be released for
+                         * a long time and may cause jitter.
+                         */
+                        if (hns3_tx_free_required_buffer(txq, nb_buf) != 0) {
+                                txq->dfx_stats.queue_full_cnt++;
+                                goto end_of_tx;
+                        }
                 }

                 /*
@@ -4577,46 +4591,22 @@ hns3_dev_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id)
 static int
 hns3_tx_done_cleanup_full(struct hns3_tx_queue *txq, uint32_t free_cnt)
 {
-        uint16_t next_to_clean = txq->next_to_clean;
-        uint16_t next_to_use = txq->next_to_use;
-        uint16_t tx_bd_ready = txq->tx_bd_ready;
-        struct hns3_entry *tx_pkt = &txq->sw_ring[next_to_clean];
-        struct hns3_desc *desc = &txq->tx_ring[next_to_clean];
+        uint16_t round_free_cnt;
         uint32_t idx;

         if (free_cnt == 0 || free_cnt > txq->nb_tx_desc)
                 free_cnt = txq->nb_tx_desc;

-        for (idx = 0; idx < free_cnt; idx++) {
-                if (next_to_clean == next_to_use)
-                        break;
+        if (txq->tx_rs_thresh == 0)
+                return 0;

-                if (desc->tx.tp_fe_sc_vld_ra_ri &
-                    rte_cpu_to_le_16(BIT(HNS3_TXD_VLD_B)))
+        round_free_cnt = roundup(free_cnt, txq->tx_rs_thresh);
+        for (idx = 0; idx < round_free_cnt; idx += txq->tx_rs_thresh) {
+                if (hns3_tx_free_useless_buffer(txq) != 0)
                         break;
-
-                if (tx_pkt->mbuf != NULL) {
-                        rte_pktmbuf_free_seg(tx_pkt->mbuf);
-                        tx_pkt->mbuf = NULL;
-                }
-
-                next_to_clean++;
-                tx_bd_ready++;
-                tx_pkt++;
-                desc++;
-                if (next_to_clean == txq->nb_tx_desc) {
-                        tx_pkt = txq->sw_ring;
-                        desc = txq->tx_ring;
-                        next_to_clean = 0;
-                }
-        }
-
-        if (idx > 0) {
-                txq->next_to_clean = next_to_clean;
-                txq->tx_bd_ready = tx_bd_ready;
         }

-        return (int)idx;
+        return RTE_MIN(idx, free_cnt);
 }

 int
-- 
2.33.0
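
The commit message above boils down to two thresholds: tx_free_thresh decides when
the transmit path bothers to look at the descriptor ring at all, and tx_rs_thresh
decides how many buffers one cleanup pass frees. The standalone sketch below
illustrates only that policy and is not the driver code: sketch_txq, sketch_desc,
VLD_BIT, free_one_batch and maybe_clean are invented stand-ins for hns3_tx_queue,
hns3_desc, HNS3_TXD_VLD_B and the patch's hns3_tx_free_useless_buffer, plain
malloc/free stands in for rte_pktmbuf_free_seg, and the ring size is assumed to be
a multiple of tx_rs_thresh so a batch never wraps.

/*
 * Minimal sketch of the batched Tx buffer freeing policy. All names here
 * are invented for illustration; none of this is hns3 driver code.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define VLD_BIT 0x1u                  /* stand-in for the descriptor VLD flag */

struct sketch_desc {
        uint16_t flags;               /* hardware clears VLD_BIT when the BD is done */
};

struct sketch_txq {
        struct sketch_desc *ring;     /* descriptor ring */
        void **mbufs;                 /* software ring of buffers to free */
        uint16_t nb_desc;             /* assumed to be a multiple of tx_rs_thresh */
        uint16_t next_to_clean;
        uint16_t next_to_use;
        uint16_t tx_bd_ready;         /* number of free descriptors */
        uint16_t tx_rs_thresh;        /* buffers freed per cleanup batch */
        uint16_t tx_free_thresh;      /* clean only when tx_bd_ready drops below this */
};

/* Free one batch of tx_rs_thresh buffers, but only if the hardware has
 * finished with every descriptor in the batch (all VLD bits cleared). */
static int free_one_batch(struct sketch_txq *q)
{
        uint16_t clean = q->next_to_clean;
        uint16_t i;

        /* The batch cannot be recycled while next_to_use still points into it. */
        if (q->next_to_use >= clean && q->next_to_use < clean + q->tx_rs_thresh)
                return -1;

        for (i = 0; i < q->tx_rs_thresh; i++)
                if (q->ring[clean + i].flags & VLD_BIT)
                        return -1;    /* hardware not done with this batch yet */

        for (i = 0; i < q->tx_rs_thresh; i++) {
                free(q->mbufs[clean + i]);
                q->mbufs[clean + i] = NULL;
        }

        q->tx_bd_ready += q->tx_rs_thresh;
        q->next_to_clean += q->tx_rs_thresh;
        if (q->next_to_clean >= q->nb_desc)
                q->next_to_clean = 0;

        return 0;
}

/* Transmit-path hook: poll the ring only when free descriptors run low. */
static void maybe_clean(struct sketch_txq *q)
{
        if (q->tx_bd_ready < q->tx_free_thresh)
                (void)free_one_batch(q);
}

int main(void)
{
        struct sketch_txq q;
        uint16_t i;

        memset(&q, 0, sizeof(q));
        q.nb_desc = 8;
        q.tx_rs_thresh = 4;
        q.tx_free_thresh = 6;
        q.ring = calloc(q.nb_desc, sizeof(*q.ring));
        q.mbufs = calloc(q.nb_desc, sizeof(void *));

        /* Pretend 4 packets were queued on descriptors 0..3 and the hardware
         * already sent them (their VLD bits are 0 because of calloc). */
        for (i = 0; i < 4; i++)
                q.mbufs[i] = malloc(16);
        q.next_to_use = 4;
        q.tx_bd_ready = 4;            /* 4 < tx_free_thresh, so cleanup runs */

        maybe_clean(&q);
        printf("tx_bd_ready after cleanup: %u\n", (unsigned)q.tx_bd_ready); /* 8 */

        free(q.ring);
        free(q.mbufs);
        return 0;
}

As in the patch, a batch is freed only after two checks pass: next_to_use must not
point into the batch, and the hardware must have cleared the VLD bit of every
descriptor in it; otherwise the pass simply returns and retries later instead of
spinning on descriptors that are not yet done.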