- * [dpdk-stable] [PATCH 19.11 v2 1/5] net/hns3: remove unnecessary assignments in Tx
  2020-06-02  1:28 [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver Wei Hu (Xavier)
@ 2020-06-02  1:28 ` Wei Hu (Xavier)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 2/5] net/hns3: reduce judgements of free Tx ring space Wei Hu (Xavier)
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Wei Hu (Xavier) @ 2020-06-02  1:28 UTC (permalink / raw)
  To: luca.boccassi; +Cc: stable, xavier.huwei
[ upstream commit 27f97077853de6f6f55f3d9411657d57809f0123 ]
This patch removes unnecessary assignments in the '.tx_pkt_burst' ops
implementation function to avoid performance loss.
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Yisen Zhuang <yisen.zhuang@huawei.com>
---
 drivers/net/hns3/hns3_rxtx.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index a0fcb4c..44e883e 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -1686,8 +1686,7 @@ hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 		(tx_next_use != tx_next_clean || tx_bd_ready < tx_bd_max)) {
 		mbuf = tx_bak_pkt->mbuf;
 		if (mbuf) {
-			mbuf->next = NULL;
-			rte_pktmbuf_free(mbuf);
+			rte_pktmbuf_free_seg(mbuf);
 			tx_bak_pkt->mbuf = NULL;
 		}
 
@@ -2105,9 +2104,7 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	struct rte_mbuf *new_pkt;
 	struct rte_mbuf *tx_pkt;
 	struct rte_mbuf *m_seg;
-	struct rte_mbuf *temp;
 	uint32_t nb_hold = 0;
-	uint16_t tx_next_clean;
 	uint16_t tx_next_use;
 	uint16_t tx_bd_ready;
 	uint16_t tx_pkt_num;
@@ -2122,11 +2119,8 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	if (tx_bd_ready == 0)
 		return 0;
 
-	tx_next_clean = txq->next_to_clean;
 	tx_next_use   = txq->next_to_use;
 	tx_bd_max     = txq->nb_tx_desc;
-	tx_bak_pkt = &txq->sw_ring[tx_next_clean];
-
 	tx_pkt_num = (tx_bd_ready < nb_pkts) ? tx_bd_ready : nb_pkts;
 
 	/* send packets */
@@ -2181,9 +2175,8 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		i = 0;
 		do {
 			fill_desc(txq, tx_next_use, m_seg, (i == 0), 0);
-			temp = m_seg->next;
 			tx_bak_pkt->mbuf = m_seg;
-			m_seg = temp;
+			m_seg = m_seg->next;
 			tx_next_use++;
 			tx_bak_pkt++;
 			if (tx_next_use >= tx_bd_max) {
@@ -2202,7 +2195,6 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 	if (likely(nb_tx)) {
 		hns3_queue_xmit(txq, nb_hold);
-		txq->next_to_clean = tx_next_clean;
 		txq->tx_bd_ready   = tx_bd_ready - nb_hold;
 	}
 
-- 
2.7.4
^ permalink raw reply	[flat|nested] 7+ messages in thread
- * [dpdk-stable] [PATCH 19.11 v2 2/5] net/hns3: reduce judgements of free Tx ring space
  2020-06-02  1:28 [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver Wei Hu (Xavier)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 1/5] net/hns3: remove unnecessary assignments in Tx Wei Hu (Xavier)
@ 2020-06-02  1:28 ` Wei Hu (Xavier)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 3/5] net/hns3: remove one IO barrier in Rx Wei Hu (Xavier)
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Wei Hu (Xavier) @ 2020-06-02  1:28 UTC (permalink / raw)
  To: luca.boccassi; +Cc: stable, xavier.huwei
From: Yisen Zhuang <yisen.zhuang@huawei.com>
[ upstream commit eb570862a206adb53932525ed19211cee0f940de ]
This patch reduces the number of checks of the free Tx ring space in the
'tx_pkt_burst' ops implementation function to avoid performance loss.
Due to a hardware constraint, one Tx Buffer Descriptor must be kept
reserved in the Tx ring of the hns3 network engine.
Signed-off-by: Yisen Zhuang <yisen.zhuang@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_rxtx.c | 32 +++++++-------------------------
 1 file changed, 7 insertions(+), 25 deletions(-)
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 44e883e..af7972f 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -643,7 +643,7 @@ hns3_init_tx_queue(struct hns3_tx_queue *queue)
 
 	txq->next_to_use = 0;
 	txq->next_to_clean = 0;
-	txq->tx_bd_ready = txq->nb_tx_desc;
+	txq->tx_bd_ready = txq->nb_tx_desc - 1;
 	hns3_init_tx_queue_hw(txq);
 }
 
@@ -1640,7 +1640,7 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	txq->hns = hns;
 	txq->next_to_use = 0;
 	txq->next_to_clean = 0;
-	txq->tx_bd_ready = txq->nb_tx_desc;
+	txq->tx_bd_ready = txq->nb_tx_desc - 1;
 	txq->port_id = dev->data->port_id;
 	txq->configured = true;
 	txq->io_base = (void *)((char *)hw->io_base + HNS3_TQP_REG_OFFSET +
@@ -1652,19 +1652,6 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	return 0;
 }
 
-static inline int
-tx_ring_dist(struct hns3_tx_queue *txq, int begin, int end)
-{
-	return (end - begin + txq->nb_tx_desc) % txq->nb_tx_desc;
-}
-
-static inline int
-tx_ring_space(struct hns3_tx_queue *txq)
-{
-	return txq->nb_tx_desc -
-		tx_ring_dist(txq, txq->next_to_clean, txq->next_to_use) - 1;
-}
-
 static inline void
 hns3_queue_xmit(struct hns3_tx_queue *txq, uint32_t buf_num)
 {
@@ -1683,7 +1670,7 @@ hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 	struct rte_mbuf *mbuf;
 
 	while ((!hns3_get_bit(desc->tx.tp_fe_sc_vld_ra_ri, HNS3_TXD_VLD_B)) &&
-		(tx_next_use != tx_next_clean || tx_bd_ready < tx_bd_max)) {
+		tx_next_use != tx_next_clean) {
 		mbuf = tx_bak_pkt->mbuf;
 		if (mbuf) {
 			rte_pktmbuf_free_seg(mbuf);
@@ -2106,7 +2093,6 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	struct rte_mbuf *m_seg;
 	uint32_t nb_hold = 0;
 	uint16_t tx_next_use;
-	uint16_t tx_bd_ready;
 	uint16_t tx_pkt_num;
 	uint16_t tx_bd_max;
 	uint16_t nb_buf;
@@ -2115,13 +2101,10 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 	/* free useless buffer */
 	hns3_tx_free_useless_buffer(txq);
-	tx_bd_ready = txq->tx_bd_ready;
-	if (tx_bd_ready == 0)
-		return 0;
 
 	tx_next_use   = txq->next_to_use;
 	tx_bd_max     = txq->nb_tx_desc;
-	tx_pkt_num = (tx_bd_ready < nb_pkts) ? tx_bd_ready : nb_pkts;
+	tx_pkt_num = nb_pkts;
 
 	/* send packets */
 	tx_bak_pkt = &txq->sw_ring[tx_next_use];
@@ -2130,7 +2113,7 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 		nb_buf = tx_pkt->nb_segs;
 
-		if (nb_buf > tx_ring_space(txq)) {
+		if (nb_buf > txq->tx_bd_ready) {
 			if (nb_tx == 0)
 				return 0;
 
@@ -2189,14 +2172,13 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 		nb_hold += i;
 		txq->next_to_use = tx_next_use;
+		txq->tx_bd_ready -= i;
 	}
 
 end_of_tx:
 
-	if (likely(nb_tx)) {
+	if (likely(nb_tx))
 		hns3_queue_xmit(txq, nb_hold);
-		txq->tx_bd_ready   = tx_bd_ready - nb_hold;
-	}
 
 	return nb_tx;
 }
-- 
2.7.4
^ permalink raw reply	[flat|nested] 7+ messages in thread
- * [dpdk-stable] [PATCH 19.11 v2 3/5] net/hns3: remove one IO barrier in Rx
  2020-06-02  1:28 [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver Wei Hu (Xavier)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 1/5] net/hns3: remove unnecessary assignments in Tx Wei Hu (Xavier)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 2/5] net/hns3: reduce judgements of free Tx ring space Wei Hu (Xavier)
@ 2020-06-02  1:28 ` Wei Hu (Xavier)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 4/5] net/hns3: add free threshold " Wei Hu (Xavier)
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Wei Hu (Xavier) @ 2020-06-02  1:28 UTC (permalink / raw)
  To: luca.boccassi; +Cc: stable, xavier.huwei
[ upstream commit 5cf7a75b2c3b33c4c3579eba6716a4c7ca02ec5b ]
When receiving a packet, the hns3 hardware network engine first writes the
packet content to the memory pointed to by the 'addr' field of the Rx
Buffer Descriptor, then fills the result of parsing the packet,
including the valid field, into the Rx Buffer Descriptor in one write
operation, and finally writes the number of Buffer Descriptors not yet
processed by the driver to the HNS3_RING_RX_FBDNUM_REG register.
This patch optimizes the Rx performance by removing one rte_io_rmb call
in the '.rx_pkt_burst' ops implementation function named hns3_recv_pkts.
The changes are as follows:
1. The driver no longer reads the HNS3_RING_RX_FBDNUM_REG register, so one
   rte_io_rmb call is removed; instead it directly reads the valid flag of
   the Rx Buffer Descriptor to check whether the BD is ready.
2. Delete the non_vld_descs field from the statistics of the hns3 driver,
   because it is now a common case that the valid flag of the Rx Buffer
   Descriptor read by the driver is not set.
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_rxtx.c  | 12 +++---------
 drivers/net/hns3/hns3_rxtx.h  |  1 -
 drivers/net/hns3/hns3_stats.c |  3 ---
 3 files changed, 3 insertions(+), 13 deletions(-)
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index af7972f..23bb115 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -1225,7 +1225,6 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	rxq->io_base = (void *)((char *)hw->io_base + HNS3_TQP_REG_OFFSET +
 				idx * HNS3_TQP_REG_SIZE);
 	rxq->rx_buf_len = hw->rx_buf_len;
-	rxq->non_vld_descs = 0;
 	rxq->l2_errors = 0;
 	rxq->pkt_len_errors = 0;
 	rxq->l3_csum_erros = 0;
@@ -1472,7 +1471,6 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint16_t pkt_len;
 	uint16_t nb_rx;
 	uint16_t rx_id;
-	int num;                        /* num of desc in ring */
 	int ret;
 
 	nb_rx = 0;
@@ -1486,15 +1484,11 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	last_seg = rxq->pkt_last_seg;
 	sw_ring = rxq->sw_ring;
 
-	/* Get num of packets in descriptor ring */
-	num = hns3_read_dev(rxq, HNS3_RING_RX_FBDNUM_REG);
-	while (nb_rx_bd < num && nb_rx < nb_pkts) {
+	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
 		bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
-		if (unlikely(!hns3_get_bit(bd_base_info, HNS3_RXD_VLD_B))) {
-			rxq->non_vld_descs++;
+		if (unlikely(!hns3_get_bit(bd_base_info, HNS3_RXD_VLD_B)))
 			break;
-		}
 
 		nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
 		if (unlikely(nmb == NULL)) {
@@ -1505,7 +1499,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		nb_rx_bd++;
 		rxe = &sw_ring[rx_id];
 		rx_id++;
-		if (rx_id == rxq->nb_rx_desc)
+		if (unlikely(rx_id == rxq->nb_rx_desc))
 			rx_id = 0;
 
 		rte_prefetch0(sw_ring[rx_id].mbuf);
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index b751472..00b92cf 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -245,7 +245,6 @@ struct hns3_rx_queue {
 	bool rx_deferred_start; /* don't start this queue in dev start */
 	bool configured;        /* indicate if rx queue has been configured */
 
-	uint64_t non_vld_descs; /* num of non valid rx descriptors */
 	uint64_t l2_errors;
 	uint64_t pkt_len_errors;
 	uint64_t l3_csum_erros;
diff --git a/drivers/net/hns3/hns3_stats.c b/drivers/net/hns3/hns3_stats.c
index 6e13948..10cc757 100644
--- a/drivers/net/hns3/hns3_stats.c
+++ b/drivers/net/hns3/hns3_stats.c
@@ -219,8 +219,6 @@ static const struct hns3_xstats_name_offset hns3_reset_stats_strings[] = {
 
 /* The statistic of errors in Rx BD */
 static const struct hns3_xstats_name_offset hns3_rx_bd_error_strings[] = {
-	{"NONE_VALIDATED_DESCRIPTORS",
-		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(non_vld_descs)},
 	{"RX_PKT_LEN_ERRORS",
 		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(pkt_len_errors)},
 	{"L2_RX_ERRORS",
@@ -512,7 +510,6 @@ hns3_stats_reset(struct rte_eth_dev *eth_dev)
 		rxq = eth_dev->data->rx_queues[i];
 		if (rxq) {
 			rxq->pkt_len_errors = 0;
-			rxq->non_vld_descs = 0;
 			rxq->l2_errors = 0;
 			rxq->l3_csum_erros = 0;
 			rxq->l4_csum_erros = 0;
-- 
2.7.4
^ permalink raw reply	[flat|nested] 7+ messages in thread
- * [dpdk-stable] [PATCH 19.11 v2 4/5] net/hns3: add free threshold in Rx
  2020-06-02  1:28 [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver Wei Hu (Xavier)
                   ` (2 preceding siblings ...)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 3/5] net/hns3: remove one IO barrier in Rx Wei Hu (Xavier)
@ 2020-06-02  1:28 ` Wei Hu (Xavier)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 5/5] net/hns3: replace memory barrier with data dependency order Wei Hu (Xavier)
  2020-06-02 13:27 ` [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver Luca Boccassi
  5 siblings, 0 replies; 7+ messages in thread
From: Wei Hu (Xavier) @ 2020-06-02  1:28 UTC (permalink / raw)
  To: luca.boccassi; +Cc: stable, xavier.huwei
[ upstream commit ffd0ec015b9f51d3f39e320a02448fc5d43932be ]
This patch optimizes the Rx performance by adding rx_free_thresh related
processing in the '.rx_pkt_burst' ops implementation function named
hns3_recv_pkts. The related changes are as follows:
1. Adding rx_free_thresh related processing to reduce the number of
   writes to the HNS3_RING_RX_HEAD_REG register.
2. Adjusting the internal macro named DEFAULT_RX_FREE_THRESH to 32 and
   adjusting HNS3_MIN_RING_DESC to 64 to make the effect of the threshold
   more obvious.
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_rxtx.c | 12 ++++++++++--
 drivers/net/hns3/hns3_rxtx.h |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 23bb115..34e7448 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -30,7 +30,7 @@
 #include "hns3_logs.h"
 
 #define HNS3_CFG_DESC_NUM(num)	((num) / 8 - 1)
-#define DEFAULT_RX_FREE_THRESH	16
+#define DEFAULT_RX_FREE_THRESH	32
 
 static void
 hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
@@ -610,6 +610,7 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 
 	rxq->next_to_use = 0;
 	rxq->next_to_clean = 0;
+	rxq->nb_rx_hold = 0;
 	hns3_init_rx_queue_hw(rxq);
 
 	return 0;
@@ -624,6 +625,7 @@ hns3_fake_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
 	rxq->next_to_use = 0;
 	rxq->next_to_clean = 0;
+	rxq->nb_rx_hold = 0;
 	hns3_init_rx_queue_hw(rxq);
 }
 
@@ -1577,7 +1579,13 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	rxq->next_to_clean = rx_id;
 	rxq->pkt_first_seg = first_seg;
 	rxq->pkt_last_seg = last_seg;
-	hns3_clean_rx_buffers(rxq, nb_rx_bd);
+
+	nb_rx_bd = nb_rx_bd + rxq->nb_rx_hold;
+	if (nb_rx_bd > rxq->rx_free_thresh) {
+		hns3_clean_rx_buffers(rxq, nb_rx_bd);
+		nb_rx_bd = 0;
+	}
+	rxq->nb_rx_hold = nb_rx_bd;
 
 	return nb_rx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index 00b92cf..771f3c9 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -5,7 +5,7 @@
 #ifndef _HNS3_RXTX_H_
 #define _HNS3_RXTX_H_
 
-#define	HNS3_MIN_RING_DESC	32
+#define	HNS3_MIN_RING_DESC	64
 #define	HNS3_MAX_RING_DESC	32768
 #define HNS3_DEFAULT_RING_DESC  1024
 #define	HNS3_ALIGN_RING_DESC	32
-- 
2.7.4
^ permalink raw reply	[flat|nested] 7+ messages in thread
- * [dpdk-stable] [PATCH 19.11 v2 5/5] net/hns3: replace memory barrier with data dependency order
  2020-06-02  1:28 [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver Wei Hu (Xavier)
                   ` (3 preceding siblings ...)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 4/5] net/hns3: add free threshold " Wei Hu (Xavier)
@ 2020-06-02  1:28 ` Wei Hu (Xavier)
  2020-06-02 13:27 ` [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver Luca Boccassi
  5 siblings, 0 replies; 7+ messages in thread
From: Wei Hu (Xavier) @ 2020-06-02  1:28 UTC (permalink / raw)
  To: luca.boccassi; +Cc: stable, xavier.huwei
From: Chengwen Feng <fengchengwen@huawei.com>
[ upstream commit 8162238b7d92bb9ce05bd7f8244bed7ffca2d5b6 ]
This patch optimizes the Rx performance by using data dependency
ordering instead of the memory barrier rte_cio_rmb in the
'.rx_pkt_burst' ops implementation function named hns3_recv_pkts.
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_rxtx.c | 85 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 73 insertions(+), 12 deletions(-)
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 34e7448..34dc389 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -1453,13 +1453,14 @@ hns3_rx_set_cksum_flag(struct rte_mbuf *rxm, uint64_t packet_type,
 uint16_t
 hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
+	volatile struct hns3_desc *rx_ring;  /* RX ring (desc) */
+	volatile struct hns3_desc *rxdp;     /* pointer of the current desc */
 	struct hns3_rx_queue *rxq;      /* RX queue */
-	struct hns3_desc *rx_ring;      /* RX ring (desc) */
 	struct hns3_entry *sw_ring;
 	struct hns3_entry *rxe;
-	struct hns3_desc *rxdp;         /* pointer of the current desc */
 	struct rte_mbuf *first_seg;
 	struct rte_mbuf *last_seg;
+	struct hns3_desc rxd;
 	struct rte_mbuf *nmb;           /* pointer of the new mbuf */
 	struct rte_mbuf *rxm;
 	struct rte_eth_dev *dev;
@@ -1491,6 +1492,67 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
 		if (unlikely(!hns3_get_bit(bd_base_info, HNS3_RXD_VLD_B)))
 			break;
+		/*
+		 * The interactive process between software and hardware of
+		 * receiving a new packet in hns3 network engine:
+		 * 1. Hardware network engine firstly writes the packet content
+		 *    to the memory pointed by the 'addr' field of the Rx Buffer
+		 *    Descriptor, secondly fills the result of parsing the
+		 *    packet include the valid field into the Rx Buffer
+		 *    Descriptor in one write operation.
+		 * 2. Driver reads the Rx BD's valid field in the loop to check
+		 *    whether it's valid, if valid then assign a new address to
+		 *    the addr field, clear the valid field, get the other
+		 *    information of the packet by parsing Rx BD's other fields,
+		 *    finally write back the number of Rx BDs processed by the
+		 *    driver to the HNS3_RING_RX_HEAD_REG register to inform
+		 *    hardware.
+		 * In the above process, the ordering is very important. We must
+		 * make sure that CPU read Rx BD's other fields only after the
+		 * Rx BD is valid.
+		 *
+		 * There are two types of re-ordering: compiler re-ordering and
+		 * CPU re-ordering under the ARMv8 architecture.
+		 * 1. we use volatile to deal with compiler re-ordering, so you
+		 *    can see that rx_ring/rxdp defined with volatile.
+		 * 2. we commonly use memory barrier to deal with CPU
+		 *    re-ordering, but the cost is high.
+		 *
+		 * In order to solve the high cost of using memory barrier, we
+		 * use the data dependency order under the ARMv8 architecture,
+		 * for example:
+		 *      instr01: load A
+		 *      instr02: load B <- A
+		 * the instr02 will always execute after instr01.
+		 *
+		 * To construct the data dependency ordering, we use the
+		 * following assignment:
+		 *      rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+		 *                 (1u<<HNS3_RXD_VLD_B)]
+		 * Using gcc compiler under the ARMv8 architecture, the related
+		 * assembly code example as follows:
+		 * note: (1u << HNS3_RXD_VLD_B) equal 0x10
+		 *      instr01: ldr w26, [x22, #28]  --read bd_base_info
+		 *      instr02: and w0, w26, #0x10   --calc bd_base_info & 0x10
+		 *      instr03: sub w0, w0, #0x10    --calc (bd_base_info &
+		 *                                            0x10) - 0x10
+		 *      instr04: add x0, x22, x0, lsl #5 --calc copy source addr
+		 *      instr05: ldp x2, x3, [x0]
+		 *      instr06: stp x2, x3, [x29, #256] --copy BD's [0 ~ 15]B
+		 *      instr07: ldp x4, x5, [x0, #16]
+		 *      instr08: stp x4, x5, [x29, #272] --copy BD's [16 ~ 31]B
+		 * the instr05~08 depend on x0's value, x0 depends on w26's
+		 * value, and w26 is the bd_base_info; this forms the data
+		 * dependency ordering.
+		 * note: if BD is valid, (bd_base_info & (1u<<HNS3_RXD_VLD_B)) -
+		 *       (1u<<HNS3_RXD_VLD_B) is always zero, so the
+		 *       assignment is correct.
+		 *
+		 * So we use the data dependency ordering instead of memory
+		 * barrier to improve receive performance.
+		 */
+		rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+			   (1u << HNS3_RXD_VLD_B)];
 
 		nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
 		if (unlikely(nmb == NULL)) {
@@ -1514,14 +1576,13 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		rxe->mbuf = nmb;
 
 		dma_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
-		rxdp->addr = dma_addr;
 		rxdp->rx.bd_base_info = 0;
+		rxdp->addr = dma_addr;
 
-		rte_cio_rmb();
 		/* Load remained descriptor data and extract necessary fields */
-		data_len = (uint16_t)(rte_le_to_cpu_16(rxdp->rx.size));
-		l234_info = rte_le_to_cpu_32(rxdp->rx.l234_info);
-		ol_info = rte_le_to_cpu_32(rxdp->rx.ol_info);
+		data_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.size));
+		l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
+		ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
 
 		if (first_seg == NULL) {
 			first_seg = rxm;
@@ -1540,14 +1601,14 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		}
 
 		/* The last buffer of the received packet */
-		pkt_len = (uint16_t)(rte_le_to_cpu_16(rxdp->rx.pkt_len));
+		pkt_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.pkt_len));
 		first_seg->pkt_len = pkt_len;
 		first_seg->port = rxq->port_id;
-		first_seg->hash.rss = rte_le_to_cpu_32(rxdp->rx.rss_hash);
+		first_seg->hash.rss = rte_le_to_cpu_32(rxd.rx.rss_hash);
 		first_seg->ol_flags = PKT_RX_RSS_HASH;
 		if (unlikely(hns3_get_bit(bd_base_info, HNS3_RXD_LUM_B))) {
 			first_seg->hash.fdir.hi =
-				rte_le_to_cpu_32(rxdp->rx.fd_id);
+				rte_le_to_cpu_32(rxd.rx.fd_id);
 			first_seg->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
 		}
 		rxm->next = NULL;
@@ -1565,9 +1626,9 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 					       first_seg->packet_type,
 					       cksum_err);
 
-		first_seg->vlan_tci = rte_le_to_cpu_16(rxdp->rx.vlan_tag);
+		first_seg->vlan_tci = rte_le_to_cpu_16(rxd.rx.vlan_tag);
 		first_seg->vlan_tci_outer =
-			rte_le_to_cpu_16(rxdp->rx.ot_vlan_tag);
+			rte_le_to_cpu_16(rxd.rx.ot_vlan_tag);
 		rx_pkts[nb_rx++] = first_seg;
 		first_seg = NULL;
 		continue;
-- 
2.7.4
^ permalink raw reply	[flat|nested] 7+ messages in thread
- * Re: [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver
  2020-06-02  1:28 [dpdk-stable] [PATCH 19.11 v2 0/5] improving I/O backport for hns3 PMD driver Wei Hu (Xavier)
                   ` (4 preceding siblings ...)
  2020-06-02  1:28 ` [dpdk-stable] [PATCH 19.11 v2 5/5] net/hns3: replace memory barrier with data dependency order Wei Hu (Xavier)
@ 2020-06-02 13:27 ` Luca Boccassi
  5 siblings, 0 replies; 7+ messages in thread
From: Luca Boccassi @ 2020-06-02 13:27 UTC (permalink / raw)
  To: Wei Hu (Xavier); +Cc: stable
On Tue, 2020-06-02 at 09:28 +0800, Wei Hu (Xavier) wrote:
> From: Chengwen Feng <fengchengwen@huawei.com>
> 
> This series are backport patches about improving I/O performance
> to DPDK 19.11.3 for hns3 PMD driver.
> 
> v1 - > v2:
>   update the upstream commit ids those from dpdk/master branch.
> 
> Chengwen Feng (1):
>   net/hns3: replace memory barrier with data dependency order
> 
> Wei Hu (Xavier) (3):
>   net/hns3: remove unnecessary assignments in Tx
>   net/hns3: remove one IO barrier in Rx
>   net/hns3: add free threshold in Rx
> 
> Yisen Zhuang (1):
>   net/hns3: reduce judgements of free Tx ring space
> 
>  drivers/net/hns3/hns3_rxtx.c  | 153 ++++++++++++++++++++++++++----------------
>  drivers/net/hns3/hns3_rxtx.h  |   3 +-
>  drivers/net/hns3/hns3_stats.c |   3 -
>  3 files changed, 96 insertions(+), 63 deletions(-)
Series-Acked-by: Luca Boccassi <bluca@debian.org>
Thanks, applied and pushed.
^ permalink raw reply	[flat|nested] 7+ messages in thread