DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] net/hns3: support Tx push quick doorbell to improve perf
@ 2021-06-13  1:42 Min Hu (Connor)
  2021-06-13  1:42 ` Min Hu (Connor)
  0 siblings, 1 reply; 9+ messages in thread
From: Min Hu (Connor) @ 2021-06-13  1:42 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit

From: Chengwen Feng <fengchengwen@huawei.com>

Kunpeng 930 support tx push mode which could improve performance, It
works like below:
1. Add pcie bar45 which support driver direct write the Tx descriptor
or tail reg to it.
2. Support three operations: a) direct write one Tx descriptor, b)
direct write two Tx descriptors, c) direct write tail reg.
3. The original tail reg located at bar23, the above bar45 tail reg
could provide better bandwidth from the hardware perspective.

The hns3 driver only support direct write tail reg (also have the name
of quick doorbell), the detail:
1. Considering compatibility, firmware will report tx push capa if the
hardware support it.
2. Add control macro RTE_LIBRTE_HNS3_ENABLE_TX_PUSH which was not
defined default.
3. If user define macro RTE_LIBRTE_HNS3_ENABLE_TX_PUSH and hardware
support, then driver will direct write bar45 tail reg to inform the
hardware.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c        |  4 +-
 drivers/net/hns3/hns3_ethdev.h        |  4 ++
 drivers/net/hns3/hns3_ethdev_vf.c     |  4 +-
 drivers/net/hns3/hns3_rxtx.c          | 73 ++++++++++++++++++++++++++++++++++-
 drivers/net/hns3/hns3_rxtx.h          | 19 +++++++++
 drivers/net/hns3/hns3_rxtx_vec_neon.h |  2 +-
 drivers/net/hns3/hns3_rxtx_vec_sve.c  |  2 +-
 7 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 2049130..467d770 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -5177,6 +5177,8 @@ hns3_init_pf(struct rte_eth_dev *eth_dev)
 		goto err_cmd_init;
 	}
 
+	hns3_tx_push_init(eth_dev);
+
 	/*
 	 * To ensure that the hardware environment is clean during
 	 * initialization, the driver actively clear the hardware environment
@@ -7422,8 +7424,8 @@ hns3_dev_init(struct rte_eth_dev *eth_dev)
 				     "process, ret = %d", ret);
 			goto err_mp_init_secondary;
 		}
-
 		hw->secondary_cnt++;
+		hns3_tx_push_init(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 1714eba..575bacd 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -862,6 +862,7 @@ enum {
 	HNS3_DEV_SUPPORT_COPPER_B,
 	HNS3_DEV_SUPPORT_FD_QUEUE_REGION_B,
 	HNS3_DEV_SUPPORT_PTP_B,
+	HNS3_DEV_SUPPORT_TX_PUSH_B,
 	HNS3_DEV_SUPPORT_INDEP_TXRX_B,
 	HNS3_DEV_SUPPORT_STASH_B,
 	HNS3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
@@ -900,6 +901,9 @@ enum {
 #define hns3_dev_ras_imp_supported(hw) \
 	hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_RAS_IMP_B)
 
+#define hns3_dev_tx_push_supported(hw) \
+		hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_TX_PUSH_B)
+
 #define HNS3_DEV_PRIVATE_TO_HW(adapter) \
 	(&((struct hns3_adapter *)adapter)->hw)
 #define HNS3_DEV_PRIVATE_TO_PF(adapter) \
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 41dd8ee..1e94a1e 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -1921,6 +1921,8 @@ hns3vf_init_vf(struct rte_eth_dev *eth_dev)
 		goto err_cmd_init;
 	}
 
+	hns3_tx_push_init(eth_dev);
+
 	/* Get VF resource */
 	ret = hns3_query_vf_resource(hw);
 	if (ret)
@@ -2928,8 +2930,8 @@ hns3vf_dev_init(struct rte_eth_dev *eth_dev)
 					  "process, ret = %d", ret);
 			goto err_mp_init_secondary;
 		}
-
 		hw->secondary_cnt++;
+		hns3_tx_push_init(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 1d7a769..88d4fc0 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -2892,6 +2892,69 @@ hns3_tx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_txconf *conf,
 	return 0;
 }
 
+static void *
+hns3_tx_push_get_queue_tail_reg(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+#define HNS3_TX_PUSH_TQP_REGION_SIZE		0x10000
+#define HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET	64
+#define HNS3_TX_PUSH_PCI_BAR_INDEX		4
+
+	struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev->device);
+	uint8_t bar_id = HNS3_TX_PUSH_PCI_BAR_INDEX;
+
+	/*
+	 * If device support tx push then its pcie bar45 must exist, and DPDK
+	 * framework will mmap the bar45 default in pci probe stage.
+	 *
+	 * In the bar45, the first half is for roce, and the second half is for
+	 * nic, every TQP occupy 64KB.
+	 *
+	 * The quick doorbell located at 64B offset in the TQP region.
+	 */
+	return (void *)((char *)pci_dev->mem_resource[bar_id].addr +
+			(pci_dev->mem_resource[bar_id].len >> 1) +
+			HNS3_TX_PUSH_TQP_REGION_SIZE * queue_id +
+			HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET);
+}
+
+void
+hns3_tx_push_init(struct rte_eth_dev *dev)
+{
+	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	volatile uint32_t *reg;
+	uint32_t val;
+
+	if (!hns3_dev_tx_push_supported(hw))
+		return;
+
+	reg = (volatile uint32_t *)hns3_tx_push_get_queue_tail_reg(dev, 0);
+	/*
+	 * Because the size of bar45 is about 8GB size, it may take a long time
+	 * to do the page fault in Tx process when work with vfio-pci, so use
+	 * one read operation to make kernel setup page table mapping for bar45
+	 * in the init stage.
+	 * Note: the bar45 is readable but the result is all 1.
+	 */
+	val = *reg;
+	RTE_SET_USED(val);
+}
+
+static void
+hns3_tx_push_queue_init(struct rte_eth_dev *dev,
+			uint16_t queue_id,
+			struct hns3_tx_queue *txq)
+{
+	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	if (!hns3_dev_tx_push_supported(hw)) {
+		txq->tx_push_enable = false;
+		return;
+	}
+
+	txq->io_tail_reg = (volatile void *)hns3_tx_push_get_queue_tail_reg(dev,
+						queue_id);
+	txq->tx_push_enable = true;
+}
+
 int
 hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_txconf *conf)
@@ -2983,6 +3046,12 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	memset(&txq->basic_stats, 0, sizeof(struct hns3_tx_basic_stats));
 	memset(&txq->dfx_stats, 0, sizeof(struct hns3_tx_dfx_stats));
 
+	/*
+	 * Call hns3_tx_push_queue_init after assigned io_tail_reg field because
+	 * it may overwrite the io_tail_reg field.
+	 */
+	hns3_tx_push_queue_init(dev, idx, txq);
+
 	rte_spinlock_lock(&hw->lock);
 	dev->data->tx_queues[idx] = txq;
 	rte_spinlock_unlock(&hw->lock);
@@ -4029,7 +4098,7 @@ hns3_xmit_pkts_simple(void *tx_queue,
 	hns3_tx_fill_hw_ring(txq, tx_pkts + nb_tx, nb_pkts - nb_tx);
 	txq->next_to_use += nb_pkts - nb_tx;
 
-	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
+	hns3_write_txq_tail_reg(txq, nb_pkts);
 
 	return nb_pkts;
 }
@@ -4146,7 +4215,7 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 end_of_tx:
 
 	if (likely(nb_tx))
-		hns3_write_reg_opt(txq->io_tail_reg, nb_hold);
+		hns3_write_txq_tail_reg(txq, nb_hold);
 
 	return nb_tx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index ba24e00..7419386 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -419,6 +419,7 @@ struct hns3_tx_dfx_stats {
 };
 
 struct hns3_tx_queue {
+	/* The io_tail_reg is write-only if working in tx push mode */
 	volatile void *io_tail_reg;
 	struct hns3_desc *tx_ring;
 	struct hns3_entry *sw_ring;
@@ -659,6 +660,23 @@ hns3_rx_calc_ptype(struct hns3_rx_queue *rxq, const uint32_t l234_info,
 		return ptype_tbl->l3table[l3id] | ptype_tbl->l4table[l4id];
 }
 
+/*
+ * If enable using tx push feature and also device support it, then use qick
+ * doorbell (bar45) to inform the hardware.
+ *
+ * The other cases (such as: device don't support or user don't enable using)
+ * then use normal doorbell (bar23) to inform the hardware.
+ */
+static inline void
+hns3_write_txq_tail_reg(struct hns3_tx_queue *txq, uint32_t value)
+{
+	rte_io_wmb();
+	if (txq->tx_push_enable)
+		rte_write64_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
+	else
+		rte_write32_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
+}
+
 void hns3_dev_rx_queue_release(void *queue);
 void hns3_dev_tx_queue_release(void *queue);
 void hns3_free_all_queues(struct rte_eth_dev *dev);
@@ -741,5 +759,6 @@ int hns3_tx_done_cleanup(void *txq, uint32_t free_cnt);
 void hns3_enable_rxd_adv_layout(struct hns3_hw *hw);
 int hns3_dev_rx_descriptor_status(void *rx_queue, uint16_t offset);
 int hns3_dev_tx_descriptor_status(void *tx_queue, uint16_t offset);
+void hns3_tx_push_init(struct rte_eth_dev *dev);
 
 #endif /* _HNS3_RXTX_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
index e5c7d69..74c848d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -84,7 +84,7 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
 	txq->next_to_use = next_to_use;
 	txq->tx_bd_ready -= nb_tx;
 
-	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
+	hns3_write_txq_tail_reg(txq, nb_tx);
 
 	return nb_tx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c b/drivers/net/hns3/hns3_rxtx_vec_sve.c
index bf7f704..d5c4933 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
+++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
@@ -475,7 +475,7 @@ hns3_xmit_fixed_burst_vec_sve(void *__restrict tx_queue,
 	txq->next_to_use += nb_pkts - nb_tx;
 
 	txq->tx_bd_ready -= nb_pkts;
-	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
+	hns3_write_txq_tail_reg(txq, nb_pkts);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH] net/hns3: support Tx push quick doorbell to improve perf
  2021-06-13  1:42 [dpdk-dev] [PATCH] net/hns3: support Tx push quick doorbell to improve perf Min Hu (Connor)
@ 2021-06-13  1:42 ` Min Hu (Connor)
  2021-06-14 14:35   ` Andrew Rybchenko
  2021-06-15  1:34   ` [dpdk-dev] [PATCH v2] " Min Hu (Connor)
  0 siblings, 2 replies; 9+ messages in thread
From: Min Hu (Connor) @ 2021-06-13  1:42 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit

From: Chengwen Feng <fengchengwen@huawei.com>

Kunpeng 930 support tx push mode which could improve performance, It
works like below:
1. Add pcie bar45 which support driver direct write the Tx descriptor
or tail reg to it.
2. Support three operations: a) direct write one Tx descriptor, b)
direct write two Tx descriptors, c) direct write tail reg.
3. The original tail reg located at bar23, the above bar45 tail reg
could provide better bandwidth from the hardware perspective.

The hns3 driver only support direct write tail reg (also have the name
of quick doorbell), the detail:
1. Considering compatibility, firmware will report tx push capa if the
hardware support it.
2. Add control macro RTE_LIBRTE_HNS3_ENABLE_TX_PUSH which was not
defined default.
3. If user define macro RTE_LIBRTE_HNS3_ENABLE_TX_PUSH and hardware
support, then driver will direct write bar45 tail reg to inform the
hardware.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c        |  4 +-
 drivers/net/hns3/hns3_ethdev.h        |  4 ++
 drivers/net/hns3/hns3_ethdev_vf.c     |  4 +-
 drivers/net/hns3/hns3_rxtx.c          | 73 ++++++++++++++++++++++++++++++++++-
 drivers/net/hns3/hns3_rxtx.h          | 19 +++++++++
 drivers/net/hns3/hns3_rxtx_vec_neon.h |  2 +-
 drivers/net/hns3/hns3_rxtx_vec_sve.c  |  2 +-
 7 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 2049130..467d770 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -5177,6 +5177,8 @@ hns3_init_pf(struct rte_eth_dev *eth_dev)
 		goto err_cmd_init;
 	}
 
+	hns3_tx_push_init(eth_dev);
+
 	/*
 	 * To ensure that the hardware environment is clean during
 	 * initialization, the driver actively clear the hardware environment
@@ -7422,8 +7424,8 @@ hns3_dev_init(struct rte_eth_dev *eth_dev)
 				     "process, ret = %d", ret);
 			goto err_mp_init_secondary;
 		}
-
 		hw->secondary_cnt++;
+		hns3_tx_push_init(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 1714eba..575bacd 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -862,6 +862,7 @@ enum {
 	HNS3_DEV_SUPPORT_COPPER_B,
 	HNS3_DEV_SUPPORT_FD_QUEUE_REGION_B,
 	HNS3_DEV_SUPPORT_PTP_B,
+	HNS3_DEV_SUPPORT_TX_PUSH_B,
 	HNS3_DEV_SUPPORT_INDEP_TXRX_B,
 	HNS3_DEV_SUPPORT_STASH_B,
 	HNS3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
@@ -900,6 +901,9 @@ enum {
 #define hns3_dev_ras_imp_supported(hw) \
 	hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_RAS_IMP_B)
 
+#define hns3_dev_tx_push_supported(hw) \
+		hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_TX_PUSH_B)
+
 #define HNS3_DEV_PRIVATE_TO_HW(adapter) \
 	(&((struct hns3_adapter *)adapter)->hw)
 #define HNS3_DEV_PRIVATE_TO_PF(adapter) \
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 41dd8ee..1e94a1e 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -1921,6 +1921,8 @@ hns3vf_init_vf(struct rte_eth_dev *eth_dev)
 		goto err_cmd_init;
 	}
 
+	hns3_tx_push_init(eth_dev);
+
 	/* Get VF resource */
 	ret = hns3_query_vf_resource(hw);
 	if (ret)
@@ -2928,8 +2930,8 @@ hns3vf_dev_init(struct rte_eth_dev *eth_dev)
 					  "process, ret = %d", ret);
 			goto err_mp_init_secondary;
 		}
-
 		hw->secondary_cnt++;
+		hns3_tx_push_init(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 1d7a769..88d4fc0 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -2892,6 +2892,69 @@ hns3_tx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_txconf *conf,
 	return 0;
 }
 
+static void *
+hns3_tx_push_get_queue_tail_reg(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+#define HNS3_TX_PUSH_TQP_REGION_SIZE		0x10000
+#define HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET	64
+#define HNS3_TX_PUSH_PCI_BAR_INDEX		4
+
+	struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev->device);
+	uint8_t bar_id = HNS3_TX_PUSH_PCI_BAR_INDEX;
+
+	/*
+	 * If device support tx push then its pcie bar45 must exist, and DPDK
+	 * framework will mmap the bar45 default in pci probe stage.
+	 *
+	 * In the bar45, the first half is for roce, and the second half is for
+	 * nic, every TQP occupy 64KB.
+	 *
+	 * The quick doorbell located at 64B offset in the TQP region.
+	 */
+	return (void *)((char *)pci_dev->mem_resource[bar_id].addr +
+			(pci_dev->mem_resource[bar_id].len >> 1) +
+			HNS3_TX_PUSH_TQP_REGION_SIZE * queue_id +
+			HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET);
+}
+
+void
+hns3_tx_push_init(struct rte_eth_dev *dev)
+{
+	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	volatile uint32_t *reg;
+	uint32_t val;
+
+	if (!hns3_dev_tx_push_supported(hw))
+		return;
+
+	reg = (volatile uint32_t *)hns3_tx_push_get_queue_tail_reg(dev, 0);
+	/*
+	 * Because the size of bar45 is about 8GB size, it may take a long time
+	 * to do the page fault in Tx process when work with vfio-pci, so use
+	 * one read operation to make kernel setup page table mapping for bar45
+	 * in the init stage.
+	 * Note: the bar45 is readable but the result is all 1.
+	 */
+	val = *reg;
+	RTE_SET_USED(val);
+}
+
+static void
+hns3_tx_push_queue_init(struct rte_eth_dev *dev,
+			uint16_t queue_id,
+			struct hns3_tx_queue *txq)
+{
+	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	if (!hns3_dev_tx_push_supported(hw)) {
+		txq->tx_push_enable = false;
+		return;
+	}
+
+	txq->io_tail_reg = (volatile void *)hns3_tx_push_get_queue_tail_reg(dev,
+						queue_id);
+	txq->tx_push_enable = true;
+}
+
 int
 hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_txconf *conf)
@@ -2983,6 +3046,12 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	memset(&txq->basic_stats, 0, sizeof(struct hns3_tx_basic_stats));
 	memset(&txq->dfx_stats, 0, sizeof(struct hns3_tx_dfx_stats));
 
+	/*
+	 * Call hns3_tx_push_queue_init after assigned io_tail_reg field because
+	 * it may overwrite the io_tail_reg field.
+	 */
+	hns3_tx_push_queue_init(dev, idx, txq);
+
 	rte_spinlock_lock(&hw->lock);
 	dev->data->tx_queues[idx] = txq;
 	rte_spinlock_unlock(&hw->lock);
@@ -4029,7 +4098,7 @@ hns3_xmit_pkts_simple(void *tx_queue,
 	hns3_tx_fill_hw_ring(txq, tx_pkts + nb_tx, nb_pkts - nb_tx);
 	txq->next_to_use += nb_pkts - nb_tx;
 
-	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
+	hns3_write_txq_tail_reg(txq, nb_pkts);
 
 	return nb_pkts;
 }
@@ -4146,7 +4215,7 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 end_of_tx:
 
 	if (likely(nb_tx))
-		hns3_write_reg_opt(txq->io_tail_reg, nb_hold);
+		hns3_write_txq_tail_reg(txq, nb_hold);
 
 	return nb_tx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index ba24e00..7419386 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -419,6 +419,7 @@ struct hns3_tx_dfx_stats {
 };
 
 struct hns3_tx_queue {
+	/* The io_tail_reg is write-only if working in tx push mode */
 	volatile void *io_tail_reg;
 	struct hns3_desc *tx_ring;
 	struct hns3_entry *sw_ring;
@@ -659,6 +660,23 @@ hns3_rx_calc_ptype(struct hns3_rx_queue *rxq, const uint32_t l234_info,
 		return ptype_tbl->l3table[l3id] | ptype_tbl->l4table[l4id];
 }
 
+/*
+ * If enable using tx push feature and also device support it, then use qick
+ * doorbell (bar45) to inform the hardware.
+ *
+ * The other cases (such as: device don't support or user don't enable using)
+ * then use normal doorbell (bar23) to inform the hardware.
+ */
+static inline void
+hns3_write_txq_tail_reg(struct hns3_tx_queue *txq, uint32_t value)
+{
+	rte_io_wmb();
+	if (txq->tx_push_enable)
+		rte_write64_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
+	else
+		rte_write32_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
+}
+
 void hns3_dev_rx_queue_release(void *queue);
 void hns3_dev_tx_queue_release(void *queue);
 void hns3_free_all_queues(struct rte_eth_dev *dev);
@@ -741,5 +759,6 @@ int hns3_tx_done_cleanup(void *txq, uint32_t free_cnt);
 void hns3_enable_rxd_adv_layout(struct hns3_hw *hw);
 int hns3_dev_rx_descriptor_status(void *rx_queue, uint16_t offset);
 int hns3_dev_tx_descriptor_status(void *tx_queue, uint16_t offset);
+void hns3_tx_push_init(struct rte_eth_dev *dev);
 
 #endif /* _HNS3_RXTX_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
index e5c7d69..74c848d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -84,7 +84,7 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
 	txq->next_to_use = next_to_use;
 	txq->tx_bd_ready -= nb_tx;
 
-	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
+	hns3_write_txq_tail_reg(txq, nb_tx);
 
 	return nb_tx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c b/drivers/net/hns3/hns3_rxtx_vec_sve.c
index bf7f704..d5c4933 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
+++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
@@ -475,7 +475,7 @@ hns3_xmit_fixed_burst_vec_sve(void *__restrict tx_queue,
 	txq->next_to_use += nb_pkts - nb_tx;
 
 	txq->tx_bd_ready -= nb_pkts;
-	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
+	hns3_write_txq_tail_reg(txq, nb_pkts);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH] net/hns3: support Tx push quick doorbell to improve perf
  2021-06-13  1:42 ` Min Hu (Connor)
@ 2021-06-14 14:35   ` Andrew Rybchenko
  2021-06-15  1:35     ` Min Hu (Connor)
  2021-06-15  1:34   ` [dpdk-dev] [PATCH v2] " Min Hu (Connor)
  1 sibling, 1 reply; 9+ messages in thread
From: Andrew Rybchenko @ 2021-06-14 14:35 UTC (permalink / raw)
  To: Min Hu (Connor), dev; +Cc: ferruh.yigit

Why is the same patch sent twice? Could you cleanup patchwork, please,
and set appropriate status?

On 6/13/21 4:42 AM, Min Hu (Connor) wrote:
> From: Chengwen Feng <fengchengwen@huawei.com>
> 
> Kunpeng 930 support tx push mode which could improve performance, It

tx -> Tx

> works like below:
> 1. Add pcie bar45 which support driver direct write the Tx descriptor

pcie -> PCIe (as an official abbreviation of the PCI express)

> or tail reg to it.
> 2. Support three operations: a) direct write one Tx descriptor, b)
> direct write two Tx descriptors, c) direct write tail reg.
> 3. The original tail reg located at bar23, the above bar45 tail reg
> could provide better bandwidth from the hardware perspective.
> 
> The hns3 driver only support direct write tail reg (also have the name
> of quick doorbell), the detail:
> 1. Considering compatibility, firmware will report tx push capa if the

tx -> Tx

> hardware support it.
> 2. Add control macro RTE_LIBRTE_HNS3_ENABLE_TX_PUSH which was not
> defined default.
> 3. If user define macro RTE_LIBRTE_HNS3_ENABLE_TX_PUSH and hardware
> support, then driver will direct write bar45 tail reg to inform the
> hardware.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
> ---
>   drivers/net/hns3/hns3_ethdev.c        |  4 +-
>   drivers/net/hns3/hns3_ethdev.h        |  4 ++
>   drivers/net/hns3/hns3_ethdev_vf.c     |  4 +-
>   drivers/net/hns3/hns3_rxtx.c          | 73 ++++++++++++++++++++++++++++++++++-
>   drivers/net/hns3/hns3_rxtx.h          | 19 +++++++++
>   drivers/net/hns3/hns3_rxtx_vec_neon.h |  2 +-
>   drivers/net/hns3/hns3_rxtx_vec_sve.c  |  2 +-
>   7 files changed, 102 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
> index 2049130..467d770 100644
> --- a/drivers/net/hns3/hns3_ethdev.c
> +++ b/drivers/net/hns3/hns3_ethdev.c
> @@ -5177,6 +5177,8 @@ hns3_init_pf(struct rte_eth_dev *eth_dev)
>   		goto err_cmd_init;
>   	}
>   
> +	hns3_tx_push_init(eth_dev);
> +
>   	/*
>   	 * To ensure that the hardware environment is clean during
>   	 * initialization, the driver actively clear the hardware environment
> @@ -7422,8 +7424,8 @@ hns3_dev_init(struct rte_eth_dev *eth_dev)
>   				     "process, ret = %d", ret);
>   			goto err_mp_init_secondary;
>   		}
> -
>   		hw->secondary_cnt++;
> +		hns3_tx_push_init(eth_dev);
>   		return 0;
>   	}
>   
> diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
> index 1714eba..575bacd 100644
> --- a/drivers/net/hns3/hns3_ethdev.h
> +++ b/drivers/net/hns3/hns3_ethdev.h
> @@ -862,6 +862,7 @@ enum {
>   	HNS3_DEV_SUPPORT_COPPER_B,
>   	HNS3_DEV_SUPPORT_FD_QUEUE_REGION_B,
>   	HNS3_DEV_SUPPORT_PTP_B,
> +	HNS3_DEV_SUPPORT_TX_PUSH_B,
>   	HNS3_DEV_SUPPORT_INDEP_TXRX_B,
>   	HNS3_DEV_SUPPORT_STASH_B,
>   	HNS3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
> @@ -900,6 +901,9 @@ enum {
>   #define hns3_dev_ras_imp_supported(hw) \
>   	hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_RAS_IMP_B)
>   
> +#define hns3_dev_tx_push_supported(hw) \
> +		hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_TX_PUSH_B)
> +
>   #define HNS3_DEV_PRIVATE_TO_HW(adapter) \
>   	(&((struct hns3_adapter *)adapter)->hw)
>   #define HNS3_DEV_PRIVATE_TO_PF(adapter) \
> diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
> index 41dd8ee..1e94a1e 100644
> --- a/drivers/net/hns3/hns3_ethdev_vf.c
> +++ b/drivers/net/hns3/hns3_ethdev_vf.c
> @@ -1921,6 +1921,8 @@ hns3vf_init_vf(struct rte_eth_dev *eth_dev)
>   		goto err_cmd_init;
>   	}
>   
> +	hns3_tx_push_init(eth_dev);
> +
>   	/* Get VF resource */
>   	ret = hns3_query_vf_resource(hw);
>   	if (ret)
> @@ -2928,8 +2930,8 @@ hns3vf_dev_init(struct rte_eth_dev *eth_dev)
>   					  "process, ret = %d", ret);
>   			goto err_mp_init_secondary;
>   		}
> -
>   		hw->secondary_cnt++;
> +		hns3_tx_push_init(eth_dev);
>   		return 0;
>   	}
>   
> diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
> index 1d7a769..88d4fc0 100644
> --- a/drivers/net/hns3/hns3_rxtx.c
> +++ b/drivers/net/hns3/hns3_rxtx.c
> @@ -2892,6 +2892,69 @@ hns3_tx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_txconf *conf,
>   	return 0;
>   }
>   
> +static void *
> +hns3_tx_push_get_queue_tail_reg(struct rte_eth_dev *dev, uint16_t queue_id)
> +{
> +#define HNS3_TX_PUSH_TQP_REGION_SIZE		0x10000
> +#define HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET	64
> +#define HNS3_TX_PUSH_PCI_BAR_INDEX		4
> +
> +	struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev->device);
> +	uint8_t bar_id = HNS3_TX_PUSH_PCI_BAR_INDEX;
> +
> +	/*
> +	 * If device support tx push then its pcie bar45 must exist, and DPDK

tx -> Tx

> +	 * framework will mmap the bar45 default in pci probe stage.

pci -> PCI

> +	 *
> +	 * In the bar45, the first half is for roce, and the second half is for

roce ?

> +	 * nic, every TQP occupy 64KB.


nic -> NIC

> +	 *
> +	 * The quick doorbell located at 64B offset in the TQP region.
> +	 */
> +	return (void *)((char *)pci_dev->mem_resource[bar_id].addr +
> +			(pci_dev->mem_resource[bar_id].len >> 1) +
> +			HNS3_TX_PUSH_TQP_REGION_SIZE * queue_id +
> +			HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET);
> +}
> +
> +void
> +hns3_tx_push_init(struct rte_eth_dev *dev)
> +{
> +	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +	volatile uint32_t *reg;
> +	uint32_t val;
> +
> +	if (!hns3_dev_tx_push_supported(hw))
> +		return;
> +
> +	reg = (volatile uint32_t *)hns3_tx_push_get_queue_tail_reg(dev, 0);
> +	/*
> +	 * Because the size of bar45 is about 8GB size, it may take a long time
> +	 * to do the page fault in Tx process when work with vfio-pci, so use
> +	 * one read operation to make kernel setup page table mapping for bar45
> +	 * in the init stage.
> +	 * Note: the bar45 is readable but the result is all 1.
> +	 */
> +	val = *reg;
> +	RTE_SET_USED(val);
> +}
> +
> +static void
> +hns3_tx_push_queue_init(struct rte_eth_dev *dev,
> +			uint16_t queue_id,
> +			struct hns3_tx_queue *txq)
> +{
> +	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +	if (!hns3_dev_tx_push_supported(hw)) {
> +		txq->tx_push_enable = false;
> +		return;
> +	}
> +
> +	txq->io_tail_reg = (volatile void *)hns3_tx_push_get_queue_tail_reg(dev,
> +						queue_id);
> +	txq->tx_push_enable = true;
> +}
> +
>   int
>   hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
>   		    unsigned int socket_id, const struct rte_eth_txconf *conf)
> @@ -2983,6 +3046,12 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
>   	memset(&txq->basic_stats, 0, sizeof(struct hns3_tx_basic_stats));
>   	memset(&txq->dfx_stats, 0, sizeof(struct hns3_tx_dfx_stats));
>   
> +	/*
> +	 * Call hns3_tx_push_queue_init after assigned io_tail_reg field because
> +	 * it may overwrite the io_tail_reg field.
> +	 */
> +	hns3_tx_push_queue_init(dev, idx, txq);
> +
>   	rte_spinlock_lock(&hw->lock);
>   	dev->data->tx_queues[idx] = txq;
>   	rte_spinlock_unlock(&hw->lock);
> @@ -4029,7 +4098,7 @@ hns3_xmit_pkts_simple(void *tx_queue,
>   	hns3_tx_fill_hw_ring(txq, tx_pkts + nb_tx, nb_pkts - nb_tx);
>   	txq->next_to_use += nb_pkts - nb_tx;
>   
> -	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
> +	hns3_write_txq_tail_reg(txq, nb_pkts);
>   
>   	return nb_pkts;
>   }
> @@ -4146,7 +4215,7 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
>   end_of_tx:
>   
>   	if (likely(nb_tx))
> -		hns3_write_reg_opt(txq->io_tail_reg, nb_hold);
> +		hns3_write_txq_tail_reg(txq, nb_hold);
>   
>   	return nb_tx;
>   }
> diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
> index ba24e00..7419386 100644
> --- a/drivers/net/hns3/hns3_rxtx.h
> +++ b/drivers/net/hns3/hns3_rxtx.h
> @@ -419,6 +419,7 @@ struct hns3_tx_dfx_stats {
>   };
>   
>   struct hns3_tx_queue {
> +	/* The io_tail_reg is write-only if working in tx push mode */
>   	volatile void *io_tail_reg;
>   	struct hns3_desc *tx_ring;
>   	struct hns3_entry *sw_ring;
> @@ -659,6 +660,23 @@ hns3_rx_calc_ptype(struct hns3_rx_queue *rxq, const uint32_t l234_info,
>   		return ptype_tbl->l3table[l3id] | ptype_tbl->l4table[l4id];
>   }
>   
> +/*
> + * If enable using tx push feature and also device support it, then use qick

tx -> Tx

qick -> quick ?

> + * doorbell (bar45) to inform the hardware.
> + *
> + * The other cases (such as: device don't support or user don't enable using)
> + * then use normal doorbell (bar23) to inform the hardware.
> + */
> +static inline void
> +hns3_write_txq_tail_reg(struct hns3_tx_queue *txq, uint32_t value)
> +{
> +	rte_io_wmb();
> +	if (txq->tx_push_enable)
> +		rte_write64_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
> +	else
> +		rte_write32_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
> +}
> +
>   void hns3_dev_rx_queue_release(void *queue);
>   void hns3_dev_tx_queue_release(void *queue);
>   void hns3_free_all_queues(struct rte_eth_dev *dev);
> @@ -741,5 +759,6 @@ int hns3_tx_done_cleanup(void *txq, uint32_t free_cnt);
>   void hns3_enable_rxd_adv_layout(struct hns3_hw *hw);
>   int hns3_dev_rx_descriptor_status(void *rx_queue, uint16_t offset);
>   int hns3_dev_tx_descriptor_status(void *tx_queue, uint16_t offset);
> +void hns3_tx_push_init(struct rte_eth_dev *dev);
>   
>   #endif /* _HNS3_RXTX_H_ */
> diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
> index e5c7d69..74c848d 100644
> --- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
> +++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
> @@ -84,7 +84,7 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
>   	txq->next_to_use = next_to_use;
>   	txq->tx_bd_ready -= nb_tx;
>   
> -	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
> +	hns3_write_txq_tail_reg(txq, nb_tx);
>   
>   	return nb_tx;
>   }
> diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c b/drivers/net/hns3/hns3_rxtx_vec_sve.c
> index bf7f704..d5c4933 100644
> --- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
> +++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
> @@ -475,7 +475,7 @@ hns3_xmit_fixed_burst_vec_sve(void *__restrict tx_queue,
>   	txq->next_to_use += nb_pkts - nb_tx;
>   
>   	txq->tx_bd_ready -= nb_pkts;
> -	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
> +	hns3_write_txq_tail_reg(txq, nb_pkts);
>   
>   	return nb_pkts;
>   }
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH v2] net/hns3: support Tx push quick doorbell to improve perf
  2021-06-13  1:42 ` Min Hu (Connor)
  2021-06-14 14:35   ` Andrew Rybchenko
@ 2021-06-15  1:34   ` Min Hu (Connor)
  2021-06-15  2:36     ` Stephen Hemminger
                       ` (2 more replies)
  1 sibling, 3 replies; 9+ messages in thread
From: Min Hu (Connor) @ 2021-06-15  1:34 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, andrew.rybchenko

From: Chengwen Feng <fengchengwen@huawei.com>

Kunpeng 930 support Tx push mode which could improve performance, It
works like below:
1. Add PCIe bar45 which support driver direct write the Tx descriptor
or tail reg to it.
2. Support three operations: a) direct write one Tx descriptor, b)
direct write two Tx descriptors, c) direct write tail reg.
3. The original tail reg located at bar23, the above bar45 tail reg
could provide better bandwidth from the hardware perspective.

The hns3 driver only support direct write tail reg (also have the name
of quick doorbell), the detail:
Considering compatibility, firmware will report Tx push capa if the
hardware support it.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
---
v2:
* change abbreviation style.
---
 drivers/net/hns3/hns3_ethdev.c        |  4 +-
 drivers/net/hns3/hns3_ethdev.h        |  4 ++
 drivers/net/hns3/hns3_ethdev_vf.c     |  4 +-
 drivers/net/hns3/hns3_rxtx.c          | 73 ++++++++++++++++++++++++++++++++++-
 drivers/net/hns3/hns3_rxtx.h          | 19 +++++++++
 drivers/net/hns3/hns3_rxtx_vec_neon.h |  2 +-
 drivers/net/hns3/hns3_rxtx_vec_sve.c  |  2 +-
 7 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 2049130..467d770 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -5177,6 +5177,8 @@ hns3_init_pf(struct rte_eth_dev *eth_dev)
 		goto err_cmd_init;
 	}
 
+	hns3_tx_push_init(eth_dev);
+
 	/*
 	 * To ensure that the hardware environment is clean during
 	 * initialization, the driver actively clear the hardware environment
@@ -7422,8 +7424,8 @@ hns3_dev_init(struct rte_eth_dev *eth_dev)
 				     "process, ret = %d", ret);
 			goto err_mp_init_secondary;
 		}
-
 		hw->secondary_cnt++;
+		hns3_tx_push_init(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 1714eba..575bacd 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -862,6 +862,7 @@ enum {
 	HNS3_DEV_SUPPORT_COPPER_B,
 	HNS3_DEV_SUPPORT_FD_QUEUE_REGION_B,
 	HNS3_DEV_SUPPORT_PTP_B,
+	HNS3_DEV_SUPPORT_TX_PUSH_B,
 	HNS3_DEV_SUPPORT_INDEP_TXRX_B,
 	HNS3_DEV_SUPPORT_STASH_B,
 	HNS3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
@@ -900,6 +901,9 @@ enum {
 #define hns3_dev_ras_imp_supported(hw) \
 	hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_RAS_IMP_B)
 
+#define hns3_dev_tx_push_supported(hw) \
+		hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_TX_PUSH_B)
+
 #define HNS3_DEV_PRIVATE_TO_HW(adapter) \
 	(&((struct hns3_adapter *)adapter)->hw)
 #define HNS3_DEV_PRIVATE_TO_PF(adapter) \
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 41dd8ee..1e94a1e 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -1921,6 +1921,8 @@ hns3vf_init_vf(struct rte_eth_dev *eth_dev)
 		goto err_cmd_init;
 	}
 
+	hns3_tx_push_init(eth_dev);
+
 	/* Get VF resource */
 	ret = hns3_query_vf_resource(hw);
 	if (ret)
@@ -2928,8 +2930,8 @@ hns3vf_dev_init(struct rte_eth_dev *eth_dev)
 					  "process, ret = %d", ret);
 			goto err_mp_init_secondary;
 		}
-
 		hw->secondary_cnt++;
+		hns3_tx_push_init(eth_dev);
 		return 0;
 	}
 
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 1d7a769..1fb16cd 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -2892,6 +2892,69 @@ hns3_tx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_txconf *conf,
 	return 0;
 }
 
+static void *
+hns3_tx_push_get_queue_tail_reg(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+#define HNS3_TX_PUSH_TQP_REGION_SIZE		0x10000
+#define HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET	64
+#define HNS3_TX_PUSH_PCI_BAR_INDEX		4
+
+	struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev->device);
+	uint8_t bar_id = HNS3_TX_PUSH_PCI_BAR_INDEX;
+
+	/*
+	 * If device support Tx push then its PCIe bar45 must exist, and DPDK
+	 * framework will mmap the bar45 default in pci probe stage.
+	 *
+	 * In the bar45, the first half is for roce(RDMA over Converged
+	 * Ethernet), and the second half is for NIC, every TQP occupy 64KB.
+	 *
+	 * The quick doorbell located at 64B offset in the TQP region.
+	 */
+	return (void *)((char *)pci_dev->mem_resource[bar_id].addr +
+			(pci_dev->mem_resource[bar_id].len >> 1) +
+			HNS3_TX_PUSH_TQP_REGION_SIZE * queue_id +
+			HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET);
+}
+
+void
+hns3_tx_push_init(struct rte_eth_dev *dev)
+{
+	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	volatile uint32_t *reg;
+	uint32_t val;
+
+	if (!hns3_dev_tx_push_supported(hw))
+		return;
+
+	reg = (volatile uint32_t *)hns3_tx_push_get_queue_tail_reg(dev, 0);
+	/*
+	 * Because the size of bar45 is about 8GB size, it may take a long time
+	 * to do the page fault in Tx process when work with vfio-pci, so use
+	 * one read operation to make kernel setup page table mapping for bar45
+	 * in the init stage.
+	 * Note: the bar45 is readable but the result is all 1.
+	 */
+	val = *reg;
+	RTE_SET_USED(val);
+}
+
+static void
+hns3_tx_push_queue_init(struct rte_eth_dev *dev,
+			uint16_t queue_id,
+			struct hns3_tx_queue *txq)
+{
+	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	if (!hns3_dev_tx_push_supported(hw)) {
+		txq->tx_push_enable = false;
+		return;
+	}
+
+	txq->io_tail_reg = (volatile void *)hns3_tx_push_get_queue_tail_reg(dev,
+						queue_id);
+	txq->tx_push_enable = true;
+}
+
 int
 hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_txconf *conf)
@@ -2983,6 +3046,12 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	memset(&txq->basic_stats, 0, sizeof(struct hns3_tx_basic_stats));
 	memset(&txq->dfx_stats, 0, sizeof(struct hns3_tx_dfx_stats));
 
+	/*
+	 * Call hns3_tx_push_queue_init after assigned io_tail_reg field because
+	 * it may overwrite the io_tail_reg field.
+	 */
+	hns3_tx_push_queue_init(dev, idx, txq);
+
 	rte_spinlock_lock(&hw->lock);
 	dev->data->tx_queues[idx] = txq;
 	rte_spinlock_unlock(&hw->lock);
@@ -4029,7 +4098,7 @@ hns3_xmit_pkts_simple(void *tx_queue,
 	hns3_tx_fill_hw_ring(txq, tx_pkts + nb_tx, nb_pkts - nb_tx);
 	txq->next_to_use += nb_pkts - nb_tx;
 
-	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
+	hns3_write_txq_tail_reg(txq, nb_pkts);
 
 	return nb_pkts;
 }
@@ -4146,7 +4215,7 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 end_of_tx:
 
 	if (likely(nb_tx))
-		hns3_write_reg_opt(txq->io_tail_reg, nb_hold);
+		hns3_write_txq_tail_reg(txq, nb_hold);
 
 	return nb_tx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index ba24e00..56c1b80 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -419,6 +419,7 @@ struct hns3_tx_dfx_stats {
 };
 
 struct hns3_tx_queue {
+	/* The io_tail_reg is write-only if working in tx push mode */
 	volatile void *io_tail_reg;
 	struct hns3_desc *tx_ring;
 	struct hns3_entry *sw_ring;
@@ -659,6 +660,23 @@ hns3_rx_calc_ptype(struct hns3_rx_queue *rxq, const uint32_t l234_info,
 		return ptype_tbl->l3table[l3id] | ptype_tbl->l4table[l4id];
 }
 
+/*
+ * If enable using Tx push feature and also device support it, then use quick
+ * doorbell (bar45) to inform the hardware.
+ *
+ * The other cases (such as: device don't support or user don't enable using)
+ * then use normal doorbell (bar23) to inform the hardware.
+ */
+static inline void
+hns3_write_txq_tail_reg(struct hns3_tx_queue *txq, uint32_t value)
+{
+	rte_io_wmb();
+	if (txq->tx_push_enable)
+		rte_write64_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
+	else
+		rte_write32_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
+}
+
 void hns3_dev_rx_queue_release(void *queue);
 void hns3_dev_tx_queue_release(void *queue);
 void hns3_free_all_queues(struct rte_eth_dev *dev);
@@ -741,5 +759,6 @@ int hns3_tx_done_cleanup(void *txq, uint32_t free_cnt);
 void hns3_enable_rxd_adv_layout(struct hns3_hw *hw);
 int hns3_dev_rx_descriptor_status(void *rx_queue, uint16_t offset);
 int hns3_dev_tx_descriptor_status(void *tx_queue, uint16_t offset);
+void hns3_tx_push_init(struct rte_eth_dev *dev);
 
 #endif /* _HNS3_RXTX_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
index e5c7d69..74c848d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -84,7 +84,7 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
 	txq->next_to_use = next_to_use;
 	txq->tx_bd_ready -= nb_tx;
 
-	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
+	hns3_write_txq_tail_reg(txq, nb_tx);
 
 	return nb_tx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c b/drivers/net/hns3/hns3_rxtx_vec_sve.c
index bf7f704..d5c4933 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
+++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
@@ -475,7 +475,7 @@ hns3_xmit_fixed_burst_vec_sve(void *__restrict tx_queue,
 	txq->next_to_use += nb_pkts - nb_tx;
 
 	txq->tx_bd_ready -= nb_pkts;
-	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
+	hns3_write_txq_tail_reg(txq, nb_pkts);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH] net/hns3: support Tx push quick doorbell to improve perf
  2021-06-14 14:35   ` Andrew Rybchenko
@ 2021-06-15  1:35     ` Min Hu (Connor)
  0 siblings, 0 replies; 9+ messages in thread
From: Min Hu (Connor) @ 2021-06-15  1:35 UTC (permalink / raw)
  To: Andrew Rybchenko, dev; +Cc: ferruh.yigit

fixed in v2, thanks.

在 2021/6/14 22:35, Andrew Rybchenko 写道:
> Why is the same patch sent twice? Could you cleanup patchwork, please,
> and set appropriate status?
> 
> On 6/13/21 4:42 AM, Min Hu (Connor) wrote:
>> From: Chengwen Feng <fengchengwen@huawei.com>
>>
>> Kunpeng 930 support tx push mode which could improve performance, It
> 
> tx -> Tx
> 
>> works like below:
>> 1. Add pcie bar45 which support driver direct write the Tx descriptor
> 
> pcie -> PCIe (as an official abbreviation of the PCI express)
> 
>> or tail reg to it.
>> 2. Support three operations: a) direct write one Tx descriptor, b)
>> direct write two Tx descriptors, c) direct write tail reg.
>> 3. The original tail reg located at bar23, the above bar45 tail reg
>> could provide better bandwidth from the hardware perspective.
>>
>> The hns3 driver only support direct write tail reg (also have the name
>> of quick doorbell), the detail:
>> 1. Considering compatibility, firmware will report tx push capa if the
> 
> tx -> Tx
> 
>> hardware support it.
>> 2. Add control macro RTE_LIBRTE_HNS3_ENABLE_TX_PUSH which was not
>> defined default.
>> 3. If user define macro RTE_LIBRTE_HNS3_ENABLE_TX_PUSH and hardware
>> support, then driver will direct write bar45 tail reg to inform the
>> hardware.
>>
>> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
>> Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
>> ---
>>   drivers/net/hns3/hns3_ethdev.c        |  4 +-
>>   drivers/net/hns3/hns3_ethdev.h        |  4 ++
>>   drivers/net/hns3/hns3_ethdev_vf.c     |  4 +-
>>   drivers/net/hns3/hns3_rxtx.c          | 73 
>> ++++++++++++++++++++++++++++++++++-
>>   drivers/net/hns3/hns3_rxtx.h          | 19 +++++++++
>>   drivers/net/hns3/hns3_rxtx_vec_neon.h |  2 +-
>>   drivers/net/hns3/hns3_rxtx_vec_sve.c  |  2 +-
>>   7 files changed, 102 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/net/hns3/hns3_ethdev.c 
>> b/drivers/net/hns3/hns3_ethdev.c
>> index 2049130..467d770 100644
>> --- a/drivers/net/hns3/hns3_ethdev.c
>> +++ b/drivers/net/hns3/hns3_ethdev.c
>> @@ -5177,6 +5177,8 @@ hns3_init_pf(struct rte_eth_dev *eth_dev)
>>           goto err_cmd_init;
>>       }
>> +    hns3_tx_push_init(eth_dev);
>> +
>>       /*
>>        * To ensure that the hardware environment is clean during
>>        * initialization, the driver actively clear the hardware 
>> environment
>> @@ -7422,8 +7424,8 @@ hns3_dev_init(struct rte_eth_dev *eth_dev)
>>                        "process, ret = %d", ret);
>>               goto err_mp_init_secondary;
>>           }
>> -
>>           hw->secondary_cnt++;
>> +        hns3_tx_push_init(eth_dev);
>>           return 0;
>>       }
>> diff --git a/drivers/net/hns3/hns3_ethdev.h 
>> b/drivers/net/hns3/hns3_ethdev.h
>> index 1714eba..575bacd 100644
>> --- a/drivers/net/hns3/hns3_ethdev.h
>> +++ b/drivers/net/hns3/hns3_ethdev.h
>> @@ -862,6 +862,7 @@ enum {
>>       HNS3_DEV_SUPPORT_COPPER_B,
>>       HNS3_DEV_SUPPORT_FD_QUEUE_REGION_B,
>>       HNS3_DEV_SUPPORT_PTP_B,
>> +    HNS3_DEV_SUPPORT_TX_PUSH_B,
>>       HNS3_DEV_SUPPORT_INDEP_TXRX_B,
>>       HNS3_DEV_SUPPORT_STASH_B,
>>       HNS3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
>> @@ -900,6 +901,9 @@ enum {
>>   #define hns3_dev_ras_imp_supported(hw) \
>>       hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_RAS_IMP_B)
>> +#define hns3_dev_tx_push_supported(hw) \
>> +        hns3_get_bit((hw)->capability, HNS3_DEV_SUPPORT_TX_PUSH_B)
>> +
>>   #define HNS3_DEV_PRIVATE_TO_HW(adapter) \
>>       (&((struct hns3_adapter *)adapter)->hw)
>>   #define HNS3_DEV_PRIVATE_TO_PF(adapter) \
>> diff --git a/drivers/net/hns3/hns3_ethdev_vf.c 
>> b/drivers/net/hns3/hns3_ethdev_vf.c
>> index 41dd8ee..1e94a1e 100644
>> --- a/drivers/net/hns3/hns3_ethdev_vf.c
>> +++ b/drivers/net/hns3/hns3_ethdev_vf.c
>> @@ -1921,6 +1921,8 @@ hns3vf_init_vf(struct rte_eth_dev *eth_dev)
>>           goto err_cmd_init;
>>       }
>> +    hns3_tx_push_init(eth_dev);
>> +
>>       /* Get VF resource */
>>       ret = hns3_query_vf_resource(hw);
>>       if (ret)
>> @@ -2928,8 +2930,8 @@ hns3vf_dev_init(struct rte_eth_dev *eth_dev)
>>                         "process, ret = %d", ret);
>>               goto err_mp_init_secondary;
>>           }
>> -
>>           hw->secondary_cnt++;
>> +        hns3_tx_push_init(eth_dev);
>>           return 0;
>>       }
>> diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
>> index 1d7a769..88d4fc0 100644
>> --- a/drivers/net/hns3/hns3_rxtx.c
>> +++ b/drivers/net/hns3/hns3_rxtx.c
>> @@ -2892,6 +2892,69 @@ hns3_tx_queue_conf_check(struct hns3_hw *hw, 
>> const struct rte_eth_txconf *conf,
>>       return 0;
>>   }
>> +static void *
>> +hns3_tx_push_get_queue_tail_reg(struct rte_eth_dev *dev, uint16_t 
>> queue_id)
>> +{
>> +#define HNS3_TX_PUSH_TQP_REGION_SIZE        0x10000
>> +#define HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET    64
>> +#define HNS3_TX_PUSH_PCI_BAR_INDEX        4
>> +
>> +    struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev->device);
>> +    uint8_t bar_id = HNS3_TX_PUSH_PCI_BAR_INDEX;
>> +
>> +    /*
>> +     * If device support tx push then its pcie bar45 must exist, and 
>> DPDK
> 
> tx -> Tx
> 
>> +     * framework will mmap the bar45 default in pci probe stage.
> 
> pci -> PCI
> 
>> +     *
>> +     * In the bar45, the first half is for roce, and the second half 
>> is for
> 
> roce ?
> 
>> +     * nic, every TQP occupy 64KB.
> 
> 
> nic -> NIC
> 
>> +     *
>> +     * The quick doorbell located at 64B offset in the TQP region.
>> +     */
>> +    return (void *)((char *)pci_dev->mem_resource[bar_id].addr +
>> +            (pci_dev->mem_resource[bar_id].len >> 1) +
>> +            HNS3_TX_PUSH_TQP_REGION_SIZE * queue_id +
>> +            HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET);
>> +}
>> +
>> +void
>> +hns3_tx_push_init(struct rte_eth_dev *dev)
>> +{
>> +    struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
>> +    volatile uint32_t *reg;
>> +    uint32_t val;
>> +
>> +    if (!hns3_dev_tx_push_supported(hw))
>> +        return;
>> +
>> +    reg = (volatile uint32_t *)hns3_tx_push_get_queue_tail_reg(dev, 0);
>> +    /*
>> +     * Because the size of bar45 is about 8GB size, it may take a 
>> long time
>> +     * to do the page fault in Tx process when work with vfio-pci, so 
>> use
>> +     * one read operation to make kernel setup page table mapping for 
>> bar45
>> +     * in the init stage.
>> +     * Note: the bar45 is readable but the result is all 1.
>> +     */
>> +    val = *reg;
>> +    RTE_SET_USED(val);
>> +}
>> +
>> +static void
>> +hns3_tx_push_queue_init(struct rte_eth_dev *dev,
>> +            uint16_t queue_id,
>> +            struct hns3_tx_queue *txq)
>> +{
>> +    struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
>> +    if (!hns3_dev_tx_push_supported(hw)) {
>> +        txq->tx_push_enable = false;
>> +        return;
>> +    }
>> +
>> +    txq->io_tail_reg = (volatile void 
>> *)hns3_tx_push_get_queue_tail_reg(dev,
>> +                        queue_id);
>> +    txq->tx_push_enable = true;
>> +}
>> +
>>   int
>>   hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t 
>> nb_desc,
>>               unsigned int socket_id, const struct rte_eth_txconf *conf)
>> @@ -2983,6 +3046,12 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, 
>> uint16_t idx, uint16_t nb_desc,
>>       memset(&txq->basic_stats, 0, sizeof(struct hns3_tx_basic_stats));
>>       memset(&txq->dfx_stats, 0, sizeof(struct hns3_tx_dfx_stats));
>> +    /*
>> +     * Call hns3_tx_push_queue_init after assigned io_tail_reg field 
>> because
>> +     * it may overwrite the io_tail_reg field.
>> +     */
>> +    hns3_tx_push_queue_init(dev, idx, txq);
>> +
>>       rte_spinlock_lock(&hw->lock);
>>       dev->data->tx_queues[idx] = txq;
>>       rte_spinlock_unlock(&hw->lock);
>> @@ -4029,7 +4098,7 @@ hns3_xmit_pkts_simple(void *tx_queue,
>>       hns3_tx_fill_hw_ring(txq, tx_pkts + nb_tx, nb_pkts - nb_tx);
>>       txq->next_to_use += nb_pkts - nb_tx;
>> -    hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
>> +    hns3_write_txq_tail_reg(txq, nb_pkts);
>>       return nb_pkts;
>>   }
>> @@ -4146,7 +4215,7 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf 
>> **tx_pkts, uint16_t nb_pkts)
>>   end_of_tx:
>>       if (likely(nb_tx))
>> -        hns3_write_reg_opt(txq->io_tail_reg, nb_hold);
>> +        hns3_write_txq_tail_reg(txq, nb_hold);
>>       return nb_tx;
>>   }
>> diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
>> index ba24e00..7419386 100644
>> --- a/drivers/net/hns3/hns3_rxtx.h
>> +++ b/drivers/net/hns3/hns3_rxtx.h
>> @@ -419,6 +419,7 @@ struct hns3_tx_dfx_stats {
>>   };
>>   struct hns3_tx_queue {
>> +    /* The io_tail_reg is write-only if working in tx push mode */
>>       volatile void *io_tail_reg;
>>       struct hns3_desc *tx_ring;
>>       struct hns3_entry *sw_ring;
>> @@ -659,6 +660,23 @@ hns3_rx_calc_ptype(struct hns3_rx_queue *rxq, 
>> const uint32_t l234_info,
>>           return ptype_tbl->l3table[l3id] | ptype_tbl->l4table[l4id];
>>   }
>> +/*
>> + * If enable using tx push feature and also device support it, then 
>> use qick
> 
> tx -> Tx
> 
> qick -> quick ?
> 
>> + * doorbell (bar45) to inform the hardware.
>> + *
>> + * The other cases (such as: device don't support or user don't 
>> enable using)
>> + * then use normal doorbell (bar23) to inform the hardware.
>> + */
>> +static inline void
>> +hns3_write_txq_tail_reg(struct hns3_tx_queue *txq, uint32_t value)
>> +{
>> +    rte_io_wmb();
>> +    if (txq->tx_push_enable)
>> +        rte_write64_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
>> +    else
>> +        rte_write32_relaxed(rte_cpu_to_le_32(value), txq->io_tail_reg);
>> +}
>> +
>>   void hns3_dev_rx_queue_release(void *queue);
>>   void hns3_dev_tx_queue_release(void *queue);
>>   void hns3_free_all_queues(struct rte_eth_dev *dev);
>> @@ -741,5 +759,6 @@ int hns3_tx_done_cleanup(void *txq, uint32_t 
>> free_cnt);
>>   void hns3_enable_rxd_adv_layout(struct hns3_hw *hw);
>>   int hns3_dev_rx_descriptor_status(void *rx_queue, uint16_t offset);
>>   int hns3_dev_tx_descriptor_status(void *tx_queue, uint16_t offset);
>> +void hns3_tx_push_init(struct rte_eth_dev *dev);
>>   #endif /* _HNS3_RXTX_H_ */
>> diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h 
>> b/drivers/net/hns3/hns3_rxtx_vec_neon.h
>> index e5c7d69..74c848d 100644
>> --- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
>> +++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
>> @@ -84,7 +84,7 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
>>       txq->next_to_use = next_to_use;
>>       txq->tx_bd_ready -= nb_tx;
>> -    hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
>> +    hns3_write_txq_tail_reg(txq, nb_tx);
>>       return nb_tx;
>>   }
>> diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c 
>> b/drivers/net/hns3/hns3_rxtx_vec_sve.c
>> index bf7f704..d5c4933 100644
>> --- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
>> +++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
>> @@ -475,7 +475,7 @@ hns3_xmit_fixed_burst_vec_sve(void *__restrict 
>> tx_queue,
>>       txq->next_to_use += nb_pkts - nb_tx;
>>       txq->tx_bd_ready -= nb_pkts;
>> -    hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
>> +    hns3_write_txq_tail_reg(txq, nb_pkts);
>>       return nb_pkts;
>>   }
>>
> 
> .

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/hns3: support Tx push quick doorbell to improve perf
  2021-06-15  1:34   ` [dpdk-dev] [PATCH v2] " Min Hu (Connor)
@ 2021-06-15  2:36     ` Stephen Hemminger
  2021-06-15  2:37     ` Stephen Hemminger
  2021-06-17 15:18     ` Andrew Rybchenko
  2 siblings, 0 replies; 9+ messages in thread
From: Stephen Hemminger @ 2021-06-15  2:36 UTC (permalink / raw)
  To: Min Hu (Connor); +Cc: dev, ferruh.yigit, andrew.rybchenko

On Tue, 15 Jun 2021 09:34:29 +0800
"Min Hu (Connor)" <humin29@huawei.com> wrote:

> +	return (void *)((char *)pci_dev->mem_resource[bar_id].addr +
> +			(pci_dev->mem_resource[bar_id].len >> 1) +
> +			HNS3_TX_PUSH_TQP_REGION_SIZE * queue_id +
> +			HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET);

Minor nit: cast here (in C) is unnecessary. C will automatically
cast the return any pointer type to void *

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/hns3: support Tx push quick doorbell to improve perf
  2021-06-15  1:34   ` [dpdk-dev] [PATCH v2] " Min Hu (Connor)
  2021-06-15  2:36     ` Stephen Hemminger
@ 2021-06-15  2:37     ` Stephen Hemminger
  2021-06-15  3:46       ` Min Hu (Connor)
  2021-06-17 15:18     ` Andrew Rybchenko
  2 siblings, 1 reply; 9+ messages in thread
From: Stephen Hemminger @ 2021-06-15  2:37 UTC (permalink / raw)
  To: Min Hu (Connor); +Cc: dev, ferruh.yigit, andrew.rybchenko

On Tue, 15 Jun 2021 09:34:29 +0800
"Min Hu (Connor)" <humin29@huawei.com> wrote:

> +void
> +hns3_tx_push_init(struct rte_eth_dev *dev)
> +{
> +	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +	volatile uint32_t *reg;
> +	uint32_t val;
> +
> +	if (!hns3_dev_tx_push_supported(hw))
> +		return;
> +
> +	reg = (volatile uint32_t *)hns3_tx_push_get_queue_tail_reg(dev, 0);

Better to use proper barrier function than using volatile.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/hns3: support Tx push quick doorbell to improve perf
  2021-06-15  2:37     ` Stephen Hemminger
@ 2021-06-15  3:46       ` Min Hu (Connor)
  0 siblings, 0 replies; 9+ messages in thread
From: Min Hu (Connor) @ 2021-06-15  3:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, ferruh.yigit, andrew.rybchenko

Hi, Stephen,
Volatile is used here to prevent compiler optimization (deleting the reg
write operation).
By the way, this is an initialization process and does not involve
multi-thread synchronization.
Therefore no need to use the barrier.

在 2021/6/15 10:37, Stephen Hemminger 写道:
> On Tue, 15 Jun 2021 09:34:29 +0800
> "Min Hu (Connor)" <humin29@huawei.com> wrote:
> 
>> +void
>> +hns3_tx_push_init(struct rte_eth_dev *dev)
>> +{
>> +	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
>> +	volatile uint32_t *reg;
>> +	uint32_t val;
>> +
>> +	if (!hns3_dev_tx_push_supported(hw))
>> +		return;
>> +
>> +	reg = (volatile uint32_t *)hns3_tx_push_get_queue_tail_reg(dev, 0);
> 
> Better to use proper barrier function than using volatile.
> .
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/hns3: support Tx push quick doorbell to improve perf
  2021-06-15  1:34   ` [dpdk-dev] [PATCH v2] " Min Hu (Connor)
  2021-06-15  2:36     ` Stephen Hemminger
  2021-06-15  2:37     ` Stephen Hemminger
@ 2021-06-17 15:18     ` Andrew Rybchenko
  2 siblings, 0 replies; 9+ messages in thread
From: Andrew Rybchenko @ 2021-06-17 15:18 UTC (permalink / raw)
  To: Min Hu (Connor), dev; +Cc: ferruh.yigit, Stephen Hemminger

On 6/15/21 4:34 AM, Min Hu (Connor) wrote:
> From: Chengwen Feng <fengchengwen@huawei.com>
> 
> Kunpeng 930 support Tx push mode which could improve performance, It
> works like below:
> 1. Add PCIe bar45 which support driver direct write the Tx descriptor
> or tail reg to it.
> 2. Support three operations: a) direct write one Tx descriptor, b)
> direct write two Tx descriptors, c) direct write tail reg.
> 3. The original tail reg located at bar23, the above bar45 tail reg
> could provide better bandwidth from the hardware perspective.
> 
> The hns3 driver only support direct write tail reg (also have the name
> of quick doorbell), the detail:
> Considering compatibility, firmware will report Tx push capa if the
> hardware support it.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> Signed-off-by: Min Hu (Connor) <humin29@huawei.com>

With description mangled a bit and few minor fixes
described below.

Applied, thanks.

[snip]

> diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
> index 1d7a769..1fb16cd 100644
> --- a/drivers/net/hns3/hns3_rxtx.c
> +++ b/drivers/net/hns3/hns3_rxtx.c
> @@ -2892,6 +2892,69 @@ hns3_tx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_txconf *conf,
>  	return 0;
>  }
>  
> +static void *
> +hns3_tx_push_get_queue_tail_reg(struct rte_eth_dev *dev, uint16_t queue_id)
> +{
> +#define HNS3_TX_PUSH_TQP_REGION_SIZE		0x10000
> +#define HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET	64
> +#define HNS3_TX_PUSH_PCI_BAR_INDEX		4
> +
> +	struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(dev->device);
> +	uint8_t bar_id = HNS3_TX_PUSH_PCI_BAR_INDEX;
> +
> +	/*
> +	 * If device support Tx push then its PCIe bar45 must exist, and DPDK
> +	 * framework will mmap the bar45 default in pci probe stage.

pci -> PCI

> +	 *
> +	 * In the bar45, the first half is for roce(RDMA over Converged

roce -> RoCE

> +	 * Ethernet), and the second half is for NIC, every TQP occupy 64KB.
> +	 *
> +	 * The quick doorbell located at 64B offset in the TQP region.
> +	 */
> +	return (void *)((char *)pci_dev->mem_resource[bar_id].addr +
> +			(pci_dev->mem_resource[bar_id].len >> 1) +
> +			HNS3_TX_PUSH_TQP_REGION_SIZE * queue_id +
> +			HNS3_TX_PUSH_QUICK_DOORBELL_OFFSET);

Remove unnecessary type cast to 'void *'.

[snip]

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-06-17 15:18 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-13  1:42 [dpdk-dev] [PATCH] net/hns3: support Tx push quick doorbell to improve perf Min Hu (Connor)
2021-06-13  1:42 ` Min Hu (Connor)
2021-06-14 14:35   ` Andrew Rybchenko
2021-06-15  1:35     ` Min Hu (Connor)
2021-06-15  1:34   ` [dpdk-dev] [PATCH v2] " Min Hu (Connor)
2021-06-15  2:36     ` Stephen Hemminger
2021-06-15  2:37     ` Stephen Hemminger
2021-06-15  3:46       ` Min Hu (Connor)
2021-06-17 15:18     ` Andrew Rybchenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).