DPDK patches and discussions
* [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx
@ 2020-09-07  9:08 Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 1/8] net/hns3: report Rx free threshold Wei Hu (Xavier)
                   ` (8 more replies)
  0 siblings, 9 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

This series contains updates for the Rx/Tx process.

Chengchang Tang (1):
  net/hns3: fix segfault when Tx multiple buffer packets

Wei Hu (Xavier) (7):
  net/hns3: report Rx free threshold
  net/hns3: reduce address calculation in Rx
  net/hns3: add simple Rx process function
  net/hns3: add simple Tx process function
  net/hns3: add vector Tx burst with NEON instructions
  net/hns3: add vector Rx burst with NEON instructions
  net/hns3: add restriction on setting VF MTU

 config/common_base                    |    1 +
 config/common_linux                   |    1 +
 drivers/net/hns3/Makefile             |    5 +
 drivers/net/hns3/hns3_ethdev.c        |   18 +-
 drivers/net/hns3/hns3_ethdev.h        |   54 +-
 drivers/net/hns3/hns3_ethdev_vf.c     |   41 +-
 drivers/net/hns3/hns3_rxtx.c          | 1006 +++++++++++++++++++++++----------
 drivers/net/hns3/hns3_rxtx.h          |  227 +++++++-
 drivers/net/hns3/hns3_rxtx_vec.c      |  214 +++++++
 drivers/net/hns3/hns3_rxtx_vec.h      |   77 +++
 drivers/net/hns3/hns3_rxtx_vec_neon.h |  284 ++++++++++
 drivers/net/hns3/meson.build          |    4 +
 12 files changed, 1631 insertions(+), 301 deletions(-)
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec.c
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec.h
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec_neon.h

-- 
2.9.5



* [dpdk-dev] [PATCH 1/8] net/hns3: report Rx free threshold
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
@ 2020-09-07  9:08 ` Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 2/8] net/hns3: reduce address calculation in Rx Wei Hu (Xavier)
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch reports the .rx_free_thresh value in the .dev_infos_get ops
implementation functions named hns3_dev_infos_get and hns3vf_dev_infos_get.
In addition, a member variable of struct hns3_rx_queue is renamed and
comments are added to improve code readability.
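
As an illustration of how an application can consume the newly reported
default, here is a minimal sketch (the helper name, the mbuf pool and the
1024 descriptor count are illustrative, not part of this patch):

#include <stdio.h>
#include <rte_ethdev.h>
#include <rte_lcore.h>
#include <rte_mempool.h>

/* Set up one Rx queue reusing the defaults reported by .dev_infos_get. */
static int
setup_rxq_with_reported_defaults(uint16_t port_id, uint16_t queue_id,
				 struct rte_mempool *mb_pool)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_rxconf rxconf;
	int ret;

	ret = rte_eth_dev_info_get(port_id, &dev_info);
	if (ret != 0)
		return ret;

	/* hns3 now fills rx_free_thresh (HNS3_DEFAULT_RX_FREE_THRESH == 32). */
	rxconf = dev_info.default_rxconf;
	printf("default rx_free_thresh: %u, rx_drop_en: %u\n",
	       rxconf.rx_free_thresh, rxconf.rx_drop_en);

	return rte_eth_rx_queue_setup(port_id, queue_id, 1024,
				      rte_socket_id(), &rxconf, mb_pool);
}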

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c    |  2 ++
 drivers/net/hns3/hns3_ethdev_vf.c |  2 ++
 drivers/net/hns3/hns3_rxtx.c      | 30 ++++++++++++------------------
 drivers/net/hns3/hns3_rxtx.h      |  9 ++++++---
 4 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 4d5fa94..6fa34e8 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2501,12 +2501,14 @@ hns3_dev_infos_get(struct rte_eth_dev *eth_dev, struct rte_eth_dev_info *info)
 	};
 
 	info->default_rxconf = (struct rte_eth_rxconf) {
+		.rx_free_thresh = HNS3_DEFAULT_RX_FREE_THRESH,
 		/*
 		 * If there are no available Rx buffer descriptors, incoming
 		 * packets are always dropped by hardware based on hns3 network
 		 * engine.
 		 */
 		.rx_drop_en = 1,
+		.offloads = 0,
 	};
 
 	info->vmdq_queue_num = 0;
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 7fd0e6a..7226cc5 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -944,12 +944,14 @@ hns3vf_dev_infos_get(struct rte_eth_dev *eth_dev, struct rte_eth_dev_info *info)
 	};
 
 	info->default_rxconf = (struct rte_eth_rxconf) {
+		.rx_free_thresh = HNS3_DEFAULT_RX_FREE_THRESH,
 		/*
 		 * If there are no available Rx buffer descriptors, incoming
 		 * packets are always dropped by hardware based on hns3 network
 		 * engine.
 		 */
 		.rx_drop_en = 1,
+		.offloads = 0,
 	};
 
 	info->vmdq_queue_num = 0;
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 308d0a6..fe2a7a4 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -652,8 +652,7 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	}
 
 	rxq->next_to_use = 0;
-	rxq->next_to_clean = 0;
-	rxq->nb_rx_hold = 0;
+	rxq->rx_free_hold = 0;
 	hns3_init_rx_queue_hw(rxq);
 
 	return 0;
@@ -667,8 +666,7 @@ hns3_fake_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 
 	rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
 	rxq->next_to_use = 0;
-	rxq->next_to_clean = 0;
-	rxq->nb_rx_hold = 0;
+	rxq->rx_free_hold = 0;
 	hns3_init_rx_queue_hw(rxq);
 }
 
@@ -1303,10 +1301,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 
 	rxq->hns = hns;
 	rxq->mb_pool = mp;
-	if (conf->rx_free_thresh <= 0)
-		rxq->rx_free_thresh = DEFAULT_RX_FREE_THRESH;
-	else
-		rxq->rx_free_thresh = conf->rx_free_thresh;
+	rxq->rx_free_thresh = (conf->rx_free_thresh > 0) ?
+		conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
 	rxq->rx_deferred_start = conf->rx_deferred_start;
 
 	rx_entry_len = sizeof(struct hns3_entry) * rxq->nb_rx_desc;
@@ -1319,8 +1315,7 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	}
 
 	rxq->next_to_use = 0;
-	rxq->next_to_clean = 0;
-	rxq->nb_rx_hold = 0;
+	rxq->rx_free_hold = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
 	rxq->port_id = dev->data->port_id;
@@ -1656,11 +1651,11 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	nb_rx_bd = 0;
 	rxq = rx_queue;
 
-	rx_id = rxq->next_to_clean;
+	rx_id = rxq->next_to_use;
 	rx_ring = rxq->rx_ring;
+	sw_ring = rxq->sw_ring;
 	first_seg = rxq->pkt_first_seg;
 	last_seg = rxq->pkt_last_seg;
-	sw_ring = rxq->sw_ring;
 
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
@@ -1843,16 +1838,15 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		first_seg = NULL;
 	}
 
-	rxq->next_to_clean = rx_id;
+	rxq->next_to_use = rx_id;
 	rxq->pkt_first_seg = first_seg;
 	rxq->pkt_last_seg = last_seg;
 
-	nb_rx_bd = nb_rx_bd + rxq->nb_rx_hold;
-	if (nb_rx_bd > rxq->rx_free_thresh) {
-		hns3_clean_rx_buffers(rxq, nb_rx_bd);
-		nb_rx_bd = 0;
+	rxq->rx_free_hold += nb_rx_bd;
+	if (rxq->rx_free_hold > rxq->rx_free_thresh) {
+		hns3_clean_rx_buffers(rxq, rxq->rx_free_hold);
+		rxq->rx_free_hold = 0;
 	}
-	rxq->nb_rx_hold = nb_rx_bd;
 
 	return nb_rx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index 4b3269b..a2d6514 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -10,6 +10,7 @@
 #define HNS3_DEFAULT_RING_DESC  1024
 #define	HNS3_ALIGN_RING_DESC	32
 #define HNS3_RING_BASE_ALIGN	128
+#define HNS3_DEFAULT_RX_FREE_THRESH	32
 
 #define HNS3_512_BD_BUF_SIZE	512
 #define HNS3_1K_BD_BUF_SIZE	1024
@@ -243,12 +244,14 @@ struct hns3_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t nb_rx_desc;
-	uint16_t nb_rx_hold;
-	uint16_t rx_tail;
-	uint16_t next_to_clean;
 	uint16_t next_to_use;
 	uint16_t rx_buf_len;
+	/*
+	 * Threshold for the number of BDs waiting to be passed to hardware. If
+	 * it is exceeded, the driver will pass these BDs to hardware.
+	 */
 	uint16_t rx_free_thresh;
+	uint16_t rx_free_hold;   /* num of BDs waiting to be passed to hardware */
 
 	/*
 	 * port based vlan configuration state.
-- 
2.9.5



* [dpdk-dev] [PATCH 2/8] net/hns3: reduce address calculation in Rx
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 1/8] net/hns3: report Rx free threshold Wei Hu (Xavier)
@ 2020-09-07  9:08 ` Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 3/8] net/hns3: add simple Rx process function Wei Hu (Xavier)
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds the internal function named hns3_write_reg_opt to avoid
performance loss from address calculation during register access in the
'.rx_pkt_burst' ops implementation function named hns3_recv_pkts.

In addition, because the hardware based on the hns3 network engine always
accesses registers in little-endian mode, the driver should call
rte_cpu_to_le_32 to convert data to little-endian before writing a register
and rte_le_to_cpu_32 to convert data back after reading a register. The
driver encapsulates these conversions in the register read/write helper
functions listed below:
  hns3_write_reg
  hns3_write_reg_opt
  hns3_read_reg
Therefore, callers of these functions do not need to perform the conversion
again.
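
A minimal usage sketch (assuming the hns3_ethdev.h and hns3_rxtx.h
definitions from this series; the function name is illustrative, not part
of the patch): the caller passes a CPU-order value and rings the Rx head
doorbell through the pre-computed address instead of recomputing
base + HNS3_RING_RX_HEAD_REG on every call.

#include "hns3_ethdev.h"	/* hns3_write_reg_opt() */
#include "hns3_rxtx.h"		/* struct hns3_rx_queue */

static inline void
hns3_rx_ring_doorbell_sketch(struct hns3_rx_queue *rxq, uint16_t nb_freed)
{
	rxq->rx_free_hold += nb_freed;
	if (rxq->rx_free_hold > rxq->rx_free_thresh) {
		/* CPU-order value; the helper converts to little-endian */
		hns3_write_reg_opt(rxq->io_head_reg, rxq->rx_free_hold);
		rxq->rx_free_hold = 0;
	}
}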

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.h | 29 +++++++++++++++++++++++++++--
 drivers/net/hns3/hns3_rxtx.c   | 14 +++-----------
 drivers/net/hns3/hns3_rxtx.h   |  1 +
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 9e49e28..3cb0535 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -708,14 +708,39 @@ struct hns3_adapter {
 
 #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
 
+/*
+ * Because the hardware based on the hns3 network engine always accesses
+ * registers in little-endian mode, the driver should call rte_cpu_to_le_32
+ * to convert data to little-endian before writing a register and
+ * rte_le_to_cpu_32 to convert data back after reading a register.
+ *
+ * The driver encapsulates these conversions in the register read/write
+ * helper functions listed below:
+ *   hns3_write_reg
+ *   hns3_write_reg_opt
+ *   hns3_read_reg
+ * Therefore, when calling these functions, conversion is not required again.
+ */
 static inline void hns3_write_reg(void *base, uint32_t reg, uint32_t value)
 {
-	rte_write32(value, (volatile void *)((char *)base + reg));
+	rte_write32(rte_cpu_to_le_32(value),
+		    (volatile void *)((char *)base + reg));
+}
+
+/*
+ * The optimized function for writing registers used in the '.rx_pkt_burst' and
+ * '.tx_pkt_burst' ops implementation functions.
+ */
+static inline void hns3_write_reg_opt(volatile void *addr, uint32_t value)
+{
+	rte_io_wmb();
+	rte_write32_relaxed(rte_cpu_to_le_32(value), addr);
 }
 
 static inline uint32_t hns3_read_reg(void *base, uint32_t reg)
 {
-	return rte_read32((volatile void *)((char *)base + reg));
+	uint32_t read_val = rte_read32((volatile void *)((char *)base + reg));
+	return rte_le_to_cpu_32(read_val);
 }
 
 #define hns3_write_dev(a, reg, value) \
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index fe2a7a4..703b12a 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -1323,6 +1323,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	rxq->configured = true;
 	rxq->io_base = (void *)((char *)hw->io_base + HNS3_TQP_REG_OFFSET +
 				idx * HNS3_TQP_REG_SIZE);
+	rxq->io_head_reg = (volatile void *)((char *)rxq->io_base +
+			   HNS3_RING_RX_HEAD_REG);
 	rxq->rx_buf_len = rx_buf_size;
 	rxq->l2_errors = 0;
 	rxq->pkt_len_errors = 0;
@@ -1472,16 +1474,6 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 	return NULL;
 }
 
-static void
-hns3_clean_rx_buffers(struct hns3_rx_queue *rxq, int count)
-{
-	rxq->next_to_use += count;
-	if (rxq->next_to_use >= rxq->nb_rx_desc)
-		rxq->next_to_use -= rxq->nb_rx_desc;
-
-	hns3_write_dev(rxq, HNS3_RING_RX_HEAD_REG, count);
-}
-
 static int
 hns3_handle_bdinfo(struct hns3_rx_queue *rxq, struct rte_mbuf *rxm,
 		   uint32_t bd_base_info, uint32_t l234_info,
@@ -1844,7 +1836,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 	rxq->rx_free_hold += nb_rx_bd;
 	if (rxq->rx_free_hold > rxq->rx_free_thresh) {
-		hns3_clean_rx_buffers(rxq, rxq->rx_free_hold);
+		hns3_write_reg_opt(rxq->io_head_reg, rxq->rx_free_hold);
 		rxq->rx_free_hold = 0;
 	}
 
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index a2d6514..c1a34e2 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -231,6 +231,7 @@ struct hns3_entry {
 
 struct hns3_rx_queue {
 	void *io_base;
+	volatile void *io_head_reg;
 	struct hns3_adapter *hns;
 	struct rte_mempool *mb_pool;
 	struct hns3_desc *rx_ring;
-- 
2.9.5



* [dpdk-dev] [PATCH 3/8] net/hns3: add simple Rx process function
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 1/8] net/hns3: report Rx free threshold Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 2/8] net/hns3: reduce address calculation in Rx Wei Hu (Xavier)
@ 2020-09-07  9:08 ` Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 4/8] net/hns3: add simple Tx " Wei Hu (Xavier)
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds a simple Rx process function and supports choosing the Rx
function according to the actual Rx offload capabilities.
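
Since this patch also implements the .rx_burst_mode_get callback, an
application can observe which Rx function was selected (a minimal sketch,
not part of the patch; the function name is illustrative):

#include <stdio.h>
#include <rte_ethdev.h>

/* Prints "Scalar" or "Scalar Scattered" for an hns3 port. */
static void
print_rx_burst_mode(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_burst_mode mode;

	if (rte_eth_rx_burst_mode_get(port_id, queue_id, &mode) == 0)
		printf("port %u queue %u Rx burst mode: %s\n",
		       port_id, queue_id, mode.info);
}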

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c    |   7 +-
 drivers/net/hns3/hns3_ethdev.h    |  21 ++
 drivers/net/hns3/hns3_ethdev_vf.c |  11 +-
 drivers/net/hns3/hns3_rxtx.c      | 538 +++++++++++++++++++++++---------------
 drivers/net/hns3/hns3_rxtx.h      | 130 ++++++++-
 5 files changed, 489 insertions(+), 218 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 6fa34e8..5d612f1 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2351,6 +2351,8 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 	if (ret)
 		goto cfg_err;
 
+	hns->rx_simple_allowed = true;
+	hns3_init_rx_ptype_tble(dev);
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
 
 	return 0;
@@ -4746,6 +4748,7 @@ hns3_dev_start(struct rte_eth_dev *dev)
 	hw->adapter_state = HNS3_NIC_STARTED;
 	rte_spinlock_unlock(&hw->lock);
 
+	hns3_rx_scattered_calc(dev);
 	hns3_set_rxtx_function(dev);
 	hns3_mp_req_start_rxtx(dev);
 	rte_eal_alarm_set(HNS3_SERVICE_INTERVAL, hns3_service_handler, dev);
@@ -4844,6 +4847,7 @@ hns3_dev_stop(struct rte_eth_dev *dev)
 		hns3_dev_release_mbufs(hns);
 		hw->adapter_state = HNS3_NIC_CONFIGURED;
 	}
+	hns3_rx_scattered_reset(dev);
 	rte_eal_alarm_cancel(hns3_service_handler, dev);
 	rte_spinlock_unlock(&hw->lock);
 }
@@ -5514,6 +5518,7 @@ hns3_reset_service(void *param)
 }
 
 static const struct eth_dev_ops hns3_eth_dev_ops = {
+	.dev_configure      = hns3_dev_configure,
 	.dev_start          = hns3_dev_start,
 	.dev_stop           = hns3_dev_stop,
 	.dev_close          = hns3_dev_close,
@@ -5539,7 +5544,7 @@ static const struct eth_dev_ops hns3_eth_dev_ops = {
 	.rx_queue_intr_disable  = hns3_dev_rx_queue_intr_disable,
 	.rxq_info_get           = hns3_rxq_info_get,
 	.txq_info_get           = hns3_txq_info_get,
-	.dev_configure          = hns3_dev_configure,
+	.rx_burst_mode_get      = hns3_rx_burst_mode_get,
 	.flow_ctrl_get          = hns3_flow_ctrl_get,
 	.flow_ctrl_set          = hns3_flow_ctrl_set,
 	.priority_flow_ctrl_set = hns3_priority_flow_ctrl_set,
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 3cb0535..d93c5b2 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -433,6 +433,7 @@ struct hns3_hw {
 	uint16_t tqps_num;          /* num task queue pairs of this function */
 	uint16_t intr_tqps_num;     /* num queue pairs mapping interrupt */
 	uint16_t rss_size_max;      /* HW defined max RSS task queue */
+	uint16_t rx_buf_len;        /* hold min hardware rx buf len */
 	uint16_t num_tx_desc;       /* desc num of per tx queue */
 	uint16_t num_rx_desc;       /* desc num of per rx queue */
 	uint32_t mng_entry_num;     /* number of manager table entry */
@@ -575,6 +576,23 @@ struct hns3_mp_param {
 /* Key string for IPC. */
 #define HNS3_MP_NAME "net_hns3_mp"
 
+#define HNS3_L2TBL_NUM	4
+#define HNS3_L3TBL_NUM	16
+#define HNS3_L4TBL_NUM	16
+#define HNS3_OL3TBL_NUM	16
+#define HNS3_OL4TBL_NUM	16
+
+struct hns3_ptype_table {
+	uint32_t l2table[HNS3_L2TBL_NUM];
+	uint32_t l3table[HNS3_L3TBL_NUM];
+	uint32_t l4table[HNS3_L4TBL_NUM];
+	uint32_t inner_l2table[HNS3_L2TBL_NUM];
+	uint32_t inner_l3table[HNS3_L3TBL_NUM];
+	uint32_t inner_l4table[HNS3_L4TBL_NUM];
+	uint32_t ol3table[HNS3_OL3TBL_NUM];
+	uint32_t ol4table[HNS3_OL4TBL_NUM];
+};
+
 struct hns3_pf {
 	struct hns3_adapter *adapter;
 	bool is_main_pf;
@@ -623,6 +641,9 @@ struct hns3_adapter {
 		struct hns3_pf pf;
 		struct hns3_vf vf;
 	};
+
+	bool rx_simple_allowed;
+	struct hns3_ptype_table ptype_tbl __rte_cache_min_aligned;
 };
 
 #define HNS3_DEV_SUPPORT_DCB_B			0x0
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 7226cc5..0f155d8 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -745,7 +745,8 @@ hns3vf_init_ring_with_vector(struct hns3_hw *hw)
 static int
 hns3vf_dev_configure(struct rte_eth_dev *dev)
 {
-	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct hns3_adapter *hns = dev->data->dev_private;
+	struct hns3_hw *hw = &hns->hw;
 	struct hns3_rss_conf *rss_cfg = &hw->rss_info;
 	struct rte_eth_conf *conf = &dev->data->dev_conf;
 	enum rte_eth_rx_mq_mode mq_mode = conf->rxmode.mq_mode;
@@ -820,6 +821,9 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 	if (ret)
 		goto cfg_err;
 
+	hns->rx_simple_allowed = true;
+	hns3_init_rx_ptype_tble(dev);
+
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
 	return 0;
 
@@ -1875,6 +1879,7 @@ hns3vf_dev_stop(struct rte_eth_dev *dev)
 		hns3_dev_release_mbufs(hns);
 		hw->adapter_state = HNS3_NIC_CONFIGURED;
 	}
+	hns3_rx_scattered_reset(dev);
 	rte_eal_alarm_cancel(hns3vf_service_handler, dev);
 	rte_spinlock_unlock(&hw->lock);
 }
@@ -2111,6 +2116,7 @@ hns3vf_dev_start(struct rte_eth_dev *dev)
 	hw->adapter_state = HNS3_NIC_STARTED;
 	rte_spinlock_unlock(&hw->lock);
 
+	hns3_rx_scattered_calc(dev);
 	hns3_set_rxtx_function(dev);
 	hns3_mp_req_start_rxtx(dev);
 	rte_eal_alarm_set(HNS3VF_SERVICE_INTERVAL, hns3vf_service_handler, dev);
@@ -2508,6 +2514,7 @@ hns3vf_reinit_dev(struct hns3_adapter *hns)
 }
 
 static const struct eth_dev_ops hns3vf_eth_dev_ops = {
+	.dev_configure      = hns3vf_dev_configure,
 	.dev_start          = hns3vf_dev_start,
 	.dev_stop           = hns3vf_dev_stop,
 	.dev_close          = hns3vf_dev_close,
@@ -2533,7 +2540,7 @@ static const struct eth_dev_ops hns3vf_eth_dev_ops = {
 	.rx_queue_intr_disable  = hns3_dev_rx_queue_intr_disable,
 	.rxq_info_get       = hns3_rxq_info_get,
 	.txq_info_get       = hns3_txq_info_get,
-	.dev_configure      = hns3vf_dev_configure,
+	.rx_burst_mode_get  = hns3_rx_burst_mode_get,
 	.mac_addr_add       = hns3vf_add_mac_addr,
 	.mac_addr_remove    = hns3vf_remove_mac_addr,
 	.mac_addr_set       = hns3vf_set_default_mac_addr,
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 703b12a..38ad454 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -30,7 +30,7 @@
 #include "hns3_logs.h"
 
 #define HNS3_CFG_DESC_NUM(num)	((num) / 8 - 1)
-#define DEFAULT_RX_FREE_THRESH	32
+#define HNS3_RX_RING_PREFECTH_MASK	3
 
 static void
 hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
@@ -38,13 +38,20 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
 	uint16_t i;
 
 	/* Note: Fake rx queue will not enter here */
-	if (rxq->sw_ring) {
-		for (i = 0; i < rxq->nb_rx_desc; i++) {
-			if (rxq->sw_ring[i].mbuf) {
-				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
-				rxq->sw_ring[i].mbuf = NULL;
-			}
-		}
+	if (rxq->sw_ring == NULL)
+		return;
+
+	for (i = 0; i < rxq->nb_rx_desc; i++)
+		if (rxq->sw_ring[i].mbuf)
+			rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+
+	for (i = 0; i < rxq->bulk_mbuf_num; i++)
+		rte_pktmbuf_free_seg(rxq->bulk_mbuf[i]);
+	rxq->bulk_mbuf_num = 0;
+
+	if (rxq->pkt_first_seg) {
+		rte_pktmbuf_free(rxq->pkt_first_seg);
+		rxq->pkt_first_seg = NULL;
 	}
 }
 
@@ -653,6 +660,8 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 
 	rxq->next_to_use = 0;
 	rxq->rx_free_hold = 0;
+	rxq->pkt_first_seg = NULL;
+	rxq->pkt_last_seg = NULL;
 	hns3_init_rx_queue_hw(rxq);
 
 	return 0;
@@ -1243,6 +1252,33 @@ hns3_rx_buf_len_calc(struct rte_mempool *mp, uint16_t *rx_buf_len)
 	return 0;
 }
 
+static int
+hns3_rx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_rxconf *conf,
+			 struct rte_mempool *mp, uint16_t nb_desc,
+			 uint16_t *buf_size)
+{
+	if (nb_desc > HNS3_MAX_RING_DESC || nb_desc < HNS3_MIN_RING_DESC ||
+	    nb_desc % HNS3_ALIGN_RING_DESC) {
+		hns3_err(hw, "Number (%u) of rx descriptors is invalid",
+			 nb_desc);
+		return -EINVAL;
+	}
+
+	if (conf->rx_drop_en == 0)
+		hns3_warn(hw, "if no descriptors available, packets are always "
+			  "dropped and rx_drop_en (1) is fixed on");
+
+	if (hns3_rx_buf_len_calc(mp, buf_size)) {
+		hns3_err(hw, "rxq mbufs' data room size (%u) is not enough! "
+				"minimal data room size (%u).",
+				rte_pktmbuf_data_room_size(mp),
+				HNS3_MIN_BD_BUF_SIZE + RTE_PKTMBUF_HEADROOM);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 int
 hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_rxconf *conf,
@@ -1254,24 +1290,16 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	struct hns3_rx_queue *rxq;
 	uint16_t rx_buf_size;
 	int rx_entry_len;
+	int ret;
 
 	if (dev->data->dev_started) {
 		hns3_err(hw, "rx_queue_setup after dev_start no supported");
 		return -EINVAL;
 	}
 
-	if (nb_desc > HNS3_MAX_RING_DESC || nb_desc < HNS3_MIN_RING_DESC ||
-	    nb_desc % HNS3_ALIGN_RING_DESC) {
-		hns3_err(hw, "Number (%u) of rx descriptors is invalid",
-			 nb_desc);
-		return -EINVAL;
-	}
-
-	if (conf->rx_drop_en == 0)
-		hns3_warn(hw, "if there are no available Rx descriptors,"
-			  "incoming packets are always dropped. input parameter"
-			  " conf->rx_drop_en(%u) is uneffective.",
-			  conf->rx_drop_en);
+	ret = hns3_rx_queue_conf_check(hw, conf, mp, nb_desc, &rx_buf_size);
+	if (ret)
+		return ret;
 
 	if (dev->data->rx_queues[idx]) {
 		hns3_rx_queue_release(dev->data->rx_queues[idx]);
@@ -1284,14 +1312,6 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	q_info.type = "hns3 RX queue";
 	q_info.ring_name = "rx_ring";
 
-	if (hns3_rx_buf_len_calc(mp, &rx_buf_size)) {
-		hns3_err(hw, "rxq mbufs' data room size:%u is not enough! "
-				"minimal data room size:%u.",
-				rte_pktmbuf_data_room_size(mp),
-				HNS3_MIN_BD_BUF_SIZE + RTE_PKTMBUF_HEADROOM);
-		return -EINVAL;
-	}
-
 	rxq = hns3_alloc_rxq_and_dma_zone(dev, &q_info);
 	if (rxq == NULL) {
 		hns3_err(hw,
@@ -1300,6 +1320,7 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	}
 
 	rxq->hns = hns;
+	rxq->ptype_tbl = &hns->ptype_tbl;
 	rxq->mb_pool = mp;
 	rxq->rx_free_thresh = (conf->rx_free_thresh > 0) ?
 		conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
@@ -1339,6 +1360,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	else
 		rxq->crc_len = 0;
 
+	rxq->bulk_mbuf_num = 0;
+
 	rte_spinlock_lock(&hw->lock);
 	dev->data->rx_queues[idx] = rxq;
 	rte_spinlock_unlock(&hw->lock);
@@ -1346,104 +1369,40 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	return 0;
 }
 
-static inline uint32_t
-rxd_pkt_info_to_pkt_type(uint32_t pkt_info, uint32_t ol_info)
+void
+hns3_rx_scattered_reset(struct rte_eth_dev *dev)
 {
-#define HNS3_L2TBL_NUM	4
-#define HNS3_L3TBL_NUM	16
-#define HNS3_L4TBL_NUM	16
-#define HNS3_OL3TBL_NUM	16
-#define HNS3_OL4TBL_NUM	16
-	uint32_t pkt_type = 0;
-	uint32_t l2id, l3id, l4id;
-	uint32_t ol3id, ol4id;
-
-	static const uint32_t l2table[HNS3_L2TBL_NUM] = {
-		RTE_PTYPE_L2_ETHER,
-		RTE_PTYPE_L2_ETHER_QINQ,
-		RTE_PTYPE_L2_ETHER_VLAN,
-		RTE_PTYPE_L2_ETHER_VLAN
-	};
-
-	static const uint32_t l3table[HNS3_L3TBL_NUM] = {
-		RTE_PTYPE_L3_IPV4,
-		RTE_PTYPE_L3_IPV6,
-		RTE_PTYPE_L2_ETHER_ARP,
-		RTE_PTYPE_L2_ETHER,
-		RTE_PTYPE_L3_IPV4_EXT,
-		RTE_PTYPE_L3_IPV6_EXT,
-		RTE_PTYPE_L2_ETHER_LLDP,
-		0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
-
-	static const uint32_t l4table[HNS3_L4TBL_NUM] = {
-		RTE_PTYPE_L4_UDP,
-		RTE_PTYPE_L4_TCP,
-		RTE_PTYPE_TUNNEL_GRE,
-		RTE_PTYPE_L4_SCTP,
-		RTE_PTYPE_L4_IGMP,
-		RTE_PTYPE_L4_ICMP,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
-
-	static const uint32_t inner_l2table[HNS3_L2TBL_NUM] = {
-		RTE_PTYPE_INNER_L2_ETHER,
-		RTE_PTYPE_INNER_L2_ETHER_VLAN,
-		RTE_PTYPE_INNER_L2_ETHER_QINQ,
-		0
-	};
+	struct hns3_adapter *hns = dev->data->dev_private;
+	struct hns3_hw *hw = &hns->hw;
 
-	static const uint32_t inner_l3table[HNS3_L3TBL_NUM] = {
-		RTE_PTYPE_INNER_L3_IPV4,
-		RTE_PTYPE_INNER_L3_IPV6,
-		0,
-		RTE_PTYPE_INNER_L2_ETHER,
-		RTE_PTYPE_INNER_L3_IPV4_EXT,
-		RTE_PTYPE_INNER_L3_IPV6_EXT,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
+	hw->rx_buf_len = 0;
+	dev->data->scattered_rx = false;
+}
 
-	static const uint32_t inner_l4table[HNS3_L4TBL_NUM] = {
-		RTE_PTYPE_INNER_L4_UDP,
-		RTE_PTYPE_INNER_L4_TCP,
-		RTE_PTYPE_TUNNEL_GRE,
-		RTE_PTYPE_INNER_L4_SCTP,
-		RTE_PTYPE_L4_IGMP,
-		RTE_PTYPE_INNER_L4_ICMP,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
+void
+hns3_rx_scattered_calc(struct rte_eth_dev *dev)
+{
+	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
+	struct hns3_adapter *hns = dev->data->dev_private;
+	struct hns3_hw *hw = &hns->hw;
+	struct hns3_rx_queue *rxq;
+	uint32_t queue_id;
 
-	static const uint32_t ol3table[HNS3_OL3TBL_NUM] = {
-		RTE_PTYPE_L3_IPV4,
-		RTE_PTYPE_L3_IPV6,
-		0, 0,
-		RTE_PTYPE_L3_IPV4_EXT,
-		RTE_PTYPE_L3_IPV6_EXT,
-		0, 0, 0, 0, 0, 0, 0, 0, 0,
-		RTE_PTYPE_UNKNOWN
-	};
+	if (dev->data->rx_queues == NULL)
+		return;
 
-	static const uint32_t ol4table[HNS3_OL4TBL_NUM] = {
-		0,
-		RTE_PTYPE_TUNNEL_VXLAN,
-		RTE_PTYPE_TUNNEL_NVGRE,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
+	for (queue_id = 0; queue_id < dev->data->nb_rx_queues; queue_id++) {
+		rxq = dev->data->rx_queues[queue_id];
+		if (hw->rx_buf_len == 0)
+			hw->rx_buf_len = rxq->rx_buf_len;
+		else
+			hw->rx_buf_len = RTE_MIN(hw->rx_buf_len,
+						 rxq->rx_buf_len);
+	}
 
-	l2id = hns3_get_field(pkt_info, HNS3_RXD_STRP_TAGP_M,
-			      HNS3_RXD_STRP_TAGP_S);
-	l3id = hns3_get_field(pkt_info, HNS3_RXD_L3ID_M, HNS3_RXD_L3ID_S);
-	l4id = hns3_get_field(pkt_info, HNS3_RXD_L4ID_M, HNS3_RXD_L4ID_S);
-	ol3id = hns3_get_field(ol_info, HNS3_RXD_OL3ID_M, HNS3_RXD_OL3ID_S);
-	ol4id = hns3_get_field(ol_info, HNS3_RXD_OL4ID_M, HNS3_RXD_OL4ID_S);
-
-	if (ol4table[ol4id])
-		pkt_type |= (inner_l2table[l2id] | inner_l3table[l3id] |
-			     inner_l4table[l4id] | ol3table[ol3id] |
-			     ol4table[ol4id]);
-	else
-		pkt_type |= (l2table[l2id] | l3table[l3id] | l4table[l4id]);
-	return pkt_type;
+	if (dev_conf->rxmode.offloads & DEV_RX_OFFLOAD_SCATTER ||
+	    dev_conf->rxmode.max_rx_pkt_len > hw->rx_buf_len)
+		dev->data->scattered_rx = true;
 }
 
 const uint32_t *
@@ -1468,81 +1427,69 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 		RTE_PTYPE_UNKNOWN
 	};
 
-	if (dev->rx_pkt_burst == hns3_recv_pkts)
+	if (dev->rx_pkt_burst == hns3_recv_pkts ||
+	    dev->rx_pkt_burst == hns3_recv_scattered_pkts)
 		return ptypes;
 
 	return NULL;
 }
 
-static int
-hns3_handle_bdinfo(struct hns3_rx_queue *rxq, struct rte_mbuf *rxm,
-		   uint32_t bd_base_info, uint32_t l234_info,
-		   uint32_t *cksum_err)
+void
+hns3_init_rx_ptype_tble(struct rte_eth_dev *dev)
 {
-	uint32_t tmp = 0;
-
-	if (unlikely(l234_info & BIT(HNS3_RXD_L2E_B))) {
-		rxq->l2_errors++;
-		return -EINVAL;
-	}
-
-	if (unlikely(rxm->pkt_len == 0 ||
-		(l234_info & BIT(HNS3_RXD_TRUNCAT_B)))) {
-		rxq->pkt_len_errors++;
-		return -EINVAL;
-	}
-
-	if (bd_base_info & BIT(HNS3_RXD_L3L4P_B)) {
-		if (unlikely(l234_info & BIT(HNS3_RXD_L3E_B))) {
-			rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;
-			rxq->l3_csum_erros++;
-			tmp |= HNS3_L3_CKSUM_ERR;
-		}
-
-		if (unlikely(l234_info & BIT(HNS3_RXD_L4E_B))) {
-			rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
-			rxq->l4_csum_erros++;
-			tmp |= HNS3_L4_CKSUM_ERR;
-		}
-
-		if (unlikely(l234_info & BIT(HNS3_RXD_OL3E_B))) {
-			rxq->ol3_csum_erros++;
-			tmp |= HNS3_OUTER_L3_CKSUM_ERR;
-		}
-
-		if (unlikely(l234_info & BIT(HNS3_RXD_OL4E_B))) {
-			rxm->ol_flags |= PKT_RX_OUTER_L4_CKSUM_BAD;
-			rxq->ol4_csum_erros++;
-			tmp |= HNS3_OUTER_L4_CKSUM_ERR;
-		}
-	}
-	*cksum_err = tmp;
-
-	return 0;
-}
-
-static void
-hns3_rx_set_cksum_flag(struct rte_mbuf *rxm, uint64_t packet_type,
-		       const uint32_t cksum_err)
-{
-	if (unlikely((packet_type & RTE_PTYPE_TUNNEL_MASK))) {
-		if (likely(packet_type & RTE_PTYPE_INNER_L3_MASK) &&
-		    (cksum_err & HNS3_L3_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
-		if (likely(packet_type & RTE_PTYPE_INNER_L4_MASK) &&
-		    (cksum_err & HNS3_L4_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
-		if (likely(packet_type & RTE_PTYPE_L4_MASK) &&
-		    (cksum_err & HNS3_OUTER_L4_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_OUTER_L4_CKSUM_GOOD;
-	} else {
-		if (likely(packet_type & RTE_PTYPE_L3_MASK) &&
-		    (cksum_err & HNS3_L3_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
-		if (likely(packet_type & RTE_PTYPE_L4_MASK) &&
-		    (cksum_err & HNS3_L4_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
-	}
+	struct hns3_adapter *hns = dev->data->dev_private;
+	struct hns3_ptype_table *tbl = &hns->ptype_tbl;
+
+	memset(tbl, 0, sizeof(*tbl));
+
+	tbl->l2table[0] = RTE_PTYPE_L2_ETHER;
+	tbl->l2table[1] = RTE_PTYPE_L2_ETHER_QINQ;
+	tbl->l2table[2] = RTE_PTYPE_L2_ETHER_VLAN;
+	tbl->l2table[3] = RTE_PTYPE_L2_ETHER_VLAN;
+
+	tbl->l3table[0] = RTE_PTYPE_L3_IPV4;
+	tbl->l3table[1] = RTE_PTYPE_L3_IPV6;
+	tbl->l3table[2] = RTE_PTYPE_L2_ETHER_ARP;
+	tbl->l3table[3] = RTE_PTYPE_L2_ETHER;
+	tbl->l3table[4] = RTE_PTYPE_L3_IPV4_EXT;
+	tbl->l3table[5] = RTE_PTYPE_L3_IPV6_EXT;
+	tbl->l3table[6] = RTE_PTYPE_L2_ETHER_LLDP;
+
+	tbl->l4table[0] = RTE_PTYPE_L4_UDP;
+	tbl->l4table[1] = RTE_PTYPE_L4_TCP;
+	tbl->l4table[2] = RTE_PTYPE_TUNNEL_GRE;
+	tbl->l4table[3] = RTE_PTYPE_L4_SCTP;
+	tbl->l4table[4] = RTE_PTYPE_L4_IGMP;
+	tbl->l4table[5] = RTE_PTYPE_L4_ICMP;
+
+	tbl->inner_l2table[0] = RTE_PTYPE_INNER_L2_ETHER;
+	tbl->inner_l2table[1] = RTE_PTYPE_INNER_L2_ETHER_VLAN;
+	tbl->inner_l2table[2] = RTE_PTYPE_INNER_L2_ETHER_QINQ;
+
+	tbl->inner_l3table[0] = RTE_PTYPE_INNER_L3_IPV4;
+	tbl->inner_l3table[1] = RTE_PTYPE_INNER_L3_IPV6;
+	tbl->inner_l3table[2] = 0;
+	tbl->inner_l3table[3] = RTE_PTYPE_INNER_L2_ETHER;
+	tbl->inner_l3table[4] = RTE_PTYPE_INNER_L3_IPV4_EXT;
+	tbl->inner_l3table[5] = RTE_PTYPE_INNER_L3_IPV6_EXT;
+
+	tbl->inner_l4table[0] = RTE_PTYPE_INNER_L4_UDP;
+	tbl->inner_l4table[1] = RTE_PTYPE_INNER_L4_TCP;
+	tbl->inner_l4table[2] = RTE_PTYPE_TUNNEL_GRE;
+	tbl->inner_l4table[3] = RTE_PTYPE_INNER_L4_SCTP;
+	tbl->inner_l4table[4] = RTE_PTYPE_L4_IGMP;
+	tbl->inner_l4table[5] = RTE_PTYPE_INNER_L4_ICMP;
+
+	tbl->ol3table[0] = RTE_PTYPE_L3_IPV4;
+	tbl->ol3table[1] = RTE_PTYPE_L3_IPV6;
+	tbl->ol3table[2] = 0;
+	tbl->ol3table[3] = 0;
+	tbl->ol3table[4] = RTE_PTYPE_L3_IPV4_EXT;
+	tbl->ol3table[5] = RTE_PTYPE_L3_IPV6_EXT;
+
+	tbl->ol4table[0] = 0;
+	tbl->ol4table[1] = RTE_PTYPE_TUNNEL_VXLAN;
+	tbl->ol4table[2] = RTE_PTYPE_TUNNEL_NVGRE;
 }
 
 static inline void
@@ -1612,6 +1559,23 @@ recalculate_data_len(struct rte_mbuf *first_seg, struct rte_mbuf *last_seg,
 		rxm->data_len = (uint16_t)(data_len - crc_len);
 }
 
+static inline struct rte_mbuf *
+hns3_rx_alloc_buffer(struct hns3_rx_queue *rxq)
+{
+	int ret;
+
+	if (likely(rxq->bulk_mbuf_num > 0))
+		return rxq->bulk_mbuf[--rxq->bulk_mbuf_num];
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, (void **)rxq->bulk_mbuf,
+				   HNS3_BULK_ALLOC_MBUF_NUM);
+	if (likely(ret == 0)) {
+		rxq->bulk_mbuf_num = HNS3_BULK_ALLOC_MBUF_NUM;
+		return rxq->bulk_mbuf[--rxq->bulk_mbuf_num];
+	} else
+		return rte_mbuf_raw_alloc(rxq->mb_pool);
+}
+
 uint16_t
 hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
@@ -1620,6 +1584,119 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	struct hns3_rx_queue *rxq;      /* RX queue */
 	struct hns3_entry *sw_ring;
 	struct hns3_entry *rxe;
+	struct hns3_desc rxd;
+	struct rte_mbuf *nmb;           /* pointer of the new mbuf */
+	struct rte_mbuf *rxm;
+	uint32_t bd_base_info;
+	uint32_t cksum_err;
+	uint32_t l234_info;
+	uint32_t ol_info;
+	uint64_t dma_addr;
+	uint16_t nb_rx_bd;
+	uint16_t nb_rx;
+	uint16_t rx_id;
+	int ret;
+
+	nb_rx = 0;
+	nb_rx_bd = 0;
+	rxq = rx_queue;
+	rx_ring = rxq->rx_ring;
+	sw_ring = rxq->sw_ring;
+	rx_id = rxq->next_to_use;
+
+	while (nb_rx < nb_pkts) {
+		rxdp = &rx_ring[rx_id];
+		bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
+		if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B))))
+			break;
+
+		rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+			   (1u << HNS3_RXD_VLD_B)];
+
+		nmb = hns3_rx_alloc_buffer(rxq);
+		if (unlikely(nmb == NULL)) {
+			uint16_t port_id;
+
+			port_id = rxq->port_id;
+			rte_eth_devices[port_id].data->rx_mbuf_alloc_failed++;
+			break;
+		}
+
+		nb_rx_bd++;
+		rxe = &sw_ring[rx_id];
+		rx_id++;
+		if (unlikely(rx_id == rxq->nb_rx_desc))
+			rx_id = 0;
+
+		rte_prefetch0(sw_ring[rx_id].mbuf);
+		if ((rx_id & HNS3_RX_RING_PREFECTH_MASK) == 0) {
+			rte_prefetch0(&rx_ring[rx_id]);
+			rte_prefetch0(&sw_ring[rx_id]);
+		}
+
+		rxm = rxe->mbuf;
+		rxe->mbuf = nmb;
+
+		dma_addr = rte_mbuf_data_iova_default(nmb);
+		rxdp->addr = rte_cpu_to_le_64(dma_addr);
+		rxdp->rx.bd_base_info = 0;
+
+		rxm->data_off = RTE_PKTMBUF_HEADROOM;
+		rxm->pkt_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.pkt_len)) -
+				rxq->crc_len;
+		rxm->data_len = rxm->pkt_len;
+		rxm->port = rxq->port_id;
+		rxm->hash.rss = rte_le_to_cpu_32(rxd.rx.rss_hash);
+		rxm->ol_flags = PKT_RX_RSS_HASH;
+		if (unlikely(bd_base_info & BIT(HNS3_RXD_LUM_B))) {
+			rxm->hash.fdir.hi =
+				rte_le_to_cpu_16(rxd.rx.fd_id);
+			rxm->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+		}
+		rxm->nb_segs = 1;
+		rxm->next = NULL;
+
+		/* Load remaining descriptor data and extract necessary fields */
+		l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
+		ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
+		ret = hns3_handle_bdinfo(rxq, rxm, bd_base_info,
+					 l234_info, &cksum_err);
+		if (unlikely(ret))
+			goto pkt_err;
+
+		rxm->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+
+		if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+			hns3_rx_set_cksum_flag(rxm, rxm->packet_type,
+					       cksum_err);
+		hns3_rxd_to_vlan_tci(rxq, rxm, l234_info, &rxd);
+
+		rx_pkts[nb_rx++] = rxm;
+		continue;
+pkt_err:
+		rte_pktmbuf_free(rxm);
+	}
+
+	rxq->next_to_use = rx_id;
+	rxq->rx_free_hold += nb_rx_bd;
+	if (rxq->rx_free_hold > rxq->rx_free_thresh) {
+		hns3_write_reg_opt(rxq->io_head_reg, rxq->rx_free_hold);
+		rxq->rx_free_hold = 0;
+	}
+
+	return nb_rx;
+}
+
+uint16_t
+hns3_recv_scattered_pkts(void *rx_queue,
+			 struct rte_mbuf **rx_pkts,
+			 uint16_t nb_pkts)
+{
+	volatile struct hns3_desc *rx_ring;  /* RX ring (desc) */
+	volatile struct hns3_desc *rxdp;     /* pointer of the current desc */
+	struct hns3_rx_queue *rxq;      /* RX queue */
+	struct hns3_entry *sw_ring;
+	struct hns3_entry *rxe;
 	struct rte_mbuf *first_seg;
 	struct rte_mbuf *last_seg;
 	struct hns3_desc rxd;
@@ -1632,9 +1709,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint32_t gro_size;
 	uint32_t ol_info;
 	uint64_t dma_addr;
-	uint16_t data_len;
 	uint16_t nb_rx_bd;
-	uint16_t pkt_len;
 	uint16_t nb_rx;
 	uint16_t rx_id;
 	int ret;
@@ -1652,8 +1727,9 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
 		bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
-		if (unlikely(!hns3_get_bit(bd_base_info, HNS3_RXD_VLD_B)))
+		if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B))))
 			break;
+
 		/*
 		 * The interactive process between software and hardware of
 		 * receiving a new packet in hns3 network engine:
@@ -1716,7 +1792,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
 			   (1u << HNS3_RXD_VLD_B)];
 
-		nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
+		nmb = hns3_rx_alloc_buffer(rxq);
 		if (unlikely(nmb == NULL)) {
 			dev = &rte_eth_devices[rxq->port_id];
 			dev->data->rx_mbuf_alloc_failed++;
@@ -1730,7 +1806,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			rx_id = 0;
 
 		rte_prefetch0(sw_ring[rx_id].mbuf);
-		if ((rx_id & 0x3) == 0) {
+		if ((rx_id & HNS3_RX_RING_PREFECTH_MASK) == 0) {
 			rte_prefetch0(&rx_ring[rx_id]);
 			rte_prefetch0(&sw_ring[rx_id]);
 		}
@@ -1742,15 +1818,6 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		rxdp->rx.bd_base_info = 0;
 		rxdp->addr = dma_addr;
 
-		/*
-		 * Load remained descriptor data and extract necessary fields.
-		 * Data size from buffer description may contains CRC len,
-		 * packet len should subtract it.
-		 */
-		data_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.size));
-		l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
-		ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
-
 		if (first_seg == NULL) {
 			first_seg = rxm;
 			first_seg->nb_segs = 1;
@@ -1760,10 +1827,11 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		}
 
 		rxm->data_off = RTE_PKTMBUF_HEADROOM;
-		rxm->data_len = data_len;
+		rxm->data_len = rte_le_to_cpu_16(rxd.rx.size);
 
-		if (!hns3_get_bit(bd_base_info, HNS3_RXD_FE_B)) {
+		if (!(bd_base_info & BIT(HNS3_RXD_FE_B))) {
 			last_seg = rxm;
+			rxm->next = NULL;
 			continue;
 		}
 
@@ -1772,8 +1840,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		 * buffer description may contains CRC len, packet len should
 		 * subtract it, same as data len.
 		 */
-		pkt_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.pkt_len));
-		first_seg->pkt_len = pkt_len;
+		first_seg->pkt_len = rte_le_to_cpu_16(rxd.rx.pkt_len);
 
 		/*
 		 * This is the last buffer of the received packet. If the CRC
@@ -1789,15 +1856,15 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		if (unlikely(rxq->crc_len > 0)) {
 			first_seg->pkt_len -= rxq->crc_len;
 			recalculate_data_len(first_seg, last_seg, rxm, rxq,
-				data_len);
+				rxm->data_len);
 		}
 
 		first_seg->port = rxq->port_id;
 		first_seg->hash.rss = rte_le_to_cpu_32(rxd.rx.rss_hash);
 		first_seg->ol_flags = PKT_RX_RSS_HASH;
-		if (unlikely(hns3_get_bit(bd_base_info, HNS3_RXD_LUM_B))) {
+		if (unlikely(bd_base_info & BIT(HNS3_RXD_LUM_B))) {
 			first_seg->hash.fdir.hi =
-				rte_le_to_cpu_32(rxd.rx.fd_id);
+				rte_le_to_cpu_16(rxd.rx.fd_id);
 			first_seg->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
 		}
 
@@ -1808,13 +1875,15 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			first_seg->tso_segsz = gro_size;
 		}
 
+		l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
+		ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
 		ret = hns3_handle_bdinfo(rxq, first_seg, bd_base_info,
 					 l234_info, &cksum_err);
 		if (unlikely(ret))
 			goto pkt_err;
 
-		first_seg->packet_type = rxd_pkt_info_to_pkt_type(l234_info,
-								  ol_info);
+		first_seg->packet_type = hns3_rx_calc_ptype(rxq,
+						l234_info, ol_info);
 
 		if (bd_base_info & BIT(HNS3_RXD_L3L4P_B))
 			hns3_rx_set_cksum_flag(first_seg,
@@ -1844,6 +1913,46 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 }
 
 int
+hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
+		       struct rte_eth_burst_mode *mode)
+{
+	static const struct {
+		eth_rx_burst_t pkt_burst;
+		const char *info;
+	} burst_infos[] = {
+		{ hns3_recv_pkts,		"Scalar" },
+		{ hns3_recv_scattered_pkts,	"Scalar Scattered" },
+	};
+
+	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
+	int ret = -EINVAL;
+	unsigned int i;
+
+	for (i = 0; i < RTE_DIM(burst_infos); i++) {
+		if (pkt_burst == burst_infos[i].pkt_burst) {
+			snprintf(mode->info, sizeof(mode->info), "%s",
+				 burst_infos[i].info);
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static eth_rx_burst_t
+hns3_get_rx_function(struct rte_eth_dev *dev)
+{
+	struct hns3_adapter *hns = dev->data->dev_private;
+	uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
+
+	if (hns->rx_simple_allowed && !dev->data->scattered_rx &&
+	    (offloads & DEV_RX_OFFLOAD_TCP_LRO) == 0)
+		return hns3_recv_pkts;
+
+	return hns3_recv_scattered_pkts;
+}
+int
 hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_txconf *conf)
 {
@@ -1932,7 +2041,8 @@ hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 	struct hns3_desc *desc = &txq->tx_ring[tx_next_clean];
 	struct rte_mbuf *mbuf;
 
-	while ((!hns3_get_bit(desc->tx.tp_fe_sc_vld_ra_ri, HNS3_TXD_VLD_B)) &&
+	while ((!(desc->tx.tp_fe_sc_vld_ra_ri &
+		rte_cpu_to_le_16(BIT(HNS3_TXD_VLD_B)))) &&
 		tx_next_use != tx_next_clean) {
 		mbuf = tx_bak_pkt->mbuf;
 		if (mbuf) {
@@ -2818,7 +2928,7 @@ void hns3_set_rxtx_function(struct rte_eth_dev *eth_dev)
 
 	if (hns->hw.adapter_state == HNS3_NIC_STARTED &&
 	    rte_atomic16_read(&hns->hw.reset.resetting) == 0) {
-		eth_dev->rx_pkt_burst = hns3_recv_pkts;
+		eth_dev->rx_pkt_burst = hns3_get_rx_function(eth_dev);
 		eth_dev->tx_pkt_burst = hns3_xmit_pkts;
 		eth_dev->tx_pkt_prepare = hns3_prep_pkts;
 	} else {
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index c1a34e2..3d3f0a0 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -10,6 +10,8 @@
 #define HNS3_DEFAULT_RING_DESC  1024
 #define	HNS3_ALIGN_RING_DESC	32
 #define HNS3_RING_BASE_ALIGN	128
+#define HNS3_BULK_ALLOC_MBUF_NUM	32
+
 #define HNS3_DEFAULT_RX_FREE_THRESH	32
 
 #define HNS3_512_BD_BUF_SIZE	512
@@ -233,6 +235,7 @@ struct hns3_rx_queue {
 	void *io_base;
 	volatile void *io_head_reg;
 	struct hns3_adapter *hns;
+	struct hns3_ptype_table *ptype_tbl;
 	struct rte_mempool *mb_pool;
 	struct hns3_desc *rx_ring;
 	uint64_t rx_ring_phys_addr; /* RX ring DMA address */
@@ -245,13 +248,13 @@ struct hns3_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t nb_rx_desc;
-	uint16_t next_to_use;
 	uint16_t rx_buf_len;
 	/*
 	 * Threshold for the number of BDs waiting to be passed to hardware. If
 	 * it is exceeded, the driver will pass these BDs to hardware.
 	 */
 	uint16_t rx_free_thresh;
+	uint16_t next_to_use;    /* index of next BD to be polled */
 	uint16_t rx_free_hold;   /* num of BDs waiting to be passed to hardware */
 
 	/*
@@ -272,6 +275,9 @@ struct hns3_rx_queue {
 	uint64_t l4_csum_erros;
 	uint64_t ol3_csum_erros;
 	uint64_t ol4_csum_erros;
+
+	struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
+	uint16_t bulk_mbuf_num;
 };
 
 struct hns3_tx_queue {
@@ -380,6 +386,120 @@ enum hns3_cksum_status {
 	HNS3_OUTER_L4_CKSUM_ERR = 8
 };
 
+static inline int
+hns3_handle_bdinfo(struct hns3_rx_queue *rxq, struct rte_mbuf *rxm,
+		   uint32_t bd_base_info, uint32_t l234_info,
+		   uint32_t *cksum_err)
+{
+#define L2E_TRUNC_ERR_FLAG	(BIT(HNS3_RXD_L2E_B) | \
+				 BIT(HNS3_RXD_TRUNCAT_B))
+#define CHECKSUM_ERR_FLAG	(BIT(HNS3_RXD_L3E_B) | \
+				 BIT(HNS3_RXD_L4E_B) | \
+				 BIT(HNS3_RXD_OL3E_B) | \
+				 BIT(HNS3_RXD_OL4E_B))
+
+	uint32_t tmp = 0;
+
+	/*
+	 * If the packet length is bigger than the MTU when receiving with the
+	 * non-scattered algorithm, the first n BDs will not have the FE bit set
+	 * and this situation must be handled. Note: no statistic counter is
+	 * needed because the last BD with the FE bit set marks HNS3_RXD_L2E_B.
+	 */
+	if (unlikely((bd_base_info & BIT(HNS3_RXD_FE_B)) == 0))
+		return -EINVAL;
+
+	if (unlikely((l234_info & L2E_TRUNC_ERR_FLAG) || rxm->pkt_len == 0)) {
+		if (l234_info & BIT(HNS3_RXD_L2E_B))
+			rxq->l2_errors++;
+		else
+			rxq->pkt_len_errors++;
+		return -EINVAL;
+	}
+
+	if (bd_base_info & BIT(HNS3_RXD_L3L4P_B)) {
+		if (likely((l234_info & CHECKSUM_ERR_FLAG) == 0)) {
+			*cksum_err = 0;
+			return 0;
+		}
+
+		if (unlikely(l234_info & BIT(HNS3_RXD_L3E_B))) {
+			rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;
+			rxq->l3_csum_erros++;
+			tmp |= HNS3_L3_CKSUM_ERR;
+		}
+
+		if (unlikely(l234_info & BIT(HNS3_RXD_L4E_B))) {
+			rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
+			rxq->l4_csum_erros++;
+			tmp |= HNS3_L4_CKSUM_ERR;
+		}
+
+		if (unlikely(l234_info & BIT(HNS3_RXD_OL3E_B))) {
+			rxq->ol3_csum_erros++;
+			tmp |= HNS3_OUTER_L3_CKSUM_ERR;
+		}
+
+		if (unlikely(l234_info & BIT(HNS3_RXD_OL4E_B))) {
+			rxm->ol_flags |= PKT_RX_OUTER_L4_CKSUM_BAD;
+			rxq->ol4_csum_erros++;
+			tmp |= HNS3_OUTER_L4_CKSUM_ERR;
+		}
+	}
+	*cksum_err = tmp;
+
+	return 0;
+}
+
+static inline void
+hns3_rx_set_cksum_flag(struct rte_mbuf *rxm, const uint64_t packet_type,
+		       const uint32_t cksum_err)
+{
+	if (unlikely((packet_type & RTE_PTYPE_TUNNEL_MASK))) {
+		if (likely(packet_type & RTE_PTYPE_INNER_L3_MASK) &&
+		    (cksum_err & HNS3_L3_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
+		if (likely(packet_type & RTE_PTYPE_INNER_L4_MASK) &&
+		    (cksum_err & HNS3_L4_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+		if (likely(packet_type & RTE_PTYPE_L4_MASK) &&
+		    (cksum_err & HNS3_OUTER_L4_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_OUTER_L4_CKSUM_GOOD;
+	} else {
+		if (likely(packet_type & RTE_PTYPE_L3_MASK) &&
+		    (cksum_err & HNS3_L3_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
+		if (likely(packet_type & RTE_PTYPE_L4_MASK) &&
+		    (cksum_err & HNS3_L4_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+	}
+}
+
+static inline uint32_t
+hns3_rx_calc_ptype(struct hns3_rx_queue *rxq, const uint32_t l234_info,
+		   const uint32_t ol_info)
+{
+	const struct hns3_ptype_table *const ptype_tbl = rxq->ptype_tbl;
+	uint32_t l2id, l3id, l4id;
+	uint32_t ol3id, ol4id;
+
+	ol4id = hns3_get_field(ol_info, HNS3_RXD_OL4ID_M, HNS3_RXD_OL4ID_S);
+	ol3id = hns3_get_field(ol_info, HNS3_RXD_OL3ID_M, HNS3_RXD_OL3ID_S);
+	l2id = hns3_get_field(l234_info, HNS3_RXD_STRP_TAGP_M,
+			      HNS3_RXD_STRP_TAGP_S);
+	l3id = hns3_get_field(l234_info, HNS3_RXD_L3ID_M, HNS3_RXD_L3ID_S);
+	l4id = hns3_get_field(l234_info, HNS3_RXD_L4ID_M, HNS3_RXD_L4ID_S);
+
+	if (unlikely(ptype_tbl->ol4table[ol4id]))
+		return ptype_tbl->inner_l2table[l2id] |
+			ptype_tbl->inner_l3table[l3id] |
+			ptype_tbl->inner_l4table[l4id] |
+			ptype_tbl->ol3table[ol3id] | ptype_tbl->ol4table[ol4id];
+	else
+		return ptype_tbl->l2table[l2id] | ptype_tbl->l3table[l3id] |
+			ptype_tbl->l4table[l4id];
+}
+
 void hns3_dev_rx_queue_release(void *queue);
 void hns3_dev_tx_queue_release(void *queue);
 void hns3_free_all_queues(struct rte_eth_dev *dev);
@@ -398,11 +518,17 @@ int hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 			unsigned int socket, const struct rte_eth_txconf *conf);
 uint16_t hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			uint16_t nb_pkts);
+uint16_t hns3_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts);
+int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
+			   __rte_unused uint16_t queue_id,
+			   struct rte_eth_burst_mode *mode);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
 const uint32_t *hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev);
+void hns3_init_rx_ptype_tble(struct rte_eth_dev *dev);
 void hns3_set_rxtx_function(struct rte_eth_dev *eth_dev);
 void hns3_set_queue_intr_gl(struct hns3_hw *hw, uint16_t queue_id,
 			    uint8_t gl_idx, uint16_t gl_value);
@@ -415,6 +541,8 @@ int hns3_set_fake_rx_or_tx_queues(struct rte_eth_dev *dev, uint16_t nb_rx_q,
 int hns3_config_gro(struct hns3_hw *hw, bool en);
 int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
+void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
+void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
-- 
2.9.5



* [dpdk-dev] [PATCH 4/8] net/hns3: add simple Tx process function
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                   ` (2 preceding siblings ...)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 3/8] net/hns3: add simple Rx process function Wei Hu (Xavier)
@ 2020-09-07  9:08 ` Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 5/8] net/hns3: add vector Tx burst with NEON instructions Wei Hu (Xavier)
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds a simple Tx process function. When multi-segment packets
are not needed, which means that the DEV_TX_OFFLOAD_MBUF_FAST_FREE offload
is not set, the simple Tx process function can be used.
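
Since this patch also reports default_txconf and implements the
.tx_burst_mode_get callback, the chosen Tx path and thresholds can be
inspected from the application (a minimal sketch, not part of the patch;
the function name is illustrative):

#include <stdio.h>
#include <rte_ethdev.h>

static void
show_tx_defaults_and_mode(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_burst_mode mode;

	if (rte_eth_dev_info_get(port_id, &dev_info) == 0)
		printf("default tx_rs_thresh: %u\n",
		       dev_info.default_txconf.tx_rs_thresh);

	if (rte_eth_tx_burst_mode_get(port_id, queue_id, &mode) == 0)
		printf("Tx burst mode: %s\n", mode.info);
}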

Signed-off-by: Huisong Li <lihuisong@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c    |   6 +
 drivers/net/hns3/hns3_ethdev.h    |   1 +
 drivers/net/hns3/hns3_ethdev_vf.c |   6 +
 drivers/net/hns3/hns3_rxtx.c      | 260 +++++++++++++++++++++++++++++++++++---
 drivers/net/hns3/hns3_rxtx.h      |  34 +++++
 5 files changed, 292 insertions(+), 15 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 5d612f1..8701994 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2352,6 +2352,7 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->tx_simple_allowed = true;
 	hns3_init_rx_ptype_tble(dev);
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
 
@@ -2512,6 +2513,10 @@ hns3_dev_infos_get(struct rte_eth_dev *eth_dev, struct rte_eth_dev_info *info)
 		.rx_drop_en = 1,
 		.offloads = 0,
 	};
+	info->default_txconf = (struct rte_eth_txconf) {
+		.tx_rs_thresh = HNS3_DEFAULT_TX_RS_THRESH,
+		.offloads = 0,
+	};
 
 	info->vmdq_queue_num = 0;
 
@@ -5545,6 +5550,7 @@ static const struct eth_dev_ops hns3_eth_dev_ops = {
 	.rxq_info_get           = hns3_rxq_info_get,
 	.txq_info_get           = hns3_txq_info_get,
 	.rx_burst_mode_get      = hns3_rx_burst_mode_get,
+	.tx_burst_mode_get      = hns3_tx_burst_mode_get,
 	.flow_ctrl_get          = hns3_flow_ctrl_get,
 	.flow_ctrl_set          = hns3_flow_ctrl_set,
 	.priority_flow_ctrl_set = hns3_priority_flow_ctrl_set,
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index d93c5b2..ef85034 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -643,6 +643,7 @@ struct hns3_adapter {
 	};
 
 	bool rx_simple_allowed;
+	bool tx_simple_allowed;
 	struct hns3_ptype_table ptype_tbl __rte_cache_min_aligned;
 };
 
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 0f155d8..915b896 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -822,6 +822,7 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->tx_simple_allowed = true;
 	hns3_init_rx_ptype_tble(dev);
 
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
@@ -957,6 +958,10 @@ hns3vf_dev_infos_get(struct rte_eth_dev *eth_dev, struct rte_eth_dev_info *info)
 		.rx_drop_en = 1,
 		.offloads = 0,
 	};
+	info->default_txconf = (struct rte_eth_txconf) {
+		.tx_rs_thresh = HNS3_DEFAULT_TX_RS_THRESH,
+		.offloads = 0,
+	};
 
 	info->vmdq_queue_num = 0;
 
@@ -2541,6 +2546,7 @@ static const struct eth_dev_ops hns3vf_eth_dev_ops = {
 	.rxq_info_get       = hns3_rxq_info_get,
 	.txq_info_get       = hns3_txq_info_get,
 	.rx_burst_mode_get  = hns3_rx_burst_mode_get,
+	.tx_burst_mode_get  = hns3_tx_burst_mode_get,
 	.mac_addr_add       = hns3vf_add_mac_addr,
 	.mac_addr_remove    = hns3vf_remove_mac_addr,
 	.mac_addr_set       = hns3vf_set_default_mac_addr,
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 38ad454..08a3dcd 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -1952,27 +1952,72 @@ hns3_get_rx_function(struct rte_eth_dev *dev)
 
 	return hns3_recv_scattered_pkts;
 }
+
+static int
+hns3_tx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_txconf *conf,
+			 uint16_t nb_desc, uint16_t *tx_rs_thresh,
+			 uint16_t *tx_free_thresh, uint16_t idx)
+{
+#define HNS3_TX_RS_FREE_THRESH_GAP	8
+	uint16_t rs_thresh, free_thresh, fast_free_thresh;
+
+	if (nb_desc > HNS3_MAX_RING_DESC || nb_desc < HNS3_MIN_RING_DESC ||
+	    nb_desc % HNS3_ALIGN_RING_DESC) {
+		hns3_err(hw, "number (%u) of tx descriptors is invalid",
+			 nb_desc);
+		return -EINVAL;
+	}
+
+	rs_thresh = (conf->tx_rs_thresh > 0) ?
+			conf->tx_rs_thresh : HNS3_DEFAULT_TX_RS_THRESH;
+	free_thresh = (conf->tx_free_thresh > 0) ?
+			conf->tx_free_thresh : HNS3_DEFAULT_TX_FREE_THRESH;
+	if (rs_thresh + free_thresh > nb_desc || nb_desc % rs_thresh ||
+	    rs_thresh >= nb_desc - HNS3_TX_RS_FREE_THRESH_GAP ||
+	    free_thresh >= nb_desc - HNS3_TX_RS_FREE_THRESH_GAP) {
+		hns3_err(hw, "tx_rs_thresh (%d) tx_free_thresh (%d) nb_desc "
+			 "(%d) of tx descriptors for port=%d queue=%d check "
+			 "fail!",
+			 rs_thresh, free_thresh, nb_desc, hw->data->port_id,
+			 idx);
+		return -EINVAL;
+	}
+
+	if (conf->tx_free_thresh == 0) {
+		/* Fast free Tx memory buffer to improve cache hit rate */
+		fast_free_thresh = nb_desc - rs_thresh;
+		if (fast_free_thresh >=
+		    HNS3_TX_FAST_FREE_AHEAD + HNS3_DEFAULT_TX_FREE_THRESH)
+			free_thresh = fast_free_thresh -
+					HNS3_TX_FAST_FREE_AHEAD;
+	}
+
+	*tx_rs_thresh = rs_thresh;
+	*tx_free_thresh = free_thresh;
+	return 0;
+}
+
 int
 hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_txconf *conf)
 {
 	struct hns3_adapter *hns = dev->data->dev_private;
+	uint16_t tx_rs_thresh, tx_free_thresh;
 	struct hns3_hw *hw = &hns->hw;
 	struct hns3_queue_info q_info;
 	struct hns3_tx_queue *txq;
 	int tx_entry_len;
+	int ret;
 
 	if (dev->data->dev_started) {
 		hns3_err(hw, "tx_queue_setup after dev_start no supported");
 		return -EINVAL;
 	}
 
-	if (nb_desc > HNS3_MAX_RING_DESC || nb_desc < HNS3_MIN_RING_DESC ||
-	    nb_desc % HNS3_ALIGN_RING_DESC) {
-		hns3_err(hw, "Number (%u) of tx descriptors is invalid",
-			    nb_desc);
-		return -EINVAL;
-	}
+	ret = hns3_tx_queue_conf_check(hw, conf, nb_desc,
+				       &tx_rs_thresh, &tx_free_thresh, idx);
+	if (ret)
+		return ret;
 
 	if (dev->data->tx_queues[idx] != NULL) {
 		hns3_tx_queue_release(dev->data->tx_queues[idx]);
@@ -2005,11 +2050,15 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	txq->next_to_use = 0;
 	txq->next_to_clean = 0;
 	txq->tx_bd_ready = txq->nb_tx_desc - 1;
+	txq->tx_free_thresh = tx_free_thresh;
+	txq->tx_rs_thresh = tx_rs_thresh;
 	txq->port_id = dev->data->port_id;
 	txq->pvid_state = hw->port_base_vlan_cfg.state;
 	txq->configured = true;
 	txq->io_base = (void *)((char *)hw->io_base + HNS3_TQP_REG_OFFSET +
 				idx * HNS3_TQP_REG_SIZE);
+	txq->io_tail_reg = (volatile void *)((char *)txq->io_base +
+					     HNS3_RING_TX_TAIL_REG);
 	txq->min_tx_pkt_len = hw->min_tx_pkt_len;
 	txq->over_length_pkt_cnt = 0;
 	txq->exceed_limit_bd_pkt_cnt = 0;
@@ -2024,12 +2073,6 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	return 0;
 }
 
-static inline void
-hns3_queue_xmit(struct hns3_tx_queue *txq, uint32_t buf_num)
-{
-	hns3_write_dev(txq, HNS3_RING_TX_TAIL_REG, buf_num);
-}
-
 static void
 hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 {
@@ -2798,6 +2841,154 @@ hns3_check_non_tso_pkt(uint16_t nb_buf, struct rte_mbuf **m_seg,
 	return 0;
 }
 
+static inline void
+hns3_tx_free_buffer_simple(struct hns3_tx_queue *txq)
+{
+	struct hns3_entry *tx_entry;
+	struct hns3_desc *desc;
+	uint16_t tx_next_clean;
+	int i;
+
+	while (1) {
+		if (HNS3_GET_TX_QUEUE_PEND_BD_NUM(txq) < txq->tx_rs_thresh)
+			break;
+
+		/*
+		 * All mbufs can be released only when the VLD bits of all
+		 * descriptors in a batch are cleared.
+		 */
+		tx_next_clean = (txq->next_to_clean + txq->tx_rs_thresh - 1) %
+				txq->nb_tx_desc;
+		desc = &txq->tx_ring[tx_next_clean];
+		for (i = 0; i < txq->tx_rs_thresh; i++) {
+			if (rte_le_to_cpu_16(desc->tx.tp_fe_sc_vld_ra_ri) &
+					BIT(HNS3_TXD_VLD_B))
+				return;
+			desc--;
+		}
+
+		tx_entry = &txq->sw_ring[txq->next_to_clean];
+
+		for (i = 0; i < txq->tx_rs_thresh; i++)
+			rte_prefetch0((tx_entry + i)->mbuf);
+		for (i = 0; i < txq->tx_rs_thresh; i++, tx_entry++) {
+			rte_mempool_put(tx_entry->mbuf->pool, tx_entry->mbuf);
+			tx_entry->mbuf = NULL;
+		}
+
+		txq->next_to_clean = (tx_next_clean + 1) % txq->nb_tx_desc;
+		txq->tx_bd_ready += txq->tx_rs_thresh;
+	}
+}
+
+static inline void
+hns3_tx_backup_1mbuf(struct hns3_entry *tx_entry, struct rte_mbuf **pkts)
+{
+	tx_entry->mbuf = pkts[0];
+}
+
+static inline void
+hns3_tx_backup_4mbuf(struct hns3_entry *tx_entry, struct rte_mbuf **pkts)
+{
+	hns3_tx_backup_1mbuf(&tx_entry[0], &pkts[0]);
+	hns3_tx_backup_1mbuf(&tx_entry[1], &pkts[1]);
+	hns3_tx_backup_1mbuf(&tx_entry[2], &pkts[2]);
+	hns3_tx_backup_1mbuf(&tx_entry[3], &pkts[3]);
+}
+
+static inline void
+hns3_tx_setup_4bd(struct hns3_desc *txdp, struct rte_mbuf **pkts)
+{
+#define PER_LOOP_NUM	4
+	const uint16_t bd_flag = BIT(HNS3_TXD_VLD_B) | BIT(HNS3_TXD_FE_B);
+	uint64_t dma_addr;
+	uint32_t i;
+
+	for (i = 0; i < PER_LOOP_NUM; i++, txdp++, pkts++) {
+		dma_addr = rte_mbuf_data_iova(*pkts);
+		txdp->addr = rte_cpu_to_le_64(dma_addr);
+		txdp->tx.send_size = rte_cpu_to_le_16((*pkts)->data_len);
+		txdp->tx.paylen = 0;
+		txdp->tx.type_cs_vlan_tso_len = 0;
+		txdp->tx.ol_type_vlan_len_msec = 0;
+		txdp->tx.tp_fe_sc_vld_ra_ri = rte_cpu_to_le_16(bd_flag);
+	}
+}
+
+static inline void
+hns3_tx_setup_1bd(struct hns3_desc *txdp, struct rte_mbuf **pkts)
+{
+	const uint16_t bd_flag = BIT(HNS3_TXD_VLD_B) | BIT(HNS3_TXD_FE_B);
+	uint64_t dma_addr;
+
+	dma_addr = rte_mbuf_data_iova(*pkts);
+	txdp->addr = rte_cpu_to_le_64(dma_addr);
+	txdp->tx.send_size = rte_cpu_to_le_16((*pkts)->data_len);
+	txdp->tx.paylen = 0;
+	txdp->tx.type_cs_vlan_tso_len = 0;
+	txdp->tx.ol_type_vlan_len_msec = 0;
+	txdp->tx.tp_fe_sc_vld_ra_ri = rte_cpu_to_le_16(bd_flag);
+}
+
+static inline void
+hns3_tx_fill_hw_ring(struct hns3_tx_queue *txq,
+		     struct rte_mbuf **pkts,
+		     uint16_t nb_pkts)
+{
+#define PER_LOOP_NUM	4
+#define PER_LOOP_MASK	(PER_LOOP_NUM - 1)
+	struct hns3_desc *txdp = &txq->tx_ring[txq->next_to_use];
+	struct hns3_entry *tx_entry = &txq->sw_ring[txq->next_to_use];
+	const uint32_t mainpart = (nb_pkts & ((uint32_t)~PER_LOOP_MASK));
+	const uint32_t leftover = (nb_pkts & ((uint32_t)PER_LOOP_MASK));
+	uint32_t i;
+
+	for (i = 0; i < mainpart; i += PER_LOOP_NUM) {
+		hns3_tx_backup_4mbuf(tx_entry + i, pkts + i);
+		hns3_tx_setup_4bd(txdp + i, pkts + i);
+	}
+	if (unlikely(leftover > 0)) {
+		for (i = 0; i < leftover; i++) {
+			hns3_tx_backup_1mbuf(tx_entry + mainpart + i,
+					     pkts + mainpart + i);
+			hns3_tx_setup_1bd(txdp + mainpart + i,
+					  pkts + mainpart + i);
+		}
+	}
+}
+
+uint16_t
+hns3_xmit_pkts_simple(void *tx_queue,
+		      struct rte_mbuf **tx_pkts,
+		      uint16_t nb_pkts)
+{
+	struct hns3_tx_queue *txq = tx_queue;
+	uint16_t nb_tx = 0;
+
+	hns3_tx_free_buffer_simple(txq);
+
+	nb_pkts = RTE_MIN(txq->tx_bd_ready, nb_pkts);
+	if (unlikely(nb_pkts == 0)) {
+		if (txq->tx_bd_ready == 0)
+			txq->queue_full_cnt++;
+		return 0;
+	}
+
+	txq->tx_bd_ready -= nb_pkts;
+	if (txq->next_to_use + nb_pkts > txq->nb_tx_desc) {
+		nb_tx = txq->nb_tx_desc - txq->next_to_use;
+		hns3_tx_fill_hw_ring(txq, tx_pkts, nb_tx);
+		txq->next_to_use = 0;
+	}
+
+	hns3_tx_fill_hw_ring(txq, tx_pkts + nb_tx, nb_pkts - nb_tx);
+	txq->next_to_use += nb_pkts - nb_tx;
+
+	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
+
+	return nb_pkts;
+}
+
 uint16_t
 hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 {
@@ -2909,11 +3100,47 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 end_of_tx:
 
 	if (likely(nb_tx))
-		hns3_queue_xmit(txq, nb_hold);
+		hns3_write_reg_opt(txq->io_tail_reg, nb_hold);
 
 	return nb_tx;
 }
 
+int
+hns3_tx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
+		       struct rte_eth_burst_mode *mode)
+{
+	eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
+	const char *info = NULL;
+
+	if (pkt_burst == hns3_xmit_pkts_simple)
+		info = "Scalar Simple";
+	else if (pkt_burst == hns3_xmit_pkts)
+		info = "Scalar";
+
+	if (info == NULL)
+		return -EINVAL;
+
+	snprintf(mode->info, sizeof(mode->info), "%s", info);
+
+	return 0;
+}
+
+static eth_tx_burst_t
+hns3_get_tx_function(struct rte_eth_dev *dev, eth_tx_prep_t *prep)
+{
+	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
+	struct hns3_adapter *hns = dev->data->dev_private;
+
+	if (hns->tx_simple_allowed &&
+	    offloads == (offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE)) {
+		*prep = NULL;
+		return hns3_xmit_pkts_simple;
+	}
+
+	*prep = hns3_prep_pkts;
+	return hns3_xmit_pkts;
+}
+
 static uint16_t
 hns3_dummy_rxtx_burst(void *dpdk_txq __rte_unused,
 		      struct rte_mbuf **pkts __rte_unused,
@@ -2925,12 +3152,13 @@ hns3_dummy_rxtx_burst(void *dpdk_txq __rte_unused,
 void hns3_set_rxtx_function(struct rte_eth_dev *eth_dev)
 {
 	struct hns3_adapter *hns = eth_dev->data->dev_private;
+	eth_tx_prep_t prep = NULL;
 
 	if (hns->hw.adapter_state == HNS3_NIC_STARTED &&
 	    rte_atomic16_read(&hns->hw.reset.resetting) == 0) {
 		eth_dev->rx_pkt_burst = hns3_get_rx_function(eth_dev);
-		eth_dev->tx_pkt_burst = hns3_xmit_pkts;
-		eth_dev->tx_pkt_prepare = hns3_prep_pkts;
+		eth_dev->tx_pkt_burst = hns3_get_tx_function(eth_dev, &prep);
+		eth_dev->tx_pkt_prepare = prep;
 	} else {
 		eth_dev->rx_pkt_burst = hns3_dummy_rxtx_burst;
 		eth_dev->tx_pkt_burst = hns3_dummy_rxtx_burst;
@@ -2966,5 +3194,7 @@ hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 
 	qinfo->nb_desc = txq->nb_tx_desc;
 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
+	qinfo->conf.tx_rs_thresh = txq->tx_rs_thresh;
+	qinfo->conf.tx_free_thresh = txq->tx_free_thresh;
 	qinfo->conf.tx_deferred_start = txq->tx_deferred_start;
 }
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index 3d3f0a0..9933494 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -13,6 +13,9 @@
 #define HNS3_BULK_ALLOC_MBUF_NUM	32
 
 #define HNS3_DEFAULT_RX_FREE_THRESH	32
+#define HNS3_DEFAULT_TX_FREE_THRESH	32
+#define HNS3_DEFAULT_TX_RS_THRESH	32
+#define HNS3_TX_FAST_FREE_AHEAD		64
 
 #define HNS3_512_BD_BUF_SIZE	512
 #define HNS3_1K_BD_BUF_SIZE	1024
@@ -282,6 +285,7 @@ struct hns3_rx_queue {
 
 struct hns3_tx_queue {
 	void *io_base;
+	volatile void *io_tail_reg;
 	struct hns3_adapter *hns;
 	struct hns3_desc *tx_ring;
 	uint64_t tx_ring_phys_addr; /* TX ring DMA address */
@@ -291,10 +295,32 @@ struct hns3_tx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t nb_tx_desc;
+	/*
+	 * index of next BD whose corresponding rte_mbuf can be released by
+	 * driver.
+	 */
 	uint16_t next_to_clean;
+	/* index of next BD to be filled by driver to send packet */
 	uint16_t next_to_use;
+	/* num of remaining BDs ready to be filled by driver to send packet */
 	uint16_t tx_bd_ready;
 
+	/* free Tx buffers if available BDs are fewer than this value */
+	uint16_t tx_free_thresh;
+
+	/*
+	 * For better performance in tx datapath, releasing mbuf in batches is
+	 * required.
+	 * Checking only the VLD bit of the last descriptor in a batch of
+	 * tx_rs_thresh descriptors does not guarantee that hardware has
+	 * successfully sent all of them. So the VLD bits of all descriptors
+	 * in the batch must be cleared before the mbufs are freed.
+	 * - tx_rs_thresh
+	 *   Number of mbufs released at a time.
+
+	 */
+	uint16_t tx_rs_thresh;
+
 	/*
 	 * port based vlan configuration state.
 	 * value range: HNS3_PORT_BASE_VLAN_DISABLE / HNS3_PORT_BASE_VLAN_ENABLE
@@ -360,6 +386,9 @@ struct hns3_tx_queue {
 	uint64_t pkt_padding_fail_cnt;
 };
 
+#define HNS3_GET_TX_QUEUE_PEND_BD_NUM(txq) \
+		((txq)->nb_tx_desc - 1 - (txq)->tx_bd_ready)
+
 struct hns3_queue_info {
 	const char *type;   /* point to queue memory name */
 	const char *ring_name;  /* point to hardware ring name */
@@ -525,8 +554,13 @@ int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
 			   struct rte_eth_burst_mode *mode);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
+uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
+			       uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
+int hns3_tx_burst_mode_get(struct rte_eth_dev *dev,
+			   __rte_unused uint16_t queue_id,
+			   struct rte_eth_burst_mode *mode);
 const uint32_t *hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev);
 void hns3_init_rx_ptype_tble(struct rte_eth_dev *dev);
 void hns3_set_rxtx_function(struct rte_eth_dev *eth_dev);
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH 5/8] net/hns3: add vector Tx burst with NEON instructions
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                   ` (3 preceding siblings ...)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 4/8] net/hns3: add simple Tx " Wei Hu (Xavier)
@ 2020-09-07  9:08 ` Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 6/8] net/hns3: add vector Rx " Wei Hu (Xavier)
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds NEON vector instructions to optimize Tx burst process.
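
As a rough, illustrative sketch (not part of the patch -- the helper name,
port_id and nb_txd are placeholders), the NEON path added here is only
selected when the Tx offloads are exactly DEV_TX_OFFLOAD_MBUF_FAST_FREE, as
checked by hns3_tx_check_vec_support() below; make builds on ARM64 also need
CONFIG_RTE_LIBRTE_HNS3_INC_VECTOR enabled (switched on in common_linux by
this patch):

#include <rte_ethdev.h>

static int
configure_tx_for_neon(uint16_t port_id, uint16_t nb_txd)
{
	struct rte_eth_conf conf = {
		.txmode = { .offloads = DEV_TX_OFFLOAD_MBUF_FAST_FREE },
	};
	int ret;

	ret = rte_eth_dev_configure(port_id, 1, 1, &conf);
	if (ret != 0)
		return ret;

	/* NULL txconf keeps the driver defaults, e.g. tx_rs_thresh and
	 * tx_free_thresh of 32, which the burst functions rely on.
	 */
	return rte_eth_tx_queue_setup(port_id, 0, nb_txd,
				      rte_eth_dev_socket_id(port_id), NULL);
}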

Signed-off-by: Huisong Li <lihuisong@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 config/common_base                    |  1 +
 config/common_linux                   |  1 +
 drivers/net/hns3/Makefile             |  5 +++
 drivers/net/hns3/hns3_ethdev.c        |  2 +
 drivers/net/hns3/hns3_ethdev.h        |  2 +
 drivers/net/hns3/hns3_ethdev_vf.c     |  2 +
 drivers/net/hns3/hns3_rxtx.c          | 33 ++++++++++++++
 drivers/net/hns3/hns3_rxtx.h          | 20 ++++++++-
 drivers/net/hns3/hns3_rxtx_vec.c      | 47 ++++++++++++++++++++
 drivers/net/hns3/hns3_rxtx_vec.h      | 57 ++++++++++++++++++++++++
 drivers/net/hns3/hns3_rxtx_vec_neon.h | 81 +++++++++++++++++++++++++++++++++++
 drivers/net/hns3/meson.build          |  4 ++
 12 files changed, 254 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec.c
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec.h
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec_neon.h

diff --git a/config/common_base b/config/common_base
index fbf0ee7..af1dea6 100644
--- a/config/common_base
+++ b/config/common_base
@@ -292,6 +292,7 @@ CONFIG_RTE_LIBRTE_HINIC_PMD=n
 # Compile burst-oriented HNS3 PMD driver
 #
 CONFIG_RTE_LIBRTE_HNS3_PMD=n
+CONFIG_RTE_LIBRTE_HNS3_INC_VECTOR=n
 
 #
 # Compile Pensando IONIC PMD driver
diff --git a/config/common_linux b/config/common_linux
index 8168106..e88a404 100644
--- a/config/common_linux
+++ b/config/common_linux
@@ -66,3 +66,4 @@ CONFIG_RTE_LIBRTE_HINIC_PMD=y
 # Hisilicon HNS3 PMD driver
 #
 CONFIG_RTE_LIBRTE_HNS3_PMD=y
+CONFIG_RTE_LIBRTE_HNS3_INC_VECTOR=y
diff --git a/drivers/net/hns3/Makefile b/drivers/net/hns3/Makefile
index d7798a4..d08d8fa 100644
--- a/drivers/net/hns3/Makefile
+++ b/drivers/net/hns3/Makefile
@@ -30,6 +30,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_HNS3_PMD) += hns3_ethdev_vf.c
 SRCS-$(CONFIG_RTE_LIBRTE_HNS3_PMD) += hns3_cmd.c
 SRCS-$(CONFIG_RTE_LIBRTE_HNS3_PMD) += hns3_mbx.c
 SRCS-$(CONFIG_RTE_LIBRTE_HNS3_PMD) += hns3_rxtx.c
+
+ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
+SRCS-$(CONFIG_RTE_LIBRTE_HNS3_INC_VECTOR) += hns3_rxtx_vec.c
+endif
+
 SRCS-$(CONFIG_RTE_LIBRTE_HNS3_PMD) += hns3_rss.c
 SRCS-$(CONFIG_RTE_LIBRTE_HNS3_PMD) += hns3_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_HNS3_PMD) += hns3_fdir.c
diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 8701994..68239f5 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2353,6 +2353,8 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 
 	hns->rx_simple_allowed = true;
 	hns->tx_simple_allowed = true;
+	hns->tx_vec_allowed = true;
+
 	hns3_init_rx_ptype_tble(dev);
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index ef85034..098b6ce 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -644,6 +644,8 @@ struct hns3_adapter {
 
 	bool rx_simple_allowed;
 	bool tx_simple_allowed;
+	bool tx_vec_allowed;
+
 	struct hns3_ptype_table ptype_tbl __rte_cache_min_aligned;
 };
 
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 915b896..f3e6aea 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -823,6 +823,8 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 
 	hns->rx_simple_allowed = true;
 	hns->tx_simple_allowed = true;
+	hns->tx_vec_allowed = true;
+
 	hns3_init_rx_ptype_tble(dev);
 
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 08a3dcd..a537fbe 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -95,6 +95,8 @@ hns3_tx_queue_release(void *queue)
 			rte_memzone_free(txq->mz);
 		if (txq->sw_ring)
 			rte_free(txq->sw_ring);
+		if (txq->free)
+			rte_free(txq->free);
 		rte_free(txq);
 	}
 }
@@ -1020,6 +1022,7 @@ hns3_fake_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 
 	/* Don't need alloc sw_ring, because upper applications don't use it */
 	txq->sw_ring = NULL;
+	txq->free = NULL;
 
 	txq->hns = hns;
 	txq->tx_deferred_start = false;
@@ -2052,6 +2055,15 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	txq->tx_bd_ready = txq->nb_tx_desc - 1;
 	txq->tx_free_thresh = tx_free_thresh;
 	txq->tx_rs_thresh = tx_rs_thresh;
+	txq->free = rte_zmalloc_socket("hns3 TX mbuf free array",
+				sizeof(struct rte_mbuf *) * txq->tx_rs_thresh,
+				RTE_CACHE_LINE_SIZE, socket_id);
+	if (!txq->free) {
+		hns3_err(hw, "failed to allocate tx mbuf free array!");
+		hns3_tx_queue_release(txq);
+		return -ENOMEM;
+	}
+
 	txq->port_id = dev->data->port_id;
 	txq->pvid_state = hw->port_base_vlan_cfg.state;
 	txq->configured = true;
@@ -3105,6 +3117,20 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	return nb_tx;
 }
 
+int __rte_weak
+hns3_tx_check_vec_support(__rte_unused struct rte_eth_dev *dev)
+{
+	return -ENOTSUP;
+}
+
+uint16_t __rte_weak
+hns3_xmit_pkts_vec(__rte_unused void *tx_queue,
+		   __rte_unused struct rte_mbuf **tx_pkts,
+		   __rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
+
 int
 hns3_tx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 		       struct rte_eth_burst_mode *mode)
@@ -3116,6 +3142,8 @@ hns3_tx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 		info = "Scalar Simple";
 	else if (pkt_burst == hns3_xmit_pkts)
 		info = "Scalar";
+	else if (pkt_burst == hns3_xmit_pkts_vec)
+		info = "Vector Neon";
 
 	if (info == NULL)
 		return -EINVAL;
@@ -3131,6 +3159,11 @@ hns3_get_tx_function(struct rte_eth_dev *dev, eth_tx_prep_t *prep)
 	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
 	struct hns3_adapter *hns = dev->data->dev_private;
 
+	if (hns->tx_vec_allowed && hns3_tx_check_vec_support(dev) == 0) {
+		*prep = NULL;
+		return hns3_xmit_pkts_vec;
+	}
+
 	if (hns->tx_simple_allowed &&
 	    offloads == (offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE)) {
 		*prep = NULL;
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index 9933494..c5a510b 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -17,6 +17,10 @@
 #define HNS3_DEFAULT_TX_RS_THRESH	32
 #define HNS3_TX_FAST_FREE_AHEAD		64
 
+#define HNS3_UINT8_BIT			8
+#define HNS3_UINT16_BIT			16
+#define HNS3_UINT32_BIT			32
+
 #define HNS3_512_BD_BUF_SIZE	512
 #define HNS3_1K_BD_BUF_SIZE	1024
 #define HNS3_2K_BD_BUF_SIZE	2048
@@ -132,6 +136,13 @@
 #define HNS3_L3_LEN_UNIT			2UL
 #define HNS3_L4_LEN_UNIT			2UL
 
+#define HNS3_TXD_DEFAULT_BDTYPE		0
+#define HNS3_TXD_VLD_CMD		(0x1 << HNS3_TXD_VLD_B)
+#define HNS3_TXD_FE_CMD			(0x1 << HNS3_TXD_FE_B)
+#define HNS3_TXD_DEFAULT_VLD_FE_BDTYPE		\
+		(HNS3_TXD_VLD_CMD | HNS3_TXD_FE_CMD | HNS3_TXD_DEFAULT_BDTYPE)
+#define HNS3_TXD_SEND_SIZE_SHIFT	16
+
 enum hns3_pkt_l2t_type {
 	HNS3_L2_TYPE_UNICAST,
 	HNS3_L2_TYPE_MULTICAST,
@@ -317,9 +328,13 @@ struct hns3_tx_queue {
 	 * in the batch must be cleared before the mbufs are freed.
 	 * - tx_rs_thresh
 	 *   Number of mbufs released at a time.
-
+	 *
+	 * - free
+	 *   Tx mbuf free array used for preserving temporarily address of mbuf
+	 *   released back to mempool, when releasing mbuf in batches.
 	 */
 	uint16_t tx_rs_thresh;
+	struct rte_mbuf **free;
 
 	/*
 	 * port based vlan configuration state.
@@ -558,6 +573,8 @@ uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 			       uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
+uint16_t hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+							uint16_t nb_pkts);
 int hns3_tx_burst_mode_get(struct rte_eth_dev *dev,
 			   __rte_unused uint16_t queue_id,
 			   struct rte_eth_burst_mode *mode);
@@ -577,6 +594,7 @@ int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
 void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
 void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
+int hns3_tx_check_vec_support(struct rte_eth_dev *dev);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
diff --git a/drivers/net/hns3/hns3_rxtx_vec.c b/drivers/net/hns3/hns3_rxtx_vec.c
new file mode 100644
index 0000000..1154b6f
--- /dev/null
+++ b/drivers/net/hns3/hns3_rxtx_vec.c
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Hisilicon Limited.
+ */
+
+#include <rte_io.h>
+#include <rte_ethdev_driver.h>
+
+#include "hns3_ethdev.h"
+#include "hns3_rxtx.h"
+#include "hns3_rxtx_vec.h"
+
+#if defined RTE_ARCH_ARM64
+#include "hns3_rxtx_vec_neon.h"
+#endif
+
+int
+hns3_tx_check_vec_support(struct rte_eth_dev *dev)
+{
+	struct rte_eth_txmode *txmode = &dev->data->dev_conf.txmode;
+
+	/* Only support DEV_TX_OFFLOAD_MBUF_FAST_FREE */
+	if (txmode->offloads != DEV_TX_OFFLOAD_MBUF_FAST_FREE)
+		return -ENOTSUP;
+
+	return 0;
+}
+
+uint16_t
+hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
+	uint16_t nb_tx = 0;
+
+	while (nb_pkts) {
+		uint16_t ret, new_burst;
+
+		new_burst = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = hns3_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx],
+						new_burst);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < new_burst)
+			break;
+	}
+
+	return nb_tx;
+}
diff --git a/drivers/net/hns3/hns3_rxtx_vec.h b/drivers/net/hns3/hns3_rxtx_vec.h
new file mode 100644
index 0000000..90679bf
--- /dev/null
+++ b/drivers/net/hns3/hns3_rxtx_vec.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Hisilicon Limited.
+ */
+
+#ifndef _HNS3_RXTX_VEC_H_
+#define _HNS3_RXTX_VEC_H_
+
+#include "hns3_rxtx.h"
+#include "hns3_ethdev.h"
+
+static inline void
+hns3_tx_free_buffers(struct hns3_tx_queue *txq)
+{
+	struct rte_mbuf **free = txq->free;
+	struct hns3_entry *tx_entry;
+	struct hns3_desc *tx_desc;
+	struct rte_mbuf *m;
+	int nb_free = 0;
+	int i;
+
+	/*
+	 * All mbufs can be released only when the VLD bits of all
+	 * descriptors in a batch are cleared.
+	 */
+	tx_desc = &txq->tx_ring[txq->next_to_clean];
+	for (i = 0; i < txq->tx_rs_thresh; i++, tx_desc++) {
+		if (tx_desc->tx.tp_fe_sc_vld_ra_ri &
+				rte_le_to_cpu_16(BIT(HNS3_TXD_VLD_B)))
+			return;
+	}
+
+	tx_entry = &txq->sw_ring[txq->next_to_clean];
+	for (i = 0; i < txq->tx_rs_thresh; i++, tx_entry++) {
+		m = rte_pktmbuf_prefree_seg(tx_entry->mbuf);
+		tx_entry->mbuf = NULL;
+
+		if (m == NULL)
+			continue;
+
+		if (nb_free && m->pool != free[0]->pool) {
+			rte_mempool_put_bulk(free[0]->pool, (void **)free,
+					     nb_free);
+			nb_free = 0;
+		}
+		free[nb_free++] = m;
+	}
+
+	if (nb_free)
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+
+	/* Update numbers of available descriptor due to buffer freed */
+	txq->tx_bd_ready += txq->tx_rs_thresh;
+	txq->next_to_clean += txq->tx_rs_thresh;
+	if (txq->next_to_clean >= txq->nb_tx_desc)
+		txq->next_to_clean = 0;
+}
+#endif /* _HNS3_RXTX_VEC_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
new file mode 100644
index 0000000..2bd2b35
--- /dev/null
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Hisilicon Limited.
+ */
+
+#ifndef _HNS3_RXTX_VEC_NEON_H_
+#define _HNS3_RXTX_VEC_NEON_H_
+
+#include <arm_neon.h>
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+static inline void
+hns3_vec_tx(volatile struct hns3_desc *desc, struct rte_mbuf *pkt)
+{
+	uint64x2_t val1 = { pkt->buf_iova + pkt->data_off,
+		((uint64_t)pkt->data_len) << HNS3_TXD_SEND_SIZE_SHIFT };
+	uint64x2_t val2 = { 0,
+		((uint64_t)HNS3_TXD_DEFAULT_VLD_FE_BDTYPE) << HNS3_UINT32_BIT };
+	vst1q_u64((uint64_t *)&desc->addr, val1);
+	vst1q_u64((uint64_t *)&desc->tx.outer_vlan_tag, val2);
+}
+
+static uint16_t
+hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
+			  struct rte_mbuf **__restrict tx_pkts,
+			  uint16_t nb_pkts)
+{
+	struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
+	volatile struct hns3_desc *tx_desc;
+	struct hns3_entry *tx_entry;
+	uint16_t next_to_use;
+	uint16_t nb_commit;
+	uint16_t nb_tx;
+	uint16_t n, i;
+
+	if (txq->tx_bd_ready < txq->tx_free_thresh)
+		hns3_tx_free_buffers(txq);
+
+	nb_commit = RTE_MIN(txq->tx_bd_ready, nb_pkts);
+	if (unlikely(nb_commit == 0)) {
+		txq->queue_full_cnt++;
+		return 0;
+	}
+	nb_tx = nb_commit;
+
+	next_to_use = txq->next_to_use;
+	tx_desc = &txq->tx_ring[next_to_use];
+	tx_entry = &txq->sw_ring[next_to_use];
+
+	/*
+	 * If nb_commit is larger than the number of descriptors between
+	 * next_to_use and the end of the ring, fill those n descriptors
+	 * first and then wrap around to the start of sw_ring and tx_ring.
+	 */
+	n = txq->nb_tx_desc - next_to_use;
+	if (nb_commit >= n) {
+		for (i = 0; i < n; i++, tx_pkts++, tx_desc++) {
+			hns3_vec_tx(tx_desc, *tx_pkts);
+			tx_entry[i].mbuf = *tx_pkts;
+		}
+
+		nb_commit -= n;
+		next_to_use = 0;
+		tx_desc = &txq->tx_ring[next_to_use];
+		tx_entry = &txq->sw_ring[next_to_use];
+	}
+
+	for (i = 0; i < nb_commit; i++, tx_pkts++, tx_desc++) {
+		hns3_vec_tx(tx_desc, *tx_pkts);
+		tx_entry[i].mbuf = *tx_pkts;
+	}
+
+	next_to_use += nb_commit;
+	txq->next_to_use = next_to_use;
+	txq->tx_bd_ready -= nb_tx;
+
+	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
+
+	return nb_tx;
+}
+#endif /* _HNS3_RXTX_VEC_NEON_H_ */
diff --git a/drivers/net/hns3/meson.build b/drivers/net/hns3/meson.build
index e01e6ce..19aee71 100644
--- a/drivers/net/hns3/meson.build
+++ b/drivers/net/hns3/meson.build
@@ -27,4 +27,8 @@ sources = files('hns3_cmd.c',
 	'hns3_stats.c',
 	'hns3_mp.c')
 
+if (dpdk_conf.has('RTE_ARCH_ARM64'))
+	sources += files('hns3_rxtx_vec.c')
+endif
+
 deps += ['hash']
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH 6/8] net/hns3: add vector Rx burst with NEON instructions
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                   ` (4 preceding siblings ...)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 5/8] net/hns3: add vector Tx burst with NEON instructions Wei Hu (Xavier)
@ 2020-09-07  9:08 ` Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 7/8] net/hns3: add restriction on setting VF MTU Wei Hu (Xavier)
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds NEON vector instructions to optimize Rx burst process.
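
As a rough, illustrative sketch (not part of the patch -- the helper name,
port_id and ring size are placeholders), the NEON Rx path added here is
skipped by hns3_rx_check_vec_support()/hns3_rxq_vec_check() below whenever
scattered Rx, LRO or VLAN Rx offloads or fdir are in use, or when the ring
size is not a multiple of the 64-BD rearm threshold:

#include <rte_ethdev.h>
#include <rte_mempool.h>

static int
setup_rx_for_neon(uint16_t port_id, struct rte_mempool *mb_pool)
{
	/* multiple of HNS3_DEFAULT_RXQ_REARM_THRESH (64) */
	const uint16_t nb_rxd = 1024;

	/* NULL rxconf keeps the driver defaults (rx_free_thresh of 32) */
	return rte_eth_rx_queue_setup(port_id, 0, nb_rxd,
				      rte_eth_dev_socket_id(port_id),
				      NULL, mb_pool);
}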

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c        |   1 +
 drivers/net/hns3/hns3_ethdev.h        |   1 +
 drivers/net/hns3/hns3_ethdev_vf.c     |   1 +
 drivers/net/hns3/hns3_rxtx.c          |  94 +++++++++++++++-
 drivers/net/hns3/hns3_rxtx.h          |  35 +++++-
 drivers/net/hns3/hns3_rxtx_vec.c      | 167 ++++++++++++++++++++++++++++
 drivers/net/hns3/hns3_rxtx_vec.h      |  20 ++++
 drivers/net/hns3/hns3_rxtx_vec_neon.h | 203 ++++++++++++++++++++++++++++++++++
 8 files changed, 514 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 68239f5..0727c6d 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2352,6 +2352,7 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->rx_vec_allowed = true;
 	hns->tx_simple_allowed = true;
 	hns->tx_vec_allowed = true;
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 098b6ce..fd6a9f9 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -643,6 +643,7 @@ struct hns3_adapter {
 	};
 
 	bool rx_simple_allowed;
+	bool rx_vec_allowed;
 	bool tx_simple_allowed;
 	bool tx_vec_allowed;
 
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index f3e6aea..93f2c93 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -822,6 +822,7 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->rx_vec_allowed = true;
 	hns->tx_simple_allowed = true;
 	hns->tx_vec_allowed = true;
 
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index a537fbe..03d69b1 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -41,9 +41,19 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
 	if (rxq->sw_ring == NULL)
 		return;
 
-	for (i = 0; i < rxq->nb_rx_desc; i++)
-		if (rxq->sw_ring[i].mbuf)
-			rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+	if (rxq->rx_rearm_nb == 0) {
+		for (i = 0; i < rxq->nb_rx_desc; i++) {
+			if (rxq->sw_ring[i].mbuf != NULL)
+				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+		}
+	} else {
+		for (i = rxq->next_to_use;
+		     i != rxq->rx_rearm_start;
+		     i = (i + 1) % rxq->nb_rx_desc) {
+			if (rxq->sw_ring[i].mbuf != NULL)
+				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+		}
+	}
 
 	for (i = 0; i < rxq->bulk_mbuf_num; i++)
 		rte_pktmbuf_free_seg(rxq->bulk_mbuf[i]);
@@ -661,10 +671,13 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	}
 
 	rxq->next_to_use = 0;
+	rxq->rx_rearm_start = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_nb = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
 	hns3_init_rx_queue_hw(rxq);
+	hns3_rxq_vec_setup(rxq);
 
 	return 0;
 }
@@ -678,6 +691,8 @@ hns3_fake_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
 	rxq->next_to_use = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_start = 0;
+	rxq->rx_rearm_nb = 0;
 	hns3_init_rx_queue_hw(rxq);
 }
 
@@ -860,6 +875,40 @@ hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue)
 	return 0;
 }
 
+/*
+ * Iterate over all Rx queues and call the callback() function for each of
+ * them.
+ *
+ * @param[in] dev
+ *   The target eth dev.
+ * @param[in] callback
+ *   The function to call for each queue.
+ *   If the callback returns nonzero, iteration stops and that value is returned.
+ * @param[in] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise.
+ */
+int
+hns3_rxq_iterate(struct rte_eth_dev *dev,
+		 int (*callback)(struct hns3_rx_queue *, void *), void *arg)
+{
+	uint32_t i;
+	int ret;
+
+	if (dev->data->rx_queues == NULL)
+		return -EINVAL;
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		ret = callback(dev->data->rx_queues[i], arg);
+		if (ret != 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 static void*
 hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
 			    struct hns3_queue_info *q_info)
@@ -880,7 +929,13 @@ hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
 	/* Allocate rx ring hardware descriptors. */
 	rxq->queue_id = q_info->idx;
 	rxq->nb_rx_desc = q_info->nb_desc;
-	rx_desc = rxq->nb_rx_desc * sizeof(struct hns3_desc);
+
+	/*
+	 * Allocate a little more memory because the Rx vector functions
+	 * don't check boundaries each time.
+	 */
+	rx_desc = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+			sizeof(struct hns3_desc);
 	rx_mz = rte_eth_dma_zone_reserve(dev, q_info->ring_name, q_info->idx,
 					 rx_desc, HNS3_RING_BASE_ALIGN,
 					 q_info->socket_id);
@@ -1329,7 +1384,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
 	rxq->rx_deferred_start = conf->rx_deferred_start;
 
-	rx_entry_len = sizeof(struct hns3_entry) * rxq->nb_rx_desc;
+	rx_entry_len = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+			sizeof(struct hns3_entry);
 	rxq->sw_ring = rte_zmalloc_socket("hns3 RX sw ring", rx_entry_len,
 					  RTE_CACHE_LINE_SIZE, socket_id);
 	if (rxq->sw_ring == NULL) {
@@ -1340,6 +1396,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 
 	rxq->next_to_use = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_start = 0;
+	rxq->rx_rearm_nb = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
 	rxq->port_id = dev->data->port_id;
@@ -1431,7 +1489,8 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 	};
 
 	if (dev->rx_pkt_burst == hns3_recv_pkts ||
-	    dev->rx_pkt_burst == hns3_recv_scattered_pkts)
+	    dev->rx_pkt_burst == hns3_recv_scattered_pkts ||
+	    dev->rx_pkt_burst == hns3_recv_pkts_vec)
 		return ptypes;
 
 	return NULL;
@@ -1915,6 +1974,25 @@ hns3_recv_scattered_pkts(void *rx_queue,
 	return nb_rx;
 }
 
+void __rte_weak
+hns3_rxq_vec_setup(__rte_unused struct hns3_rx_queue *rxq)
+{
+}
+
+int __rte_weak
+hns3_rx_check_vec_support(__rte_unused struct rte_eth_dev *dev)
+{
+	return -ENOTSUP;
+}
+
+uint16_t __rte_weak
+hns3_recv_pkts_vec(__rte_unused void *rx_queue,
+		   __rte_unused struct rte_mbuf **rx_pkts,
+		   __rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
+
 int
 hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 		       struct rte_eth_burst_mode *mode)
@@ -1925,6 +2003,7 @@ hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 	} burst_infos[] = {
 		{ hns3_recv_pkts,		"Scalar" },
 		{ hns3_recv_scattered_pkts,	"Scalar Scattered" },
+		{ hns3_recv_pkts_vec,		"Vector Neon" },
 	};
 
 	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
@@ -1949,6 +2028,9 @@ hns3_get_rx_function(struct rte_eth_dev *dev)
 	struct hns3_adapter *hns = dev->data->dev_private;
 	uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
 
+	if (hns->rx_vec_allowed && hns3_rx_check_vec_support(dev) == 0)
+		return hns3_recv_pkts_vec;
+
 	if (hns->rx_simple_allowed && !dev->data->scattered_rx &&
 	    (offloads & DEV_RX_OFFLOAD_TCP_LRO) == 0)
 		return hns3_recv_pkts;
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index c5a510b..a629be9 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -17,6 +17,18 @@
 #define HNS3_DEFAULT_TX_RS_THRESH	32
 #define HNS3_TX_FAST_FREE_AHEAD		64
 
+#define HNS3_DEFAULT_RX_BURST		32
+#if (HNS3_DEFAULT_RX_BURST > 64)
+#error "PMD HNS3: HNS3_DEFAULT_RX_BURST must be <= 64\n"
+#endif
+#define HNS3_DEFAULT_DESCS_PER_LOOP	4
+#define HNS3_SVE_DEFAULT_DESCS_PER_LOOP	8
+#if (HNS3_DEFAULT_DESCS_PER_LOOP > HNS3_SVE_DEFAULT_DESCS_PER_LOOP)
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN	HNS3_DEFAULT_DESCS_PER_LOOP
+#else
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN	HNS3_SVE_DEFAULT_DESCS_PER_LOOP
+#endif
+#define HNS3_DEFAULT_RXQ_REARM_THRESH	64
 #define HNS3_UINT8_BIT			8
 #define HNS3_UINT16_BIT			16
 #define HNS3_UINT32_BIT			32
@@ -236,7 +248,13 @@ struct hns3_desc {
 					uint16_t ot_vlan_tag;
 				};
 			};
-			uint32_t bd_base_info;
+			union {
+				uint32_t bd_base_info;
+				struct {
+					uint16_t bdtype_vld_udp0;
+					uint16_t fe_lum_crcp_l3l4p;
+				};
+			};
 		} rx;
 	};
 } __rte_packed;
@@ -270,7 +288,8 @@ struct hns3_rx_queue {
 	uint16_t rx_free_thresh;
 	uint16_t next_to_use;    /* index of next BD to be polled */
 	uint16_t rx_free_hold;   /* num of BDs waited to passed to hardware */
-
+	uint16_t rx_rearm_start; /* index of BD that driver re-arming from */
+	uint16_t rx_rearm_nb;    /* number of remaining BDs to be re-armed */
 	/*
 	 * port based vlan configuration state.
 	 * value range: HNS3_PORT_BASE_VLAN_DISABLE / HNS3_PORT_BASE_VLAN_ENABLE
@@ -292,6 +311,11 @@ struct hns3_rx_queue {
 
 	struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
 	uint16_t bulk_mbuf_num;
+
+	/* offset_table: used by vector Rx to cope with out-of-order desc reads */
+	uint8_t offset_table[HNS3_VECTOR_RX_OFFSET_TABLE_LEN + 1];
+	uint64_t mbuf_initializer; /* value to init mbufs used with vector rx */
+	struct rte_mbuf fake_mbuf; /* fake mbuf used with vector rx */
 };
 
 struct hns3_tx_queue {
@@ -554,6 +578,8 @@ int hns3_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id);
 void hns3_enable_all_queues(struct hns3_hw *hw, bool en);
 int hns3_start_queues(struct hns3_adapter *hns, bool reset_queue);
 int hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue);
+int hns3_rxq_iterate(struct rte_eth_dev *dev,
+		 int (*callback)(struct hns3_rx_queue *, void *), void *arg);
 void hns3_dev_release_mbufs(struct hns3_adapter *hns);
 int hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 			unsigned int socket, const struct rte_eth_rxconf *conf,
@@ -564,9 +590,12 @@ uint16_t hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			uint16_t nb_pkts);
 uint16_t hns3_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 				  uint16_t nb_pkts);
+uint16_t hns3_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts);
 int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
 			   __rte_unused uint16_t queue_id,
 			   struct rte_eth_burst_mode *mode);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -594,7 +623,9 @@ int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
 void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
 void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 int hns3_tx_check_vec_support(struct rte_eth_dev *dev);
+void hns3_rxq_vec_setup(struct hns3_rx_queue *rxq);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
diff --git a/drivers/net/hns3/hns3_rxtx_vec.c b/drivers/net/hns3/hns3_rxtx_vec.c
index 1154b6f..a26c83d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.c
+++ b/drivers/net/hns3/hns3_rxtx_vec.c
@@ -45,3 +45,170 @@ hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 	return nb_tx;
 }
+
+static inline void
+hns3_rxq_rearm_mbuf(struct hns3_rx_queue *rxq)
+{
+#define REARM_LOOP_STEP_NUM	4
+	struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
+	struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
+	uint64_t dma_addr;
+	int i;
+
+	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
+					  HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
+		return;
+	}
+
+	for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
+		rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
+		if (likely(i <
+			HNS3_DEFAULT_RXQ_REARM_THRESH - REARM_LOOP_STEP_NUM)) {
+			rte_prefetch_non_temporal(rxep[4].mbuf);
+			rte_prefetch_non_temporal(rxep[5].mbuf);
+			rte_prefetch_non_temporal(rxep[6].mbuf);
+			rte_prefetch_non_temporal(rxep[7].mbuf);
+		}
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[0].mbuf);
+		rxdp[0].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[0].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[1].mbuf);
+		rxdp[1].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[1].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[2].mbuf);
+		rxdp[2].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[2].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[3].mbuf);
+		rxdp[3].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[3].rx.bd_base_info = 0;
+	}
+
+	rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
+	if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
+		rxq->rx_rearm_start = 0;
+
+	rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
+
+	hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
+}
+
+uint16_t
+hns3_recv_pkts_vec(void *__restrict rx_queue,
+		   struct rte_mbuf **__restrict rx_pkts,
+		   uint16_t nb_pkts)
+{
+	struct hns3_rx_queue *rxq = rx_queue;
+	struct hns3_desc *rxdp = &rxq->rx_ring[rxq->next_to_use];
+	uint64_t bd_err_mask;  /* bit mask indicating which packets have errors */
+	uint16_t nb_rx;
+
+	nb_pkts = RTE_MIN(nb_pkts, HNS3_DEFAULT_RX_BURST);
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, HNS3_DEFAULT_DESCS_PER_LOOP);
+
+	rte_prefetch_non_temporal(rxdp);
+
+	if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
+		hns3_rxq_rearm_mbuf(rxq);
+
+	if (unlikely(!(rxdp->rx.bd_base_info &
+			rte_cpu_to_le_32(1u << HNS3_RXD_VLD_B))))
+		return 0;
+
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 0].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 1].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 2].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 3].mbuf);
+
+	bd_err_mask = 0;
+	nb_rx = hns3_recv_burst_vec(rxq, rx_pkts, nb_pkts, &bd_err_mask);
+	if (unlikely(bd_err_mask))
+		nb_rx = hns3_rx_reassemble_pkts(rx_pkts, nb_rx, bd_err_mask);
+
+	return nb_rx;
+}
+
+static void
+hns3_rxq_vec_setup_rearm_data(struct hns3_rx_queue *rxq)
+{
+	uintptr_t p;
+	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+	mb_def.nb_segs = 1;
+	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+	mb_def.port = rxq->port_id;
+	rte_mbuf_refcnt_set(&mb_def, 1);
+
+	/* prevent compiler reordering: rearm_data covers previous fields */
+	rte_compiler_barrier();
+	p = (uintptr_t)&mb_def.rearm_data;
+	rxq->mbuf_initializer = *(uint64_t *)p;
+}
+
+void
+hns3_rxq_vec_setup(struct hns3_rx_queue *rxq)
+{
+	struct hns3_entry *sw_ring = &rxq->sw_ring[rxq->nb_rx_desc];
+	unsigned int i;
+
+	memset(&rxq->rx_ring[rxq->nb_rx_desc], 0,
+		sizeof(struct hns3_desc) * HNS3_DEFAULT_RX_BURST);
+
+	memset(&rxq->fake_mbuf, 0, sizeof(rxq->fake_mbuf));
+	for (i = 0; i < HNS3_DEFAULT_RX_BURST; i++)
+		sw_ring[i].mbuf = &rxq->fake_mbuf;
+
+	hns3_rxq_vec_setup_rearm_data(rxq);
+
+	memset(rxq->offset_table, 0, sizeof(rxq->offset_table));
+}
+
+#ifndef RTE_LIBRTE_IEEE1588
+static int
+hns3_rxq_vec_check(struct hns3_rx_queue *rxq, void *arg)
+{
+	uint32_t min_vec_bds = HNS3_DEFAULT_RXQ_REARM_THRESH +
+				HNS3_DEFAULT_RX_BURST;
+
+	if (rxq->nb_rx_desc < min_vec_bds)
+		return -ENOTSUP;
+
+	if (rxq->nb_rx_desc % HNS3_DEFAULT_RXQ_REARM_THRESH)
+		return -ENOTSUP;
+
+	RTE_SET_USED(arg);
+	return 0;
+}
+#endif
+
+int
+hns3_rx_check_vec_support(struct rte_eth_dev *dev)
+{
+#ifndef RTE_LIBRTE_IEEE1588
+	struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
+	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+	uint64_t offloads_mask = DEV_RX_OFFLOAD_TCP_LRO |
+				 DEV_RX_OFFLOAD_VLAN;
+
+	if (dev->data->scattered_rx)
+		return -ENOTSUP;
+
+	if (fconf->mode != RTE_FDIR_MODE_NONE)
+		return -ENOTSUP;
+
+	if (rxmode->offloads & offloads_mask)
+		return -ENOTSUP;
+
+	if (hns3_rxq_iterate(dev, hns3_rxq_vec_check, NULL) != 0)
+		return -ENOTSUP;
+
+	return 0;
+#else
+	RTE_SET_USED(dev);
+	return -ENOTSUP;
+#endif
+}
diff --git a/drivers/net/hns3/hns3_rxtx_vec.h b/drivers/net/hns3/hns3_rxtx_vec.h
index 90679bf..c6df36d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.h
+++ b/drivers/net/hns3/hns3_rxtx_vec.h
@@ -54,4 +54,24 @@ hns3_tx_free_buffers(struct hns3_tx_queue *txq)
 	if (txq->next_to_clean >= txq->nb_tx_desc)
 		txq->next_to_clean = 0;
 }
+
+static inline uint16_t
+hns3_rx_reassemble_pkts(struct rte_mbuf **rx_pkts,
+			uint16_t nb_pkts,
+			uint64_t pkt_err_mask)
+{
+	uint16_t count, i;
+	uint64_t mask;
+
+	count = 0;
+	for (i = 0; i < nb_pkts; i++) {
+		mask = ((uint64_t)1u) << i;
+		if (pkt_err_mask & mask)
+			rte_pktmbuf_free_seg(rx_pkts[i]);
+		else
+			rx_pkts[count++] = rx_pkts[i];
+	}
+
+	return count;
+}
 #endif /* _HNS3_RXTX_VEC_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
index 2bd2b35..700ee8d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -78,4 +78,207 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
 
 	return nb_tx;
 }
+
+static inline uint32_t
+hns3_desc_parse_field(struct hns3_rx_queue *rxq,
+		      struct hns3_entry *sw_ring,
+		      struct hns3_desc *rxdp,
+		      uint32_t   bd_vld_num)
+{
+	uint32_t l234_info, ol_info, bd_base_info;
+	struct rte_mbuf *pkt;
+	uint32_t retcode = 0;
+	uint32_t cksum_err;
+	int ret, i;
+
+	for (i = 0; i < (int)bd_vld_num; i++) {
+		pkt = sw_ring[i].mbuf;
+
+		/* init rte_mbuf.rearm_data last 64-bit */
+		pkt->ol_flags = PKT_RX_RSS_HASH;
+
+		l234_info = rxdp[i].rx.l234_info;
+		ol_info = rxdp[i].rx.ol_info;
+		bd_base_info = rxdp[i].rx.bd_base_info;
+		ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
+					 l234_info, &cksum_err);
+		if (unlikely(ret)) {
+			retcode |= 1u << i;
+			continue;
+		}
+
+		pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+		if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+			hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
+					       cksum_err);
+	}
+
+	return retcode;
+}
+
+static inline uint16_t
+hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
+		    struct rte_mbuf **__restrict rx_pkts,
+		    uint16_t nb_pkts,
+		    uint64_t *bd_err_mask)
+{
+	uint16_t rx_id = rxq->next_to_use;
+	struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
+	struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
+	uint32_t bd_valid_num, parse_retcode;
+	uint16_t nb_rx = 0;
+	int pos, offset;
+
+	/* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
+	uint8x16_t shuf_rx_desc_fields_msk = {
+		0xff, 0xff, 0xff, 0xff,  /* packet type init zero */
+		22, 23, 0xff, 0xff,      /* rx.pkt_len to rte_mbuf.pkt_len */
+		20, 21,	                 /* size to rte_mbuf.data_len */
+		0xff, 0xff,	         /* rte_mbuf.vlan_tci init zero */
+		8, 9, 10, 11,	         /* rx.rss_hash to rte_mbuf.hash.rss */
+	};
+
+	uint16x8_t crc_adjust = {
+		0, 0,         /* ignore pkt_type field */
+		rxq->crc_len, /* sub crc on pkt_len */
+		0,            /* ignore high-16bits of pkt_len */
+		rxq->crc_len, /* sub crc on data_len */
+		0, 0, 0,      /* ignore non-length fields */
+	};
+
+	for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
+				     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
+		uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
+		uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
+		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		uint64x2_t mbp1, mbp2;
+		uint16x4_t bd_vld = {0};
+		uint16x8_t tmp;
+		uint64_t stat;
+
+		/* calculate how many BDs are valid */
+		bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
+		bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
+		bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
+		bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);
+
+		/* load 2 mbuf pointer */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+		bd_vld = vshl_n_u16(bd_vld,
+				    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
+		bd_vld = vreinterpret_u16_s16(
+				vshr_n_s16(vreinterpret_s16_u16(bd_vld),
+					   HNS3_UINT16_BIT - 1));
+		stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);
+
+		/* load 2 mbuf pointer again */
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		if (likely(stat == 0))
+			bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
+		else
+			bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
+		if (bd_valid_num == 0)
+			break;
+
+		/* use offset to control below data load oper ordering */
+		offset = rxq->offset_table[bd_valid_num];
+
+		/* store 2 mbuf pointer into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+		/* read first two descs */
+		descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
+		descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));
+
+		/* store 2 mbuf pointer into rx_pkts again */
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		/* read the remaining two descs */
+		descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
+		descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));
+
+		pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
+		pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
+		pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
+		pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);
+
+		/* pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_rx_desc_fields_msk);
+		pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_rx_desc_fields_msk);
+
+		/* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
+		*(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+		*(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+
+		/* pkt 1,2 remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+
+		pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
+		pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
+		pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
+		pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);
+
+		/* pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_rx_desc_fields_msk);
+		pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_rx_desc_fields_msk);
+
+		/* pkt 1,2 save to rx_pkts mbuf */
+		vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
+			 pkt_mb1);
+		vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
+			 pkt_mb2);
+
+		/* pkt 3,4 remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+		pkt_mb4 = vreinterpretq_u8_u16(tmp);
+
+		/* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
+		*(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+		*(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+
+		/* pkt 3,4 save to rx_pkts mbuf */
+		vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
+			 pkt_mb3);
+		vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
+			 pkt_mb4);
+
+		rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);
+
+		parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
+			&rxdp[offset], bd_valid_num);
+		if (unlikely(parse_retcode))
+			(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
+
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);
+
+		nb_rx += bd_valid_num;
+		if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
+			break;
+	}
+
+	rxq->rx_rearm_nb += nb_rx;
+	rxq->next_to_use += nb_rx;
+	if (rxq->next_to_use >= rxq->nb_rx_desc)
+		rxq->next_to_use = 0;
+
+	return nb_rx;
+}
 #endif /* _HNS3_RXTX_VEC_NEON_H_ */
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH 7/8] net/hns3: add restriction on setting VF MTU
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                   ` (5 preceding siblings ...)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 6/8] net/hns3: add vector Rx " Wei Hu (Xavier)
@ 2020-09-07  9:08 ` Wei Hu (Xavier)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 8/8] net/hns3: fix segfault when Tx multiple buffer packets Wei Hu (Xavier)
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

When Rx of scattered packets is off, the hns3 PMD driver may use the vector
Rx process function or the simple Rx function. If the MTU is increased so
that the maximum length of received packets exceeds the length of a single
Rx buffer, the hardware network engine has to use multiple BDs and buffers
to store such packets, which breaks the vector and simple Rx functions. So,
when Rx of scattered packets is off and the device is started, it is not
permitted to increase the MTU such that the maximum length of Rx packets is
greater than the Rx buffer length.
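
As a rough illustration (not part of the patch; port_id and the MTU value
are only placeholders), an application now sees the restriction like this:

ret = rte_eth_dev_start(port_id);          /* scattered Rx is off */
...
ret = rte_eth_dev_set_mtu(port_id, 9000);  /* frame > rx_buf_len */
/* ret is expected to be -EOPNOTSUPP once this patch is applied */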

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_ethdev_vf.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 93f2c93..44e51b5 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -871,6 +871,25 @@ hns3vf_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
 		return -EIO;
 	}
 
+	/*
+	 * When Rx of scattered packets is off, the hns3 PMD driver may use
+	 * the vector Rx process function or the simple Rx function to
+	 * receive packets. If the MTU is increased so that the maximum
+	 * length of received packets exceeds the length of a single buffer
+	 * for Rx packets, the hardware network engine has to use multiple
+	 * BDs and buffers to store such packets, which breaks the vector
+	 * and simple Rx functions. So, when Rx of scattered packets is off
+	 * and the device is started, it is not permitted to increase the
+	 * MTU such that the maximum length of Rx packets is greater than
+	 * the Rx buffer length.
+	 */
+	if (dev->data->dev_started && !dev->data->scattered_rx &&
+	    frame_size > hw->rx_buf_len) {
+		hns3_err(hw, "failed to set mtu: frame would exceed the Rx "
+			"buffer length while scattered Rx is off");
+		return -EOPNOTSUPP;
+	}
+
 	rte_spinlock_lock(&hw->lock);
 	ret = hns3vf_config_mtu(hw, mtu);
 	if (ret) {
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH 8/8] net/hns3: fix segfault when Tx multiple buffer packets
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                   ` (6 preceding siblings ...)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 7/8] net/hns3: add restriction on setting VF MTU Wei Hu (Xavier)
@ 2020-09-07  9:08 ` Wei Hu (Xavier)
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-07  9:08 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: Chengchang Tang <tangchengchang@huawei.com>

Currently, there is a possibility of segmentation faults when sending
packets whose payloads are stored in multiple buffers on the hns3 network
engine. The related core dump information is as follows:

Program terminated with signal 11, Segmentation fault.
0  hns3_reassemble_tx_pkts
2512                            temp = temp->next;
Missing separate debuginfos, use:
(gdb) bt
0  hns3_reassemble_tx_pkts
1  0x0000000000969c60 in hns3_check_non_tso_pkt
2  0x000000000096adbc in hns3_xmit_pkts
3  0x000000000050d4d0 in rte_eth_tx_burst
4  0x000000000050fca4 in pkt_burst_transmit
5  0x00000000004ca6b8 in run_pkt_fwd_on_lcore
6  0x00000000004ca7fc in start_pkt_forward_on_core
7  0x00000000006975a4 in eal_thread_loop
8  0x0000ffffa6f7fc48 in start_thread
9  0x0000ffffa6ed1600 in thread_start

The root cause is that the hns3 PMD driver invokes the rte_pktmbuf_free_seg
API function to release the same rte_mbuf multiple times. The rte_mbuf
pointer is not set to NULL in the internal function
hns3_rx_queue_release_mbufs, which is invoked during queue setup, stop and
close. As a result, the rte_mbufs in the Rx queues are released repeatedly
when the user application sets up queues or stops and starts the device
multiple times. Probably for performance reasons, the DPDK mempool library
does not check for repeated rte_mbuf releases: the addresses of released
rte_mbufs are stored directly into the per-lcore cache of the mempool, so
the same rte_mbuf can later be handed out more than once by the
rte_mempool_get_bulk API function. Ultimately, this leads to a NULL pointer
access in the PMD driver.
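
A minimal sketch of the failure pattern (not the driver code itself) and of
the fix applied below:

struct rte_mbuf *seg = rxq->sw_ring[i].mbuf;

rte_pktmbuf_free_seg(seg);   /* queue stop/close releases the segment */
/*
 * Without clearing the slot, the next stop/close walks sw_ring again and
 * frees 'seg' a second time, so a later rte_mempool_get_bulk() can return
 * the same address twice. The fix is simply:
 */
rxq->sw_ring[i].mbuf = NULL;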

This patch fixes the problem by setting the released mbuf pointers to NULL
in the internal function hns3_rx_queue_release_mbufs. The other internal
function hns3_reassemble_tx_pkts is also reworked to avoid a similar
problem.

Fixes: bba636698316 ("net/hns3: support Rx/Tx and related operations")
Cc: stable@dpdk.org

Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 drivers/net/hns3/hns3_rxtx.c | 61 +++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 38 deletions(-)

diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 03d69b1..1a1f828 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -43,15 +43,19 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
 
 	if (rxq->rx_rearm_nb == 0) {
 		for (i = 0; i < rxq->nb_rx_desc; i++) {
-			if (rxq->sw_ring[i].mbuf != NULL)
+			if (rxq->sw_ring[i].mbuf != NULL) {
 				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+				rxq->sw_ring[i].mbuf = NULL;
+			}
 		}
 	} else {
 		for (i = rxq->next_to_use;
 		     i != rxq->rx_rearm_start;
 		     i = (i + 1) % rxq->nb_rx_desc) {
-			if (rxq->sw_ring[i].mbuf != NULL)
+			if (rxq->sw_ring[i].mbuf != NULL) {
 				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+				rxq->sw_ring[i].mbuf = NULL;
+			}
 		}
 	}
 
@@ -2368,37 +2372,24 @@ hns3_fill_first_desc(struct hns3_tx_queue *txq, struct hns3_desc *desc,
 	}
 }
 
-static int
-hns3_tx_alloc_mbufs(struct hns3_tx_queue *txq, struct rte_mempool *mb_pool,
-		    uint16_t nb_new_buf, struct rte_mbuf **alloc_mbuf)
+static inline int
+hns3_tx_alloc_mbufs(struct rte_mempool *mb_pool, uint16_t nb_new_buf,
+			struct rte_mbuf **alloc_mbuf)
 {
-	struct rte_mbuf *new_mbuf = NULL;
-	struct rte_eth_dev *dev;
-	struct rte_mbuf *temp;
-	struct hns3_hw *hw;
+#define MAX_NON_TSO_BD_PER_PKT 18
+	struct rte_mbuf *pkt_segs[MAX_NON_TSO_BD_PER_PKT];
 	uint16_t i;
 
 	/* Allocate enough mbufs */
-	for (i = 0; i < nb_new_buf; i++) {
-		temp = rte_pktmbuf_alloc(mb_pool);
-		if (unlikely(temp == NULL)) {
-			dev = &rte_eth_devices[txq->port_id];
-			hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-			hns3_err(hw, "Failed to alloc TX mbuf port_id=%d,"
-				     "queue_id=%d in reassemble tx pkts.",
-				     txq->port_id, txq->queue_id);
-			rte_pktmbuf_free(new_mbuf);
-			return -ENOMEM;
-		}
-		temp->next = new_mbuf;
-		new_mbuf = temp;
-	}
-
-	if (new_mbuf == NULL)
+	if (rte_mempool_get_bulk(mb_pool, (void **)pkt_segs, nb_new_buf))
 		return -ENOMEM;
 
-	new_mbuf->nb_segs = nb_new_buf;
-	*alloc_mbuf = new_mbuf;
+	for (i = 0; i < nb_new_buf - 1; i++)
+		pkt_segs[i]->next = pkt_segs[i + 1];
+
+	pkt_segs[nb_new_buf - 1]->next = NULL;
+	pkt_segs[0]->nb_segs = nb_new_buf;
+	*alloc_mbuf = pkt_segs[0];
 
 	return 0;
 }
@@ -2418,10 +2409,8 @@ hns3_pktmbuf_copy_hdr(struct rte_mbuf *new_pkt, struct rte_mbuf *old_pkt)
 }
 
 static int
-hns3_reassemble_tx_pkts(void *tx_queue, struct rte_mbuf *tx_pkt,
-			struct rte_mbuf **new_pkt)
+hns3_reassemble_tx_pkts(struct rte_mbuf *tx_pkt, struct rte_mbuf **new_pkt)
 {
-	struct hns3_tx_queue *txq = tx_queue;
 	struct rte_mempool *mb_pool;
 	struct rte_mbuf *new_mbuf;
 	struct rte_mbuf *temp_new;
@@ -2433,7 +2422,6 @@ hns3_reassemble_tx_pkts(void *tx_queue, struct rte_mbuf *tx_pkt,
 	uint16_t len_s;
 	uint16_t len_d;
 	uint16_t len;
-	uint16_t i;
 	int ret;
 	char *s;
 	char *d;
@@ -2449,7 +2437,7 @@ hns3_reassemble_tx_pkts(void *tx_queue, struct rte_mbuf *tx_pkt,
 		last_buf_len = buf_size;
 
 	/* Allocate enough mbufs */
-	ret = hns3_tx_alloc_mbufs(txq, mb_pool, nb_new_buf, &new_mbuf);
+	ret = hns3_tx_alloc_mbufs(mb_pool, nb_new_buf, &new_mbuf);
 	if (ret)
 		return ret;
 
@@ -2458,12 +2446,9 @@ hns3_reassemble_tx_pkts(void *tx_queue, struct rte_mbuf *tx_pkt,
 	s = rte_pktmbuf_mtod(temp, char *);
 	len_s = rte_pktmbuf_data_len(temp);
 	temp_new = new_mbuf;
-	for (i = 0; i < nb_new_buf; i++) {
+	while (temp != NULL && temp_new != NULL) {
 		d = rte_pktmbuf_mtod(temp_new, char *);
-		if (i < nb_new_buf - 1)
-			buf_len = buf_size;
-		else
-			buf_len = last_buf_len;
+		buf_len = temp_new->next == NULL ? last_buf_len : buf_size;
 		len_d = buf_len;
 
 		while (len_d) {
@@ -2924,7 +2909,7 @@ hns3_check_non_tso_pkt(uint16_t nb_buf, struct rte_mbuf **m_seg,
 
 	if (unlikely(nb_buf > HNS3_MAX_NON_TSO_BD_PER_PKT)) {
 		txq->exceed_limit_bd_pkt_cnt++;
-		ret = hns3_reassemble_tx_pkts(txq, tx_pkt, &new_pkt);
+		ret = hns3_reassemble_tx_pkts(tx_pkt, &new_pkt);
 		if (ret) {
 			txq->exceed_limit_bd_reassem_fail++;
 			return ret;
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx
  2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                   ` (7 preceding siblings ...)
  2020-09-07  9:08 ` [dpdk-dev] [PATCH 8/8] net/hns3: fix segfault when Tx multiple buffer packets Wei Hu (Xavier)
@ 2020-09-09  9:23 ` Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 1/8] net/hns3: report Rx free threshold Wei Hu (Xavier)
                     ` (8 more replies)
  8 siblings, 9 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

This series contains updates for the Rx/Tx process.

Chengchang Tang (1):
  net/hns3: fix segfault when Tx multiple buffer packets

Wei Hu (Xavier) (7):
  net/hns3: report Rx free threshold
  net/hns3: reduce address calculation in Rx
  net/hns3: add simple Rx process function
  net/hns3: add simple Tx process function
  net/hns3: add vector Tx burst with NEON instructions
  net/hns3: add vector Rx burst with NEON instructions
  net/hns3: add restriction on setting VF MTU

 drivers/net/hns3/hns3_ethdev.c        |   18 +-
 drivers/net/hns3/hns3_ethdev.h        |   54 +-
 drivers/net/hns3/hns3_ethdev_vf.c     |   41 +-
 drivers/net/hns3/hns3_rxtx.c          | 1014 +++++++++++++++++++++++----------
 drivers/net/hns3/hns3_rxtx.h          |  237 +++++++-
 drivers/net/hns3/hns3_rxtx_vec.c      |  214 +++++++
 drivers/net/hns3/hns3_rxtx_vec.h      |   77 +++
 drivers/net/hns3/hns3_rxtx_vec_neon.h |  288 ++++++++++
 drivers/net/hns3/hns3_stats.c         |   16 +-
 drivers/net/hns3/meson.build          |    4 +
 10 files changed, 1645 insertions(+), 318 deletions(-)
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec.c
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec.h
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec_neon.h

-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 1/8] net/hns3: report Rx free threshold
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
@ 2020-09-09  9:23   ` Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 2/8] net/hns3: reduce address calculation in Rx Wei Hu (Xavier)
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch reports the .rx_free_thresh value in the .dev_infos_get ops
implementation functions named hns3_dev_infos_get and hns3vf_dev_infos_get.
In addition, a member variable of struct hns3_rx_queue is renamed and comments
are added to improve code readability.
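
As a usage sketch (not part of the patch), an application can now pick up the
reported default through the usual ethdev calls; port_id, nb_desc and mb_pool
are placeholders here:

    #include <rte_ethdev.h>

    struct rte_eth_dev_info dev_info;
    struct rte_eth_rxconf rxconf;

    /* default_rxconf.rx_free_thresh is now pre-filled by the hns3 PMD. */
    rte_eth_dev_info_get(port_id, &dev_info);
    rxconf = dev_info.default_rxconf;
    rte_eth_rx_queue_setup(port_id, 0 /* queue */, nb_desc, SOCKET_ID_ANY,
                           &rxconf, mb_pool);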

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c    |  2 ++
 drivers/net/hns3/hns3_ethdev_vf.c |  2 ++
 drivers/net/hns3/hns3_rxtx.c      | 30 ++++++++++++------------------
 drivers/net/hns3/hns3_rxtx.h      |  9 ++++++---
 4 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 9dd0d9c..eb5879b 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2501,12 +2501,14 @@ hns3_dev_infos_get(struct rte_eth_dev *eth_dev, struct rte_eth_dev_info *info)
 	};
 
 	info->default_rxconf = (struct rte_eth_rxconf) {
+		.rx_free_thresh = HNS3_DEFAULT_RX_FREE_THRESH,
 		/*
 		 * If there are no available Rx buffer descriptors, incoming
 		 * packets are always dropped by hardware based on hns3 network
 		 * engine.
 		 */
 		.rx_drop_en = 1,
+		.offloads = 0,
 	};
 
 	info->vmdq_queue_num = 0;
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 7fd0e6a..7226cc5 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -944,12 +944,14 @@ hns3vf_dev_infos_get(struct rte_eth_dev *eth_dev, struct rte_eth_dev_info *info)
 	};
 
 	info->default_rxconf = (struct rte_eth_rxconf) {
+		.rx_free_thresh = HNS3_DEFAULT_RX_FREE_THRESH,
 		/*
 		 * If there are no available Rx buffer descriptors, incoming
 		 * packets are always dropped by hardware based on hns3 network
 		 * engine.
 		 */
 		.rx_drop_en = 1,
+		.offloads = 0,
 	};
 
 	info->vmdq_queue_num = 0;
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 308d0a6..fe2a7a4 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -652,8 +652,7 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	}
 
 	rxq->next_to_use = 0;
-	rxq->next_to_clean = 0;
-	rxq->nb_rx_hold = 0;
+	rxq->rx_free_hold = 0;
 	hns3_init_rx_queue_hw(rxq);
 
 	return 0;
@@ -667,8 +666,7 @@ hns3_fake_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 
 	rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
 	rxq->next_to_use = 0;
-	rxq->next_to_clean = 0;
-	rxq->nb_rx_hold = 0;
+	rxq->rx_free_hold = 0;
 	hns3_init_rx_queue_hw(rxq);
 }
 
@@ -1303,10 +1301,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 
 	rxq->hns = hns;
 	rxq->mb_pool = mp;
-	if (conf->rx_free_thresh <= 0)
-		rxq->rx_free_thresh = DEFAULT_RX_FREE_THRESH;
-	else
-		rxq->rx_free_thresh = conf->rx_free_thresh;
+	rxq->rx_free_thresh = (conf->rx_free_thresh > 0) ?
+		conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
 	rxq->rx_deferred_start = conf->rx_deferred_start;
 
 	rx_entry_len = sizeof(struct hns3_entry) * rxq->nb_rx_desc;
@@ -1319,8 +1315,7 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	}
 
 	rxq->next_to_use = 0;
-	rxq->next_to_clean = 0;
-	rxq->nb_rx_hold = 0;
+	rxq->rx_free_hold = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
 	rxq->port_id = dev->data->port_id;
@@ -1656,11 +1651,11 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	nb_rx_bd = 0;
 	rxq = rx_queue;
 
-	rx_id = rxq->next_to_clean;
+	rx_id = rxq->next_to_use;
 	rx_ring = rxq->rx_ring;
+	sw_ring = rxq->sw_ring;
 	first_seg = rxq->pkt_first_seg;
 	last_seg = rxq->pkt_last_seg;
-	sw_ring = rxq->sw_ring;
 
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
@@ -1843,16 +1838,15 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		first_seg = NULL;
 	}
 
-	rxq->next_to_clean = rx_id;
+	rxq->next_to_use = rx_id;
 	rxq->pkt_first_seg = first_seg;
 	rxq->pkt_last_seg = last_seg;
 
-	nb_rx_bd = nb_rx_bd + rxq->nb_rx_hold;
-	if (nb_rx_bd > rxq->rx_free_thresh) {
-		hns3_clean_rx_buffers(rxq, nb_rx_bd);
-		nb_rx_bd = 0;
+	rxq->rx_free_hold += nb_rx_bd;
+	if (rxq->rx_free_hold > rxq->rx_free_thresh) {
+		hns3_clean_rx_buffers(rxq, rxq->rx_free_hold);
+		rxq->rx_free_hold = 0;
 	}
-	rxq->nb_rx_hold = nb_rx_bd;
 
 	return nb_rx;
 }
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index 4b3269b..a2d6514 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -10,6 +10,7 @@
 #define HNS3_DEFAULT_RING_DESC  1024
 #define	HNS3_ALIGN_RING_DESC	32
 #define HNS3_RING_BASE_ALIGN	128
+#define HNS3_DEFAULT_RX_FREE_THRESH	32
 
 #define HNS3_512_BD_BUF_SIZE	512
 #define HNS3_1K_BD_BUF_SIZE	1024
@@ -243,12 +244,14 @@ struct hns3_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t nb_rx_desc;
-	uint16_t nb_rx_hold;
-	uint16_t rx_tail;
-	uint16_t next_to_clean;
 	uint16_t next_to_use;
 	uint16_t rx_buf_len;
+	/*
+	 * threshold for the number of BDs waited to passed to hardware. If the
+	 * number exceeds the threshold, driver will pass these BDs to hardware.
+	 */
 	uint16_t rx_free_thresh;
+	uint16_t rx_free_hold;   /* num of BDs waited to passed to hardware */
 
 	/*
 	 * port based vlan configuration state.
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 2/8] net/hns3: reduce address calculation in Rx
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 1/8] net/hns3: report Rx free threshold Wei Hu (Xavier)
@ 2020-09-09  9:23   ` Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 3/8] net/hns3: add simple Rx process function Wei Hu (Xavier)
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds the internal function named hns3_write_reg_opt to avoid
performance loss from address calculation during register access in the
'.rx_pkt_burst' ops implementation function named hns3_recv_pkts.

In addition, because hardware always accesses registers in little-endian mode
on the hns3 network engine, the driver should call rte_cpu_to_le_32 to convert
data to little-endian before writing a register and call rte_le_to_cpu_32 to
convert the data read back from a register. The driver encapsulates these data
conversion operations in the register read/write functions listed below:
  hns3_write_reg
  hns3_write_reg_opt
  hns3_read_reg
Therefore, no additional conversion is required when calling these functions.
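
As a rough before/after illustration (names as used in the hunks below), the
Rx doorbell update changes from recomputing the register address on every
write to reusing a pointer precomputed at queue setup time:

    /* Before: the register address is derived from rxq->io_base each time. */
    hns3_write_dev(rxq, HNS3_RING_RX_HEAD_REG, rxq->rx_free_hold);

    /*
     * After: rxq->io_head_reg is computed once in hns3_rx_queue_setup(), so
     * the hot path only performs the (little-endian converted) write.
     */
    hns3_write_reg_opt(rxq->io_head_reg, rxq->rx_free_hold);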

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.h | 29 +++++++++++++++++++++++++++--
 drivers/net/hns3/hns3_rxtx.c   | 14 +++-----------
 drivers/net/hns3/hns3_rxtx.h   |  1 +
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 9e49e28..3cb0535 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -708,14 +708,39 @@ struct hns3_adapter {
 
 #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
 
+/*
+ * Because hardware always access register in little-endian mode based on hns3
+ * network engine, so driver should also call rte_cpu_to_le_32 to convert data
+ * in little-endian mode before writing register and call rte_le_to_cpu_32 to
+ * convert data after reading from register.
+ *
+ * Here the driver encapsulates the data conversion operation in the register
+ * read/write operation function as below:
+ *   hns3_write_reg
+ *   hns3_write_reg_opt
+ *   hns3_read_reg
+ * Therefore, when calling these functions, conversion is not required again.
+ */
 static inline void hns3_write_reg(void *base, uint32_t reg, uint32_t value)
 {
-	rte_write32(value, (volatile void *)((char *)base + reg));
+	rte_write32(rte_cpu_to_le_32(value),
+		    (volatile void *)((char *)base + reg));
+}
+
+/*
+ * The optimized function for writing registers used in the '.rx_pkt_burst' and
+ * '.tx_pkt_burst' ops implementation function.
+ */
+static inline void hns3_write_reg_opt(volatile void *addr, uint32_t value)
+{
+	rte_io_wmb();
+	rte_write32_relaxed(rte_cpu_to_le_32(value), addr);
 }
 
 static inline uint32_t hns3_read_reg(void *base, uint32_t reg)
 {
-	return rte_read32((volatile void *)((char *)base + reg));
+	uint32_t read_val = rte_read32((volatile void *)((char *)base + reg));
+	return rte_le_to_cpu_32(read_val);
 }
 
 #define hns3_write_dev(a, reg, value) \
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index fe2a7a4..703b12a 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -1323,6 +1323,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	rxq->configured = true;
 	rxq->io_base = (void *)((char *)hw->io_base + HNS3_TQP_REG_OFFSET +
 				idx * HNS3_TQP_REG_SIZE);
+	rxq->io_head_reg = (volatile void *)((char *)rxq->io_base +
+			   HNS3_RING_RX_HEAD_REG);
 	rxq->rx_buf_len = rx_buf_size;
 	rxq->l2_errors = 0;
 	rxq->pkt_len_errors = 0;
@@ -1472,16 +1474,6 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 	return NULL;
 }
 
-static void
-hns3_clean_rx_buffers(struct hns3_rx_queue *rxq, int count)
-{
-	rxq->next_to_use += count;
-	if (rxq->next_to_use >= rxq->nb_rx_desc)
-		rxq->next_to_use -= rxq->nb_rx_desc;
-
-	hns3_write_dev(rxq, HNS3_RING_RX_HEAD_REG, count);
-}
-
 static int
 hns3_handle_bdinfo(struct hns3_rx_queue *rxq, struct rte_mbuf *rxm,
 		   uint32_t bd_base_info, uint32_t l234_info,
@@ -1844,7 +1836,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 	rxq->rx_free_hold += nb_rx_bd;
 	if (rxq->rx_free_hold > rxq->rx_free_thresh) {
-		hns3_clean_rx_buffers(rxq, rxq->rx_free_hold);
+		hns3_write_reg_opt(rxq->io_head_reg, rxq->rx_free_hold);
 		rxq->rx_free_hold = 0;
 	}
 
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index a2d6514..c1a34e2 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -231,6 +231,7 @@ struct hns3_entry {
 
 struct hns3_rx_queue {
 	void *io_base;
+	volatile void *io_head_reg;
 	struct hns3_adapter *hns;
 	struct rte_mempool *mb_pool;
 	struct hns3_desc *rx_ring;
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 3/8] net/hns3: add simple Rx process function
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 1/8] net/hns3: report Rx free threshold Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 2/8] net/hns3: reduce address calculation in Rx Wei Hu (Xavier)
@ 2020-09-09  9:23   ` Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 4/8] net/hns3: add simple Tx " Wei Hu (Xavier)
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds a simple Rx process function and supports choosing the Rx
function based on the real Rx offload capabilities.
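
As a small usage sketch (not part of the patch; port_id is a placeholder), an
application can query which Rx path was selected through the
.rx_burst_mode_get hook wired up below:

    #include <stdio.h>
    #include <rte_ethdev.h>

    struct rte_eth_burst_mode mode;

    if (rte_eth_rx_burst_mode_get(port_id, 0 /* queue */, &mode) == 0)
            printf("Rx burst mode: %s\n", mode.info); /* e.g. "Scalar" */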

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c    |   7 +-
 drivers/net/hns3/hns3_ethdev.h    |  21 ++
 drivers/net/hns3/hns3_ethdev_vf.c |  11 +-
 drivers/net/hns3/hns3_rxtx.c      | 546 +++++++++++++++++++++++---------------
 drivers/net/hns3/hns3_rxtx.h      | 140 +++++++++-
 drivers/net/hns3/hns3_stats.c     |  16 +-
 6 files changed, 506 insertions(+), 235 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index eb5879b..024dc19 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2351,6 +2351,8 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 	if (ret)
 		goto cfg_err;
 
+	hns->rx_simple_allowed = true;
+	hns3_init_rx_ptype_tble(dev);
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
 
 	return 0;
@@ -4746,6 +4748,7 @@ hns3_dev_start(struct rte_eth_dev *dev)
 	hw->adapter_state = HNS3_NIC_STARTED;
 	rte_spinlock_unlock(&hw->lock);
 
+	hns3_rx_scattered_calc(dev);
 	hns3_set_rxtx_function(dev);
 	hns3_mp_req_start_rxtx(dev);
 	rte_eal_alarm_set(HNS3_SERVICE_INTERVAL, hns3_service_handler, dev);
@@ -4844,6 +4847,7 @@ hns3_dev_stop(struct rte_eth_dev *dev)
 		hns3_dev_release_mbufs(hns);
 		hw->adapter_state = HNS3_NIC_CONFIGURED;
 	}
+	hns3_rx_scattered_reset(dev);
 	rte_eal_alarm_cancel(hns3_service_handler, dev);
 	rte_spinlock_unlock(&hw->lock);
 }
@@ -5514,6 +5518,7 @@ hns3_reset_service(void *param)
 }
 
 static const struct eth_dev_ops hns3_eth_dev_ops = {
+	.dev_configure      = hns3_dev_configure,
 	.dev_start          = hns3_dev_start,
 	.dev_stop           = hns3_dev_stop,
 	.dev_close          = hns3_dev_close,
@@ -5539,7 +5544,7 @@ static const struct eth_dev_ops hns3_eth_dev_ops = {
 	.rx_queue_intr_disable  = hns3_dev_rx_queue_intr_disable,
 	.rxq_info_get           = hns3_rxq_info_get,
 	.txq_info_get           = hns3_txq_info_get,
-	.dev_configure          = hns3_dev_configure,
+	.rx_burst_mode_get      = hns3_rx_burst_mode_get,
 	.flow_ctrl_get          = hns3_flow_ctrl_get,
 	.flow_ctrl_set          = hns3_flow_ctrl_set,
 	.priority_flow_ctrl_set = hns3_priority_flow_ctrl_set,
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 3cb0535..d93c5b2 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -433,6 +433,7 @@ struct hns3_hw {
 	uint16_t tqps_num;          /* num task queue pairs of this function */
 	uint16_t intr_tqps_num;     /* num queue pairs mapping interrupt */
 	uint16_t rss_size_max;      /* HW defined max RSS task queue */
+	uint16_t rx_buf_len;        /* hold min hardware rx buf len */
 	uint16_t num_tx_desc;       /* desc num of per tx queue */
 	uint16_t num_rx_desc;       /* desc num of per rx queue */
 	uint32_t mng_entry_num;     /* number of manager table entry */
@@ -575,6 +576,23 @@ struct hns3_mp_param {
 /* Key string for IPC. */
 #define HNS3_MP_NAME "net_hns3_mp"
 
+#define HNS3_L2TBL_NUM	4
+#define HNS3_L3TBL_NUM	16
+#define HNS3_L4TBL_NUM	16
+#define HNS3_OL3TBL_NUM	16
+#define HNS3_OL4TBL_NUM	16
+
+struct hns3_ptype_table {
+	uint32_t l2table[HNS3_L2TBL_NUM];
+	uint32_t l3table[HNS3_L3TBL_NUM];
+	uint32_t l4table[HNS3_L4TBL_NUM];
+	uint32_t inner_l2table[HNS3_L2TBL_NUM];
+	uint32_t inner_l3table[HNS3_L3TBL_NUM];
+	uint32_t inner_l4table[HNS3_L4TBL_NUM];
+	uint32_t ol3table[HNS3_OL3TBL_NUM];
+	uint32_t ol4table[HNS3_OL4TBL_NUM];
+};
+
 struct hns3_pf {
 	struct hns3_adapter *adapter;
 	bool is_main_pf;
@@ -623,6 +641,9 @@ struct hns3_adapter {
 		struct hns3_pf pf;
 		struct hns3_vf vf;
 	};
+
+	bool rx_simple_allowed;
+	struct hns3_ptype_table ptype_tbl __rte_cache_min_aligned;
 };
 
 #define HNS3_DEV_SUPPORT_DCB_B			0x0
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 7226cc5..0f155d8 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -745,7 +745,8 @@ hns3vf_init_ring_with_vector(struct hns3_hw *hw)
 static int
 hns3vf_dev_configure(struct rte_eth_dev *dev)
 {
-	struct hns3_hw *hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct hns3_adapter *hns = dev->data->dev_private;
+	struct hns3_hw *hw = &hns->hw;
 	struct hns3_rss_conf *rss_cfg = &hw->rss_info;
 	struct rte_eth_conf *conf = &dev->data->dev_conf;
 	enum rte_eth_rx_mq_mode mq_mode = conf->rxmode.mq_mode;
@@ -820,6 +821,9 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 	if (ret)
 		goto cfg_err;
 
+	hns->rx_simple_allowed = true;
+	hns3_init_rx_ptype_tble(dev);
+
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
 	return 0;
 
@@ -1875,6 +1879,7 @@ hns3vf_dev_stop(struct rte_eth_dev *dev)
 		hns3_dev_release_mbufs(hns);
 		hw->adapter_state = HNS3_NIC_CONFIGURED;
 	}
+	hns3_rx_scattered_reset(dev);
 	rte_eal_alarm_cancel(hns3vf_service_handler, dev);
 	rte_spinlock_unlock(&hw->lock);
 }
@@ -2111,6 +2116,7 @@ hns3vf_dev_start(struct rte_eth_dev *dev)
 	hw->adapter_state = HNS3_NIC_STARTED;
 	rte_spinlock_unlock(&hw->lock);
 
+	hns3_rx_scattered_calc(dev);
 	hns3_set_rxtx_function(dev);
 	hns3_mp_req_start_rxtx(dev);
 	rte_eal_alarm_set(HNS3VF_SERVICE_INTERVAL, hns3vf_service_handler, dev);
@@ -2508,6 +2514,7 @@ hns3vf_reinit_dev(struct hns3_adapter *hns)
 }
 
 static const struct eth_dev_ops hns3vf_eth_dev_ops = {
+	.dev_configure      = hns3vf_dev_configure,
 	.dev_start          = hns3vf_dev_start,
 	.dev_stop           = hns3vf_dev_stop,
 	.dev_close          = hns3vf_dev_close,
@@ -2533,7 +2540,7 @@ static const struct eth_dev_ops hns3vf_eth_dev_ops = {
 	.rx_queue_intr_disable  = hns3_dev_rx_queue_intr_disable,
 	.rxq_info_get       = hns3_rxq_info_get,
 	.txq_info_get       = hns3_txq_info_get,
-	.dev_configure      = hns3vf_dev_configure,
+	.rx_burst_mode_get  = hns3_rx_burst_mode_get,
 	.mac_addr_add       = hns3vf_add_mac_addr,
 	.mac_addr_remove    = hns3vf_remove_mac_addr,
 	.mac_addr_set       = hns3vf_set_default_mac_addr,
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 703b12a..10f0112 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -30,7 +30,7 @@
 #include "hns3_logs.h"
 
 #define HNS3_CFG_DESC_NUM(num)	((num) / 8 - 1)
-#define DEFAULT_RX_FREE_THRESH	32
+#define HNS3_RX_RING_PREFETCTH_MASK	3
 
 static void
 hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
@@ -38,13 +38,20 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
 	uint16_t i;
 
 	/* Note: Fake rx queue will not enter here */
-	if (rxq->sw_ring) {
-		for (i = 0; i < rxq->nb_rx_desc; i++) {
-			if (rxq->sw_ring[i].mbuf) {
-				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
-				rxq->sw_ring[i].mbuf = NULL;
-			}
-		}
+	if (rxq->sw_ring == NULL)
+		return;
+
+	for (i = 0; i < rxq->nb_rx_desc; i++)
+		if (rxq->sw_ring[i].mbuf)
+			rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+
+	for (i = 0; i < rxq->bulk_mbuf_num; i++)
+		rte_pktmbuf_free_seg(rxq->bulk_mbuf[i]);
+	rxq->bulk_mbuf_num = 0;
+
+	if (rxq->pkt_first_seg) {
+		rte_pktmbuf_free(rxq->pkt_first_seg);
+		rxq->pkt_first_seg = NULL;
 	}
 }
 
@@ -653,6 +660,8 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 
 	rxq->next_to_use = 0;
 	rxq->rx_free_hold = 0;
+	rxq->pkt_first_seg = NULL;
+	rxq->pkt_last_seg = NULL;
 	hns3_init_rx_queue_hw(rxq);
 
 	return 0;
@@ -1243,6 +1252,33 @@ hns3_rx_buf_len_calc(struct rte_mempool *mp, uint16_t *rx_buf_len)
 	return 0;
 }
 
+static int
+hns3_rx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_rxconf *conf,
+			 struct rte_mempool *mp, uint16_t nb_desc,
+			 uint16_t *buf_size)
+{
+	if (nb_desc > HNS3_MAX_RING_DESC || nb_desc < HNS3_MIN_RING_DESC ||
+	    nb_desc % HNS3_ALIGN_RING_DESC) {
+		hns3_err(hw, "Number (%u) of rx descriptors is invalid",
+			 nb_desc);
+		return -EINVAL;
+	}
+
+	if (conf->rx_drop_en == 0)
+		hns3_warn(hw, "if no descriptors available, packets are always "
+			  "dropped and rx_drop_en (1) is fixed on");
+
+	if (hns3_rx_buf_len_calc(mp, buf_size)) {
+		hns3_err(hw, "rxq mbufs' data room size (%u) is not enough! "
+				"minimal data room size (%u).",
+				rte_pktmbuf_data_room_size(mp),
+				HNS3_MIN_BD_BUF_SIZE + RTE_PKTMBUF_HEADROOM);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 int
 hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_rxconf *conf,
@@ -1254,24 +1290,16 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	struct hns3_rx_queue *rxq;
 	uint16_t rx_buf_size;
 	int rx_entry_len;
+	int ret;
 
 	if (dev->data->dev_started) {
 		hns3_err(hw, "rx_queue_setup after dev_start no supported");
 		return -EINVAL;
 	}
 
-	if (nb_desc > HNS3_MAX_RING_DESC || nb_desc < HNS3_MIN_RING_DESC ||
-	    nb_desc % HNS3_ALIGN_RING_DESC) {
-		hns3_err(hw, "Number (%u) of rx descriptors is invalid",
-			 nb_desc);
-		return -EINVAL;
-	}
-
-	if (conf->rx_drop_en == 0)
-		hns3_warn(hw, "if there are no available Rx descriptors,"
-			  "incoming packets are always dropped. input parameter"
-			  " conf->rx_drop_en(%u) is uneffective.",
-			  conf->rx_drop_en);
+	ret = hns3_rx_queue_conf_check(hw, conf, mp, nb_desc, &rx_buf_size);
+	if (ret)
+		return ret;
 
 	if (dev->data->rx_queues[idx]) {
 		hns3_rx_queue_release(dev->data->rx_queues[idx]);
@@ -1284,14 +1312,6 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	q_info.type = "hns3 RX queue";
 	q_info.ring_name = "rx_ring";
 
-	if (hns3_rx_buf_len_calc(mp, &rx_buf_size)) {
-		hns3_err(hw, "rxq mbufs' data room size:%u is not enough! "
-				"minimal data room size:%u.",
-				rte_pktmbuf_data_room_size(mp),
-				HNS3_MIN_BD_BUF_SIZE + RTE_PKTMBUF_HEADROOM);
-		return -EINVAL;
-	}
-
 	rxq = hns3_alloc_rxq_and_dma_zone(dev, &q_info);
 	if (rxq == NULL) {
 		hns3_err(hw,
@@ -1300,6 +1320,7 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	}
 
 	rxq->hns = hns;
+	rxq->ptype_tbl = &hns->ptype_tbl;
 	rxq->mb_pool = mp;
 	rxq->rx_free_thresh = (conf->rx_free_thresh > 0) ?
 		conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
@@ -1328,10 +1349,10 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	rxq->rx_buf_len = rx_buf_size;
 	rxq->l2_errors = 0;
 	rxq->pkt_len_errors = 0;
-	rxq->l3_csum_erros = 0;
-	rxq->l4_csum_erros = 0;
-	rxq->ol3_csum_erros = 0;
-	rxq->ol4_csum_erros = 0;
+	rxq->l3_csum_errors = 0;
+	rxq->l4_csum_errors = 0;
+	rxq->ol3_csum_errors = 0;
+	rxq->ol4_csum_errors = 0;
 
 	/* CRC len set here is used for amending packet length */
 	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
@@ -1339,6 +1360,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	else
 		rxq->crc_len = 0;
 
+	rxq->bulk_mbuf_num = 0;
+
 	rte_spinlock_lock(&hw->lock);
 	dev->data->rx_queues[idx] = rxq;
 	rte_spinlock_unlock(&hw->lock);
@@ -1346,104 +1369,40 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	return 0;
 }
 
-static inline uint32_t
-rxd_pkt_info_to_pkt_type(uint32_t pkt_info, uint32_t ol_info)
+void
+hns3_rx_scattered_reset(struct rte_eth_dev *dev)
 {
-#define HNS3_L2TBL_NUM	4
-#define HNS3_L3TBL_NUM	16
-#define HNS3_L4TBL_NUM	16
-#define HNS3_OL3TBL_NUM	16
-#define HNS3_OL4TBL_NUM	16
-	uint32_t pkt_type = 0;
-	uint32_t l2id, l3id, l4id;
-	uint32_t ol3id, ol4id;
-
-	static const uint32_t l2table[HNS3_L2TBL_NUM] = {
-		RTE_PTYPE_L2_ETHER,
-		RTE_PTYPE_L2_ETHER_QINQ,
-		RTE_PTYPE_L2_ETHER_VLAN,
-		RTE_PTYPE_L2_ETHER_VLAN
-	};
-
-	static const uint32_t l3table[HNS3_L3TBL_NUM] = {
-		RTE_PTYPE_L3_IPV4,
-		RTE_PTYPE_L3_IPV6,
-		RTE_PTYPE_L2_ETHER_ARP,
-		RTE_PTYPE_L2_ETHER,
-		RTE_PTYPE_L3_IPV4_EXT,
-		RTE_PTYPE_L3_IPV6_EXT,
-		RTE_PTYPE_L2_ETHER_LLDP,
-		0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
-
-	static const uint32_t l4table[HNS3_L4TBL_NUM] = {
-		RTE_PTYPE_L4_UDP,
-		RTE_PTYPE_L4_TCP,
-		RTE_PTYPE_TUNNEL_GRE,
-		RTE_PTYPE_L4_SCTP,
-		RTE_PTYPE_L4_IGMP,
-		RTE_PTYPE_L4_ICMP,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
-
-	static const uint32_t inner_l2table[HNS3_L2TBL_NUM] = {
-		RTE_PTYPE_INNER_L2_ETHER,
-		RTE_PTYPE_INNER_L2_ETHER_VLAN,
-		RTE_PTYPE_INNER_L2_ETHER_QINQ,
-		0
-	};
+	struct hns3_adapter *hns = dev->data->dev_private;
+	struct hns3_hw *hw = &hns->hw;
 
-	static const uint32_t inner_l3table[HNS3_L3TBL_NUM] = {
-		RTE_PTYPE_INNER_L3_IPV4,
-		RTE_PTYPE_INNER_L3_IPV6,
-		0,
-		RTE_PTYPE_INNER_L2_ETHER,
-		RTE_PTYPE_INNER_L3_IPV4_EXT,
-		RTE_PTYPE_INNER_L3_IPV6_EXT,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
+	hw->rx_buf_len = 0;
+	dev->data->scattered_rx = false;
+}
 
-	static const uint32_t inner_l4table[HNS3_L4TBL_NUM] = {
-		RTE_PTYPE_INNER_L4_UDP,
-		RTE_PTYPE_INNER_L4_TCP,
-		RTE_PTYPE_TUNNEL_GRE,
-		RTE_PTYPE_INNER_L4_SCTP,
-		RTE_PTYPE_L4_IGMP,
-		RTE_PTYPE_INNER_L4_ICMP,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
+void
+hns3_rx_scattered_calc(struct rte_eth_dev *dev)
+{
+	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
+	struct hns3_adapter *hns = dev->data->dev_private;
+	struct hns3_hw *hw = &hns->hw;
+	struct hns3_rx_queue *rxq;
+	uint32_t queue_id;
 
-	static const uint32_t ol3table[HNS3_OL3TBL_NUM] = {
-		RTE_PTYPE_L3_IPV4,
-		RTE_PTYPE_L3_IPV6,
-		0, 0,
-		RTE_PTYPE_L3_IPV4_EXT,
-		RTE_PTYPE_L3_IPV6_EXT,
-		0, 0, 0, 0, 0, 0, 0, 0, 0,
-		RTE_PTYPE_UNKNOWN
-	};
+	if (dev->data->rx_queues == NULL)
+		return;
 
-	static const uint32_t ol4table[HNS3_OL4TBL_NUM] = {
-		0,
-		RTE_PTYPE_TUNNEL_VXLAN,
-		RTE_PTYPE_TUNNEL_NVGRE,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-	};
+	for (queue_id = 0; queue_id < dev->data->nb_rx_queues; queue_id++) {
+		rxq = dev->data->rx_queues[queue_id];
+		if (hw->rx_buf_len == 0)
+			hw->rx_buf_len = rxq->rx_buf_len;
+		else
+			hw->rx_buf_len = RTE_MIN(hw->rx_buf_len,
+						 rxq->rx_buf_len);
+	}
 
-	l2id = hns3_get_field(pkt_info, HNS3_RXD_STRP_TAGP_M,
-			      HNS3_RXD_STRP_TAGP_S);
-	l3id = hns3_get_field(pkt_info, HNS3_RXD_L3ID_M, HNS3_RXD_L3ID_S);
-	l4id = hns3_get_field(pkt_info, HNS3_RXD_L4ID_M, HNS3_RXD_L4ID_S);
-	ol3id = hns3_get_field(ol_info, HNS3_RXD_OL3ID_M, HNS3_RXD_OL3ID_S);
-	ol4id = hns3_get_field(ol_info, HNS3_RXD_OL4ID_M, HNS3_RXD_OL4ID_S);
-
-	if (ol4table[ol4id])
-		pkt_type |= (inner_l2table[l2id] | inner_l3table[l3id] |
-			     inner_l4table[l4id] | ol3table[ol3id] |
-			     ol4table[ol4id]);
-	else
-		pkt_type |= (l2table[l2id] | l3table[l3id] | l4table[l4id]);
-	return pkt_type;
+	if (dev_conf->rxmode.offloads & DEV_RX_OFFLOAD_SCATTER ||
+	    dev_conf->rxmode.max_rx_pkt_len > hw->rx_buf_len)
+		dev->data->scattered_rx = true;
 }
 
 const uint32_t *
@@ -1468,81 +1427,69 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 		RTE_PTYPE_UNKNOWN
 	};
 
-	if (dev->rx_pkt_burst == hns3_recv_pkts)
+	if (dev->rx_pkt_burst == hns3_recv_pkts ||
+	    dev->rx_pkt_burst == hns3_recv_scattered_pkts)
 		return ptypes;
 
 	return NULL;
 }
 
-static int
-hns3_handle_bdinfo(struct hns3_rx_queue *rxq, struct rte_mbuf *rxm,
-		   uint32_t bd_base_info, uint32_t l234_info,
-		   uint32_t *cksum_err)
+void
+hns3_init_rx_ptype_tble(struct rte_eth_dev *dev)
 {
-	uint32_t tmp = 0;
-
-	if (unlikely(l234_info & BIT(HNS3_RXD_L2E_B))) {
-		rxq->l2_errors++;
-		return -EINVAL;
-	}
-
-	if (unlikely(rxm->pkt_len == 0 ||
-		(l234_info & BIT(HNS3_RXD_TRUNCAT_B)))) {
-		rxq->pkt_len_errors++;
-		return -EINVAL;
-	}
-
-	if (bd_base_info & BIT(HNS3_RXD_L3L4P_B)) {
-		if (unlikely(l234_info & BIT(HNS3_RXD_L3E_B))) {
-			rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;
-			rxq->l3_csum_erros++;
-			tmp |= HNS3_L3_CKSUM_ERR;
-		}
-
-		if (unlikely(l234_info & BIT(HNS3_RXD_L4E_B))) {
-			rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
-			rxq->l4_csum_erros++;
-			tmp |= HNS3_L4_CKSUM_ERR;
-		}
-
-		if (unlikely(l234_info & BIT(HNS3_RXD_OL3E_B))) {
-			rxq->ol3_csum_erros++;
-			tmp |= HNS3_OUTER_L3_CKSUM_ERR;
-		}
-
-		if (unlikely(l234_info & BIT(HNS3_RXD_OL4E_B))) {
-			rxm->ol_flags |= PKT_RX_OUTER_L4_CKSUM_BAD;
-			rxq->ol4_csum_erros++;
-			tmp |= HNS3_OUTER_L4_CKSUM_ERR;
-		}
-	}
-	*cksum_err = tmp;
-
-	return 0;
-}
-
-static void
-hns3_rx_set_cksum_flag(struct rte_mbuf *rxm, uint64_t packet_type,
-		       const uint32_t cksum_err)
-{
-	if (unlikely((packet_type & RTE_PTYPE_TUNNEL_MASK))) {
-		if (likely(packet_type & RTE_PTYPE_INNER_L3_MASK) &&
-		    (cksum_err & HNS3_L3_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
-		if (likely(packet_type & RTE_PTYPE_INNER_L4_MASK) &&
-		    (cksum_err & HNS3_L4_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
-		if (likely(packet_type & RTE_PTYPE_L4_MASK) &&
-		    (cksum_err & HNS3_OUTER_L4_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_OUTER_L4_CKSUM_GOOD;
-	} else {
-		if (likely(packet_type & RTE_PTYPE_L3_MASK) &&
-		    (cksum_err & HNS3_L3_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
-		if (likely(packet_type & RTE_PTYPE_L4_MASK) &&
-		    (cksum_err & HNS3_L4_CKSUM_ERR) == 0)
-			rxm->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
-	}
+	struct hns3_adapter *hns = dev->data->dev_private;
+	struct hns3_ptype_table *tbl = &hns->ptype_tbl;
+
+	memset(tbl, 0, sizeof(*tbl));
+
+	tbl->l2table[0] = RTE_PTYPE_L2_ETHER;
+	tbl->l2table[1] = RTE_PTYPE_L2_ETHER_QINQ;
+	tbl->l2table[2] = RTE_PTYPE_L2_ETHER_VLAN;
+	tbl->l2table[3] = RTE_PTYPE_L2_ETHER_VLAN;
+
+	tbl->l3table[0] = RTE_PTYPE_L3_IPV4;
+	tbl->l3table[1] = RTE_PTYPE_L3_IPV6;
+	tbl->l3table[2] = RTE_PTYPE_L2_ETHER_ARP;
+	tbl->l3table[3] = RTE_PTYPE_L2_ETHER;
+	tbl->l3table[4] = RTE_PTYPE_L3_IPV4_EXT;
+	tbl->l3table[5] = RTE_PTYPE_L3_IPV6_EXT;
+	tbl->l3table[6] = RTE_PTYPE_L2_ETHER_LLDP;
+
+	tbl->l4table[0] = RTE_PTYPE_L4_UDP;
+	tbl->l4table[1] = RTE_PTYPE_L4_TCP;
+	tbl->l4table[2] = RTE_PTYPE_TUNNEL_GRE;
+	tbl->l4table[3] = RTE_PTYPE_L4_SCTP;
+	tbl->l4table[4] = RTE_PTYPE_L4_IGMP;
+	tbl->l4table[5] = RTE_PTYPE_L4_ICMP;
+
+	tbl->inner_l2table[0] = RTE_PTYPE_INNER_L2_ETHER;
+	tbl->inner_l2table[1] = RTE_PTYPE_INNER_L2_ETHER_VLAN;
+	tbl->inner_l2table[2] = RTE_PTYPE_INNER_L2_ETHER_QINQ;
+
+	tbl->inner_l3table[0] = RTE_PTYPE_INNER_L3_IPV4;
+	tbl->inner_l3table[1] = RTE_PTYPE_INNER_L3_IPV6;
+	tbl->inner_l3table[2] = 0;
+	tbl->inner_l3table[3] = RTE_PTYPE_INNER_L2_ETHER;
+	tbl->inner_l3table[4] = RTE_PTYPE_INNER_L3_IPV4_EXT;
+	tbl->inner_l3table[5] = RTE_PTYPE_INNER_L3_IPV6_EXT;
+
+	tbl->inner_l4table[0] = RTE_PTYPE_INNER_L4_UDP;
+	tbl->inner_l4table[1] = RTE_PTYPE_INNER_L4_TCP;
+	tbl->inner_l4table[2] = RTE_PTYPE_TUNNEL_GRE;
+	tbl->inner_l4table[3] = RTE_PTYPE_INNER_L4_SCTP;
+	tbl->inner_l4table[4] = RTE_PTYPE_L4_IGMP;
+	tbl->inner_l4table[5] = RTE_PTYPE_INNER_L4_ICMP;
+
+	tbl->ol3table[0] = RTE_PTYPE_L3_IPV4;
+	tbl->ol3table[1] = RTE_PTYPE_L3_IPV6;
+	tbl->ol3table[2] = 0;
+	tbl->ol3table[3] = 0;
+	tbl->ol3table[4] = RTE_PTYPE_L3_IPV4_EXT;
+	tbl->ol3table[5] = RTE_PTYPE_L3_IPV6_EXT;
+
+	tbl->ol4table[0] = 0;
+	tbl->ol4table[1] = RTE_PTYPE_TUNNEL_VXLAN;
+	tbl->ol4table[2] = RTE_PTYPE_TUNNEL_NVGRE;
 }
 
 static inline void
@@ -1612,6 +1559,23 @@ recalculate_data_len(struct rte_mbuf *first_seg, struct rte_mbuf *last_seg,
 		rxm->data_len = (uint16_t)(data_len - crc_len);
 }
 
+static inline struct rte_mbuf *
+hns3_rx_alloc_buffer(struct hns3_rx_queue *rxq)
+{
+	int ret;
+
+	if (likely(rxq->bulk_mbuf_num > 0))
+		return rxq->bulk_mbuf[--rxq->bulk_mbuf_num];
+
+	ret = rte_mempool_get_bulk(rxq->mb_pool, (void **)rxq->bulk_mbuf,
+				   HNS3_BULK_ALLOC_MBUF_NUM);
+	if (likely(ret == 0)) {
+		rxq->bulk_mbuf_num = HNS3_BULK_ALLOC_MBUF_NUM;
+		return rxq->bulk_mbuf[--rxq->bulk_mbuf_num];
+	} else
+		return rte_mbuf_raw_alloc(rxq->mb_pool);
+}
+
 uint16_t
 hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
@@ -1620,6 +1584,119 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	struct hns3_rx_queue *rxq;      /* RX queue */
 	struct hns3_entry *sw_ring;
 	struct hns3_entry *rxe;
+	struct hns3_desc rxd;
+	struct rte_mbuf *nmb;           /* pointer of the new mbuf */
+	struct rte_mbuf *rxm;
+	uint32_t bd_base_info;
+	uint32_t cksum_err;
+	uint32_t l234_info;
+	uint32_t ol_info;
+	uint64_t dma_addr;
+	uint16_t nb_rx_bd;
+	uint16_t nb_rx;
+	uint16_t rx_id;
+	int ret;
+
+	nb_rx = 0;
+	nb_rx_bd = 0;
+	rxq = rx_queue;
+	rx_ring = rxq->rx_ring;
+	sw_ring = rxq->sw_ring;
+	rx_id = rxq->next_to_use;
+
+	while (nb_rx < nb_pkts) {
+		rxdp = &rx_ring[rx_id];
+		bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
+		if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B))))
+			break;
+
+		rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+			   (1u << HNS3_RXD_VLD_B)];
+
+		nmb = hns3_rx_alloc_buffer(rxq);
+		if (unlikely(nmb == NULL)) {
+			uint16_t port_id;
+
+			port_id = rxq->port_id;
+			rte_eth_devices[port_id].data->rx_mbuf_alloc_failed++;
+			break;
+		}
+
+		nb_rx_bd++;
+		rxe = &sw_ring[rx_id];
+		rx_id++;
+		if (unlikely(rx_id == rxq->nb_rx_desc))
+			rx_id = 0;
+
+		rte_prefetch0(sw_ring[rx_id].mbuf);
+		if ((rx_id & HNS3_RX_RING_PREFETCTH_MASK) == 0) {
+			rte_prefetch0(&rx_ring[rx_id]);
+			rte_prefetch0(&sw_ring[rx_id]);
+		}
+
+		rxm = rxe->mbuf;
+		rxe->mbuf = nmb;
+
+		dma_addr = rte_mbuf_data_iova_default(nmb);
+		rxdp->addr = rte_cpu_to_le_64(dma_addr);
+		rxdp->rx.bd_base_info = 0;
+
+		rxm->data_off = RTE_PKTMBUF_HEADROOM;
+		rxm->pkt_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.pkt_len)) -
+				rxq->crc_len;
+		rxm->data_len = rxm->pkt_len;
+		rxm->port = rxq->port_id;
+		rxm->hash.rss = rte_le_to_cpu_32(rxd.rx.rss_hash);
+		rxm->ol_flags = PKT_RX_RSS_HASH;
+		if (unlikely(bd_base_info & BIT(HNS3_RXD_LUM_B))) {
+			rxm->hash.fdir.hi =
+				rte_le_to_cpu_16(rxd.rx.fd_id);
+			rxm->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+		}
+		rxm->nb_segs = 1;
+		rxm->next = NULL;
+
+		/* Load remained descriptor data and extract necessary fields */
+		l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
+		ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
+		ret = hns3_handle_bdinfo(rxq, rxm, bd_base_info,
+					 l234_info, &cksum_err);
+		if (unlikely(ret))
+			goto pkt_err;
+
+		rxm->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+
+		if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+			hns3_rx_set_cksum_flag(rxm, rxm->packet_type,
+					       cksum_err);
+		hns3_rxd_to_vlan_tci(rxq, rxm, l234_info, &rxd);
+
+		rx_pkts[nb_rx++] = rxm;
+		continue;
+pkt_err:
+		rte_pktmbuf_free(rxm);
+	}
+
+	rxq->next_to_use = rx_id;
+	rxq->rx_free_hold += nb_rx_bd;
+	if (rxq->rx_free_hold > rxq->rx_free_thresh) {
+		hns3_write_reg_opt(rxq->io_head_reg, rxq->rx_free_hold);
+		rxq->rx_free_hold = 0;
+	}
+
+	return nb_rx;
+}
+
+uint16_t
+hns3_recv_scattered_pkts(void *rx_queue,
+			 struct rte_mbuf **rx_pkts,
+			 uint16_t nb_pkts)
+{
+	volatile struct hns3_desc *rx_ring;  /* RX ring (desc) */
+	volatile struct hns3_desc *rxdp;     /* pointer of the current desc */
+	struct hns3_rx_queue *rxq;      /* RX queue */
+	struct hns3_entry *sw_ring;
+	struct hns3_entry *rxe;
 	struct rte_mbuf *first_seg;
 	struct rte_mbuf *last_seg;
 	struct hns3_desc rxd;
@@ -1632,9 +1709,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint32_t gro_size;
 	uint32_t ol_info;
 	uint64_t dma_addr;
-	uint16_t data_len;
 	uint16_t nb_rx_bd;
-	uint16_t pkt_len;
 	uint16_t nb_rx;
 	uint16_t rx_id;
 	int ret;
@@ -1652,8 +1727,9 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
 		bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
-		if (unlikely(!hns3_get_bit(bd_base_info, HNS3_RXD_VLD_B)))
+		if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B))))
 			break;
+
 		/*
 		 * The interactive process between software and hardware of
 		 * receiving a new packet in hns3 network engine:
@@ -1716,7 +1792,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
 			   (1u << HNS3_RXD_VLD_B)];
 
-		nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
+		nmb = hns3_rx_alloc_buffer(rxq);
 		if (unlikely(nmb == NULL)) {
 			dev = &rte_eth_devices[rxq->port_id];
 			dev->data->rx_mbuf_alloc_failed++;
@@ -1730,7 +1806,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			rx_id = 0;
 
 		rte_prefetch0(sw_ring[rx_id].mbuf);
-		if ((rx_id & 0x3) == 0) {
+		if ((rx_id & HNS3_RX_RING_PREFETCTH_MASK) == 0) {
 			rte_prefetch0(&rx_ring[rx_id]);
 			rte_prefetch0(&sw_ring[rx_id]);
 		}
@@ -1742,15 +1818,6 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		rxdp->rx.bd_base_info = 0;
 		rxdp->addr = dma_addr;
 
-		/*
-		 * Load remained descriptor data and extract necessary fields.
-		 * Data size from buffer description may contains CRC len,
-		 * packet len should subtract it.
-		 */
-		data_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.size));
-		l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
-		ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
-
 		if (first_seg == NULL) {
 			first_seg = rxm;
 			first_seg->nb_segs = 1;
@@ -1760,10 +1827,11 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		}
 
 		rxm->data_off = RTE_PKTMBUF_HEADROOM;
-		rxm->data_len = data_len;
+		rxm->data_len = rte_le_to_cpu_16(rxd.rx.size);
 
-		if (!hns3_get_bit(bd_base_info, HNS3_RXD_FE_B)) {
+		if (!(bd_base_info & BIT(HNS3_RXD_FE_B))) {
 			last_seg = rxm;
+			rxm->next = NULL;
 			continue;
 		}
 
@@ -1772,8 +1840,7 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		 * buffer description may contains CRC len, packet len should
 		 * subtract it, same as data len.
 		 */
-		pkt_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.pkt_len));
-		first_seg->pkt_len = pkt_len;
+		first_seg->pkt_len = rte_le_to_cpu_16(rxd.rx.pkt_len);
 
 		/*
 		 * This is the last buffer of the received packet. If the CRC
@@ -1789,15 +1856,15 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		if (unlikely(rxq->crc_len > 0)) {
 			first_seg->pkt_len -= rxq->crc_len;
 			recalculate_data_len(first_seg, last_seg, rxm, rxq,
-				data_len);
+				rxm->data_len);
 		}
 
 		first_seg->port = rxq->port_id;
 		first_seg->hash.rss = rte_le_to_cpu_32(rxd.rx.rss_hash);
 		first_seg->ol_flags = PKT_RX_RSS_HASH;
-		if (unlikely(hns3_get_bit(bd_base_info, HNS3_RXD_LUM_B))) {
+		if (unlikely(bd_base_info & BIT(HNS3_RXD_LUM_B))) {
 			first_seg->hash.fdir.hi =
-				rte_le_to_cpu_32(rxd.rx.fd_id);
+				rte_le_to_cpu_16(rxd.rx.fd_id);
 			first_seg->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
 		}
 
@@ -1808,13 +1875,15 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			first_seg->tso_segsz = gro_size;
 		}
 
+		l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
+		ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
 		ret = hns3_handle_bdinfo(rxq, first_seg, bd_base_info,
 					 l234_info, &cksum_err);
 		if (unlikely(ret))
 			goto pkt_err;
 
-		first_seg->packet_type = rxd_pkt_info_to_pkt_type(l234_info,
-								  ol_info);
+		first_seg->packet_type = hns3_rx_calc_ptype(rxq,
+						l234_info, ol_info);
 
 		if (bd_base_info & BIT(HNS3_RXD_L3L4P_B))
 			hns3_rx_set_cksum_flag(first_seg,
@@ -1844,6 +1913,46 @@ hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 }
 
 int
+hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
+		       struct rte_eth_burst_mode *mode)
+{
+	static const struct {
+		eth_rx_burst_t pkt_burst;
+		const char *info;
+	} burst_infos[] = {
+		{ hns3_recv_pkts,		"Scalar" },
+		{ hns3_recv_scattered_pkts,	"Scalar Scattered" },
+	};
+
+	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
+	int ret = -EINVAL;
+	unsigned int i;
+
+	for (i = 0; i < RTE_DIM(burst_infos); i++) {
+		if (pkt_burst == burst_infos[i].pkt_burst) {
+			snprintf(mode->info, sizeof(mode->info), "%s",
+				 burst_infos[i].info);
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static eth_rx_burst_t
+hns3_get_rx_function(struct rte_eth_dev *dev)
+{
+	struct hns3_adapter *hns = dev->data->dev_private;
+	uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
+
+	if (hns->rx_simple_allowed && !dev->data->scattered_rx &&
+	    (offloads & DEV_RX_OFFLOAD_TCP_LRO) == 0)
+		return hns3_recv_pkts;
+
+	return hns3_recv_scattered_pkts;
+}
+int
 hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_txconf *conf)
 {
@@ -1932,7 +2041,8 @@ hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 	struct hns3_desc *desc = &txq->tx_ring[tx_next_clean];
 	struct rte_mbuf *mbuf;
 
-	while ((!hns3_get_bit(desc->tx.tp_fe_sc_vld_ra_ri, HNS3_TXD_VLD_B)) &&
+	while ((!(desc->tx.tp_fe_sc_vld_ra_ri &
+		rte_cpu_to_le_16(BIT(HNS3_TXD_VLD_B)))) &&
 		tx_next_use != tx_next_clean) {
 		mbuf = tx_bak_pkt->mbuf;
 		if (mbuf) {
@@ -2818,7 +2928,7 @@ void hns3_set_rxtx_function(struct rte_eth_dev *eth_dev)
 
 	if (hns->hw.adapter_state == HNS3_NIC_STARTED &&
 	    rte_atomic16_read(&hns->hw.reset.resetting) == 0) {
-		eth_dev->rx_pkt_burst = hns3_recv_pkts;
+		eth_dev->rx_pkt_burst = hns3_get_rx_function(eth_dev);
 		eth_dev->tx_pkt_burst = hns3_xmit_pkts;
 		eth_dev->tx_pkt_prepare = hns3_prep_pkts;
 	} else {
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index c1a34e2..a6a607e 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -10,6 +10,8 @@
 #define HNS3_DEFAULT_RING_DESC  1024
 #define	HNS3_ALIGN_RING_DESC	32
 #define HNS3_RING_BASE_ALIGN	128
+#define HNS3_BULK_ALLOC_MBUF_NUM	32
+
 #define HNS3_DEFAULT_RX_FREE_THRESH	32
 
 #define HNS3_512_BD_BUF_SIZE	512
@@ -46,7 +48,7 @@
 #define HNS3_RXD_L2E_B				16
 #define HNS3_RXD_L3E_B				17
 #define HNS3_RXD_L4E_B				18
-#define HNS3_RXD_TRUNCAT_B			19
+#define HNS3_RXD_TRUNCATE_B			19
 #define HNS3_RXD_HOI_B				20
 #define HNS3_RXD_DOI_B				21
 #define HNS3_RXD_OL3E_B				22
@@ -233,6 +235,7 @@ struct hns3_rx_queue {
 	void *io_base;
 	volatile void *io_head_reg;
 	struct hns3_adapter *hns;
+	struct hns3_ptype_table *ptype_tbl;
 	struct rte_mempool *mb_pool;
 	struct hns3_desc *rx_ring;
 	uint64_t rx_ring_phys_addr; /* RX ring DMA address */
@@ -245,13 +248,13 @@ struct hns3_rx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t nb_rx_desc;
-	uint16_t next_to_use;
 	uint16_t rx_buf_len;
 	/*
 	 * threshold for the number of BDs waited to passed to hardware. If the
 	 * number exceeds the threshold, driver will pass these BDs to hardware.
 	 */
 	uint16_t rx_free_thresh;
+	uint16_t next_to_use;    /* index of next BD to be polled */
 	uint16_t rx_free_hold;   /* num of BDs waited to passed to hardware */
 
 	/*
@@ -268,10 +271,13 @@ struct hns3_rx_queue {
 
 	uint64_t l2_errors;
 	uint64_t pkt_len_errors;
-	uint64_t l3_csum_erros;
-	uint64_t l4_csum_erros;
-	uint64_t ol3_csum_erros;
-	uint64_t ol4_csum_erros;
+	uint64_t l3_csum_errors;
+	uint64_t l4_csum_errors;
+	uint64_t ol3_csum_errors;
+	uint64_t ol4_csum_errors;
+
+	struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
+	uint16_t bulk_mbuf_num;
 };
 
 struct hns3_tx_queue {
@@ -380,6 +386,120 @@ enum hns3_cksum_status {
 	HNS3_OUTER_L4_CKSUM_ERR = 8
 };
 
+static inline int
+hns3_handle_bdinfo(struct hns3_rx_queue *rxq, struct rte_mbuf *rxm,
+		   uint32_t bd_base_info, uint32_t l234_info,
+		   uint32_t *cksum_err)
+{
+#define L2E_TRUNC_ERR_FLAG	(BIT(HNS3_RXD_L2E_B) | \
+				 BIT(HNS3_RXD_TRUNCATE_B))
+#define CHECKSUM_ERR_FLAG	(BIT(HNS3_RXD_L3E_B) | \
+				 BIT(HNS3_RXD_L4E_B) | \
+				 BIT(HNS3_RXD_OL3E_B) | \
+				 BIT(HNS3_RXD_OL4E_B))
+
+	uint32_t tmp = 0;
+
+	/*
+	 * If the packet length exceeds the MTU on the non-scattered receive
+	 * path, the first n BDs lack the FE bit; handle this situation here.
+	 * Note: no statistics counter is needed because the last BD, which
+	 *       has the FE bit, will also mark the HNS3_RXD_L2E_B bit.
+	 */
+	if (unlikely((bd_base_info & BIT(HNS3_RXD_FE_B)) == 0))
+		return -EINVAL;
+
+	if (unlikely((l234_info & L2E_TRUNC_ERR_FLAG) || rxm->pkt_len == 0)) {
+		if (l234_info & BIT(HNS3_RXD_L2E_B))
+			rxq->l2_errors++;
+		else
+			rxq->pkt_len_errors++;
+		return -EINVAL;
+	}
+
+	if (bd_base_info & BIT(HNS3_RXD_L3L4P_B)) {
+		if (likely((l234_info & CHECKSUM_ERR_FLAG) == 0)) {
+			*cksum_err = 0;
+			return 0;
+		}
+
+		if (unlikely(l234_info & BIT(HNS3_RXD_L3E_B))) {
+			rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;
+			rxq->l3_csum_errors++;
+			tmp |= HNS3_L3_CKSUM_ERR;
+		}
+
+		if (unlikely(l234_info & BIT(HNS3_RXD_L4E_B))) {
+			rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
+			rxq->l4_csum_errors++;
+			tmp |= HNS3_L4_CKSUM_ERR;
+		}
+
+		if (unlikely(l234_info & BIT(HNS3_RXD_OL3E_B))) {
+			rxq->ol3_csum_errors++;
+			tmp |= HNS3_OUTER_L3_CKSUM_ERR;
+		}
+
+		if (unlikely(l234_info & BIT(HNS3_RXD_OL4E_B))) {
+			rxm->ol_flags |= PKT_RX_OUTER_L4_CKSUM_BAD;
+			rxq->ol4_csum_errors++;
+			tmp |= HNS3_OUTER_L4_CKSUM_ERR;
+		}
+	}
+	*cksum_err = tmp;
+
+	return 0;
+}
+
+static inline void
+hns3_rx_set_cksum_flag(struct rte_mbuf *rxm, const uint64_t packet_type,
+		       const uint32_t cksum_err)
+{
+	if (unlikely((packet_type & RTE_PTYPE_TUNNEL_MASK))) {
+		if (likely(packet_type & RTE_PTYPE_INNER_L3_MASK) &&
+		    (cksum_err & HNS3_L3_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
+		if (likely(packet_type & RTE_PTYPE_INNER_L4_MASK) &&
+		    (cksum_err & HNS3_L4_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+		if (likely(packet_type & RTE_PTYPE_L4_MASK) &&
+		    (cksum_err & HNS3_OUTER_L4_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_OUTER_L4_CKSUM_GOOD;
+	} else {
+		if (likely(packet_type & RTE_PTYPE_L3_MASK) &&
+		    (cksum_err & HNS3_L3_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
+		if (likely(packet_type & RTE_PTYPE_L4_MASK) &&
+		    (cksum_err & HNS3_L4_CKSUM_ERR) == 0)
+			rxm->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+	}
+}
+
+static inline uint32_t
+hns3_rx_calc_ptype(struct hns3_rx_queue *rxq, const uint32_t l234_info,
+		   const uint32_t ol_info)
+{
+	const struct hns3_ptype_table *const ptype_tbl = rxq->ptype_tbl;
+	uint32_t l2id, l3id, l4id;
+	uint32_t ol3id, ol4id;
+
+	ol4id = hns3_get_field(ol_info, HNS3_RXD_OL4ID_M, HNS3_RXD_OL4ID_S);
+	ol3id = hns3_get_field(ol_info, HNS3_RXD_OL3ID_M, HNS3_RXD_OL3ID_S);
+	l2id = hns3_get_field(l234_info, HNS3_RXD_STRP_TAGP_M,
+			      HNS3_RXD_STRP_TAGP_S);
+	l3id = hns3_get_field(l234_info, HNS3_RXD_L3ID_M, HNS3_RXD_L3ID_S);
+	l4id = hns3_get_field(l234_info, HNS3_RXD_L4ID_M, HNS3_RXD_L4ID_S);
+
+	if (unlikely(ptype_tbl->ol4table[ol4id]))
+		return ptype_tbl->inner_l2table[l2id] |
+			ptype_tbl->inner_l3table[l3id] |
+			ptype_tbl->inner_l4table[l4id] |
+			ptype_tbl->ol3table[ol3id] | ptype_tbl->ol4table[ol4id];
+	else
+		return ptype_tbl->l2table[l2id] | ptype_tbl->l3table[l3id] |
+			ptype_tbl->l4table[l4id];
+}
+
 void hns3_dev_rx_queue_release(void *queue);
 void hns3_dev_tx_queue_release(void *queue);
 void hns3_free_all_queues(struct rte_eth_dev *dev);
@@ -398,11 +518,17 @@ int hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 			unsigned int socket, const struct rte_eth_txconf *conf);
 uint16_t hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			uint16_t nb_pkts);
+uint16_t hns3_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts);
+int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
+			   __rte_unused uint16_t queue_id,
+			   struct rte_eth_burst_mode *mode);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
 const uint32_t *hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev);
+void hns3_init_rx_ptype_tble(struct rte_eth_dev *dev);
 void hns3_set_rxtx_function(struct rte_eth_dev *eth_dev);
 void hns3_set_queue_intr_gl(struct hns3_hw *hw, uint16_t queue_id,
 			    uint8_t gl_idx, uint16_t gl_value);
@@ -415,6 +541,8 @@ int hns3_set_fake_rx_or_tx_queues(struct rte_eth_dev *dev, uint16_t nb_rx_q,
 int hns3_config_gro(struct hns3_hw *hw, bool en);
 int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
+void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
+void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
diff --git a/drivers/net/hns3/hns3_stats.c b/drivers/net/hns3/hns3_stats.c
index f2918fc..067673c 100644
--- a/drivers/net/hns3/hns3_stats.c
+++ b/drivers/net/hns3/hns3_stats.c
@@ -271,13 +271,13 @@ static const struct hns3_xstats_name_offset hns3_rx_bd_error_strings[] = {
 	{"L2_RX_ERRORS",
 		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(l2_errors)},
 	{"RX_L3_CHECKSUM_ERRORS",
-		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(l3_csum_erros)},
+		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(l3_csum_errors)},
 	{"RX_L4_CHECKSUM_ERRORS",
-		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(l4_csum_erros)},
+		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(l4_csum_errors)},
 	{"RX_OL3_CHECKSUM_ERRORS",
-		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(ol3_csum_erros)},
+		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(ol3_csum_errors)},
 	{"RX_OL4_CHECKSUM_ERRORS",
-		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(ol4_csum_erros)}
+		HNS3_RX_BD_ERROR_STATS_FIELD_OFFSET(ol4_csum_errors)}
 };
 
 /* The statistic of the Tx errors */
@@ -594,10 +594,10 @@ hns3_stats_reset(struct rte_eth_dev *eth_dev)
 		if (rxq) {
 			rxq->pkt_len_errors = 0;
 			rxq->l2_errors = 0;
-			rxq->l3_csum_erros = 0;
-			rxq->l4_csum_erros = 0;
-			rxq->ol3_csum_erros = 0;
-			rxq->ol4_csum_erros = 0;
+			rxq->l3_csum_errors = 0;
+			rxq->l4_csum_errors = 0;
+			rxq->ol3_csum_errors = 0;
+			rxq->ol4_csum_errors = 0;
 		}
 	}
 
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 4/8] net/hns3: add simple Tx process function
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                     ` (2 preceding siblings ...)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 3/8] net/hns3: add simple Rx process function Wei Hu (Xavier)
@ 2020-09-09  9:23   ` Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 5/8] net/hns3: add vector Tx burst with NEON instructions Wei Hu (Xavier)
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds a simple Tx process function. When multi-segment packets are
not needed, which means that the DEV_TX_OFFLOAD_MBUF_FAST_FREE offload is not
set, the simple Tx process function can be used.
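
As a usage sketch mirroring the Rx case (not part of the patch; port_id and
nb_desc are placeholders), an application can pick up the reported Tx
defaults, including tx_rs_thresh, in the same way:

    #include <rte_ethdev.h>

    struct rte_eth_dev_info dev_info;
    struct rte_eth_txconf txconf;

    /* default_txconf.tx_rs_thresh is now pre-filled by the hns3 PMD. */
    rte_eth_dev_info_get(port_id, &dev_info);
    txconf = dev_info.default_txconf;
    rte_eth_tx_queue_setup(port_id, 0 /* queue */, nb_desc, SOCKET_ID_ANY,
                           &txconf);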

Signed-off-by: Huisong Li <lihuisong@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c    |   6 +
 drivers/net/hns3/hns3_ethdev.h    |   1 +
 drivers/net/hns3/hns3_ethdev_vf.c |   6 +
 drivers/net/hns3/hns3_rxtx.c      | 260 +++++++++++++++++++++++++++++++++++---
 drivers/net/hns3/hns3_rxtx.h      |  34 +++++
 5 files changed, 292 insertions(+), 15 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 024dc19..b4a5ba3 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2352,6 +2352,7 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->tx_simple_allowed = true;
 	hns3_init_rx_ptype_tble(dev);
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
 
@@ -2512,6 +2513,10 @@ hns3_dev_infos_get(struct rte_eth_dev *eth_dev, struct rte_eth_dev_info *info)
 		.rx_drop_en = 1,
 		.offloads = 0,
 	};
+	info->default_txconf = (struct rte_eth_txconf) {
+		.tx_rs_thresh = HNS3_DEFAULT_TX_RS_THRESH,
+		.offloads = 0,
+	};
 
 	info->vmdq_queue_num = 0;
 
@@ -5545,6 +5550,7 @@ static const struct eth_dev_ops hns3_eth_dev_ops = {
 	.rxq_info_get           = hns3_rxq_info_get,
 	.txq_info_get           = hns3_txq_info_get,
 	.rx_burst_mode_get      = hns3_rx_burst_mode_get,
+	.tx_burst_mode_get      = hns3_tx_burst_mode_get,
 	.flow_ctrl_get          = hns3_flow_ctrl_get,
 	.flow_ctrl_set          = hns3_flow_ctrl_set,
 	.priority_flow_ctrl_set = hns3_priority_flow_ctrl_set,
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index d93c5b2..ef85034 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -643,6 +643,7 @@ struct hns3_adapter {
 	};
 
 	bool rx_simple_allowed;
+	bool tx_simple_allowed;
 	struct hns3_ptype_table ptype_tbl __rte_cache_min_aligned;
 };
 
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 0f155d8..915b896 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -822,6 +822,7 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->tx_simple_allowed = true;
 	hns3_init_rx_ptype_tble(dev);
 
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
@@ -957,6 +958,10 @@ hns3vf_dev_infos_get(struct rte_eth_dev *eth_dev, struct rte_eth_dev_info *info)
 		.rx_drop_en = 1,
 		.offloads = 0,
 	};
+	info->default_txconf = (struct rte_eth_txconf) {
+		.tx_rs_thresh = HNS3_DEFAULT_TX_RS_THRESH,
+		.offloads = 0,
+	};
 
 	info->vmdq_queue_num = 0;
 
@@ -2541,6 +2546,7 @@ static const struct eth_dev_ops hns3vf_eth_dev_ops = {
 	.rxq_info_get       = hns3_rxq_info_get,
 	.txq_info_get       = hns3_txq_info_get,
 	.rx_burst_mode_get  = hns3_rx_burst_mode_get,
+	.tx_burst_mode_get  = hns3_tx_burst_mode_get,
 	.mac_addr_add       = hns3vf_add_mac_addr,
 	.mac_addr_remove    = hns3vf_remove_mac_addr,
 	.mac_addr_set       = hns3vf_set_default_mac_addr,
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 10f0112..dc09ea0 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -1952,27 +1952,72 @@ hns3_get_rx_function(struct rte_eth_dev *dev)
 
 	return hns3_recv_scattered_pkts;
 }
+
+static int
+hns3_tx_queue_conf_check(struct hns3_hw *hw, const struct rte_eth_txconf *conf,
+			 uint16_t nb_desc, uint16_t *tx_rs_thresh,
+			 uint16_t *tx_free_thresh, uint16_t idx)
+{
+#define HNS3_TX_RS_FREE_THRESH_GAP	8
+	uint16_t rs_thresh, free_thresh, fast_free_thresh;
+
+	if (nb_desc > HNS3_MAX_RING_DESC || nb_desc < HNS3_MIN_RING_DESC ||
+	    nb_desc % HNS3_ALIGN_RING_DESC) {
+		hns3_err(hw, "number (%u) of tx descriptors is invalid",
+			 nb_desc);
+		return -EINVAL;
+	}
+
+	rs_thresh = (conf->tx_rs_thresh > 0) ?
+			conf->tx_rs_thresh : HNS3_DEFAULT_TX_RS_THRESH;
+	free_thresh = (conf->tx_free_thresh > 0) ?
+			conf->tx_free_thresh : HNS3_DEFAULT_TX_FREE_THRESH;
+	if (rs_thresh + free_thresh > nb_desc || nb_desc % rs_thresh ||
+	    rs_thresh >= nb_desc - HNS3_TX_RS_FREE_THRESH_GAP ||
+	    free_thresh >= nb_desc - HNS3_TX_RS_FREE_THRESH_GAP) {
+		hns3_err(hw, "tx_rs_thresh (%d) tx_free_thresh (%d) nb_desc "
+			 "(%d) of tx descriptors for port=%d queue=%d check "
+			 "fail!",
+			 rs_thresh, free_thresh, nb_desc, hw->data->port_id,
+			 idx);
+		return -EINVAL;
+	}
+
+	if (conf->tx_free_thresh == 0) {
+		/* Fast free Tx memory buffer to improve cache hit rate */
+		fast_free_thresh = nb_desc - rs_thresh;
+		if (fast_free_thresh >=
+		    HNS3_TX_FAST_FREE_AHEAD + HNS3_DEFAULT_TX_FREE_THRESH)
+			free_thresh = fast_free_thresh -
+					HNS3_TX_FAST_FREE_AHEAD;
+	}
+
+	*tx_rs_thresh = rs_thresh;
+	*tx_free_thresh = free_thresh;
+	return 0;
+}
+
 int
 hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		    unsigned int socket_id, const struct rte_eth_txconf *conf)
 {
 	struct hns3_adapter *hns = dev->data->dev_private;
+	uint16_t tx_rs_thresh, tx_free_thresh;
 	struct hns3_hw *hw = &hns->hw;
 	struct hns3_queue_info q_info;
 	struct hns3_tx_queue *txq;
 	int tx_entry_len;
+	int ret;
 
 	if (dev->data->dev_started) {
 		hns3_err(hw, "tx_queue_setup after dev_start no supported");
 		return -EINVAL;
 	}
 
-	if (nb_desc > HNS3_MAX_RING_DESC || nb_desc < HNS3_MIN_RING_DESC ||
-	    nb_desc % HNS3_ALIGN_RING_DESC) {
-		hns3_err(hw, "Number (%u) of tx descriptors is invalid",
-			    nb_desc);
-		return -EINVAL;
-	}
+	ret = hns3_tx_queue_conf_check(hw, conf, nb_desc,
+				       &tx_rs_thresh, &tx_free_thresh, idx);
+	if (ret)
+		return ret;
 
 	if (dev->data->tx_queues[idx] != NULL) {
 		hns3_tx_queue_release(dev->data->tx_queues[idx]);
@@ -2005,11 +2050,15 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	txq->next_to_use = 0;
 	txq->next_to_clean = 0;
 	txq->tx_bd_ready = txq->nb_tx_desc - 1;
+	txq->tx_free_thresh = tx_free_thresh;
+	txq->tx_rs_thresh = tx_rs_thresh;
 	txq->port_id = dev->data->port_id;
 	txq->pvid_state = hw->port_base_vlan_cfg.state;
 	txq->configured = true;
 	txq->io_base = (void *)((char *)hw->io_base + HNS3_TQP_REG_OFFSET +
 				idx * HNS3_TQP_REG_SIZE);
+	txq->io_tail_reg = (volatile void *)((char *)txq->io_base +
+					     HNS3_RING_TX_TAIL_REG);
 	txq->min_tx_pkt_len = hw->min_tx_pkt_len;
 	txq->over_length_pkt_cnt = 0;
 	txq->exceed_limit_bd_pkt_cnt = 0;
@@ -2024,12 +2073,6 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	return 0;
 }
 
-static inline void
-hns3_queue_xmit(struct hns3_tx_queue *txq, uint32_t buf_num)
-{
-	hns3_write_dev(txq, HNS3_RING_TX_TAIL_REG, buf_num);
-}
-
 static void
 hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 {
@@ -2798,6 +2841,154 @@ hns3_check_non_tso_pkt(uint16_t nb_buf, struct rte_mbuf **m_seg,
 	return 0;
 }
 
+static inline void
+hns3_tx_free_buffer_simple(struct hns3_tx_queue *txq)
+{
+	struct hns3_entry *tx_entry;
+	struct hns3_desc *desc;
+	uint16_t tx_next_clean;
+	int i;
+
+	while (1) {
+		if (HNS3_GET_TX_QUEUE_PEND_BD_NUM(txq) < txq->tx_rs_thresh)
+			break;
+
+		/*
+		 * All mbufs can be released only when the VLD bits of all
+		 * descriptors in a batch are cleared.
+		 */
+		tx_next_clean = (txq->next_to_clean + txq->tx_rs_thresh - 1) %
+				txq->nb_tx_desc;
+		desc = &txq->tx_ring[tx_next_clean];
+		for (i = 0; i < txq->tx_rs_thresh; i++) {
+			if (rte_le_to_cpu_16(desc->tx.tp_fe_sc_vld_ra_ri) &
+					BIT(HNS3_TXD_VLD_B))
+				return;
+			desc--;
+		}
+
+		tx_entry = &txq->sw_ring[txq->next_to_clean];
+
+		for (i = 0; i < txq->tx_rs_thresh; i++)
+			rte_prefetch0((tx_entry + i)->mbuf);
+		for (i = 0; i < txq->tx_rs_thresh; i++, tx_entry++) {
+			rte_mempool_put(tx_entry->mbuf->pool, tx_entry->mbuf);
+			tx_entry->mbuf = NULL;
+		}
+
+		txq->next_to_clean = (tx_next_clean + 1) % txq->nb_tx_desc;
+		txq->tx_bd_ready += txq->tx_rs_thresh;
+	}
+}
+
+static inline void
+hns3_tx_backup_1mbuf(struct hns3_entry *tx_entry, struct rte_mbuf **pkts)
+{
+	tx_entry->mbuf = pkts[0];
+}
+
+static inline void
+hns3_tx_backup_4mbuf(struct hns3_entry *tx_entry, struct rte_mbuf **pkts)
+{
+	hns3_tx_backup_1mbuf(&tx_entry[0], &pkts[0]);
+	hns3_tx_backup_1mbuf(&tx_entry[1], &pkts[1]);
+	hns3_tx_backup_1mbuf(&tx_entry[2], &pkts[2]);
+	hns3_tx_backup_1mbuf(&tx_entry[3], &pkts[3]);
+}
+
+static inline void
+hns3_tx_setup_4bd(struct hns3_desc *txdp, struct rte_mbuf **pkts)
+{
+#define PER_LOOP_NUM	4
+	const uint16_t bd_flag = BIT(HNS3_TXD_VLD_B) | BIT(HNS3_TXD_FE_B);
+	uint64_t dma_addr;
+	uint32_t i;
+
+	for (i = 0; i < PER_LOOP_NUM; i++, txdp++, pkts++) {
+		dma_addr = rte_mbuf_data_iova(*pkts);
+		txdp->addr = rte_cpu_to_le_64(dma_addr);
+		txdp->tx.send_size = rte_cpu_to_le_16((*pkts)->data_len);
+		txdp->tx.paylen = 0;
+		txdp->tx.type_cs_vlan_tso_len = 0;
+		txdp->tx.ol_type_vlan_len_msec = 0;
+		txdp->tx.tp_fe_sc_vld_ra_ri = rte_cpu_to_le_16(bd_flag);
+	}
+}
+
+static inline void
+hns3_tx_setup_1bd(struct hns3_desc *txdp, struct rte_mbuf **pkts)
+{
+	const uint16_t bd_flag = BIT(HNS3_TXD_VLD_B) | BIT(HNS3_TXD_FE_B);
+	uint64_t dma_addr;
+
+	dma_addr = rte_mbuf_data_iova(*pkts);
+	txdp->addr = rte_cpu_to_le_64(dma_addr);
+	txdp->tx.send_size = rte_cpu_to_le_16((*pkts)->data_len);
+	txdp->tx.paylen = 0;
+	txdp->tx.type_cs_vlan_tso_len = 0;
+	txdp->tx.ol_type_vlan_len_msec = 0;
+	txdp->tx.tp_fe_sc_vld_ra_ri = rte_cpu_to_le_16(bd_flag);
+}
+
+static inline void
+hns3_tx_fill_hw_ring(struct hns3_tx_queue *txq,
+		     struct rte_mbuf **pkts,
+		     uint16_t nb_pkts)
+{
+#define PER_LOOP_NUM	4
+#define PER_LOOP_MASK	(PER_LOOP_NUM - 1)
+	struct hns3_desc *txdp = &txq->tx_ring[txq->next_to_use];
+	struct hns3_entry *tx_entry = &txq->sw_ring[txq->next_to_use];
+	const uint32_t mainpart = (nb_pkts & ((uint32_t)~PER_LOOP_MASK));
+	const uint32_t leftover = (nb_pkts & ((uint32_t)PER_LOOP_MASK));
+	uint32_t i;
+
+	for (i = 0; i < mainpart; i += PER_LOOP_NUM) {
+		hns3_tx_backup_4mbuf(tx_entry + i, pkts + i);
+		hns3_tx_setup_4bd(txdp + i, pkts + i);
+	}
+	if (unlikely(leftover > 0)) {
+		for (i = 0; i < leftover; i++) {
+			hns3_tx_backup_1mbuf(tx_entry + mainpart + i,
+					     pkts + mainpart + i);
+			hns3_tx_setup_1bd(txdp + mainpart + i,
+					  pkts + mainpart + i);
+		}
+	}
+}
+
+uint16_t
+hns3_xmit_pkts_simple(void *tx_queue,
+		      struct rte_mbuf **tx_pkts,
+		      uint16_t nb_pkts)
+{
+	struct hns3_tx_queue *txq = tx_queue;
+	uint16_t nb_tx = 0;
+
+	hns3_tx_free_buffer_simple(txq);
+
+	nb_pkts = RTE_MIN(txq->tx_bd_ready, nb_pkts);
+	if (unlikely(nb_pkts == 0)) {
+		if (txq->tx_bd_ready == 0)
+			txq->queue_full_cnt++;
+		return 0;
+	}
+
+	txq->tx_bd_ready -= nb_pkts;
+	if (txq->next_to_use + nb_pkts > txq->nb_tx_desc) {
+		nb_tx = txq->nb_tx_desc - txq->next_to_use;
+		hns3_tx_fill_hw_ring(txq, tx_pkts, nb_tx);
+		txq->next_to_use = 0;
+	}
+
+	hns3_tx_fill_hw_ring(txq, tx_pkts + nb_tx, nb_pkts - nb_tx);
+	txq->next_to_use += nb_pkts - nb_tx;
+
+	hns3_write_reg_opt(txq->io_tail_reg, nb_pkts);
+
+	return nb_pkts;
+}
+
 uint16_t
 hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 {
@@ -2909,11 +3100,47 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 end_of_tx:
 
 	if (likely(nb_tx))
-		hns3_queue_xmit(txq, nb_hold);
+		hns3_write_reg_opt(txq->io_tail_reg, nb_hold);
 
 	return nb_tx;
 }
 
+int
+hns3_tx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
+		       struct rte_eth_burst_mode *mode)
+{
+	eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
+	const char *info = NULL;
+
+	if (pkt_burst == hns3_xmit_pkts_simple)
+		info = "Scalar Simple";
+	else if (pkt_burst == hns3_xmit_pkts)
+		info = "Scalar";
+
+	if (info == NULL)
+		return -EINVAL;
+
+	snprintf(mode->info, sizeof(mode->info), "%s", info);
+
+	return 0;
+}
+
+static eth_tx_burst_t
+hns3_get_tx_function(struct rte_eth_dev *dev, eth_tx_prep_t *prep)
+{
+	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
+	struct hns3_adapter *hns = dev->data->dev_private;
+
+	if (hns->tx_simple_allowed &&
+	    offloads == (offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE)) {
+		*prep = NULL;
+		return hns3_xmit_pkts_simple;
+	}
+
+	*prep = hns3_prep_pkts;
+	return hns3_xmit_pkts;
+}
+
 static uint16_t
 hns3_dummy_rxtx_burst(void *dpdk_txq __rte_unused,
 		      struct rte_mbuf **pkts __rte_unused,
@@ -2925,12 +3152,13 @@ hns3_dummy_rxtx_burst(void *dpdk_txq __rte_unused,
 void hns3_set_rxtx_function(struct rte_eth_dev *eth_dev)
 {
 	struct hns3_adapter *hns = eth_dev->data->dev_private;
+	eth_tx_prep_t prep = NULL;
 
 	if (hns->hw.adapter_state == HNS3_NIC_STARTED &&
 	    rte_atomic16_read(&hns->hw.reset.resetting) == 0) {
 		eth_dev->rx_pkt_burst = hns3_get_rx_function(eth_dev);
-		eth_dev->tx_pkt_burst = hns3_xmit_pkts;
-		eth_dev->tx_pkt_prepare = hns3_prep_pkts;
+		eth_dev->tx_pkt_burst = hns3_get_tx_function(eth_dev, &prep);
+		eth_dev->tx_pkt_prepare = prep;
 	} else {
 		eth_dev->rx_pkt_burst = hns3_dummy_rxtx_burst;
 		eth_dev->tx_pkt_burst = hns3_dummy_rxtx_burst;
@@ -2966,5 +3194,7 @@ hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 
 	qinfo->nb_desc = txq->nb_tx_desc;
 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
+	qinfo->conf.tx_rs_thresh = txq->tx_rs_thresh;
+	qinfo->conf.tx_free_thresh = txq->tx_free_thresh;
 	qinfo->conf.tx_deferred_start = txq->tx_deferred_start;
 }
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index a6a607e..32f5d34 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -13,6 +13,9 @@
 #define HNS3_BULK_ALLOC_MBUF_NUM	32
 
 #define HNS3_DEFAULT_RX_FREE_THRESH	32
+#define HNS3_DEFAULT_TX_FREE_THRESH	32
+#define HNS3_DEFAULT_TX_RS_THRESH	32
+#define HNS3_TX_FAST_FREE_AHEAD		64
 
 #define HNS3_512_BD_BUF_SIZE	512
 #define HNS3_1K_BD_BUF_SIZE	1024
@@ -282,6 +285,7 @@ struct hns3_rx_queue {
 
 struct hns3_tx_queue {
 	void *io_base;
+	volatile void *io_tail_reg;
 	struct hns3_adapter *hns;
 	struct hns3_desc *tx_ring;
 	uint64_t tx_ring_phys_addr; /* TX ring DMA address */
@@ -291,10 +295,32 @@ struct hns3_tx_queue {
 	uint16_t queue_id;
 	uint16_t port_id;
 	uint16_t nb_tx_desc;
+	/*
+	 * index of next BD whose corresponding rte_mbuf can be released by
+	 * driver.
+	 */
 	uint16_t next_to_clean;
+	/* index of next BD to be filled by driver to send packet */
 	uint16_t next_to_use;
+	/* num of remaining BDs ready to be filled by driver to send packet */
 	uint16_t tx_bd_ready;
 
+	/* free Tx buffers when available BDs drop below this threshold */
+	uint16_t tx_free_thresh;
+
+	/*
+	 * For better performance in tx datapath, releasing mbuf in batches is
+	 * required.
+	 * Only checking the VLD bit of the last descriptor in a batch of the
+	 * thresh descriptors does not mean that these descriptors are all sent
+	 * by hardware successfully. So we need to check that the VLD bits of
+	 * all descriptors are cleared. and then free all mbufs in the batch.
+	 * - tx_rs_thresh
+	 *   Number of mbufs released at a time.
+
+	 */
+	uint16_t tx_rs_thresh;
+
 	/*
 	 * port based vlan configuration state.
 	 * value range: HNS3_PORT_BASE_VLAN_DISABLE / HNS3_PORT_BASE_VLAN_ENABLE
@@ -360,6 +386,9 @@ struct hns3_tx_queue {
 	uint64_t pkt_padding_fail_cnt;
 };
 
+#define HNS3_GET_TX_QUEUE_PEND_BD_NUM(txq) \
+		((txq)->nb_tx_desc - 1 - (txq)->tx_bd_ready)
+
 struct hns3_queue_info {
 	const char *type;   /* point to queue memory name */
 	const char *ring_name;  /* point to hardware ring name */
@@ -525,8 +554,13 @@ int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
 			   struct rte_eth_burst_mode *mode);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
+uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
+			       uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
+int hns3_tx_burst_mode_get(struct rte_eth_dev *dev,
+			   __rte_unused uint16_t queue_id,
+			   struct rte_eth_burst_mode *mode);
 const uint32_t *hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev);
 void hns3_init_rx_ptype_tble(struct rte_eth_dev *dev);
 void hns3_set_rxtx_function(struct rte_eth_dev *eth_dev);
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 5/8] net/hns3: add vector Tx burst with NEON instructions
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                     ` (3 preceding siblings ...)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 4/8] net/hns3: add simple Tx " Wei Hu (Xavier)
@ 2020-09-09  9:23   ` Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 6/8] net/hns3: add vector Rx " Wei Hu (Xavier)
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds NEON vector instructions to optimize the Tx burst process.
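
As a usage note (a sketch, not part of this patch): once the port is
started, an application can confirm which Tx burst implementation was
selected through the standard ethdev burst-mode query; with this series the
hns3 driver reports "Vector Neon", "Scalar Simple" or "Scalar". The queue
id 0 used below is an assumption.

    #include <stdio.h>
    #include <rte_ethdev.h>

    /* Print the Tx burst mode selected for queue 0 of the given port. */
    static void show_tx_burst_mode(uint16_t port_id)
    {
        struct rte_eth_burst_mode mode;

        if (rte_eth_tx_burst_mode_get(port_id, 0, &mode) == 0)
            printf("port %u Tx burst mode: %s\n", port_id, mode.info);
    }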

Signed-off-by: Huisong Li <lihuisong@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
v1 -> v2: modification about meson.build for hns3_rxtx_vec.c file.
---
 drivers/net/hns3/hns3_ethdev.c        |  2 +
 drivers/net/hns3/hns3_ethdev.h        |  2 +
 drivers/net/hns3/hns3_ethdev_vf.c     |  2 +
 drivers/net/hns3/hns3_rxtx.c          | 33 ++++++++++++++
 drivers/net/hns3/hns3_rxtx.h          | 20 ++++++++-
 drivers/net/hns3/hns3_rxtx_vec.c      | 47 +++++++++++++++++++
 drivers/net/hns3/hns3_rxtx_vec.h      | 57 +++++++++++++++++++++++
 drivers/net/hns3/hns3_rxtx_vec_neon.h | 85 +++++++++++++++++++++++++++++++++++
 drivers/net/hns3/meson.build          |  4 ++
 9 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec.c
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec.h
 create mode 100644 drivers/net/hns3/hns3_rxtx_vec_neon.h

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index b4a5ba3..9df5fc8 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2353,6 +2353,8 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 
 	hns->rx_simple_allowed = true;
 	hns->tx_simple_allowed = true;
+	hns->tx_vec_allowed = true;
+
 	hns3_init_rx_ptype_tble(dev);
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index ef85034..098b6ce 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -644,6 +644,8 @@ struct hns3_adapter {
 
 	bool rx_simple_allowed;
 	bool tx_simple_allowed;
+	bool tx_vec_allowed;
+
 	struct hns3_ptype_table ptype_tbl __rte_cache_min_aligned;
 };
 
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 915b896..f3e6aea 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -823,6 +823,8 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 
 	hns->rx_simple_allowed = true;
 	hns->tx_simple_allowed = true;
+	hns->tx_vec_allowed = true;
+
 	hns3_init_rx_ptype_tble(dev);
 
 	hw->adapter_state = HNS3_NIC_CONFIGURED;
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index dc09ea0..3e708b5 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -95,6 +95,8 @@ hns3_tx_queue_release(void *queue)
 			rte_memzone_free(txq->mz);
 		if (txq->sw_ring)
 			rte_free(txq->sw_ring);
+		if (txq->free)
+			rte_free(txq->free);
 		rte_free(txq);
 	}
 }
@@ -1020,6 +1022,7 @@ hns3_fake_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 
 	/* Don't need alloc sw_ring, because upper applications don't use it */
 	txq->sw_ring = NULL;
+	txq->free = NULL;
 
 	txq->hns = hns;
 	txq->tx_deferred_start = false;
@@ -2052,6 +2055,15 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	txq->tx_bd_ready = txq->nb_tx_desc - 1;
 	txq->tx_free_thresh = tx_free_thresh;
 	txq->tx_rs_thresh = tx_rs_thresh;
+	txq->free = rte_zmalloc_socket("hns3 TX mbuf free array",
+				sizeof(struct rte_mbuf *) * txq->tx_rs_thresh,
+				RTE_CACHE_LINE_SIZE, socket_id);
+	if (!txq->free) {
+		hns3_err(hw, "failed to allocate tx mbuf free array!");
+		hns3_tx_queue_release(txq);
+		return -ENOMEM;
+	}
+
 	txq->port_id = dev->data->port_id;
 	txq->pvid_state = hw->port_base_vlan_cfg.state;
 	txq->configured = true;
@@ -3105,6 +3117,20 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	return nb_tx;
 }
 
+int __rte_weak
+hns3_tx_check_vec_support(__rte_unused struct rte_eth_dev *dev)
+{
+	return -ENOTSUP;
+}
+
+uint16_t __rte_weak
+hns3_xmit_pkts_vec(__rte_unused void *tx_queue,
+		   __rte_unused struct rte_mbuf **tx_pkts,
+		   __rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
+
 int
 hns3_tx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 		       struct rte_eth_burst_mode *mode)
@@ -3116,6 +3142,8 @@ hns3_tx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 		info = "Scalar Simple";
 	else if (pkt_burst == hns3_xmit_pkts)
 		info = "Scalar";
+	else if (pkt_burst == hns3_xmit_pkts_vec)
+		info = "Vector Neon";
 
 	if (info == NULL)
 		return -EINVAL;
@@ -3131,6 +3159,11 @@ hns3_get_tx_function(struct rte_eth_dev *dev, eth_tx_prep_t *prep)
 	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
 	struct hns3_adapter *hns = dev->data->dev_private;
 
+	if (hns->tx_vec_allowed && hns3_tx_check_vec_support(dev) == 0) {
+		*prep = NULL;
+		return hns3_xmit_pkts_vec;
+	}
+
 	if (hns->tx_simple_allowed &&
 	    offloads == (offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE)) {
 		*prep = NULL;
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index 32f5d34..b471bf5 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -17,6 +17,10 @@
 #define HNS3_DEFAULT_TX_RS_THRESH	32
 #define HNS3_TX_FAST_FREE_AHEAD		64
 
+#define HNS3_UINT8_BIT			8
+#define HNS3_UINT16_BIT			16
+#define HNS3_UINT32_BIT			32
+
 #define HNS3_512_BD_BUF_SIZE	512
 #define HNS3_1K_BD_BUF_SIZE	1024
 #define HNS3_2K_BD_BUF_SIZE	2048
@@ -132,6 +136,13 @@
 #define HNS3_L3_LEN_UNIT			2UL
 #define HNS3_L4_LEN_UNIT			2UL
 
+#define HNS3_TXD_DEFAULT_BDTYPE		0
+#define HNS3_TXD_VLD_CMD		(0x1 << HNS3_TXD_VLD_B)
+#define HNS3_TXD_FE_CMD			(0x1 << HNS3_TXD_FE_B)
+#define HNS3_TXD_DEFAULT_VLD_FE_BDTYPE		\
+		(HNS3_TXD_VLD_CMD | HNS3_TXD_FE_CMD | HNS3_TXD_DEFAULT_BDTYPE)
+#define HNS3_TXD_SEND_SIZE_SHIFT	16
+
 enum hns3_pkt_l2t_type {
 	HNS3_L2_TYPE_UNICAST,
 	HNS3_L2_TYPE_MULTICAST,
@@ -317,9 +328,13 @@ struct hns3_tx_queue {
 	 * all descriptors are cleared. and then free all mbufs in the batch.
 	 * - tx_rs_thresh
 	 *   Number of mbufs released at a time.
-
+	 *
+	 * - free
+	 *   Tx mbuf free array used for preserving temporarily address of mbuf
+	 *   released back to mempool, when releasing mbuf in batches.
 	 */
 	uint16_t tx_rs_thresh;
+	struct rte_mbuf **free;
 
 	/*
 	 * port based vlan configuration state.
@@ -558,6 +573,8 @@ uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 			       uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
+uint16_t hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+							uint16_t nb_pkts);
 int hns3_tx_burst_mode_get(struct rte_eth_dev *dev,
 			   __rte_unused uint16_t queue_id,
 			   struct rte_eth_burst_mode *mode);
@@ -577,6 +594,7 @@ int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
 void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
 void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
+int hns3_tx_check_vec_support(struct rte_eth_dev *dev);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
diff --git a/drivers/net/hns3/hns3_rxtx_vec.c b/drivers/net/hns3/hns3_rxtx_vec.c
new file mode 100644
index 0000000..1154b6f
--- /dev/null
+++ b/drivers/net/hns3/hns3_rxtx_vec.c
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Hisilicon Limited.
+ */
+
+#include <rte_io.h>
+#include <rte_ethdev_driver.h>
+
+#include "hns3_ethdev.h"
+#include "hns3_rxtx.h"
+#include "hns3_rxtx_vec.h"
+
+#if defined RTE_ARCH_ARM64
+#include "hns3_rxtx_vec_neon.h"
+#endif
+
+int
+hns3_tx_check_vec_support(struct rte_eth_dev *dev)
+{
+	struct rte_eth_txmode *txmode = &dev->data->dev_conf.txmode;
+
+	/* Only support DEV_TX_OFFLOAD_MBUF_FAST_FREE */
+	if (txmode->offloads != DEV_TX_OFFLOAD_MBUF_FAST_FREE)
+		return -ENOTSUP;
+
+	return 0;
+}
+
+uint16_t
+hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
+	uint16_t nb_tx = 0;
+
+	while (nb_pkts) {
+		uint16_t ret, new_burst;
+
+		new_burst = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = hns3_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx],
+						new_burst);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < new_burst)
+			break;
+	}
+
+	return nb_tx;
+}
diff --git a/drivers/net/hns3/hns3_rxtx_vec.h b/drivers/net/hns3/hns3_rxtx_vec.h
new file mode 100644
index 0000000..90679bf
--- /dev/null
+++ b/drivers/net/hns3/hns3_rxtx_vec.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Hisilicon Limited.
+ */
+
+#ifndef _HNS3_RXTX_VEC_H_
+#define _HNS3_RXTX_VEC_H_
+
+#include "hns3_rxtx.h"
+#include "hns3_ethdev.h"
+
+static inline void
+hns3_tx_free_buffers(struct hns3_tx_queue *txq)
+{
+	struct rte_mbuf **free = txq->free;
+	struct hns3_entry *tx_entry;
+	struct hns3_desc *tx_desc;
+	struct rte_mbuf *m;
+	int nb_free = 0;
+	int i;
+
+	/*
+	 * All mbufs can be released only when the VLD bits of all
+	 * descriptors in a batch are cleared.
+	 */
+	tx_desc = &txq->tx_ring[txq->next_to_clean];
+	for (i = 0; i < txq->tx_rs_thresh; i++, tx_desc++) {
+		if (tx_desc->tx.tp_fe_sc_vld_ra_ri &
+				rte_le_to_cpu_16(BIT(HNS3_TXD_VLD_B)))
+			return;
+	}
+
+	tx_entry = &txq->sw_ring[txq->next_to_clean];
+	for (i = 0; i < txq->tx_rs_thresh; i++, tx_entry++) {
+		m = rte_pktmbuf_prefree_seg(tx_entry->mbuf);
+		tx_entry->mbuf = NULL;
+
+		if (m == NULL)
+			continue;
+
+		if (nb_free && m->pool != free[0]->pool) {
+			rte_mempool_put_bulk(free[0]->pool, (void **)free,
+					     nb_free);
+			nb_free = 0;
+		}
+		free[nb_free++] = m;
+	}
+
+	if (nb_free)
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+
+	/* Update numbers of available descriptor due to buffer freed */
+	txq->tx_bd_ready += txq->tx_rs_thresh;
+	txq->next_to_clean += txq->tx_rs_thresh;
+	if (txq->next_to_clean >= txq->nb_tx_desc)
+		txq->next_to_clean = 0;
+}
+#endif /* _HNS3_RXTX_VEC_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
new file mode 100644
index 0000000..e878ee1
--- /dev/null
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Hisilicon Limited.
+ */
+
+#ifndef _HNS3_RXTX_VEC_NEON_H_
+#define _HNS3_RXTX_VEC_NEON_H_
+
+#include <arm_neon.h>
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+static inline void
+hns3_vec_tx(volatile struct hns3_desc *desc, struct rte_mbuf *pkt)
+{
+	uint64x2_t val1 = {
+		pkt->buf_iova + pkt->data_off,
+		((uint64_t)pkt->data_len) << HNS3_TXD_SEND_SIZE_SHIFT
+	};
+	uint64x2_t val2 = {
+		0,
+		((uint64_t)HNS3_TXD_DEFAULT_VLD_FE_BDTYPE) << HNS3_UINT32_BIT
+	};
+	vst1q_u64((uint64_t *)&desc->addr, val1);
+	vst1q_u64((uint64_t *)&desc->tx.outer_vlan_tag, val2);
+}
+
+static uint16_t
+hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
+			  struct rte_mbuf **__restrict tx_pkts,
+			  uint16_t nb_pkts)
+{
+	struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
+	volatile struct hns3_desc *tx_desc;
+	struct hns3_entry *tx_entry;
+	uint16_t next_to_use;
+	uint16_t nb_commit;
+	uint16_t nb_tx;
+	uint16_t n, i;
+
+	if (txq->tx_bd_ready < txq->tx_free_thresh)
+		hns3_tx_free_buffers(txq);
+
+	nb_commit = RTE_MIN(txq->tx_bd_ready, nb_pkts);
+	if (unlikely(nb_commit == 0)) {
+		txq->queue_full_cnt++;
+		return 0;
+	}
+	nb_tx = nb_commit;
+
+	next_to_use = txq->next_to_use;
+	tx_desc = &txq->tx_ring[next_to_use];
+	tx_entry = &txq->sw_ring[next_to_use];
+
+	/*
+	 * We need to deal with n descriptors first for better performance,
+	 * if nb_commit is greater than the difference between txq->nb_tx_desc
+	 * and next_to_use in sw_ring and tx_ring.
+	 */
+	n = txq->nb_tx_desc - next_to_use;
+	if (nb_commit >= n) {
+		for (i = 0; i < n; i++, tx_pkts++, tx_desc++) {
+			hns3_vec_tx(tx_desc, *tx_pkts);
+			tx_entry[i].mbuf = *tx_pkts;
+		}
+
+		nb_commit -= n;
+		next_to_use = 0;
+		tx_desc = &txq->tx_ring[next_to_use];
+		tx_entry = &txq->sw_ring[next_to_use];
+	}
+
+	for (i = 0; i < nb_commit; i++, tx_pkts++, tx_desc++) {
+		hns3_vec_tx(tx_desc, *tx_pkts);
+		tx_entry[i].mbuf = *tx_pkts;
+	}
+
+	next_to_use += nb_commit;
+	txq->next_to_use = next_to_use;
+	txq->tx_bd_ready -= nb_tx;
+
+	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
+
+	return nb_tx;
+}
+#endif /* _HNS3_RXTX_VEC_NEON_H_ */
diff --git a/drivers/net/hns3/meson.build b/drivers/net/hns3/meson.build
index e01e6ce..bf69ad4 100644
--- a/drivers/net/hns3/meson.build
+++ b/drivers/net/hns3/meson.build
@@ -28,3 +28,7 @@ sources = files('hns3_cmd.c',
 	'hns3_mp.c')
 
 deps += ['hash']
+
+if arch_subdir == 'arm' and dpdk_conf.get('RTE_ARCH_64')
+	sources += files('hns3_rxtx_vec.c')
+endif
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 6/8] net/hns3: add vector Rx burst with NEON instructions
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                     ` (4 preceding siblings ...)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 5/8] net/hns3: add vector Tx burst with NEON instructions Wei Hu (Xavier)
@ 2020-09-09  9:23   ` Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 7/8] net/hns3: add restriction on setting VF MTU Wei Hu (Xavier)
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

This patch adds NEON vector instructions to optimize the Rx burst process.
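
For context (a hedged sketch, not part of the patch): according to the
checks added in hns3_rx_check_vec_support() and hns3_rxq_vec_check() below,
the vector Rx path is only chosen when scattered Rx, flow director, LRO and
VLAN offloads are disabled and each Rx ring is both large enough and a
multiple of the 64-BD rearm threshold. A queue setup satisfying those
constraints might look as follows; the ring size and helper name are
illustrative.

    #include <rte_ethdev.h>

    /* 1024 BDs: >= 64 + 32 and a multiple of the 64-BD rearm threshold. */
    #define NB_RXD  1024

    static int setup_vec_friendly_rxq(uint16_t port_id, struct rte_mempool *mp)
    {
        struct rte_eth_conf conf = {0};  /* no LRO/VLAN offloads, fdir off */
        int ret;

        ret = rte_eth_dev_configure(port_id, 1, 1, &conf);
        if (ret != 0)
            return ret;

        /* Default rxconf: thresholds come from the PMD's dev_info. */
        return rte_eth_rx_queue_setup(port_id, 0, NB_RXD,
                                      rte_eth_dev_socket_id(port_id),
                                      NULL, mp);
    }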

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c        |   1 +
 drivers/net/hns3/hns3_ethdev.h        |   1 +
 drivers/net/hns3/hns3_ethdev_vf.c     |   1 +
 drivers/net/hns3/hns3_rxtx.c          |  94 +++++++++++++++-
 drivers/net/hns3/hns3_rxtx.h          |  35 +++++-
 drivers/net/hns3/hns3_rxtx_vec.c      | 167 ++++++++++++++++++++++++++++
 drivers/net/hns3/hns3_rxtx_vec.h      |  20 ++++
 drivers/net/hns3/hns3_rxtx_vec_neon.h | 203 ++++++++++++++++++++++++++++++++++
 8 files changed, 514 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 9df5fc8..61be870 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2352,6 +2352,7 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->rx_vec_allowed = true;
 	hns->tx_simple_allowed = true;
 	hns->tx_vec_allowed = true;
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 098b6ce..fd6a9f9 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -643,6 +643,7 @@ struct hns3_adapter {
 	};
 
 	bool rx_simple_allowed;
+	bool rx_vec_allowed;
 	bool tx_simple_allowed;
 	bool tx_vec_allowed;
 
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index f3e6aea..93f2c93 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -822,6 +822,7 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->rx_vec_allowed = true;
 	hns->tx_simple_allowed = true;
 	hns->tx_vec_allowed = true;
 
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 3e708b5..ada02de 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -41,9 +41,19 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
 	if (rxq->sw_ring == NULL)
 		return;
 
-	for (i = 0; i < rxq->nb_rx_desc; i++)
-		if (rxq->sw_ring[i].mbuf)
-			rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+	if (rxq->rx_rearm_nb == 0) {
+		for (i = 0; i < rxq->nb_rx_desc; i++) {
+			if (rxq->sw_ring[i].mbuf != NULL)
+				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+		}
+	} else {
+		for (i = rxq->next_to_use;
+		     i != rxq->rx_rearm_start;
+		     i = (i + 1) % rxq->nb_rx_desc) {
+			if (rxq->sw_ring[i].mbuf != NULL)
+				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+		}
+	}
 
 	for (i = 0; i < rxq->bulk_mbuf_num; i++)
 		rte_pktmbuf_free_seg(rxq->bulk_mbuf[i]);
@@ -661,10 +671,13 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	}
 
 	rxq->next_to_use = 0;
+	rxq->rx_rearm_start = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_nb = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
 	hns3_init_rx_queue_hw(rxq);
+	hns3_rxq_vec_setup(rxq);
 
 	return 0;
 }
@@ -678,6 +691,8 @@ hns3_fake_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
 	rxq->next_to_use = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_start = 0;
+	rxq->rx_rearm_nb = 0;
 	hns3_init_rx_queue_hw(rxq);
 }
 
@@ -860,6 +875,40 @@ hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue)
 	return 0;
 }
 
+/*
+ * Iterate over all Rx queues and call the callback() function for each Rx
+ * queue.
+ *
+ * @param[in] dev
+ *   The target eth dev.
+ * @param[in] callback
+ *   The function to call for each queue.
+ *   If the callback returns nonzero, iteration stops and that value is returned.
+ * @param[in] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   0 on success, otherwise with errno set.
+ */
+int
+hns3_rxq_iterate(struct rte_eth_dev *dev,
+		 int (*callback)(struct hns3_rx_queue *, void *), void *arg)
+{
+	uint32_t i;
+	int ret;
+
+	if (dev->data->rx_queues == NULL)
+		return -EINVAL;
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		ret = callback(dev->data->rx_queues[i], arg);
+		if (ret != 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 static void*
 hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
 			    struct hns3_queue_info *q_info)
@@ -880,7 +929,13 @@ hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
 	/* Allocate rx ring hardware descriptors. */
 	rxq->queue_id = q_info->idx;
 	rxq->nb_rx_desc = q_info->nb_desc;
-	rx_desc = rxq->nb_rx_desc * sizeof(struct hns3_desc);
+
+	/*
+	 * Allocate a little more memory because the Rx vector functions
+	 * don't check boundaries each time.
+	 */
+	rx_desc = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+			sizeof(struct hns3_desc);
 	rx_mz = rte_eth_dma_zone_reserve(dev, q_info->ring_name, q_info->idx,
 					 rx_desc, HNS3_RING_BASE_ALIGN,
 					 q_info->socket_id);
@@ -1329,7 +1384,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
 	rxq->rx_deferred_start = conf->rx_deferred_start;
 
-	rx_entry_len = sizeof(struct hns3_entry) * rxq->nb_rx_desc;
+	rx_entry_len = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+			sizeof(struct hns3_entry);
 	rxq->sw_ring = rte_zmalloc_socket("hns3 RX sw ring", rx_entry_len,
 					  RTE_CACHE_LINE_SIZE, socket_id);
 	if (rxq->sw_ring == NULL) {
@@ -1340,6 +1396,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 
 	rxq->next_to_use = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_start = 0;
+	rxq->rx_rearm_nb = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
 	rxq->port_id = dev->data->port_id;
@@ -1431,7 +1489,8 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 	};
 
 	if (dev->rx_pkt_burst == hns3_recv_pkts ||
-	    dev->rx_pkt_burst == hns3_recv_scattered_pkts)
+	    dev->rx_pkt_burst == hns3_recv_scattered_pkts ||
+	    dev->rx_pkt_burst == hns3_recv_pkts_vec)
 		return ptypes;
 
 	return NULL;
@@ -1915,6 +1974,25 @@ hns3_recv_scattered_pkts(void *rx_queue,
 	return nb_rx;
 }
 
+void __rte_weak
+hns3_rxq_vec_setup(__rte_unused struct hns3_rx_queue *rxq)
+{
+}
+
+int __rte_weak
+hns3_rx_check_vec_support(__rte_unused struct rte_eth_dev *dev)
+{
+	return -ENOTSUP;
+}
+
+uint16_t __rte_weak
+hns3_recv_pkts_vec(__rte_unused void *tx_queue,
+		   __rte_unused struct rte_mbuf **tx_pkts,
+		   __rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
+
 int
 hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 		       struct rte_eth_burst_mode *mode)
@@ -1925,6 +2003,7 @@ hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 	} burst_infos[] = {
 		{ hns3_recv_pkts,		"Scalar" },
 		{ hns3_recv_scattered_pkts,	"Scalar Scattered" },
+		{ hns3_recv_pkts_vec,		"Vector Neon" },
 	};
 
 	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
@@ -1949,6 +2028,9 @@ hns3_get_rx_function(struct rte_eth_dev *dev)
 	struct hns3_adapter *hns = dev->data->dev_private;
 	uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
 
+	if (hns->rx_vec_allowed && hns3_rx_check_vec_support(dev) == 0)
+		return hns3_recv_pkts_vec;
+
 	if (hns->rx_simple_allowed && !dev->data->scattered_rx &&
 	    (offloads & DEV_RX_OFFLOAD_TCP_LRO) == 0)
 		return hns3_recv_pkts;
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index b471bf5..27041ab 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -17,6 +17,18 @@
 #define HNS3_DEFAULT_TX_RS_THRESH	32
 #define HNS3_TX_FAST_FREE_AHEAD		64
 
+#define HNS3_DEFAULT_RX_BURST		32
+#if (HNS3_DEFAULT_RX_BURST > 64)
+#error "PMD HNS3: HNS3_DEFAULT_RX_BURST must <= 64\n"
+#endif
+#define HNS3_DEFAULT_DESCS_PER_LOOP	4
+#define HNS3_SVE_DEFAULT_DESCS_PER_LOOP	8
+#if (HNS3_DEFAULT_DESCS_PER_LOOP > HNS3_SVE_DEFAULT_DESCS_PER_LOOP)
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN	HNS3_DEFAULT_DESCS_PER_LOOP
+#else
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN	HNS3_SVE_DEFAULT_DESCS_PER_LOOP
+#endif
+#define HNS3_DEFAULT_RXQ_REARM_THRESH	64
 #define HNS3_UINT8_BIT			8
 #define HNS3_UINT16_BIT			16
 #define HNS3_UINT32_BIT			32
@@ -236,7 +248,13 @@ struct hns3_desc {
 					uint16_t ot_vlan_tag;
 				};
 			};
-			uint32_t bd_base_info;
+			union {
+				uint32_t bd_base_info;
+				struct {
+					uint16_t bdtype_vld_udp0;
+					uint16_t fe_lum_crcp_l3l4p;
+				};
+			};
 		} rx;
 	};
 } __rte_packed;
@@ -270,7 +288,8 @@ struct hns3_rx_queue {
 	uint16_t rx_free_thresh;
 	uint16_t next_to_use;    /* index of next BD to be polled */
 	uint16_t rx_free_hold;   /* num of BDs waited to passed to hardware */
-
+	uint16_t rx_rearm_start; /* index of BD that driver re-arming from */
+	uint16_t rx_rearm_nb;    /* number of remaining BDs to be re-armed */
 	/*
 	 * port based vlan configuration state.
 	 * value range: HNS3_PORT_BASE_VLAN_DISABLE / HNS3_PORT_BASE_VLAN_ENABLE
@@ -292,6 +311,11 @@ struct hns3_rx_queue {
 
 	struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
 	uint16_t bulk_mbuf_num;
+
+	/* offset_table: used for vector, to solve execute re-order problem */
+	uint8_t offset_table[HNS3_VECTOR_RX_OFFSET_TABLE_LEN + 1];
+	uint64_t mbuf_initializer; /* value to init mbufs used with vector rx */
+	struct rte_mbuf fake_mbuf; /* fake mbuf used with vector rx */
 };
 
 struct hns3_tx_queue {
@@ -554,6 +578,8 @@ int hns3_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id);
 void hns3_enable_all_queues(struct hns3_hw *hw, bool en);
 int hns3_start_queues(struct hns3_adapter *hns, bool reset_queue);
 int hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue);
+int hns3_rxq_iterate(struct rte_eth_dev *dev,
+		 int (*callback)(struct hns3_rx_queue *, void *), void *arg);
 void hns3_dev_release_mbufs(struct hns3_adapter *hns);
 int hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 			unsigned int socket, const struct rte_eth_rxconf *conf,
@@ -564,9 +590,12 @@ uint16_t hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			uint16_t nb_pkts);
 uint16_t hns3_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 				  uint16_t nb_pkts);
+uint16_t hns3_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts);
 int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
 			   __rte_unused uint16_t queue_id,
 			   struct rte_eth_burst_mode *mode);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -594,7 +623,9 @@ int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
 void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
 void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 int hns3_tx_check_vec_support(struct rte_eth_dev *dev);
+void hns3_rxq_vec_setup(struct hns3_rx_queue *rxq);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
diff --git a/drivers/net/hns3/hns3_rxtx_vec.c b/drivers/net/hns3/hns3_rxtx_vec.c
index 1154b6f..a26c83d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.c
+++ b/drivers/net/hns3/hns3_rxtx_vec.c
@@ -45,3 +45,170 @@ hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 	return nb_tx;
 }
+
+static inline void
+hns3_rxq_rearm_mbuf(struct hns3_rx_queue *rxq)
+{
+#define REARM_LOOP_STEP_NUM	4
+	struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
+	struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
+	uint64_t dma_addr;
+	int i;
+
+	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
+					  HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
+		return;
+	}
+
+	for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
+		rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
+		if (likely(i <
+			HNS3_DEFAULT_RXQ_REARM_THRESH - REARM_LOOP_STEP_NUM)) {
+			rte_prefetch_non_temporal(rxep[4].mbuf);
+			rte_prefetch_non_temporal(rxep[5].mbuf);
+			rte_prefetch_non_temporal(rxep[6].mbuf);
+			rte_prefetch_non_temporal(rxep[7].mbuf);
+		}
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[0].mbuf);
+		rxdp[0].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[0].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[1].mbuf);
+		rxdp[1].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[1].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[2].mbuf);
+		rxdp[2].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[2].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[3].mbuf);
+		rxdp[3].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[3].rx.bd_base_info = 0;
+	}
+
+	rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
+	if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
+		rxq->rx_rearm_start = 0;
+
+	rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
+
+	hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
+}
+
+uint16_t
+hns3_recv_pkts_vec(void *__restrict rx_queue,
+		   struct rte_mbuf **__restrict rx_pkts,
+		   uint16_t nb_pkts)
+{
+	struct hns3_rx_queue *rxq = rx_queue;
+	struct hns3_desc *rxdp = &rxq->rx_ring[rxq->next_to_use];
+	uint64_t bd_err_mask;  /* bit mask indicating which pkts are in error */
+	uint16_t nb_rx;
+
+	nb_pkts = RTE_MIN(nb_pkts, HNS3_DEFAULT_RX_BURST);
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, HNS3_DEFAULT_DESCS_PER_LOOP);
+
+	rte_prefetch_non_temporal(rxdp);
+
+	if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
+		hns3_rxq_rearm_mbuf(rxq);
+
+	if (unlikely(!(rxdp->rx.bd_base_info &
+			rte_cpu_to_le_32(1u << HNS3_RXD_VLD_B))))
+		return 0;
+
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 0].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 1].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 2].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 3].mbuf);
+
+	bd_err_mask = 0;
+	nb_rx = hns3_recv_burst_vec(rxq, rx_pkts, nb_pkts, &bd_err_mask);
+	if (unlikely(bd_err_mask))
+		nb_rx = hns3_rx_reassemble_pkts(rx_pkts, nb_rx, bd_err_mask);
+
+	return nb_rx;
+}
+
+static void
+hns3_rxq_vec_setup_rearm_data(struct hns3_rx_queue *rxq)
+{
+	uintptr_t p;
+	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+	mb_def.nb_segs = 1;
+	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+	mb_def.port = rxq->port_id;
+	rte_mbuf_refcnt_set(&mb_def, 1);
+
+	/* prevent compiler reordering: rearm_data covers previous fields */
+	rte_compiler_barrier();
+	p = (uintptr_t)&mb_def.rearm_data;
+	rxq->mbuf_initializer = *(uint64_t *)p;
+}
+
+void
+hns3_rxq_vec_setup(struct hns3_rx_queue *rxq)
+{
+	struct hns3_entry *sw_ring = &rxq->sw_ring[rxq->nb_rx_desc];
+	unsigned int i;
+
+	memset(&rxq->rx_ring[rxq->nb_rx_desc], 0,
+		sizeof(struct hns3_desc) * HNS3_DEFAULT_RX_BURST);
+
+	memset(&rxq->fake_mbuf, 0, sizeof(rxq->fake_mbuf));
+	for (i = 0; i < HNS3_DEFAULT_RX_BURST; i++)
+		sw_ring[i].mbuf = &rxq->fake_mbuf;
+
+	hns3_rxq_vec_setup_rearm_data(rxq);
+
+	memset(rxq->offset_table, 0, sizeof(rxq->offset_table));
+}
+
+#ifndef RTE_LIBRTE_IEEE1588
+static int
+hns3_rxq_vec_check(struct hns3_rx_queue *rxq, void *arg)
+{
+	uint32_t min_vec_bds = HNS3_DEFAULT_RXQ_REARM_THRESH +
+				HNS3_DEFAULT_RX_BURST;
+
+	if (rxq->nb_rx_desc < min_vec_bds)
+		return -ENOTSUP;
+
+	if (rxq->nb_rx_desc % HNS3_DEFAULT_RXQ_REARM_THRESH)
+		return -ENOTSUP;
+
+	RTE_SET_USED(arg);
+	return 0;
+}
+#endif
+
+int
+hns3_rx_check_vec_support(struct rte_eth_dev *dev)
+{
+#ifndef RTE_LIBRTE_IEEE1588
+	struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
+	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+	uint64_t offloads_mask = DEV_RX_OFFLOAD_TCP_LRO |
+				 DEV_RX_OFFLOAD_VLAN;
+
+	if (dev->data->scattered_rx)
+		return -ENOTSUP;
+
+	if (fconf->mode != RTE_FDIR_MODE_NONE)
+		return -ENOTSUP;
+
+	if (rxmode->offloads & offloads_mask)
+		return -ENOTSUP;
+
+	if (hns3_rxq_iterate(dev, hns3_rxq_vec_check, NULL) != 0)
+		return -ENOTSUP;
+
+	return 0;
+#else
+	RTE_SET_USED(dev);
+	return -ENOTSUP;
+#endif
+}
diff --git a/drivers/net/hns3/hns3_rxtx_vec.h b/drivers/net/hns3/hns3_rxtx_vec.h
index 90679bf..c6df36d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.h
+++ b/drivers/net/hns3/hns3_rxtx_vec.h
@@ -54,4 +54,24 @@ hns3_tx_free_buffers(struct hns3_tx_queue *txq)
 	if (txq->next_to_clean >= txq->nb_tx_desc)
 		txq->next_to_clean = 0;
 }
+
+static inline uint16_t
+hns3_rx_reassemble_pkts(struct rte_mbuf **rx_pkts,
+			uint16_t nb_pkts,
+			uint64_t pkt_err_mask)
+{
+	uint16_t count, i;
+	uint64_t mask;
+
+	count = 0;
+	for (i = 0; i < nb_pkts; i++) {
+		mask = ((uint64_t)1u) << i;
+		if (pkt_err_mask & mask)
+			rte_pktmbuf_free_seg(rx_pkts[i]);
+		else
+			rx_pkts[count++] = rx_pkts[i];
+	}
+
+	return count;
+}
 #endif /* _HNS3_RXTX_VEC_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
index e878ee1..8d7721b 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -82,4 +82,207 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
 
 	return nb_tx;
 }
+
+static inline uint32_t
+hns3_desc_parse_field(struct hns3_rx_queue *rxq,
+		      struct hns3_entry *sw_ring,
+		      struct hns3_desc *rxdp,
+		      uint32_t   bd_vld_num)
+{
+	uint32_t l234_info, ol_info, bd_base_info;
+	struct rte_mbuf *pkt;
+	uint32_t retcode = 0;
+	uint32_t cksum_err;
+	int ret, i;
+
+	for (i = 0; i < (int)bd_vld_num; i++) {
+		pkt = sw_ring[i].mbuf;
+
+		/* init rte_mbuf.rearm_data last 64-bit */
+		pkt->ol_flags = PKT_RX_RSS_HASH;
+
+		l234_info = rxdp[i].rx.l234_info;
+		ol_info = rxdp[i].rx.ol_info;
+		bd_base_info = rxdp[i].rx.bd_base_info;
+		ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
+					 l234_info, &cksum_err);
+		if (unlikely(ret)) {
+			retcode |= 1u << i;
+			continue;
+		}
+
+		pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+		if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+			hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
+					       cksum_err);
+	}
+
+	return retcode;
+}
+
+static inline uint16_t
+hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
+		    struct rte_mbuf **__restrict rx_pkts,
+		    uint16_t nb_pkts,
+		    uint64_t *bd_err_mask)
+{
+	uint16_t rx_id = rxq->next_to_use;
+	struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
+	struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
+	uint32_t bd_valid_num, parse_retcode;
+	uint16_t nb_rx = 0;
+	int pos, offset;
+
+	/* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
+	uint8x16_t shuf_desc_fields_msk = {
+		0xff, 0xff, 0xff, 0xff,  /* packet type init zero */
+		22, 23, 0xff, 0xff,      /* rx.pkt_len to rte_mbuf.pkt_len */
+		20, 21,	                 /* size to rte_mbuf.data_len */
+		0xff, 0xff,	         /* rte_mbuf.vlan_tci init zero */
+		8, 9, 10, 11,	         /* rx.rss_hash to rte_mbuf.hash.rss */
+	};
+
+	uint16x8_t crc_adjust = {
+		0, 0,         /* ignore pkt_type field */
+		rxq->crc_len, /* sub crc on pkt_len */
+		0,            /* ignore high-16bits of pkt_len */
+		rxq->crc_len, /* sub crc on data_len */
+		0, 0, 0,      /* ignore non-length fields */
+	};
+
+	for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
+				     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
+		uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
+		uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
+		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		uint64x2_t mbp1, mbp2;
+		uint16x4_t bd_vld = {0};
+		uint16x8_t tmp;
+		uint64_t stat;
+
+		/* calc how many bd valid */
+		bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
+		bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
+		bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
+		bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);
+
+		/* load 2 mbuf pointer */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+		bd_vld = vshl_n_u16(bd_vld,
+				    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
+		bd_vld = vreinterpret_u16_s16(
+				vshr_n_s16(vreinterpret_s16_u16(bd_vld),
+					   HNS3_UINT16_BIT - 1));
+		stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);
+
+		/* load 2 mbuf pointer again */
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		if (likely(stat == 0))
+			bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
+		else
+			bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
+		if (bd_valid_num == 0)
+			break;
+
+		/* use offset to control below data load oper ordering */
+		offset = rxq->offset_table[bd_valid_num];
+
+		/* store 2 mbuf pointer into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+		/* read first two descs */
+		descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
+		descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));
+
+		/* store 2 mbuf pointer into rx_pkts again */
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		/* read remains two descs */
+		descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
+		descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));
+
+		pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
+		pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
+		pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
+		pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);
+
+		/* pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
+		pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);
+
+		/* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
+		*(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+		*(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+
+		/* pkt 1,2 remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+
+		pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
+		pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
+		pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
+		pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);
+
+		/* pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
+		pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);
+
+		/* pkt 1,2 save to rx_pkts mbuf */
+		vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
+			 pkt_mb1);
+		vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
+			 pkt_mb2);
+
+		/* pkt 3,4 remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+		pkt_mb4 = vreinterpretq_u8_u16(tmp);
+
+		/* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
+		*(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+		*(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+
+		/* pkt 3,4 save to rx_pkts mbuf */
+		vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
+			 pkt_mb3);
+		vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
+			 pkt_mb4);
+
+		rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);
+
+		parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
+			&rxdp[offset], bd_valid_num);
+		if (unlikely(parse_retcode))
+			(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
+
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);
+
+		nb_rx += bd_valid_num;
+		if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
+			break;
+	}
+
+	rxq->rx_rearm_nb += nb_rx;
+	rxq->next_to_use += nb_rx;
+	if (rxq->next_to_use >= rxq->nb_rx_desc)
+		rxq->next_to_use = 0;
+
+	return nb_rx;
+}
 #endif /* _HNS3_RXTX_VEC_NEON_H_ */
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 7/8] net/hns3: add restriction on setting VF MTU
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                     ` (5 preceding siblings ...)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 6/8] net/hns3: add vector Rx " Wei Hu (Xavier)
@ 2020-09-09  9:23   ` Wei Hu (Xavier)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 8/8] net/hns3: fix segfault when Tx multiple buffer packets Wei Hu (Xavier)
  2020-09-21 12:58   ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Ferruh Yigit
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>

When Rx of scattered packets is off, the hns3 PMD driver may use the vector
Rx process function or the simple Rx function. If the MTU is increased so
that the maximum length of received packets exceeds the length of a single
Rx buffer, the hardware network engine needs multiple BDs and buffers to
store such packets, which breaks the vector and simple Rx functions. So,
when Rx of scattered packets is off and the device is started, it is not
permitted to increase the MTU such that the maximum Rx packet length
exceeds the Rx buffer length.
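
To make the check concrete (a sketch; the 26-byte Ethernet overhead
constant below, covering the header, CRC and two VLAN tags, and the
2048-byte rx_buf_len are illustrative assumptions): the restriction simply
refuses an MTU whose resulting frame no longer fits into a single Rx buffer.

    #include <stdint.h>
    #include <stdio.h>

    #define ETH_OVERHEAD  26U  /* 14 hdr + 4 CRC + 2 x 4 VLAN (assumed) */

    static int mtu_fits_one_rx_buffer(uint16_t mtu, uint16_t rx_buf_len)
    {
        uint32_t frame_size = (uint32_t)mtu + ETH_OVERHEAD;

        return frame_size <= rx_buf_len;
    }

    int main(void)
    {
        /* A 1500-byte MTU fits a 2048-byte buffer; 9000 does not. */
        printf("%d %d\n", mtu_fits_one_rx_buffer(1500, 2048),
               mtu_fits_one_rx_buffer(9000, 2048));
        return 0;
    }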

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/net/hns3/hns3_ethdev_vf.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index 93f2c93..44e51b5 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -871,6 +871,25 @@ hns3vf_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
 		return -EIO;
 	}
 
+	/*
+	 * When Rx of scattered packets is off, we have some possibility of
+	 * using vector Rx process function or simple Rx functions in hns3 PMD
+	 * driver. If the input MTU is increased and the maximum length of
+	 * received packets is greater than the length of a buffer for Rx
+	 * packet, the hardware network engine needs to use multiple BDs and
+	 * buffers to store these packets. This will cause problems when still
+	 * using vector Rx process function or simple Rx function to receive
+	 * packets. So, when Rx of scattered packets is off and device is
+	 * started, it is not permitted to increase MTU so that the maximum
+	 * length of Rx packets is greater than Rx buffer length.
+	 */
+	if (dev->data->dev_started && !dev->data->scattered_rx &&
+	    frame_size > hw->rx_buf_len) {
+		hns3_err(hw, "failed to set mtu because current is "
+			"not scattered rx mode");
+		return -EOPNOTSUPP;
+	}
+
 	rte_spinlock_lock(&hw->lock);
 	ret = hns3vf_config_mtu(hw, mtu);
 	if (ret) {
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [dpdk-dev] [PATCH v2 8/8] net/hns3: fix segfault when Tx multiple buffer packets
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                     ` (6 preceding siblings ...)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 7/8] net/hns3: add restriction on setting VF MTU Wei Hu (Xavier)
@ 2020-09-09  9:23   ` Wei Hu (Xavier)
  2020-09-21 12:58   ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Ferruh Yigit
  8 siblings, 0 replies; 19+ messages in thread
From: Wei Hu (Xavier) @ 2020-09-09  9:23 UTC (permalink / raw)
  To: dev; +Cc: xavier.huwei

From: Chengchang Tang <tangchengchang@huawei.com>

Currently, segmentation faults may occur when sending packets whose
payloads are stored in multiple buffers on the hns3 network engine. The
related core dump information is as follows:

Program terminated with signal 11, Segmentation fault.
0  hns3_reassemble_tx_pkts
2512                            temp = temp->next;
Missing separate debuginfos, use:
(gdb) bt
0  hns3_reassemble_tx_pkts
1  0x0000000000969c60 in hns3_check_non_tso_pkt
2  0x000000000096adbc in hns3_xmit_pkts
3  0x000000000050d4d0 in rte_eth_tx_burst
4  0x000000000050fca4 in pkt_burst_transmit
5  0x00000000004ca6b8 in run_pkt_fwd_on_lcore
6  0x00000000004ca7fc in start_pkt_forward_on_core
7  0x00000000006975a4 in eal_thread_loop
8  0x0000ffffa6f7fc48 in start_thread
9  0x0000ffffa6ed1600 in thread_start

The root cause is that the hns3 PMD driver invokes the rte_pktmbuf_free_seg
API function to release the same rte_mbuf multiple times. The rte_mbuf
pointer is not set to NULL in the internal function
hns3_rx_queue_release_mbufs, which is invoked during queue setup, stop and
close. As a result, the rte_mbufs in Rx queues are released repeatedly when
the user application sets up queues or stops/starts the device multiple
times. Probably for performance reasons, the DPDK mempool library does not
check for repeated rte_mbuf releases: the address of a released rte_mbuf is
stored directly into the per-lcore cache of the mempool, so subsequent
rte_mempool_get_bulk API calls can return the same rte_mbuf more than once.
Ultimately, this leads to a NULL pointer access in the PMD driver.

This patch fixes the problem by setting the released mbuf pointer to NULL
in the internal function hns3_rx_queue_release_mbufs. The other internal
function, hns3_reassemble_tx_pkts, is also optimized to avoid a similar
problem.
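
As a minimal sketch of the pattern applied in the diff below (the helper
name is hypothetical, introduced only for illustration), freeing a segment
and clearing its software-ring slot prevents a second release of the same
mbuf on a later stop/start cycle:

/* needs <rte_mbuf.h> */
static void
release_rx_slot(struct rte_mbuf **slot)
{
	if (*slot != NULL) {
		rte_pktmbuf_free_seg(*slot);
		/* Clearing the slot makes a repeated release pass a no-op. */
		*slot = NULL;
	}
}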

Fixes: bba636698316 ("net/hns3: support Rx/Tx and related operations")
Cc: stable@dpdk.org

Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 drivers/net/hns3/hns3_rxtx.c | 61 +++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 38 deletions(-)

diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index ada02de..68d7a6a 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -43,15 +43,19 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
 
 	if (rxq->rx_rearm_nb == 0) {
 		for (i = 0; i < rxq->nb_rx_desc; i++) {
-			if (rxq->sw_ring[i].mbuf != NULL)
+			if (rxq->sw_ring[i].mbuf != NULL) {
 				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+				rxq->sw_ring[i].mbuf = NULL;
+			}
 		}
 	} else {
 		for (i = rxq->next_to_use;
 		     i != rxq->rx_rearm_start;
 		     i = (i + 1) % rxq->nb_rx_desc) {
-			if (rxq->sw_ring[i].mbuf != NULL)
+			if (rxq->sw_ring[i].mbuf != NULL) {
 				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+				rxq->sw_ring[i].mbuf = NULL;
+			}
 		}
 	}
 
@@ -2368,37 +2372,24 @@ hns3_fill_first_desc(struct hns3_tx_queue *txq, struct hns3_desc *desc,
 	}
 }
 
-static int
-hns3_tx_alloc_mbufs(struct hns3_tx_queue *txq, struct rte_mempool *mb_pool,
-		    uint16_t nb_new_buf, struct rte_mbuf **alloc_mbuf)
+static inline int
+hns3_tx_alloc_mbufs(struct rte_mempool *mb_pool, uint16_t nb_new_buf,
+			struct rte_mbuf **alloc_mbuf)
 {
-	struct rte_mbuf *new_mbuf = NULL;
-	struct rte_eth_dev *dev;
-	struct rte_mbuf *temp;
-	struct hns3_hw *hw;
+#define MAX_NON_TSO_BD_PER_PKT 18
+	struct rte_mbuf *pkt_segs[MAX_NON_TSO_BD_PER_PKT];
 	uint16_t i;
 
 	/* Allocate enough mbufs */
-	for (i = 0; i < nb_new_buf; i++) {
-		temp = rte_pktmbuf_alloc(mb_pool);
-		if (unlikely(temp == NULL)) {
-			dev = &rte_eth_devices[txq->port_id];
-			hw = HNS3_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-			hns3_err(hw, "Failed to alloc TX mbuf port_id=%d,"
-				     "queue_id=%d in reassemble tx pkts.",
-				     txq->port_id, txq->queue_id);
-			rte_pktmbuf_free(new_mbuf);
-			return -ENOMEM;
-		}
-		temp->next = new_mbuf;
-		new_mbuf = temp;
-	}
-
-	if (new_mbuf == NULL)
+	if (rte_mempool_get_bulk(mb_pool, (void **)pkt_segs, nb_new_buf))
 		return -ENOMEM;
 
-	new_mbuf->nb_segs = nb_new_buf;
-	*alloc_mbuf = new_mbuf;
+	for (i = 0; i < nb_new_buf - 1; i++)
+		pkt_segs[i]->next = pkt_segs[i + 1];
+
+	pkt_segs[nb_new_buf - 1]->next = NULL;
+	pkt_segs[0]->nb_segs = nb_new_buf;
+	*alloc_mbuf = pkt_segs[0];
 
 	return 0;
 }
@@ -2418,10 +2409,8 @@ hns3_pktmbuf_copy_hdr(struct rte_mbuf *new_pkt, struct rte_mbuf *old_pkt)
 }
 
 static int
-hns3_reassemble_tx_pkts(void *tx_queue, struct rte_mbuf *tx_pkt,
-			struct rte_mbuf **new_pkt)
+hns3_reassemble_tx_pkts(struct rte_mbuf *tx_pkt, struct rte_mbuf **new_pkt)
 {
-	struct hns3_tx_queue *txq = tx_queue;
 	struct rte_mempool *mb_pool;
 	struct rte_mbuf *new_mbuf;
 	struct rte_mbuf *temp_new;
@@ -2433,7 +2422,6 @@ hns3_reassemble_tx_pkts(void *tx_queue, struct rte_mbuf *tx_pkt,
 	uint16_t len_s;
 	uint16_t len_d;
 	uint16_t len;
-	uint16_t i;
 	int ret;
 	char *s;
 	char *d;
@@ -2449,7 +2437,7 @@ hns3_reassemble_tx_pkts(void *tx_queue, struct rte_mbuf *tx_pkt,
 		last_buf_len = buf_size;
 
 	/* Allocate enough mbufs */
-	ret = hns3_tx_alloc_mbufs(txq, mb_pool, nb_new_buf, &new_mbuf);
+	ret = hns3_tx_alloc_mbufs(mb_pool, nb_new_buf, &new_mbuf);
 	if (ret)
 		return ret;
 
@@ -2458,12 +2446,9 @@ hns3_reassemble_tx_pkts(void *tx_queue, struct rte_mbuf *tx_pkt,
 	s = rte_pktmbuf_mtod(temp, char *);
 	len_s = rte_pktmbuf_data_len(temp);
 	temp_new = new_mbuf;
-	for (i = 0; i < nb_new_buf; i++) {
+	while (temp != NULL && temp_new != NULL) {
 		d = rte_pktmbuf_mtod(temp_new, char *);
-		if (i < nb_new_buf - 1)
-			buf_len = buf_size;
-		else
-			buf_len = last_buf_len;
+		buf_len = temp_new->next == NULL ? last_buf_len : buf_size;
 		len_d = buf_len;
 
 		while (len_d) {
@@ -2924,7 +2909,7 @@ hns3_check_non_tso_pkt(uint16_t nb_buf, struct rte_mbuf **m_seg,
 
 	if (unlikely(nb_buf > HNS3_MAX_NON_TSO_BD_PER_PKT)) {
 		txq->exceed_limit_bd_pkt_cnt++;
-		ret = hns3_reassemble_tx_pkts(txq, tx_pkt, &new_pkt);
+		ret = hns3_reassemble_tx_pkts(tx_pkt, &new_pkt);
 		if (ret) {
 			txq->exceed_limit_bd_reassem_fail++;
 			return ret;
-- 
2.9.5


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx
  2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
                     ` (7 preceding siblings ...)
  2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 8/8] net/hns3: fix segfault when Tx multiple buffer packets Wei Hu (Xavier)
@ 2020-09-21 12:58   ` Ferruh Yigit
  8 siblings, 0 replies; 19+ messages in thread
From: Ferruh Yigit @ 2020-09-21 12:58 UTC (permalink / raw)
  To: Wei Hu (Xavier), dev; +Cc: xavier.huwei

On 9/9/2020 10:23 AM, Wei Hu (Xavier) wrote:
> This series are updates for Rx/Tx process.
> 
> Chengchang Tang (1):
>    net/hns3: fix segfault when Tx multiple buffer packets
> 
> Wei Hu (Xavier) (7):
>    net/hns3: report Rx free threshold
>    net/hns3: reduce address calculation in Rx
>    net/hns3: add simple Rx process function
>    net/hns3: add simple Tx process function
>    net/hns3: add vector Tx burst with NEON instructions
>    net/hns3: add vector Rx burst with NEON instructions
>    net/hns3: add restriction on setting VF MTU
> 

Series applied to dpdk-next-net/main, thanks.

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2020-09-21 12:58 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-07  9:08 [dpdk-dev] [PATCH 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
2020-09-07  9:08 ` [dpdk-dev] [PATCH 1/8] net/hns3: report Rx free threshold Wei Hu (Xavier)
2020-09-07  9:08 ` [dpdk-dev] [PATCH 2/8] net/hns3: reduce address calculation in Rx Wei Hu (Xavier)
2020-09-07  9:08 ` [dpdk-dev] [PATCH 3/8] net/hns3: add simple Rx process function Wei Hu (Xavier)
2020-09-07  9:08 ` [dpdk-dev] [PATCH 4/8] net/hns3: add simple Tx " Wei Hu (Xavier)
2020-09-07  9:08 ` [dpdk-dev] [PATCH 5/8] net/hns3: add vector Tx burst with NEON instructions Wei Hu (Xavier)
2020-09-07  9:08 ` [dpdk-dev] [PATCH 6/8] net/hns3: add vector Rx " Wei Hu (Xavier)
2020-09-07  9:08 ` [dpdk-dev] [PATCH 7/8] net/hns3: add restriction on setting VF MTU Wei Hu (Xavier)
2020-09-07  9:08 ` [dpdk-dev] [PATCH 8/8] net/hns3: fix segfault when Tx multiple buffer packets Wei Hu (Xavier)
2020-09-09  9:23 ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Wei Hu (Xavier)
2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 1/8] net/hns3: report Rx free threshold Wei Hu (Xavier)
2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 2/8] net/hns3: reduce address calculation in Rx Wei Hu (Xavier)
2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 3/8] net/hns3: add simple Rx process function Wei Hu (Xavier)
2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 4/8] net/hns3: add simple Tx " Wei Hu (Xavier)
2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 5/8] net/hns3: add vector Tx burst with NEON instructions Wei Hu (Xavier)
2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 6/8] net/hns3: add vector Rx " Wei Hu (Xavier)
2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 7/8] net/hns3: add restriction on setting VF MTU Wei Hu (Xavier)
2020-09-09  9:23   ` [dpdk-dev] [PATCH v2 8/8] net/hns3: fix segfault when Tx multiple buffer packets Wei Hu (Xavier)
2020-09-21 12:58   ` [dpdk-dev] [PATCH v2 0/8] net/hns3: updates for Rx Tx Ferruh Yigit
