DPDK patches and discussions
* [PATCH] net/ice: CVL support double vlan
@ 2023-04-20  6:16 Mingjin Ye
  2023-05-06 10:04 ` [PATCH v2] net/ice: " Mingjin Ye
  2023-07-17  9:36 ` [POC] net/iavf: support no data path polling mode Mingjin Ye
  0 siblings, 2 replies; 16+ messages in thread
From: Mingjin Ye @ 2023-04-20  6:16 UTC
  To: dev; +Cc: yidingx.zhou, Mingjin Ye, Qiming Yang, Qi Zhang

Aligned with the kernel driver: optimized inner and outer VLAN handling
in DPDK and implemented double VLAN insertion and stripping support.

1. Adjust VLAN stripping
Remove the DVM check; VLAN stripping now operates only on the inner VLAN.

2. Support QinQ stripping
This patch supports turning the ice outer VLAN strip on and off in QinQ
mode via the mask bit of RTE_ETH_RX_OFFLOAD_QINQ_STRIP. Users can run
"vlan set qinq_strip on 0" to enable or "vlan set qinq_strip off 0" to
disable the ice outer VLAN strip when trying it with the testpmd app.
Note: Due to a hardware limitation, QinQ stripping of RX packets that
carry two tags with the same EtherType (for example, two VLANs with
EtherType = `ETH_P_8021Q`) is not supported.
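
For illustration, a minimal application-side sketch of toggling this
offload (not part of this patch; the helper name is an example and the
port is assumed to be an ice port in DVM):

    #include <stdint.h>
    #include <rte_ethdev.h>

    /* enable outer VLAN (QinQ) stripping on top of the port's
     * current VLAN offload settings */
    static int enable_qinq_strip(uint16_t port_id)
    {
        int cur = rte_eth_dev_get_vlan_offload(port_id);

        if (cur < 0)
            return cur;
        return rte_eth_dev_set_vlan_offload(port_id,
                cur | RTE_ETH_QINQ_STRIP_OFFLOAD);
    }

This is the programmatic equivalent of the "vlan set qinq_strip on 0"
testpmd command above.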

3. Support outer tag type switching
Implement the ethdev `vlan_tpid_set` API to support processing of the
`ETH_P_8021Q`, `ETH_P_8021AD` and `ETH_P_QINQ1` outer tag types. A
combined API sketch for items 3 and 4 is shown after item 4.

4. Support outer port insertion
If DVM is enabled, outer port-based VLAN insertion is supported. Users
can run "tx_vlan set pvid 0 45 on" to enable or "tx_vlan set pvid 0 45
off" to disable the ice outer VLAN insertion when trying it with the
testpmd app.
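
As a sketch of items 3 and 4 from an application (not part of this
patch; the helper name, port 0 and VLAN ID 45 are example values, and
the device is assumed to be in DVM):

    #include <stdint.h>
    #include <rte_ethdev.h>
    #include <rte_ether.h>

    /* select 0x88a8 as the outer tag type, then enable port-based
     * outer VLAN insertion of VLAN ID 45 */
    static int setup_outer_pvid(uint16_t port_id)
    {
        int ret;

        ret = rte_eth_dev_set_vlan_ether_type(port_id,
                RTE_ETH_VLAN_TYPE_OUTER, RTE_ETHER_TYPE_QINQ);
        if (ret != 0)
            return ret;
        return rte_eth_dev_set_vlan_pvid(port_id, 45, 1);
    }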

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
 drivers/net/ice/ice_ethdev.c | 427 +++++++++++++++++++++++++++++++++--
 drivers/net/ice/ice_ethdev.h |   1 +
 2 files changed, 414 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c
index 9a88cf9796..e4e22044ab 100644
--- a/drivers/net/ice/ice_ethdev.c
+++ b/drivers/net/ice/ice_ethdev.c
@@ -56,6 +56,24 @@ static const char * const ice_valid_args[] = {
 
 #define PPS_OUT_DELAY_NS  1
 
+/* Maximum number of VSI */
+#define ICE_MAX_NUM_VSIS          (768UL)
+
+/* Bit offset 119 of the LAN Rx queue context is the L2TSEL control bit. */
+#define ICE_L2TSEL_QRX_CONTEXT_REG_IDX	3
+#define ICE_L2TSEL_BIT_OFFSET		   23
+enum ice_l2tsel {
+	ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND,
+	ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1,
+};
+
+/* 802.1Q VLAN Extended Header */
+#define ETH_P_8021Q		0x8100
+/* 802.1ad Service VLAN */
+#define ETH_P_8021AD	0x88A8
+/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_QINQ1		0x9100
+
 struct proto_xtr_ol_flag {
 	const struct rte_mbuf_dynflag param;
 	bool required;
@@ -130,6 +148,9 @@ static int ice_fw_version_get(struct rte_eth_dev *dev, char *fw_version,
 			      size_t fw_size);
 static int ice_vlan_pvid_set(struct rte_eth_dev *dev,
 			     uint16_t pvid, int on);
+static int ice_vlan_tpid_set(struct rte_eth_dev *dev,
+		   enum rte_vlan_type vlan_type,
+		   uint16_t tpid);
 static int ice_get_eeprom_length(struct rte_eth_dev *dev);
 static int ice_get_eeprom(struct rte_eth_dev *dev,
 			  struct rte_dev_eeprom_info *eeprom);
@@ -252,6 +273,7 @@ static const struct eth_dev_ops ice_eth_dev_ops = {
 	.rx_queue_intr_disable        = ice_rx_queue_intr_disable,
 	.fw_version_get               = ice_fw_version_get,
 	.vlan_pvid_set                = ice_vlan_pvid_set,
+	.vlan_tpid_set                = ice_vlan_tpid_set,
 	.rxq_info_get                 = ice_rxq_info_get,
 	.txq_info_get                 = ice_txq_info_get,
 	.rx_burst_mode_get            = ice_rx_burst_mode_get,
@@ -1588,6 +1610,9 @@ ice_setup_vsi(struct ice_pf *pf, enum ice_vsi_type type)
 			hw->func_caps.common_cap.rss_table_size;
 	pf->flags |= ICE_FLAG_RSS_AQ_CAPABLE;
 
+	/* Defines the type of outer tag expected */
+	pf->outer_ethertype = ETH_P_8021Q;
+
 	memset(&vsi_ctx, 0, sizeof(vsi_ctx));
 	switch (type) {
 	case ICE_VSI_PF:
@@ -1615,6 +1640,9 @@ ice_setup_vsi(struct ice_pf *pf, enum ice_vsi_type type)
 				(ICE_AQ_VSI_OUTER_TAG_VLAN_8100 <<
 				 ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
 				ICE_AQ_VSI_OUTER_TAG_TYPE_M;
+			vsi_ctx.info.outer_vlan_flags |=
+				(ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING <<
+				ICE_AQ_VSI_OUTER_VLAN_EMODE_S);
 		}
 
 		/* FDIR */
@@ -4431,11 +4459,87 @@ ice_vsi_dis_inner_stripping(struct ice_vsi *vsi)
 	return ice_vsi_manage_vlan_stripping(vsi, false);
 }
 
-static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
+/**
+ * tpid_to_vsi_outer_vlan_type - convert from TPID to VSI context based tag_type
+ * @tpid: tpid used to translate into VSI context based tag_type
+ * @tag_type: output variable to hold the VSI context based tag type
+ */
+static int tpid_to_vsi_outer_vlan_type(u16 tpid, u8 *tag_type)
+{
+	switch (tpid) {
+	case ETH_P_8021Q:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_8100;
+		break;
+	case ETH_P_8021AD:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_STAG;
+		break;
+	case ETH_P_QINQ1:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_9100;
+		break;
+	default:
+		*tag_type = 0;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_is_supported_port_vlan_proto - make sure the vlan_proto is supported
+ * @hw: hardware structure used to check the VLAN mode
+ * @vlan_proto: VLAN TPID being checked
+ *
+ * If the device is configured in Double VLAN Mode (DVM), it supports three
+ * types: ETH_P_8021Q, ETH_P_QINQ1 and ETH_P_8021AD. If the device is
+ * configured in Single VLAN Mode (SVM), then only ETH_P_8021Q is supported.
+ */
+static bool
+ice_is_supported_port_vlan_proto(struct ice_hw *hw, u16 vlan_proto)
+{
+	bool is_supported = false;
+
+	switch (vlan_proto) {
+	case ETH_P_8021Q:
+		is_supported = true;
+		break;
+	case ETH_P_8021AD:
+		if (ice_is_dvm_ena(hw))
+			is_supported = true;
+		break;
+	case ETH_P_QINQ1:
+		if (ice_is_dvm_ena(hw))
+			is_supported = true;
+		break;
+	}
+
+	return is_supported;
+}
+
+/**
+ * ice_vsi_ena_outer_stripping - enable outer VLAN stripping
+ * @vsi: VSI to configure
+ * @tpid: TPID to enable outer VLAN stripping for
+ *
+ * Enable outer VLAN stripping via VSI context. This function should only be
+ * used if DVM is supported. Also, this function should never be called directly
+ * as it should be part of ice_vsi_vlan_ops if it's needed.
+ *
+ * Since the VSI context only supports a single TPID for insertion and
+ * stripping, setting the TPID for stripping will affect the TPID for insertion.
+ * Callers need to be aware of this limitation.
+ *
+ * Only modify outer VLAN stripping settings and the VLAN TPID. Outer VLAN
+ * insertion settings are unmodified.
+ *
+ * This enables the hardware to strip a VLAN tag with the specified TPID
+ * from the packet and place it in the receive descriptor.
+ */
+static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid)
 {
 	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
 	struct ice_vsi_ctx ctxt;
 	enum ice_status status;
+	u8 tag_type;
 	int err = 0;
 
 	/* do not allow modifying VLAN stripping when a port VLAN is configured
@@ -4444,6 +4548,9 @@ static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
 	if (vsi->info.port_based_outer_vlan)
 		return 0;
 
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
 	memset(&ctxt, 0, sizeof(ctxt));
 
 	ctxt.info.valid_sections =
@@ -4454,8 +4561,8 @@ static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
 	ctxt.info.outer_vlan_flags |=
 		(ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH <<
 		 ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
-		(ICE_AQ_VSI_OUTER_TAG_VLAN_8100 <<
-		 ICE_AQ_VSI_OUTER_TAG_TYPE_S);
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M);
 
 	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
 	if (status) {
@@ -4503,22 +4610,104 @@ ice_vsi_dis_outer_stripping(struct ice_vsi *vsi)
 static int
 ice_vsi_config_vlan_stripping(struct ice_vsi *vsi, bool ena)
 {
-	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
 	int ret;
 
-	if (ice_is_dvm_ena(hw)) {
-		if (ena)
-			ret = ice_vsi_ena_outer_stripping(vsi);
-		else
-			ret = ice_vsi_dis_outer_stripping(vsi);
+	if (ena)
+		ret = ice_vsi_ena_inner_stripping(vsi);
+	else
+		ret = ice_vsi_dis_inner_stripping(vsi);
+
+	return ret;
+}
+
+/**
+ * ice_vsi_update_l2tsel - update l2tsel field for all Rx rings on this VSI
+ * @vsi: VSI used to update l2tsel on
+ * @l2tsel: l2tsel setting requested
+ *
+ * Use the l2tsel setting to update all of the Rx queue context bits for l2tsel.
+ * This will modify which descriptor field the first offloaded VLAN will be
+ * stripped into.
+ */
+static void ice_vsi_update_l2tsel(struct ice_vsi *vsi, enum ice_l2tsel l2tsel)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_pf *pf = ICE_VSI_TO_PF(vsi);
+	struct rte_eth_dev_data *dev_data = pf->dev_data;
+	u32 l2tsel_bit;
+	uint16_t i;
+
+	if (l2tsel == ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND)
+		l2tsel_bit = 0;
+	else
+		l2tsel_bit = BIT(ICE_L2TSEL_BIT_OFFSET);
+
+	for (i = 0; i < dev_data->nb_rx_queues; i++) {
+		u32 qrx_context_offset;
+		u32 regval;
+
+		qrx_context_offset =
+			QRX_CONTEXT(ICE_L2TSEL_QRX_CONTEXT_REG_IDX, i);
+
+		regval = rd32(hw, qrx_context_offset);
+		regval &= ~BIT(ICE_L2TSEL_BIT_OFFSET);
+		regval |= l2tsel_bit;
+		wr32(hw, qrx_context_offset, regval);
+	}
+}
+
+/* Configure outer vlan stripping on or off in QinQ mode */
+static int
+ice_vsi_config_outer_vlan_stripping(struct ice_vsi *vsi, bool on)
+{
+	uint16_t outer_ethertype = vsi->adapter->pf.outer_ethertype;
+	int err = 0;
+
+	if (vsi->vsi_id >= ICE_MAX_NUM_VSIS) {
+		PMD_DRV_LOG(ERR, "VSI ID exceeds the maximum");
+		return -EINVAL;
+	}
+
+	if (!ice_is_dvm_ena(&vsi->adapter->hw)) {
+		PMD_DRV_LOG(ERR, "Single VLAN mode (SVM) does not support qinq");
+		return -EOPNOTSUPP;
+	}
+
+	if (on) {
+		err = ice_vsi_ena_outer_stripping(vsi, outer_ethertype);
+		if (!err) {
+			enum ice_l2tsel l2tsel =
+				ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND;
+
+			/* PF tells the VF that the outer VLAN tag is always
+			 * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and
+			 * inner is always extracted to
+			 * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to
+			 * support outer stripping so the first tag always ends
+			 * up in L2TAG2_2ND and the second/inner tag, if
+			 * enabled, is extracted in L2TAG1.
+			 */
+			ice_vsi_update_l2tsel(vsi, l2tsel);
+		}
 	} else {
-		if (ena)
-			ret = ice_vsi_ena_inner_stripping(vsi);
-		else
-			ret = ice_vsi_dis_inner_stripping(vsi);
+		err = ice_vsi_dis_outer_stripping(vsi);
+		if (!err) {
+			enum ice_l2tsel l2tsel =
+				ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1;
+
+			/* PF tells the VF that the outer VLAN tag is always
+			 * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and
+			 * inner is always extracted to
+			 * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to
+			 * support inner stripping while outer stripping is
+			 * disabled so that the first and only tag is extracted
+			 * in L2TAG1.
+			 */
+			ice_vsi_update_l2tsel(vsi, l2tsel);
+		}
 	}
 
-	return ret;
+	return err;
 }
 
 static int
@@ -4543,6 +4732,14 @@ ice_vlan_offload_set(struct rte_eth_dev *dev, int mask)
 			ice_vsi_config_vlan_stripping(vsi, false);
 	}
 
+	if (mask & RTE_ETH_QINQ_STRIP_MASK) {
+		/* Enable or disable outer VLAN stripping */
+		if (rxmode->offloads & RTE_ETH_RX_OFFLOAD_QINQ_STRIP)
+			ice_vsi_config_outer_vlan_stripping(vsi, true);
+		else
+			ice_vsi_config_outer_vlan_stripping(vsi, false);
+	}
+
 	return 0;
 }
 
@@ -5019,6 +5216,130 @@ ice_vsi_vlan_pvid_set(struct ice_vsi *vsi, struct ice_vsi_vlan_pvid_info *info)
 	return ret;
 }
 
+/**
+ * ice_vsi_set_outer_port_vlan - set the outer port VLAN and related settings
+ * @vsi: VSI to configure
+ * @vlan_info: packed u16 that contains the VLAN prio and ID
+ * @tpid: TPID of the port VLAN
+ *
+ * Set the port VLAN prio, ID, and TPID.
+ *
+ * Enable VLAN pruning so the VSI doesn't receive any traffic that doesn't match
+ * a VLAN prune rule. The caller should take care to add a VLAN prune rule that
+ * matches the port VLAN ID and TPID.
+ *
+ * Tell hardware to strip outer VLAN tagged packets on receive and don't put
+ * them in the receive descriptor. VSI(s) in port VLANs should not be aware of
+ * the port VLAN ID or TPID they are assigned to.
+ *
+ * Tell hardware to prevent outer VLAN tag insertion on transmit and only allow
+ * untagged outer packets from the transmit descriptor.
+ *
+ * Also, tell the hardware to insert the port VLAN on transmit.
+ */
+static int
+ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, u16 vlan_info, u16 tpid)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	u8 tag_type;
+	int err = 0;
+
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+
+	ctxt.info = vsi->info;
+
+	ctxt.info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+
+	ctxt.info.port_based_outer_vlan = rte_cpu_to_le_16(vlan_info);
+	ctxt.info.outer_vlan_flags =
+		(ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW <<
+		 ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M) |
+		ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		(ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED <<
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) |
+		ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT;
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID |
+			    ICE_AQ_VSI_PROP_SW_VALID);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status != ICE_SUCCESS) {
+		PMD_DRV_LOG(ERR,
+		"update VSI for setting outer port based VLAN failed, err %d",
+		status);
+		err = -EINVAL;
+	} else {
+		vsi->info.port_based_outer_vlan = ctxt.info.port_based_outer_vlan;
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+		vsi->info.sw_flags2 = ctxt.info.sw_flags2;
+	}
+
+	return err;
+}
+
+/**
+ * ice_vsi_dis_outer_insertion - disable outer VLAN insertion
+ * @vsi: VSI to configure
+ * @info: vlan pvid info
+ *
+ * Disable outer VLAN insertion via VSI context. This function should only be
+ * used if DVM is supported.
+ *
+ * Only modify the outer VLAN insertion settings. The VLAN TPID and outer VLAN
+ * settings are unmodified.
+ *
+ * This tells the hardware to not allow VLAN tagged packets in the transmit
+ * descriptor. This enables software offloaded VLAN insertion and disables
+ * hardware offloaded VLAN insertion.
+ */
+static int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi, struct ice_vsi_vlan_pvid_info *info)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	uint8_t vlan_flags = 0;
+	int err = 0;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+	ctxt.info.port_based_inner_vlan = 0;
+	/* clear current outer VLAN insertion settings */
+	ctxt.info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+		~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT |
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M);
+	if (info->config.reject.tagged == 0)
+		vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTTAGGED;
+	if (info->config.reject.untagged == 0)
+		vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED;
+	ctxt.info.outer_vlan_flags |=
+		ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		((vlan_flags <<
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status) {
+		PMD_DRV_LOG(ERR,
+			    "update VSI for disabling outer VLAN insertion failed, err %d",
+			    status);
+		err = -EINVAL;
+	} else {
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+		vsi->info.port_based_inner_vlan = ctxt.info.port_based_inner_vlan;
+	}
+
+	return err;
+}
+
 static int
 ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 {
@@ -5039,6 +5360,13 @@ ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 			data->dev_conf.txmode.hw_vlan_reject_untagged;
 	}
 
+	if (ice_is_dvm_ena(&vsi->adapter->hw)) {
+		if (on)
+			return ice_vsi_set_outer_port_vlan(vsi, pvid, pf->outer_ethertype);
+		else
+			return ice_vsi_dis_outer_insertion(vsi, &info);
+	}
+
 	ret = ice_vsi_vlan_pvid_set(vsi, &info);
 	if (ret < 0) {
 		PMD_DRV_LOG(ERR, "Failed to set pvid.");
@@ -5048,6 +5376,77 @@ ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 	return 0;
 }
 
+static int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, uint16_t tpid)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	int err = 0;
+	u8 tag_type;
+	/* do not allow modifying VLAN stripping when a port VLAN is configured
+	 * on this VSI
+	 */
+	if (vsi->info.port_based_outer_vlan)
+		return 0;
+
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+	/* clear current outer VLAN insertion settings */
+	ctxt.info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+		~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT |
+		  ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M |
+		  ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+	ctxt.info.outer_vlan_flags |=
+		((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL <<
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M) |
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status) {
+		PMD_DRV_LOG(ERR, "Update VSI failed to enable outer VLAN stripping");
+		err = -EIO;
+	} else {
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+	}
+
+	return err;
+}
+
+static int
+ice_vlan_tpid_set(struct rte_eth_dev *dev,
+		   enum rte_vlan_type vlan_type,
+		   uint16_t tpid)
+{
+	struct ice_pf *pf = ICE_DEV_PRIVATE_TO_PF(dev->data->dev_private);
+	struct ice_hw *hw = ICE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ice_vsi *vsi = pf->main_vsi;
+	uint64_t qinq = dev->data->dev_conf.rxmode.offloads &
+		   RTE_ETH_RX_OFFLOAD_VLAN_EXTEND;
+	int err = 0;
+
+	if ((vlan_type != RTE_ETH_VLAN_TYPE_INNER &&
+	     vlan_type != RTE_ETH_VLAN_TYPE_OUTER) ||
+	     (!qinq && vlan_type == RTE_ETH_VLAN_TYPE_INNER) ||
+		 !ice_is_supported_port_vlan_proto(hw, tpid)) {
+		PMD_DRV_LOG(ERR,
+			    "Unsupported vlan type.");
+		return -EINVAL;
+	}
+
+	err = ice_vsi_ena_outer_insertion(vsi, tpid);
+	if (!err)
+		pf->outer_ethertype = tpid;
+
+	return err;
+}
+
 static int
 ice_get_eeprom_length(struct rte_eth_dev *dev)
 {
diff --git a/drivers/net/ice/ice_ethdev.h b/drivers/net/ice/ice_ethdev.h
index 9140f3af79..f925231f34 100644
--- a/drivers/net/ice/ice_ethdev.h
+++ b/drivers/net/ice/ice_ethdev.h
@@ -550,6 +550,7 @@ struct ice_pf {
 	uint64_t supported_rxdid; /* bitmap for supported RXDID */
 	uint64_t rss_hf;
 	struct ice_tm_conf tm_conf;
+	uint16_t outer_ethertype;
 };
 
 #define ICE_MAX_QUEUE_NUM  2048
-- 
2.25.1


* [PATCH v2] net/ice: support double vlan
  2023-04-20  6:16 [PATCH] net/ice: CVL support double vlan Mingjin Ye
@ 2023-05-06 10:04 ` Mingjin Ye
  2023-05-26 10:16   ` Xu, Ke1
  2023-07-17  9:36 ` [POC] net/iavf: support no data path polling mode Mingjin Ye
  1 sibling, 1 reply; 16+ messages in thread
From: Mingjin Ye @ 2023-05-06 10:04 UTC
  To: dev; +Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Qi Zhang

Aligned with the kernel driver: optimized inner and outer VLAN handling
in DPDK and implemented double VLAN insertion and stripping support.
The NIC must work in double VLAN mode (DVM), which depends on FW/SW
support.

1. Adjust VLAN stripping
The VLAN stripping enable/disable now applies to the inner VLAN.

2. Support QinQ stripping
The ice outer VLAN strip is enabled/disabled by the mask bit of
`RTE_ETH_RX_OFFLOAD_QINQ_STRIP`, and the user can run
"vlan set qinq_strip on 0" to enable or "vlan set qinq_strip off 0" to
disable the ice outer VLAN strip in testpmd.

3. Support outer tag type switching
Implement the ethdev `vlan_tpid_set` API to enable outer tag support
for the `RTE_ETHER_TYPE_VLAN`, `RTE_ETHER_TYPE_QINQ` and
`RTE_ETHER_TYPE_QINQ1` outer tag types.

4. Support outer port-based VLAN insertion
Implement port-based outer VLAN insertion. Users can run "tx_vlan set
pvid 0 45 on" to enable or "tx_vlan set pvid 0 45 off" to disable the
outer VLAN insertion in testpmd. A short example session follows.
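
As a hypothetical testpmd session exercising the commands above on
port 0 (command output omitted; 0x88a8 is used as the outer tag type):

    testpmd> vlan set qinq_strip on 0
    testpmd> vlan set outer tpid 0x88a8 0
    testpmd> tx_vlan set pvid 0 45 on

The qinq_strip command requires the port to be in DVM; otherwise the
driver rejects it with -EOPNOTSUPP (see
ice_vsi_config_outer_vlan_stripping below).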

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
v2: Apply QinQ when initializing vlan offload.
---
 drivers/net/ice/ice_ethdev.c | 422 +++++++++++++++++++++++++++++++++--
 drivers/net/ice/ice_ethdev.h |   1 +
 2 files changed, 408 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c
index 9a88cf9796..f79dcff3d5 100644
--- a/drivers/net/ice/ice_ethdev.c
+++ b/drivers/net/ice/ice_ethdev.c
@@ -56,6 +56,17 @@ static const char * const ice_valid_args[] = {
 
 #define PPS_OUT_DELAY_NS  1
 
+/* Maximum number of VSI */
+#define ICE_MAX_NUM_VSIS          (768UL)
+
+/* Bit offset 119 of the LAN Rx queue context is the L2TSEL control bit. */
+#define ICE_L2TSEL_QRX_CONTEXT_REG_IDX	3
+#define ICE_L2TSEL_BIT_OFFSET		   23
+enum ice_l2tsel {
+	ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND,
+	ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1,
+};
+
 struct proto_xtr_ol_flag {
 	const struct rte_mbuf_dynflag param;
 	bool required;
@@ -130,6 +141,9 @@ static int ice_fw_version_get(struct rte_eth_dev *dev, char *fw_version,
 			      size_t fw_size);
 static int ice_vlan_pvid_set(struct rte_eth_dev *dev,
 			     uint16_t pvid, int on);
+static int ice_vlan_tpid_set(struct rte_eth_dev *dev,
+		   enum rte_vlan_type vlan_type,
+		   uint16_t tpid);
 static int ice_get_eeprom_length(struct rte_eth_dev *dev);
 static int ice_get_eeprom(struct rte_eth_dev *dev,
 			  struct rte_dev_eeprom_info *eeprom);
@@ -252,6 +266,7 @@ static const struct eth_dev_ops ice_eth_dev_ops = {
 	.rx_queue_intr_disable        = ice_rx_queue_intr_disable,
 	.fw_version_get               = ice_fw_version_get,
 	.vlan_pvid_set                = ice_vlan_pvid_set,
+	.vlan_tpid_set                = ice_vlan_tpid_set,
 	.rxq_info_get                 = ice_rxq_info_get,
 	.txq_info_get                 = ice_txq_info_get,
 	.rx_burst_mode_get            = ice_rx_burst_mode_get,
@@ -1588,6 +1603,9 @@ ice_setup_vsi(struct ice_pf *pf, enum ice_vsi_type type)
 			hw->func_caps.common_cap.rss_table_size;
 	pf->flags |= ICE_FLAG_RSS_AQ_CAPABLE;
 
+	/* Defines the type of outer tag expected */
+	pf->outer_ethertype = RTE_ETHER_TYPE_VLAN;
+
 	memset(&vsi_ctx, 0, sizeof(vsi_ctx));
 	switch (type) {
 	case ICE_VSI_PF:
@@ -1615,6 +1633,9 @@ ice_setup_vsi(struct ice_pf *pf, enum ice_vsi_type type)
 				(ICE_AQ_VSI_OUTER_TAG_VLAN_8100 <<
 				 ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
 				ICE_AQ_VSI_OUTER_TAG_TYPE_M;
+			vsi_ctx.info.outer_vlan_flags |=
+				(ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING <<
+				ICE_AQ_VSI_OUTER_VLAN_EMODE_S);
 		}
 
 		/* FDIR */
@@ -3698,7 +3719,7 @@ ice_dev_start(struct rte_eth_dev *dev)
 	ice_set_tx_function(dev);
 
 	mask = RTE_ETH_VLAN_STRIP_MASK | RTE_ETH_VLAN_FILTER_MASK |
-			RTE_ETH_VLAN_EXTEND_MASK;
+			RTE_ETH_VLAN_EXTEND_MASK | RTE_ETH_QINQ_STRIP_MASK;
 	ret = ice_vlan_offload_set(dev, mask);
 	if (ret) {
 		PMD_INIT_LOG(ERR, "Unable to set VLAN offload");
@@ -4431,11 +4452,86 @@ ice_vsi_dis_inner_stripping(struct ice_vsi *vsi)
 	return ice_vsi_manage_vlan_stripping(vsi, false);
 }
 
-static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
+/**
+ * tpid_to_vsi_outer_vlan_type - convert from TPID to VSI context based tag_type
+ * @tpid: tpid used to translate into VSI context based tag_type
+ * @tag_type: output variable to hold the VSI context based tag type
+ */
+static int tpid_to_vsi_outer_vlan_type(u16 tpid, u8 *tag_type)
+{
+	switch (tpid) {
+	case RTE_ETHER_TYPE_VLAN:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_8100;
+		break;
+	case RTE_ETHER_TYPE_QINQ:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_STAG;
+		break;
+	case RTE_ETHER_TYPE_QINQ1:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_9100;
+		break;
+	default:
+		*tag_type = 0;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_is_supported_port_vlan_proto - make sure the vlan_proto is supported
+ * @hw: hardware structure used to check the VLAN mode
+ * @vlan_proto: VLAN TPID being checked
+ *
+ * If the device is configured in Double VLAN Mode (DVM), it supports three
+ * types: RTE_ETHER_TYPE_VLAN, RTE_ETHER_TYPE_QINQ1 and RTE_ETHER_TYPE_QINQ. If the device is
+ * configured in Single VLAN Mode (SVM), then only RTE_ETHER_TYPE_VLAN is supported.
+ */
+static bool
+ice_is_supported_port_vlan_proto(struct ice_hw *hw, u16 vlan_proto)
+{
+	bool is_supported = false;
+
+	switch (vlan_proto) {
+	case RTE_ETHER_TYPE_VLAN:
+		is_supported = true;
+		break;
+	case RTE_ETHER_TYPE_QINQ:
+		if (ice_is_dvm_ena(hw))
+			is_supported = true;
+		break;
+	case RTE_ETHER_TYPE_QINQ1:
+		if (ice_is_dvm_ena(hw))
+			is_supported = true;
+		break;
+	}
+
+	return is_supported;
+}
+
+/**
+ * ice_vsi_ena_outer_stripping - enable outer VLAN stripping
+ * @vsi: VSI to configure
+ * @tpid: TPID to enable outer VLAN stripping for
+ *
+ * Enable outer VLAN stripping via VSI context. This function should only be
+ * used if DVM is supported.
+ *
+ * Since the VSI context only supports a single TPID for insertion and
+ * stripping, setting the TPID for stripping will affect the TPID for insertion.
+ * Callers need to be aware of this limitation.
+ *
+ * Only modify outer VLAN stripping settings and the VLAN TPID. Outer VLAN
+ * insertion settings are unmodified.
+ *
+ * This enables the hardware to strip a VLAN tag with the specified TPID
+ * from the packet and place it in the receive descriptor.
+ */
+static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid)
 {
 	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
 	struct ice_vsi_ctx ctxt;
 	enum ice_status status;
+	u8 tag_type;
 	int err = 0;
 
 	/* do not allow modifying VLAN stripping when a port VLAN is configured
@@ -4444,6 +4540,9 @@ static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
 	if (vsi->info.port_based_outer_vlan)
 		return 0;
 
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
 	memset(&ctxt, 0, sizeof(ctxt));
 
 	ctxt.info.valid_sections =
@@ -4454,8 +4553,8 @@ static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
 	ctxt.info.outer_vlan_flags |=
 		(ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH <<
 		 ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
-		(ICE_AQ_VSI_OUTER_TAG_VLAN_8100 <<
-		 ICE_AQ_VSI_OUTER_TAG_TYPE_S);
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M);
 
 	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
 	if (status) {
@@ -4503,22 +4602,105 @@ ice_vsi_dis_outer_stripping(struct ice_vsi *vsi)
 static int
 ice_vsi_config_vlan_stripping(struct ice_vsi *vsi, bool ena)
 {
-	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
 	int ret;
 
-	if (ice_is_dvm_ena(hw)) {
-		if (ena)
-			ret = ice_vsi_ena_outer_stripping(vsi);
-		else
-			ret = ice_vsi_dis_outer_stripping(vsi);
+	if (ena)
+		ret = ice_vsi_ena_inner_stripping(vsi);
+	else
+		ret = ice_vsi_dis_inner_stripping(vsi);
+
+	return ret;
+}
+
+/**
+ * ice_vsi_update_l2tsel - update l2tsel field for all Rx rings on this VSI
+ * @vsi: VSI used to update l2tsel on
+ * @l2tsel: l2tsel setting requested
+ *
+ * Use the l2tsel setting to update all of the Rx queue context bits for l2tsel.
+ * This will modify which descriptor field the first offloaded VLAN will be
+ * stripped into.
+ */
+static void ice_vsi_update_l2tsel(struct ice_vsi *vsi, enum ice_l2tsel l2tsel)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_pf *pf = ICE_VSI_TO_PF(vsi);
+	struct rte_eth_dev_data *dev_data = pf->dev_data;
+	u32 l2tsel_bit;
+	uint16_t i;
+
+	if (l2tsel == ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND)
+		l2tsel_bit = 0;
+	else
+		l2tsel_bit = BIT(ICE_L2TSEL_BIT_OFFSET);
+
+	for (i = 0; i < dev_data->nb_rx_queues; i++) {
+		u32 qrx_context_offset;
+		u32 regval;
+
+		qrx_context_offset =
+			QRX_CONTEXT(ICE_L2TSEL_QRX_CONTEXT_REG_IDX, i);
+
+		regval = rd32(hw, qrx_context_offset);
+		regval &= ~BIT(ICE_L2TSEL_BIT_OFFSET);
+		regval |= l2tsel_bit;
+		wr32(hw, qrx_context_offset, regval);
+	}
+}
+
+/* Configure outer vlan stripping on or off in QinQ mode */
+static int
+ice_vsi_config_outer_vlan_stripping(struct ice_vsi *vsi, bool on)
+{
+	uint16_t outer_ethertype = vsi->adapter->pf.outer_ethertype;
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	int err = 0;
+
+	if (vsi->vsi_id >= ICE_MAX_NUM_VSIS) {
+		PMD_DRV_LOG(ERR, "VSI ID exceeds the maximum");
+		return -EINVAL;
+	}
+
+	if (!ice_is_dvm_ena(hw)) {
+		PMD_DRV_LOG(ERR, "Single VLAN mode (SVM) does not support qinq");
+		return -EOPNOTSUPP;
+	}
+
+	if (on) {
+		err = ice_vsi_ena_outer_stripping(vsi, outer_ethertype);
+		if (!err) {
+			enum ice_l2tsel l2tsel =
+				ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND;
+
+			/* PF tells the VF that the outer VLAN tag is always
+			 * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and
+			 * inner is always extracted to
+			 * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to
+			 * support outer stripping so the first tag always ends
+			 * up in L2TAG2_2ND and the second/inner tag, if
+			 * enabled, is extracted in L2TAG1.
+			 */
+			ice_vsi_update_l2tsel(vsi, l2tsel);
+		}
 	} else {
-		if (ena)
-			ret = ice_vsi_ena_inner_stripping(vsi);
-		else
-			ret = ice_vsi_dis_inner_stripping(vsi);
+		err = ice_vsi_dis_outer_stripping(vsi);
+		if (!err) {
+			enum ice_l2tsel l2tsel =
+				ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1;
+
+			/* PF tells the VF that the outer VLAN tag is always
+			 * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and
+			 * inner is always extracted to
+			 * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to
+			 * support inner stripping while outer stripping is
+			 * disabled so that the first and only tag is extracted
+			 * in L2TAG1.
+			 */
+			ice_vsi_update_l2tsel(vsi, l2tsel);
+		}
 	}
 
-	return ret;
+	return err;
 }
 
 static int
@@ -4543,6 +4725,14 @@ ice_vlan_offload_set(struct rte_eth_dev *dev, int mask)
 			ice_vsi_config_vlan_stripping(vsi, false);
 	}
 
+	if (mask & RTE_ETH_QINQ_STRIP_MASK) {
+		/* Enable or disable outer VLAN stripping */
+		if (rxmode->offloads & RTE_ETH_RX_OFFLOAD_QINQ_STRIP)
+			ice_vsi_config_outer_vlan_stripping(vsi, true);
+		else
+			ice_vsi_config_outer_vlan_stripping(vsi, false);
+	}
+
 	return 0;
 }
 
@@ -5019,6 +5209,130 @@ ice_vsi_vlan_pvid_set(struct ice_vsi *vsi, struct ice_vsi_vlan_pvid_info *info)
 	return ret;
 }
 
+/**
+ * ice_vsi_set_outer_port_vlan - set the outer port VLAN and related settings
+ * @vsi: VSI to configure
+ * @vlan_info: packed u16 that contains the VLAN prio and ID
+ * @tpid: TPID of the port VLAN
+ *
+ * Set the port VLAN prio, ID, and TPID.
+ *
+ * Enable VLAN pruning so the VSI doesn't receive any traffic that doesn't match
+ * a VLAN prune rule. The caller should take care to add a VLAN prune rule that
+ * matches the port VLAN ID and TPID.
+ *
+ * Tell hardware to strip outer VLAN tagged packets on receive and don't put
+ * them in the receive descriptor. VSI(s) in port VLANs should not be aware of
+ * the port VLAN ID or TPID they are assigned to.
+ *
+ * Tell hardware to prevent outer VLAN tag insertion on transmit and only allow
+ * untagged outer packets from the transmit descriptor.
+ *
+ * Also, tell the hardware to insert the port VLAN on transmit.
+ */
+static int
+ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, u16 vlan_info, u16 tpid)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	u8 tag_type;
+	int err = 0;
+
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+
+	ctxt.info = vsi->info;
+
+	ctxt.info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+
+	ctxt.info.port_based_outer_vlan = rte_cpu_to_le_16(vlan_info);
+	ctxt.info.outer_vlan_flags =
+		(ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW <<
+		 ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M) |
+		ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		(ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED <<
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) |
+		ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT;
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID |
+			    ICE_AQ_VSI_PROP_SW_VALID);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status != ICE_SUCCESS) {
+		PMD_DRV_LOG(ERR,
+		"update VSI for setting outer port based VLAN failed, err %d",
+		status);
+		err = -EINVAL;
+	} else {
+		vsi->info.port_based_outer_vlan = ctxt.info.port_based_outer_vlan;
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+		vsi->info.sw_flags2 = ctxt.info.sw_flags2;
+	}
+
+	return err;
+}
+
+/**
+ * ice_vsi_dis_outer_insertion - disable outer VLAN insertion
+ * @vsi: VSI to configure
+ * @info: vlan pvid info
+ *
+ * Disable outer VLAN insertion via VSI context. This function should only be
+ * used if DVM is supported.
+ *
+ * Only modify the outer VLAN insertion settings. The VLAN TPID and outer VLAN
+ * settings are unmodified.
+ *
+ * This tells the hardware to not allow VLAN tagged packets in the transmit
+ * descriptor. This enables software offloaded VLAN insertion and disables
+ * hardware offloaded VLAN insertion.
+ */
+static int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi, struct ice_vsi_vlan_pvid_info *info)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	uint8_t vlan_flags = 0;
+	int err = 0;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+	ctxt.info.port_based_inner_vlan = 0;
+	/* clear current outer VLAN insertion settings */
+	ctxt.info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+		~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT |
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M);
+	if (info->config.reject.tagged == 0)
+		vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTTAGGED;
+	if (info->config.reject.untagged == 0)
+		vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED;
+	ctxt.info.outer_vlan_flags |=
+		ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		((vlan_flags <<
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status) {
+		PMD_DRV_LOG(ERR,
+			    "update VSI for disabling outer VLAN insertion failed, err %d",
+			    status);
+		err = -EINVAL;
+	} else {
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+		vsi->info.port_based_inner_vlan = ctxt.info.port_based_inner_vlan;
+	}
+
+	return err;
+}
+
 static int
 ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 {
@@ -5039,6 +5353,13 @@ ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 			data->dev_conf.txmode.hw_vlan_reject_untagged;
 	}
 
+	if (ice_is_dvm_ena(&vsi->adapter->hw)) {
+		if (on)
+			return ice_vsi_set_outer_port_vlan(vsi, pvid, pf->outer_ethertype);
+		else
+			return ice_vsi_dis_outer_insertion(vsi, &info);
+	}
+
 	ret = ice_vsi_vlan_pvid_set(vsi, &info);
 	if (ret < 0) {
 		PMD_DRV_LOG(ERR, "Failed to set pvid.");
@@ -5048,6 +5369,77 @@ ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 	return 0;
 }
 
+static int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, uint16_t tpid)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	int err = 0;
+	u8 tag_type;
+	/* do not allow modifying VLAN stripping when a port VLAN is configured
+	 * on this VSI
+	 */
+	if (vsi->info.port_based_outer_vlan)
+		return 0;
+
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+	/* clear current outer VLAN insertion settings */
+	ctxt.info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+		~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT |
+		  ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M |
+		  ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+	ctxt.info.outer_vlan_flags |=
+		((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL <<
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M) |
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status) {
+		PMD_DRV_LOG(ERR, "Update VSI failed to enable outer VLAN stripping");
+		err = -EIO;
+	} else {
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+	}
+
+	return err;
+}
+
+static int
+ice_vlan_tpid_set(struct rte_eth_dev *dev,
+		   enum rte_vlan_type vlan_type,
+		   uint16_t tpid)
+{
+	struct ice_pf *pf = ICE_DEV_PRIVATE_TO_PF(dev->data->dev_private);
+	struct ice_hw *hw = ICE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ice_vsi *vsi = pf->main_vsi;
+	uint64_t qinq = dev->data->dev_conf.rxmode.offloads &
+		   RTE_ETH_RX_OFFLOAD_QINQ_STRIP;
+	int err = 0;
+
+	if ((vlan_type != RTE_ETH_VLAN_TYPE_INNER &&
+	     vlan_type != RTE_ETH_VLAN_TYPE_OUTER) ||
+	     (!qinq && vlan_type == RTE_ETH_VLAN_TYPE_INNER) ||
+		 !ice_is_supported_port_vlan_proto(hw, tpid)) {
+		PMD_DRV_LOG(ERR,
+			    "Unsupported vlan type.");
+		return -EINVAL;
+	}
+
+	err = ice_vsi_ena_outer_insertion(vsi, tpid);
+	if (!err)
+		pf->outer_ethertype = tpid;
+
+	return err;
+}
+
 static int
 ice_get_eeprom_length(struct rte_eth_dev *dev)
 {
diff --git a/drivers/net/ice/ice_ethdev.h b/drivers/net/ice/ice_ethdev.h
index 9140f3af79..f925231f34 100644
--- a/drivers/net/ice/ice_ethdev.h
+++ b/drivers/net/ice/ice_ethdev.h
@@ -550,6 +550,7 @@ struct ice_pf {
 	uint64_t supported_rxdid; /* bitmap for supported RXDID */
 	uint64_t rss_hf;
 	struct ice_tm_conf tm_conf;
+	uint16_t outer_ethertype;
 };
 
 #define ICE_MAX_QUEUE_NUM  2048
-- 
2.25.1


* RE: [PATCH v2] net/ice: support double vlan
  2023-05-06 10:04 ` [PATCH v2] net/ice: " Mingjin Ye
@ 2023-05-26 10:16   ` Xu, Ke1
  2023-05-26 11:10     ` Zhang, Qi Z
  0 siblings, 1 reply; 16+ messages in thread
From: Xu, Ke1 @ 2023-05-26 10:16 UTC
  To: Ye, MingjinX, dev; +Cc: Yang, Qiming, Zhou, YidingX, Ye, MingjinX, Zhang, Qi Z

> From: Mingjin Ye <mingjinx.ye@intel.com>
> Sent: Saturday, May 6, 2023 6:05 PM
> To: dev@dpdk.org
> Cc: Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Ye, MingjinX <mingjinx.ye@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: [PATCH v2] net/ice: support double vlan
> 
> Aligned with kernel driver, optimized for inner and outer VLAN handling in DPDK,
> and implemented double vlan insertion and stripping support.
> NIC must work in double vlan mode(DVM), depending on FW/SW.
> 
> 1. Adjust vlan stripping
> The vlan stripping enable/disable is applied to the inner vlan.
> 
> 2. Support QinQ stripping
> The ice outer vlan strip is enabled/disabled by the mask bit of
> `RTE_ETH_RX_OFFLOAD_QINQ_STRIP`, and the user can use "vlan set qinq_strip
> on 0" to enable or "vlan setqinq_strip off 0" to disable the ice outer vlan strip in
> testpmd.
> 
> 3. Support outer tag type switching
> Implement the ethdev `vlan_tpid_set` api to enable outer tag support to handle
> `RTE_ETHER_TYPE_VLAN`` RTE_ETHER_TYPE_QINQ`` RTE_ETHER_TYPE_QINQ1`
> outer tag types.
> 
> 4. Support outer port-based vlan insertion Implement port-based outer vlan
> insertion. User can use "tx_vlan set pvid 0 45 on" to enable or "tx_vlan set pvid 0
> 45 off" to disable the outer vlan insertion in testpmd.
> 
> Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>

Tested and passed.

There are several points that need to be mentioned.
1. Only the scalar path is implemented; it needs force-max-simd-bitwidth.
2. Not supported: StripQ.
3. Hardware limitation: requires different outer and inner VLAN types.
   This means setting the outer TPID to 0x88a8 before double VLAN
   insertion (see the sketch below).
4. Undefined behaviour: configuring VLAN insert and TPID together is not
   defined in this new feature. TPID is designed to work with PVID for
   now. Under this implementation, changing the TPID would not change
   the TX outer VLAN type used for double VLAN insertion.
Tested-by: Ke Xu <ke1.xu@intel.com>

> ---
> v2: Apply QinQ when initializing vlan offload.
> ---
>  drivers/net/ice/ice_ethdev.c | 422 +++++++++++++++++++++++++++++++++--
>  drivers/net/ice/ice_ethdev.h |   1 +
>  2 files changed, 408 insertions(+), 15 deletions(-)
> 


* RE: [PATCH v2] net/ice: support double vlan
  2023-05-26 10:16   ` Xu, Ke1
@ 2023-05-26 11:10     ` Zhang, Qi Z
  0 siblings, 0 replies; 16+ messages in thread
From: Zhang, Qi Z @ 2023-05-26 11:10 UTC
  To: Xu, Ke1, Ye, MingjinX, dev; +Cc: Yang, Qiming, Zhou, YidingX, Ye, MingjinX



> -----Original Message-----
> From: Xu, Ke1 <ke1.xu@intel.com>
> Sent: Friday, May 26, 2023 6:16 PM
> To: Ye, MingjinX <mingjinx.ye@intel.com>; dev@dpdk.org
> Cc: Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Ye, MingjinX <mingjinx.ye@intel.com>; Zhang, Qi
> Z <qi.z.zhang@intel.com>
> Subject: RE: [PATCH v2] net/ice: support double vlan
> 
> > From: Mingjin Ye <mingjinx.ye@intel.com>
> > Sent: Saturday, May 6, 2023 6:05 PM
> > To: dev@dpdk.org
> > Cc: Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> > <yidingx.zhou@intel.com>; Ye, MingjinX <mingjinx.ye@intel.com>; Zhang,
> > Qi Z <qi.z.zhang@intel.com>
> > Subject: [PATCH v2] net/ice: support double vlan
> >
> > Aligned with kernel driver, optimized for inner and outer VLAN
> > handling in DPDK, and implemented double vlan insertion and stripping
> support.
> > NIC must work in double vlan mode(DVM), depending on FW/SW.
> >
> > 1. Adjust vlan stripping
> > The vlan stripping enable/disable is applied to the inner vlan.
> >
> > 2. Support QinQ stripping
> > The ice outer vlan strip is enabled/disabled by the mask bit of
> > `RTE_ETH_RX_OFFLOAD_QINQ_STRIP`, and the user can use "vlan set
> > qinq_strip on 0" to enable or "vlan setqinq_strip off 0" to disable
> > the ice outer vlan strip in testpmd.
> >
> > 3. Support outer tag type switching
> > Implement the ethdev `vlan_tpid_set` api to enable outer tag support
> > to handle `RTE_ETHER_TYPE_VLAN`` RTE_ETHER_TYPE_QINQ``
> > RTE_ETHER_TYPE_QINQ1` outer tag types.
> >
> > 4. Support outer port-based vlan insertion Implement port-based outer
> > vlan insertion. User can use "tx_vlan set pvid 0 45 on" to enable or
> > "tx_vlan set pvid 0
> > 45 off" to disable the outer vlan insertion in testpmd.
> >
> > Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
> 
> Tested and passed.
> 
> There are several points need to be mentioned.
> 1. Only Scalar path is implemented, needs force-max-simd-bitwidth.
> 2. Not supported: StripQ.
> 3. Hardware limitation: Requires different outer and inner VLAN type. It
> means
>   setting outer TPID to 0x88a8 before double VLAN insert.
> 4. Undefined behaviour: VLAN insert and TPID configured together is not
> defined
>   in this new feature. TPID is designed to work with PVID now. Under this
> implementation
>   changing TPID would not change the TX outer VLAN type of double VLAN
> insert.
> 
> Tested-by: Ke Xu <ke1.xu@intel.com>

Acked-by: Qi Zhang <qi.z.zhang@intel.com>

Applied to dpdk-next-net-intel.

Thanks
Qi


* [POC] net/iavf: support no data path polling mode
  2023-04-20  6:16 [PATCH] net/ice: CVL support double vlan Mingjin Ye
  2023-05-06 10:04 ` [PATCH v2] net/ice: " Mingjin Ye
@ 2023-07-17  9:36 ` Mingjin Ye
  2023-07-20 10:08   ` [POC v2] " Mingjin Ye
  1 sibling, 1 reply; 16+ messages in thread
From: Mingjin Ye @ 2023-07-17  9:36 UTC
  To: dev
  Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Jingjing Wu, Beilei Xing,
	Bruce Richardson, Konstantin Ananyev

Introduce a devarg "no-poll-on-link-down" in the iavf PMD. When this
flag is set, the PMD switches to no-poll mode when the link state is
down (Rx/Tx bursts return 0 immediately). When the link state returns
to normal, the PMD switches back to the normal Rx/Tx burst state. An
example invocation is given below.
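
For example, a hypothetical testpmd invocation enabling the mode (the
PCI address is a placeholder):

    dpdk-testpmd -a 0000:18:01.0,no-poll-on-link-down=1 -- -i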

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
 drivers/net/iavf/iavf.h                 |  2 ++
 drivers/net/iavf/iavf_ethdev.c          | 10 ++++++
 drivers/net/iavf/iavf_rxtx.c            | 29 +++++++++++++++--
 drivers/net/iavf/iavf_rxtx.h            |  1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 29 ++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 42 ++++++++++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_sse.c    | 21 ++++++++++++-
 drivers/net/iavf/iavf_vchnl.c           | 19 +++++++++++
 8 files changed, 141 insertions(+), 12 deletions(-)

diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 98861e4242..30b05d25b6 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -305,6 +305,7 @@ struct iavf_devargs {
 	uint8_t proto_xtr[IAVF_MAX_QUEUE_NUM];
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -323,6 +324,7 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index ac7154d720..41a8947f61 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -37,6 +37,7 @@
 #define IAVF_PROTO_XTR_ARG         "proto_xtr"
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG       "no-poll-on-link-down"
 
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
@@ -2237,6 +2238,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2270,6 +2272,14 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	no_poll_on_link_down = rte_kvargs_count(kvlist,
+		IAVF_NO_POLL_ON_LINK_DOWN_ARG);
+
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index f7df4665d1..447e306fee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -770,6 +770,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -843,6 +844,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -1406,9 +1408,12 @@ iavf_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1515,9 +1520,12 @@ iavf_recv_pkts_flex_rxd(void *rx_queue,
 	const uint32_t *ptype_tbl;
 	uint64_t ts_ns;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1641,6 +1649,9 @@ iavf_recv_scattered_pkts_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_flex_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
 		uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);
 
@@ -1818,6 +1829,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
 		qword1 = rte_le_to_cpu_64(rxdp->wb.qword1.status_error_len);
@@ -1973,6 +1987,9 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	uint64_t ts_ns;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = (volatile union iavf_rx_flex_desc *)&rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2104,6 +2121,9 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint1
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = &rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2281,6 +2301,9 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	struct iavf_rx_queue *rxq = (struct iavf_rx_queue *)rx_queue;
 	uint16_t nb_rx = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (!nb_pkts)
 		return 0;
 
@@ -2768,6 +2791,8 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	uint16_t idx;
 	uint16_t slen;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
 
 	/* Check if the descriptor ring needs to be cleaned. */
 	if (txq->nb_free < txq->free_thresh)
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index c10f24036e..98316bed24 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -26,8 +26,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	/* const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; */
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -36,6 +35,11 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	type_table = rxq->vsi->adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -530,12 +534,12 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -543,6 +547,15 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+	type_table = adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1774,6 +1787,9 @@ iavf_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs(txq);
 
@@ -1834,6 +1850,9 @@ iavf_xmit_pkts_vec_avx2_common(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3e66df5341..8de739434c 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -45,7 +45,7 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       bool offload)
 {
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -53,6 +53,13 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -588,12 +595,12 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 					uint8_t *split_packet,
 					bool offload)
 {
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -602,6 +609,17 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1700,6 +1718,10 @@ iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkt
 					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
@@ -2303,6 +2325,9 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2370,6 +2395,9 @@ iavf_xmit_fixed_burst_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2432,6 +2460,9 @@ iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
@@ -2498,6 +2529,9 @@ iavf_xmit_pkts_vec_avx512_ctx_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 892bfa4cf3..4007ce6f6f 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -479,7 +479,12 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	int pos;
 	uint64_t var;
 	__m128i shuf_msk;
-	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *ptype_tbl;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
 	__m128i crc_adjust = _mm_set_epi16(
 				0, 0, 0,    /* ignore non-length fields */
@@ -1198,6 +1203,11 @@ uint16_t
 iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts)
 {
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi)
+		return 0;
+
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
@@ -1215,6 +1225,9 @@ iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1284,6 +1297,9 @@ iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1437,6 +1453,9 @@ iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 524732f67d..ca2bdd5408 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -263,6 +263,15 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "IAVF no-poll turned off !!!\n");
+			}
+			if (adapter->dev_data->dev_started && !vf->link_up &&
+				adapter->devargs.no_poll_on_link_down) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "IAVF no-poll turned on !!!\n");
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -465,6 +474,16 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (vf->link_up && adapter->no_poll) {
+			adapter->no_poll = false;
+			PMD_DRV_LOG(DEBUG, "IAVF no-poll turned off!!!\n");
+		}
+		if (dev->data->dev_started && !vf->link_up &&
+			adapter->devargs.no_poll_on_link_down) {
+			adapter->no_poll = true;
+			PMD_DRV_LOG(DEBUG, "IAVF no-poll turned on !!!\n");
+		}
+
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [POC v2] net/iavf: support no data path polling mode
  2023-07-17  9:36 ` [POC] net/iavf: support no data path polling mode Mingjin Ye
@ 2023-07-20 10:08   ` Mingjin Ye
  2023-07-20 15:45     ` Stephen Hemminger
  2023-07-21  9:57     ` [POC v3] " Mingjin Ye
  0 siblings, 2 replies; 16+ messages in thread
From: Mingjin Ye @ 2023-07-20 10:08 UTC (permalink / raw)
  To: dev
  Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Jingjing Wu, Beilei Xing,
	Bruce Richardson, Konstantin Ananyev

Introduce a devargs option "no-poll-on-link-down" in the iavf PMD. When
this flag is set, the PMD switches to no-poll mode when the link state
is down (Rx/Tx bursts return 0 immediately). When the link state
returns to normal, the PMD switches back to the normal Rx/Tx burst
path.
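
A quick way to exercise the mode (assuming testpmd and the 18:01.0
device address used elsewhere in this thread) is to pass the devargs
at start-up, e.g.:

    dpdk-testpmd -a 18:01.0,no-poll-on-link-down=1 -- -i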

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
V2: Add IAVF_NO_POLL_ON_LINK_DOWN_ARG macro to iavf_valid_args.
---
 drivers/net/iavf/iavf.h                 |  2 ++
 drivers/net/iavf/iavf_ethdev.c          | 11 +++++++
 drivers/net/iavf/iavf_rxtx.c            | 29 +++++++++++++++--
 drivers/net/iavf/iavf_rxtx.h            |  1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 29 ++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 42 ++++++++++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_sse.c    | 21 ++++++++++++-
 drivers/net/iavf/iavf_vchnl.c           | 19 +++++++++++
 8 files changed, 142 insertions(+), 12 deletions(-)

diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 98861e4242..30b05d25b6 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -305,6 +305,7 @@ struct iavf_devargs {
 	uint8_t proto_xtr[IAVF_MAX_QUEUE_NUM];
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -323,6 +324,7 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index ac7154d720..c922c64838 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -37,6 +37,7 @@
 #define IAVF_PROTO_XTR_ARG         "proto_xtr"
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG       "no-poll-on-link-down"
 
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
@@ -45,6 +46,7 @@ static const char * const iavf_valid_args[] = {
 	IAVF_PROTO_XTR_ARG,
 	IAVF_QUANTA_SIZE_ARG,
 	IAVF_RESET_WATCHDOG_ARG,
+	IAVF_NO_POLL_ON_LINK_DOWN_ARG,
 	NULL
 };
 
@@ -2237,6 +2239,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2270,6 +2273,14 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	no_poll_on_link_down = rte_kvargs_count(kvlist,
+		IAVF_NO_POLL_ON_LINK_DOWN_ARG);
+
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index f7df4665d1..447e306fee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -770,6 +770,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -843,6 +844,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -1406,9 +1408,12 @@ iavf_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1515,9 +1520,12 @@ iavf_recv_pkts_flex_rxd(void *rx_queue,
 	const uint32_t *ptype_tbl;
 	uint64_t ts_ns;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1641,6 +1649,9 @@ iavf_recv_scattered_pkts_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_flex_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
 		uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);
 
@@ -1818,6 +1829,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
 		qword1 = rte_le_to_cpu_64(rxdp->wb.qword1.status_error_len);
@@ -1973,6 +1987,9 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	uint64_t ts_ns;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = (volatile union iavf_rx_flex_desc *)&rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2104,6 +2121,9 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint1
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = &rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2281,6 +2301,9 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	struct iavf_rx_queue *rxq = (struct iavf_rx_queue *)rx_queue;
 	uint16_t nb_rx = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (!nb_pkts)
 		return 0;
 
@@ -2768,6 +2791,8 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	uint16_t idx;
 	uint16_t slen;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
 
 	/* Check if the descriptor ring needs to be cleaned. */
 	if (txq->nb_free < txq->free_thresh)
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index c10f24036e..98316bed24 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -26,8 +26,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	/* const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; */
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -36,6 +35,11 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	type_table = rxq->vsi->adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -530,12 +534,12 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -543,6 +547,15 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+	type_table = adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1774,6 +1787,9 @@ iavf_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs(txq);
 
@@ -1834,6 +1850,9 @@ iavf_xmit_pkts_vec_avx2_common(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3e66df5341..8de739434c 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -45,7 +45,7 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       bool offload)
 {
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -53,6 +53,13 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -588,12 +595,12 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 					uint8_t *split_packet,
 					bool offload)
 {
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -602,6 +609,17 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1700,6 +1718,10 @@ iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkt
 					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
@@ -2303,6 +2325,9 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2370,6 +2395,9 @@ iavf_xmit_fixed_burst_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2432,6 +2460,9 @@ iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
@@ -2498,6 +2529,9 @@ iavf_xmit_pkts_vec_avx512_ctx_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 892bfa4cf3..4007ce6f6f 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -479,7 +479,12 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	int pos;
 	uint64_t var;
 	__m128i shuf_msk;
-	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *ptype_tbl;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
 	__m128i crc_adjust = _mm_set_epi16(
 				0, 0, 0,    /* ignore non-length fields */
@@ -1198,6 +1203,11 @@ uint16_t
 iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts)
 {
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi)
+		return 0;
+
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
@@ -1215,6 +1225,9 @@ iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1284,6 +1297,9 @@ iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1437,6 +1453,9 @@ iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 524732f67d..ca2bdd5408 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -263,6 +263,15 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "IAVF no-poll turned off !!!\n");
+			}
+			if (adapter->dev_data->dev_started && !vf->link_up &&
+				adapter->devargs.no_poll_on_link_down) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "IAVF no-poll turned on !!!\n");
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -465,6 +474,16 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (vf->link_up && adapter->no_poll) {
+			adapter->no_poll = false;
+			PMD_DRV_LOG(DEBUG, "IAVF no-poll turned off!!!\n");
+		}
+		if (dev->data->dev_started && !vf->link_up &&
+			adapter->devargs.no_poll_on_link_down) {
+			adapter->no_poll = true;
+			PMD_DRV_LOG(DEBUG, "IAVF no-poll turned on !!!\n");
+		}
+
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [POC v2] net/iavf: support no data path polling mode
  2023-07-20 10:08   ` [POC v2] " Mingjin Ye
@ 2023-07-20 15:45     ` Stephen Hemminger
  2023-07-21  9:57     ` [POC v3] " Mingjin Ye
  1 sibling, 0 replies; 16+ messages in thread
From: Stephen Hemminger @ 2023-07-20 15:45 UTC (permalink / raw)
  To: Mingjin Ye
  Cc: dev, qiming.yang, yidingx.zhou, Jingjing Wu, Beilei Xing,
	Bruce Richardson, Konstantin Ananyev

On Thu, 20 Jul 2023 10:08:14 +0000
Mingjin Ye <mingjinx.ye@intel.com> wrote:

> Introduce a devargs option "no-poll-on-link-down" in the iavf PMD. When
> this flag is set, the PMD switches to no-poll mode when the link state
> is down (Rx/Tx bursts return 0 immediately). When the link state
> returns to normal, the PMD switches back to the normal Rx/Tx burst
> path.
> 
> Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>

There is not enough explanation of this.
Why is this necessary?
Why is it unique to the iavf device and not true of other devices?
Where is the documentation?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [POC v3] net/iavf: support no data path polling mode
  2023-07-20 10:08   ` [POC v2] " Mingjin Ye
  2023-07-20 15:45     ` Stephen Hemminger
@ 2023-07-21  9:57     ` Mingjin Ye
  2023-08-11  6:27       ` [PATCH] " Mingjin Ye
  1 sibling, 1 reply; 16+ messages in thread
From: Mingjin Ye @ 2023-07-21  9:57 UTC (permalink / raw)
  To: dev
  Cc: qiming.yang, yidingx.zhou, stephen, Mingjin Ye, Jingjing Wu,
	Beilei Xing, Bruce Richardson, Konstantin Ananyev

Currently, during a PF to VF reset triggered by an action such as
changing trust settings on a VF, the DPDK application running with the
iavf PMD loses connectivity, and the only recovery is to restart the
DPDK application.

Instead of forcing a restart of the DPDK application to restore
connectivity, the iavf PMD handles the PF to VF reset event gracefully
by performing all the steps necessary to bring the VF back online.

To minimize downtime, a devargs option "no-poll-on-link-down" is
introduced in the iavf PMD. When this flag is set, the PMD switches to
no-poll mode when the link state is down (Rx/Tx bursts return 0
immediately). When the link state returns to normal, the PMD switches
back to the normal Rx/Tx burst path.

NOTE: The DPDK application needs to handle the
RTE_ETH_EVENT_INTR_RESET event posted by the iavf PMD and reset the
VF upon receiving this event.
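
A minimal sketch of the application-side handling that this NOTE asks
for (not part of this patch; the callback name and recovery flow are
illustrative only, and error handling is omitted) could look like:

	#include <rte_ethdev.h>

	static volatile int need_reset;

	static int
	reset_event_cb(uint16_t port_id, enum rte_eth_event_type event,
		       void *cb_arg, void *ret_param)
	{
		RTE_SET_USED(port_id);
		RTE_SET_USED(event);
		RTE_SET_USED(cb_arg);
		RTE_SET_USED(ret_param);
		need_reset = 1; /* defer the reset out of the interrupt thread */
		return 0;
	}

	/* at init time */
	rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_RESET,
				      reset_event_cb, NULL);

	/* in the main loop */
	if (need_reset) {
		need_reset = 0;
		rte_eth_dev_stop(port_id);
		rte_eth_dev_reset(port_id);
		/* reconfigure queues, then rte_eth_dev_start(port_id) */
	}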

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
V2: Add IAVF_NO_POLL_ON_LINK_DOWN_ARG macro to iavf_valid_args.
---
V3: Improve commit log.
---
 drivers/net/iavf/iavf.h                 |  2 ++
 drivers/net/iavf/iavf_ethdev.c          | 11 +++++++
 drivers/net/iavf/iavf_rxtx.c            | 29 +++++++++++++++--
 drivers/net/iavf/iavf_rxtx.h            |  1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 29 ++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 42 ++++++++++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_sse.c    | 21 ++++++++++++-
 drivers/net/iavf/iavf_vchnl.c           | 17 ++++++++++
 8 files changed, 140 insertions(+), 12 deletions(-)

diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 98861e4242..30b05d25b6 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -305,6 +305,7 @@ struct iavf_devargs {
 	uint8_t proto_xtr[IAVF_MAX_QUEUE_NUM];
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -323,6 +324,7 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index ac7154d720..c922c64838 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -37,6 +37,7 @@
 #define IAVF_PROTO_XTR_ARG         "proto_xtr"
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG       "no-poll-on-link-down"
 
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
@@ -45,6 +46,7 @@ static const char * const iavf_valid_args[] = {
 	IAVF_PROTO_XTR_ARG,
 	IAVF_QUANTA_SIZE_ARG,
 	IAVF_RESET_WATCHDOG_ARG,
+	IAVF_NO_POLL_ON_LINK_DOWN_ARG,
 	NULL
 };
 
@@ -2237,6 +2239,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2270,6 +2273,14 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	no_poll_on_link_down = rte_kvargs_count(kvlist,
+		IAVF_NO_POLL_ON_LINK_DOWN_ARG);
+
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index f7df4665d1..447e306fee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -770,6 +770,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -843,6 +844,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -1406,9 +1408,12 @@ iavf_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1515,9 +1520,12 @@ iavf_recv_pkts_flex_rxd(void *rx_queue,
 	const uint32_t *ptype_tbl;
 	uint64_t ts_ns;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1641,6 +1649,9 @@ iavf_recv_scattered_pkts_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_flex_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
 		uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);
 
@@ -1818,6 +1829,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
 		qword1 = rte_le_to_cpu_64(rxdp->wb.qword1.status_error_len);
@@ -1973,6 +1987,9 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	uint64_t ts_ns;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = (volatile union iavf_rx_flex_desc *)&rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2104,6 +2121,9 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint1
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = &rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2281,6 +2301,9 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	struct iavf_rx_queue *rxq = (struct iavf_rx_queue *)rx_queue;
 	uint16_t nb_rx = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (!nb_pkts)
 		return 0;
 
@@ -2768,6 +2791,8 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	uint16_t idx;
 	uint16_t slen;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
 
 	/* Check if the descriptor ring needs to be cleaned. */
 	if (txq->nb_free < txq->free_thresh)
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index c10f24036e..98316bed24 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -26,8 +26,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	/* const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; */
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -36,6 +35,11 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	type_table = rxq->vsi->adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -530,12 +534,12 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -543,6 +547,15 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+	type_table = adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1774,6 +1787,9 @@ iavf_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs(txq);
 
@@ -1834,6 +1850,9 @@ iavf_xmit_pkts_vec_avx2_common(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3e66df5341..8de739434c 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -45,7 +45,7 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       bool offload)
 {
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -53,6 +53,13 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -588,12 +595,12 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 					uint8_t *split_packet,
 					bool offload)
 {
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -602,6 +609,17 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1700,6 +1718,10 @@ iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkt
 					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
@@ -2303,6 +2325,9 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2370,6 +2395,9 @@ iavf_xmit_fixed_burst_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2432,6 +2460,9 @@ iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
@@ -2498,6 +2529,9 @@ iavf_xmit_pkts_vec_avx512_ctx_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 892bfa4cf3..4007ce6f6f 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -479,7 +479,12 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	int pos;
 	uint64_t var;
 	__m128i shuf_msk;
-	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *ptype_tbl;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
 	__m128i crc_adjust = _mm_set_epi16(
 				0, 0, 0,    /* ignore non-length fields */
@@ -1198,6 +1203,11 @@ uint16_t
 iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts)
 {
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi)
+		return 0;
+
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
@@ -1215,6 +1225,9 @@ iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1284,6 +1297,9 @@ iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1437,6 +1453,9 @@ iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 524732f67d..2d0b7bddb3 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -263,6 +263,14 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "IAVF no-poll turned off");
+			}
+			if (!vf->link_up && adapter->devargs.no_poll_on_link_down) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "IAVF no-poll turned on");
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -465,6 +473,15 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (vf->link_up && adapter->no_poll) {
+			adapter->no_poll = false;
+			PMD_DRV_LOG(DEBUG, "IAVF no-poll turned off");
+		}
+		if (!vf->link_up && adapter->devargs.no_poll_on_link_down) {
+			adapter->no_poll = true;
+			PMD_DRV_LOG(DEBUG, "IAVF no-poll turned on");
+		}
+
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] net/iavf: support no data path polling mode
  2023-07-21  9:57     ` [POC v3] " Mingjin Ye
@ 2023-08-11  6:27       ` Mingjin Ye
  2023-09-26  7:56         ` [PATCH v2] " Mingjin Ye
  0 siblings, 1 reply; 16+ messages in thread
From: Mingjin Ye @ 2023-08-11  6:27 UTC (permalink / raw)
  To: dev
  Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Simei Su, Wenjun Wu,
	Yuying Zhang, Beilei Xing, Jingjing Wu, Bruce Richardson,
	Konstantin Ananyev

Currently, during a PF to VF reset triggered by an action such as
changing trust settings on a VF, the DPDK application running with the
iavf PMD loses connectivity, and the only recovery is to restart the
DPDK application.

Instead of forcing a restart of the DPDK application to restore
connectivity, the iavf PMD handles the PF to VF reset event gracefully
by performing all the steps necessary to bring the VF back online.

To minimize downtime, a devargs option "no-poll-on-link-down" is
introduced in the iavf PMD. When this flag is set, the PMD switches to
no-poll mode when the link state is down (Rx/Tx bursts return 0
immediately). When the link state returns to normal, the PMD switches
back to the normal Rx/Tx burst path.
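
The data path side of the change is one guard repeated at the top of
every Rx/Tx burst function in the hunks below, in essence:

	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
		return 0;

so a queue whose VSI is gone, or whose adapter is in no-poll mode,
reports an empty burst instead of touching the descriptor ring.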

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
 doc/guides/nics/intel_vf.rst            |  3 ++
 drivers/net/iavf/iavf.h                 |  2 ++
 drivers/net/iavf/iavf_ethdev.c          | 12 +++++++
 drivers/net/iavf/iavf_rxtx.c            | 29 +++++++++++++++--
 drivers/net/iavf/iavf_rxtx.h            |  1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 29 ++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 42 ++++++++++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_sse.c    | 21 ++++++++++++-
 drivers/net/iavf/iavf_vchnl.c           | 20 ++++++++++++
 9 files changed, 147 insertions(+), 12 deletions(-)

diff --git a/doc/guides/nics/intel_vf.rst b/doc/guides/nics/intel_vf.rst
index d365dbc185..54cfb688b3 100644
--- a/doc/guides/nics/intel_vf.rst
+++ b/doc/guides/nics/intel_vf.rst
@@ -101,6 +101,9 @@ For more detail on SR-IOV, please refer to the following documents:
     Set ``devargs`` parameter ``watchdog_period`` to adjust the watchdog period in microseconds, or set it to 0 to disable the watchdog,
     for example, ``-a 18:01.0,watchdog_period=5000`` or ``-a 18:01.0,watchdog_period=0``.
 
+    Enable vf no-poll-on-link-down by setting the ``devargs`` parameter like ``-a 18:01.0,no_poll_on_link_down=1`` when IAVF is backed
+    by an Intel® E810 device or an Intel® 700 Series Ethernet device.
+
 The PCIE host-interface of Intel Ethernet Switch FM10000 Series VF infrastructure
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 98861e4242..30b05d25b6 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -305,6 +305,7 @@ struct iavf_devargs {
 	uint8_t proto_xtr[IAVF_MAX_QUEUE_NUM];
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -323,6 +324,7 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index f2fc5a5621..2fdc845204 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -37,6 +37,7 @@
 #define IAVF_PROTO_XTR_ARG         "proto_xtr"
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG       "no_poll_on_link_down"
 
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
@@ -45,6 +46,7 @@ static const char * const iavf_valid_args[] = {
 	IAVF_PROTO_XTR_ARG,
 	IAVF_QUANTA_SIZE_ARG,
 	IAVF_RESET_WATCHDOG_ARG,
+	IAVF_NO_POLL_ON_LINK_DOWN_ARG,
 	NULL
 };
 
@@ -2237,6 +2239,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2270,6 +2273,15 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	ret = rte_kvargs_process(kvlist, IAVF_NO_POLL_ON_LINK_DOWN_ARG,
+				 &parse_u16, &no_poll_on_link_down);
+	if (ret)
+		goto bail;
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index f7df4665d1..447e306fee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -770,6 +770,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -843,6 +844,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -1406,9 +1408,12 @@ iavf_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1515,9 +1520,12 @@ iavf_recv_pkts_flex_rxd(void *rx_queue,
 	const uint32_t *ptype_tbl;
 	uint64_t ts_ns;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1641,6 +1649,9 @@ iavf_recv_scattered_pkts_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_flex_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
 		uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);
 
@@ -1818,6 +1829,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
 		qword1 = rte_le_to_cpu_64(rxdp->wb.qword1.status_error_len);
@@ -1973,6 +1987,9 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	uint64_t ts_ns;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = (volatile union iavf_rx_flex_desc *)&rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2104,6 +2121,9 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint1
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = &rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2281,6 +2301,9 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	struct iavf_rx_queue *rxq = (struct iavf_rx_queue *)rx_queue;
 	uint16_t nb_rx = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (!nb_pkts)
 		return 0;
 
@@ -2768,6 +2791,8 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	uint16_t idx;
 	uint16_t slen;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
 
 	/* Check if the descriptor ring needs to be cleaned. */
 	if (txq->nb_free < txq->free_thresh)
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index c10f24036e..98316bed24 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -26,8 +26,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	/* const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; */
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -36,6 +35,11 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	type_table = rxq->vsi->adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -530,12 +534,12 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -543,6 +547,15 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+	type_table = adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1774,6 +1787,9 @@ iavf_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs(txq);
 
@@ -1834,6 +1850,9 @@ iavf_xmit_pkts_vec_avx2_common(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3e66df5341..8de739434c 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -45,7 +45,7 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       bool offload)
 {
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -53,6 +53,13 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -588,12 +595,12 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 					uint8_t *split_packet,
 					bool offload)
 {
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -602,6 +609,17 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1700,6 +1718,10 @@ iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkt
 					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
@@ -2303,6 +2325,9 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2370,6 +2395,9 @@ iavf_xmit_fixed_burst_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2432,6 +2460,9 @@ iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
@@ -2498,6 +2529,9 @@ iavf_xmit_pkts_vec_avx512_ctx_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 892bfa4cf3..4007ce6f6f 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -479,7 +479,12 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	int pos;
 	uint64_t var;
 	__m128i shuf_msk;
-	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *ptype_tbl;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
 	__m128i crc_adjust = _mm_set_epi16(
 				0, 0, 0,    /* ignore non-length fields */
@@ -1198,6 +1203,11 @@ uint16_t
 iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts)
 {
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi)
+		return 0;
+
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
@@ -1215,6 +1225,9 @@ iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1284,6 +1297,9 @@ iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1437,6 +1453,9 @@ iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 524732f67d..80dfda5e11 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -263,6 +263,16 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (adapter->devargs.no_poll_on_link_down) {
+				if (vf->link_up && adapter->no_poll) {
+					adapter->no_poll = false;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+				}
+				if (!vf->link_up) {
+					adapter->no_poll = true;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+				}
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -465,6 +475,16 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (adapter->devargs.no_poll_on_link_down) {
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+			}
+			if (!vf->link_up) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+			}
+		}
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2] net/iavf: support no data path polling mode
  2023-08-11  6:27       ` [PATCH] " Mingjin Ye
@ 2023-09-26  7:56         ` Mingjin Ye
  2023-10-13  1:27           ` [PATCH v3] " Mingjin Ye
  0 siblings, 1 reply; 16+ messages in thread
From: Mingjin Ye @ 2023-09-26  7:56 UTC (permalink / raw)
  To: dev
  Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Wenjun Wu, Simei Su,
	Yuying Zhang, Beilei Xing, Jingjing Wu, Bruce Richardson,
	Konstantin Ananyev

Currently, during a PF to VF reset due to an action such as changing
trust settings on a VF, the DPDK application running with the iavf PMD
loses connectivity, and the only solution is to reset the DPDK
application.

Instead of forcing a reset of the DPDK application to restore
connectivity, the iavf PMD handles the PF to VF reset event
normally by performing all necessary steps to bring the VF back
online.

To minimize downtime, a devarg "no-poll-on-link-down" is introduced
in the iavf PMD. When this flag is set, the PMD switches to no-poll mode
when the link state is down (Rx/Tx bursts return 0 immediately).
When the link state returns to normal, the PMD switches back to the
normal Rx/Tx burst path.
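
As an illustration (not part of the patch), using the VF PCI address from
the documentation hunk below, the devarg would be passed to testpmd as:

    dpdk-testpmd -a 18:01.0,no-poll-on-link-down=1 -- -i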

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
 doc/guides/nics/intel_vf.rst            |  3 ++
 drivers/net/iavf/iavf.h                 |  2 ++
 drivers/net/iavf/iavf_ethdev.c          | 16 +++++++++-
 drivers/net/iavf/iavf_rxtx.c            | 29 +++++++++++++++--
 drivers/net/iavf/iavf_rxtx.h            |  1 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 29 ++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 42 ++++++++++++++++++++++---
 drivers/net/iavf/iavf_rxtx_vec_sse.c    | 21 ++++++++++++-
 drivers/net/iavf/iavf_vchnl.c           | 20 ++++++++++++
 9 files changed, 150 insertions(+), 13 deletions(-)

diff --git a/doc/guides/nics/intel_vf.rst b/doc/guides/nics/intel_vf.rst
index 7613e1c5e5..19c461c3de 100644
--- a/doc/guides/nics/intel_vf.rst
+++ b/doc/guides/nics/intel_vf.rst
@@ -104,6 +104,9 @@ For more detail on SR-IOV, please refer to the following documents:
     Enable vf auto-reset by setting the ``devargs`` parameter like ``-a 18:01.0,auto_reset=1`` when IAVF is backed
     by an Intel® E810 device or an Intel® 700 Series Ethernet device.
 
+    Enable vf no-poll-on-link-down by setting the ``devargs`` parameter like ``-a 18:01.0,no-poll-on-link-down=1`` when IAVF is backed
+    by an Intel® E810 device or an Intel® 700 Series Ethernet device.
+
 The PCIE host-interface of Intel Ethernet Switch FM10000 Series VF infrastructure
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 04774ce124..71cb08f0b1 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -308,6 +308,7 @@ struct iavf_devargs {
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
 	uint8_t  auto_reset;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -326,6 +327,7 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 5b2634a4e3..98cc5c8ea8 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -38,7 +38,7 @@
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
 #define IAVF_ENABLE_AUTO_RESET_ARG "auto_reset"
-
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG "no-poll-on-link-down"
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
 
@@ -47,6 +47,7 @@ static const char * const iavf_valid_args[] = {
 	IAVF_QUANTA_SIZE_ARG,
 	IAVF_RESET_WATCHDOG_ARG,
 	IAVF_ENABLE_AUTO_RESET_ARG,
+	IAVF_NO_POLL_ON_LINK_DOWN_ARG,
 	NULL
 };
 
@@ -2291,6 +2292,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2324,6 +2326,15 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	ret = rte_kvargs_process(kvlist, IAVF_NO_POLL_ON_LINK_DOWN_ARG,
+				 &parse_u16, &no_poll_on_link_down);
+	if (ret)
+		goto bail;
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
@@ -2337,6 +2348,9 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	if (ret)
 		goto bail;
 
+	if (ad->devargs.auto_reset != 0)
+		ad->devargs.no_poll_on_link_down = 1;
+
 bail:
 	rte_kvargs_free(kvlist);
 	return ret;
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0484988d13..a5f63ce30d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -777,6 +777,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -850,6 +851,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -1427,9 +1429,12 @@ iavf_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1536,9 +1541,12 @@ iavf_recv_pkts_flex_rxd(void *rx_queue,
 	const uint32_t *ptype_tbl;
 	uint64_t ts_ns;
 
+	rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	nb_rx = 0;
 	nb_hold = 0;
-	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
 	rx_ring = rxq->rx_ring;
 	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
@@ -1662,6 +1670,9 @@ iavf_recv_scattered_pkts_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_flex_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
 		uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);
 
@@ -1839,6 +1850,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 	volatile union iavf_rx_desc *rxdp;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_rx < nb_pkts) {
 		rxdp = &rx_ring[rx_id];
 		qword1 = rte_le_to_cpu_64(rxdp->wb.qword1.status_error_len);
@@ -1994,6 +2008,9 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	uint64_t ts_ns;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = (volatile union iavf_rx_flex_desc *)&rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2125,6 +2142,9 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint1
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	rxdp = &rxq->rx_ring[rxq->rx_tail];
 	rxep = &rxq->sw_ring[rxq->rx_tail];
 
@@ -2302,6 +2322,9 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	struct iavf_rx_queue *rxq = (struct iavf_rx_queue *)rx_queue;
 	uint16_t nb_rx = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	if (!nb_pkts)
 		return 0;
 
@@ -2793,6 +2816,8 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	uint16_t idx;
 	uint16_t slen;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
 
 	/* Check if the descriptor ring needs to be cleaned. */
 	if (txq->nb_free < txq->free_thresh)
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index 510b4d8f1c..9d8905a95a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -26,8 +26,7 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	/* const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; */
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -36,6 +35,11 @@ _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	type_table = rxq->vsi->adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -530,12 +534,12 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 {
 #define IAVF_DESCS_PER_LOOP_AVX 8
 
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -543,6 +547,15 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+	type_table = adapter->ptype_tbl;
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1774,6 +1787,9 @@ iavf_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs(txq);
 
@@ -1834,6 +1850,9 @@ iavf_xmit_pkts_vec_avx2_common(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 7a7df6d258..ff79388ba6 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -45,7 +45,7 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       bool offload)
 {
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -53,6 +53,13 @@ _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -588,12 +595,12 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 					uint8_t *split_packet,
 					bool offload)
 {
-	struct iavf_adapter *adapter = rxq->vsi->adapter;
+	struct iavf_adapter *adapter;
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+	uint64_t offloads;
 #endif
 #ifdef IAVF_RX_PTYPE_OFFLOAD
-	const uint32_t *type_table = adapter->ptype_tbl;
+	const uint32_t *type_table;
 #endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
@@ -602,6 +609,17 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
 	volatile union iavf_rx_flex_desc *rxdp =
 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	adapter = rxq->vsi->adapter;
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	offloads = adapter->dev_data->dev_conf.rxmode.offloads;
+#endif
+#ifdef IAVF_RX_PTYPE_OFFLOAD
+	type_table = adapter->ptype_tbl;
+#endif
+
 	rte_prefetch0(rxdp);
 
 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
@@ -1700,6 +1718,10 @@ iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkt
 					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
@@ -2303,6 +2325,9 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2370,6 +2395,9 @@ iavf_xmit_fixed_burst_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	if (txq->nb_free < txq->free_thresh)
 		iavf_tx_free_bufs_avx512(txq);
 
@@ -2432,6 +2460,9 @@ iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
@@ -2497,6 +2528,9 @@ iavf_xmit_pkts_vec_avx512_ctx_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 96f187f511..86e748d6d1 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -479,7 +479,12 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	int pos;
 	uint64_t var;
 	__m128i shuf_msk;
-	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	const uint32_t *ptype_tbl;
+
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
 	__m128i crc_adjust = _mm_set_epi16(
 				0, 0, 0,    /* ignore non-length fields */
@@ -1198,6 +1203,11 @@ uint16_t
 iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts)
 {
+	struct iavf_rx_queue *rxq = rx_queue;
+
+	if (!rxq->vsi)
+		return 0;
+
 	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
@@ -1215,6 +1225,9 @@ iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1284,6 +1297,9 @@ iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue,
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 	unsigned int i = 0;
 
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
 					      split_flags);
@@ -1437,6 +1453,9 @@ iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
 	while (nb_pkts) {
 		uint16_t ret, num;
 
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 7f49eb2c1e..0a3e1d082c 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -272,6 +272,16 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (adapter->devargs.no_poll_on_link_down) {
+				if (vf->link_up && adapter->no_poll) {
+					adapter->no_poll = false;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+				}
+				if (!vf->link_up) {
+					adapter->no_poll = true;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+				}
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -474,6 +484,16 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (adapter->devargs.no_poll_on_link_down) {
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+			}
+			if (!vf->link_up) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+			}
+		}
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v3] net/iavf: support no data path polling mode
  2023-09-26  7:56         ` [PATCH v2] " Mingjin Ye
@ 2023-10-13  1:27           ` Mingjin Ye
  2023-10-17  1:44             ` [PATCH v4] " Mingjin Ye
  2023-10-17  2:19             ` Mingjin Ye
  0 siblings, 2 replies; 16+ messages in thread
From: Mingjin Ye @ 2023-10-13  1:27 UTC (permalink / raw)
  To: dev
  Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Simei Su, Wenjun Wu,
	Yuying Zhang, Beilei Xing, Jingjing Wu

Currently, during a PF to VF reset due to an action such as changing
trust settings on a VF, the DPDK application running with the iavf PMD
loses connectivity, and the only solution is to reset the DPDK
application.

Instead of forcing a reset of the DPDK application to restore
connectivity, the iavf PMD handles the PF to VF reset event
normally by performing all necessary steps to bring the VF back
online.

To minimize downtime, a devarg "no-poll-on-link-down" is introduced
in the iavf PMD. When this flag is set, the PMD switches to no-poll mode
when the link state is down (Rx/Tx bursts return 0 immediately).
When the link state returns to normal, the PMD switches back to the
normal Rx/Tx burst path.

NOTE: The DPDK application needs to handle the
RTE_ETH_EVENT_INTR_RESET event posted by the iavf PMD and reset
the VF upon receipt of this event.
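
A rough application-side sketch of that handling (assumed helper names,
not code from this patch): record the event in a callback and perform
the reset from the application's own context, since the callback runs in
the interrupt thread.

#include <stdatomic.h>
#include <rte_ethdev.h>

static atomic_bool reset_pending;

/* Interrupt-thread callback: only record that a reset was requested. */
static int
event_cb(uint16_t port_id, enum rte_eth_event_type event,
	 void *cb_arg, void *ret_param)
{
	(void)port_id; (void)cb_arg; (void)ret_param;
	if (event == RTE_ETH_EVENT_INTR_RESET)
		atomic_store(&reset_pending, true);
	return 0;
}

/* Call once after port setup. */
static void
register_reset_handler(uint16_t port_id)
{
	rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_RESET,
				      event_cb, NULL);
}

/* Poll this from the main loop between bursts. */
static void
maybe_reset(uint16_t port_id)
{
	if (!atomic_exchange(&reset_pending, false))
		return;
	rte_eth_dev_stop(port_id);
	if (rte_eth_dev_reset(port_id) == 0) {
		/* Reconfigure the port and queues here, then restart. */
		rte_eth_dev_start(port_id);
	}
}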

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
V3: Remove redundant code.
---
 doc/guides/nics/intel_vf.rst   |  3 ++
 drivers/net/iavf/iavf.h        |  4 +++
 drivers/net/iavf/iavf_ethdev.c | 16 +++++++++-
 drivers/net/iavf/iavf_rxtx.c   | 53 ++++++++++++++++++++++++++++++++++
 drivers/net/iavf/iavf_rxtx.h   |  1 +
 drivers/net/iavf/iavf_vchnl.c  | 20 +++++++++++++
 6 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/intel_vf.rst b/doc/guides/nics/intel_vf.rst
index 7613e1c5e5..19c461c3de 100644
--- a/doc/guides/nics/intel_vf.rst
+++ b/doc/guides/nics/intel_vf.rst
@@ -104,6 +104,9 @@ For more detail on SR-IOV, please refer to the following documents:
     Enable vf auto-reset by setting the ``devargs`` parameter like ``-a 18:01.0,auto_reset=1`` when IAVF is backed
     by an Intel® E810 device or an Intel® 700 Series Ethernet device.
 
+    Enable vf no-poll-on-link-down by setting the ``devargs`` parameter like ``-a 18:01.0,no-poll-on-link-down=1`` when IAVF is backed
+    by an Intel® E810 device or an Intel® 700 Series Ethernet device.
+
 The PCIE host-interface of Intel Ethernet Switch FM10000 Series VF infrastructure
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 04774ce124..c115f3444e 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -308,6 +308,7 @@ struct iavf_devargs {
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
 	uint8_t  auto_reset;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -326,6 +327,9 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
+	eth_rx_burst_t rx_pkt_burst;
+	eth_tx_burst_t tx_pkt_burst;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 5b2634a4e3..98cc5c8ea8 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -38,7 +38,7 @@
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
 #define IAVF_ENABLE_AUTO_RESET_ARG "auto_reset"
-
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG "no-poll-on-link-down"
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
 
@@ -47,6 +47,7 @@ static const char * const iavf_valid_args[] = {
 	IAVF_QUANTA_SIZE_ARG,
 	IAVF_RESET_WATCHDOG_ARG,
 	IAVF_ENABLE_AUTO_RESET_ARG,
+	IAVF_NO_POLL_ON_LINK_DOWN_ARG,
 	NULL
 };
 
@@ -2291,6 +2292,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2324,6 +2326,15 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	ret = rte_kvargs_process(kvlist, IAVF_NO_POLL_ON_LINK_DOWN_ARG,
+				 &parse_u16, &no_poll_on_link_down);
+	if (ret)
+		goto bail;
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
@@ -2337,6 +2348,9 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	if (ret)
 		goto bail;
 
+	if (ad->devargs.auto_reset != 0)
+		ad->devargs.no_poll_on_link_down = 1;
+
 bail:
 	rte_kvargs_free(kvlist);
 	return ret;
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0484988d13..7feadee7d0 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -777,6 +777,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -850,6 +851,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -3707,6 +3709,30 @@ iavf_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 	return i;
 }
 
+static uint16_t
+iavf_recv_pkts_no_poll(void *rx_queue, struct rte_mbuf **rx_pkts,
+				uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	return rxq->vsi->adapter->rx_pkt_burst(rx_queue,
+								rx_pkts, nb_pkts);
+}
+
+static uint16_t
+iavf_xmit_pkts_no_poll(void *tx_queue, struct rte_mbuf **tx_pkts,
+				uint16_t nb_pkts)
+{
+	struct iavf_tx_queue *txq = tx_queue;
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
+	return txq->vsi->adapter->tx_pkt_burst(tx_queue,
+								tx_pkts, nb_pkts);
+}
+
 /* choose rx function*/
 void
 iavf_set_rx_function(struct rte_eth_dev *dev)
@@ -3714,6 +3740,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 	struct iavf_adapter *adapter =
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
 	int i;
 	struct iavf_rx_queue *rxq;
 	bool use_flex = true;
@@ -3891,6 +3918,10 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			}
 		}
 
+		if (no_poll_on_link_down) {
+			adapter->rx_pkt_burst = dev->rx_pkt_burst;
+			dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+		}
 		return;
 	}
 #elif defined RTE_ARCH_ARM
@@ -3906,6 +3937,11 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			(void)iavf_rxq_vec_setup(rxq);
 		}
 		dev->rx_pkt_burst = iavf_recv_pkts_vec;
+
+		if (no_poll_on_link_down) {
+			adapter->rx_pkt_burst = dev->rx_pkt_burst;
+			dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+		}
 		return;
 	}
 #endif
@@ -3928,12 +3964,20 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 		else
 			dev->rx_pkt_burst = iavf_recv_pkts;
 	}
+
+	if (no_poll_on_link_down) {
+		adapter->rx_pkt_burst = dev->rx_pkt_burst;
+		dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+	}
 }
 
 /* choose tx function*/
 void
 iavf_set_tx_function(struct rte_eth_dev *dev)
 {
+	struct iavf_adapter *adapter =
+		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
+	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
@@ -4022,6 +4066,10 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 #endif
 		}
 
+		if (no_poll_on_link_down) {
+			adapter->tx_pkt_burst = dev->tx_pkt_burst;
+			dev->tx_pkt_burst = iavf_xmit_pkts_no_poll;
+		}
 		return;
 	}
 
@@ -4031,6 +4079,11 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
 	dev->tx_pkt_prepare = iavf_prep_pkts;
+
+	if (no_poll_on_link_down) {
+		adapter->tx_pkt_burst = dev->tx_pkt_burst;
+		dev->tx_pkt_burst = iavf_xmit_pkts_no_poll;
+	}
 }
 
 static int
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 7f49eb2c1e..0a3e1d082c 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -272,6 +272,16 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (adapter->devargs.no_poll_on_link_down) {
+				if (vf->link_up && adapter->no_poll) {
+					adapter->no_poll = false;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+				}
+				if (!vf->link_up) {
+					adapter->no_poll = true;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+				}
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -474,6 +484,16 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (adapter->devargs.no_poll_on_link_down) {
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+			}
+			if (!vf->link_up) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+			}
+		}
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v4] net/iavf: support no data path polling mode
  2023-10-13  1:27           ` [PATCH v3] " Mingjin Ye
@ 2023-10-17  1:44             ` Mingjin Ye
  2023-10-17  2:19             ` Mingjin Ye
  1 sibling, 0 replies; 16+ messages in thread
From: Mingjin Ye @ 2023-10-17  1:44 UTC (permalink / raw)
  To: dev
  Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Yuying Zhang, Beilei Xing,
	Wenjun Wu, Simei Su, Jingjing Wu

Currently, during a PF to VF reset due to an action such as changing
trust settings on a VF, the DPDK application running with the iavf PMD
loses connectivity, and the only solution is to reset the DPDK
application.

Instead of forcing a reset of the DPDK application to restore
connectivity, the iavf PMD handles the PF to VF reset event
normally by performing all necessary steps to bring the VF back
online.

To minimize downtime, a devarg "no-poll-on-link-down" is introduced
in the iavf PMD. When this flag is set, the PMD switches to no-poll mode
when the link state is down (Rx/Tx bursts return 0 immediately).
When the link state returns to normal, the PMD switches back to the
normal Rx/Tx burst path.

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
V3: Remove redundant code.
---
v4: Delete the git log note.
---
 doc/guides/nics/intel_vf.rst   |  3 ++
 drivers/net/iavf/iavf.h        |  4 +++
 drivers/net/iavf/iavf_ethdev.c | 16 +++++++++-
 drivers/net/iavf/iavf_rxtx.c   | 53 ++++++++++++++++++++++++++++++++++
 drivers/net/iavf/iavf_rxtx.h   |  1 +
 drivers/net/iavf/iavf_vchnl.c  | 20 +++++++++++++
 6 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/intel_vf.rst b/doc/guides/nics/intel_vf.rst
index 7613e1c5e5..19c461c3de 100644
--- a/doc/guides/nics/intel_vf.rst
+++ b/doc/guides/nics/intel_vf.rst
@@ -104,6 +104,9 @@ For more detail on SR-IOV, please refer to the following documents:
     Enable vf auto-reset by setting the ``devargs`` parameter like ``-a 18:01.0,auto_reset=1`` when IAVF is backed
     by an Intel® E810 device or an Intel® 700 Series Ethernet device.
 
+    Enable vf no-poll-on-link-down by setting the ``devargs`` parameter like ``-a 18:01.0,no-poll-on-link-down=1`` when IAVF is backed
+    by an Intel® E810 device or an Intel® 700 Series Ethernet device.
+
 The PCIE host-interface of Intel Ethernet Switch FM10000 Series VF infrastructure
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 04774ce124..c115f3444e 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -308,6 +308,7 @@ struct iavf_devargs {
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
 	uint8_t  auto_reset;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -326,6 +327,9 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
+	eth_rx_burst_t rx_pkt_burst;
+	eth_tx_burst_t tx_pkt_burst;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 5b2634a4e3..98cc5c8ea8 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -38,7 +38,7 @@
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
 #define IAVF_ENABLE_AUTO_RESET_ARG "auto_reset"
-
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG "no-poll-on-link-down"
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
 
@@ -47,6 +47,7 @@ static const char * const iavf_valid_args[] = {
 	IAVF_QUANTA_SIZE_ARG,
 	IAVF_RESET_WATCHDOG_ARG,
 	IAVF_ENABLE_AUTO_RESET_ARG,
+	IAVF_NO_POLL_ON_LINK_DOWN_ARG,
 	NULL
 };
 
@@ -2291,6 +2292,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2324,6 +2326,15 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	ret = rte_kvargs_process(kvlist, IAVF_NO_POLL_ON_LINK_DOWN_ARG,
+				 &parse_u16, &no_poll_on_link_down);
+	if (ret)
+		goto bail;
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
@@ -2337,6 +2348,9 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	if (ret)
 		goto bail;
 
+	if (ad->devargs.auto_reset != 0)
+		ad->devargs.no_poll_on_link_down = 1;
+
 bail:
 	rte_kvargs_free(kvlist);
 	return ret;
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index c6ef6af1d8..72263870a4 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -777,6 +777,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -850,6 +851,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -3703,6 +3705,30 @@ iavf_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 	return i;
 }
 
+static uint16_t
+iavf_recv_pkts_no_poll(void *rx_queue, struct rte_mbuf **rx_pkts,
+				uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	return rxq->vsi->adapter->rx_pkt_burst(rx_queue,
+								rx_pkts, nb_pkts);
+}
+
+static uint16_t
+iavf_xmit_pkts_no_poll(void *tx_queue, struct rte_mbuf **tx_pkts,
+				uint16_t nb_pkts)
+{
+	struct iavf_tx_queue *txq = tx_queue;
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
+	return txq->vsi->adapter->tx_pkt_burst(tx_queue,
+								tx_pkts, nb_pkts);
+}
+
 /* choose rx function*/
 void
 iavf_set_rx_function(struct rte_eth_dev *dev)
@@ -3710,6 +3736,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 	struct iavf_adapter *adapter =
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
 	int i;
 	struct iavf_rx_queue *rxq;
 	bool use_flex = true;
@@ -3887,6 +3914,10 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			}
 		}
 
+		if (no_poll_on_link_down) {
+			adapter->rx_pkt_burst = dev->rx_pkt_burst;
+			dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+		}
 		return;
 	}
 #elif defined RTE_ARCH_ARM
@@ -3902,6 +3933,11 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			(void)iavf_rxq_vec_setup(rxq);
 		}
 		dev->rx_pkt_burst = iavf_recv_pkts_vec;
+
+		if (no_poll_on_link_down) {
+			adapter->rx_pkt_burst = dev->rx_pkt_burst;
+			dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+		}
 		return;
 	}
 #endif
@@ -3924,12 +3960,20 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 		else
 			dev->rx_pkt_burst = iavf_recv_pkts;
 	}
+
+	if (no_poll_on_link_down) {
+		adapter->rx_pkt_burst = dev->rx_pkt_burst;
+		dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+	}
 }
 
 /* choose tx function*/
 void
 iavf_set_tx_function(struct rte_eth_dev *dev)
 {
+	struct iavf_adapter *adapter =
+		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
+	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
@@ -4018,6 +4062,10 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 #endif
 		}
 
+		if (no_poll_on_link_down) {
+			adapter->tx_pkt_burst = dev->tx_pkt_burst;
+			dev->tx_pkt_burst = iavf_xmit_pkts_no_poll;
+		}
 		return;
 	}
 
@@ -4027,6 +4075,11 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
 	dev->tx_pkt_prepare = iavf_prep_pkts;
+
+	if (no_poll_on_link_down) {
+		adapter->tx_pkt_burst = dev->tx_pkt_burst;
+		dev->tx_pkt_burst = iavf_xmit_pkts_no_poll;
+	}
 }
 
 static int
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 7f49eb2c1e..0a3e1d082c 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -272,6 +272,16 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (adapter->devargs.no_poll_on_link_down) {
+				if (vf->link_up && adapter->no_poll) {
+					adapter->no_poll = false;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+				}
+				if (!vf->link_up) {
+					adapter->no_poll = true;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+				}
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -474,6 +484,16 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (adapter->devargs.no_poll_on_link_down) {
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+			}
+			if (!vf->link_up) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+			}
+		}
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v4] net/iavf: support no data path polling mode
  2023-10-13  1:27           ` [PATCH v3] " Mingjin Ye
  2023-10-17  1:44             ` [PATCH v4] " Mingjin Ye
@ 2023-10-17  2:19             ` Mingjin Ye
  2023-10-19  9:04               ` [PATCH v5] net/iavf: data paths support no-polling mode Mingjin Ye
  1 sibling, 1 reply; 16+ messages in thread
From: Mingjin Ye @ 2023-10-17  2:19 UTC (permalink / raw)
  To: dev
  Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Wenjun Wu, Simei Su,
	Yuying Zhang, Beilei Xing, Jingjing Wu

Currently, during a PF to VF reset due to an action such as changing
trust settings on a VF, the DPDK application running with the iavf PMD
loses connectivity, and the only solution is to reset the DPDK
application.

Instead of forcing a reset of the DPDK application to restore
connectivity, the iavf PMD handles the PF to VF reset event
normally by performing all necessary steps to bring the VF back
online.

To minimize downtime, a devarg "no-poll-on-link-down" is introduced
in the iavf PMD. When this flag is set, the PMD switches to no-poll mode
when the link state is down (Rx/Tx bursts return 0 immediately).
When the link state returns to normal, the PMD switches back to the
normal Rx/Tx burst path.

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
V3: Remove redundant code.
---
v4: Delete the git log note.
---
 doc/guides/nics/intel_vf.rst   |  3 ++
 drivers/net/iavf/iavf.h        |  4 +++
 drivers/net/iavf/iavf_ethdev.c | 16 +++++++++-
 drivers/net/iavf/iavf_rxtx.c   | 53 ++++++++++++++++++++++++++++++++++
 drivers/net/iavf/iavf_rxtx.h   |  1 +
 drivers/net/iavf/iavf_vchnl.c  | 20 +++++++++++++
 6 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/intel_vf.rst b/doc/guides/nics/intel_vf.rst
index e06d62a873..df298c6086 100644
--- a/doc/guides/nics/intel_vf.rst
+++ b/doc/guides/nics/intel_vf.rst
@@ -107,6 +107,9 @@ For more detail on SR-IOV, please refer to the following documents:
     when IAVF is backed by an Intel\ |reg| E810 device
     or an Intel\ |reg| 700 Series Ethernet device.
 
+    Enable vf no-poll-on-link-down by setting the ``devargs`` parameter like ``-a 18:01.0,no-poll-on-link-down=1`` when IAVF is backed
+    by an Intel® E810 device or an Intel® 700 Series Ethernet device.
+
 The PCIE host-interface of Intel Ethernet Switch FM10000 Series VF infrastructure
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 04774ce124..c115f3444e 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -308,6 +308,7 @@ struct iavf_devargs {
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
 	uint8_t  auto_reset;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -326,6 +327,9 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
+	eth_rx_burst_t rx_pkt_burst;
+	eth_tx_burst_t tx_pkt_burst;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 5b2634a4e3..98cc5c8ea8 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -38,7 +38,7 @@
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
 #define IAVF_ENABLE_AUTO_RESET_ARG "auto_reset"
-
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG "no-poll-on-link-down"
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
 
@@ -47,6 +47,7 @@ static const char * const iavf_valid_args[] = {
 	IAVF_QUANTA_SIZE_ARG,
 	IAVF_RESET_WATCHDOG_ARG,
 	IAVF_ENABLE_AUTO_RESET_ARG,
+	IAVF_NO_POLL_ON_LINK_DOWN_ARG,
 	NULL
 };
 
@@ -2291,6 +2292,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2324,6 +2326,15 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	ret = rte_kvargs_process(kvlist, IAVF_NO_POLL_ON_LINK_DOWN_ARG,
+				 &parse_u16, &no_poll_on_link_down);
+	if (ret)
+		goto bail;
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
@@ -2337,6 +2348,9 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	if (ret)
 		goto bail;
 
+	if (ad->devargs.auto_reset != 0)
+		ad->devargs.no_poll_on_link_down = 1;
+
 bail:
 	rte_kvargs_free(kvlist);
 	return ret;
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index c6ef6af1d8..72263870a4 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -777,6 +777,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -850,6 +851,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -3703,6 +3705,30 @@ iavf_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 	return i;
 }
 
+static uint16_t
+iavf_recv_pkts_no_poll(void *rx_queue, struct rte_mbuf **rx_pkts,
+				uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	return rxq->vsi->adapter->rx_pkt_burst(rx_queue,
+								rx_pkts, nb_pkts);
+}
+
+static uint16_t
+iavf_xmit_pkts_no_poll(void *tx_queue, struct rte_mbuf **tx_pkts,
+				uint16_t nb_pkts)
+{
+	struct iavf_tx_queue *txq = tx_queue;
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
+	return txq->vsi->adapter->tx_pkt_burst(tx_queue,
+								tx_pkts, nb_pkts);
+}
+
 /* choose rx function*/
 void
 iavf_set_rx_function(struct rte_eth_dev *dev)
@@ -3710,6 +3736,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 	struct iavf_adapter *adapter =
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
 	int i;
 	struct iavf_rx_queue *rxq;
 	bool use_flex = true;
@@ -3887,6 +3914,10 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			}
 		}
 
+		if (no_poll_on_link_down) {
+			adapter->rx_pkt_burst = dev->rx_pkt_burst;
+			dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+		}
 		return;
 	}
 #elif defined RTE_ARCH_ARM
@@ -3902,6 +3933,11 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			(void)iavf_rxq_vec_setup(rxq);
 		}
 		dev->rx_pkt_burst = iavf_recv_pkts_vec;
+
+		if (no_poll_on_link_down) {
+			adapter->rx_pkt_burst = dev->rx_pkt_burst;
+			dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+		}
 		return;
 	}
 #endif
@@ -3924,12 +3960,20 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 		else
 			dev->rx_pkt_burst = iavf_recv_pkts;
 	}
+
+	if (no_poll_on_link_down) {
+		adapter->rx_pkt_burst = dev->rx_pkt_burst;
+		dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+	}
 }
 
 /* choose tx function*/
 void
 iavf_set_tx_function(struct rte_eth_dev *dev)
 {
+	struct iavf_adapter *adapter =
+		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
+	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
@@ -4018,6 +4062,10 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 #endif
 		}
 
+		if (no_poll_on_link_down) {
+			adapter->tx_pkt_burst = dev->tx_pkt_burst;
+			dev->tx_pkt_burst = iavf_xmit_pkts_no_poll;
+		}
 		return;
 	}
 
@@ -4027,6 +4075,11 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
 	dev->tx_pkt_prepare = iavf_prep_pkts;
+
+	if (no_poll_on_link_down) {
+		adapter->tx_pkt_burst = dev->tx_pkt_burst;
+		dev->tx_pkt_burst = iavf_xmit_pkts_no_poll;
+	}
 }
 
 static int
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 7f49eb2c1e..0a3e1d082c 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -272,6 +272,16 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (adapter->devargs.no_poll_on_link_down) {
+				if (vf->link_up && adapter->no_poll) {
+					adapter->no_poll = false;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+				}
+				if (!vf->link_up) {
+					adapter->no_poll = true;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+				}
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -474,6 +484,16 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (adapter->devargs.no_poll_on_link_down) {
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+			}
+			if (!vf->link_up) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+			}
+		}
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v5] net/iavf: data paths support no-polling mode
  2023-10-17  2:19             ` Mingjin Ye
@ 2023-10-19  9:04               ` Mingjin Ye
  2023-10-20  0:39                 ` Zhang, Qi Z
  0 siblings, 1 reply; 16+ messages in thread
From: Mingjin Ye @ 2023-10-19  9:04 UTC (permalink / raw)
  To: dev
  Cc: qiming.yang, yidingx.zhou, Mingjin Ye, Simei Su, Wenjun Wu,
	Yuying Zhang, Beilei Xing, Jingjing Wu

In a scenario involving a hot firmware upgrade, the network device on
the host side needs to be reset, potentially causing the hardware queues
to become unreachable. In a VM, continuing to run VF PMD Rx/Tx during
this process can lead to an application crash.

The solution is to implement a 'no-polling' Rx and Tx wrapper. This
wrapper checks the link status and returns immediately if the link is
down. This is especially important because link down events continue
to be sent from the PF to the VF during a firmware hot upgrade, and
such an event always arrives before the RESET IMPENDING event.

The no-polling Rx/Tx mechanism is only active when the devarg
"no-poll-on-link-down" is enabled; it is mainly recommended for this
specific hot upgrade scenario. Ideally, "no-poll-on-link-down" should
be used in conjunction with the devarg "auto-reset" to provide a
seamless, user-friendly experience within the VM.
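
A standalone model of the wrapper pattern (illustrative only; the real
wrappers are in the diff below): the burst function chosen at setup time
is saved in the adapter, and the installed wrapper short-circuits while
no_poll is set.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t (*burst_fn)(void *queue, void **pkts, uint16_t n);

struct adapter {
	bool no_poll;            /* flipped by link up/down events */
	burst_fn real_rx_burst;  /* burst function chosen at setup */
};

struct rx_queue {
	struct adapter *adapter;
};

/* Stand-in for the real datapath: pretend every slot was filled. */
static uint16_t
real_rx_burst(void *queue, void **pkts, uint16_t n)
{
	(void)queue; (void)pkts;
	return n;
}

/* The wrapper installed in place of the real burst function. */
static uint16_t
rx_burst_no_poll(void *queue, void **pkts, uint16_t n)
{
	struct rx_queue *rxq = queue;

	if (rxq->adapter->no_poll)
		return 0; /* link down: do not touch the hardware ring */
	return rxq->adapter->real_rx_burst(queue, pkts, n);
}

int
main(void)
{
	struct adapter ad = { .no_poll = false, .real_rx_burst = real_rx_burst };
	struct rx_queue q = { .adapter = &ad };

	printf("%u\n", rx_burst_no_poll(&q, NULL, 4)); /* prints 4 */
	ad.no_poll = true;  /* link went down */
	printf("%u\n", rx_burst_no_poll(&q, NULL, 4)); /* prints 0 */
	return 0;
}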

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
V3: Remove redundant code.
---
v4: Delete the git log note.
---
v5: Optimize the commit log
---
 doc/guides/nics/intel_vf.rst   |  3 ++
 drivers/net/iavf/iavf.h        |  4 +++
 drivers/net/iavf/iavf_ethdev.c | 16 +++++++++-
 drivers/net/iavf/iavf_rxtx.c   | 53 ++++++++++++++++++++++++++++++++++
 drivers/net/iavf/iavf_rxtx.h   |  1 +
 drivers/net/iavf/iavf_vchnl.c  | 20 +++++++++++++
 6 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/intel_vf.rst b/doc/guides/nics/intel_vf.rst
index e06d62a873..df298c6086 100644
--- a/doc/guides/nics/intel_vf.rst
+++ b/doc/guides/nics/intel_vf.rst
@@ -107,6 +107,9 @@ For more detail on SR-IOV, please refer to the following documents:
     when IAVF is backed by an Intel\ |reg| E810 device
     or an Intel\ |reg| 700 Series Ethernet device.
 
+    Enable vf no-poll-on-link-down by setting the ``devargs`` parameter like ``-a 18:01.0,no-poll-on-link-down=1`` when IAVF is backed
+    by an Intel® E810 device or an Intel® 700 Series Ethernet device.
+
 The PCIE host-interface of Intel Ethernet Switch FM10000 Series VF infrastructure
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 04774ce124..c115f3444e 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -308,6 +308,7 @@ struct iavf_devargs {
 	uint16_t quanta_size;
 	uint32_t watchdog_period;
 	uint8_t  auto_reset;
+	uint16_t no_poll_on_link_down;
 };
 
 struct iavf_security_ctx;
@@ -326,6 +327,9 @@ struct iavf_adapter {
 	uint32_t ptype_tbl[IAVF_MAX_PKT_TYPE] __rte_cache_min_aligned;
 	bool stopped;
 	bool closed;
+	bool no_poll;
+	eth_rx_burst_t rx_pkt_burst;
+	eth_tx_burst_t tx_pkt_burst;
 	uint16_t fdir_ref_cnt;
 	struct iavf_devargs devargs;
 };
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 5b2634a4e3..98cc5c8ea8 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -38,7 +38,7 @@
 #define IAVF_QUANTA_SIZE_ARG       "quanta_size"
 #define IAVF_RESET_WATCHDOG_ARG    "watchdog_period"
 #define IAVF_ENABLE_AUTO_RESET_ARG "auto_reset"
-
+#define IAVF_NO_POLL_ON_LINK_DOWN_ARG "no-poll-on-link-down"
 uint64_t iavf_timestamp_dynflag;
 int iavf_timestamp_dynfield_offset = -1;
 
@@ -47,6 +47,7 @@ static const char * const iavf_valid_args[] = {
 	IAVF_QUANTA_SIZE_ARG,
 	IAVF_RESET_WATCHDOG_ARG,
 	IAVF_ENABLE_AUTO_RESET_ARG,
+	IAVF_NO_POLL_ON_LINK_DOWN_ARG,
 	NULL
 };
 
@@ -2291,6 +2292,7 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	struct rte_kvargs *kvlist;
 	int ret;
 	int watchdog_period = -1;
+	uint16_t no_poll_on_link_down;
 
 	if (!devargs)
 		return 0;
@@ -2324,6 +2326,15 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	else
 		ad->devargs.watchdog_period = watchdog_period;
 
+	ret = rte_kvargs_process(kvlist, IAVF_NO_POLL_ON_LINK_DOWN_ARG,
+				 &parse_u16, &no_poll_on_link_down);
+	if (ret)
+		goto bail;
+	if (no_poll_on_link_down == 0)
+		ad->devargs.no_poll_on_link_down = 0;
+	else
+		ad->devargs.no_poll_on_link_down = 1;
+
 	if (ad->devargs.quanta_size != 0 &&
 	    (ad->devargs.quanta_size < 256 || ad->devargs.quanta_size > 4096 ||
 	     ad->devargs.quanta_size & 0x40)) {
@@ -2337,6 +2348,9 @@ static int iavf_parse_devargs(struct rte_eth_dev *dev)
 	if (ret)
 		goto bail;
 
+	if (ad->devargs.auto_reset != 0)
+		ad->devargs.no_poll_on_link_down = 1;
+
 bail:
 	rte_kvargs_free(kvlist);
 	return ret;
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index c6ef6af1d8..72263870a4 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -777,6 +777,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf =
 		IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	struct iavf_vsi *vsi = &vf->vsi;
 	struct iavf_tx_queue *txq;
 	const struct rte_memzone *mz;
 	uint32_t ring_size;
@@ -850,6 +851,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq->port_id = dev->data->port_id;
 	txq->offloads = offloads;
 	txq->tx_deferred_start = tx_conf->tx_deferred_start;
+	txq->vsi = vsi;
 
 	if (iavf_ipsec_crypto_supported(adapter))
 		txq->ipsec_crypto_pkt_md_offset =
@@ -3703,6 +3705,30 @@ iavf_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 	return i;
 }
 
+static uint16_t
+iavf_recv_pkts_no_poll(void *rx_queue, struct rte_mbuf **rx_pkts,
+				uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	if (!rxq->vsi || rxq->vsi->adapter->no_poll)
+		return 0;
+
+	return rxq->vsi->adapter->rx_pkt_burst(rx_queue,
+								rx_pkts, nb_pkts);
+}
+
+static uint16_t
+iavf_xmit_pkts_no_poll(void *tx_queue, struct rte_mbuf **tx_pkts,
+				uint16_t nb_pkts)
+{
+	struct iavf_tx_queue *txq = tx_queue;
+	if (!txq->vsi || txq->vsi->adapter->no_poll)
+		return 0;
+
+	return txq->vsi->adapter->tx_pkt_burst(tx_queue,
+								tx_pkts, nb_pkts);
+}
+
 /* choose rx function*/
 void
 iavf_set_rx_function(struct rte_eth_dev *dev)
@@ -3710,6 +3736,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 	struct iavf_adapter *adapter =
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
 	int i;
 	struct iavf_rx_queue *rxq;
 	bool use_flex = true;
@@ -3887,6 +3914,10 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			}
 		}
 
+		if (no_poll_on_link_down) {
+			adapter->rx_pkt_burst = dev->rx_pkt_burst;
+			dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+		}
 		return;
 	}
 #elif defined RTE_ARCH_ARM
@@ -3902,6 +3933,11 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			(void)iavf_rxq_vec_setup(rxq);
 		}
 		dev->rx_pkt_burst = iavf_recv_pkts_vec;
+
+		if (no_poll_on_link_down) {
+			adapter->rx_pkt_burst = dev->rx_pkt_burst;
+			dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+		}
 		return;
 	}
 #endif
@@ -3924,12 +3960,20 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 		else
 			dev->rx_pkt_burst = iavf_recv_pkts;
 	}
+
+	if (no_poll_on_link_down) {
+		adapter->rx_pkt_burst = dev->rx_pkt_burst;
+		dev->rx_pkt_burst = iavf_recv_pkts_no_poll;
+	}
 }
 
 /* choose tx function*/
 void
 iavf_set_tx_function(struct rte_eth_dev *dev)
 {
+	struct iavf_adapter *adapter =
+		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
+	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
@@ -4018,6 +4062,10 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 #endif
 		}
 
+		if (no_poll_on_link_down) {
+			adapter->tx_pkt_burst = dev->tx_pkt_burst;
+			dev->tx_pkt_burst = iavf_xmit_pkts_no_poll;
+		}
 		return;
 	}
 
@@ -4027,6 +4075,11 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
 	dev->tx_pkt_prepare = iavf_prep_pkts;
+
+	if (no_poll_on_link_down) {
+		adapter->tx_pkt_burst = dev->tx_pkt_burst;
+		dev->tx_pkt_burst = iavf_xmit_pkts_no_poll;
+	}
 }
 
 static int
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 605ea3f824..d3324e0e6e 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -288,6 +288,7 @@ struct iavf_tx_queue {
 	uint16_t free_thresh;
 	uint16_t rs_thresh;
 	uint8_t rel_mbufs_type;
+	struct iavf_vsi *vsi; /**< the VSI this queue belongs to */
 
 	uint16_t port_id;
 	uint16_t queue_id;
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 7f49eb2c1e..0a3e1d082c 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -272,6 +272,16 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 				if (!vf->link_up)
 					iavf_dev_watchdog_enable(adapter);
 			}
+			if (adapter->devargs.no_poll_on_link_down) {
+				if (vf->link_up && adapter->no_poll) {
+					adapter->no_poll = false;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+				}
+				if (!vf->link_up) {
+					adapter->no_poll = true;
+					PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+				}
+			}
 			PMD_DRV_LOG(INFO, "Link status update:%s",
 					vf->link_up ? "up" : "down");
 			break;
@@ -474,6 +484,16 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 			if (!vf->link_up)
 				iavf_dev_watchdog_enable(adapter);
 		}
+		if (adapter->devargs.no_poll_on_link_down) {
+			if (vf->link_up && adapter->no_poll) {
+				adapter->no_poll = false;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned off");
+			}
+			if (!vf->link_up) {
+				adapter->no_poll = true;
+				PMD_DRV_LOG(DEBUG, "VF no poll turned on");
+			}
+		}
 		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v5] net/iavf: data paths support no-polling mode
  2023-10-19  9:04               ` [PATCH v5] net/iavf: data paths support no-polling mode Mingjin Ye
@ 2023-10-20  0:39                 ` Zhang, Qi Z
  0 siblings, 0 replies; 16+ messages in thread
From: Zhang, Qi Z @ 2023-10-20  0:39 UTC (permalink / raw)
  To: Ye, MingjinX, dev
  Cc: Yang, Qiming, Zhou, YidingX, Ye, MingjinX, Su, Simei, Wu,
	Wenjun1, Zhang, Yuying, Xing, Beilei, Wu, Jingjing



> -----Original Message-----
> From: Mingjin Ye <mingjinx.ye@intel.com>
> Sent: Thursday, October 19, 2023 5:04 PM
> To: dev@dpdk.org
> Cc: Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Ye, MingjinX <mingjinx.ye@intel.com>; Su, Simei
> <simei.su@intel.com>; Wu, Wenjun1 <wenjun1.wu@intel.com>; Zhang,
> Yuying <yuying.zhang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>
> Subject: [PATCH v5] net/iavf: data paths support no-polling mode
> 
> In a scenario involving a hot firmware upgrade, the network device on the
> host side needs to be reset, potentially causing the hardware queues to
> become unreachable. In a VM, continuing to run VF PMD Rx/Tx during this
> process can lead to an application crash.
> 
> The solution is to implement a 'no-polling' Rx and Tx wrapper. This wrapper
> checks the link status and returns immediately if the link is down. This is
> especially important because link down events will continue to be sent from
> the PF to the VF during firmware hot upgrades, and such an event will
> always occur before the RESET IMPENDING event.
> 
> The no-polling rx/tx mechanism will only be active when the devarg
> "no-poll-on-link-down" is enabled. This devarg is typically recommended
> for use in this specific hot upgrade scenario.
> Ideally, "no-poll-on-link-down" should be used in conjunction with the
> devarg "auto-reset" to provide a seamless and user-friendly experience
> within the VM.
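> 
> As a minimal usage sketch (the PCI address below is a placeholder; check
> the iavf guide for the exact devarg spellings), the devarg is passed
> through the EAL device allowlist, e.g.:
> 
>     dpdk-testpmd -a 18:01.0,no-poll-on-link-down=1 -- -i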
> 
> Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>

Acked-by: Qi Zhang <qi.z.zhang@intel.com>

Applied to dpdk-next-net-intel after a minor refinement of the commit log and the description in ice.rst.

Thanks
Qi

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] net/ice: CVL support double vlan
@ 2023-04-19 10:46 Mingjin Ye
  0 siblings, 0 replies; 16+ messages in thread
From: Mingjin Ye @ 2023-04-19 10:46 UTC (permalink / raw)
  To: dev; +Cc: yidingx.zhou, Mingjin Ye, Qiming Yang, Qi Zhang

Aligned with the kernel driver: optimized inner and outer VLAN handling
in DPDK, and implemented double VLAN insertion and stripping support.

1. Adjust VLAN stripping
Remove the judgment on DVM; VLAN stripping only operates on the inner VLAN.

2. Support QinQ stripping
This patch supports switching ice outer VLAN strip on and off in QinQ mode
via the mask bit DEV_RX_OFFLOAD_QINQ_STRIP. Users can use "vlan set
qinq_strip on 0" to enable or "vlan set qinq_strip off 0" to disable ice
outer VLAN strip when trying with the testpmd app.
Note: Due to hardware limitations, QinQ stripping of RX packets carrying
two tags with the same EtherType (for example, two VLANs with EtherType
`ETH_P_8021Q`) is not supported.

3. Support outer tag type switching
Add an implementation of the ethdev `vlan_tpid_set` API to enable outer tag
support for the `ETH_P_8021Q`, `ETH_P_8021AD` and `ETH_P_QINQ1` outer tag
types.
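For example, when trying with the testpmd app, "vlan set outer tpid 0x88a8 0"
would switch port 0 to the 802.1ad outer tag type (an illustrative command
using one of the supported TPID values).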

4. Support outer port insertion
If DVM is enabled, outer port VLAN insertion is supported. Users can use
"tx_vlan set pvid 0 45 on" to enable or "tx_vlan set pvid 0 45 off" to
disable ice outer VLAN insertion when trying with the testpmd app.

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
 drivers/net/ice/ice_ethdev.c | 426 +++++++++++++++++++++++++++++++++--
 drivers/net/ice/ice_ethdev.h |   1 +
 2 files changed, 413 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c
index 0bc739daf0..a945403328 100644
--- a/drivers/net/ice/ice_ethdev.c
+++ b/drivers/net/ice/ice_ethdev.c
@@ -54,6 +54,24 @@ static const char * const ice_valid_args[] = {
 
 #define PPS_OUT_DELAY_NS  1
 
+/* Maximum number of VSI */
+#define ICE_MAX_NUM_VSIS          (768UL)
+
+/* The 119 bit offset of the LAN Rx queue context is the L2TSEL control bit. */
+#define ICE_L2TSEL_QRX_CONTEXT_REG_IDX	3
+#define ICE_L2TSEL_BIT_OFFSET		   23
+enum ice_l2tsel {
+	ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND,
+	ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1,
+};
+
+/* 802.1Q VLAN Extended Header */
+#define ETH_P_8021Q		0x8100
+/* 802.1ad Service VLAN */
+#define ETH_P_8021AD	0x88A8
+/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_QINQ1		0x9100
+
 struct proto_xtr_ol_flag {
 	const struct rte_mbuf_dynflag param;
 	bool required;
@@ -128,6 +146,9 @@ static int ice_fw_version_get(struct rte_eth_dev *dev, char *fw_version,
 			      size_t fw_size);
 static int ice_vlan_pvid_set(struct rte_eth_dev *dev,
 			     uint16_t pvid, int on);
+static int ice_vlan_tpid_set(struct rte_eth_dev *dev,
+		   enum rte_vlan_type vlan_type,
+		   uint16_t tpid);
 static int ice_get_eeprom_length(struct rte_eth_dev *dev);
 static int ice_get_eeprom(struct rte_eth_dev *dev,
 			  struct rte_dev_eeprom_info *eeprom);
@@ -250,6 +271,7 @@ static const struct eth_dev_ops ice_eth_dev_ops = {
 	.rx_queue_intr_disable        = ice_rx_queue_intr_disable,
 	.fw_version_get               = ice_fw_version_get,
 	.vlan_pvid_set                = ice_vlan_pvid_set,
+	.vlan_tpid_set                = ice_vlan_tpid_set,
 	.rxq_info_get                 = ice_rxq_info_get,
 	.txq_info_get                 = ice_txq_info_get,
 	.rx_burst_mode_get            = ice_rx_burst_mode_get,
@@ -1579,6 +1601,9 @@ ice_setup_vsi(struct ice_pf *pf, enum ice_vsi_type type)
 			hw->func_caps.common_cap.rss_table_size;
 	pf->flags |= ICE_FLAG_RSS_AQ_CAPABLE;
 
+	/* Defines the type of outer tag expected */
+	pf->outer_ethertype = ETH_P_8021Q;
+
 	memset(&vsi_ctx, 0, sizeof(vsi_ctx));
 	switch (type) {
 	case ICE_VSI_PF:
@@ -1603,6 +1628,8 @@ ice_setup_vsi(struct ice_pf *pf, enum ice_vsi_type type)
 				 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
 				ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M;
 			vsi_ctx.info.outer_vlan_flags |=
+				(ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING <<
+				ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
 				(ICE_AQ_VSI_OUTER_TAG_VLAN_8100 <<
 				 ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
 				ICE_AQ_VSI_OUTER_TAG_TYPE_M;
@@ -4406,11 +4433,87 @@ ice_vsi_dis_inner_stripping(struct ice_vsi *vsi)
 	return ice_vsi_manage_vlan_stripping(vsi, false);
 }
 
-static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
+/**
+ * tpid_to_vsi_outer_vlan_type - convert from TPID to VSI context based tag_type
+ * @tpid: tpid used to translate into VSI context based tag_type
+ * @tag_type: output variable to hold the VSI context based tag type
+ */
+static int tpid_to_vsi_outer_vlan_type(u16 tpid, u8 *tag_type)
+{
+	switch (tpid) {
+	case ETH_P_8021Q:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_8100;
+		break;
+	case ETH_P_8021AD:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_STAG;
+		break;
+	case ETH_P_QINQ1:
+		*tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_9100;
+		break;
+	default:
+		*tag_type = 0;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_is_supported_port_vlan_proto - make sure the vlan_proto is supported
+ * @hw: hardware structure used to check the VLAN mode
+ * @vlan_proto: VLAN TPID being checked
+ *
+ * If the device is configured in Double VLAN Mode (DVM), it supports three
+ * types: ETH_P_8021Q, ETH_P_QINQ1 and ETH_P_8021AD. If the device is
+ * configured in Single VLAN Mode (SVM), then only ETH_P_8021Q is supported.
+ */
+static bool
+ice_is_supported_port_vlan_proto(struct ice_hw *hw, u16 vlan_proto)
+{
+	bool is_supported = false;
+
+	switch (vlan_proto) {
+	case ETH_P_8021Q:
+		is_supported = true;
+		break;
+	case ETH_P_8021AD:
+		if (ice_is_dvm_ena(hw))
+			is_supported = true;
+		break;
+	case ETH_P_QINQ1:
+		if (ice_is_dvm_ena(hw))
+			is_supported = true;
+		break;
+	}
+
+	return is_supported;
+}
+
+/**
+ * ice_vsi_ena_outer_stripping - enable outer VLAN stripping
+ * @vsi: VSI to configure
+ * @tpid: TPID to enable outer VLAN stripping for
+ *
+ * Enable outer VLAN stripping via VSI context. This function should only be
+ * used if DVM is supported. Also, this function should never be called directly
+ * as it should be part of ice_vsi_vlan_ops if it's needed.
+ *
+ * Since the VSI context only supports a single TPID for insertion and
+ * stripping, setting the TPID for stripping will affect the TPID for insertion.
+ * Callers need to be aware of this limitation.
+ *
+ * Only modify outer VLAN stripping settings and the VLAN TPID. Outer VLAN
+ * insertion settings are unmodified.
+ *
+ * This enables the hardware to strip a VLAN tag with the specified TPID
+ * from the packet and place it in the receive descriptor.
+ */
+static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid)
 {
 	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
 	struct ice_vsi_ctx ctxt;
 	enum ice_status status;
+	u8 tag_type;
 	int err = 0;
 
 	/* do not allow modifying VLAN stripping when a port VLAN is configured
@@ -4419,6 +4522,9 @@ static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
 	if (vsi->info.port_based_outer_vlan)
 		return 0;
 
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
 	memset(&ctxt, 0, sizeof(ctxt));
 
 	ctxt.info.valid_sections =
@@ -4429,8 +4535,8 @@ static int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi)
 	ctxt.info.outer_vlan_flags |=
 		(ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH <<
 		 ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
-		(ICE_AQ_VSI_OUTER_TAG_VLAN_8100 <<
-		 ICE_AQ_VSI_OUTER_TAG_TYPE_S);
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M);
 
 	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
 	if (status) {
@@ -4478,22 +4584,104 @@ ice_vsi_dis_outer_stripping(struct ice_vsi *vsi)
 static int
 ice_vsi_config_vlan_stripping(struct ice_vsi *vsi, bool ena)
 {
-	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
 	int ret;
 
-	if (ice_is_dvm_ena(hw)) {
-		if (ena)
-			ret = ice_vsi_ena_outer_stripping(vsi);
-		else
-			ret = ice_vsi_dis_outer_stripping(vsi);
+	if (ena)
+		ret = ice_vsi_ena_inner_stripping(vsi);
+	else
+		ret = ice_vsi_dis_inner_stripping(vsi);
+
+	return ret;
+}
+
+/**
+ * ice_vsi_update_l2tsel - update l2tsel field for all Rx rings on this VSI
+ * @vsi: VSI used to update l2tsel on
+ * @l2tsel: l2tsel setting requested
+ *
+ * Use the l2tsel setting to update all of the Rx queue context bits for l2tsel.
+ * This will modify which descriptor field the first offloaded VLAN will be
+ * stripped into.
+ */
+static void ice_vsi_update_l2tsel(struct ice_vsi *vsi, enum ice_l2tsel l2tsel)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_pf *pf = ICE_VSI_TO_PF(vsi);
+	struct rte_eth_dev *dev = ICE_PF_TO_ETH_DEV(pf);
+	u32 l2tsel_bit;
+	u16 i;
+
+	if (l2tsel == ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND)
+		l2tsel_bit = 0;
+	else
+		l2tsel_bit = BIT(ICE_L2TSEL_BIT_OFFSET);
+
+	for (i = 0; i < dev->nb_rx_queues; i++) {
+		u32 qrx_context_offset;
+		u32 regval;
+
+		qrx_context_offset =
+			QRX_CONTEXT(ICE_L2TSEL_QRX_CONTEXT_REG_IDX, i);
+
+		regval = rd32(hw, qrx_context_offset);
+		regval &= ~BIT(ICE_L2TSEL_BIT_OFFSET);
+		regval |= l2tsel_bit;
+		wr32(hw, qrx_context_offset, regval);
+	}
+}
+
+/* Configure outer vlan stripping on or off in QinQ mode */
+static int
+ice_vsi_config_outer_vlan_stripping(struct ice_vsi *vsi, bool on)
+{
+	uint16_t outer_ethertype = vsi->adapter->pf.outer_ethertype;
+	int err = 0;
+
+	if (vsi->vsi_id >= ICE_MAX_NUM_VSIS) {
+		PMD_DRV_LOG(ERR, "VSI ID exceeds the maximum");
+		return -EINVAL;
+	}
+
+	if (!ice_is_dvm_ena(&vsi->adapter->hw)) {
+		PMD_DRV_LOG(ERR, "Single VLAN mode (SVM) does not support qinq");
+		return -EOPNOTSUPP;
+	}
+
+	if (on) {
+		err = ice_vsi_ena_outer_stripping(vsi, outer_ethertype);
+		if (!err) {
+			enum ice_l2tsel l2tsel =
+				ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND;
+
+			/* PF tells the VF that the outer VLAN tag is always
+			 * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and
+			 * inner is always extracted to
+			 * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to
+			 * support outer stripping so the first tag always ends
+			 * up in L2TAG2_2ND and the second/inner tag, if
+			 * enabled, is extracted in L2TAG1.
+			 */
+			ice_vsi_update_l2tsel(vsi, l2tsel);
+		}
 	} else {
-		if (ena)
-			ret = ice_vsi_ena_inner_stripping(vsi);
-		else
-			ret = ice_vsi_dis_inner_stripping(vsi);
+		err = ice_vsi_dis_outer_stripping(vsi);
+		if (!err) {
+			enum ice_l2tsel l2tsel =
+				ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1;
+
+			/* PF tells the VF that the outer VLAN tag is always
+			 * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and
+			 * inner is always extracted to
+			 * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to
+			 * support inner stripping while outer stripping is
+			 * disabled so that the first and only tag is extracted
+			 * in L2TAG1.
+			 */
+			ice_vsi_update_l2tsel(vsi, l2tsel);
+		}
 	}
 
-	return ret;
+	return err;
 }
 
 static int
@@ -4518,6 +4706,14 @@ ice_vlan_offload_set(struct rte_eth_dev *dev, int mask)
 			ice_vsi_config_vlan_stripping(vsi, false);
 	}
 
+	if (mask & RTE_ETH_QINQ_STRIP_MASK) {
+		/* Enable or disable outer VLAN stripping */
+		if (rxmode->offloads & RTE_ETH_RX_OFFLOAD_QINQ_STRIP)
+			ice_vsi_config_outer_vlan_stripping(vsi, true);
+		else
+			ice_vsi_config_outer_vlan_stripping(vsi, false);
+	}
+
 	return 0;
 }
 
@@ -4994,6 +5190,130 @@ ice_vsi_vlan_pvid_set(struct ice_vsi *vsi, struct ice_vsi_vlan_pvid_info *info)
 	return ret;
 }
 
+/**
+ * __ice_vsi_set_outer_port_vlan - set the outer port VLAN and related settings
+ * @vsi: VSI to configure
+ * @vlan_info: packed u16 that contains the VLAN prio and ID
+ * @tpid: TPID of the port VLAN
+ *
+ * Set the port VLAN prio, ID, and TPID.
+ *
+ * Enable VLAN pruning so the VSI doesn't receive any traffic that doesn't match
+ * a VLAN prune rule. The caller should take care to add a VLAN prune rule that
+ * matches the port VLAN ID and TPID.
+ *
+ * Tell hardware to strip outer VLAN tagged packets on receive and don't put
+ * them in the receive descriptor. VSI(s) in port VLANs should not be aware of
+ * the port VLAN ID or TPID they are assigned to.
+ *
+ * Tell hardware to prevent outer VLAN tag insertion on transmit and only allow
+ * untagged outer packets from the transmit descriptor.
+ *
+ * Also, tell the hardware to insert the port VLAN on transmit.
+ */
+static int
+ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, u16 vlan_info, u16 tpid)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	u8 tag_type;
+	int err = 0;
+
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+
+	ctxt.info = vsi->info;
+
+	ctxt.info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+
+	ctxt.info.port_based_outer_vlan = rte_cpu_to_le_16(vlan_info);
+	ctxt.info.outer_vlan_flags =
+		(ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW <<
+		 ICE_AQ_VSI_OUTER_VLAN_EMODE_S) |
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M) |
+		ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		(ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED <<
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) |
+		ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT;
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID |
+			    ICE_AQ_VSI_PROP_SW_VALID);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status != ICE_SUCCESS) {
+		PMD_DRV_LOG(ERR,
+			    "update VSI for setting outer port based VLAN failed, err %d",
+			    status);
+		err = -EINVAL;
+	} else {
+		vsi->info.port_based_outer_vlan = ctxt.info.port_based_outer_vlan;
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+		vsi->info.sw_flags2 = ctxt.info.sw_flags2;
+	}
+
+	return err;
+}
+
+/**
+ * ice_vsi_dis_outer_insertion - disable outer VLAN insertion
+ * @vsi: VSI to configure
+ * @info: vlan pvid info
+ *
+ * Disable outer VLAN insertion via VSI context. This function should only be
+ * used if DVM is supported.
+ *
+ * Only modify the outer VLAN insertion settings. The VLAN TPID and outer VLAN
+ * settings are unmodified.
+ *
+ * This tells the hardware to not allow VLAN tagged packets in the transmit
+ * descriptor. This enables software offloaded VLAN insertion and disables
+ * hardware offloaded VLAN insertion.
+ */
+static int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi, struct ice_vsi_vlan_pvid_info *info)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	uint8_t vlan_flags = 0;
+	int err = 0;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+	ctxt.info.port_based_inner_vlan = 0;
+	/* clear current outer VLAN insertion settings */
+	ctxt.info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+		~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT |
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M);
+	if (info->config.reject.tagged == 0)
+		vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTTAGGED;
+	if (info->config.reject.untagged == 0)
+		vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED;
+	ctxt.info.outer_vlan_flags |=
+		ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		((vlan_flags <<
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status) {
+		PMD_DRV_LOG(ERR,
+			    "update VSI for disabling outer VLAN insertion failed, err %d",
+			    status);
+		err = -EINVAL;
+	} else {
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+		vsi->info.port_based_inner_vlan = ctxt.info.port_based_inner_vlan;
+	}
+
+	return err;
+}
+
 static int
 ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 {
@@ -5014,6 +5334,13 @@ ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 			data->dev_conf.txmode.hw_vlan_reject_untagged;
 	}
 
+	if (ice_is_dvm_ena(&vsi->adapter->hw)) {
+		if (on)
+			return ice_vsi_set_outer_port_vlan(vsi, pvid, pf->outer_ethertype);
+		else
+			return ice_vsi_dis_outer_insertion(vsi, &info);
+	}
+
 	ret = ice_vsi_vlan_pvid_set(vsi, &info);
 	if (ret < 0) {
 		PMD_DRV_LOG(ERR, "Failed to set pvid.");
@@ -5023,6 +5350,77 @@ ice_vlan_pvid_set(struct rte_eth_dev *dev, uint16_t pvid, int on)
 	return 0;
 }
 
+static int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, uint16_t tpid)
+{
+	struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
+	struct ice_vsi_ctx ctxt;
+	enum ice_status status;
+	int err = 0;
+	u8 tag_type;
+	/* do not allow modifying VLAN insertion when a port VLAN is configured
+	 * on this VSI
+	 */
+	if (vsi->info.port_based_outer_vlan)
+		return 0;
+
+	if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type))
+		return -EINVAL;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+	ctxt.info.valid_sections =
+		rte_cpu_to_le_16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID);
+	/* clear current outer VLAN insertion settings */
+	ctxt.info.outer_vlan_flags = vsi->info.outer_vlan_flags &
+		~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT |
+		  ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC |
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M |
+		  ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+	ctxt.info.outer_vlan_flags |=
+		((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL <<
+		  ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) &
+		 ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M) |
+		((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) &
+		 ICE_AQ_VSI_OUTER_TAG_TYPE_M);
+
+	status = ice_update_vsi(hw, vsi->idx, &ctxt, NULL);
+	if (status) {
+		PMD_DRV_LOG(ERR, "Update VSI failed to enable outer VLAN stripping");
+		err = -EIO;
+	} else {
+		vsi->info.outer_vlan_flags = ctxt.info.outer_vlan_flags;
+	}
+
+	return err;
+}
+
+static int
+ice_vlan_tpid_set(struct rte_eth_dev *dev,
+		   enum rte_vlan_type vlan_type,
+		   uint16_t tpid)
+{
+	struct ice_pf *pf = ICE_DEV_PRIVATE_TO_PF(dev->data->dev_private);
+	struct ice_hw *hw = ICE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ice_vsi *vsi = pf->main_vsi;
+	uint64_t qinq = dev->data->dev_conf.rxmode.offloads &
+		   RTE_ETH_RX_OFFLOAD_VLAN_EXTEND;
+	int err = 0;
+
+	if ((vlan_type != RTE_ETH_VLAN_TYPE_INNER &&
+	     vlan_type != RTE_ETH_VLAN_TYPE_OUTER) ||
+	     (!qinq && vlan_type == RTE_ETH_VLAN_TYPE_INNER) ||
+		 !ice_is_supported_port_vlan_proto(hw, tpid)) {
+		PMD_DRV_LOG(ERR,
+			    "Unsupported vlan type.");
+		return -EINVAL;
+	}
+
+	err = ice_vsi_ena_outer_insertion(vsi, tpid);
+	if (!err)
+		pf->outer_ethertype = tpid;
+
+	return err;
+}
+
 static int
 ice_get_eeprom_length(struct rte_eth_dev *dev)
 {
diff --git a/drivers/net/ice/ice_ethdev.h b/drivers/net/ice/ice_ethdev.h
index c8311be179..a07c01ef39 100644
--- a/drivers/net/ice/ice_ethdev.h
+++ b/drivers/net/ice/ice_ethdev.h
@@ -550,6 +550,7 @@ struct ice_pf {
 	uint64_t supported_rxdid; /* bitmap for supported RXDID */
 	uint64_t rss_hf;
 	struct ice_tm_conf tm_conf;
+	uint16_t outer_ethertype;
 };
 
 #define ICE_MAX_QUEUE_NUM  2048
-- 
2.25.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2023-10-20  0:39 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-20  6:16 [PATCH] net/ice: CVL support double vlan Mingjin Ye
2023-05-06 10:04 ` [PATCH v2] net/ice: " Mingjin Ye
2023-05-26 10:16   ` Xu, Ke1
2023-05-26 11:10     ` Zhang, Qi Z
2023-07-17  9:36 ` [POC] net/iavf: support no data path polling mode Mingjin Ye
2023-07-20 10:08   ` [POC v2] " Mingjin Ye
2023-07-20 15:45     ` Stephen Hemminger
2023-07-21  9:57     ` [POC v3] " Mingjin Ye
2023-08-11  6:27       ` [PATCH] " Mingjin Ye
2023-09-26  7:56         ` [PATCH v2] " Mingjin Ye
2023-10-13  1:27           ` [PATCH v3] " Mingjin Ye
2023-10-17  1:44             ` [PATCH v4] " Mingjin Ye
2023-10-17  2:19             ` Mingjin Ye
2023-10-19  9:04               ` [PATCH v5] net/iavf: data paths support no-polling mode Mingjin Ye
2023-10-20  0:39                 ` Zhang, Qi Z
  -- strict thread matches above, loose matches on Subject: below --
2023-04-19 10:46 [PATCH] net/ice: CVL support double vlan Mingjin Ye

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).