patches for DPDK stable branches
 help / color / mirror / Atom feed
* [dpdk-stable] [PATCH 1/1] net/failsafe: link_update request crashing at boot
@ 2021-10-21 11:51 vipul.ashri
  2021-10-21 21:42 ` [dpdk-stable] [PATCH v2] " vipul.ashri
  0 siblings, 1 reply; 4+ messages in thread
From: vipul.ashri @ 2021-10-21 11:51 UTC (permalink / raw)
  To: grive; +Cc: dev, Vipul Ashri, stable

From: Vipul Ashri <vipul.ashri@oracle.com>

failsafe crashed while sending early link_update request during
boot time initialization.
Based on debugging we found failsafe device was good but sub-
devices were progressing towards initialization and SUBOPS macro
where expanding macro gives [partial_dev]->dev_ops->link_update()
execution of which triggered crash because dev_ops==0. similar
crash seen at failsafe_eth_dev_close()

Failsafe driver need a separate check for subdevices similar to
"RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);" which is
called to almost every eth_dev function.

Fixes: a46f8d5 ("net/failsafe: add fail-safe PMD")
Cc: stable@dpdk.org
Signed-off-by: Vipul Ashri <vipul.ashri@oracle.com>
---
 drivers/net/failsafe/failsafe_ops.c     | 45 +++++++++++++++++++++++--
 drivers/net/failsafe/failsafe_private.h |  6 ++++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/drivers/net/failsafe/failsafe_ops.c b/drivers/net/failsafe/failsafe_ops.c
index d0030af061..f0c5e40fd5 100644
--- a/drivers/net/failsafe/failsafe_ops.c
+++ b/drivers/net/failsafe/failsafe_ops.c
@@ -371,7 +371,8 @@ fs_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 		close(rxq->event_fd);
 	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
 		if (ETH(sdev)->data->rx_queues != NULL &&
-		    ETH(sdev)->data->rx_queues[rxq->qid] != NULL)
+			ETH(sdev)->data->rx_queues[rxq->qid] != NULL &&
+			SUBDEV_VALID_PORTID(sdev)) {
 			SUBOPS(sdev, rx_queue_release)(ETH(sdev), rxq->qid);
 	}
 	dev->data->rx_queues[rxq->qid] = NULL;
@@ -405,6 +406,12 @@ fs_rx_queue_setup(struct rte_eth_dev *dev,
 	fs_lock(dev, 0);
 	if (rx_conf->rx_deferred_start) {
 		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
+			if (!SUBDEV_VALID_PORTID(sdev)) {
+				ERROR("fs_rx_queue_setup: Invalid sub-device "
+					"port_id=%u\n", PORT_ID(sdev));
+				fs_unlock(dev, 0);
+				return -ENODEV;
+			}
 			if (SUBOPS(sdev, rx_queue_start) == NULL) {
 				ERROR("Rx queue deferred start is not "
 					"supported for subdevice %d", i);
@@ -548,7 +555,8 @@ fs_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 	fs_lock(dev, 0);
 	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
 		if (ETH(sdev)->data->tx_queues != NULL &&
-		    ETH(sdev)->data->tx_queues[txq->qid] != NULL)
+			ETH(sdev)->data->tx_queues[txq->qid] != NULL &&
+			SUBDEV_VALID_PORTID(sdev)) {
 			SUBOPS(sdev, tx_queue_release)(ETH(sdev), txq->qid);
 	}
 	dev->data->tx_queues[txq->qid] = NULL;
@@ -571,6 +579,12 @@ fs_tx_queue_setup(struct rte_eth_dev *dev,
 	fs_lock(dev, 0);
 	if (tx_conf->tx_deferred_start) {
 		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
+			if (!SUBDEV_VALID_PORTID(sdev)) {
+				ERROR("fs_tx_queue_setup: Invalid sub-device "
+					"port_id=%u\n", PORT_ID(sdev));
+				fs_unlock(dev, 0);
+				return -ENODEV;
+			}
 			if (SUBOPS(sdev, tx_queue_start) == NULL) {
 				ERROR("Tx queue deferred start is not "
 					"supported for subdevice %d", i);
@@ -645,6 +659,12 @@ failsafe_eth_dev_close(struct rte_eth_dev *dev)
 	fs_lock(dev, 0);
 	failsafe_hotplug_alarm_cancel(dev);
 	if (PRIV(dev)->state == DEV_STARTED) {
+		if (!rte_eth_dev_is_valid_port(dev->data->port_id)) {
+			ERROR("failsafe_eth_dev_close: Invalid sub-device "
+				"port_id=%u\n", dev->data->port_id);
+			fs_unlock(dev, 0);
+			return -ENODEV;
+		}
 		ret = dev->dev_ops->dev_stop(dev);
 		if (ret != 0) {
 			fs_unlock(dev, 0);
@@ -827,6 +847,12 @@ fs_link_update(struct rte_eth_dev *dev,
 
 	fs_lock(dev, 0);
 	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
+		if (!SUBDEV_VALID_PORTID(sdev)) {
+			ERROR("fs_link_update: Invalid Sub-device "
+				"port_id=%u\n", PORT_ID(sdev));
+			fs_unlock(dev, 0);
+			return -ENODEV;
+		}
 		DEBUG("Calling link_update on sub_device %d", i);
 		ret = (SUBOPS(sdev, link_update))(ETH(sdev), wait_to_complete);
 		if (ret && ret != -1 && sdev->remove == 0 &&
@@ -1251,6 +1277,15 @@ fs_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 		goto unlock;
 	}
 	edev = ETH(sdev);
+
+	if (!SUBDEV_VALID_PORTID(sdev)) {
+		ERROR("fs_dev_supported_ptypes_get: "
+			"Invalid TX_SUBDEV port_id=%u\n", PORT_ID(sdev));
+		rte_errno = -ENODEV;
+		ret = NULL;
+		goto unlock;
+	}
+
 	/* ENOTSUP: counts as no supported ptypes */
 	if (SUBOPS(sdev, dev_supported_ptypes_get) == NULL) {
 		ret = NULL;
@@ -1326,6 +1361,12 @@ fs_flow_ctrl_get(struct rte_eth_dev *dev,
 		ret = 0;
 		goto unlock;
 	}
+	if (!SUBDEV_VALID_PORTID(sdev)) {
+		ERROR("fs_flow_ctrl_get_get: Invalid TX_SUBDEV "
+			"port_id=%u\n", PORT_ID(sdev));
+		ret = -ENODEV;
+		goto unlock;
+	}
 	if (SUBOPS(sdev, flow_ctrl_get) == NULL) {
 		ret = -ENOTSUP;
 		goto unlock;
diff --git a/drivers/net/failsafe/failsafe_private.h b/drivers/net/failsafe/failsafe_private.h
index cd39d103c6..95eb31f9e6 100644
--- a/drivers/net/failsafe/failsafe_private.h
+++ b/drivers/net/failsafe/failsafe_private.h
@@ -308,6 +308,12 @@ extern int failsafe_mac_from_arg;
 	 : (PRIV(dev)->subs[PRIV(dev)->subs_tx].state < DEV_PROBED ? NULL \
 	 : &PRIV(dev)->subs[PRIV(dev)->subs_tx]))
 
+/**
+ * check for fail-safe sub-device valid port
+ */
+#define SUBDEV_VALID_PORTID(s) \
+    rte_eth_dev_is_valid_port(PORT_ID(s))
+
 /**
  * s:   (struct sub_device *)
  * ops: (struct eth_dev_ops) member
-- 
2.28.0.windows.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [dpdk-stable] [PATCH v2] net/failsafe: link_update request crashing at boot
  2021-10-21 11:51 [dpdk-stable] [PATCH 1/1] net/failsafe: link_update request crashing at boot vipul.ashri
@ 2021-10-21 21:42 ` vipul.ashri
  2021-11-22  9:36   ` [dpdk-dev] " Ferruh Yigit
  2021-11-22 10:23   ` Gaëtan Rivet
  0 siblings, 2 replies; 4+ messages in thread
From: vipul.ashri @ 2021-10-21 21:42 UTC (permalink / raw)
  To: dev; +Cc: grive, Vipul Ashri, stable

From: Vipul Ashri <vipul.ashri@oracle.com>

failsafe crashed while sending early link_update request during
boot time initialization.
Based on debugging we found failsafe device was good but sub-
devices were progressing towards initialization and SUBOPS macro
where expanding macro gives [partial_dev]->dev_ops->link_update()
execution of which triggered crash because dev_ops==0. similar
crash seen at failsafe_eth_dev_close()

Failsafe driver need a separate check for subdevices similar to
"RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);" which is
called to almost every eth_dev function.

Fixes: a46f8d5 ("net/failsafe: add fail-safe PMD")
Cc: stable@dpdk.org
Signed-off-by: Vipul Ashri <vipul.ashri@oracle.com>
---
 drivers/net/failsafe/failsafe_ops.c     | 45 +++++++++++++++++++++++--
 drivers/net/failsafe/failsafe_private.h |  6 ++++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/drivers/net/failsafe/failsafe_ops.c b/drivers/net/failsafe/failsafe_ops.c
index 29de39910c..8e128b9802 100644
--- a/drivers/net/failsafe/failsafe_ops.c
+++ b/drivers/net/failsafe/failsafe_ops.c
@@ -371,7 +371,8 @@ fs_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 		close(rxq->event_fd);
 	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
 		if (ETH(sdev)->data->rx_queues != NULL &&
-		    ETH(sdev)->data->rx_queues[rxq->qid] != NULL)
+			ETH(sdev)->data->rx_queues[rxq->qid] != NULL &&
+			SUBDEV_VALID_PORTID(sdev))
 			SUBOPS(sdev, rx_queue_release)(ETH(sdev), rxq->qid);
 	}
 	dev->data->rx_queues[rxq->qid] = NULL;
@@ -405,6 +406,12 @@ fs_rx_queue_setup(struct rte_eth_dev *dev,
 	fs_lock(dev, 0);
 	if (rx_conf->rx_deferred_start) {
 		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
+			if (!SUBDEV_VALID_PORTID(sdev)) {
+				ERROR("%s: Invalid sub-device port_id=%u\n",
+					__func__, PORT_ID(sdev));
+				fs_unlock(dev, 0);
+				return -ENODEV;
+			}
 			if (SUBOPS(sdev, rx_queue_start) == NULL) {
 				ERROR("Rx queue deferred start is not "
 					"supported for subdevice %d", i);
@@ -548,7 +555,8 @@ fs_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 	fs_lock(dev, 0);
 	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
 		if (ETH(sdev)->data->tx_queues != NULL &&
-		    ETH(sdev)->data->tx_queues[txq->qid] != NULL)
+			ETH(sdev)->data->tx_queues[txq->qid] != NULL &&
+			SUBDEV_VALID_PORTID(sdev))
 			SUBOPS(sdev, tx_queue_release)(ETH(sdev), txq->qid);
 	}
 	dev->data->tx_queues[txq->qid] = NULL;
@@ -571,6 +579,12 @@ fs_tx_queue_setup(struct rte_eth_dev *dev,
 	fs_lock(dev, 0);
 	if (tx_conf->tx_deferred_start) {
 		FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
+			if (!SUBDEV_VALID_PORTID(sdev)) {
+				ERROR("%s: Invalid sub-device port_id=%u\n",
+					__func__, PORT_ID(sdev));
+				fs_unlock(dev, 0);
+				return -ENODEV;
+			}
 			if (SUBOPS(sdev, tx_queue_start) == NULL) {
 				ERROR("Tx queue deferred start is not "
 					"supported for subdevice %d", i);
@@ -645,6 +659,12 @@ failsafe_eth_dev_close(struct rte_eth_dev *dev)
 	fs_lock(dev, 0);
 	failsafe_hotplug_alarm_cancel(dev);
 	if (PRIV(dev)->state == DEV_STARTED) {
+		if (!rte_eth_dev_is_valid_port(dev->data->port_id)) {
+			ERROR("%s: Invalid sub-device port_id=%u\n",
+				__func__, dev->data->port_id);
+			fs_unlock(dev, 0);
+			return -ENODEV;
+		}
 		ret = dev->dev_ops->dev_stop(dev);
 		if (ret != 0) {
 			fs_unlock(dev, 0);
@@ -827,6 +847,12 @@ fs_link_update(struct rte_eth_dev *dev,
 
 	fs_lock(dev, 0);
 	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE) {
+		if (!SUBDEV_VALID_PORTID(sdev)) {
+			ERROR("%s: Invalid Sub-device port_id=%u\n",
+				__func__, PORT_ID(sdev));
+			fs_unlock(dev, 0);
+			return -ENODEV;
+		}
 		DEBUG("Calling link_update on sub_device %d", i);
 		ret = (SUBOPS(sdev, link_update))(ETH(sdev), wait_to_complete);
 		if (ret && ret != -1 && sdev->remove == 0 &&
@@ -1249,6 +1275,15 @@ fs_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 		goto unlock;
 	}
 	edev = ETH(sdev);
+
+	if (!SUBDEV_VALID_PORTID(sdev)) {
+		ERROR("%s: Invalid TX_SUBDEV port_id=%u\n",
+			__func__, PORT_ID(sdev));
+		rte_errno = -ENODEV;
+		ret = NULL;
+		goto unlock;
+	}
+
 	/* ENOTSUP: counts as no supported ptypes */
 	if (SUBOPS(sdev, dev_supported_ptypes_get) == NULL) {
 		ret = NULL;
@@ -1324,6 +1359,12 @@ fs_flow_ctrl_get(struct rte_eth_dev *dev,
 		ret = 0;
 		goto unlock;
 	}
+	if (!SUBDEV_VALID_PORTID(sdev)) {
+		ERROR("%s: Invalid TX_SUBDEV port_id=%u\n",
+			__func__, PORT_ID(sdev));
+		ret = -ENODEV;
+		goto unlock;
+	}
 	if (SUBOPS(sdev, flow_ctrl_get) == NULL) {
 		ret = -ENOTSUP;
 		goto unlock;
diff --git a/drivers/net/failsafe/failsafe_private.h b/drivers/net/failsafe/failsafe_private.h
index cd39d103c6..0227060bcb 100644
--- a/drivers/net/failsafe/failsafe_private.h
+++ b/drivers/net/failsafe/failsafe_private.h
@@ -308,6 +308,12 @@ extern int failsafe_mac_from_arg;
 	 : (PRIV(dev)->subs[PRIV(dev)->subs_tx].state < DEV_PROBED ? NULL \
 	 : &PRIV(dev)->subs[PRIV(dev)->subs_tx]))
 
+/**
+ * check for fail-safe sub-device valid port
+ */
+#define SUBDEV_VALID_PORTID(s) \
+	rte_eth_dev_is_valid_port(PORT_ID(s))
+
 /**
  * s:   (struct sub_device *)
  * ops: (struct eth_dev_ops) member
-- 
2.28.0.windows.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/failsafe: link_update request crashing at boot
  2021-10-21 21:42 ` [dpdk-stable] [PATCH v2] " vipul.ashri
@ 2021-11-22  9:36   ` Ferruh Yigit
  2021-11-22 10:23   ` Gaëtan Rivet
  1 sibling, 0 replies; 4+ messages in thread
From: Ferruh Yigit @ 2021-11-22  9:36 UTC (permalink / raw)
  To: vipul.ashri, grive; +Cc: stable, dev

On 10/21/2021 10:42 PM, vipul.ashri@oracle.com wrote:
> From: Vipul Ashri <vipul.ashri@oracle.com>
> 
> failsafe crashed while sending early link_update request during
> boot time initialization.
> Based on debugging we found failsafe device was good but sub-
> devices were progressing towards initialization and SUBOPS macro
> where expanding macro gives [partial_dev]->dev_ops->link_update()
> execution of which triggered crash because dev_ops==0. similar
> crash seen at failsafe_eth_dev_close()
> 
> Failsafe driver need a separate check for subdevices similar to
> "RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);" which is
> called to almost every eth_dev function.
> 
> Fixes: a46f8d5 ("net/failsafe: add fail-safe PMD")
> Cc: stable@dpdk.org
> Signed-off-by: Vipul Ashri <vipul.ashri@oracle.com>


Hi Gaetan,

Can you please review this patch?

Thanks,
ferruh

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] net/failsafe: link_update request crashing at boot
  2021-10-21 21:42 ` [dpdk-stable] [PATCH v2] " vipul.ashri
  2021-11-22  9:36   ` [dpdk-dev] " Ferruh Yigit
@ 2021-11-22 10:23   ` Gaëtan Rivet
  1 sibling, 0 replies; 4+ messages in thread
From: Gaëtan Rivet @ 2021-11-22 10:23 UTC (permalink / raw)
  To: vipul.ashri, dev; +Cc: stable

On Thu, Oct 21, 2021, at 23:42, vipul.ashri@oracle.com wrote:
> From: Vipul Ashri <vipul.ashri@oracle.com>
>
> failsafe crashed while sending early link_update request during
> boot time initialization.
> Based on debugging we found failsafe device was good but sub-
> devices were progressing towards initialization and SUBOPS macro
> where expanding macro gives [partial_dev]->dev_ops->link_update()
> execution of which triggered crash because dev_ops==0. similar
> crash seen at failsafe_eth_dev_close()
>
> Failsafe driver need a separate check for subdevices similar to
> "RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);" which is
> called to almost every eth_dev function.
>
> Fixes: a46f8d5 ("net/failsafe: add fail-safe PMD")
> Cc: stable@dpdk.org
> Signed-off-by: Vipul Ashri <vipul.ashri@oracle.com>

Hello Vipul,

I'm sorry for the delay, I missed your fix on the mailing list.

IIUC, the issue is that failsafe finished init and received an ethdev
operation call, but one of its sub-device, although marked DEV_ACTIVE,
has its eth_dev->dev_ops field NULL.

It is really surprising to me, because there aren't many ways for a sub-device
to become DEV_ACTIVE.

The only two ways are

  * by executing 'fs_dev_configure()', which will first execute
    rte_eth_dev_configure() on the sub-device, and on error would
    stop *without* setting DEV_ACTIVE.
    rte_eth_dev_configure() will itself execute
    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV), so it would
    return negative errno and fs_dev_configure() would abort.

  * by executing 'fs_dev_remove()' and the sub-device was 'DEV_STARTED'
    to begin with, then it is retrograded to DEV_ACTIVE once stopped.

So I don't understand yet how it is possible for a sub-device to become DEV_ACTIVE
while its eth_dev->dev_ops are NULL. It seems more like a bug, memory corruption or
just an unexpected execution pattern.

Could describe in more detail the execution?
In particular, setting the EAL log-level to debug with the option:
' --log-level pmd.net.failsafe:debug '
for example while using testpmd or your DPDK app.
It should show ethdev level accesses to the sub-devices, and error values.

Best regards,
-- 
Gaetan Rivet

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-11-22 10:23 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-21 11:51 [dpdk-stable] [PATCH 1/1] net/failsafe: link_update request crashing at boot vipul.ashri
2021-10-21 21:42 ` [dpdk-stable] [PATCH v2] " vipul.ashri
2021-11-22  9:36   ` [dpdk-dev] " Ferruh Yigit
2021-11-22 10:23   ` Gaëtan Rivet

patches for DPDK stable branches

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://inbox.dpdk.org/stable/0 stable/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 stable stable/ http://inbox.dpdk.org/stable \
		stable@dpdk.org
	public-inbox-index stable

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.stable


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git