DPDK patches and discussions
* [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields
@ 2020-10-26  7:16 Xueming Li
  2020-10-26  7:16 ` [dpdk-dev] [PATCH 2/2] vdpa/mlx5: hardware error handling Xueming Li
                   ` (8 more replies)
  0 siblings, 9 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-26  7:16 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, xuemingl, Asaf Penso

Add the needed fields for the virtq DevX object to read the error state.

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c | 3 +++
 drivers/common/mlx5/mlx5_devx_cmds.h | 1 +
 drivers/common/mlx5/mlx5_prm.h       | 9 +++++++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 8aee12d527..dc426e9b09 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1754,6 +1754,9 @@ mlx5_devx_cmd_query_virtq(struct mlx5_devx_obj *virtq_obj,
 	attr->hw_available_index = MLX5_GET16(virtio_net_q, virtq,
 					      hw_available_index);
 	attr->hw_used_index = MLX5_GET16(virtio_net_q, virtq, hw_used_index);
+	attr->state = MLX5_GET16(virtio_net_q, virtq, state);
+	attr->error_type = MLX5_GET16(virtio_net_q, virtq,
+				      virtio_q_context.error_type);
 	return ret;
 }
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index abbea67784..0ea2427b75 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -298,6 +298,7 @@ struct mlx5_devx_virtq_attr {
 		uint32_t size;
 		uint64_t offset;
 	} umems[3];
+	uint8_t error_type;
 };
 
 
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index d342263c85..7d671a3996 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -2280,7 +2280,8 @@ struct mlx5_ifc_virtio_q_bits {
 	u8 used_addr[0x40];
 	u8 available_addr[0x40];
 	u8 virtio_q_mkey[0x20];
-	u8 reserved_at_160[0x20];
+	u8 reserved_at_160[0x18];
+	u8 error_type[0x8];
 	u8 umem_1_id[0x20];
 	u8 umem_1_size[0x20];
 	u8 umem_1_offset[0x40];
@@ -2308,7 +2309,7 @@ struct mlx5_ifc_virtio_net_q_bits {
 	u8 vhost_log_page[0x5];
 	u8 reserved_at_90[0xc];
 	u8 state[0x4];
-	u8 error_type[0x8];
+	u8 reserved_at_a0[0x8];
 	u8 tisn_or_qpn[0x18];
 	u8 dirty_bitmap_mkey[0x20];
 	u8 dirty_bitmap_size[0x20];
@@ -2329,6 +2330,10 @@ struct mlx5_ifc_query_virtq_out_bits {
 	struct mlx5_ifc_virtio_net_q_bits virtq;
 };
 
+enum {
+	MLX5_EVENT_TYPE_OBJECT_CHANGE = 0x27,
+};
+
 enum {
 	MLX5_QP_ST_RC = 0x0,
 };
-- 
2.25.1



* [dpdk-dev] [PATCH 2/2] vdpa/mlx5: hardware error handling
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
@ 2020-10-26  7:16 ` Xueming Li
  2020-10-26  9:12 ` [dpdk-dev] [PATCH v1 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-26  7:16 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, xuemingl, Asaf Penso

When a hardware error happens, the vdpa driver gets no notification
and is left silent: in a working state but with no response.

This patch subscribes to the firmware virtq error event and tries to
recover up to 3 times in 10 seconds; the virtq is stopped if the
maximum retry number is reached.

When an error happens, the PMD logs it at warning level. If recovery
fails, an error log is output. Query the virtq statistics to get the
error counters report.
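
A minimal standalone sketch of that retry-window policy follows; the
struct, constants and helper name are illustrative stand-ins for the
driver's err_time[]/n_retry fields and MLX5_VDPA_ERROR_TIME_SEC, not
the actual mlx5 code:

#include <stdbool.h>
#include <stdint.h>

#define ERR_HISTORY  3 /* errors remembered per queue (assumed) */
#define ERR_WINDOW_S 3 /* retry window in seconds (assumed) */

/* Hypothetical per-queue error bookkeeping. */
struct vq_err_state {
	uint64_t err_time[ERR_HISTORY]; /* timestamps (CPU ticks) of recent errors */
	uint32_t n_retry;               /* number of recovery attempts so far */
};

/*
 * Called when a queue reports an error: decide whether to re-enable it.
 * Give up once ERR_HISTORY errors land inside an ERR_WINDOW_S window,
 * otherwise count a retry. Always shift the current time into the log.
 */
static bool
vq_error_should_retry(struct vq_err_state *s, uint64_t now_ticks,
		      uint64_t ticks_per_sec)
{
	uint64_t sec = (now_ticks - s->err_time[0]) / ticks_per_sec;
	bool retry = sec > ERR_WINDOW_S; /* oldest error is outside the window */
	unsigned int i;

	for (i = 1; i < ERR_HISTORY; i++)
		s->err_time[i - 1] = s->err_time[i];
	s->err_time[ERR_HISTORY - 1] = now_ticks;
	if (retry)
		s->n_retry++;
	return retry;
}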

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/vdpa/mlx5/mlx5_vdpa.c       |   2 +
 drivers/vdpa/mlx5/mlx5_vdpa.h       |  37 ++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_event.c | 140 ++++++++++++++++++++++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_virtq.c |  61 +++++++++---
 4 files changed, 225 insertions(+), 15 deletions(-)

diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index a8f3e4b1de..ba779c10ee 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -283,6 +283,7 @@ mlx5_vdpa_dev_close(int vid)
 	}
 	if (priv->configured)
 		ret |= mlx5_vdpa_lm_log(priv);
+	mlx5_vdpa_err_event_unset(priv);
 	mlx5_vdpa_cqe_event_unset(priv);
 	mlx5_vdpa_steer_unset(priv);
 	mlx5_vdpa_virtqs_release(priv);
@@ -318,6 +319,7 @@ mlx5_vdpa_dev_config(int vid)
 		DRV_LOG(WARNING, "MTU cannot be set on device %s.",
 				vdev->device->name);
 	if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
+	    mlx5_vdpa_err_event_setup(priv) ||
 	    mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
 	    mlx5_vdpa_cqe_event_setup(priv)) {
 		mlx5_vdpa_dev_close(vid);
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
index fcbc12ab0c..0d6886c52c 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
@@ -87,6 +87,7 @@ struct mlx5_vdpa_virtq {
 	uint16_t vq_size;
 	uint8_t notifier_state;
 	bool stopped;
+	uint32_t version;
 	struct mlx5_vdpa_priv *priv;
 	struct mlx5_devx_obj *virtq;
 	struct mlx5_devx_obj *counters;
@@ -97,6 +98,8 @@ struct mlx5_vdpa_virtq {
 		uint32_t size;
 	} umems[3];
 	struct rte_intr_handle intr_handle;
+	uint64_t err_time[3]; /* RDTSC time of recent errors. */
+	uint32_t n_retry;
 	struct mlx5_devx_virtio_q_couners_attr reset;
 };
 
@@ -143,8 +146,10 @@ struct mlx5_vdpa_priv {
 	struct rte_vhost_memory *vmem;
 	uint32_t eqn;
 	struct mlx5dv_devx_event_channel *eventc;
+	struct mlx5dv_devx_event_channel *err_chnl;
 	struct mlx5dv_devx_uar *uar;
 	struct rte_intr_handle intr_handle;
+	struct rte_intr_handle err_intr_handle;
 	struct mlx5_devx_obj *td;
 	struct mlx5_devx_obj *tis;
 	uint16_t nr_virtqs;
@@ -259,6 +264,25 @@ int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
  */
 void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
 
+/**
+ * Setup error interrupt handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Unset error event handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ */
+void mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv);
+
 /**
  * Release a virtq and all its related resources.
  *
@@ -392,6 +416,19 @@ int mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state);
  */
 int mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index);
 
+/**
+ * Query virtq information.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ * @param[in] index
+ *   The virtq index.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index);
+
 /**
  * Get virtq statistics.
  *
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
index 8a01e42794..89df699dad 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -15,11 +15,14 @@
 #include <rte_alarm.h>
 
 #include <mlx5_common.h>
+#include <mlx5_glue.h>
 
 #include "mlx5_vdpa_utils.h"
 #include "mlx5_vdpa.h"
 
 
+#define MLX5_VDPA_ERROR_TIME_SEC 3u
+
 void
 mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
 {
@@ -378,6 +381,143 @@ mlx5_vdpa_interrupt_handler(void *cb_arg)
 	pthread_mutex_unlock(&priv->vq_config_lock);
 }
 
+static void
+mlx5_vdpa_err_interrupt_handler(void *cb_arg __rte_unused)
+{
+#ifdef HAVE_IBV_DEVX_EVENT
+	struct mlx5_vdpa_priv *priv = cb_arg;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+	uint32_t vq_index, i, version;
+	struct mlx5_vdpa_virtq *virtq;
+	uint64_t sec;
+
+	pthread_mutex_lock(&priv->vq_config_lock);
+	while (mlx5_glue->devx_get_event(priv->err_chnl, &out.event_resp,
+					 sizeof(out.buf)) >=
+				       (ssize_t)sizeof(out.event_resp.cookie)) {
+		vq_index = out.event_resp.cookie && UINT32_MAX;
+		version = out.event_resp.cookie >> 32;
+		if (vq_index >= priv->nr_virtqs) {
+			DRV_LOG(ERR, "Invalid device %s error event virtq %d.",
+				priv->vdev->device->name, vq_index);
+			continue;
+		}
+		virtq = &priv->virtqs[vq_index];
+		if (!virtq->enable || virtq->version != version)
+			continue;
+		if (rte_rdtsc() / rte_get_tsc_hz() < MLX5_VDPA_ERROR_TIME_SEC)
+			continue;
+		virtq->stopped = true;
+		/* Query error info. */
+		if (mlx5_vdpa_virtq_query(priv, vq_index))
+			goto log;
+		/* Disable vq. */
+		if (mlx5_vdpa_virtq_enable(priv, vq_index, 0)) {
+			DRV_LOG(ERR, "Failed to disable virtq %d.", vq_index);
+			goto log;
+		}
+		/* Retry if error happens less than N times in 3 seconds. */
+		sec = (rte_rdtsc() - virtq->err_time[0]) / rte_get_tsc_hz();
+		if (sec > MLX5_VDPA_ERROR_TIME_SEC) {
+			/* Retry. */
+			if (mlx5_vdpa_virtq_enable(priv, vq_index, 1))
+				DRV_LOG(ERR, "Failed to enable virtq %d.",
+					vq_index);
+			else
+				DRV_LOG(WARNING, "Recover virtq %d: %u.",
+					vq_index, ++virtq->n_retry);
+		} else {
+			/* Retry timeout, give up. */
+			DRV_LOG(ERR, "Device %s virtq %d failed to recover.",
+				priv->vdev->device->name, vq_index);
+		}
+log:
+		/* Shift in current time to error time log end. */
+		for (i = 1; i < RTE_DIM(virtq->err_time); i++)
+			virtq->err_time[i - 1] = virtq->err_time[i];
+		virtq->err_time[RTE_DIM(virtq->err_time) - 1] = rte_rdtsc();
+	}
+	pthread_mutex_unlock(&priv->vq_config_lock);
+#endif
+}
+
+int
+mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv)
+{
+	int ret;
+	int flags;
+
+	/* Setup device event channel. */
+	priv->err_chnl = mlx5_glue->devx_create_event_channel(priv->ctx, 0);
+	if (!priv->err_chnl) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create device event channel %d.",
+			rte_errno);
+		goto error;
+	}
+	flags = fcntl(priv->err_chnl->fd, F_GETFL);
+	ret = fcntl(priv->err_chnl->fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change device event channel FD.");
+		goto error;
+	}
+	priv->err_intr_handle.fd = priv->err_chnl->fd;
+	priv->err_intr_handle.type = RTE_INTR_HANDLE_EXT;
+	if (rte_intr_callback_register(&priv->err_intr_handle,
+				       mlx5_vdpa_err_interrupt_handler,
+				       priv)) {
+		priv->err_intr_handle.fd = 0;
+		DRV_LOG(ERR, "Failed to register error interrupt for device %d.",
+			priv->vid);
+		goto error;
+	} else {
+		DRV_LOG(DEBUG, "Registered error interrupt for device%d.",
+			priv->vid);
+	}
+	return 0;
+error:
+	mlx5_vdpa_err_event_unset(priv);
+	return -1;
+}
+
+void
+mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv)
+{
+	int retries = MLX5_VDPA_INTR_RETRIES;
+	int ret = -EAGAIN;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+
+	if (!priv->err_intr_handle.fd)
+		return;
+	while (retries-- && ret == -EAGAIN) {
+		ret = rte_intr_callback_unregister(&priv->err_intr_handle,
+					    mlx5_vdpa_err_interrupt_handler,
+					    priv);
+		if (ret == -EAGAIN) {
+			DRV_LOG(DEBUG, "Try again to unregister fd %d "
+				"of error interrupt, retries = %d.",
+				priv->err_intr_handle.fd, retries);
+			rte_pause();
+		}
+	}
+	memset(&priv->err_intr_handle, 0, sizeof(priv->err_intr_handle));
+	if (priv->err_chnl) {
+		/* Clean all pending events. */
+		while (mlx5_glue->devx_get_event(priv->err_chnl,
+		       &out.event_resp, sizeof(out.buf)) >=
+		       (ssize_t)sizeof(out.event_resp.cookie))
+			;
+		mlx5_glue->devx_destroy_event_channel(priv->err_chnl);
+		priv->err_chnl = NULL;
+	}
+}
+
 int
 mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
 {
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
index 17e71cf4f4..d5ac040544 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
@@ -88,11 +88,6 @@ mlx5_vdpa_virtq_unset(struct mlx5_vdpa_virtq *virtq)
 			rte_free(virtq->umems[i].buf);
 	}
 	memset(&virtq->umems, 0, sizeof(virtq->umems));
-	if (virtq->counters) {
-		claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
-		virtq->counters = NULL;
-	}
-	memset(&virtq->reset, 0, sizeof(virtq->reset));
 	if (virtq->eqp.fw_qp)
 		mlx5_vdpa_event_qp_destroy(&virtq->eqp);
 	virtq->notifier_state = MLX5_VDPA_NOTIFIER_STATE_DISABLED;
@@ -103,9 +98,19 @@ void
 mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv)
 {
 	int i;
+	struct mlx5_vdpa_virtq *virtq;
 
-	for (i = 0; i < priv->nr_virtqs; i++)
-		mlx5_vdpa_virtq_unset(&priv->virtqs[i]);
+	for (i = 0; i < priv->nr_virtqs; i++) {
+		virtq = &priv->virtqs[i];
+		mlx5_vdpa_virtq_unset(virtq);
+		if (virtq->counters) {
+			claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
+			virtq->counters = NULL;
+			memset(&virtq->reset, 0, sizeof(virtq->reset));
+		}
+		memset(virtq->err_time, 0, sizeof(virtq->err_time));
+		virtq->n_retry = 0;
+	}
 	if (priv->tis) {
 		claim_zero(mlx5_devx_cmd_destroy(priv->tis));
 		priv->tis = NULL;
@@ -138,7 +143,6 @@ mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state)
 int
 mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 {
-	struct mlx5_devx_virtq_attr attr = {0};
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
 	int ret;
 
@@ -148,6 +152,17 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 	if (ret)
 		return -1;
 	virtq->stopped = true;
+	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	return mlx5_vdpa_virtq_query(priv, index);
+}
+
+int
+mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index)
+{
+	struct mlx5_devx_virtq_attr attr = {0};
+	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+	int ret;
+
 	if (mlx5_devx_cmd_query_virtq(virtq->virtq, &attr)) {
 		DRV_LOG(ERR, "Failed to query virtq %d.", index);
 		return -1;
@@ -162,7 +177,9 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 		DRV_LOG(ERR, "Failed to set virtq %d base.", index);
 		return -1;
 	}
-	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	if (attr.state == MLX5_VIRTQ_STATE_ERROR)
+		DRV_LOG(WARNING, "vid %d vring %d hw error=%hhu",
+			priv->vid, index, attr.error_type);
 	return 0;
 }
 
@@ -195,6 +212,8 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 	unsigned int i;
 	uint16_t last_avail_idx;
 	uint16_t last_used_idx;
+	uint16_t event_num = MLX5_EVENT_TYPE_OBJECT_CHANGE;
+	uint64_t cookie;
 
 	ret = rte_vhost_get_vhost_vring(priv->vid, index, &vq);
 	if (ret)
@@ -231,8 +250,9 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 			" need event QPs and event mechanism.", index);
 	}
 	if (priv->caps.queue_counters_valid) {
-		virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
-								    (priv->ctx);
+		if (!virtq->counters)
+			virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
+								(priv->ctx);
 		if (!virtq->counters) {
 			DRV_LOG(ERR, "Failed to create virtq couners for virtq"
 				" %d.", index);
@@ -332,6 +352,19 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 				virtq->intr_handle.fd, index);
 		}
 	}
+	/* Subscribe virtq error event. */
+	virtq->version++;
+	cookie = ((uint64_t)virtq->version << 32) + index;
+	ret = mlx5_glue->devx_subscribe_devx_event(priv->err_chnl,
+						   virtq->virtq->obj,
+						   sizeof(event_num),
+						   &event_num, cookie);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to subscribe device %d virtq %d error event.",
+			priv->vid, index);
+		rte_errno = errno;
+		goto error;
+	}
 	virtq->stopped = false;
 	DRV_LOG(DEBUG, "vid %u virtq %u was created successfully.", priv->vid,
 		index);
@@ -526,12 +559,11 @@ mlx5_vdpa_virtq_stats_get(struct mlx5_vdpa_priv *priv, int qid,
 	struct mlx5_devx_virtio_q_couners_attr attr = {0};
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters, &attr);
 	if (ret) {
 		DRV_LOG(ERR, "Failed to read virtq %d stats from HW.", qid);
@@ -583,12 +615,11 @@ mlx5_vdpa_virtq_stats_reset(struct mlx5_vdpa_priv *priv, int qid)
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[qid];
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters,
 						    &virtq->reset);
 	if (ret)
-- 
2.25.1



* [dpdk-dev] [PATCH v1 1/2] common/mlx5: add virtq attributes error fields
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
  2020-10-26  7:16 ` [dpdk-dev] [PATCH 2/2] vdpa/mlx5: hardware error handling Xueming Li
@ 2020-10-26  9:12 ` Xueming Li
  2020-10-26  9:12 ` [dpdk-dev] [PATCH v1 2/2] vdpa/mlx5: hardware error handling Xueming Li
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-26  9:12 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, xuemingl, Asaf Penso

Add the needed fields for the virtq DevX object to read the error state.

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c | 3 +++
 drivers/common/mlx5/mlx5_devx_cmds.h | 1 +
 drivers/common/mlx5/mlx5_prm.h       | 9 +++++++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 8aee12d527..dc426e9b09 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1754,6 +1754,9 @@ mlx5_devx_cmd_query_virtq(struct mlx5_devx_obj *virtq_obj,
 	attr->hw_available_index = MLX5_GET16(virtio_net_q, virtq,
 					      hw_available_index);
 	attr->hw_used_index = MLX5_GET16(virtio_net_q, virtq, hw_used_index);
+	attr->state = MLX5_GET16(virtio_net_q, virtq, state);
+	attr->error_type = MLX5_GET16(virtio_net_q, virtq,
+				      virtio_q_context.error_type);
 	return ret;
 }
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index abbea67784..0ea2427b75 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -298,6 +298,7 @@ struct mlx5_devx_virtq_attr {
 		uint32_t size;
 		uint64_t offset;
 	} umems[3];
+	uint8_t error_type;
 };
 
 
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index d342263c85..7d671a3996 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -2280,7 +2280,8 @@ struct mlx5_ifc_virtio_q_bits {
 	u8 used_addr[0x40];
 	u8 available_addr[0x40];
 	u8 virtio_q_mkey[0x20];
-	u8 reserved_at_160[0x20];
+	u8 reserved_at_160[0x18];
+	u8 error_type[0x8];
 	u8 umem_1_id[0x20];
 	u8 umem_1_size[0x20];
 	u8 umem_1_offset[0x40];
@@ -2308,7 +2309,7 @@ struct mlx5_ifc_virtio_net_q_bits {
 	u8 vhost_log_page[0x5];
 	u8 reserved_at_90[0xc];
 	u8 state[0x4];
-	u8 error_type[0x8];
+	u8 reserved_at_a0[0x8];
 	u8 tisn_or_qpn[0x18];
 	u8 dirty_bitmap_mkey[0x20];
 	u8 dirty_bitmap_size[0x20];
@@ -2329,6 +2330,10 @@ struct mlx5_ifc_query_virtq_out_bits {
 	struct mlx5_ifc_virtio_net_q_bits virtq;
 };
 
+enum {
+	MLX5_EVENT_TYPE_OBJECT_CHANGE = 0x27,
+};
+
 enum {
 	MLX5_QP_ST_RC = 0x0,
 };
-- 
2.25.1



* [dpdk-dev] [PATCH v1 2/2] vdpa/mlx5: hardware error handling
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
  2020-10-26  7:16 ` [dpdk-dev] [PATCH 2/2] vdpa/mlx5: hardware error handling Xueming Li
  2020-10-26  9:12 ` [dpdk-dev] [PATCH v1 1/2] common/mlx5: add virtq attributes error fields Xueming Li
@ 2020-10-26  9:12 ` Xueming Li
  2020-10-26 11:14 ` [dpdk-dev] [PATCH v2 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-26  9:12 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, xuemingl, Asaf Penso

When a hardware error happens, the vdpa driver gets no notification
and is left silent: in a working state but with no response.

This patch subscribes to the firmware virtq error event and tries to
recover up to 3 times in 10 seconds; the virtq is stopped if the
maximum retry number is reached.

When an error happens, the PMD logs it at warning level. If recovery
fails, an error log is output. Query the virtq statistics to get the
error counters report.
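
The error counters end up in the per-virtq statistics. Below is a
hedged application-side sketch of reading them through the generic
rte_vdpa statistics API; the device name, queue id and array size are
placeholders, not values taken from this patch:

#include <inttypes.h>
#include <stdio.h>
#include <rte_common.h>
#include <rte_vdpa.h>

/* Dump all statistics of one vdpa virtq, including error counters. */
static void
dump_vdpa_queue_stats(const char *dev_name, uint16_t qid)
{
	struct rte_vdpa_device *dev = rte_vdpa_find_device_by_name(dev_name);
	struct rte_vdpa_stat_name names[16];
	struct rte_vdpa_stat stats[16];
	int n, i;

	if (dev == NULL)
		return;
	n = rte_vdpa_get_stats_names(dev, names, RTE_DIM(names));
	if (n <= 0 || n > (int)RTE_DIM(names))
		return;
	n = rte_vdpa_get_stats(dev, qid, stats, n);
	for (i = 0; i < n; i++)
		printf("%s: %" PRIu64 "\n",
		       names[stats[i].id].name, stats[i].value);
}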

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/vdpa/mlx5/mlx5_vdpa.c       |   2 +
 drivers/vdpa/mlx5/mlx5_vdpa.h       |  37 ++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_event.c | 140 ++++++++++++++++++++++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_virtq.c |  61 +++++++++---
 4 files changed, 225 insertions(+), 15 deletions(-)

diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index a8f3e4b1de..ba779c10ee 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -283,6 +283,7 @@ mlx5_vdpa_dev_close(int vid)
 	}
 	if (priv->configured)
 		ret |= mlx5_vdpa_lm_log(priv);
+	mlx5_vdpa_err_event_unset(priv);
 	mlx5_vdpa_cqe_event_unset(priv);
 	mlx5_vdpa_steer_unset(priv);
 	mlx5_vdpa_virtqs_release(priv);
@@ -318,6 +319,7 @@ mlx5_vdpa_dev_config(int vid)
 		DRV_LOG(WARNING, "MTU cannot be set on device %s.",
 				vdev->device->name);
 	if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
+	    mlx5_vdpa_err_event_setup(priv) ||
 	    mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
 	    mlx5_vdpa_cqe_event_setup(priv)) {
 		mlx5_vdpa_dev_close(vid);
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
index fcbc12ab0c..0d6886c52c 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
@@ -87,6 +87,7 @@ struct mlx5_vdpa_virtq {
 	uint16_t vq_size;
 	uint8_t notifier_state;
 	bool stopped;
+	uint32_t version;
 	struct mlx5_vdpa_priv *priv;
 	struct mlx5_devx_obj *virtq;
 	struct mlx5_devx_obj *counters;
@@ -97,6 +98,8 @@ struct mlx5_vdpa_virtq {
 		uint32_t size;
 	} umems[3];
 	struct rte_intr_handle intr_handle;
+	uint64_t err_time[3]; /* RDTSC time of recent errors. */
+	uint32_t n_retry;
 	struct mlx5_devx_virtio_q_couners_attr reset;
 };
 
@@ -143,8 +146,10 @@ struct mlx5_vdpa_priv {
 	struct rte_vhost_memory *vmem;
 	uint32_t eqn;
 	struct mlx5dv_devx_event_channel *eventc;
+	struct mlx5dv_devx_event_channel *err_chnl;
 	struct mlx5dv_devx_uar *uar;
 	struct rte_intr_handle intr_handle;
+	struct rte_intr_handle err_intr_handle;
 	struct mlx5_devx_obj *td;
 	struct mlx5_devx_obj *tis;
 	uint16_t nr_virtqs;
@@ -259,6 +264,25 @@ int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
  */
 void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
 
+/**
+ * Setup error interrupt handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Unset error event handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ */
+void mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv);
+
 /**
  * Release a virtq and all its related resources.
  *
@@ -392,6 +416,19 @@ int mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state);
  */
 int mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index);
 
+/**
+ * Query virtq information.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ * @param[in] index
+ *   The virtq index.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index);
+
 /**
  * Get virtq statistics.
  *
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
index 8a01e42794..89df699dad 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -15,11 +15,14 @@
 #include <rte_alarm.h>
 
 #include <mlx5_common.h>
+#include <mlx5_glue.h>
 
 #include "mlx5_vdpa_utils.h"
 #include "mlx5_vdpa.h"
 
 
+#define MLX5_VDPA_ERROR_TIME_SEC 3u
+
 void
 mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
 {
@@ -378,6 +381,143 @@ mlx5_vdpa_interrupt_handler(void *cb_arg)
 	pthread_mutex_unlock(&priv->vq_config_lock);
 }
 
+static void
+mlx5_vdpa_err_interrupt_handler(void *cb_arg __rte_unused)
+{
+#ifdef HAVE_IBV_DEVX_EVENT
+	struct mlx5_vdpa_priv *priv = cb_arg;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+	uint32_t vq_index, i, version;
+	struct mlx5_vdpa_virtq *virtq;
+	uint64_t sec;
+
+	pthread_mutex_lock(&priv->vq_config_lock);
+	while (mlx5_glue->devx_get_event(priv->err_chnl, &out.event_resp,
+					 sizeof(out.buf)) >=
+				       (ssize_t)sizeof(out.event_resp.cookie)) {
+		vq_index = out.event_resp.cookie && UINT32_MAX;
+		version = out.event_resp.cookie >> 32;
+		if (vq_index >= priv->nr_virtqs) {
+			DRV_LOG(ERR, "Invalid device %s error event virtq %d.",
+				priv->vdev->device->name, vq_index);
+			continue;
+		}
+		virtq = &priv->virtqs[vq_index];
+		if (!virtq->enable || virtq->version != version)
+			continue;
+		if (rte_rdtsc() / rte_get_tsc_hz() < MLX5_VDPA_ERROR_TIME_SEC)
+			continue;
+		virtq->stopped = true;
+		/* Query error info. */
+		if (mlx5_vdpa_virtq_query(priv, vq_index))
+			goto log;
+		/* Disable vq. */
+		if (mlx5_vdpa_virtq_enable(priv, vq_index, 0)) {
+			DRV_LOG(ERR, "Failed to disable virtq %d.", vq_index);
+			goto log;
+		}
+		/* Retry if error happens less than N times in 3 seconds. */
+		sec = (rte_rdtsc() - virtq->err_time[0]) / rte_get_tsc_hz();
+		if (sec > MLX5_VDPA_ERROR_TIME_SEC) {
+			/* Retry. */
+			if (mlx5_vdpa_virtq_enable(priv, vq_index, 1))
+				DRV_LOG(ERR, "Failed to enable virtq %d.",
+					vq_index);
+			else
+				DRV_LOG(WARNING, "Recover virtq %d: %u.",
+					vq_index, ++virtq->n_retry);
+		} else {
+			/* Retry timeout, give up. */
+			DRV_LOG(ERR, "Device %s virtq %d failed to recover.",
+				priv->vdev->device->name, vq_index);
+		}
+log:
+		/* Shift in current time to error time log end. */
+		for (i = 1; i < RTE_DIM(virtq->err_time); i++)
+			virtq->err_time[i - 1] = virtq->err_time[i];
+		virtq->err_time[RTE_DIM(virtq->err_time) - 1] = rte_rdtsc();
+	}
+	pthread_mutex_unlock(&priv->vq_config_lock);
+#endif
+}
+
+int
+mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv)
+{
+	int ret;
+	int flags;
+
+	/* Setup device event channel. */
+	priv->err_chnl = mlx5_glue->devx_create_event_channel(priv->ctx, 0);
+	if (!priv->err_chnl) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create device event channel %d.",
+			rte_errno);
+		goto error;
+	}
+	flags = fcntl(priv->err_chnl->fd, F_GETFL);
+	ret = fcntl(priv->err_chnl->fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change device event channel FD.");
+		goto error;
+	}
+	priv->err_intr_handle.fd = priv->err_chnl->fd;
+	priv->err_intr_handle.type = RTE_INTR_HANDLE_EXT;
+	if (rte_intr_callback_register(&priv->err_intr_handle,
+				       mlx5_vdpa_err_interrupt_handler,
+				       priv)) {
+		priv->err_intr_handle.fd = 0;
+		DRV_LOG(ERR, "Failed to register error interrupt for device %d.",
+			priv->vid);
+		goto error;
+	} else {
+		DRV_LOG(DEBUG, "Registered error interrupt for device%d.",
+			priv->vid);
+	}
+	return 0;
+error:
+	mlx5_vdpa_err_event_unset(priv);
+	return -1;
+}
+
+void
+mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv)
+{
+	int retries = MLX5_VDPA_INTR_RETRIES;
+	int ret = -EAGAIN;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+
+	if (!priv->err_intr_handle.fd)
+		return;
+	while (retries-- && ret == -EAGAIN) {
+		ret = rte_intr_callback_unregister(&priv->err_intr_handle,
+					    mlx5_vdpa_err_interrupt_handler,
+					    priv);
+		if (ret == -EAGAIN) {
+			DRV_LOG(DEBUG, "Try again to unregister fd %d "
+				"of error interrupt, retries = %d.",
+				priv->err_intr_handle.fd, retries);
+			rte_pause();
+		}
+	}
+	memset(&priv->err_intr_handle, 0, sizeof(priv->err_intr_handle));
+	if (priv->err_chnl) {
+		/* Clean all pending events. */
+		while (mlx5_glue->devx_get_event(priv->err_chnl,
+		       &out.event_resp, sizeof(out.buf)) >=
+		       (ssize_t)sizeof(out.event_resp.cookie))
+			;
+		mlx5_glue->devx_destroy_event_channel(priv->err_chnl);
+		priv->err_chnl = NULL;
+	}
+}
+
 int
 mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
 {
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
index 17e71cf4f4..d5ac040544 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
@@ -88,11 +88,6 @@ mlx5_vdpa_virtq_unset(struct mlx5_vdpa_virtq *virtq)
 			rte_free(virtq->umems[i].buf);
 	}
 	memset(&virtq->umems, 0, sizeof(virtq->umems));
-	if (virtq->counters) {
-		claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
-		virtq->counters = NULL;
-	}
-	memset(&virtq->reset, 0, sizeof(virtq->reset));
 	if (virtq->eqp.fw_qp)
 		mlx5_vdpa_event_qp_destroy(&virtq->eqp);
 	virtq->notifier_state = MLX5_VDPA_NOTIFIER_STATE_DISABLED;
@@ -103,9 +98,19 @@ void
 mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv)
 {
 	int i;
+	struct mlx5_vdpa_virtq *virtq;
 
-	for (i = 0; i < priv->nr_virtqs; i++)
-		mlx5_vdpa_virtq_unset(&priv->virtqs[i]);
+	for (i = 0; i < priv->nr_virtqs; i++) {
+		virtq = &priv->virtqs[i];
+		mlx5_vdpa_virtq_unset(virtq);
+		if (virtq->counters) {
+			claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
+			virtq->counters = NULL;
+			memset(&virtq->reset, 0, sizeof(virtq->reset));
+		}
+		memset(virtq->err_time, 0, sizeof(virtq->err_time));
+		virtq->n_retry = 0;
+	}
 	if (priv->tis) {
 		claim_zero(mlx5_devx_cmd_destroy(priv->tis));
 		priv->tis = NULL;
@@ -138,7 +143,6 @@ mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state)
 int
 mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 {
-	struct mlx5_devx_virtq_attr attr = {0};
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
 	int ret;
 
@@ -148,6 +152,17 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 	if (ret)
 		return -1;
 	virtq->stopped = true;
+	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	return mlx5_vdpa_virtq_query(priv, index);
+}
+
+int
+mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index)
+{
+	struct mlx5_devx_virtq_attr attr = {0};
+	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+	int ret;
+
 	if (mlx5_devx_cmd_query_virtq(virtq->virtq, &attr)) {
 		DRV_LOG(ERR, "Failed to query virtq %d.", index);
 		return -1;
@@ -162,7 +177,9 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 		DRV_LOG(ERR, "Failed to set virtq %d base.", index);
 		return -1;
 	}
-	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	if (attr.state == MLX5_VIRTQ_STATE_ERROR)
+		DRV_LOG(WARNING, "vid %d vring %d hw error=%hhu",
+			priv->vid, index, attr.error_type);
 	return 0;
 }
 
@@ -195,6 +212,8 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 	unsigned int i;
 	uint16_t last_avail_idx;
 	uint16_t last_used_idx;
+	uint16_t event_num = MLX5_EVENT_TYPE_OBJECT_CHANGE;
+	uint64_t cookie;
 
 	ret = rte_vhost_get_vhost_vring(priv->vid, index, &vq);
 	if (ret)
@@ -231,8 +250,9 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 			" need event QPs and event mechanism.", index);
 	}
 	if (priv->caps.queue_counters_valid) {
-		virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
-								    (priv->ctx);
+		if (!virtq->counters)
+			virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
+								(priv->ctx);
 		if (!virtq->counters) {
 			DRV_LOG(ERR, "Failed to create virtq couners for virtq"
 				" %d.", index);
@@ -332,6 +352,19 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 				virtq->intr_handle.fd, index);
 		}
 	}
+	/* Subscribe virtq error event. */
+	virtq->version++;
+	cookie = ((uint64_t)virtq->version << 32) + index;
+	ret = mlx5_glue->devx_subscribe_devx_event(priv->err_chnl,
+						   virtq->virtq->obj,
+						   sizeof(event_num),
+						   &event_num, cookie);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to subscribe device %d virtq %d error event.",
+			priv->vid, index);
+		rte_errno = errno;
+		goto error;
+	}
 	virtq->stopped = false;
 	DRV_LOG(DEBUG, "vid %u virtq %u was created successfully.", priv->vid,
 		index);
@@ -526,12 +559,11 @@ mlx5_vdpa_virtq_stats_get(struct mlx5_vdpa_priv *priv, int qid,
 	struct mlx5_devx_virtio_q_couners_attr attr = {0};
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters, &attr);
 	if (ret) {
 		DRV_LOG(ERR, "Failed to read virtq %d stats from HW.", qid);
@@ -583,12 +615,11 @@ mlx5_vdpa_virtq_stats_reset(struct mlx5_vdpa_priv *priv, int qid)
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[qid];
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters,
 						    &virtq->reset);
 	if (ret)
-- 
2.25.1



* [dpdk-dev] [PATCH v2 1/2] common/mlx5: add virtq attributes error fields
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (2 preceding siblings ...)
  2020-10-26  9:12 ` [dpdk-dev] [PATCH v1 2/2] vdpa/mlx5: hardware error handling Xueming Li
@ 2020-10-26 11:14 ` Xueming Li
  2020-10-26 11:14 ` [dpdk-dev] [PATCH v2 2/2] vdpa/mlx5: hardware error handling Xueming Li
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-26 11:14 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, xuemingl, Asaf Penso

Add the needed fields for the virtq DevX object to read the error state.

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c | 3 +++
 drivers/common/mlx5/mlx5_devx_cmds.h | 1 +
 drivers/common/mlx5/mlx5_prm.h       | 9 +++++++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 8aee12d527..dc426e9b09 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1754,6 +1754,9 @@ mlx5_devx_cmd_query_virtq(struct mlx5_devx_obj *virtq_obj,
 	attr->hw_available_index = MLX5_GET16(virtio_net_q, virtq,
 					      hw_available_index);
 	attr->hw_used_index = MLX5_GET16(virtio_net_q, virtq, hw_used_index);
+	attr->state = MLX5_GET16(virtio_net_q, virtq, state);
+	attr->error_type = MLX5_GET16(virtio_net_q, virtq,
+				      virtio_q_context.error_type);
 	return ret;
 }
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index abbea67784..0ea2427b75 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -298,6 +298,7 @@ struct mlx5_devx_virtq_attr {
 		uint32_t size;
 		uint64_t offset;
 	} umems[3];
+	uint8_t error_type;
 };
 
 
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index d342263c85..7d671a3996 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -2280,7 +2280,8 @@ struct mlx5_ifc_virtio_q_bits {
 	u8 used_addr[0x40];
 	u8 available_addr[0x40];
 	u8 virtio_q_mkey[0x20];
-	u8 reserved_at_160[0x20];
+	u8 reserved_at_160[0x18];
+	u8 error_type[0x8];
 	u8 umem_1_id[0x20];
 	u8 umem_1_size[0x20];
 	u8 umem_1_offset[0x40];
@@ -2308,7 +2309,7 @@ struct mlx5_ifc_virtio_net_q_bits {
 	u8 vhost_log_page[0x5];
 	u8 reserved_at_90[0xc];
 	u8 state[0x4];
-	u8 error_type[0x8];
+	u8 reserved_at_a0[0x8];
 	u8 tisn_or_qpn[0x18];
 	u8 dirty_bitmap_mkey[0x20];
 	u8 dirty_bitmap_size[0x20];
@@ -2329,6 +2330,10 @@ struct mlx5_ifc_query_virtq_out_bits {
 	struct mlx5_ifc_virtio_net_q_bits virtq;
 };
 
+enum {
+	MLX5_EVENT_TYPE_OBJECT_CHANGE = 0x27,
+};
+
 enum {
 	MLX5_QP_ST_RC = 0x0,
 };
-- 
2.25.1



* [dpdk-dev] [PATCH v2 2/2] vdpa/mlx5: hardware error handling
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (3 preceding siblings ...)
  2020-10-26 11:14 ` [dpdk-dev] [PATCH v2 1/2] common/mlx5: add virtq attributes error fields Xueming Li
@ 2020-10-26 11:14 ` Xueming Li
  2020-10-26 15:04 ` [dpdk-dev] [PATCH v3 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-26 11:14 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, xuemingl, Asaf Penso

When a hardware error happens, the vdpa driver gets no notification
and is left silent: in a working state but with no response.

This patch subscribes to the firmware virtq error event and tries to
recover up to 3 times in 10 seconds; the virtq is stopped if the
maximum retry number is reached.

When an error happens, the PMD logs it at warning level. If recovery
fails, an error log is output. Query the virtq statistics to get the
error counters report.
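
The subscription ties each error event to a specific incarnation of a
queue through the 64-bit DevX event cookie, so the handler can discard
events from a queue that has since been recreated. A small sketch of
that cookie layout, mirroring the pack/unpack done in the diff below
(helper names here are illustrative only):

#include <stdint.h>

/* Queue version in the upper 32 bits, queue index in the lower 32 bits. */
static inline uint64_t
vq_err_cookie_pack(uint32_t version, uint32_t index)
{
	return ((uint64_t)version << 32) | index;
}

static inline uint32_t
vq_err_cookie_index(uint64_t cookie)
{
	return (uint32_t)(cookie & UINT32_MAX);
}

static inline uint32_t
vq_err_cookie_version(uint64_t cookie)
{
	return (uint32_t)(cookie >> 32);
}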

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/vdpa/mlx5/mlx5_vdpa.c       |   2 +
 drivers/vdpa/mlx5/mlx5_vdpa.h       |  37 ++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_event.c | 142 ++++++++++++++++++++++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_virtq.c |  61 +++++++++---
 4 files changed, 227 insertions(+), 15 deletions(-)

diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index a8f3e4b1de..ba779c10ee 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -283,6 +283,7 @@ mlx5_vdpa_dev_close(int vid)
 	}
 	if (priv->configured)
 		ret |= mlx5_vdpa_lm_log(priv);
+	mlx5_vdpa_err_event_unset(priv);
 	mlx5_vdpa_cqe_event_unset(priv);
 	mlx5_vdpa_steer_unset(priv);
 	mlx5_vdpa_virtqs_release(priv);
@@ -318,6 +319,7 @@ mlx5_vdpa_dev_config(int vid)
 		DRV_LOG(WARNING, "MTU cannot be set on device %s.",
 				vdev->device->name);
 	if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
+	    mlx5_vdpa_err_event_setup(priv) ||
 	    mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
 	    mlx5_vdpa_cqe_event_setup(priv)) {
 		mlx5_vdpa_dev_close(vid);
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
index fcbc12ab0c..0d6886c52c 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
@@ -87,6 +87,7 @@ struct mlx5_vdpa_virtq {
 	uint16_t vq_size;
 	uint8_t notifier_state;
 	bool stopped;
+	uint32_t version;
 	struct mlx5_vdpa_priv *priv;
 	struct mlx5_devx_obj *virtq;
 	struct mlx5_devx_obj *counters;
@@ -97,6 +98,8 @@ struct mlx5_vdpa_virtq {
 		uint32_t size;
 	} umems[3];
 	struct rte_intr_handle intr_handle;
+	uint64_t err_time[3]; /* RDTSC time of recent errors. */
+	uint32_t n_retry;
 	struct mlx5_devx_virtio_q_couners_attr reset;
 };
 
@@ -143,8 +146,10 @@ struct mlx5_vdpa_priv {
 	struct rte_vhost_memory *vmem;
 	uint32_t eqn;
 	struct mlx5dv_devx_event_channel *eventc;
+	struct mlx5dv_devx_event_channel *err_chnl;
 	struct mlx5dv_devx_uar *uar;
 	struct rte_intr_handle intr_handle;
+	struct rte_intr_handle err_intr_handle;
 	struct mlx5_devx_obj *td;
 	struct mlx5_devx_obj *tis;
 	uint16_t nr_virtqs;
@@ -259,6 +264,25 @@ int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
  */
 void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
 
+/**
+ * Setup error interrupt handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Unset error event handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ */
+void mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv);
+
 /**
  * Release a virtq and all its related resources.
  *
@@ -392,6 +416,19 @@ int mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state);
  */
 int mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index);
 
+/**
+ * Query virtq information.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ * @param[in] index
+ *   The virtq index.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index);
+
 /**
  * Get virtq statistics.
  *
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
index 8a01e42794..7d75b09757 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -15,11 +15,14 @@
 #include <rte_alarm.h>
 
 #include <mlx5_common.h>
+#include <mlx5_glue.h>
 
 #include "mlx5_vdpa_utils.h"
 #include "mlx5_vdpa.h"
 
 
+#define MLX5_VDPA_ERROR_TIME_SEC 3u
+
 void
 mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
 {
@@ -378,6 +381,145 @@ mlx5_vdpa_interrupt_handler(void *cb_arg)
 	pthread_mutex_unlock(&priv->vq_config_lock);
 }
 
+static void
+mlx5_vdpa_err_interrupt_handler(void *cb_arg __rte_unused)
+{
+#ifdef HAVE_IBV_DEVX_EVENT
+	struct mlx5_vdpa_priv *priv = cb_arg;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+	uint32_t vq_index, i, version;
+	struct mlx5_vdpa_virtq *virtq;
+	uint64_t sec;
+
+	pthread_mutex_lock(&priv->vq_config_lock);
+	while (mlx5_glue->devx_get_event(priv->err_chnl, &out.event_resp,
+					 sizeof(out.buf)) >=
+				       (ssize_t)sizeof(out.event_resp.cookie)) {
+		vq_index = out.event_resp.cookie && UINT32_MAX;
+		version = out.event_resp.cookie >> 32;
+		if (vq_index >= priv->nr_virtqs) {
+			DRV_LOG(ERR, "Invalid device %s error event virtq %d.",
+				priv->vdev->device->name, vq_index);
+			continue;
+		}
+		virtq = &priv->virtqs[vq_index];
+		if (!virtq->enable || virtq->version != version)
+			continue;
+		if (rte_rdtsc() / rte_get_tsc_hz() < MLX5_VDPA_ERROR_TIME_SEC)
+			continue;
+		virtq->stopped = true;
+		/* Query error info. */
+		if (mlx5_vdpa_virtq_query(priv, vq_index))
+			goto log;
+		/* Disable vq. */
+		if (mlx5_vdpa_virtq_enable(priv, vq_index, 0)) {
+			DRV_LOG(ERR, "Failed to disable virtq %d.", vq_index);
+			goto log;
+		}
+		/* Retry if error happens less than N times in 3 seconds. */
+		sec = (rte_rdtsc() - virtq->err_time[0]) / rte_get_tsc_hz();
+		if (sec > MLX5_VDPA_ERROR_TIME_SEC) {
+			/* Retry. */
+			if (mlx5_vdpa_virtq_enable(priv, vq_index, 1))
+				DRV_LOG(ERR, "Failed to enable virtq %d.",
+					vq_index);
+			else
+				DRV_LOG(WARNING, "Recover virtq %d: %u.",
+					vq_index, ++virtq->n_retry);
+		} else {
+			/* Retry timeout, give up. */
+			DRV_LOG(ERR, "Device %s virtq %d failed to recover.",
+				priv->vdev->device->name, vq_index);
+		}
+log:
+		/* Shift in current time to error time log end. */
+		for (i = 1; i < RTE_DIM(virtq->err_time); i++)
+			virtq->err_time[i - 1] = virtq->err_time[i];
+		virtq->err_time[RTE_DIM(virtq->err_time) - 1] = rte_rdtsc();
+	}
+	pthread_mutex_unlock(&priv->vq_config_lock);
+#endif
+}
+
+int
+mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv)
+{
+	int ret;
+	int flags;
+
+	/* Setup device event channel. */
+	priv->err_chnl = mlx5_glue->devx_create_event_channel(priv->ctx, 0);
+	if (!priv->err_chnl) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create device event channel %d.",
+			rte_errno);
+		goto error;
+	}
+	flags = fcntl(priv->err_chnl->fd, F_GETFL);
+	ret = fcntl(priv->err_chnl->fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change device event channel FD.");
+		goto error;
+	}
+	priv->err_intr_handle.fd = priv->err_chnl->fd;
+	priv->err_intr_handle.type = RTE_INTR_HANDLE_EXT;
+	if (rte_intr_callback_register(&priv->err_intr_handle,
+				       mlx5_vdpa_err_interrupt_handler,
+				       priv)) {
+		priv->err_intr_handle.fd = 0;
+		DRV_LOG(ERR, "Failed to register error interrupt for device %d.",
+			priv->vid);
+		goto error;
+	} else {
+		DRV_LOG(DEBUG, "Registered error interrupt for device%d.",
+			priv->vid);
+	}
+	return 0;
+error:
+	mlx5_vdpa_err_event_unset(priv);
+	return -1;
+}
+
+void
+mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv)
+{
+	if (!priv->err_intr_handle.fd)
+		return;
+#ifdef HAVE_IBV_DEVX_EVENT
+	int retries = MLX5_VDPA_INTR_RETRIES;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+	int ret = -EAGAIN;
+
+	while (retries-- && ret == -EAGAIN) {
+		ret = rte_intr_callback_unregister(&priv->err_intr_handle,
+					    mlx5_vdpa_err_interrupt_handler,
+					    priv);
+		if (ret == -EAGAIN) {
+			DRV_LOG(DEBUG, "Try again to unregister fd %d "
+				"of error interrupt, retries = %d.",
+				priv->err_intr_handle.fd, retries);
+			rte_pause();
+		}
+	}
+#endif
+	memset(&priv->err_intr_handle, 0, sizeof(priv->err_intr_handle));
+	if (priv->err_chnl) {
+		/* Clean all pending events. */
+		while (mlx5_glue->devx_get_event(priv->err_chnl,
+		       &out.event_resp, sizeof(out.buf)) >=
+		       (ssize_t)sizeof(out.event_resp.cookie))
+			;
+		mlx5_glue->devx_destroy_event_channel(priv->err_chnl);
+		priv->err_chnl = NULL;
+	}
+}
+
 int
 mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
 {
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
index 17e71cf4f4..d5ac040544 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
@@ -88,11 +88,6 @@ mlx5_vdpa_virtq_unset(struct mlx5_vdpa_virtq *virtq)
 			rte_free(virtq->umems[i].buf);
 	}
 	memset(&virtq->umems, 0, sizeof(virtq->umems));
-	if (virtq->counters) {
-		claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
-		virtq->counters = NULL;
-	}
-	memset(&virtq->reset, 0, sizeof(virtq->reset));
 	if (virtq->eqp.fw_qp)
 		mlx5_vdpa_event_qp_destroy(&virtq->eqp);
 	virtq->notifier_state = MLX5_VDPA_NOTIFIER_STATE_DISABLED;
@@ -103,9 +98,19 @@ void
 mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv)
 {
 	int i;
+	struct mlx5_vdpa_virtq *virtq;
 
-	for (i = 0; i < priv->nr_virtqs; i++)
-		mlx5_vdpa_virtq_unset(&priv->virtqs[i]);
+	for (i = 0; i < priv->nr_virtqs; i++) {
+		virtq = &priv->virtqs[i];
+		mlx5_vdpa_virtq_unset(virtq);
+		if (virtq->counters) {
+			claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
+			virtq->counters = NULL;
+			memset(&virtq->reset, 0, sizeof(virtq->reset));
+		}
+		memset(virtq->err_time, 0, sizeof(virtq->err_time));
+		virtq->n_retry = 0;
+	}
 	if (priv->tis) {
 		claim_zero(mlx5_devx_cmd_destroy(priv->tis));
 		priv->tis = NULL;
@@ -138,7 +143,6 @@ mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state)
 int
 mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 {
-	struct mlx5_devx_virtq_attr attr = {0};
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
 	int ret;
 
@@ -148,6 +152,17 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 	if (ret)
 		return -1;
 	virtq->stopped = true;
+	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	return mlx5_vdpa_virtq_query(priv, index);
+}
+
+int
+mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index)
+{
+	struct mlx5_devx_virtq_attr attr = {0};
+	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+	int ret;
+
 	if (mlx5_devx_cmd_query_virtq(virtq->virtq, &attr)) {
 		DRV_LOG(ERR, "Failed to query virtq %d.", index);
 		return -1;
@@ -162,7 +177,9 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 		DRV_LOG(ERR, "Failed to set virtq %d base.", index);
 		return -1;
 	}
-	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	if (attr.state == MLX5_VIRTQ_STATE_ERROR)
+		DRV_LOG(WARNING, "vid %d vring %d hw error=%hhu",
+			priv->vid, index, attr.error_type);
 	return 0;
 }
 
@@ -195,6 +212,8 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 	unsigned int i;
 	uint16_t last_avail_idx;
 	uint16_t last_used_idx;
+	uint16_t event_num = MLX5_EVENT_TYPE_OBJECT_CHANGE;
+	uint64_t cookie;
 
 	ret = rte_vhost_get_vhost_vring(priv->vid, index, &vq);
 	if (ret)
@@ -231,8 +250,9 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 			" need event QPs and event mechanism.", index);
 	}
 	if (priv->caps.queue_counters_valid) {
-		virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
-								    (priv->ctx);
+		if (!virtq->counters)
+			virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
+								(priv->ctx);
 		if (!virtq->counters) {
 			DRV_LOG(ERR, "Failed to create virtq couners for virtq"
 				" %d.", index);
@@ -332,6 +352,19 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 				virtq->intr_handle.fd, index);
 		}
 	}
+	/* Subscribe virtq error event. */
+	virtq->version++;
+	cookie = ((uint64_t)virtq->version << 32) + index;
+	ret = mlx5_glue->devx_subscribe_devx_event(priv->err_chnl,
+						   virtq->virtq->obj,
+						   sizeof(event_num),
+						   &event_num, cookie);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to subscribe device %d virtq %d error event.",
+			priv->vid, index);
+		rte_errno = errno;
+		goto error;
+	}
 	virtq->stopped = false;
 	DRV_LOG(DEBUG, "vid %u virtq %u was created successfully.", priv->vid,
 		index);
@@ -526,12 +559,11 @@ mlx5_vdpa_virtq_stats_get(struct mlx5_vdpa_priv *priv, int qid,
 	struct mlx5_devx_virtio_q_couners_attr attr = {0};
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters, &attr);
 	if (ret) {
 		DRV_LOG(ERR, "Failed to read virtq %d stats from HW.", qid);
@@ -583,12 +615,11 @@ mlx5_vdpa_virtq_stats_reset(struct mlx5_vdpa_priv *priv, int qid)
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[qid];
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters,
 						    &virtq->reset);
 	if (ret)
-- 
2.25.1



* [dpdk-dev] [PATCH v3 1/2] common/mlx5: add virtq attributes error fields
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (4 preceding siblings ...)
  2020-10-26 11:14 ` [dpdk-dev] [PATCH v2 2/2] vdpa/mlx5: hardware error handling Xueming Li
@ 2020-10-26 15:04 ` Xueming Li
  2020-10-26 15:04 ` [dpdk-dev] [PATCH v3 2/2] vdpa/mlx5: hardware error handling Xueming Li
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-26 15:04 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, xuemingl, Asaf Penso

Add the needed fields for the virtq DevX object to read the error state.

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c | 3 +++
 drivers/common/mlx5/mlx5_devx_cmds.h | 1 +
 drivers/common/mlx5/mlx5_prm.h       | 9 +++++++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 8aee12d527..dc426e9b09 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1754,6 +1754,9 @@ mlx5_devx_cmd_query_virtq(struct mlx5_devx_obj *virtq_obj,
 	attr->hw_available_index = MLX5_GET16(virtio_net_q, virtq,
 					      hw_available_index);
 	attr->hw_used_index = MLX5_GET16(virtio_net_q, virtq, hw_used_index);
+	attr->state = MLX5_GET16(virtio_net_q, virtq, state);
+	attr->error_type = MLX5_GET16(virtio_net_q, virtq,
+				      virtio_q_context.error_type);
 	return ret;
 }
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index abbea67784..0ea2427b75 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -298,6 +298,7 @@ struct mlx5_devx_virtq_attr {
 		uint32_t size;
 		uint64_t offset;
 	} umems[3];
+	uint8_t error_type;
 };
 
 
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index d342263c85..7d671a3996 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -2280,7 +2280,8 @@ struct mlx5_ifc_virtio_q_bits {
 	u8 used_addr[0x40];
 	u8 available_addr[0x40];
 	u8 virtio_q_mkey[0x20];
-	u8 reserved_at_160[0x20];
+	u8 reserved_at_160[0x18];
+	u8 error_type[0x8];
 	u8 umem_1_id[0x20];
 	u8 umem_1_size[0x20];
 	u8 umem_1_offset[0x40];
@@ -2308,7 +2309,7 @@ struct mlx5_ifc_virtio_net_q_bits {
 	u8 vhost_log_page[0x5];
 	u8 reserved_at_90[0xc];
 	u8 state[0x4];
-	u8 error_type[0x8];
+	u8 reserved_at_a0[0x8];
 	u8 tisn_or_qpn[0x18];
 	u8 dirty_bitmap_mkey[0x20];
 	u8 dirty_bitmap_size[0x20];
@@ -2329,6 +2330,10 @@ struct mlx5_ifc_query_virtq_out_bits {
 	struct mlx5_ifc_virtio_net_q_bits virtq;
 };
 
+enum {
+	MLX5_EVENT_TYPE_OBJECT_CHANGE = 0x27,
+};
+
 enum {
 	MLX5_QP_ST_RC = 0x0,
 };
-- 
2.25.1



* [dpdk-dev] [PATCH v3 2/2] vdpa/mlx5: hardware error handling
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (5 preceding siblings ...)
  2020-10-26 15:04 ` [dpdk-dev] [PATCH v3 1/2] common/mlx5: add virtq attributes error fields Xueming Li
@ 2020-10-26 15:04 ` Xueming Li
  2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 1/2] common/mlx5: add virtq attributes error fields Xueming Li
  2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 2/2] vdpa/mlx5: hardware error handling Xueming Li
  8 siblings, 0 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-26 15:04 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, xuemingl, Asaf Penso

When a hardware error happens, the vdpa driver gets no notification
and is left silent: in a working state but with no response.

This patch subscribes to the firmware virtq error event and tries to
recover up to 3 times in 10 seconds; the virtq is stopped if the
maximum retry number is reached.

When an error happens, the PMD logs it at warning level. If recovery
fails, an error log is output. Query the virtq statistics to get the
error counters report.
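
The error events are delivered by hooking the DevX event channel fd
into the DPDK interrupt thread, as the setup code in the diff below
does. A hedged sketch of that pattern, with a placeholder callback and
fd rather than the driver's own:

#include <fcntl.h>
#include <string.h>
#include <rte_interrupts.h>

/*
 * Register a callback on an externally owned event fd. The fd is made
 * non-blocking so the interrupt thread can drain it without stalling.
 */
static int
register_err_fd(struct rte_intr_handle *ih, int fd,
		rte_intr_callback_fn cb, void *arg)
{
	int flags = fcntl(fd, F_GETFL);

	if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK))
		return -1;
	memset(ih, 0, sizeof(*ih));
	ih->fd = fd;
	ih->type = RTE_INTR_HANDLE_EXT; /* fd managed outside of DPDK */
	return rte_intr_callback_register(ih, cb, arg);
}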

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/vdpa/mlx5/mlx5_vdpa.c       |   2 +
 drivers/vdpa/mlx5/mlx5_vdpa.h       |  37 +++++++
 drivers/vdpa/mlx5/mlx5_vdpa_event.c | 143 ++++++++++++++++++++++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_virtq.c |  61 +++++++++---
 4 files changed, 228 insertions(+), 15 deletions(-)

diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index a8f3e4b1de..ba779c10ee 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -283,6 +283,7 @@ mlx5_vdpa_dev_close(int vid)
 	}
 	if (priv->configured)
 		ret |= mlx5_vdpa_lm_log(priv);
+	mlx5_vdpa_err_event_unset(priv);
 	mlx5_vdpa_cqe_event_unset(priv);
 	mlx5_vdpa_steer_unset(priv);
 	mlx5_vdpa_virtqs_release(priv);
@@ -318,6 +319,7 @@ mlx5_vdpa_dev_config(int vid)
 		DRV_LOG(WARNING, "MTU cannot be set on device %s.",
 				vdev->device->name);
 	if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
+	    mlx5_vdpa_err_event_setup(priv) ||
 	    mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
 	    mlx5_vdpa_cqe_event_setup(priv)) {
 		mlx5_vdpa_dev_close(vid);
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
index fcbc12ab0c..0d6886c52c 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
@@ -87,6 +87,7 @@ struct mlx5_vdpa_virtq {
 	uint16_t vq_size;
 	uint8_t notifier_state;
 	bool stopped;
+	uint32_t version;
 	struct mlx5_vdpa_priv *priv;
 	struct mlx5_devx_obj *virtq;
 	struct mlx5_devx_obj *counters;
@@ -97,6 +98,8 @@ struct mlx5_vdpa_virtq {
 		uint32_t size;
 	} umems[3];
 	struct rte_intr_handle intr_handle;
+	uint64_t err_time[3]; /* RDTSC time of recent errors. */
+	uint32_t n_retry;
 	struct mlx5_devx_virtio_q_couners_attr reset;
 };
 
@@ -143,8 +146,10 @@ struct mlx5_vdpa_priv {
 	struct rte_vhost_memory *vmem;
 	uint32_t eqn;
 	struct mlx5dv_devx_event_channel *eventc;
+	struct mlx5dv_devx_event_channel *err_chnl;
 	struct mlx5dv_devx_uar *uar;
 	struct rte_intr_handle intr_handle;
+	struct rte_intr_handle err_intr_handle;
 	struct mlx5_devx_obj *td;
 	struct mlx5_devx_obj *tis;
 	uint16_t nr_virtqs;
@@ -259,6 +264,25 @@ int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
  */
 void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
 
+/**
+ * Setup error interrupt handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Unset error event handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ */
+void mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv);
+
 /**
  * Release a virtq and all its related resources.
  *
@@ -392,6 +416,19 @@ int mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state);
  */
 int mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index);
 
+/**
+ * Query virtq information.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ * @param[in] index
+ *   The virtq index.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index);
+
 /**
  * Get virtq statistics.
  *
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
index 8a01e42794..187a8d8d41 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -15,11 +15,14 @@
 #include <rte_alarm.h>
 
 #include <mlx5_common.h>
+#include <mlx5_glue.h>
 
 #include "mlx5_vdpa_utils.h"
 #include "mlx5_vdpa.h"
 
 
+#define MLX5_VDPA_ERROR_TIME_SEC 3u
+
 void
 mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
 {
@@ -378,6 +381,146 @@ mlx5_vdpa_interrupt_handler(void *cb_arg)
 	pthread_mutex_unlock(&priv->vq_config_lock);
 }
 
+static void
+mlx5_vdpa_err_interrupt_handler(void *cb_arg __rte_unused)
+{
+#ifdef HAVE_IBV_DEVX_EVENT
+	struct mlx5_vdpa_priv *priv = cb_arg;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+	uint32_t vq_index, i, version;
+	struct mlx5_vdpa_virtq *virtq;
+	uint64_t sec;
+
+	pthread_mutex_lock(&priv->vq_config_lock);
+	while (mlx5_glue->devx_get_event(priv->err_chnl, &out.event_resp,
+					 sizeof(out.buf)) >=
+				       (ssize_t)sizeof(out.event_resp.cookie)) {
+		vq_index = out.event_resp.cookie & UINT32_MAX;
+		version = out.event_resp.cookie >> 32;
+		if (vq_index >= priv->nr_virtqs) {
+			DRV_LOG(ERR, "Invalid device %s error event virtq %d.",
+				priv->vdev->device->name, vq_index);
+			continue;
+		}
+		virtq = &priv->virtqs[vq_index];
+		if (!virtq->enable || virtq->version != version)
+			continue;
+		if (rte_rdtsc() / rte_get_tsc_hz() < MLX5_VDPA_ERROR_TIME_SEC)
+			continue;
+		virtq->stopped = true;
+		/* Query error info. */
+		if (mlx5_vdpa_virtq_query(priv, vq_index))
+			goto log;
+		/* Disable vq. */
+		if (mlx5_vdpa_virtq_enable(priv, vq_index, 0)) {
+			DRV_LOG(ERR, "Failed to disable virtq %d.", vq_index);
+			goto log;
+		}
+		/* Retry if error happens less than N times in 3 seconds. */
+		sec = (rte_rdtsc() - virtq->err_time[0]) / rte_get_tsc_hz();
+		if (sec > MLX5_VDPA_ERROR_TIME_SEC) {
+			/* Retry. */
+			if (mlx5_vdpa_virtq_enable(priv, vq_index, 1))
+				DRV_LOG(ERR, "Failed to enable virtq %d.",
+					vq_index);
+			else
+				DRV_LOG(WARNING, "Recover virtq %d: %u.",
+					vq_index, ++virtq->n_retry);
+		} else {
+			/* Retry timeout, give up. */
+			DRV_LOG(ERR, "Device %s virtq %d failed to recover.",
+				priv->vdev->device->name, vq_index);
+		}
+log:
+		/* Shift in current time to error time log end. */
+		for (i = 1; i < RTE_DIM(virtq->err_time); i++)
+			virtq->err_time[i - 1] = virtq->err_time[i];
+		virtq->err_time[RTE_DIM(virtq->err_time) - 1] = rte_rdtsc();
+	}
+	pthread_mutex_unlock(&priv->vq_config_lock);
+#endif
+}
+
+int
+mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv)
+{
+	int ret;
+	int flags;
+
+	/* Setup device event channel. */
+	priv->err_chnl = mlx5_glue->devx_create_event_channel(priv->ctx, 0);
+	if (!priv->err_chnl) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create device event channel %d.",
+			rte_errno);
+		goto error;
+	}
+	flags = fcntl(priv->err_chnl->fd, F_GETFL);
+	ret = fcntl(priv->err_chnl->fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change device event channel FD.");
+		goto error;
+	}
+	priv->err_intr_handle.fd = priv->err_chnl->fd;
+	priv->err_intr_handle.type = RTE_INTR_HANDLE_EXT;
+	if (rte_intr_callback_register(&priv->err_intr_handle,
+				       mlx5_vdpa_err_interrupt_handler,
+				       priv)) {
+		priv->err_intr_handle.fd = 0;
+		DRV_LOG(ERR, "Failed to register error interrupt for device %d.",
+			priv->vid);
+		goto error;
+	} else {
+		DRV_LOG(DEBUG, "Registered error interrupt for device%d.",
+			priv->vid);
+	}
+	return 0;
+error:
+	mlx5_vdpa_err_event_unset(priv);
+	return -1;
+}
+
+void
+mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv)
+{
+	int retries = MLX5_VDPA_INTR_RETRIES;
+	int ret = -EAGAIN;
+
+	if (!priv->err_intr_handle.fd)
+		return;
+	while (retries-- && ret == -EAGAIN) {
+		ret = rte_intr_callback_unregister(&priv->err_intr_handle,
+					    mlx5_vdpa_err_interrupt_handler,
+					    priv);
+		if (ret == -EAGAIN) {
+			DRV_LOG(DEBUG, "Try again to unregister fd %d "
+				"of error interrupt, retries = %d.",
+				priv->err_intr_handle.fd, retries);
+			rte_pause();
+		}
+	}
+	memset(&priv->err_intr_handle, 0, sizeof(priv->err_intr_handle));
+	if (priv->err_chnl) {
+#ifdef HAVE_IBV_DEVX_EVENT
+		union {
+			struct mlx5dv_devx_async_event_hdr event_resp;
+			uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+		} out;
+
+		/* Clean all pending events. */
+		while (mlx5_glue->devx_get_event(priv->err_chnl,
+		       &out.event_resp, sizeof(out.buf)) >=
+		       (ssize_t)sizeof(out.event_resp.cookie))
+			;
+#endif
+		mlx5_glue->devx_destroy_event_channel(priv->err_chnl);
+		priv->err_chnl = NULL;
+	}
+}
+
 int
 mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
 {
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
index 17e71cf4f4..d5ac040544 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
@@ -88,11 +88,6 @@ mlx5_vdpa_virtq_unset(struct mlx5_vdpa_virtq *virtq)
 			rte_free(virtq->umems[i].buf);
 	}
 	memset(&virtq->umems, 0, sizeof(virtq->umems));
-	if (virtq->counters) {
-		claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
-		virtq->counters = NULL;
-	}
-	memset(&virtq->reset, 0, sizeof(virtq->reset));
 	if (virtq->eqp.fw_qp)
 		mlx5_vdpa_event_qp_destroy(&virtq->eqp);
 	virtq->notifier_state = MLX5_VDPA_NOTIFIER_STATE_DISABLED;
@@ -103,9 +98,19 @@ void
 mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv)
 {
 	int i;
+	struct mlx5_vdpa_virtq *virtq;
 
-	for (i = 0; i < priv->nr_virtqs; i++)
-		mlx5_vdpa_virtq_unset(&priv->virtqs[i]);
+	for (i = 0; i < priv->nr_virtqs; i++) {
+		virtq = &priv->virtqs[i];
+		mlx5_vdpa_virtq_unset(virtq);
+		if (virtq->counters) {
+			claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
+			virtq->counters = NULL;
+			memset(&virtq->reset, 0, sizeof(virtq->reset));
+		}
+		memset(virtq->err_time, 0, sizeof(virtq->err_time));
+		virtq->n_retry = 0;
+	}
 	if (priv->tis) {
 		claim_zero(mlx5_devx_cmd_destroy(priv->tis));
 		priv->tis = NULL;
@@ -138,7 +143,6 @@ mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state)
 int
 mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 {
-	struct mlx5_devx_virtq_attr attr = {0};
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
 	int ret;
 
@@ -148,6 +152,17 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 	if (ret)
 		return -1;
 	virtq->stopped = true;
+	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	return mlx5_vdpa_virtq_query(priv, index);
+}
+
+int
+mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index)
+{
+	struct mlx5_devx_virtq_attr attr = {0};
+	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+	int ret;
+
 	if (mlx5_devx_cmd_query_virtq(virtq->virtq, &attr)) {
 		DRV_LOG(ERR, "Failed to query virtq %d.", index);
 		return -1;
@@ -162,7 +177,9 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 		DRV_LOG(ERR, "Failed to set virtq %d base.", index);
 		return -1;
 	}
-	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	if (attr.state == MLX5_VIRTQ_STATE_ERROR)
+		DRV_LOG(WARNING, "vid %d vring %d hw error=%hhu",
+			priv->vid, index, attr.error_type);
 	return 0;
 }
 
@@ -195,6 +212,8 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 	unsigned int i;
 	uint16_t last_avail_idx;
 	uint16_t last_used_idx;
+	uint16_t event_num = MLX5_EVENT_TYPE_OBJECT_CHANGE;
+	uint64_t cookie;
 
 	ret = rte_vhost_get_vhost_vring(priv->vid, index, &vq);
 	if (ret)
@@ -231,8 +250,9 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 			" need event QPs and event mechanism.", index);
 	}
 	if (priv->caps.queue_counters_valid) {
-		virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
-								    (priv->ctx);
+		if (!virtq->counters)
+			virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
+								(priv->ctx);
 		if (!virtq->counters) {
 			DRV_LOG(ERR, "Failed to create virtq couners for virtq"
 				" %d.", index);
@@ -332,6 +352,19 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 				virtq->intr_handle.fd, index);
 		}
 	}
+	/* Subscribe virtq error event. */
+	virtq->version++;
+	cookie = ((uint64_t)virtq->version << 32) + index;
+	ret = mlx5_glue->devx_subscribe_devx_event(priv->err_chnl,
+						   virtq->virtq->obj,
+						   sizeof(event_num),
+						   &event_num, cookie);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to subscribe device %d virtq %d error event.",
+			priv->vid, index);
+		rte_errno = errno;
+		goto error;
+	}
 	virtq->stopped = false;
 	DRV_LOG(DEBUG, "vid %u virtq %u was created successfully.", priv->vid,
 		index);
@@ -526,12 +559,11 @@ mlx5_vdpa_virtq_stats_get(struct mlx5_vdpa_priv *priv, int qid,
 	struct mlx5_devx_virtio_q_couners_attr attr = {0};
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters, &attr);
 	if (ret) {
 		DRV_LOG(ERR, "Failed to read virtq %d stats from HW.", qid);
@@ -583,12 +615,11 @@ mlx5_vdpa_virtq_stats_reset(struct mlx5_vdpa_priv *priv, int qid)
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[qid];
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters,
 						    &virtq->reset);
 	if (ret)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [dpdk-dev] [PATCH v4 1/2] common/mlx5: add virtq attributes error fields
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (6 preceding siblings ...)
  2020-10-26 15:04 ` [dpdk-dev] [PATCH v3 2/2] vdpa/mlx5: hardware error handling Xueming Li
@ 2020-10-27  8:28 ` Xueming Li
  2020-10-28 13:59   ` Maxime Coquelin
  2020-10-29  8:29   ` Maxime Coquelin
  2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 2/2] vdpa/mlx5: hardware error handling Xueming Li
  8 siblings, 2 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-27  8:28 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko, Maxime Coquelin
  Cc: dev, xuemingl, Asaf Penso

Add the needed fields for virtq DevX object to read the error state.
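
As context, a hedged sketch of the intended use of the new attributes (not the
actual caller added by the vdpa patch): query_and_check() is hypothetical and
the snippet assumes the mlx5 common headers; the DevX call and the attribute
fields are the ones touched by this series.

static int
query_and_check(struct mlx5_devx_obj *virtq_obj)
{
	struct mlx5_devx_virtq_attr attr = { 0 };

	if (mlx5_devx_cmd_query_virtq(virtq_obj, &attr))
		return -1;
	if (attr.state == MLX5_VIRTQ_STATE_ERROR)
		/* attr.error_type carries the firmware error code. */
		return -(int)attr.error_type;
	return 0;
}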

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c | 3 +++
 drivers/common/mlx5/mlx5_devx_cmds.h | 1 +
 drivers/common/mlx5/mlx5_prm.h       | 9 +++++++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 8aee12d527..dc426e9b09 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1754,6 +1754,9 @@ mlx5_devx_cmd_query_virtq(struct mlx5_devx_obj *virtq_obj,
 	attr->hw_available_index = MLX5_GET16(virtio_net_q, virtq,
 					      hw_available_index);
 	attr->hw_used_index = MLX5_GET16(virtio_net_q, virtq, hw_used_index);
+	attr->state = MLX5_GET16(virtio_net_q, virtq, state);
+	attr->error_type = MLX5_GET16(virtio_net_q, virtq,
+				      virtio_q_context.error_type);
 	return ret;
 }
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index abbea67784..0ea2427b75 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -298,6 +298,7 @@ struct mlx5_devx_virtq_attr {
 		uint32_t size;
 		uint64_t offset;
 	} umems[3];
+	uint8_t error_type;
 };
 
 
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index d342263c85..7d671a3996 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -2280,7 +2280,8 @@ struct mlx5_ifc_virtio_q_bits {
 	u8 used_addr[0x40];
 	u8 available_addr[0x40];
 	u8 virtio_q_mkey[0x20];
-	u8 reserved_at_160[0x20];
+	u8 reserved_at_160[0x18];
+	u8 error_type[0x8];
 	u8 umem_1_id[0x20];
 	u8 umem_1_size[0x20];
 	u8 umem_1_offset[0x40];
@@ -2308,7 +2309,7 @@ struct mlx5_ifc_virtio_net_q_bits {
 	u8 vhost_log_page[0x5];
 	u8 reserved_at_90[0xc];
 	u8 state[0x4];
-	u8 error_type[0x8];
+	u8 reserved_at_a0[0x8];
 	u8 tisn_or_qpn[0x18];
 	u8 dirty_bitmap_mkey[0x20];
 	u8 dirty_bitmap_size[0x20];
@@ -2329,6 +2330,10 @@ struct mlx5_ifc_query_virtq_out_bits {
 	struct mlx5_ifc_virtio_net_q_bits virtq;
 };
 
+enum {
+	MLX5_EVENT_TYPE_OBJECT_CHANGE = 0x27,
+};
+
 enum {
 	MLX5_QP_ST_RC = 0x0,
 };
-- 
2.25.1


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [dpdk-dev] [PATCH v4 2/2] vdpa/mlx5: hardware error handling
  2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
                   ` (7 preceding siblings ...)
  2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 1/2] common/mlx5: add virtq attributes error fields Xueming Li
@ 2020-10-27  8:28 ` Xueming Li
  2020-10-28 14:19   ` Maxime Coquelin
  2020-10-29  8:29   ` Maxime Coquelin
  8 siblings, 2 replies; 14+ messages in thread
From: Xueming Li @ 2020-10-27  8:28 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko, Maxime Coquelin
  Cc: dev, xuemingl, Asaf Penso

When a hardware error happens, the vdpa driver gets no such information
and is left silent: in a working state, but with no response.

This patch subscribes to the firmware virtq error event and tries to
recover at most 3 times within 3 seconds; the virtq is stopped once the
maximum retry count is reached.

When an error happens, the PMD logs it at warning level. If recovery
fails, an error log is emitted. Query the virtq statistics to get the
error counter report.
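
As a rough illustration of the recovery policy described above (a sketch, not
the driver code itself): the timestamps of the last three errors are kept per
virtq, and a retry is attempted only while the oldest recorded error is more
than the error window away from the current time. Constants and names below
are illustrative.

/* Illustrative sketch of the "at most 3 retries within 3 seconds" policy. */
#include <stdint.h>
#include <stdbool.h>

#define ERR_WINDOW_SEC 3
#define ERR_HISTORY    3

struct vq_err_state {
	uint64_t err_time[ERR_HISTORY]; /* Timestamps of recent errors. */
	uint32_t n_retry;
};

/* Record the new error and report whether a recovery retry is allowed. */
static bool
vq_err_should_retry(struct vq_err_state *s, uint64_t now, uint64_t tsc_hz)
{
	/* err_time[0] stays 0 until 3 errors were seen, so early errors retry. */
	bool retry = (now - s->err_time[0]) / tsc_hz > ERR_WINDOW_SEC;
	unsigned int i;

	/* Shift the history window and append the current error time. */
	for (i = 1; i < ERR_HISTORY; i++)
		s->err_time[i - 1] = s->err_time[i];
	s->err_time[ERR_HISTORY - 1] = now;
	if (retry)
		s->n_retry++;
	return retry;
}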

Acked-by: Matan Azrad <matan@nvidia.com>
Signed-off-by: Xueming Li <xuemingl@nvidia.com>
---
 doc/guides/rel_notes/release_20_11.rst |   6 +-
 doc/guides/vdpadevs/mlx5.rst           |   8 ++
 drivers/vdpa/mlx5/mlx5_vdpa.c          |   2 +
 drivers/vdpa/mlx5/mlx5_vdpa.h          |  37 +++++++
 drivers/vdpa/mlx5/mlx5_vdpa_event.c    | 144 +++++++++++++++++++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_virtq.c    |  61 ++++++++---
 6 files changed, 242 insertions(+), 16 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index 1cd7f1f2f9..5dc801fcc4 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -348,7 +348,7 @@ New Features
 
 * **Updated Mellanox mlx5 driver.**
 
-  Updated Mellanox mlx5 driver with new features and improvements, including:
+  Updated Mellanox mlx5 net driver with new features and improvements, including:
 
   * Added support for matching on fragmented/non-fragmented IPv4/IPv6 packets.
   * Updated the supported timeout for Age action to the maximal value supported
@@ -359,6 +359,10 @@ New Features
   * Added support for the new vlan fields ``has_vlan`` in the eth item and
     ``has_more_vlan`` in the vlan item.
 
+  Updated Mellanox mlx5 vDPA driver:
+
+  * Added support of vDPA VirtQ error handling.
+
 * **Updated vhost sample application.**
 
   Added vhost asynchronous APIs support, which demonstrated how the application
diff --git a/doc/guides/vdpadevs/mlx5.rst b/doc/guides/vdpadevs/mlx5.rst
index 9a11eefd2c..c5eaddae61 100644
--- a/doc/guides/vdpadevs/mlx5.rst
+++ b/doc/guides/vdpadevs/mlx5.rst
@@ -134,3 +134,11 @@ Driver options
   driver to no-traffic mode. In this mode the timer events are stopped and
   interrupts are configured to the device in order to notify traffic for the
   driver. Default value is 2s.
+
+Error handling
+^^^^^^^^^^^^^^
+
+Upon potential hardware errors, the mlx5 PMD tries to recover and gives up if
+recovery fails 3 times within 3 seconds; the virtq is then put in the disabled
+state. Check the driver log for error details, or query the vdpa statistics
+counters for the error type and count.
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index 2d88633bfd..2d65370343 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -283,6 +283,7 @@ mlx5_vdpa_dev_close(int vid)
 	}
 	if (priv->configured)
 		ret |= mlx5_vdpa_lm_log(priv);
+	mlx5_vdpa_err_event_unset(priv);
 	mlx5_vdpa_cqe_event_unset(priv);
 	mlx5_vdpa_steer_unset(priv);
 	mlx5_vdpa_virtqs_release(priv);
@@ -318,6 +319,7 @@ mlx5_vdpa_dev_config(int vid)
 		DRV_LOG(WARNING, "MTU cannot be set on device %s.",
 				vdev->device->name);
 	if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
+	    mlx5_vdpa_err_event_setup(priv) ||
 	    mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
 	    mlx5_vdpa_cqe_event_setup(priv)) {
 		mlx5_vdpa_dev_close(vid);
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
index fcbc12ab0c..0d6886c52c 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
@@ -87,6 +87,7 @@ struct mlx5_vdpa_virtq {
 	uint16_t vq_size;
 	uint8_t notifier_state;
 	bool stopped;
+	uint32_t version;
 	struct mlx5_vdpa_priv *priv;
 	struct mlx5_devx_obj *virtq;
 	struct mlx5_devx_obj *counters;
@@ -97,6 +98,8 @@ struct mlx5_vdpa_virtq {
 		uint32_t size;
 	} umems[3];
 	struct rte_intr_handle intr_handle;
+	uint64_t err_time[3]; /* RDTSC time of recent errors. */
+	uint32_t n_retry;
 	struct mlx5_devx_virtio_q_couners_attr reset;
 };
 
@@ -143,8 +146,10 @@ struct mlx5_vdpa_priv {
 	struct rte_vhost_memory *vmem;
 	uint32_t eqn;
 	struct mlx5dv_devx_event_channel *eventc;
+	struct mlx5dv_devx_event_channel *err_chnl;
 	struct mlx5dv_devx_uar *uar;
 	struct rte_intr_handle intr_handle;
+	struct rte_intr_handle err_intr_handle;
 	struct mlx5_devx_obj *td;
 	struct mlx5_devx_obj *tis;
 	uint16_t nr_virtqs;
@@ -259,6 +264,25 @@ int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
  */
 void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
 
+/**
+ * Setup error interrupt handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Unset error event handler.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ */
+void mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv);
+
 /**
  * Release a virtq and all its related resources.
  *
@@ -392,6 +416,19 @@ int mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state);
  */
 int mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index);
 
+/**
+ * Query virtq information.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ * @param[in] index
+ *   The virtq index.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index);
+
 /**
  * Get virtq statistics.
  *
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
index 8a01e42794..010543cb6a 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -15,11 +15,14 @@
 #include <rte_alarm.h>
 
 #include <mlx5_common.h>
+#include <mlx5_glue.h>
 
 #include "mlx5_vdpa_utils.h"
 #include "mlx5_vdpa.h"
 
 
+#define MLX5_VDPA_ERROR_TIME_SEC 3u
+
 void
 mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
 {
@@ -378,6 +381,147 @@ mlx5_vdpa_interrupt_handler(void *cb_arg)
 	pthread_mutex_unlock(&priv->vq_config_lock);
 }
 
+static void
+mlx5_vdpa_err_interrupt_handler(void *cb_arg __rte_unused)
+{
+#ifdef HAVE_IBV_DEVX_EVENT
+	struct mlx5_vdpa_priv *priv = cb_arg;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+	uint32_t vq_index, i, version;
+	struct mlx5_vdpa_virtq *virtq;
+	uint64_t sec;
+
+	pthread_mutex_lock(&priv->vq_config_lock);
+	while (mlx5_glue->devx_get_event(priv->err_chnl, &out.event_resp,
+					 sizeof(out.buf)) >=
+				       (ssize_t)sizeof(out.event_resp.cookie)) {
+		vq_index = out.event_resp.cookie & UINT32_MAX;
+		version = out.event_resp.cookie >> 32;
+		if (vq_index >= priv->nr_virtqs) {
+			DRV_LOG(ERR, "Invalid device %s error event virtq %d.",
+				priv->vdev->device->name, vq_index);
+			continue;
+		}
+		virtq = &priv->virtqs[vq_index];
+		if (!virtq->enable || virtq->version != version)
+			continue;
+		if (rte_rdtsc() / rte_get_tsc_hz() < MLX5_VDPA_ERROR_TIME_SEC)
+			continue;
+		virtq->stopped = true;
+		/* Query error info. */
+		if (mlx5_vdpa_virtq_query(priv, vq_index))
+			goto log;
+		/* Disable vq. */
+		if (mlx5_vdpa_virtq_enable(priv, vq_index, 0)) {
+			DRV_LOG(ERR, "Failed to disable virtq %d.", vq_index);
+			goto log;
+		}
+		/* Retry if error happens less than N times in 3 seconds. */
+		sec = (rte_rdtsc() - virtq->err_time[0]) / rte_get_tsc_hz();
+		if (sec > MLX5_VDPA_ERROR_TIME_SEC) {
+			/* Retry. */
+			if (mlx5_vdpa_virtq_enable(priv, vq_index, 1))
+				DRV_LOG(ERR, "Failed to enable virtq %d.",
+					vq_index);
+			else
+				DRV_LOG(WARNING, "Recover virtq %d: %u.",
+					vq_index, ++virtq->n_retry);
+		} else {
+			/* Retry timeout, give up. */
+			DRV_LOG(ERR, "Device %s virtq %d failed to recover.",
+				priv->vdev->device->name, vq_index);
+		}
+log:
+		/* Shift in current time to error time log end. */
+		for (i = 1; i < RTE_DIM(virtq->err_time); i++)
+			virtq->err_time[i - 1] = virtq->err_time[i];
+		virtq->err_time[RTE_DIM(virtq->err_time) - 1] = rte_rdtsc();
+	}
+	pthread_mutex_unlock(&priv->vq_config_lock);
+#endif
+}
+
+int
+mlx5_vdpa_err_event_setup(struct mlx5_vdpa_priv *priv)
+{
+	int ret;
+	int flags;
+
+	/* Setup device event channel. */
+	priv->err_chnl = mlx5_glue->devx_create_event_channel(priv->ctx, 0);
+	if (!priv->err_chnl) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create device event channel %d.",
+			rte_errno);
+		goto error;
+	}
+	flags = fcntl(priv->err_chnl->fd, F_GETFL);
+	ret = fcntl(priv->err_chnl->fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change device event channel FD.");
+		goto error;
+	}
+	priv->err_intr_handle.fd = priv->err_chnl->fd;
+	priv->err_intr_handle.type = RTE_INTR_HANDLE_EXT;
+	if (rte_intr_callback_register(&priv->err_intr_handle,
+				       mlx5_vdpa_err_interrupt_handler,
+				       priv)) {
+		priv->err_intr_handle.fd = 0;
+		DRV_LOG(ERR, "Failed to register error interrupt for device %d.",
+			priv->vid);
+		goto error;
+	} else {
+		DRV_LOG(DEBUG, "Registered error interrupt for device%d.",
+			priv->vid);
+	}
+	return 0;
+error:
+	mlx5_vdpa_err_event_unset(priv);
+	return -1;
+}
+
+void
+mlx5_vdpa_err_event_unset(struct mlx5_vdpa_priv *priv)
+{
+	int retries = MLX5_VDPA_INTR_RETRIES;
+	int ret = -EAGAIN;
+
+	if (!priv->err_intr_handle.fd)
+		return;
+	while (retries-- && ret == -EAGAIN) {
+		ret = rte_intr_callback_unregister(&priv->err_intr_handle,
+					    mlx5_vdpa_err_interrupt_handler,
+					    priv);
+		if (ret == -EAGAIN) {
+			DRV_LOG(DEBUG, "Try again to unregister fd %d "
+				"of error interrupt, retries = %d.",
+				priv->err_intr_handle.fd, retries);
+			rte_pause();
+		}
+	}
+	memset(&priv->err_intr_handle, 0, sizeof(priv->err_intr_handle));
+	if (priv->err_chnl) {
+#ifdef HAVE_IBV_DEVX_EVENT
+		union {
+			struct mlx5dv_devx_async_event_hdr event_resp;
+			uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) +
+				    128];
+		} out;
+
+		/* Clean all pending events. */
+		while (mlx5_glue->devx_get_event(priv->err_chnl,
+		       &out.event_resp, sizeof(out.buf)) >=
+		       (ssize_t)sizeof(out.event_resp.cookie))
+			;
+#endif
+		mlx5_glue->devx_destroy_event_channel(priv->err_chnl);
+		priv->err_chnl = NULL;
+	}
+}
+
 int
 mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
 {
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
index 17e71cf4f4..d5ac040544 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
@@ -88,11 +88,6 @@ mlx5_vdpa_virtq_unset(struct mlx5_vdpa_virtq *virtq)
 			rte_free(virtq->umems[i].buf);
 	}
 	memset(&virtq->umems, 0, sizeof(virtq->umems));
-	if (virtq->counters) {
-		claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
-		virtq->counters = NULL;
-	}
-	memset(&virtq->reset, 0, sizeof(virtq->reset));
 	if (virtq->eqp.fw_qp)
 		mlx5_vdpa_event_qp_destroy(&virtq->eqp);
 	virtq->notifier_state = MLX5_VDPA_NOTIFIER_STATE_DISABLED;
@@ -103,9 +98,19 @@ void
 mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv)
 {
 	int i;
+	struct mlx5_vdpa_virtq *virtq;
 
-	for (i = 0; i < priv->nr_virtqs; i++)
-		mlx5_vdpa_virtq_unset(&priv->virtqs[i]);
+	for (i = 0; i < priv->nr_virtqs; i++) {
+		virtq = &priv->virtqs[i];
+		mlx5_vdpa_virtq_unset(virtq);
+		if (virtq->counters) {
+			claim_zero(mlx5_devx_cmd_destroy(virtq->counters));
+			virtq->counters = NULL;
+			memset(&virtq->reset, 0, sizeof(virtq->reset));
+		}
+		memset(virtq->err_time, 0, sizeof(virtq->err_time));
+		virtq->n_retry = 0;
+	}
 	if (priv->tis) {
 		claim_zero(mlx5_devx_cmd_destroy(priv->tis));
 		priv->tis = NULL;
@@ -138,7 +143,6 @@ mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state)
 int
 mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 {
-	struct mlx5_devx_virtq_attr attr = {0};
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
 	int ret;
 
@@ -148,6 +152,17 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 	if (ret)
 		return -1;
 	virtq->stopped = true;
+	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	return mlx5_vdpa_virtq_query(priv, index);
+}
+
+int
+mlx5_vdpa_virtq_query(struct mlx5_vdpa_priv *priv, int index)
+{
+	struct mlx5_devx_virtq_attr attr = {0};
+	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+	int ret;
+
 	if (mlx5_devx_cmd_query_virtq(virtq->virtq, &attr)) {
 		DRV_LOG(ERR, "Failed to query virtq %d.", index);
 		return -1;
@@ -162,7 +177,9 @@ mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
 		DRV_LOG(ERR, "Failed to set virtq %d base.", index);
 		return -1;
 	}
-	DRV_LOG(DEBUG, "vid %u virtq %u was stopped.", priv->vid, index);
+	if (attr.state == MLX5_VIRTQ_STATE_ERROR)
+		DRV_LOG(WARNING, "vid %d vring %d hw error=%hhu",
+			priv->vid, index, attr.error_type);
 	return 0;
 }
 
@@ -195,6 +212,8 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 	unsigned int i;
 	uint16_t last_avail_idx;
 	uint16_t last_used_idx;
+	uint16_t event_num = MLX5_EVENT_TYPE_OBJECT_CHANGE;
+	uint64_t cookie;
 
 	ret = rte_vhost_get_vhost_vring(priv->vid, index, &vq);
 	if (ret)
@@ -231,8 +250,9 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 			" need event QPs and event mechanism.", index);
 	}
 	if (priv->caps.queue_counters_valid) {
-		virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
-								    (priv->ctx);
+		if (!virtq->counters)
+			virtq->counters = mlx5_devx_cmd_create_virtio_q_counters
+								(priv->ctx);
 		if (!virtq->counters) {
 			DRV_LOG(ERR, "Failed to create virtq couners for virtq"
 				" %d.", index);
@@ -332,6 +352,19 @@ mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
 				virtq->intr_handle.fd, index);
 		}
 	}
+	/* Subscribe virtq error event. */
+	virtq->version++;
+	cookie = ((uint64_t)virtq->version << 32) + index;
+	ret = mlx5_glue->devx_subscribe_devx_event(priv->err_chnl,
+						   virtq->virtq->obj,
+						   sizeof(event_num),
+						   &event_num, cookie);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to subscribe device %d virtq %d error event.",
+			priv->vid, index);
+		rte_errno = errno;
+		goto error;
+	}
 	virtq->stopped = false;
 	DRV_LOG(DEBUG, "vid %u virtq %u was created successfully.", priv->vid,
 		index);
@@ -526,12 +559,11 @@ mlx5_vdpa_virtq_stats_get(struct mlx5_vdpa_priv *priv, int qid,
 	struct mlx5_devx_virtio_q_couners_attr attr = {0};
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters, &attr);
 	if (ret) {
 		DRV_LOG(ERR, "Failed to read virtq %d stats from HW.", qid);
@@ -583,12 +615,11 @@ mlx5_vdpa_virtq_stats_reset(struct mlx5_vdpa_priv *priv, int qid)
 	struct mlx5_vdpa_virtq *virtq = &priv->virtqs[qid];
 	int ret;
 
-	if (!virtq->virtq || !virtq->enable) {
+	if (!virtq->counters) {
 		DRV_LOG(ERR, "Failed to read virtq %d statistics - virtq "
 			"is invalid.", qid);
 		return -EINVAL;
 	}
-	MLX5_ASSERT(virtq->counters);
 	ret = mlx5_devx_cmd_query_virtio_q_counters(virtq->counters,
 						    &virtq->reset);
 	if (ret)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/2] common/mlx5: add virtq attributes error fields
  2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 1/2] common/mlx5: add virtq attributes error fields Xueming Li
@ 2020-10-28 13:59   ` Maxime Coquelin
  2020-10-29  8:29   ` Maxime Coquelin
  1 sibling, 0 replies; 14+ messages in thread
From: Maxime Coquelin @ 2020-10-28 13:59 UTC (permalink / raw)
  To: Xueming Li, Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, Asaf Penso



On 10/27/20 9:28 AM, Xueming Li wrote:
> Add the needed fields for virtq DevX object to read the error state.
> 
> Acked-by: Matan Azrad <matan@nvidia.com>
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
> ---
>  drivers/common/mlx5/mlx5_devx_cmds.c | 3 +++
>  drivers/common/mlx5/mlx5_devx_cmds.h | 1 +
>  drivers/common/mlx5/mlx5_prm.h       | 9 +++++++--
>  3 files changed, 11 insertions(+), 2 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] vdpa/mlx5: hardware error handling
  2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 2/2] vdpa/mlx5: hardware error handling Xueming Li
@ 2020-10-28 14:19   ` Maxime Coquelin
  2020-10-29  8:29   ` Maxime Coquelin
  1 sibling, 0 replies; 14+ messages in thread
From: Maxime Coquelin @ 2020-10-28 14:19 UTC (permalink / raw)
  To: Xueming Li, Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, Asaf Penso



On 10/27/20 9:28 AM, Xueming Li wrote:
> When hardware error happens, vdpa didn't get such information and leave
> driver in silent: working state but no response.
> 
> This patch subscribes firmware virtq error event and try to recover max
> 3 times in 3 seconds, stop virtq if max retry number reached.
> 
> When error happens, PMD log in warning level. If failed to recover,
> outputs error log. Query virtq statistics to get error counters report.
> 
> Acked-by: Matan Azrad <matan@nvidia.com>
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
> ---
>  doc/guides/rel_notes/release_20_11.rst |   6 +-
>  doc/guides/vdpadevs/mlx5.rst           |   8 ++
>  drivers/vdpa/mlx5/mlx5_vdpa.c          |   2 +
>  drivers/vdpa/mlx5/mlx5_vdpa.h          |  37 +++++++
>  drivers/vdpa/mlx5/mlx5_vdpa_event.c    | 144 +++++++++++++++++++++++++
>  drivers/vdpa/mlx5/mlx5_vdpa_virtq.c    |  61 ++++++++---
>  6 files changed, 242 insertions(+), 16 deletions(-)
> 


Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/2] common/mlx5: add virtq attributes error fields
  2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 1/2] common/mlx5: add virtq attributes error fields Xueming Li
  2020-10-28 13:59   ` Maxime Coquelin
@ 2020-10-29  8:29   ` Maxime Coquelin
  1 sibling, 0 replies; 14+ messages in thread
From: Maxime Coquelin @ 2020-10-29  8:29 UTC (permalink / raw)
  To: Xueming Li, Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, Asaf Penso



On 10/27/20 9:28 AM, Xueming Li wrote:
> Add the needed fields for virtq DevX object to read the error state.
> 
> Acked-by: Matan Azrad <matan@nvidia.com>
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
> ---
>  drivers/common/mlx5/mlx5_devx_cmds.c | 3 +++
>  drivers/common/mlx5/mlx5_devx_cmds.h | 1 +
>  drivers/common/mlx5/mlx5_prm.h       | 9 +++++++--
>  3 files changed, 11 insertions(+), 2 deletions(-)

Applied to dpdk-next-virtio/main.

Thanks!
Maxime


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH v4 2/2] vdpa/mlx5: hardware error handling
  2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 2/2] vdpa/mlx5: hardware error handling Xueming Li
  2020-10-28 14:19   ` Maxime Coquelin
@ 2020-10-29  8:29   ` Maxime Coquelin
  1 sibling, 0 replies; 14+ messages in thread
From: Maxime Coquelin @ 2020-10-29  8:29 UTC (permalink / raw)
  To: Xueming Li, Matan Azrad, Viacheslav Ovsiienko; +Cc: dev, Asaf Penso



On 10/27/20 9:28 AM, Xueming Li wrote:
> When hardware error happens, vdpa didn't get such information and leave
> driver in silent: working state but no response.
> 
> This patch subscribes firmware virtq error event and try to recover max
> 3 times in 3 seconds, stop virtq if max retry number reached.
> 
> When error happens, PMD log in warning level. If failed to recover,
> outputs error log. Query virtq statistics to get error counters report.
> 
> Acked-by: Matan Azrad <matan@nvidia.com>
> Signed-off-by: Xueming Li <xuemingl@nvidia.com>
> ---
>  doc/guides/rel_notes/release_20_11.rst |   6 +-
>  doc/guides/vdpadevs/mlx5.rst           |   8 ++
>  drivers/vdpa/mlx5/mlx5_vdpa.c          |   2 +
>  drivers/vdpa/mlx5/mlx5_vdpa.h          |  37 +++++++
>  drivers/vdpa/mlx5/mlx5_vdpa_event.c    | 144 +++++++++++++++++++++++++
>  drivers/vdpa/mlx5/mlx5_vdpa_virtq.c    |  61 ++++++++---
>  6 files changed, 242 insertions(+), 16 deletions(-)
> 

Applied to dpdk-next-virtio/main.

Thanks!
Maxime


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-10-29  8:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-26  7:16 [dpdk-dev] [PATCH 1/2] common/mlx5: add virtq attributes error fields Xueming Li
2020-10-26  7:16 ` [dpdk-dev] [PATCH 2/2] vdpa/mlx5: hardware error handling Xueming Li
2020-10-26  9:12 ` [dpdk-dev] [PATCH v1 1/2] common/mlx5: add virtq attributes error fields Xueming Li
2020-10-26  9:12 ` [dpdk-dev] [PATCH v1 2/2] vdpa/mlx5: hardware error handling Xueming Li
2020-10-26 11:14 ` [dpdk-dev] [PATCH v2 1/2] common/mlx5: add virtq attributes error fields Xueming Li
2020-10-26 11:14 ` [dpdk-dev] [PATCH v2 2/2] vdpa/mlx5: hardware error handling Xueming Li
2020-10-26 15:04 ` [dpdk-dev] [PATCH v3 1/2] common/mlx5: add virtq attributes error fields Xueming Li
2020-10-26 15:04 ` [dpdk-dev] [PATCH v3 2/2] vdpa/mlx5: hardware error handling Xueming Li
2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 1/2] common/mlx5: add virtq attributes error fields Xueming Li
2020-10-28 13:59   ` Maxime Coquelin
2020-10-29  8:29   ` Maxime Coquelin
2020-10-27  8:28 ` [dpdk-dev] [PATCH v4 2/2] vdpa/mlx5: hardware error handling Xueming Li
2020-10-28 14:19   ` Maxime Coquelin
2020-10-29  8:29   ` Maxime Coquelin
