From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <dev-bounces@dpdk.org>
Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124])
	by inbox.dpdk.org (Postfix) with ESMTP id 28282A0093;
	Thu,  8 Dec 2022 21:21:49 +0100 (CET)
Received: from mails.dpdk.org (localhost [127.0.0.1])
	by mails.dpdk.org (Postfix) with ESMTP id ABC1042D9B;
	Thu,  8 Dec 2022 21:19:36 +0100 (CET)
Received: from mx0b-0016f401.pphosted.com (mx0b-0016f401.pphosted.com
 [67.231.156.173])
 by mails.dpdk.org (Postfix) with ESMTP id 2559F42D49
 for <dev@dpdk.org>; Thu,  8 Dec 2022 21:19:19 +0100 (CET)
Received: from pps.filterd (m0045851.ppops.net [127.0.0.1])
 by mx0b-0016f401.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 2B8JkNPB006986 for <dev@dpdk.org>; Thu, 8 Dec 2022 12:19:18 -0800
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=marvell.com;
 h=from : to : cc :
 subject : date : message-id : in-reply-to : references : mime-version :
 content-type; s=pfpt0220; bh=kd74BnRmsLq/4WdVX7ICIABFiCXlPs+BOwoDXo4oY40=;
 b=gnWaVwTZUhyMQOoZHrVdoVup2ooNXzpm7paZdeC2ycrSfITBM+J0BLn1NIZXwBtyanNr
 Thr1YYpMB314B1otlhi2XIjomgzJbtaUGOPkpvMo1JL2HNEEM8rt5HkPOasvQlocJxhQ
 hyYdY2WuQxk/77MdIdWHZ3ZWiXZGfqemaaOLT4byz55VoS7Hy3woByL7lk2xBVJTPhWX
 /AHAbqu9drNlhXoRypbB8LdksRmt48CSnAQ1IuMaO0C+6oHEGGkb161JRgYb5rH8Ja0U
 RkoXnSAFkFzCZyGrokD5oEu4Z0sjkUQC+YSa/MkGTPHa4DE1I0GlyJJN7Xyl1tVIFFgX +g== 
Received: from dc5-exch02.marvell.com ([199.233.59.182])
 by mx0b-0016f401.pphosted.com (PPS) with ESMTPS id 3m86usnj1h-3
 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-SHA384 bits=256 verify=NOT)
 for <dev@dpdk.org>; Thu, 08 Dec 2022 12:19:18 -0800
Received: from DC5-EXCH02.marvell.com (10.69.176.39) by DC5-EXCH02.marvell.com
 (10.69.176.39) with Microsoft SMTP Server (TLS) id 15.0.1497.18;
 Thu, 8 Dec 2022 12:19:16 -0800
Received: from maili.marvell.com (10.69.176.80) by DC5-EXCH02.marvell.com
 (10.69.176.39) with Microsoft SMTP Server id 15.0.1497.18 via Frontend
 Transport; Thu, 8 Dec 2022 12:19:16 -0800
Received: from ml-host-33.caveonetworks.com (unknown [10.110.143.233])
 by maili.marvell.com (Postfix) with ESMTP id DBECC3F7106;
 Thu,  8 Dec 2022 12:18:19 -0800 (PST)
From: Srikanth Yalavarthi <syalavarthi@marvell.com>
To: Srikanth Yalavarthi <syalavarthi@marvell.com>
CC: <dev@dpdk.org>, <sshankarnara@marvell.com>, <jerinj@marvell.com>,
 <aprabhu@marvell.com>
Subject: [PATCH v2 28/37] ml/cnxk: enable support for firmware error codes
Date: Thu, 8 Dec 2022 12:17:56 -0800
Message-ID: <20221208201806.21893-29-syalavarthi@marvell.com>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20221208201806.21893-1-syalavarthi@marvell.com>
References: <20221208200220.20267-1-syalavarthi@marvell.com>
 <20221208201806.21893-1-syalavarthi@marvell.com>
MIME-Version: 1.0
Content-Type: text/plain
X-Proofpoint-GUID: xicTDb22wl5TmuexmsMyY1S3C52RQ2bE
X-Proofpoint-ORIG-GUID: xicTDb22wl5TmuexmsMyY1S3C52RQ2bE
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.205,Aquarius:18.0.923,Hydra:6.0.545,FMLib:17.11.122.1
 definitions=2022-12-08_11,2022-12-08_01,2022-06-22_01
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: DPDK patches and discussions <dev.dpdk.org>
List-Unsubscribe: <https://mails.dpdk.org/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://mails.dpdk.org/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <https://mails.dpdk.org/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
Errors-To: dev-bounces@dpdk.org

Enabled support for error handling. Added error types and subtypes
supported by ML firmware. Enabled support to get device specific
error code and message for a completed ML request.

Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
v2:
* Fixed typos

 drivers/ml/cnxk/cn10k_ml_dev.c |   4 +-
 drivers/ml/cnxk/cn10k_ml_dev.h |  50 +++++++++++++-
 drivers/ml/cnxk/cn10k_ml_ops.c | 117 ++++++++++++++++++++++++++++++---
 drivers/ml/cnxk/cn10k_ml_ops.h |   2 +
 4 files changed, 160 insertions(+), 13 deletions(-)

diff --git a/drivers/ml/cnxk/cn10k_ml_dev.c b/drivers/ml/cnxk/cn10k_ml_dev.c
index 805b037593..779734d6cd 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.c
+++ b/drivers/ml/cnxk/cn10k_ml_dev.c
@@ -261,7 +261,7 @@ cn10k_ml_fw_load_asim(struct cn10k_ml_fw *fw)
 	} while (plt_tsc_cycles() < timeout_cycle);

 	/* Check firmware load status, clean-up and exit on failure. */
-	if ((!timeout) && (fw->req->result.error_code == 0)) {
+	if ((!timeout) && (fw->req->result.error_code.u64 == 0)) {
 		cn10k_ml_fw_print_info(fw);
 	} else {
 		/* Set ML to disable new jobs */
@@ -452,7 +452,7 @@ cn10k_ml_fw_load_cn10ka(struct cn10k_ml_fw *fw, void *buffer, uint64_t size)
 	} while (plt_tsc_cycles() < timeout_cycle);

 	/* Check firmware load status, clean-up and exit on failure. */
-	if ((!timeout) && (fw->req->result.error_code == 0)) {
+	if ((!timeout) && (fw->req->result.error_code.u64 == 0)) {
 		cn10k_ml_fw_print_info(fw);
 	} else {
 		/* Set ML to disable new jobs */
diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h
index 5096a26c40..f292078920 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.h
+++ b/drivers/ml/cnxk/cn10k_ml_dev.h
@@ -64,6 +64,54 @@ enum cn10k_ml_dev_state {
 	ML_CN10K_DEV_STATE_CLOSED
 };

+/* ML error types; value of the 4-bit etype field in union cn10k_ml_error_code */
+enum cn10k_ml_error_etype {
+	/* 0x0 */ ML_ETYPE_NO_ERROR = 0, /* No error */
+	/* 0x1 */ ML_ETYPE_FW_NONFATAL,	 /* Firmware non-fatal error */
+	/* 0x2 */ ML_ETYPE_HW_NONFATAL,	 /* Hardware non-fatal error */
+	/* 0x3 */ ML_ETYPE_HW_FATAL,	 /* Hardware fatal error */
+	/* 0x4 */ ML_ETYPE_HW_WARNING,	 /* Hardware warning */
+	/* 0x5 */ ML_ETYPE_DRIVER,	 /* Driver specific error */
+	/* 0x6 */ ML_ETYPE_UNKNOWN,	 /* Unknown error */
+};
+
+/* ML firmware non-fatal error sub-type */
+enum cn10k_ml_error_stype_fw_nf {
+	/* 0x0 */ ML_FW_ERR_NOERR = 0,		 /* No error */
+	/* 0x1 */ ML_FW_ERR_UNLOAD_ID_NOT_FOUND, /* Model ID not found during unload */
+	/* 0x2 */ ML_FW_ERR_LOAD_LUT_OVERFLOW,	 /* Lookup table overflow at load */
+	/* 0x3 */ ML_FW_ERR_ID_IN_USE,		 /* Model ID already in use */
+	/* 0x4 */ ML_FW_ERR_INVALID_TILEMASK,	 /* Invalid OCM tilemask */
+	/* 0x5 */ ML_FW_ERR_RUN_LUT_OVERFLOW,	 /* Lookup table overflow at run */
+	/* 0x6 */ ML_FW_ERR_RUN_ID_NOT_FOUND,	 /* Model ID not found during run */
+	/* 0x7 */ ML_FW_ERR_COMMAND_NOTSUP,	 /* Unsupported command */
+	/* 0x8 */ ML_FW_ERR_DDR_ADDR_RANGE,	 /* DDR address out of range */
+	/* 0x9 */ ML_FW_ERR_NUM_BATCHES_INVALID, /* Invalid number of batches */
+	/* 0xA */ ML_FW_ERR_INSSYNC_TIMEOUT,	 /* INS sync timeout */
+};
+
+/* ML driver error sub-types, used when the error type is ML_ETYPE_DRIVER */
+enum cn10k_ml_error_stype_driver {
+	/* 0x0 */ ML_DRIVER_ERR_NOERR = 0, /* No error */
+	/* 0x1 */ ML_DRIVER_ERR_UNKNOWN,   /* Unable to determine error sub-type */
+	/* 0x2 */ ML_DRIVER_ERR_EXCEPTION, /* Firmware exception */
+	/* 0x3 */ ML_DRIVER_ERR_FW_ERROR,  /* Unknown firmware error */
+};
+
+/* ML error code: error type and sub-type packed into a single 64-bit word */
+union cn10k_ml_error_code {
+	struct {
+		/* Error type, one of enum cn10k_ml_error_etype */
+		uint64_t etype : 4;
+
+		/* Error sub-type, interpreted according to etype */
+		uint64_t stype : 60;
+	} s;
+
+	/* Raw 64-bit value of the error code */
+	uint64_t u64;
+};
+
 /* ML Firmware stats */
 struct cn10k_ml_fw_stats {
 	/* Firmware start cycle */
@@ -82,7 +130,7 @@ struct cn10k_ml_fw_stats {
 /* ML result structure */
 struct cn10k_ml_result {
 	/* Job error code */
-	uint64_t error_code;
+	union cn10k_ml_error_code error_code;

 	/* Firmware stats */
 	struct cn10k_ml_fw_stats stats;
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index e7ee0774f2..d9eea21e12 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -23,6 +23,49 @@
 #define ML_FLAGS_POLL_COMPL BIT(0)
 #define ML_FLAGS_SSO_COMPL  BIT(1)

+/* Error message length */
+#define ERRMSG_LEN 32
+
+/* Error type database, indexed by error type (etype) value */
+static const struct cn10k_ml_etype_db {
+	enum cn10k_ml_error_etype etype;
+	char name[ERRMSG_LEN];
+} ml_etype_db[] = {
+	{ML_ETYPE_NO_ERROR, "NO_ERROR"},	{ML_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
+	{ML_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_ETYPE_HW_FATAL, "HW_FATAL"},
+	{ML_ETYPE_HW_WARNING, "HW_WARNING"},	{ML_ETYPE_DRIVER, "DRIVER_ERROR"},
+	{ML_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
+};
+
+/* Hardware non-fatal error subtype database, indexed by ML_FW_ERR_* sub-type value */
+static const struct cn10k_ml_stype_db_hw_nf {
+	enum cn10k_ml_error_stype_fw_nf stype;
+	char msg[ERRMSG_LEN];
+} ml_stype_db_hw_nf[] = {
+	{ML_FW_ERR_NOERR, "NO ERROR"},
+	{ML_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
+	{ML_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
+	{ML_FW_ERR_ID_IN_USE, "MODEL ID IN USE"},
+	{ML_FW_ERR_INVALID_TILEMASK, "INVALID TILEMASK"},
+	{ML_FW_ERR_RUN_LUT_OVERFLOW, "RUN LUT OVERFLOW"},
+	{ML_FW_ERR_RUN_ID_NOT_FOUND, "RUN MODEL ID NOT FOUND"},
+	{ML_FW_ERR_COMMAND_NOTSUP, "COMMAND NOT SUPPORTED"},
+	{ML_FW_ERR_DDR_ADDR_RANGE, "DDR ADDRESS OUT OF RANGE"},
+	{ML_FW_ERR_NUM_BATCHES_INVALID, "INVALID BATCHES"},
+	{ML_FW_ERR_INSSYNC_TIMEOUT, "INSSYNC TIMEOUT"},
+};
+
+/* Driver error subtype database, indexed by ML_DRIVER_ERR_* sub-type value */
+static const struct cn10k_ml_stype_db_driver {
+	enum cn10k_ml_error_stype_driver stype;
+	char msg[ERRMSG_LEN];
+} ml_stype_db_driver[] = {
+	{ML_DRIVER_ERR_NOERR, "NO ERROR"},
+	{ML_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
+	{ML_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
+	{ML_DRIVER_ERR_FW_ERROR, "UNKNOWN FIRMWARE ERROR"},
+};
+
 static void
 print_line(FILE *fp, int len)
 {
@@ -474,6 +517,7 @@ cn10k_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *c

 	dev->enqueue_burst = cn10k_ml_enqueue_burst;
 	dev->dequeue_burst = cn10k_ml_dequeue_burst;
+	dev->op_error_get = cn10k_ml_op_error_get;

 	mldev->nb_models_loaded = 0;
 	mldev->state = ML_CN10K_DEV_STATE_CONFIGURED;
@@ -758,7 +802,7 @@ cn10k_ml_dev_selftest(struct rte_ml_dev *dev)
 	if (timeout) {
 		ret = -ETIME;
 	} else {
-		if (req->result.error_code != 0)
+		if (req->result.error_code.u64 != 0)
 			ret = -1;
 	}

@@ -940,7 +984,7 @@ cn10k_ml_model_start(struct rte_ml_dev *dev, int16_t model_id)
 	req = model->req;
 	cn10k_ml_prep_sp_job_descriptor(mldev, model, req, ML_CN10K_JOB_TYPE_MODEL_START);

-	req->result.error_code = 0x0;
+	req->result.error_code.u64 = 0x0;
 	req->result.user_ptr = NULL;

 	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1021,7 +1065,7 @@ cn10k_ml_model_start(struct rte_ml_dev *dev, int16_t model_id)

 	if (job_dequeued) {
 		if (plt_read64(&req->status) == ML_CN10K_POLL_JOB_FINISH) {
-			if (req->result.error_code == 0)
+			if (req->result.error_code.u64 == 0)
 				ret = 0;
 			else
 				ret = -1;
@@ -1083,7 +1127,7 @@ cn10k_ml_model_stop(struct rte_ml_dev *dev, int16_t model_id)
 	/* Prepare JD */
 	req = model->req;
 	cn10k_ml_prep_sp_job_descriptor(mldev, model, req, ML_CN10K_JOB_TYPE_MODEL_STOP);
-	req->result.error_code = 0x0;
+	req->result.error_code.u64 = 0x0;
 	req->result.user_ptr = NULL;

 	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1138,7 +1182,7 @@ cn10k_ml_model_stop(struct rte_ml_dev *dev, int16_t model_id)

 	if (job_dequeued) {
 		if (plt_read64(&req->status) == ML_CN10K_POLL_JOB_FINISH) {
-			if (req->result.error_code == 0x0)
+			if (req->result.error_code.u64 == 0x0)
 				ret = 0;
 			else
 				ret = -1;
@@ -1429,12 +1473,30 @@ cn10k_ml_result_update(struct rte_ml_dev *dev, int qp_id, struct cn10k_ml_result
 	PLT_SET_USED(dev);
 	PLT_SET_USED(qp_id);

-	op->impl_opaque = result->error_code;
+	struct cn10k_ml_dev *mldev;

-	if (likely(result->error_code == 0))
+	if (likely(result->error_code.u64 == 0)) {
+		op->impl_opaque = result->error_code.u64;
 		op->status = RTE_ML_OP_STATUS_SUCCESS;
-	else
+	} else {
+		/* Handle driver error */
+		if (result->error_code.s.etype == ML_ETYPE_DRIVER) {
+			mldev = dev->data->dev_private;
+
+			/* Check for exception */
+			if ((roc_ml_reg_read64(&mldev->roc, ML_SCRATCH_EXCEPTION_SP_C0) != 0) ||
+			    (roc_ml_reg_read64(&mldev->roc, ML_SCRATCH_EXCEPTION_SP_C1) != 0))
+				result->error_code.s.stype = ML_DRIVER_ERR_EXCEPTION;
+			else if ((roc_ml_reg_read64(&mldev->roc, ML_CORE_INT_LO) != 0) ||
+				 (roc_ml_reg_read64(&mldev->roc, ML_CORE_INT_HI) != 0))
+				result->error_code.s.stype = ML_DRIVER_ERR_FW_ERROR;
+			else
+				result->error_code.s.stype = ML_DRIVER_ERR_UNKNOWN;
+		}
+
+		op->impl_opaque = result->error_code.u64;
 		op->status = RTE_ML_OP_STATUS_ERROR;
+	}

 	op->user_ptr = result->user_ptr;
 }
@@ -1471,6 +1533,7 @@ cn10k_ml_enqueue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op
 	cn10k_ml_prep_fp_job_descriptor(dev, req, op);

 	memset(&req->result, 0, sizeof(struct cn10k_ml_result));
+	req->result.error_code.s.etype = ML_ETYPE_UNKNOWN;
 	req->result.user_ptr = op->user_ptr;

 	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1518,8 +1581,12 @@ cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op
 dequeue_req:
 	req = &queue->reqs[tail];
 	status = plt_read64(&req->status);
-	if (unlikely(status != ML_CN10K_POLL_JOB_FINISH))
-		goto empty_or_active;
+	if (unlikely(status != ML_CN10K_POLL_JOB_FINISH)) {
+		if (plt_tsc_cycles() < req->timeout)
+			goto empty_or_active;
+		else /* Timeout, set indication of driver error */
+			req->result.error_code.s.etype = ML_ETYPE_DRIVER;
+	}

 	cn10k_ml_result_update(dev, qp_id, &req->result, req->op);
 	ops[count] = req->op;
@@ -1536,6 +1603,35 @@ cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op
 	return count;
 }

+/* Report the device error code saved in op->impl_opaque, plus a "TYPE : SUBTYPE" message
+ * for it, through error. dev is unused; always returns 0.
+ */
+__rte_hot int
+cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
+{
+	const char *etype_name = "UNKNOWN_ERROR", *stype_msg = NULL;
+	union cn10k_ml_error_code error_code;
+
+	PLT_SET_USED(dev);
+	error_code.u64 = op->impl_opaque;
+
+	/* etype is a 4-bit and stype a 60-bit field; bound-check before indexing the tables */
+	if (error_code.s.etype < PLT_DIM(ml_etype_db))
+		etype_name = ml_etype_db[error_code.s.etype].name;
+	if (error_code.s.etype == ML_ETYPE_HW_NONFATAL)
+		stype_msg = error_code.s.stype < PLT_DIM(ml_stype_db_hw_nf) ?
+			ml_stype_db_hw_nf[error_code.s.stype].msg : "UNKNOWN ERROR";
+	else if (error_code.s.etype == ML_ETYPE_DRIVER)
+		stype_msg = error_code.s.stype < PLT_DIM(ml_stype_db_driver) ?
+			ml_stype_db_driver[error_code.s.stype].msg : "UNKNOWN ERROR";
+
+	snprintf(error->message, sizeof(error->message), "%s%s%s", etype_name,
+		 stype_msg != NULL ? " : " : "", stype_msg != NULL ? stype_msg : "");
+	error->errcode = error_code.u64;
+
+	return 0;
+}
+
 __rte_hot int
 cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op *op)
 {
@@ -1552,6 +1648,7 @@ cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op *op)
 	cn10k_ml_prep_fp_job_descriptor(dev, req, op);

 	memset(&req->result, 0, sizeof(struct cn10k_ml_result));
+	req->result.error_code.s.etype = ML_ETYPE_UNKNOWN;
 	req->result.user_ptr = op->user_ptr;

 	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.h b/drivers/ml/cnxk/cn10k_ml_ops.h
index c23e484b69..5f00cb2a60 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.h
+++ b/drivers/ml/cnxk/cn10k_ml_ops.h
@@ -75,6 +75,8 @@ __rte_hot uint16_t cn10k_ml_enqueue_burst(struct rte_ml_dev *dev, uint16_t qp_id
 					  struct rte_ml_op **ops, uint16_t nb_ops);
 __rte_hot uint16_t cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id,
 					  struct rte_ml_op **ops, uint16_t nb_ops);
+__rte_hot int cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
+				    struct rte_ml_op_error *error);
 __rte_hot int cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op *op);

 #endif /* _CN10K_ML_OPS_H_ */
--
2.17.1