From: Srikanth Yalavarthi <syalavarthi@marvell.com>
To: Srikanth Yalavarthi <syalavarthi@marvell.com>
Cc: <dev@dpdk.org>, <sshankarnara@marvell.com>, <aprabhu@marvell.com>,
<ptakkar@marvell.com>
Subject: [PATCH v1 1/1] ml/cnxk: updates to cn10k error handling
Date: Tue, 30 Jul 2024 23:38:02 -0700 [thread overview]
Message-ID: <20240731063803.9223-1-syalavarthi@marvell.com> (raw)
Renamed cnxk error codes to cn10k error codes. Added
support for model-specific op_error_get routines.
Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
drivers/ml/cnxk/cn10k_ml_dev.c | 8 ++++++++
drivers/ml/cnxk/cn10k_ml_dev.h | 16 ++++++++++++++++
drivers/ml/cnxk/cn10k_ml_ops.c | 20 +++++++++++---------
drivers/ml/cnxk/cn10k_ml_ops.h | 2 +-
drivers/ml/cnxk/cnxk_ml_dev.c | 8 --------
drivers/ml/cnxk/cnxk_ml_dev.h | 18 +-----------------
drivers/ml/cnxk/cnxk_ml_model.h | 3 +++
drivers/ml/cnxk/cnxk_ml_ops.c | 18 ++++++++++++++----
drivers/ml/cnxk/cnxk_ml_ops.h | 2 ++
drivers/ml/cnxk/mvtvm_ml_ops.c | 13 +++++++++++++
drivers/ml/cnxk/mvtvm_ml_ops.h | 2 ++
11 files changed, 71 insertions(+), 39 deletions(-)
diff --git a/drivers/ml/cnxk/cn10k_ml_dev.c b/drivers/ml/cnxk/cn10k_ml_dev.c
index 41f3b7a95da..2e719919ce1 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.c
+++ b/drivers/ml/cnxk/cn10k_ml_dev.c
@@ -58,6 +58,14 @@ static const char *const valid_args[] = {CN10K_ML_FW_PATH,
/* Supported OCM page sizes: 1KB, 2KB, 4KB, 8KB and 16KB */
static const int valid_ocm_page_size[] = {1024, 2048, 4096, 8192, 16384};
+/* Error type database */
+struct cn10k_ml_error_db ml_etype_db[] = {
+ {ML_CN10K_ETYPE_NO_ERROR, "NO_ERROR"}, {ML_CN10K_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
+ {ML_CN10K_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_CN10K_ETYPE_HW_FATAL, "HW_FATAL"},
+ {ML_CN10K_ETYPE_HW_WARNING, "HW_WARNING"}, {ML_CN10K_ETYPE_DRIVER, "DRIVER_ERROR"},
+ {ML_CN10K_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
+};
+
static int
parse_string_arg(const char *key __rte_unused, const char *value, void *extra_args)
{
diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h
index ddb8b67e06e..dadb3b571ba 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.h
+++ b/drivers/ml/cnxk/cn10k_ml_dev.h
@@ -46,6 +46,22 @@ struct cnxk_ml_dev;
struct cnxk_ml_req;
struct cnxk_ml_qp;
+/* Error types enumeration */
+enum cn10k_ml_error_etype {
+ /* 0x0 */ ML_CN10K_ETYPE_NO_ERROR = 0, /* No error */
+ /* 0x1 */ ML_CN10K_ETYPE_FW_NONFATAL, /* Firmware non-fatal error */
+ /* 0x2 */ ML_CN10K_ETYPE_HW_NONFATAL, /* Hardware non-fatal error */
+ /* 0x3 */ ML_CN10K_ETYPE_HW_FATAL, /* Hardware fatal error */
+ /* 0x4 */ ML_CN10K_ETYPE_HW_WARNING, /* Hardware warning */
+ /* 0x5 */ ML_CN10K_ETYPE_DRIVER, /* Driver specific error */
+ /* 0x6 */ ML_CN10K_ETYPE_UNKNOWN, /* Unknown error */
+};
+
+struct cn10k_ml_error_db {
+ uint64_t code;
+ char str[RTE_ML_STR_MAX];
+};
+
/* Firmware non-fatal error sub-type */
enum cn10k_ml_error_stype_fw_nf {
/* 0x0 */ ML_CN10K_FW_ERR_NOERR = 0, /* No error */
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 834e55e88e9..b30af7c7a44 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -23,7 +23,7 @@
#define ML_FLAGS_SSO_COMPL BIT(1)
/* Hardware non-fatal error subtype database */
-static struct cnxk_ml_error_db ml_stype_db_hw_nf[] = {
+static struct cn10k_ml_error_db ml_stype_db_hw_nf[] = {
{ML_CN10K_FW_ERR_NOERR, "NO ERROR"},
{ML_CN10K_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
{ML_CN10K_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
@@ -38,7 +38,7 @@ static struct cnxk_ml_error_db ml_stype_db_hw_nf[] = {
};
/* Driver error subtype database */
-static struct cnxk_ml_error_db ml_stype_db_driver[] = {
+static struct cn10k_ml_error_db ml_stype_db_driver[] = {
{ML_CN10K_DRIVER_ERR_NOERR, "NO ERROR"},
{ML_CN10K_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
{ML_CN10K_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
@@ -784,6 +784,7 @@ cn10k_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
model->result_update = cn10k_ml_result_update;
model->set_error_code = cn10k_ml_set_error_code;
model->set_poll_addr = cn10k_ml_set_poll_addr;
+ model->op_error_get = cn10k_ml_op_error_get;
return 0;
}
@@ -1257,7 +1258,7 @@ cn10k_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request)
/* Handle driver error */
error_code = (union cn10k_ml_error_code *)&result->error_code;
- if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
+ if (error_code->s.etype == ML_CN10K_ETYPE_DRIVER) {
cn10k_mldev = &cnxk_mldev->cn10k_mldev;
/* Check for exception */
@@ -1310,7 +1311,7 @@ cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, ui
memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
- error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
+ error_code->s.etype = ML_CN10K_ETYPE_UNKNOWN;
req->cn10k_req.result.user_ptr = op->user_ptr;
cnxk_ml_set_poll_ptr(req);
@@ -1324,16 +1325,17 @@ cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, ui
}
__rte_hot int
-cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
+cn10k_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+ struct rte_ml_op_error *error)
{
union cn10k_ml_error_code *error_code;
- PLT_SET_USED(dev);
+ PLT_SET_USED(cnxk_mldev);
error_code = (union cn10k_ml_error_code *)&op->impl_opaque;
/* Copy sub error message */
- if (error_code->s.etype == ML_CNXK_ETYPE_HW_NONFATAL) {
+ if (error_code->s.etype == ML_CN10K_ETYPE_HW_NONFATAL) {
if (error_code->s.stype < PLT_DIM(ml_stype_db_hw_nf))
snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
ml_etype_db[error_code->s.etype].str,
@@ -1341,7 +1343,7 @@ cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_m
else
snprintf(error->message, RTE_ML_STR_MAX, "%s : UNKNOWN ERROR",
ml_etype_db[error_code->s.etype].str);
- } else if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
+ } else if (error_code->s.etype == ML_CN10K_ETYPE_DRIVER) {
snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
ml_etype_db[error_code->s.etype].str,
ml_stype_db_driver[error_code->s.stype].str);
@@ -1387,7 +1389,7 @@ cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,
memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
- error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
+ error_code->s.etype = ML_CN10K_ETYPE_UNKNOWN;
req->cn10k_req.result.user_ptr = NULL;
cnxk_ml_set_poll_ptr(req);
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.h b/drivers/ml/cnxk/cn10k_ml_ops.h
index eb3e1c139c7..0f352282014 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.h
+++ b/drivers/ml/cnxk/cn10k_ml_ops.h
@@ -312,7 +312,7 @@ int cn10k_ml_model_params_update(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_
/* Fast-path ops */
__rte_hot bool cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
-__rte_hot int cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
+__rte_hot int cn10k_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
struct rte_ml_op_error *error);
__rte_hot int cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,
uint16_t nb_batches);
diff --git a/drivers/ml/cnxk/cnxk_ml_dev.c b/drivers/ml/cnxk/cnxk_ml_dev.c
index dc4512223ca..567f8ea7542 100644
--- a/drivers/ml/cnxk/cnxk_ml_dev.c
+++ b/drivers/ml/cnxk/cnxk_ml_dev.c
@@ -12,11 +12,3 @@ int cnxk_ml_dev_initialized;
/* Dummy operations for ML device */
struct rte_ml_dev_ops ml_dev_dummy_ops = {0};
-
-/* Error type database */
-struct cnxk_ml_error_db ml_etype_db[] = {
- {ML_CNXK_ETYPE_NO_ERROR, "NO_ERROR"}, {ML_CNXK_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
- {ML_CNXK_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_CNXK_ETYPE_HW_FATAL, "HW_FATAL"},
- {ML_CNXK_ETYPE_HW_WARNING, "HW_WARNING"}, {ML_CNXK_ETYPE_DRIVER, "DRIVER_ERROR"},
- {ML_CNXK_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
-};
diff --git a/drivers/ml/cnxk/cnxk_ml_dev.h b/drivers/ml/cnxk/cnxk_ml_dev.h
index 491c4c4aea5..9e373e65715 100644
--- a/drivers/ml/cnxk/cnxk_ml_dev.h
+++ b/drivers/ml/cnxk/cnxk_ml_dev.h
@@ -22,22 +22,6 @@
#define ML_CNXK_POLL_JOB_START 0
#define ML_CNXK_POLL_JOB_FINISH 1
-/* Error types enumeration */
-enum cnxk_ml_error_etype {
- /* 0x0 */ ML_CNXK_ETYPE_NO_ERROR = 0, /* No error */
- /* 0x1 */ ML_CNXK_ETYPE_FW_NONFATAL, /* Firmware non-fatal error */
- /* 0x2 */ ML_CNXK_ETYPE_HW_NONFATAL, /* Hardware non-fatal error */
- /* 0x3 */ ML_CNXK_ETYPE_HW_FATAL, /* Hardware fatal error */
- /* 0x4 */ ML_CNXK_ETYPE_HW_WARNING, /* Hardware warning */
- /* 0x5 */ ML_CNXK_ETYPE_DRIVER, /* Driver specific error */
- /* 0x6 */ ML_CNXK_ETYPE_UNKNOWN, /* Unknown error */
-};
-
-struct cnxk_ml_error_db {
- uint64_t code;
- char str[RTE_ML_STR_MAX];
-};
-
/* Device type */
enum cnxk_ml_dev_type {
/* PCI based Marvell's ML HW accelerator device */
@@ -115,6 +99,6 @@ struct cnxk_ml_dev {
struct cnxk_ml_index_map *index_map;
};
-extern struct cnxk_ml_error_db ml_etype_db[];
+extern struct cn10k_ml_error_db ml_etype_db[];
#endif /* _CNXK_ML_DEV_H_ */
diff --git a/drivers/ml/cnxk/cnxk_ml_model.h b/drivers/ml/cnxk/cnxk_ml_model.h
index a2fced46a22..1cd5ca1906a 100644
--- a/drivers/ml/cnxk/cnxk_ml_model.h
+++ b/drivers/ml/cnxk/cnxk_ml_model.h
@@ -128,6 +128,8 @@ typedef bool (*enqueue_single_t)(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_o
typedef void (*result_update_t)(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
typedef void (*set_error_code_t)(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);
typedef void (*set_poll_addr_t)(struct cnxk_ml_req *req);
+typedef int (*op_error_get_t)(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+ struct rte_ml_op_error *error);
/* Model Object */
struct cnxk_ml_model {
@@ -184,6 +186,7 @@ struct cnxk_ml_model {
result_update_t result_update;
set_error_code_t set_error_code;
set_poll_addr_t set_poll_addr;
+ op_error_get_t op_error_get;
};
enum cnxk_ml_model_type cnxk_ml_model_get_type(struct rte_ml_model_params *params);
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.c b/drivers/ml/cnxk/cnxk_ml_ops.c
index 971362b2420..6e0160f2656 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.c
+++ b/drivers/ml/cnxk/cnxk_ml_ops.c
@@ -647,9 +647,7 @@ cnxk_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *co
cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;
-
- if (cnxk_mldev->type == CNXK_ML_DEV_TYPE_PCI)
- cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
+ cnxk_mldev->mldev->op_error_get = cnxk_ml_op_error_get;
/* Allocate and initialize index_map */
if (cnxk_mldev->index_map == NULL) {
@@ -1636,7 +1634,7 @@ cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op *
if (plt_tsc_cycles() < req->timeout)
goto empty_or_active;
else /* Timeout, set indication of driver error */
- model->set_error_code(req, ML_CNXK_ETYPE_DRIVER, 0);
+ model->set_error_code(req, ML_CN10K_ETYPE_DRIVER, 0);
}
model->result_update(cnxk_mldev, qp->id, req);
@@ -1654,6 +1652,18 @@ cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op *
return count;
}
+__rte_hot int
+cnxk_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
+{
+ struct cnxk_ml_dev *cnxk_mldev;
+ struct cnxk_ml_model *model;
+
+ cnxk_mldev = dev->data->dev_private;
+ model = cnxk_mldev->mldev->data->models[op->model_id];
+
+ return model->op_error_get(cnxk_mldev, op, error);
+}
+
struct rte_ml_dev_ops cnxk_ml_ops = {
/* Device control ops */
.dev_info_get = cnxk_ml_dev_info_get,
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.h b/drivers/ml/cnxk/cnxk_ml_ops.h
index e348cc4e857..7a79fec412e 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.h
+++ b/drivers/ml/cnxk/cnxk_ml_ops.h
@@ -83,5 +83,7 @@ __rte_hot uint16_t cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id,
struct rte_ml_op **ops, uint16_t nb_ops);
__rte_hot void cnxk_ml_set_poll_ptr(struct cnxk_ml_req *req);
__rte_hot uint64_t cnxk_ml_get_poll_ptr(struct cnxk_ml_req *req);
+__rte_hot int cnxk_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
+ struct rte_ml_op_error *error);
#endif /* _CNXK_ML_OPS_H_ */
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.c b/drivers/ml/cnxk/mvtvm_ml_ops.c
index e825c3fb23e..4c1cda3005b 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.c
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.c
@@ -329,11 +329,13 @@ mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
model->result_update = cn10k_ml_result_update;
model->set_error_code = cn10k_ml_set_error_code;
model->set_poll_addr = cn10k_ml_set_poll_addr;
+ model->op_error_get = cn10k_ml_op_error_get;
} else {
model->enqueue_single = mvtvm_ml_enqueue_single;
model->result_update = mvtvm_ml_result_update;
model->set_error_code = mvtvm_ml_set_error_code;
model->set_poll_addr = mvtvm_ml_set_poll_addr;
+ model->op_error_get = mvtvm_ml_op_error_get;
}
return 0;
@@ -584,6 +586,17 @@ mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype)
req->mvtvm_req.result.error_code = etype;
}
+__rte_hot int
+mvtvm_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+ struct rte_ml_op_error *error)
+{
+ RTE_SET_USED(cnxk_mldev);
+ RTE_SET_USED(op);
+ RTE_SET_USED(error);
+
+ return 0;
+}
+
__rte_hot bool
mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, uint16_t layer_id,
struct cnxk_ml_qp *qp, uint64_t head)
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.h b/drivers/ml/cnxk/mvtvm_ml_ops.h
index 0232c5ead5d..d8f2f361fb1 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.h
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.h
@@ -71,6 +71,8 @@ int mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_na
__rte_hot bool mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
+__rte_hot int mvtvm_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+ struct rte_ml_op_error *error);
__rte_hot void mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
__rte_hot void mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);
--
2.45.1
next reply other threads:[~2024-07-31 6:53 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-31 6:38 Srikanth Yalavarthi [this message]
2024-09-19 17:41 ` Jerin Jacob
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240731063803.9223-1-syalavarthi@marvell.com \
--to=syalavarthi@marvell.com \
--cc=aprabhu@marvell.com \
--cc=dev@dpdk.org \
--cc=ptakkar@marvell.com \
--cc=sshankarnara@marvell.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).