* [PATCH v3 1/5] net/mlx5: add new devarg for Tx queue consecutive memory
2025-06-27 16:37 ` [PATCH v3 0/5] Use consecutive Tx queues' memory Bing Zhao
@ 2025-06-27 16:37 ` Bing Zhao
2025-06-27 16:37 ` [PATCH v3 2/5] net/mlx5: calculate the memory length for all Tx queues Bing Zhao
` (4 subsequent siblings)
5 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-27 16:37 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
With this commit, a new device argument is introduced to control
the memory allocation for Tx queues.
By default, when no value is specified, an alignment equal to the
system page size is used. All SQ / CQ memory of the Tx queues will
be allocated once and a single umem & MR will be used.
When set to 0, the legacy way of per-queue umem allocation will
be selected in the following commits.
If the value is smaller than the system page size, the starting
address alignment will be rounded up to the page size.
The value is the logarithm to base 2 of the alignment in bytes.
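As an illustration only (not part of the patch), the devarg is passed like
any other mlx5 devarg, either on the EAL "-a" allowlist option or via
hotplug; the PCI address below is a placeholder:

#include <rte_dev.h>

/* Attach a hypothetical mlx5 port with 64 KiB (2^16) TxQ umem alignment. */
static int
attach_with_txq_mem_algn(void)
{
	return rte_eal_hotplug_add("pci", "0000:08:00.0", "txq_mem_algn=16");
}

Passing "txq_mem_algn=0" instead selects the legacy per-queue allocation.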
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
doc/guides/nics/mlx5.rst | 18 ++++++++++++++++++
drivers/net/mlx5/mlx5.c | 36 ++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5.h | 7 ++++---
3 files changed, 58 insertions(+), 3 deletions(-)
diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index c1dcb9ca68..82cb06909d 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -1682,6 +1682,24 @@ for an additional list of options shared with other mlx5 drivers.
By default, the PMD will set this value to 1.
+- ``txq_mem_algn`` parameter [int]
+
+ The logarithm base 2 value of the memory starting
+ address alignment for Tx queues' WQ and associated CQ. Different CPU
+ architectures and generations may have different cache systems. The memory
+ access order may impact the cache miss rate on different CPUs. This devarg
+ gives the ability to control the alignment and gaps between TxQs without
+ rebuilding the application binary. Users can tune the SW performance by specifying
+ this devarg after benchmark testing on their servers and systems.
+
+ By default, the PMD will set it to log2(4096), or log2(64*1024) on some specific OS
+ distributions - based on the system page size configuration.
+ All TxQs will use a unique memory region and umem area; each TxQ will start at an
+ address with 4K/64K (default system page size) alignment. If the user's input value is
+ less than the page size, it will be rounded up. If bigger than the maximal queue size,
+ a warning message will be shown and there will be some waste of memory space. 0 indicates
+ that the legacy per-queue memory allocation and separate MRs will be used as before.
+
Multiport E-Switch
------------------
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1bad8a9e90..a167d75aeb 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -185,6 +185,14 @@
/* Device parameter to control representor matching in ingress/egress flows with HWS. */
#define MLX5_REPR_MATCHING_EN "repr_matching_en"
+/*
+ * Alignment of the Tx queue starting address.
+ * If not set, a separate umem and MR are used for each TxQ.
+ * If set, consecutive memory and a single MR are used for all Tx queues, and each TxQ starts at
+ * the specified alignment.
+ */
+#define MLX5_TXQ_MEM_ALGN "txq_mem_algn"
+
/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;
@@ -1447,6 +1455,8 @@ mlx5_dev_args_check_handler(const char *key, const char *val, void *opaque)
config->cnt_svc.cycle_time = tmp;
} else if (strcmp(MLX5_REPR_MATCHING_EN, key) == 0) {
config->repr_matching = !!tmp;
+ } else if (strcmp(MLX5_TXQ_MEM_ALGN, key) == 0) {
+ config->txq_mem_algn = (uint32_t)tmp;
}
return 0;
}
@@ -1486,9 +1496,17 @@ mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
MLX5_HWS_CNT_SERVICE_CORE,
MLX5_HWS_CNT_CYCLE_TIME,
MLX5_REPR_MATCHING_EN,
+ MLX5_TXQ_MEM_ALGN,
NULL,
};
int ret = 0;
+ size_t alignment = rte_mem_page_size();
+ uint32_t max_queue_umem_size = MLX5_WQE_SIZE * mlx5_dev_get_max_wq_size(sh);
+
+ if (alignment == (size_t)-1) {
+ DRV_LOG(WARNING, "Failed to get page_size, using default 4K size.");
+ alignment = 4 * 1024;
+ }
/* Default configuration. */
memset(config, 0, sizeof(*config));
@@ -1501,6 +1519,7 @@ mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
config->cnt_svc.cycle_time = MLX5_CNT_SVC_CYCLE_TIME_DEFAULT;
config->cnt_svc.service_core = rte_get_main_lcore();
config->repr_matching = 1;
+ config->txq_mem_algn = log2above(alignment);
if (mkvlist != NULL) {
/* Process parameters. */
ret = mlx5_kvargs_process(mkvlist, params,
@@ -1567,6 +1586,16 @@ mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
config->hw_fcs_strip = 0;
else
config->hw_fcs_strip = sh->dev_cap.hw_fcs_strip;
+ if (config->txq_mem_algn != 0 && config->txq_mem_algn < log2above(alignment)) {
+ DRV_LOG(WARNING,
+ "\"txq_mem_algn\" too small %u, round up to %u.",
+ config->txq_mem_algn, log2above(alignment));
+ config->txq_mem_algn = log2above(alignment);
+ } else if (config->txq_mem_algn > log2above(max_queue_umem_size)) {
+ DRV_LOG(WARNING,
+ "\"txq_mem_algn\" with value %u bigger than %u.",
+ config->txq_mem_algn, log2above(max_queue_umem_size));
+ }
DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
(config->hw_fcs_strip ? "" : "not "));
DRV_LOG(DEBUG, "\"tx_pp\" is %d.", config->tx_pp);
@@ -1584,6 +1613,7 @@ mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
config->allow_duplicate_pattern);
DRV_LOG(DEBUG, "\"fdb_def_rule_en\" is %u.", config->fdb_def_rule);
DRV_LOG(DEBUG, "\"repr_matching_en\" is %u.", config->repr_matching);
+ DRV_LOG(DEBUG, "\"txq_mem_algn\" is %u.", config->txq_mem_algn);
return 0;
}
@@ -3151,6 +3181,12 @@ mlx5_probe_again_args_validate(struct mlx5_common_device *cdev,
sh->ibdev_name);
goto error;
}
+ if (sh->config.txq_mem_algn != config->txq_mem_algn) {
+ DRV_LOG(ERR, "\"TxQ memory alignment\" "
+ "configuration mismatch for shared %s context. %u - %u",
+ sh->ibdev_name, sh->config.txq_mem_algn, config->txq_mem_algn);
+ goto error;
+ }
mlx5_free(config);
return 0;
error:
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index f085656196..6b8d29a2bf 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -386,13 +386,14 @@ struct mlx5_sh_config {
uint32_t hw_fcs_strip:1; /* FCS stripping is supported. */
uint32_t allow_duplicate_pattern:1;
uint32_t lro_allowed:1; /* Whether LRO is allowed. */
+ /* Allow/Prevent the duplicate rules pattern. */
+ uint32_t fdb_def_rule:1; /* Create FDB default jump rule */
+ uint32_t repr_matching:1; /* Enable implicit vport matching in HWS FDB. */
+ uint32_t txq_mem_algn; /* logarithm value of the TxQ address alignment. */
struct {
uint16_t service_core;
uint32_t cycle_time; /* query cycle time in milli-second. */
} cnt_svc; /* configure for HW steering's counter's service. */
- /* Allow/Prevent the duplicate rules pattern. */
- uint32_t fdb_def_rule:1; /* Create FDB default jump rule */
- uint32_t repr_matching:1; /* Enable implicit vport matching in HWS FDB. */
};
/* Structure for VF VLAN workaround. */
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v3 2/5] net/mlx5: calculate the memory length for all Tx queues
2025-06-27 16:37 ` [PATCH v3 0/5] Use consecutive Tx queues' memory Bing Zhao
2025-06-27 16:37 ` [PATCH v3 1/5] net/mlx5: add new devarg for Tx queue consecutive memory Bing Zhao
@ 2025-06-27 16:37 ` Bing Zhao
2025-06-27 16:37 ` [PATCH v3 3/5] net/mlx5: allocate and release unique resources for " Bing Zhao
` (3 subsequent siblings)
5 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-27 16:37 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
When the alignment is non-zero, it means that a single umem and MR
allocation will be used for all Tx queues.
In this commit, the total length of the SQs and associated CQs will
be calculated and saved.
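For orientation only, a standalone sketch of the per-queue sizing that this
commit accumulates; the 64-byte WQE/CQE sizes, the completion threshold of 32
and the CQE overhead term are assumptions mirroring the mlx5 defines, not the
driver code itself:

#include <stdint.h>

static uint32_t
pow2_roundup(uint32_t v)
{
	uint32_t r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

/* E.g. 1024 WQEBBs -> 64 KiB of SQ memory; 37 CQEs round up to 64 -> 4 KiB of CQ memory. */
static void
tx_queue_mem_len(uint32_t wqebb_cnt, uint32_t *sq_len, uint32_t *cq_len)
{
	uint32_t cqe_n = wqebb_cnt / 32 + 1 + 4; /* assumed THRESH and INLINE_DIV values */

	*sq_len = 64 * pow2_roundup(wqebb_cnt);  /* assumed MLX5_WQE_SIZE */
	*cq_len = 64 * pow2_roundup(cqe_n);      /* assumed sizeof(struct mlx5_cqe) */
}

The sq_mem_len / cq_mem_len values computed this way are what this patch
rounds up to the txq_mem_algn alignment and adds into
priv->consec_tx_mem.{sq,cq}_total_size.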
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
drivers/net/mlx5/mlx5.h | 4 +++
drivers/net/mlx5/mlx5_tx.h | 2 ++
drivers/net/mlx5/mlx5_txq.c | 69 +++++++++++++++++++++++++++++++++++--
3 files changed, 73 insertions(+), 2 deletions(-)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 6b8d29a2bf..285c9ba396 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -2138,6 +2138,10 @@ struct mlx5_priv {
struct mlx5_nta_sample_ctx *nta_sample_ctx;
#endif
struct rte_eth_dev *shared_host; /* Host device for HW steering. */
+ struct {
+ uint32_t sq_total_size;
+ uint32_t cq_total_size;
+ } consec_tx_mem;
RTE_ATOMIC(uint16_t) shared_refcnt; /* HW steering host reference counter. */
};
diff --git a/drivers/net/mlx5/mlx5_tx.h b/drivers/net/mlx5/mlx5_tx.h
index 55568c41b1..94f2028513 100644
--- a/drivers/net/mlx5/mlx5_tx.h
+++ b/drivers/net/mlx5/mlx5_tx.h
@@ -149,6 +149,7 @@ struct __rte_cache_aligned mlx5_txq_data {
uint16_t inlen_mode; /* Minimal data length to inline. */
uint8_t tx_aggr_affinity; /* TxQ affinity configuration. */
uint32_t qp_num_8s; /* QP number shifted by 8. */
+ uint32_t sq_mem_len; /* Length of TxQ for WQEs */
uint64_t offloads; /* Offloads for Tx Queue. */
struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
struct mlx5_wqe *wqes; /* Work queue. */
@@ -167,6 +168,7 @@ struct __rte_cache_aligned mlx5_txq_data {
uint64_t ts_mask; /* Timestamp flag dynamic mask. */
uint64_t ts_last; /* Last scheduled timestamp. */
int32_t ts_offset; /* Timestamp field dynamic offset. */
+ uint32_t cq_mem_len; /* Length of TxQ for CQEs */
struct mlx5_dev_ctx_shared *sh; /* Shared context. */
struct mlx5_txq_stats stats; /* TX queue counters. */
struct mlx5_txq_stats stats_reset; /* stats on last reset. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 8ee8108497..6ae7d75cd0 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -17,6 +17,7 @@
#include <bus_pci_driver.h>
#include <rte_common.h>
#include <rte_eal_paging.h>
+#include <rte_bitops.h>
#include <mlx5_common.h>
#include <mlx5_common_mr.h>
@@ -1032,6 +1033,57 @@ txq_adjust_params(struct mlx5_txq_ctrl *txq_ctrl)
!txq_ctrl->txq.inlen_empw);
}
+/*
+ * Calculate WQ memory length for a Tx queue.
+ *
+ * @param log_wqe_cnt
+ * Logarithm value of WQE numbers.
+ *
+ * @return
+ * memory length of this WQ.
+ */
+static uint32_t mlx5_txq_wq_mem_length(uint32_t log_wqe_cnt)
+{
+ uint32_t num_of_wqbbs = RTE_BIT32(log_wqe_cnt);
+ uint32_t umem_size;
+
+ umem_size = MLX5_WQE_SIZE * num_of_wqbbs;
+ return umem_size;
+}
+
+/*
+ * Calculate CQ memory length for a Tx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param txq_ctrl
+ * Pointer to the TxQ control structure of the CQ.
+ *
+ * @return
+ * memory length of this CQ.
+ */
+static uint32_t
+mlx5_txq_cq_mem_length(struct rte_eth_dev *dev, struct mlx5_txq_ctrl *txq_ctrl)
+{
+ uint32_t cqe_n, log_desc_n;
+
+ if (__rte_trace_point_fp_is_enabled() &&
+ txq_ctrl->txq.offloads & RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP)
+ cqe_n = UINT16_MAX / 2 - 1;
+ else
+ cqe_n = (1UL << txq_ctrl->txq.elts_n) / MLX5_TX_COMP_THRESH +
+ 1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
+ log_desc_n = log2above(cqe_n);
+ cqe_n = 1UL << log_desc_n;
+ if (cqe_n > UINT16_MAX) {
+ DRV_LOG(ERR, "Port %u Tx queue %u requests to many CQEs %u.",
+ dev->data->port_id, txq_ctrl->txq.idx, cqe_n);
+ rte_errno = EINVAL;
+ return 0;
+ }
+ return sizeof(struct mlx5_cqe) * cqe_n;
+}
+
/**
* Create a DPDK Tx queue.
*
@@ -1057,6 +1109,8 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
struct mlx5_priv *priv = dev->data->dev_private;
struct mlx5_txq_ctrl *tmpl;
uint16_t max_wqe;
+ uint32_t wqebb_cnt, log_desc_n;
+ uint32_t alignment;
if (socket != (unsigned int)SOCKET_ID_ANY) {
tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
@@ -1099,15 +1153,26 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
tmpl->txq.idx = idx;
txq_set_params(tmpl);
txq_adjust_params(tmpl);
+ wqebb_cnt = txq_calc_wqebb_cnt(tmpl);
max_wqe = mlx5_dev_get_max_wq_size(priv->sh);
- if (txq_calc_wqebb_cnt(tmpl) > max_wqe) {
+ if (wqebb_cnt > max_wqe) {
DRV_LOG(ERR,
"port %u Tx WQEBB count (%d) exceeds the limit (%d),"
" try smaller queue size",
- dev->data->port_id, txq_calc_wqebb_cnt(tmpl), max_wqe);
+ dev->data->port_id, wqebb_cnt, max_wqe);
rte_errno = ENOMEM;
goto error;
}
+ if (priv->sh->config.txq_mem_algn != 0) {
+ log_desc_n = log2above(wqebb_cnt);
+ tmpl->txq.sq_mem_len = mlx5_txq_wq_mem_length(log_desc_n);
+ tmpl->txq.cq_mem_len = mlx5_txq_cq_mem_length(dev, tmpl);
+ DRV_LOG(DEBUG, "Port %u TxQ %u WQ length %u, CQ length %u before align.",
+ dev->data->port_id, idx, tmpl->txq.sq_mem_len, tmpl->txq.cq_mem_len);
+ alignment = RTE_BIT32(priv->sh->config.txq_mem_algn);
+ priv->consec_tx_mem.sq_total_size += RTE_ALIGN(tmpl->txq.sq_mem_len, alignment);
+ priv->consec_tx_mem.cq_total_size += RTE_ALIGN(tmpl->txq.cq_mem_len, alignment);
+ }
rte_atomic_fetch_add_explicit(&tmpl->refcnt, 1, rte_memory_order_relaxed);
tmpl->is_hairpin = false;
LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v3 3/5] net/mlx5: allocate and release unique resources for Tx queues
2025-06-27 16:37 ` [PATCH v3 0/5] Use consecutive Tx queues' memory Bing Zhao
2025-06-27 16:37 ` [PATCH v3 1/5] net/mlx5: add new devarg for Tx queue consecutive memory Bing Zhao
2025-06-27 16:37 ` [PATCH v3 2/5] net/mlx5: calculate the memory length for all Tx queues Bing Zhao
@ 2025-06-27 16:37 ` Bing Zhao
2025-06-27 16:37 ` [PATCH v3 4/5] net/mlx5: pass the information in Tx queue start Bing Zhao
` (2 subsequent siblings)
5 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-27 16:37 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
If the unique umem and MR method is enabled, the memory will be
pre-allocated and the MR registered in the device start stage, before
starting the Tx queues, for the Tx queues' later usage.
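Condensed sketch of the start-time flow added below (error handling and
logging trimmed; names follow the patch):

static int
consec_tx_mem_setup_sketch(struct mlx5_priv *priv)
{
	size_t align = (size_t)1 << priv->sh->config.txq_mem_algn;
	uint32_t size = priv->consec_tx_mem.sq_total_size +
			priv->consec_tx_mem.cq_total_size +
			2 * MLX5_DBR_SIZE * priv->txqs_n; /* SQ + CQ doorbells */
	void *buf = mlx5_malloc_numa_tolerant(MLX5_MEM_RTE | MLX5_MEM_ZERO,
					      size, align, priv->sh->numa_node);

	if (buf == NULL)
		return -ENOMEM;
	priv->consec_tx_mem.umem_obj = mlx5_os_umem_reg(priv->sh->cdev->ctx, buf,
							size, IBV_ACCESS_LOCAL_WRITE);
	if (priv->consec_tx_mem.umem_obj == NULL) {
		mlx5_free(buf);
		return -errno;
	}
	priv->consec_tx_mem.umem = buf;
	priv->consec_tx_mem.sq_cur_off = 0;
	priv->consec_tx_mem.cq_cur_off = priv->consec_tx_mem.sq_total_size;
	return 0;
}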
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
drivers/net/mlx5/mlx5.h | 4 ++
drivers/net/mlx5/mlx5_trigger.c | 85 +++++++++++++++++++++++++++++++++
2 files changed, 89 insertions(+)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 285c9ba396..c08894cd03 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -2141,6 +2141,10 @@ struct mlx5_priv {
struct {
uint32_t sq_total_size;
uint32_t cq_total_size;
+ void *umem;
+ void *umem_obj;
+ uint32_t sq_cur_off;
+ uint32_t cq_cur_off;
} consec_tx_mem;
RTE_ATOMIC(uint16_t) shared_refcnt; /* HW steering host reference counter. */
};
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 3aa7d01ee2..0fdf66d696 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -1135,6 +1135,83 @@ mlx5_hw_representor_port_allowed_start(struct rte_eth_dev *dev)
#endif
+/*
+ * Allocate TxQs unique umem and register its MR.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int mlx5_dev_allocate_consec_tx_mem(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ size_t alignment;
+ uint32_t total_size;
+ struct mlx5dv_devx_umem *umem_obj = NULL;
+ void *umem_buf = NULL;
+
+ /* Legacy per queue allocation, do nothing here. */
+ if (priv->sh->config.txq_mem_algn == 0)
+ return 0;
+ alignment = RTE_BIT32(priv->sh->config.txq_mem_algn);
+ total_size = priv->consec_tx_mem.sq_total_size + priv->consec_tx_mem.cq_total_size;
+ /*
+ * Hairpin queues can be skipped later.
+ * The queue size alignment is bigger than the doorbell alignment, so there is no need
+ * to align or round up again. One queue has two DBs.
+ */
+ total_size += MLX5_DBR_SIZE * priv->txqs_n * 2;
+ umem_buf = mlx5_malloc_numa_tolerant(MLX5_MEM_RTE | MLX5_MEM_ZERO, total_size,
+ alignment, priv->sh->numa_node);
+ if (!umem_buf) {
+ DRV_LOG(ERR, "Failed to allocate consecutive memory for TxQs.");
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ umem_obj = mlx5_os_umem_reg(priv->sh->cdev->ctx, (void *)(uintptr_t)umem_buf,
+ total_size, IBV_ACCESS_LOCAL_WRITE);
+ if (!umem_obj) {
+ DRV_LOG(ERR, "Failed to register unique umem for all SQs.");
+ rte_errno = errno;
+ if (umem_buf)
+ mlx5_free(umem_buf);
+ return -rte_errno;
+ }
+ priv->consec_tx_mem.umem = umem_buf;
+ priv->consec_tx_mem.sq_cur_off = 0;
+ priv->consec_tx_mem.cq_cur_off = priv->consec_tx_mem.sq_total_size;
+ priv->consec_tx_mem.umem_obj = umem_obj;
+ DRV_LOG(DEBUG, "Allocated umem %p with size %u for %u queues with sq_len %u,"
+ " cq_len %u and registered object %p on port %u",
+ umem_buf, total_size, priv->txqs_n, priv->consec_tx_mem.sq_total_size,
+ priv->consec_tx_mem.cq_total_size, (void *)umem_obj, dev->data->port_id);
+ return 0;
+}
+
+/*
+ * Release TxQs unique umem and deregister its MR.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+static void mlx5_dev_free_consec_tx_mem(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (priv->sh->config.txq_mem_algn == 0)
+ return;
+ if (priv->consec_tx_mem.umem_obj) {
+ mlx5_os_umem_dereg(priv->consec_tx_mem.umem_obj);
+ priv->consec_tx_mem.umem_obj = NULL;
+ }
+ if (priv->consec_tx_mem.umem) {
+ mlx5_free(priv->consec_tx_mem.umem);
+ priv->consec_tx_mem.umem = NULL;
+ }
+}
+
/**
* DPDK callback to start the device.
*
@@ -1225,6 +1302,12 @@ mlx5_dev_start(struct rte_eth_dev *dev)
if (ret)
goto error;
}
+ ret = mlx5_dev_allocate_consec_tx_mem(dev);
+ if (ret) {
+ DRV_LOG(ERR, "port %u Tx queues memory allocation failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ goto error;
+ }
ret = mlx5_txq_start(dev);
if (ret) {
DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
@@ -1358,6 +1441,7 @@ mlx5_dev_start(struct rte_eth_dev *dev)
mlx5_rxq_stop(dev);
if (priv->obj_ops.lb_dummy_queue_release)
priv->obj_ops.lb_dummy_queue_release(dev);
+ mlx5_dev_free_consec_tx_mem(dev);
mlx5_txpp_stop(dev); /* Stop last. */
rte_errno = ret; /* Restore rte_errno. */
return -rte_errno;
@@ -1470,6 +1554,7 @@ mlx5_dev_stop(struct rte_eth_dev *dev)
priv->sh->port[priv->dev_port - 1].nl_ih_port_id = RTE_MAX_ETHPORTS;
mlx5_txq_stop(dev);
mlx5_rxq_stop(dev);
+ mlx5_dev_free_consec_tx_mem(dev);
if (priv->obj_ops.lb_dummy_queue_release)
priv->obj_ops.lb_dummy_queue_release(dev);
mlx5_txpp_stop(dev);
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v3 4/5] net/mlx5: pass the information in Tx queue start
2025-06-27 16:37 ` [PATCH v3 0/5] Use consecutive Tx queues' memory Bing Zhao
` (2 preceding siblings ...)
2025-06-27 16:37 ` [PATCH v3 3/5] net/mlx5: allocate and release unique resources for " Bing Zhao
@ 2025-06-27 16:37 ` Bing Zhao
2025-06-27 16:37 ` [PATCH v3 5/5] net/mlx5: use consecutive memory for Tx queue creation Bing Zhao
2025-06-29 17:07 ` [PATCH v4 0/5] Use consecutive Tx queues' memory Bing Zhao
5 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-27 16:37 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
The actual DevX objects of the SQs and CQs are only created in the
function mlx5_txq_start() in the device start stage.
By changing the 1-level iteration to a 2-level iteration, the Tx
queues with the largest queue depths will be set up first.
This helps to split the memory from big chunks into smaller chunks.
In testing, such an assignment helps to improve the performance
slightly. All the doorbells will be grouped and padded at the end
of the umem area.
The umem object and the offset information are passed to the DevX
creation functions for further usage.
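The reworked loop (simplified from the hunk below) therefore has this shape,
visiting deeper queues before shallower ones:

	for (cnt = log_max_wqe; cnt > 0; cnt--) {	/* outer: depth, largest first */
		for (i = 0; i != priv->txqs_n; ++i) {	/* inner: every Tx queue */
			struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);

			if (txq_ctrl == NULL || txq_ctrl->txq.elts_n != cnt)
				continue;	/* not this depth, handled in another pass */
			/* ... allocate txq_ctrl->obj and call priv->obj_ops.txq_obj_new(dev, i) ... */
		}
	}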
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
drivers/common/mlx5/mlx5_devx_cmds.h | 10 ++++
drivers/net/mlx5/mlx5_devx.c | 32 ++++++++++-
drivers/net/mlx5/mlx5_trigger.c | 81 ++++++++++++++--------------
3 files changed, 82 insertions(+), 41 deletions(-)
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index 6c726a0d46..f5fda02c1e 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -483,6 +483,11 @@ struct mlx5_devx_create_sq_attr {
uint32_t packet_pacing_rate_limit_index:16;
uint32_t tis_lst_sz:16;
uint32_t tis_num:24;
+ uint32_t q_off;
+ void *umem;
+ void *umem_obj;
+ uint32_t q_len;
+ uint32_t db_off;
struct mlx5_devx_wq_attr wq_attr;
};
@@ -514,6 +519,11 @@ struct mlx5_devx_cq_attr {
uint64_t db_umem_offset;
uint32_t eqn;
uint64_t db_addr;
+ void *umem;
+ void *umem_obj;
+ uint32_t q_off;
+ uint32_t q_len;
+ uint32_t db_off;
};
/* Virtq attributes structure, used by VIRTQ operations. */
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 3d49e096ef..985ffdfd18 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -1493,10 +1493,25 @@ mlx5_txq_create_devx_sq_resources(struct rte_eth_dev *dev, uint16_t idx,
mlx5_ts_format_conv(cdev->config.hca_attr.sq_ts_format),
.tis_num = mlx5_get_txq_tis_num(dev, idx),
};
+ uint32_t db_start = priv->consec_tx_mem.sq_total_size + priv->consec_tx_mem.cq_total_size;
+ uint32_t act_sq_len, alignment;
+ int ret;
/* Create Send Queue object with DevX. */
- return mlx5_devx_sq_create(cdev->ctx, &txq_obj->sq_obj,
- log_desc_n, &sq_attr, priv->sh->numa_node);
+ if (priv->sh->config.txq_mem_algn) {
+ alignment = RTE_BIT32(priv->sh->config.txq_mem_algn);
+ sq_attr.umem = priv->consec_tx_mem.umem;
+ sq_attr.umem_obj = priv->consec_tx_mem.umem_obj;
+ act_sq_len = RTE_ALIGN(txq_data->sq_mem_len, alignment);
+ sq_attr.q_off = priv->consec_tx_mem.sq_cur_off;
+ sq_attr.db_off = db_start + (2 * idx) * MLX5_DBR_SIZE;
+ sq_attr.q_len = txq_data->sq_mem_len;
+ }
+ ret = mlx5_devx_sq_create(cdev->ctx, &txq_obj->sq_obj,
+ log_desc_n, &sq_attr, priv->sh->numa_node);
+ if (!ret)
+ priv->consec_tx_mem.sq_cur_off += act_sq_len;
+ return ret;
}
#endif
@@ -1536,6 +1551,8 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
uint32_t cqe_n, log_desc_n;
uint32_t wqe_n, wqe_size;
int ret = 0;
+ uint32_t db_start = priv->consec_tx_mem.sq_total_size + priv->consec_tx_mem.cq_total_size;
+ uint32_t act_cq_len, alignment;
MLX5_ASSERT(txq_data);
MLX5_ASSERT(txq_obj);
@@ -1557,6 +1574,15 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
rte_errno = EINVAL;
return 0;
}
+ if (priv->sh->config.txq_mem_algn) {
+ alignment = RTE_BIT32(priv->sh->config.txq_mem_algn);
+ cq_attr.umem = priv->consec_tx_mem.umem;
+ cq_attr.umem_obj = priv->consec_tx_mem.umem_obj;
+ act_cq_len = RTE_ALIGN(txq_data->cq_mem_len, alignment);
+ cq_attr.q_off = priv->consec_tx_mem.cq_cur_off;
+ cq_attr.db_off = db_start + (2 * idx + 1) * MLX5_DBR_SIZE;
+ cq_attr.q_len = txq_data->cq_mem_len;
+ }
/* Create completion queue object with DevX. */
ret = mlx5_devx_cq_create(sh->cdev->ctx, &txq_obj->cq_obj, log_desc_n,
&cq_attr, priv->sh->numa_node);
@@ -1641,6 +1667,8 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
#endif
txq_ctrl->uar_mmap_offset =
mlx5_os_get_devx_uar_mmap_offset(sh->tx_uar.obj);
+ if (priv->sh->config.txq_mem_algn)
+ priv->consec_tx_mem.cq_cur_off += act_cq_len;
ppriv->uar_table[txq_data->idx] = sh->tx_uar.bf_db;
dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
return 0;
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 0fdf66d696..80ffe88120 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -51,52 +51,55 @@ static int
mlx5_txq_start(struct rte_eth_dev *dev)
{
struct mlx5_priv *priv = dev->data->dev_private;
- unsigned int i;
+ uint32_t log_max_wqe = log2above(mlx5_dev_get_max_wq_size(priv->sh));
+ uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
+ unsigned int i, cnt;
int ret;
- for (i = 0; i != priv->txqs_n; ++i) {
- struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
- struct mlx5_txq_data *txq_data = &txq_ctrl->txq;
- uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
+ for (cnt = log_max_wqe; cnt > 0; cnt -= 1) {
+ for (i = 0; i != priv->txqs_n; ++i) {
+ struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
+ struct mlx5_txq_data *txq_data = &txq_ctrl->txq;
- if (!txq_ctrl)
- continue;
- if (!txq_ctrl->is_hairpin)
- txq_alloc_elts(txq_ctrl);
- MLX5_ASSERT(!txq_ctrl->obj);
- txq_ctrl->obj = mlx5_malloc_numa_tolerant(flags, sizeof(struct mlx5_txq_obj),
- 0, txq_ctrl->socket);
- if (!txq_ctrl->obj) {
- DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
- "memory resources.", dev->data->port_id,
- txq_data->idx);
- rte_errno = ENOMEM;
- goto error;
- }
- ret = priv->obj_ops.txq_obj_new(dev, i);
- if (ret < 0) {
- mlx5_free(txq_ctrl->obj);
- txq_ctrl->obj = NULL;
- goto error;
- }
- if (!txq_ctrl->is_hairpin) {
- size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
-
- txq_data->fcqs = mlx5_malloc_numa_tolerant(flags, size,
- RTE_CACHE_LINE_SIZE,
- txq_ctrl->socket);
- if (!txq_data->fcqs) {
- DRV_LOG(ERR, "Port %u Tx queue %u cannot "
- "allocate memory (FCQ).",
- dev->data->port_id, i);
+ if (!txq_ctrl || txq_data->elts_n != cnt)
+ continue;
+ if (!txq_ctrl->is_hairpin)
+ txq_alloc_elts(txq_ctrl);
+ MLX5_ASSERT(!txq_ctrl->obj);
+ txq_ctrl->obj = mlx5_malloc_numa_tolerant(flags, sizeof(struct mlx5_txq_obj),
+ 0, txq_ctrl->socket);
+ if (!txq_ctrl->obj) {
+ DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
+ "memory resources.", dev->data->port_id,
+ txq_data->idx);
rte_errno = ENOMEM;
goto error;
}
- }
- DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
- dev->data->port_id, i, (void *)&txq_ctrl->obj);
- LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
+ ret = priv->obj_ops.txq_obj_new(dev, i);
+ if (ret < 0) {
+ mlx5_free(txq_ctrl->obj);
+ txq_ctrl->obj = NULL;
+ goto error;
+ }
+ if (!txq_ctrl->is_hairpin) {
+ size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
+
+ txq_data->fcqs = mlx5_malloc_numa_tolerant(flags, size,
+ RTE_CACHE_LINE_SIZE,
+ txq_ctrl->socket);
+ if (!txq_data->fcqs) {
+ DRV_LOG(ERR, "Port %u Tx queue %u cannot "
+ "allocate memory (FCQ).",
+ dev->data->port_id, i);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ }
+ DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
+ dev->data->port_id, i, (void *)&txq_ctrl->obj);
+ LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
}
+}
return 0;
error:
ret = rte_errno; /* Save rte_errno before cleanup. */
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v3 5/5] net/mlx5: use consecutive memory for Tx queue creation
2025-06-27 16:37 ` [PATCH v3 0/5] Use consecutive Tx queues' memory Bing Zhao
` (3 preceding siblings ...)
2025-06-27 16:37 ` [PATCH v3 4/5] net/mlx5: pass the information in Tx queue start Bing Zhao
@ 2025-06-27 16:37 ` Bing Zhao
2025-06-29 17:07 ` [PATCH v4 0/5] Use consecutive Tx queues' memory Bing Zhao
5 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-27 16:37 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
The queues' starting address offsets within the umem and the doorbell
offsets are already passed to the DevX object creation functions.
When the queue length is not zero, it means that the memory was
pre-allocated and the new object creation with consecutive memory
should be enabled.
When destroying the SQ / CQ objects, if they are in consecutive mode,
the umem and MR should not be released; the global resources should
only be released when stopping the device.
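The destroy-path change boils down to an early return when the queue memory
came from the shared umem; after this patch the SQ destroy helper effectively
reads as follows (the CQ case is identical):

	static void
	mlx5_devx_sq_destroy(struct mlx5_devx_sq *sq)
	{
		if (sq->sq)
			claim_zero(mlx5_devx_cmd_destroy(sq->sq));
		if (sq->consec)
			return;	/* shared umem/MR are released only at device stop */
		if (sq->umem_obj)
			claim_zero(mlx5_os_umem_dereg(sq->umem_obj));
		if (sq->umem_buf)
			mlx5_free((void *)(uintptr_t)sq->umem_buf);
	}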
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
drivers/common/mlx5/mlx5_common_devx.c | 160 +++++++++++++++++--------
drivers/common/mlx5/mlx5_common_devx.h | 2 +
2 files changed, 110 insertions(+), 52 deletions(-)
diff --git a/drivers/common/mlx5/mlx5_common_devx.c b/drivers/common/mlx5/mlx5_common_devx.c
index aace5283e7..e237558ec2 100644
--- a/drivers/common/mlx5/mlx5_common_devx.c
+++ b/drivers/common/mlx5/mlx5_common_devx.c
@@ -30,6 +30,8 @@ mlx5_devx_cq_destroy(struct mlx5_devx_cq *cq)
{
if (cq->cq)
claim_zero(mlx5_devx_cmd_destroy(cq->cq));
+ if (cq->consec)
+ return;
if (cq->umem_obj)
claim_zero(mlx5_os_umem_dereg(cq->umem_obj));
if (cq->umem_buf)
@@ -93,6 +95,7 @@ mlx5_devx_cq_create(void *ctx, struct mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
uint32_t eqn;
uint32_t num_of_cqes = RTE_BIT32(log_desc_n);
int ret;
+ uint32_t umem_offset, umem_id;
if (page_size == (size_t)-1 || alignment == (size_t)-1) {
DRV_LOG(ERR, "Failed to get page_size.");
@@ -108,29 +111,44 @@ mlx5_devx_cq_create(void *ctx, struct mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
}
/* Allocate memory buffer for CQEs and doorbell record. */
umem_size = sizeof(struct mlx5_cqe) * num_of_cqes;
- umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
- umem_size += MLX5_DBR_SIZE;
- umem_buf = mlx5_malloc_numa_tolerant(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
- alignment, socket);
- if (!umem_buf) {
- DRV_LOG(ERR, "Failed to allocate memory for CQ.");
- rte_errno = ENOMEM;
- return -rte_errno;
- }
- /* Register allocated buffer in user space with DevX. */
- umem_obj = mlx5_os_umem_reg(ctx, (void *)(uintptr_t)umem_buf, umem_size,
- IBV_ACCESS_LOCAL_WRITE);
- if (!umem_obj) {
- DRV_LOG(ERR, "Failed to register umem for CQ.");
- rte_errno = errno;
- goto error;
+ if (!attr->q_len) {
+ umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
+ umem_size += MLX5_DBR_SIZE;
+ umem_buf = mlx5_malloc_numa_tolerant(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
+ alignment, socket);
+ if (!umem_buf) {
+ DRV_LOG(ERR, "Failed to allocate memory for CQ.");
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ /* Register allocated buffer in user space with DevX. */
+ umem_obj = mlx5_os_umem_reg(ctx, (void *)(uintptr_t)umem_buf, umem_size,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (!umem_obj) {
+ DRV_LOG(ERR, "Failed to register umem for CQ.");
+ rte_errno = errno;
+ goto error;
+ }
+ umem_offset = 0;
+ umem_id = mlx5_os_get_umem_id(umem_obj);
+ } else {
+ if (umem_size != attr->q_len) {
+ DRV_LOG(ERR, "Mismatch between saved length and calc length of CQ %u-%u",
+ umem_size, attr->q_len);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ umem_buf = attr->umem;
+ umem_offset = attr->q_off;
+ umem_dbrec = attr->db_off;
+ umem_id = mlx5_os_get_umem_id(attr->umem_obj);
}
/* Fill attributes for CQ object creation. */
attr->q_umem_valid = 1;
- attr->q_umem_id = mlx5_os_get_umem_id(umem_obj);
- attr->q_umem_offset = 0;
+ attr->q_umem_id = umem_id;
+ attr->q_umem_offset = umem_offset;
attr->db_umem_valid = 1;
- attr->db_umem_id = attr->q_umem_id;
+ attr->db_umem_id = umem_id;
attr->db_umem_offset = umem_dbrec;
attr->eqn = eqn;
attr->log_cq_size = log_desc_n;
@@ -142,19 +160,29 @@ mlx5_devx_cq_create(void *ctx, struct mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
rte_errno = ENOMEM;
goto error;
}
- cq_obj->umem_buf = umem_buf;
- cq_obj->umem_obj = umem_obj;
+ if (!attr->q_len) {
+ cq_obj->umem_buf = umem_buf;
+ cq_obj->umem_obj = umem_obj;
+ cq_obj->db_rec = RTE_PTR_ADD(cq_obj->umem_buf, umem_dbrec);
+ cq_obj->consec = false;
+ } else {
+ cq_obj->umem_buf = RTE_PTR_ADD(umem_buf, umem_offset);
+ cq_obj->umem_obj = attr->umem_obj;
+ cq_obj->db_rec = RTE_PTR_ADD(umem_buf, umem_dbrec);
+ cq_obj->consec = true;
+ }
cq_obj->cq = cq;
- cq_obj->db_rec = RTE_PTR_ADD(cq_obj->umem_buf, umem_dbrec);
/* Mark all CQEs initially as invalid. */
mlx5_cq_init(cq_obj, num_of_cqes);
return 0;
error:
ret = rte_errno;
- if (umem_obj)
- claim_zero(mlx5_os_umem_dereg(umem_obj));
- if (umem_buf)
- mlx5_free((void *)(uintptr_t)umem_buf);
+ if (!attr->q_len) {
+ if (umem_obj)
+ claim_zero(mlx5_os_umem_dereg(umem_obj));
+ if (umem_buf)
+ mlx5_free((void *)(uintptr_t)umem_buf);
+ }
rte_errno = ret;
return -rte_errno;
}
@@ -171,6 +199,8 @@ mlx5_devx_sq_destroy(struct mlx5_devx_sq *sq)
{
if (sq->sq)
claim_zero(mlx5_devx_cmd_destroy(sq->sq));
+ if (sq->consec)
+ return;
if (sq->umem_obj)
claim_zero(mlx5_os_umem_dereg(sq->umem_obj));
if (sq->umem_buf)
@@ -220,6 +250,7 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq *sq_obj, uint16_t log_wqbb_n,
uint32_t umem_size, umem_dbrec;
uint32_t num_of_wqbbs = RTE_BIT32(log_wqbb_n);
int ret;
+ uint32_t umem_offset, umem_id;
if (alignment == (size_t)-1) {
DRV_LOG(ERR, "Failed to get WQE buf alignment.");
@@ -228,30 +259,45 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq *sq_obj, uint16_t log_wqbb_n,
}
/* Allocate memory buffer for WQEs and doorbell record. */
umem_size = MLX5_WQE_SIZE * num_of_wqbbs;
- umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
- umem_size += MLX5_DBR_SIZE;
- umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
- alignment, socket);
- if (!umem_buf) {
- DRV_LOG(ERR, "Failed to allocate memory for SQ.");
- rte_errno = ENOMEM;
- return -rte_errno;
- }
- /* Register allocated buffer in user space with DevX. */
- umem_obj = mlx5_os_umem_reg(ctx, (void *)(uintptr_t)umem_buf, umem_size,
- IBV_ACCESS_LOCAL_WRITE);
- if (!umem_obj) {
- DRV_LOG(ERR, "Failed to register umem for SQ.");
- rte_errno = errno;
- goto error;
+ if (!attr->q_len) {
+ umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
+ umem_size += MLX5_DBR_SIZE;
+ umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
+ alignment, socket);
+ if (!umem_buf) {
+ DRV_LOG(ERR, "Failed to allocate memory for SQ.");
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ /* Register allocated buffer in user space with DevX. */
+ umem_obj = mlx5_os_umem_reg(ctx, (void *)(uintptr_t)umem_buf, umem_size,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (!umem_obj) {
+ DRV_LOG(ERR, "Failed to register umem for SQ.");
+ rte_errno = errno;
+ goto error;
+ }
+ umem_offset = 0;
+ umem_id = mlx5_os_get_umem_id(umem_obj);
+ } else {
+ if (umem_size != attr->q_len) {
+ DRV_LOG(ERR, "Mismatch between saved length and calc length of WQ %u-%u",
+ umem_size, attr->q_len);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ umem_buf = attr->umem;
+ umem_offset = attr->q_off;
+ umem_dbrec = attr->db_off;
+ umem_id = mlx5_os_get_umem_id(attr->umem_obj);
}
/* Fill attributes for SQ object creation. */
attr->wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
attr->wq_attr.wq_umem_valid = 1;
- attr->wq_attr.wq_umem_id = mlx5_os_get_umem_id(umem_obj);
- attr->wq_attr.wq_umem_offset = 0;
+ attr->wq_attr.wq_umem_id = umem_id;
+ attr->wq_attr.wq_umem_offset = umem_offset;
attr->wq_attr.dbr_umem_valid = 1;
- attr->wq_attr.dbr_umem_id = attr->wq_attr.wq_umem_id;
+ attr->wq_attr.dbr_umem_id = umem_id;
attr->wq_attr.dbr_addr = umem_dbrec;
attr->wq_attr.log_wq_stride = rte_log2_u32(MLX5_WQE_SIZE);
attr->wq_attr.log_wq_sz = log_wqbb_n;
@@ -263,17 +309,27 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq *sq_obj, uint16_t log_wqbb_n,
rte_errno = ENOMEM;
goto error;
}
- sq_obj->umem_buf = umem_buf;
- sq_obj->umem_obj = umem_obj;
+ if (!attr->q_len) {
+ sq_obj->umem_buf = umem_buf;
+ sq_obj->umem_obj = umem_obj;
+ sq_obj->db_rec = RTE_PTR_ADD(sq_obj->umem_buf, umem_dbrec);
+ sq_obj->consec = false;
+ } else {
+ sq_obj->umem_buf = RTE_PTR_ADD(umem_buf, attr->q_off);
+ sq_obj->umem_obj = attr->umem_obj;
+ sq_obj->db_rec = RTE_PTR_ADD(umem_buf, attr->db_off);
+ sq_obj->consec = true;
+ }
sq_obj->sq = sq;
- sq_obj->db_rec = RTE_PTR_ADD(sq_obj->umem_buf, umem_dbrec);
return 0;
error:
ret = rte_errno;
- if (umem_obj)
- claim_zero(mlx5_os_umem_dereg(umem_obj));
- if (umem_buf)
- mlx5_free((void *)(uintptr_t)umem_buf);
+ if (!attr->q_len) {
+ if (umem_obj)
+ claim_zero(mlx5_os_umem_dereg(umem_obj));
+ if (umem_buf)
+ mlx5_free((void *)(uintptr_t)umem_buf);
+ }
rte_errno = ret;
return -rte_errno;
}
diff --git a/drivers/common/mlx5/mlx5_common_devx.h b/drivers/common/mlx5/mlx5_common_devx.h
index 743f06042c..4cb9111dbb 100644
--- a/drivers/common/mlx5/mlx5_common_devx.h
+++ b/drivers/common/mlx5/mlx5_common_devx.h
@@ -21,6 +21,7 @@ struct mlx5_devx_cq {
volatile struct mlx5_cqe *cqes; /* The CQ ring buffer. */
};
volatile uint32_t *db_rec; /* The CQ doorbell record. */
+ bool consec; /* Using consecutive memory. */
};
/* DevX Send Queue structure. */
@@ -33,6 +34,7 @@ struct mlx5_devx_sq {
volatile struct mlx5_aso_wqe *aso_wqes;
};
volatile uint32_t *db_rec; /* The SQ doorbell record. */
+ bool consec; /* Using consecutive memory. */
};
/* DevX Queue Pair structure. */
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v4 0/5] Use consecutive Tx queues' memory
2025-06-27 16:37 ` [PATCH v3 0/5] Use consecutive Tx queues' memory Bing Zhao
` (4 preceding siblings ...)
2025-06-27 16:37 ` [PATCH v3 5/5] net/mlx5: use consecutive memory for Tx queue creation Bing Zhao
@ 2025-06-29 17:07 ` Bing Zhao
2025-06-29 17:07 ` [PATCH v4 1/5] net/mlx5: add new devarg for Tx queue consecutive memory Bing Zhao
` (4 more replies)
5 siblings, 5 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-29 17:07 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
This patchset moves all the mlx5 Tx queues' memory into one
consecutive memory area. All the WQEBBs will be allocated based
on offsets within this memory area.
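The resulting per-port umem layout (illustrative only, offsets not to scale):

	/*
	 * base (aligned to 1 << txq_mem_algn)
	 * |<-- sq_total_size -->|<-- cq_total_size -->|<-- 2 * MLX5_DBR_SIZE * txqs_n -->|
	 * [ SQ0 | SQ1 | ... SQn ][ CQ0 | CQ1 | ... CQn ][ SQ / CQ doorbell records       ]
	 */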
---
v2:
1. add a new fix for legacy code of WQE calculation
2. fix the style
v3:
1. change the devarg and add description.
2. reorganize the code with different commits order.
v4:
1. fix building failure on Windows and OSes with different compilers
2. update the rst
3. addressing comments and fix bugs
---
Bing Zhao (5):
net/mlx5: add new devarg for Tx queue consecutive memory
net/mlx5: calculate the memory length for all Tx queues
net/mlx5: allocate and release unique resources for Tx queues
net/mlx5: pass the information in Tx queue start
net/mlx5: use consecutive memory for Tx queue creation
doc/guides/nics/mlx5.rst | 25 ++++
drivers/common/mlx5/mlx5_common_devx.c | 160 +++++++++++++++--------
drivers/common/mlx5/mlx5_common_devx.h | 2 +
drivers/common/mlx5/mlx5_devx_cmds.h | 10 ++
drivers/net/mlx5/mlx5.c | 36 +++++
drivers/net/mlx5/mlx5.h | 15 ++-
drivers/net/mlx5/mlx5_devx.c | 26 +++-
drivers/net/mlx5/mlx5_trigger.c | 173 +++++++++++++++++++------
drivers/net/mlx5/mlx5_tx.h | 2 +
drivers/net/mlx5/mlx5_txq.c | 67 +++++++++-
10 files changed, 418 insertions(+), 98 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v4 1/5] net/mlx5: add new devarg for Tx queue consecutive memory
2025-06-29 17:07 ` [PATCH v4 0/5] Use consecutive Tx queues' memory Bing Zhao
@ 2025-06-29 17:07 ` Bing Zhao
2025-06-29 17:07 ` [PATCH v4 2/5] net/mlx5: calculate the memory length for all Tx queues Bing Zhao
` (3 subsequent siblings)
4 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-29 17:07 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
With this commit, a new device argument is introduced to control
the memory allocation for Tx queues.
By default, when no value is specified, an alignment equal to the
system page size is used. All SQ / CQ memory of the Tx queues will
be allocated once and a single umem & MR will be used.
When set to 0, the legacy way of per-queue umem allocation will
be selected in the following commits.
If the value is smaller than the system page size, the starting
address alignment will be rounded up to the page size.
The value is the logarithm to base 2 of the alignment in bytes.
Refer to the rst file change for more details.
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
doc/guides/nics/mlx5.rst | 25 +++++++++++++++++++++++++
drivers/net/mlx5/mlx5.c | 36 ++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5.h | 7 ++++---
3 files changed, 65 insertions(+), 3 deletions(-)
diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index c1dcb9ca68..13e46970ab 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -1682,6 +1682,31 @@ for an additional list of options shared with other mlx5 drivers.
By default, the PMD will set this value to 1.
+- ``txq_mem_algn`` parameter [int]
+
+ A logarithm base 2 value for the memory starting address alignment
+ for Tx queues' WQ and associated CQ.
+
+ Different CPU architectures and generations may have different cache systems.
+ The memory accessing order may impact the cache misses rate on different CPUs.
+ This devarg gives the ability to control the umem alignment for all TxQs without
+ rebuilding the application binary.
+
+ The performance can be tuned by specifying this devarg after benchmark testing
+ on a specific system and hardware.
+
+ By default, ``txq_mem_algn`` is set to log2(4K), or log2(64K) on some specific OS
+ distributions - based on the system page size configuration.
+ All Tx queues will use a unique memory region and umem area. Each TxQ will start at
+ an address right after the previous one except the 1st queue that will be aligned at
+ the given size of address boundary controlled by this devarg.
+
+ If the value is less than the page size, it will be rounded up.
+ If it is bigger than the maximal queue size, a warning message will appear and there
+ will be some waste of memory at the beginning.
+
+ 0 indicates legacy per queue memory allocation and separate Memory Regions (MR).
+
Multiport E-Switch
------------------
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1bad8a9e90..a364e9e421 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -185,6 +185,14 @@
/* Device parameter to control representor matching in ingress/egress flows with HWS. */
#define MLX5_REPR_MATCHING_EN "repr_matching_en"
+/*
+ * Alignment of the Tx queue starting address.
+ * If not set, a separate umem and MR are used for each TxQ.
+ * If set, consecutive memory and a single MR are used for all Tx queues, and each TxQ starts at
+ * the specified alignment.
+ */
+#define MLX5_TXQ_MEM_ALGN "txq_mem_algn"
+
/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;
@@ -1447,6 +1455,8 @@ mlx5_dev_args_check_handler(const char *key, const char *val, void *opaque)
config->cnt_svc.cycle_time = tmp;
} else if (strcmp(MLX5_REPR_MATCHING_EN, key) == 0) {
config->repr_matching = !!tmp;
+ } else if (strcmp(MLX5_TXQ_MEM_ALGN, key) == 0) {
+ config->txq_mem_algn = (uint32_t)tmp;
}
return 0;
}
@@ -1486,9 +1496,17 @@ mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
MLX5_HWS_CNT_SERVICE_CORE,
MLX5_HWS_CNT_CYCLE_TIME,
MLX5_REPR_MATCHING_EN,
+ MLX5_TXQ_MEM_ALGN,
NULL,
};
int ret = 0;
+ size_t alignment = rte_mem_page_size();
+ uint32_t max_queue_umem_size = MLX5_WQE_SIZE * mlx5_dev_get_max_wq_size(sh);
+
+ if (alignment == (size_t)-1) {
+ alignment = (1 << MLX5_LOG_PAGE_SIZE);
+ DRV_LOG(WARNING, "Failed to get page_size, using default %zu size.", alignment);
+ }
/* Default configuration. */
memset(config, 0, sizeof(*config));
@@ -1501,6 +1519,7 @@ mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
config->cnt_svc.cycle_time = MLX5_CNT_SVC_CYCLE_TIME_DEFAULT;
config->cnt_svc.service_core = rte_get_main_lcore();
config->repr_matching = 1;
+ config->txq_mem_algn = log2above(alignment);
if (mkvlist != NULL) {
/* Process parameters. */
ret = mlx5_kvargs_process(mkvlist, params,
@@ -1567,6 +1586,16 @@ mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
config->hw_fcs_strip = 0;
else
config->hw_fcs_strip = sh->dev_cap.hw_fcs_strip;
+ if (config->txq_mem_algn != 0 && config->txq_mem_algn < log2above(alignment)) {
+ DRV_LOG(WARNING,
+ "\"txq_mem_algn\" too small %u, round up to %u.",
+ config->txq_mem_algn, log2above(alignment));
+ config->txq_mem_algn = log2above(alignment);
+ } else if (config->txq_mem_algn > log2above(max_queue_umem_size)) {
+ DRV_LOG(WARNING,
+ "\"txq_mem_algn\" with value %u bigger than %u.",
+ config->txq_mem_algn, log2above(max_queue_umem_size));
+ }
DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
(config->hw_fcs_strip ? "" : "not "));
DRV_LOG(DEBUG, "\"tx_pp\" is %d.", config->tx_pp);
@@ -1584,6 +1613,7 @@ mlx5_shared_dev_ctx_args_config(struct mlx5_dev_ctx_shared *sh,
config->allow_duplicate_pattern);
DRV_LOG(DEBUG, "\"fdb_def_rule_en\" is %u.", config->fdb_def_rule);
DRV_LOG(DEBUG, "\"repr_matching_en\" is %u.", config->repr_matching);
+ DRV_LOG(DEBUG, "\"txq_mem_algn\" is %u.", config->txq_mem_algn);
return 0;
}
@@ -3151,6 +3181,12 @@ mlx5_probe_again_args_validate(struct mlx5_common_device *cdev,
sh->ibdev_name);
goto error;
}
+ if (sh->config.txq_mem_algn != config->txq_mem_algn) {
+ DRV_LOG(ERR, "\"TxQ memory alignment\" "
+ "configuration mismatch for shared %s context. %u - %u",
+ sh->ibdev_name, sh->config.txq_mem_algn, config->txq_mem_algn);
+ goto error;
+ }
mlx5_free(config);
return 0;
error:
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index f085656196..6b8d29a2bf 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -386,13 +386,14 @@ struct mlx5_sh_config {
uint32_t hw_fcs_strip:1; /* FCS stripping is supported. */
uint32_t allow_duplicate_pattern:1;
uint32_t lro_allowed:1; /* Whether LRO is allowed. */
+ /* Allow/Prevent the duplicate rules pattern. */
+ uint32_t fdb_def_rule:1; /* Create FDB default jump rule */
+ uint32_t repr_matching:1; /* Enable implicit vport matching in HWS FDB. */
+ uint32_t txq_mem_algn; /* logarithm value of the TxQ address alignment. */
struct {
uint16_t service_core;
uint32_t cycle_time; /* query cycle time in milli-second. */
} cnt_svc; /* configure for HW steering's counter's service. */
- /* Allow/Prevent the duplicate rules pattern. */
- uint32_t fdb_def_rule:1; /* Create FDB default jump rule */
- uint32_t repr_matching:1; /* Enable implicit vport matching in HWS FDB. */
};
/* Structure for VF VLAN workaround. */
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v4 2/5] net/mlx5: calculate the memory length for all Tx queues
2025-06-29 17:07 ` [PATCH v4 0/5] Use consecutive Tx queues' memory Bing Zhao
2025-06-29 17:07 ` [PATCH v4 1/5] net/mlx5: add new devarg for Tx queue consecutive memory Bing Zhao
@ 2025-06-29 17:07 ` Bing Zhao
2025-06-29 17:07 ` [PATCH v4 3/5] net/mlx5: allocate and release unique resources for " Bing Zhao
` (2 subsequent siblings)
4 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-29 17:07 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
When the alignment is non-zero, it means that a single umem and MR
allocation will be used for all Tx queues.
In this commit, the total length of the SQs and associated CQs will
be calculated and saved.
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
drivers/net/mlx5/mlx5.h | 4 +++
drivers/net/mlx5/mlx5_tx.h | 2 ++
drivers/net/mlx5/mlx5_txq.c | 67 +++++++++++++++++++++++++++++++++++--
3 files changed, 71 insertions(+), 2 deletions(-)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 6b8d29a2bf..285c9ba396 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -2138,6 +2138,10 @@ struct mlx5_priv {
struct mlx5_nta_sample_ctx *nta_sample_ctx;
#endif
struct rte_eth_dev *shared_host; /* Host device for HW steering. */
+ struct {
+ uint32_t sq_total_size;
+ uint32_t cq_total_size;
+ } consec_tx_mem;
RTE_ATOMIC(uint16_t) shared_refcnt; /* HW steering host reference counter. */
};
diff --git a/drivers/net/mlx5/mlx5_tx.h b/drivers/net/mlx5/mlx5_tx.h
index 55568c41b1..94f2028513 100644
--- a/drivers/net/mlx5/mlx5_tx.h
+++ b/drivers/net/mlx5/mlx5_tx.h
@@ -149,6 +149,7 @@ struct __rte_cache_aligned mlx5_txq_data {
uint16_t inlen_mode; /* Minimal data length to inline. */
uint8_t tx_aggr_affinity; /* TxQ affinity configuration. */
uint32_t qp_num_8s; /* QP number shifted by 8. */
+ uint32_t sq_mem_len; /* Length of TxQ for WQEs */
uint64_t offloads; /* Offloads for Tx Queue. */
struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
struct mlx5_wqe *wqes; /* Work queue. */
@@ -167,6 +168,7 @@ struct __rte_cache_aligned mlx5_txq_data {
uint64_t ts_mask; /* Timestamp flag dynamic mask. */
uint64_t ts_last; /* Last scheduled timestamp. */
int32_t ts_offset; /* Timestamp field dynamic offset. */
+ uint32_t cq_mem_len; /* Length of TxQ for CQEs */
struct mlx5_dev_ctx_shared *sh; /* Shared context. */
struct mlx5_txq_stats stats; /* TX queue counters. */
struct mlx5_txq_stats stats_reset; /* stats on last reset. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 8ee8108497..1948a700f1 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -17,6 +17,7 @@
#include <bus_pci_driver.h>
#include <rte_common.h>
#include <rte_eal_paging.h>
+#include <rte_bitops.h>
#include <mlx5_common.h>
#include <mlx5_common_mr.h>
@@ -1032,6 +1033,57 @@ txq_adjust_params(struct mlx5_txq_ctrl *txq_ctrl)
!txq_ctrl->txq.inlen_empw);
}
+/*
+ * Calculate WQ memory length for a Tx queue.
+ *
+ * @param log_wqe_cnt
+ * Logarithm value of WQE numbers.
+ *
+ * @return
+ * memory length of this WQ.
+ */
+static uint32_t mlx5_txq_wq_mem_length(uint32_t log_wqe_cnt)
+{
+ uint32_t num_of_wqbbs = 1U << log_wqe_cnt;
+ uint32_t umem_size;
+
+ umem_size = MLX5_WQE_SIZE * num_of_wqbbs;
+ return umem_size;
+}
+
+/*
+ * Calculate CQ memory length for a Tx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param txq_ctrl
+ * Pointer to the TxQ control structure of the CQ.
+ *
+ * @return
+ * memory length of this CQ.
+ */
+static uint32_t
+mlx5_txq_cq_mem_length(struct rte_eth_dev *dev, struct mlx5_txq_ctrl *txq_ctrl)
+{
+ uint32_t cqe_n, log_desc_n;
+
+ if (__rte_trace_point_fp_is_enabled() &&
+ txq_ctrl->txq.offloads & RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP)
+ cqe_n = UINT16_MAX / 2 - 1;
+ else
+ cqe_n = (1UL << txq_ctrl->txq.elts_n) / MLX5_TX_COMP_THRESH +
+ 1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
+ log_desc_n = log2above(cqe_n);
+ cqe_n = 1UL << log_desc_n;
+ if (cqe_n > UINT16_MAX) {
+ DRV_LOG(ERR, "Port %u Tx queue %u requests to many CQEs %u.",
+ dev->data->port_id, txq_ctrl->txq.idx, cqe_n);
+ rte_errno = EINVAL;
+ return 0;
+ }
+ return sizeof(struct mlx5_cqe) * cqe_n;
+}
+
/**
* Create a DPDK Tx queue.
*
@@ -1057,6 +1109,7 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
struct mlx5_priv *priv = dev->data->dev_private;
struct mlx5_txq_ctrl *tmpl;
uint16_t max_wqe;
+ uint32_t wqebb_cnt, log_desc_n;
if (socket != (unsigned int)SOCKET_ID_ANY) {
tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
@@ -1099,15 +1152,25 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
tmpl->txq.idx = idx;
txq_set_params(tmpl);
txq_adjust_params(tmpl);
+ wqebb_cnt = txq_calc_wqebb_cnt(tmpl);
max_wqe = mlx5_dev_get_max_wq_size(priv->sh);
- if (txq_calc_wqebb_cnt(tmpl) > max_wqe) {
+ if (wqebb_cnt > max_wqe) {
DRV_LOG(ERR,
"port %u Tx WQEBB count (%d) exceeds the limit (%d),"
" try smaller queue size",
- dev->data->port_id, txq_calc_wqebb_cnt(tmpl), max_wqe);
+ dev->data->port_id, wqebb_cnt, max_wqe);
rte_errno = ENOMEM;
goto error;
}
+ if (priv->sh->config.txq_mem_algn != 0) {
+ log_desc_n = log2above(wqebb_cnt);
+ tmpl->txq.sq_mem_len = mlx5_txq_wq_mem_length(log_desc_n);
+ tmpl->txq.cq_mem_len = mlx5_txq_cq_mem_length(dev, tmpl);
+ DRV_LOG(DEBUG, "Port %u TxQ %u WQ length %u, CQ length %u before align.",
+ dev->data->port_id, idx, tmpl->txq.sq_mem_len, tmpl->txq.cq_mem_len);
+ priv->consec_tx_mem.sq_total_size += tmpl->txq.sq_mem_len;
+ priv->consec_tx_mem.cq_total_size += tmpl->txq.cq_mem_len;
+ }
rte_atomic_fetch_add_explicit(&tmpl->refcnt, 1, rte_memory_order_relaxed);
tmpl->is_hairpin = false;
LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v4 3/5] net/mlx5: allocate and release unique resources for Tx queues
2025-06-29 17:07 ` [PATCH v4 0/5] Use consecutive Tx queues' memory Bing Zhao
2025-06-29 17:07 ` [PATCH v4 1/5] net/mlx5: add new devarg for Tx queue consecutive memory Bing Zhao
2025-06-29 17:07 ` [PATCH v4 2/5] net/mlx5: calculate the memory length for all Tx queues Bing Zhao
@ 2025-06-29 17:07 ` Bing Zhao
2025-06-29 17:07 ` [PATCH v4 4/5] net/mlx5: pass the information in Tx queue start Bing Zhao
2025-06-29 17:07 ` [PATCH v4 5/5] net/mlx5: use consecutive memory for Tx queue creation Bing Zhao
4 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-29 17:07 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
If the unique umem and MR method is enabled, the memory will be
pre-allocated and the MR registered in the device start stage, before
starting the Tx queues, for the Tx queues' later usage.
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
drivers/net/mlx5/mlx5.h | 4 ++
drivers/net/mlx5/mlx5_trigger.c | 91 +++++++++++++++++++++++++++++++++
2 files changed, 95 insertions(+)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 285c9ba396..c08894cd03 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -2141,6 +2141,10 @@ struct mlx5_priv {
struct {
uint32_t sq_total_size;
uint32_t cq_total_size;
+ void *umem;
+ void *umem_obj;
+ uint32_t sq_cur_off;
+ uint32_t cq_cur_off;
} consec_tx_mem;
RTE_ATOMIC(uint16_t) shared_refcnt; /* HW steering host reference counter. */
};
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 3aa7d01ee2..00ffb39ecb 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -1135,6 +1135,89 @@ mlx5_hw_representor_port_allowed_start(struct rte_eth_dev *dev)
#endif
+/*
+ * Allocate TxQs unique umem and register its MR.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int mlx5_dev_allocate_consec_tx_mem(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ size_t alignment;
+ uint32_t total_size;
+ struct mlx5dv_devx_umem *umem_obj = NULL;
+ void *umem_buf = NULL;
+
+ /* Legacy per queue allocation, do nothing here. */
+ if (priv->sh->config.txq_mem_algn == 0)
+ return 0;
+ alignment = (size_t)(1U << priv->sh->config.txq_mem_algn);
+ total_size = priv->consec_tx_mem.sq_total_size + priv->consec_tx_mem.cq_total_size;
+ /*
+ * Hairpin queues can be skipped later.
+ * The queue size alignment is bigger than the doorbell alignment, so there is no need
+ * to align or round up again. One queue has two DBs (for CQ + WQ).
+ */
+ total_size += MLX5_DBR_SIZE * priv->txqs_n * 2;
+ umem_buf = mlx5_malloc_numa_tolerant(MLX5_MEM_RTE | MLX5_MEM_ZERO, total_size,
+ alignment, priv->sh->numa_node);
+ if (!umem_buf) {
+ DRV_LOG(ERR, "Failed to allocate consecutive memory for TxQs.");
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ umem_obj = mlx5_os_umem_reg(priv->sh->cdev->ctx, (void *)(uintptr_t)umem_buf,
+ total_size, IBV_ACCESS_LOCAL_WRITE);
+ if (!umem_obj) {
+ DRV_LOG(ERR, "Failed to register unique umem for all SQs.");
+ rte_errno = errno;
+ if (umem_buf)
+ mlx5_free(umem_buf);
+ return -rte_errno;
+ }
+ priv->consec_tx_mem.umem = umem_buf;
+ priv->consec_tx_mem.sq_cur_off = 0;
+ priv->consec_tx_mem.cq_cur_off = priv->consec_tx_mem.sq_total_size;
+ priv->consec_tx_mem.umem_obj = umem_obj;
+ DRV_LOG(DEBUG, "Allocated umem %p with size %u for %u queues with sq_len %u,"
+ " cq_len %u and registered object %p on port %u",
+ umem_buf, total_size, priv->txqs_n, priv->consec_tx_mem.sq_total_size,
+ priv->consec_tx_mem.cq_total_size, (void *)umem_obj, dev->data->port_id);
+ return 0;
+}
+
+/*
+ * Release TxQs unique umem and deregister its MR.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param on_stop
+ * Whether this is called in the device stop stage.
+ */
+static void mlx5_dev_free_consec_tx_mem(struct rte_eth_dev *dev, bool on_stop)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (priv->consec_tx_mem.umem_obj) {
+ mlx5_os_umem_dereg(priv->consec_tx_mem.umem_obj);
+ priv->consec_tx_mem.umem_obj = NULL;
+ }
+ if (priv->consec_tx_mem.umem) {
+ mlx5_free(priv->consec_tx_mem.umem);
+ priv->consec_tx_mem.umem = NULL;
+ }
+ /* Queues information will not be reset. */
+ if (on_stop) {
+ /* Reset to 0s for re-setting up queues. */
+ priv->consec_tx_mem.sq_cur_off = 0;
+ priv->consec_tx_mem.cq_cur_off = 0;
+ }
+}
+
/**
* DPDK callback to start the device.
*
@@ -1225,6 +1308,12 @@ mlx5_dev_start(struct rte_eth_dev *dev)
if (ret)
goto error;
}
+ ret = mlx5_dev_allocate_consec_tx_mem(dev);
+ if (ret) {
+ DRV_LOG(ERR, "port %u Tx queues memory allocation failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ goto error;
+ }
ret = mlx5_txq_start(dev);
if (ret) {
DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
@@ -1358,6 +1447,7 @@ mlx5_dev_start(struct rte_eth_dev *dev)
mlx5_rxq_stop(dev);
if (priv->obj_ops.lb_dummy_queue_release)
priv->obj_ops.lb_dummy_queue_release(dev);
+ mlx5_dev_free_consec_tx_mem(dev, false);
mlx5_txpp_stop(dev); /* Stop last. */
rte_errno = ret; /* Restore rte_errno. */
return -rte_errno;
@@ -1470,6 +1560,7 @@ mlx5_dev_stop(struct rte_eth_dev *dev)
priv->sh->port[priv->dev_port - 1].nl_ih_port_id = RTE_MAX_ETHPORTS;
mlx5_txq_stop(dev);
mlx5_rxq_stop(dev);
+ mlx5_dev_free_consec_tx_mem(dev, true);
if (priv->obj_ops.lb_dummy_queue_release)
priv->obj_ops.lb_dummy_queue_release(dev);
mlx5_txpp_stop(dev);
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v4 4/5] net/mlx5: pass the information in Tx queue start
2025-06-29 17:07 ` [PATCH v4 0/5] Use consecutive Tx queues' memory Bing Zhao
` (2 preceding siblings ...)
2025-06-29 17:07 ` [PATCH v4 3/5] net/mlx5: allocate and release unique resources for " Bing Zhao
@ 2025-06-29 17:07 ` Bing Zhao
2025-06-29 17:07 ` [PATCH v4 5/5] net/mlx5: use consecutive memory for Tx queue creation Bing Zhao
4 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-29 17:07 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
The actual DevX objects of the SQs and CQs are only created in the
function mlx5_txq_start() at the device start stage.
By changing the 1-level iteration into a 2-level iteration, the Tx
queues with the biggest queue depth are set up first (see the sketch
below). This helps to split the memory from big chunks into smaller
chunks. In testing, this assignment improves the performance slightly.
All the doorbells are grouped and padded at the end of the umem area.
The umem object and the offset information are passed to the DevX
creation functions for further usage.
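Below is a minimal standalone C sketch of the 2-level iteration order only;
setup_queues_biggest_first(), setup_queue() and queue_log_depth[] are
illustrative names, not the actual driver structures:

#include <stdint.h>

/*
 * Sketch of the 2-level iteration, assuming queue depths are stored as
 * log2 values. Queues with the biggest depth are set up first.
 */
void
setup_queues_biggest_first(const uint8_t *queue_log_depth, uint16_t nb_queues,
			   uint8_t log_max_depth,
			   void (*setup_queue)(uint16_t idx))
{
	uint8_t log;
	uint16_t i;

	/* Outer loop: walk the possible depths from the biggest down. */
	for (log = log_max_depth; log > 0; log--) {
		/* Inner loop: set up every queue with exactly this depth. */
		for (i = 0; i < nb_queues; i++) {
			if (queue_log_depth[i] != log)
				continue;
			setup_queue(i);
		}
	}
}

The point of this ordering is that the largest rings claim the front of the
consecutive umem, so the remaining memory is consumed in progressively
smaller pieces.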
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
drivers/common/mlx5/mlx5_devx_cmds.h | 10 ++++
drivers/net/mlx5/mlx5_devx.c | 26 ++++++++-
drivers/net/mlx5/mlx5_trigger.c | 82 +++++++++++++++-------------
3 files changed, 77 insertions(+), 41 deletions(-)
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index 6c726a0d46..f5fda02c1e 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -483,6 +483,11 @@ struct mlx5_devx_create_sq_attr {
uint32_t packet_pacing_rate_limit_index:16;
uint32_t tis_lst_sz:16;
uint32_t tis_num:24;
+ uint32_t q_off;
+ void *umem;
+ void *umem_obj;
+ uint32_t q_len;
+ uint32_t db_off;
struct mlx5_devx_wq_attr wq_attr;
};
@@ -514,6 +519,11 @@ struct mlx5_devx_cq_attr {
uint64_t db_umem_offset;
uint32_t eqn;
uint64_t db_addr;
+ void *umem;
+ void *umem_obj;
+ uint32_t q_off;
+ uint32_t q_len;
+ uint32_t db_off;
};
/* Virtq attributes structure, used by VIRTQ operations. */
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 3d49e096ef..0ee16ba4f0 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -1493,10 +1493,22 @@ mlx5_txq_create_devx_sq_resources(struct rte_eth_dev *dev, uint16_t idx,
mlx5_ts_format_conv(cdev->config.hca_attr.sq_ts_format),
.tis_num = mlx5_get_txq_tis_num(dev, idx),
};
+ uint32_t db_start = priv->consec_tx_mem.sq_total_size + priv->consec_tx_mem.cq_total_size;
+ int ret;
/* Create Send Queue object with DevX. */
- return mlx5_devx_sq_create(cdev->ctx, &txq_obj->sq_obj,
- log_desc_n, &sq_attr, priv->sh->numa_node);
+ if (priv->sh->config.txq_mem_algn) {
+ sq_attr.umem = priv->consec_tx_mem.umem;
+ sq_attr.umem_obj = priv->consec_tx_mem.umem_obj;
+ sq_attr.q_off = priv->consec_tx_mem.sq_cur_off;
+ sq_attr.db_off = db_start + (2 * idx) * MLX5_DBR_SIZE;
+ sq_attr.q_len = txq_data->sq_mem_len;
+ }
+ ret = mlx5_devx_sq_create(cdev->ctx, &txq_obj->sq_obj,
+ log_desc_n, &sq_attr, priv->sh->numa_node);
+ if (!ret && priv->sh->config.txq_mem_algn)
+ priv->consec_tx_mem.sq_cur_off += txq_data->sq_mem_len;
+ return ret;
}
#endif
@@ -1536,6 +1548,7 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
uint32_t cqe_n, log_desc_n;
uint32_t wqe_n, wqe_size;
int ret = 0;
+ uint32_t db_start = priv->consec_tx_mem.sq_total_size + priv->consec_tx_mem.cq_total_size;
MLX5_ASSERT(txq_data);
MLX5_ASSERT(txq_obj);
@@ -1557,6 +1570,13 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
rte_errno = EINVAL;
return 0;
}
+ if (priv->sh->config.txq_mem_algn) {
+ cq_attr.umem = priv->consec_tx_mem.umem;
+ cq_attr.umem_obj = priv->consec_tx_mem.umem_obj;
+ cq_attr.q_off = priv->consec_tx_mem.cq_cur_off;
+ cq_attr.db_off = db_start + (2 * idx + 1) * MLX5_DBR_SIZE;
+ cq_attr.q_len = txq_data->cq_mem_len;
+ }
/* Create completion queue object with DevX. */
ret = mlx5_devx_cq_create(sh->cdev->ctx, &txq_obj->cq_obj, log_desc_n,
&cq_attr, priv->sh->numa_node);
@@ -1641,6 +1661,8 @@ mlx5_txq_devx_obj_new(struct rte_eth_dev *dev, uint16_t idx)
#endif
txq_ctrl->uar_mmap_offset =
mlx5_os_get_devx_uar_mmap_offset(sh->tx_uar.obj);
+ if (priv->sh->config.txq_mem_algn)
+ priv->consec_tx_mem.cq_cur_off += txq_data->cq_mem_len;
ppriv->uar_table[txq_data->idx] = sh->tx_uar.bf_db;
dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
return 0;
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 00ffb39ecb..855d7518b9 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -51,52 +51,56 @@ static int
mlx5_txq_start(struct rte_eth_dev *dev)
{
struct mlx5_priv *priv = dev->data->dev_private;
- unsigned int i;
+ uint32_t log_max_wqe = log2above(mlx5_dev_get_max_wq_size(priv->sh));
+ uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
+ unsigned int i, cnt;
int ret;
- for (i = 0; i != priv->txqs_n; ++i) {
- struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
- struct mlx5_txq_data *txq_data = &txq_ctrl->txq;
- uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
+ for (cnt = log_max_wqe; cnt > 0; cnt -= 1) {
+ for (i = 0; i != priv->txqs_n; ++i) {
+ struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
+ struct mlx5_txq_data *txq_data = &txq_ctrl->txq;
- if (!txq_ctrl)
- continue;
- if (!txq_ctrl->is_hairpin)
- txq_alloc_elts(txq_ctrl);
- MLX5_ASSERT(!txq_ctrl->obj);
- txq_ctrl->obj = mlx5_malloc_numa_tolerant(flags, sizeof(struct mlx5_txq_obj),
- 0, txq_ctrl->socket);
- if (!txq_ctrl->obj) {
- DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
- "memory resources.", dev->data->port_id,
- txq_data->idx);
- rte_errno = ENOMEM;
- goto error;
- }
- ret = priv->obj_ops.txq_obj_new(dev, i);
- if (ret < 0) {
- mlx5_free(txq_ctrl->obj);
- txq_ctrl->obj = NULL;
- goto error;
- }
- if (!txq_ctrl->is_hairpin) {
- size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
-
- txq_data->fcqs = mlx5_malloc_numa_tolerant(flags, size,
- RTE_CACHE_LINE_SIZE,
- txq_ctrl->socket);
- if (!txq_data->fcqs) {
- DRV_LOG(ERR, "Port %u Tx queue %u cannot "
- "allocate memory (FCQ).",
- dev->data->port_id, i);
+ if (!txq_ctrl || txq_data->elts_n != cnt)
+ continue;
+ if (!txq_ctrl->is_hairpin)
+ txq_alloc_elts(txq_ctrl);
+ MLX5_ASSERT(!txq_ctrl->obj);
+ txq_ctrl->obj = mlx5_malloc_numa_tolerant(flags,
+ sizeof(struct mlx5_txq_obj),
+ 0, txq_ctrl->socket);
+ if (!txq_ctrl->obj) {
+ DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
+ "memory resources.", dev->data->port_id,
+ txq_data->idx);
rte_errno = ENOMEM;
goto error;
}
- }
- DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
- dev->data->port_id, i, (void *)&txq_ctrl->obj);
- LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
+ ret = priv->obj_ops.txq_obj_new(dev, i);
+ if (ret < 0) {
+ mlx5_free(txq_ctrl->obj);
+ txq_ctrl->obj = NULL;
+ goto error;
+ }
+ if (!txq_ctrl->is_hairpin) {
+ size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
+
+ txq_data->fcqs = mlx5_malloc_numa_tolerant(flags, size,
+ RTE_CACHE_LINE_SIZE,
+ txq_ctrl->socket);
+ if (!txq_data->fcqs) {
+ DRV_LOG(ERR, "Port %u Tx queue %u cannot "
+ "allocate memory (FCQ).",
+ dev->data->port_id, i);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ }
+ DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
+ dev->data->port_id, i, (void *)&txq_ctrl->obj);
+ LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
}
+}
return 0;
error:
ret = rte_errno; /* Save rte_errno before cleanup. */
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v4 5/5] net/mlx5: use consecutive memory for Tx queue creation
2025-06-29 17:07 ` [PATCH v4 0/5] Use consecutive Tx queues' memory Bing Zhao
` (3 preceding siblings ...)
2025-06-29 17:07 ` [PATCH v4 4/5] net/mlx5: pass the information in Tx queue start Bing Zhao
@ 2025-06-29 17:07 ` Bing Zhao
4 siblings, 0 replies; 22+ messages in thread
From: Bing Zhao @ 2025-06-29 17:07 UTC (permalink / raw)
To: viacheslavo, matan; +Cc: dev, thomas, dsosnowski, suanmingm, rasland
The queues' starting address offsets inside the umem and the doorbell
offsets are already passed to the DevX object creation functions.
When the queue length is not zero, it means that the memory was
pre-allocated and the new object creation with consecutive memory
should be enabled (see the sketch below).
When destroying the SQ / CQ objects, if they are in consecutive mode,
the umem and MR must not be released per queue; the global resources
are only released when stopping the device.
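The following is a minimal C sketch of the selection logic described
above; struct q_mem_attr, q_create() and q_destroy() are hypothetical
simplifications, not the mlx5 common code:

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

struct q_mem_attr {
	void *umem;	/* shared buffer, valid only when q_len != 0 */
	size_t q_off;	/* queue offset inside the shared buffer */
	size_t q_len;	/* expected queue length, 0 means legacy mode */
};

struct q_obj {
	void *buf;
	bool consec;	/* true when memory belongs to the shared umem */
};

int
q_create(struct q_obj *q, const struct q_mem_attr *attr, size_t umem_size)
{
	if (attr->q_len == 0) {
		/* Legacy path: each queue owns its buffer. */
		q->buf = calloc(1, umem_size);
		if (q->buf == NULL)
			return -1;
		q->consec = false;
	} else {
		/* Consecutive path: reuse the pre-allocated region. */
		if (umem_size != attr->q_len)
			return -1;	/* saved and calculated lengths must match */
		q->buf = (char *)attr->umem + attr->q_off;
		q->consec = true;
	}
	return 0;
}

void
q_destroy(struct q_obj *q)
{
	/* The shared memory is released once, when the device stops. */
	if (!q->consec)
		free(q->buf);
	q->buf = NULL;
}

In the driver the same roles are played by attr->q_len and the new
"consec" flag: a non-zero q_len selects the pre-allocated region, and the
flag tells the destroy path to leave the shared umem alone.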
Signed-off-by: Bing Zhao <bingz@nvidia.com>
---
drivers/common/mlx5/mlx5_common_devx.c | 160 +++++++++++++++++--------
drivers/common/mlx5/mlx5_common_devx.h | 2 +
2 files changed, 110 insertions(+), 52 deletions(-)
diff --git a/drivers/common/mlx5/mlx5_common_devx.c b/drivers/common/mlx5/mlx5_common_devx.c
index aace5283e7..e237558ec2 100644
--- a/drivers/common/mlx5/mlx5_common_devx.c
+++ b/drivers/common/mlx5/mlx5_common_devx.c
@@ -30,6 +30,8 @@ mlx5_devx_cq_destroy(struct mlx5_devx_cq *cq)
{
if (cq->cq)
claim_zero(mlx5_devx_cmd_destroy(cq->cq));
+ if (cq->consec)
+ return;
if (cq->umem_obj)
claim_zero(mlx5_os_umem_dereg(cq->umem_obj));
if (cq->umem_buf)
@@ -93,6 +95,7 @@ mlx5_devx_cq_create(void *ctx, struct mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
uint32_t eqn;
uint32_t num_of_cqes = RTE_BIT32(log_desc_n);
int ret;
+ uint32_t umem_offset, umem_id;
if (page_size == (size_t)-1 || alignment == (size_t)-1) {
DRV_LOG(ERR, "Failed to get page_size.");
@@ -108,29 +111,44 @@ mlx5_devx_cq_create(void *ctx, struct mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
}
/* Allocate memory buffer for CQEs and doorbell record. */
umem_size = sizeof(struct mlx5_cqe) * num_of_cqes;
- umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
- umem_size += MLX5_DBR_SIZE;
- umem_buf = mlx5_malloc_numa_tolerant(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
- alignment, socket);
- if (!umem_buf) {
- DRV_LOG(ERR, "Failed to allocate memory for CQ.");
- rte_errno = ENOMEM;
- return -rte_errno;
- }
- /* Register allocated buffer in user space with DevX. */
- umem_obj = mlx5_os_umem_reg(ctx, (void *)(uintptr_t)umem_buf, umem_size,
- IBV_ACCESS_LOCAL_WRITE);
- if (!umem_obj) {
- DRV_LOG(ERR, "Failed to register umem for CQ.");
- rte_errno = errno;
- goto error;
+ if (!attr->q_len) {
+ umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
+ umem_size += MLX5_DBR_SIZE;
+ umem_buf = mlx5_malloc_numa_tolerant(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
+ alignment, socket);
+ if (!umem_buf) {
+ DRV_LOG(ERR, "Failed to allocate memory for CQ.");
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ /* Register allocated buffer in user space with DevX. */
+ umem_obj = mlx5_os_umem_reg(ctx, (void *)(uintptr_t)umem_buf, umem_size,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (!umem_obj) {
+ DRV_LOG(ERR, "Failed to register umem for CQ.");
+ rte_errno = errno;
+ goto error;
+ }
+ umem_offset = 0;
+ umem_id = mlx5_os_get_umem_id(umem_obj);
+ } else {
+ if (umem_size != attr->q_len) {
+ DRV_LOG(ERR, "Mismatch between saved length and calc length of CQ %u-%u",
+ umem_size, attr->q_len);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ umem_buf = attr->umem;
+ umem_offset = attr->q_off;
+ umem_dbrec = attr->db_off;
+ umem_id = mlx5_os_get_umem_id(attr->umem_obj);
}
/* Fill attributes for CQ object creation. */
attr->q_umem_valid = 1;
- attr->q_umem_id = mlx5_os_get_umem_id(umem_obj);
- attr->q_umem_offset = 0;
+ attr->q_umem_id = umem_id;
+ attr->q_umem_offset = umem_offset;
attr->db_umem_valid = 1;
- attr->db_umem_id = attr->q_umem_id;
+ attr->db_umem_id = umem_id;
attr->db_umem_offset = umem_dbrec;
attr->eqn = eqn;
attr->log_cq_size = log_desc_n;
@@ -142,19 +160,29 @@ mlx5_devx_cq_create(void *ctx, struct mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
rte_errno = ENOMEM;
goto error;
}
- cq_obj->umem_buf = umem_buf;
- cq_obj->umem_obj = umem_obj;
+ if (!attr->q_len) {
+ cq_obj->umem_buf = umem_buf;
+ cq_obj->umem_obj = umem_obj;
+ cq_obj->db_rec = RTE_PTR_ADD(cq_obj->umem_buf, umem_dbrec);
+ cq_obj->consec = false;
+ } else {
+ cq_obj->umem_buf = RTE_PTR_ADD(umem_buf, umem_offset);
+ cq_obj->umem_obj = attr->umem_obj;
+ cq_obj->db_rec = RTE_PTR_ADD(umem_buf, umem_dbrec);
+ cq_obj->consec = true;
+ }
cq_obj->cq = cq;
- cq_obj->db_rec = RTE_PTR_ADD(cq_obj->umem_buf, umem_dbrec);
/* Mark all CQEs initially as invalid. */
mlx5_cq_init(cq_obj, num_of_cqes);
return 0;
error:
ret = rte_errno;
- if (umem_obj)
- claim_zero(mlx5_os_umem_dereg(umem_obj));
- if (umem_buf)
- mlx5_free((void *)(uintptr_t)umem_buf);
+ if (!attr->q_len) {
+ if (umem_obj)
+ claim_zero(mlx5_os_umem_dereg(umem_obj));
+ if (umem_buf)
+ mlx5_free((void *)(uintptr_t)umem_buf);
+ }
rte_errno = ret;
return -rte_errno;
}
@@ -171,6 +199,8 @@ mlx5_devx_sq_destroy(struct mlx5_devx_sq *sq)
{
if (sq->sq)
claim_zero(mlx5_devx_cmd_destroy(sq->sq));
+ if (sq->consec)
+ return;
if (sq->umem_obj)
claim_zero(mlx5_os_umem_dereg(sq->umem_obj));
if (sq->umem_buf)
@@ -220,6 +250,7 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq *sq_obj, uint16_t log_wqbb_n,
uint32_t umem_size, umem_dbrec;
uint32_t num_of_wqbbs = RTE_BIT32(log_wqbb_n);
int ret;
+ uint32_t umem_offset, umem_id;
if (alignment == (size_t)-1) {
DRV_LOG(ERR, "Failed to get WQE buf alignment.");
@@ -228,30 +259,45 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq *sq_obj, uint16_t log_wqbb_n,
}
/* Allocate memory buffer for WQEs and doorbell record. */
umem_size = MLX5_WQE_SIZE * num_of_wqbbs;
- umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
- umem_size += MLX5_DBR_SIZE;
- umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
- alignment, socket);
- if (!umem_buf) {
- DRV_LOG(ERR, "Failed to allocate memory for SQ.");
- rte_errno = ENOMEM;
- return -rte_errno;
- }
- /* Register allocated buffer in user space with DevX. */
- umem_obj = mlx5_os_umem_reg(ctx, (void *)(uintptr_t)umem_buf, umem_size,
- IBV_ACCESS_LOCAL_WRITE);
- if (!umem_obj) {
- DRV_LOG(ERR, "Failed to register umem for SQ.");
- rte_errno = errno;
- goto error;
+ if (!attr->q_len) {
+ umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
+ umem_size += MLX5_DBR_SIZE;
+ umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
+ alignment, socket);
+ if (!umem_buf) {
+ DRV_LOG(ERR, "Failed to allocate memory for SQ.");
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ /* Register allocated buffer in user space with DevX. */
+ umem_obj = mlx5_os_umem_reg(ctx, (void *)(uintptr_t)umem_buf, umem_size,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (!umem_obj) {
+ DRV_LOG(ERR, "Failed to register umem for SQ.");
+ rte_errno = errno;
+ goto error;
+ }
+ umem_offset = 0;
+ umem_id = mlx5_os_get_umem_id(umem_obj);
+ } else {
+ if (umem_size != attr->q_len) {
+ DRV_LOG(ERR, "Mismatch between saved length and calc length of WQ %u-%u",
+ umem_size, attr->q_len);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ umem_buf = attr->umem;
+ umem_offset = attr->q_off;
+ umem_dbrec = attr->db_off;
+ umem_id = mlx5_os_get_umem_id(attr->umem_obj);
}
/* Fill attributes for SQ object creation. */
attr->wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
attr->wq_attr.wq_umem_valid = 1;
- attr->wq_attr.wq_umem_id = mlx5_os_get_umem_id(umem_obj);
- attr->wq_attr.wq_umem_offset = 0;
+ attr->wq_attr.wq_umem_id = umem_id;
+ attr->wq_attr.wq_umem_offset = umem_offset;
attr->wq_attr.dbr_umem_valid = 1;
- attr->wq_attr.dbr_umem_id = attr->wq_attr.wq_umem_id;
+ attr->wq_attr.dbr_umem_id = umem_id;
attr->wq_attr.dbr_addr = umem_dbrec;
attr->wq_attr.log_wq_stride = rte_log2_u32(MLX5_WQE_SIZE);
attr->wq_attr.log_wq_sz = log_wqbb_n;
@@ -263,17 +309,27 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq *sq_obj, uint16_t log_wqbb_n,
rte_errno = ENOMEM;
goto error;
}
- sq_obj->umem_buf = umem_buf;
- sq_obj->umem_obj = umem_obj;
+ if (!attr->q_len) {
+ sq_obj->umem_buf = umem_buf;
+ sq_obj->umem_obj = umem_obj;
+ sq_obj->db_rec = RTE_PTR_ADD(sq_obj->umem_buf, umem_dbrec);
+ sq_obj->consec = false;
+ } else {
+ sq_obj->umem_buf = RTE_PTR_ADD(umem_buf, attr->q_off);
+ sq_obj->umem_obj = attr->umem_obj;
+ sq_obj->db_rec = RTE_PTR_ADD(umem_buf, attr->db_off);
+ sq_obj->consec = true;
+ }
sq_obj->sq = sq;
- sq_obj->db_rec = RTE_PTR_ADD(sq_obj->umem_buf, umem_dbrec);
return 0;
error:
ret = rte_errno;
- if (umem_obj)
- claim_zero(mlx5_os_umem_dereg(umem_obj));
- if (umem_buf)
- mlx5_free((void *)(uintptr_t)umem_buf);
+ if (!attr->q_len) {
+ if (umem_obj)
+ claim_zero(mlx5_os_umem_dereg(umem_obj));
+ if (umem_buf)
+ mlx5_free((void *)(uintptr_t)umem_buf);
+ }
rte_errno = ret;
return -rte_errno;
}
diff --git a/drivers/common/mlx5/mlx5_common_devx.h b/drivers/common/mlx5/mlx5_common_devx.h
index 743f06042c..4cb9111dbb 100644
--- a/drivers/common/mlx5/mlx5_common_devx.h
+++ b/drivers/common/mlx5/mlx5_common_devx.h
@@ -21,6 +21,7 @@ struct mlx5_devx_cq {
volatile struct mlx5_cqe *cqes; /* The CQ ring buffer. */
};
volatile uint32_t *db_rec; /* The CQ doorbell record. */
+ bool consec; /* Using consecutive memory. */
};
/* DevX Send Queue structure. */
@@ -33,6 +34,7 @@ struct mlx5_devx_sq {
volatile struct mlx5_aso_wqe *aso_wqes;
};
volatile uint32_t *db_rec; /* The SQ doorbell record. */
+ bool consec; /* Using consecutive memory. */
};
/* DevX Queue Pair structure. */
--
2.34.1
^ permalink raw reply [flat|nested] 22+ messages in thread