From: Rongwei Liu <rongweil@nvidia.com>
To: <matan@nvidia.com>, <viacheslavo@nvidia.com>, <orika@nvidia.com>,
<thomas@monjalon.net>
Cc: <dev@dpdk.org>, <rasland@nvidia.com>
Subject: [dpdk-dev] [PATCH v1 2/2] net/mlx5: set txq affinity in round-robin
Date: Thu, 21 Oct 2021 11:56:36 +0300 [thread overview]
Message-ID: <20211021085637.3627922-3-rongweil@nvidia.com> (raw)
In-Reply-To: <20211021085637.3627922-1-rongweil@nvidia.com>
Previously, we set txq affinity to 0 and let firmware
perform round-robin when bonding. Firmware uses a
global counter to assign txq affinity to different
physical ports according to the remainder after division.
There are three disadvantages:
1. The global counter is shared between kernel and dpdk.
2. After restarting pmd or port, the previous counter value
is reused, so the new affinity is unpredictable.
3. There is no way to get what affinity is set by firmware.
In this update, we create several TISs, up to the
number of bonding ports, and bind each TIS to one PF port.
Each port starts picking TISs at an offset given by its port
index. Upper layer applications can quickly calculate each txq's
affinity without querying.
At DPDK layer, when creating txq with 2 bonding ports, the
affinity is set like:
port 0: 1-->2-->1-->2
port 1: 2-->1-->2-->1
port 2: 1-->2-->1-->2
Note: Only applicable to the DevX API.
This affinity is subject to HW hash.
Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
doc/guides/nics/mlx5.rst | 4 ++
drivers/net/mlx5/linux/mlx5_os.c | 2 +-
drivers/net/mlx5/mlx5.c | 81 ++++++++++++++++++++++++++++----
drivers/net/mlx5/mlx5.h | 10 +++-
drivers/net/mlx5/mlx5_devx.c | 37 ++++++++++++++-
drivers/net/mlx5/mlx5_txpp.c | 4 +-
6 files changed, 124 insertions(+), 14 deletions(-)
diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 7b540504f9..dd059b227d 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -464,6 +464,10 @@ Limitations
- In order to achieve best insertion rate, application should manage the flows per lcore.
- Better to disable memory reclaim by setting ``reclaim_mem_mode`` to 0 to accelerate the flow object allocation and release with cache.
+- HW hashed bonding
+
+ - TXQ affinity is subject to HW hash once enabled.
+
Statistics
----------
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 8a25ec8730..7356c91c92 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -878,7 +878,6 @@ mlx5_representor_match(struct mlx5_dev_spawn_data *spawn,
return false;
}
-
/**
* Spawn an Ethernet device from Verbs information.
*
@@ -1668,6 +1667,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
*/
MLX5_ASSERT(spawn->ifindex);
priv->if_index = spawn->ifindex;
+ priv->lag_affinity_idx = sh->refcnt - 1;
eth_dev->data->dev_private = priv;
priv->dev_data = eth_dev->data;
eth_dev->data->mac_addrs = priv->mac;
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index c712fc3465..ae54b18ad5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1256,6 +1256,68 @@ mlx5_dev_ctx_shared_mempool_subscribe(struct rte_eth_dev *dev)
return 0;
}
+/**
+ * Set up the TIS objects of a shared device context.
+ *
+ * For a bonding device whose queried LAG port selection mode is
+ * MLX5_LAG_MODE_TIS, one TIS is created per bonding port, each with its
+ * own LAG Tx port affinity. In every other case a single TIS with
+ * affinity 0 is created in sh->tis[0].
+ *
+ * On failure, TISs that were already created are not destroyed here and
+ * remain in sh->tis[].
+ *
+ * @param sh
+ *   Pointer to the shared device context.
+ *
+ * @return
+ *   Zero on success, -1 otherwise.
+ */
+static int
+mlx5_setup_tis(struct mlx5_dev_ctx_shared *sh)
+{
+	int i;
+	struct mlx5_devx_lag_context lag_ctx = { 0 };
+	struct mlx5_devx_tis_attr tis_attr = { 0 };
+
+	tis_attr.transport_domain = sh->td->id;
+	if (sh->bond.n_port) {
+		if (mlx5_devx_cmd_query_lag(sh->ctx, &lag_ctx)) {
+			DRV_LOG(ERR, "Failed to query lag affinity.");
+			return -1;
+		}
+		/* Cache the queried LAG attributes in the shared context. */
+		sh->lag.tx_remap_affinity[0] = lag_ctx.tx_remap_affinity_1;
+		sh->lag.tx_remap_affinity[1] = lag_ctx.tx_remap_affinity_2;
+		sh->lag.affinity_mode = lag_ctx.port_select_mode;
+		if (sh->lag.affinity_mode == MLX5_LAG_MODE_TIS) {
+			/* One TIS per bonding port, each with its own affinity. */
+			for (i = 0; i < sh->bond.n_port; i++) {
+				tis_attr.lag_tx_port_affinity =
+					MLX5_IFC_LAG_MAP_TIS_AFFINITY(i,
+							sh->bond.n_port);
+				sh->tis[i] = mlx5_devx_cmd_create_tis(sh->ctx,
+						&tis_attr);
+				if (!sh->tis[i]) {
+					DRV_LOG(ERR, "Failed to create TIS %d/%d for bonding device"
+						" %s.", i, sh->bond.n_port,
+						sh->ibdev_name);
+					return -1;
+				}
+			}
+			DRV_LOG(DEBUG, "LAG number of ports : %d, affinity_1 & 2 : pf%d & %d.",
+				sh->bond.n_port, lag_ctx.tx_remap_affinity_1,
+				lag_ctx.tx_remap_affinity_2);
+			return 0;
+		}
+		if (sh->lag.affinity_mode == MLX5_LAG_MODE_HASH)
+			DRV_LOG(INFO, "Device %s enabled HW hash based LAG.",
+				sh->ibdev_name);
+	}
+	/* Non-bonding device or non-TIS LAG mode: a single TIS, affinity 0. */
+	tis_attr.lag_tx_port_affinity = 0;
+	sh->tis[0] = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
+	if (!sh->tis[0]) {
+		DRV_LOG(ERR, "Failed to create TIS 0 for device"
+			" %s.", sh->ibdev_name);
+		return -1;
+	}
+	return 0;
+}
+
/**
* Allocate shared device context. If there is multiport device the
* master and representors will share this context, if there is single
@@ -1283,7 +1345,6 @@ mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
struct mlx5_dev_ctx_shared *sh;
int err = 0;
uint32_t i;
- struct mlx5_devx_tis_attr tis_attr = { 0 };
MLX5_ASSERT(spawn);
/* Secondary process should not create the shared context. */
@@ -1354,9 +1415,7 @@ mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
err = ENOMEM;
goto error;
}
- tis_attr.transport_domain = sh->td->id;
- sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
- if (!sh->tis) {
+ if (mlx5_setup_tis(sh)) {
DRV_LOG(ERR, "TIS allocation failure");
err = ENOMEM;
goto error;
@@ -1420,10 +1479,13 @@ mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
MLX5_ASSERT(sh);
if (sh->share_cache.cache.table)
mlx5_mr_btree_free(&sh->share_cache.cache);
- if (sh->tis)
- claim_zero(mlx5_devx_cmd_destroy(sh->tis));
if (sh->td)
claim_zero(mlx5_devx_cmd_destroy(sh->td));
+ i = 0;
+ do {
+ if (sh->tis[i])
+ claim_zero(mlx5_devx_cmd_destroy(sh->tis[i]));
+ } while (++i < (uint32_t)sh->bond.n_port);
if (sh->devx_rx_uar)
mlx5_glue->devx_free_uar(sh->devx_rx_uar);
if (sh->tx_uar)
@@ -1449,6 +1511,7 @@ void
mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
{
int ret;
+ int i = 0;
pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
#ifdef RTE_LIBRTE_MLX5_DEBUG
@@ -1510,8 +1573,10 @@ mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
}
if (sh->pd)
claim_zero(mlx5_os_dealloc_pd(sh->pd));
- if (sh->tis)
- claim_zero(mlx5_devx_cmd_destroy(sh->tis));
+ do {
+ if (sh->tis[i])
+ claim_zero(mlx5_devx_cmd_destroy(sh->tis[i]));
+ } while (++i < sh->bond.n_port);
if (sh->td)
claim_zero(mlx5_devx_cmd_destroy(sh->td));
if (sh->devx_rx_uar)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index adab9dc052..dc385a8cbb 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1120,6 +1120,12 @@ struct mlx5_aso_ct_pools_mng {
struct mlx5_aso_sq aso_sq; /* ASO queue objects. */
};
+/*
+ * LAG attributes kept in the shared device context.
+ * Filled from the queried DevX LAG context in mlx5_setup_tis().
+ */
+struct mlx5_lag {
+ uint8_t tx_remap_affinity[16]; /* The PF port number of each affinity; mlx5_setup_tis() fills entries 0 and 1. */
+ uint8_t affinity_mode; /* Port selection mode: TIS or hash based affinity (MLX5_LAG_MODE_TIS / MLX5_LAG_MODE_HASH). */
+};
+
/*
* Shared Infiniband device context for Master/Representors
* which belong to same IB device with multiple IB ports.
@@ -1187,8 +1193,9 @@ struct mlx5_dev_ctx_shared {
struct rte_intr_handle intr_handle; /* Interrupt handler for device. */
struct rte_intr_handle intr_handle_devx; /* DEVX interrupt handler. */
void *devx_comp; /* DEVX async comp obj. */
- struct mlx5_devx_obj *tis; /* TIS object. */
+ struct mlx5_devx_obj *tis[16]; /* TIS object. */
struct mlx5_devx_obj *td; /* Transport domain. */
+ struct mlx5_lag lag; /* LAG attributes */
void *tx_uar; /* Tx/packet pacing shared UAR. */
struct mlx5_flex_parser_profiles fp[MLX5_FLEX_PARSER_MAX];
/* Flex parser profiles information. */
@@ -1454,6 +1461,7 @@ struct mlx5_priv {
uint32_t rss_shared_actions; /* RSS shared actions. */
struct mlx5_devx_obj *q_counters; /* DevX queue counter object. */
uint32_t counter_set_id; /* Queue counter ID to set in DevX objects. */
+ uint32_t lag_affinity_idx; /* LAG mode queue 0 affinity starting. */
};
#define PORT_ID(priv) ((priv)->dev_data->port_id)
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index a49602cb95..a24b1b897d 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -888,6 +888,37 @@ mlx5_devx_drop_action_destroy(struct rte_eth_dev *dev)
rte_errno = ENOTSUP;
}
+/**
+ * Select the TIS number for a Tx queue.
+ *
+ * For a bonding device in MLX5_LAG_MODE_TIS mode the TIS is picked
+ * round-robin: (priv->lag_affinity_idx + queue_idx) modulo the number of
+ * bonding ports. In every other case the TIS at index 0 is used.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param queue_idx
+ *   Queue index in DPDK Tx queue array.
+ *
+ * @return
+ *   ID of the selected TIS object. This function has no error return.
+ */
+static uint32_t
+mlx5_get_txq_tis_num(struct rte_eth_dev *dev, uint16_t queue_idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	int tis_idx = 0; /* Default TIS when per-port affinity is not used. */
+
+	if (sh->bond.n_port && sh->lag.affinity_mode == MLX5_LAG_MODE_TIS) {
+		tis_idx = (priv->lag_affinity_idx + queue_idx) %
+			sh->bond.n_port;
+		/* The logged affinity is 1-based, the TIS array index is 0-based. */
+		DRV_LOG(INFO, "port %d txq %d gets affinity %d and maps to PF %d.",
+			dev->data->port_id, queue_idx, tis_idx + 1,
+			sh->lag.tx_remap_affinity[tis_idx]);
+	}
+	MLX5_ASSERT(sh->tis[tis_idx]);
+	return sh->tis[tis_idx]->id;
+}
+
/**
* Create the Tx hairpin queue object.
*
@@ -935,7 +966,8 @@ mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
attr.wq_attr.log_hairpin_num_packets =
attr.wq_attr.log_hairpin_data_sz -
MLX5_HAIRPIN_QUEUE_STRIDE;
- attr.tis_num = priv->sh->tis->id;
+
+ attr.tis_num = mlx5_get_txq_tis_num(dev, idx);
tmpl->sq = mlx5_devx_cmd_create_sq(priv->sh->ctx, &attr);
if (!tmpl->sq) {
DRV_LOG(ERR,
@@ -992,14 +1024,15 @@ mlx5_txq_create_devx_sq_resources(struct rte_eth_dev *dev, uint16_t idx,
.allow_swp = !!priv->config.swp,
.cqn = txq_obj->cq_obj.cq->id,
.tis_lst_sz = 1,
- .tis_num = priv->sh->tis->id,
.wq_attr = (struct mlx5_devx_wq_attr){
.pd = priv->sh->pdn,
.uar_page =
mlx5_os_get_devx_uar_page_id(priv->sh->tx_uar),
},
.ts_format = mlx5_ts_format_conv(priv->sh->sq_ts_format),
+ .tis_num = mlx5_get_txq_tis_num(dev, idx),
};
+
/* Create Send Queue object with DevX. */
return mlx5_devx_sq_create(priv->sh->ctx, &txq_obj->sq_obj, log_desc_n,
&sq_attr, priv->sh->numa_node);
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 2be7e71f89..6e874fa090 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -230,7 +230,7 @@ mlx5_txpp_create_rearm_queue(struct mlx5_dev_ctx_shared *sh)
.cd_master = 1,
.state = MLX5_SQC_STATE_RST,
.tis_lst_sz = 1,
- .tis_num = sh->tis->id,
+ .tis_num = sh->tis[0]->id,
.wq_attr = (struct mlx5_devx_wq_attr){
.pd = sh->pdn,
.uar_page = mlx5_os_get_devx_uar_page_id(sh->tx_uar),
@@ -433,7 +433,7 @@ mlx5_txpp_create_clock_queue(struct mlx5_dev_ctx_shared *sh)
/* Create send queue object for Clock Queue. */
if (sh->txpp.test) {
sq_attr.tis_lst_sz = 1;
- sq_attr.tis_num = sh->tis->id;
+ sq_attr.tis_num = sh->tis[0]->id;
sq_attr.non_wire = 0;
sq_attr.static_sq_wq = 1;
} else {
--
2.27.0
next prev parent reply other threads:[~2021-10-21 8:57 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-10-21 8:56 [dpdk-dev] [PATCH v1 0/2] " Rongwei Liu
2021-10-21 8:56 ` [dpdk-dev] [PATCH v1 1/2] common/mlx5: support lag context query Rongwei Liu
2021-10-21 8:56 ` Rongwei Liu [this message]
2021-10-21 12:25 ` [dpdk-dev] [PATCH v1 0/2] set txq affinity in round-robin Raslan Darawsheh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211021085637.3627922-3-rongweil@nvidia.com \
--to=rongweil@nvidia.com \
--cc=dev@dpdk.org \
--cc=matan@nvidia.com \
--cc=orika@nvidia.com \
--cc=rasland@nvidia.com \
--cc=thomas@monjalon.net \
--cc=viacheslavo@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).