DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] net/mlx5: support bus socket with no hugepages
@ 2023-06-21  7:30 Hari Sasank
  2023-10-30 10:43 ` Slava Ovsiienko
  0 siblings, 1 reply; 4+ messages in thread
From: Hari Sasank @ 2023-06-21  7:30 UTC (permalink / raw)
  To: Matan Azrad, Viacheslav Ovsiienko, Ori Kam, Suanming Mou; +Cc: dev, Hari Sasank

When a Mellanox NIC is attached to a bus on a NUMA
socket, it tries to allocate rte memory on that socket.
If hugepages are not configured/available on that rte socket,
mlx5_common_pci_probe fails with ENOMEM.

In this patch, a memflag MLX5_MEM_FALLBACK_ANY_SOCKET is
introduced which when set on mlx5_malloc, will allocate
the memory using SOCKET_ID_ANY if it is not able to allocate
memory on the specified socket. This allocates memory on
any socket starting with the current thread's socket.

Signed-off-by: Hari Sasank <harisasank@outlook.com>
---
 drivers/common/mlx5/mlx5_common_devx.c | 9 ++++++---
 drivers/common/mlx5/mlx5_common_mr.c   | 5 +++--
 drivers/common/mlx5/mlx5_malloc.c      | 7 +++++++
 drivers/common/mlx5/mlx5_malloc.h      | 4 ++++
 drivers/net/mlx5/mlx5.c                | 3 ++-
 drivers/net/mlx5/mlx5_devx.c           | 3 ++-
 drivers/net/mlx5/mlx5_rxq.c            | 3 ++-
 drivers/net/mlx5/mlx5_trigger.c        | 6 ++++--
 drivers/net/mlx5/mlx5_txpp.c           | 3 ++-
 drivers/net/mlx5/mlx5_txq.c            | 3 ++-
 10 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_common_devx.c b/drivers/common/mlx5/mlx5_common_devx.c
index 431d8361cebd..122f1c65eab6 100644
--- a/drivers/common/mlx5/mlx5_common_devx.c
+++ b/drivers/common/mlx5/mlx5_common_devx.c
@@ -107,7 +107,8 @@ mlx5_devx_cq_create(void *ctx, struct mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
 	umem_size = sizeof(struct mlx5_cqe) * num_of_cqes;
 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
 	umem_size += MLX5_DBR_SIZE;
-	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
+	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+			       MLX5_MEM_FALLBACK_ANY_SOCKET, umem_size,
 			       alignment, socket);
 	if (!umem_buf) {
 		DRV_LOG(ERR, "Failed to allocate memory for CQ.");
@@ -225,7 +226,8 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq *sq_obj, uint16_t log_wqbb_n,
 	umem_size = MLX5_WQE_SIZE * num_of_wqbbs;
 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
 	umem_size += MLX5_DBR_SIZE;
-	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
+	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+			       MLX5_MEM_FALLBACK_ANY_SOCKET, umem_size,
 			       alignment, socket);
 	if (!umem_buf) {
 		DRV_LOG(ERR, "Failed to allocate memory for SQ.");
@@ -476,7 +478,8 @@ mlx5_devx_wq_init(void *ctx, uint32_t wqe_size, uint16_t log_wqbb_n, int socket,
 	umem_size = wqe_size * (1 << log_wqbb_n);
 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
 	umem_size += MLX5_DBR_SIZE;
-	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
+	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+			       MLX5_MEM_FALLBACK_ANY_SOCKET, umem_size,
 			       alignment, socket);
 	if (!umem_buf) {
 		DRV_LOG(ERR, "Failed to allocate memory for RQ.");
diff --git a/drivers/common/mlx5/mlx5_common_mr.c b/drivers/common/mlx5/mlx5_common_mr.c
index 7b14b0c7bf1e..b2ad6a249732 100644
--- a/drivers/common/mlx5/mlx5_common_mr.c
+++ b/drivers/common/mlx5/mlx5_common_mr.c
@@ -223,7 +223,8 @@ mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket)
 	}
 	MLX5_ASSERT(!bt->table && !bt->size);
 	memset(bt, 0, sizeof(*bt));
-	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
+	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+				MLX5_MEM_FALLBACK_ANY_SOCKET,
 				sizeof(struct mr_cache_entry) * n,
 				0, socket);
 	if (bt->table == NULL) {
@@ -767,7 +768,7 @@ mlx5_mr_create_primary(void *pd,
 	      (void *)addr, data.start, data.end, msl->page_sz, ms_n);
 	/* Size of memory for bitmap. */
 	bmp_size = rte_bitmap_get_memory_footprint(ms_n);
-	mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO,
+	mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO | MLX5_MEM_FALLBACK_ANY_SOCKET,
 			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) +
 			 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
 	if (mr == NULL) {
diff --git a/drivers/common/mlx5/mlx5_malloc.c b/drivers/common/mlx5/mlx5_malloc.c
index c58c41da9266..e109f1bfa994 100644
--- a/drivers/common/mlx5/mlx5_malloc.c
+++ b/drivers/common/mlx5/mlx5_malloc.c
@@ -182,6 +182,13 @@ mlx5_malloc(uint32_t flags, size_t size, unsigned int align, int socket)
 			addr = rte_zmalloc_socket(NULL, size, align, socket);
 		else
 			addr = rte_malloc_socket(NULL, size, align, socket);
+		if (!addr && socket != SOCKET_ID_ANY &&
+		    (flags & MLX5_MEM_FALLBACK_ANY_SOCKET)) {
+			if (flags & MLX5_MEM_ZERO)
+				addr = rte_zmalloc_socket(NULL, size, align, SOCKET_ID_ANY);
+			else
+				addr = rte_malloc_socket(NULL, size, align, SOCKET_ID_ANY);
+		}
 		mlx5_mem_update_msl(addr);
 #ifdef RTE_LIBRTE_MLX5_DEBUG
 		if (addr)
diff --git a/drivers/common/mlx5/mlx5_malloc.h b/drivers/common/mlx5/mlx5_malloc.h
index 9086a4f3f22e..cd57f95a629e 100644
--- a/drivers/common/mlx5/mlx5_malloc.h
+++ b/drivers/common/mlx5/mlx5_malloc.h
@@ -28,6 +28,10 @@ enum mlx5_mem_flags {
 	/* Memory should be allocated from rte hugepage. */
 	MLX5_MEM_ZERO = 1 << 2,
 	/* Memory should be cleared to zero. */
+	MLX5_MEM_FALLBACK_ANY_SOCKET = 1 << 3,
+	/* Memory can be allocated on any socket if
+	 * it fails to allocate on the given socket.
+	 */
 };
 
 /**
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index f9aea1318736..5b520d468299 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -2063,7 +2063,8 @@ mlx5_proc_priv_init(struct rte_eth_dev *dev)
 	 */
 	ppriv_size = sizeof(struct mlx5_proc_priv) +
 		     priv->txqs_n * sizeof(struct mlx5_uar_data);
-	ppriv = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, ppriv_size,
+	ppriv = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+			    MLX5_MEM_FALLBACK_ANY_SOCKET, ppriv_size,
 			    RTE_CACHE_LINE_SIZE, dev->device->numa_node);
 	if (!ppriv) {
 		rte_errno = ENOMEM;
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 4369d2557e9e..47a925e5913a 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -1285,7 +1285,8 @@ mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
 			RTE_BIT32(host_mem_attr.wq_attr.log_hairpin_num_packets);
 		umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
 		umem_size += MLX5_DBR_SIZE;
-		umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, umem_size,
+		umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+				       MLX5_MEM_FALLBACK_ANY_SOCKET, umem_size,
 				       alignment, priv->sh->numa_node);
 		if (umem_buf == NULL && txq_ctrl->hairpin_conf.force_memory) {
 			DRV_LOG(ERR, "Failed to allocate memory for hairpin TX queue");
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index ad8fd13cbe8e..6bdf1678e499 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1728,7 +1728,8 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		desc >>= mprq_log_actual_stride_num;
 		alloc_size += desc * sizeof(struct mlx5_mprq_buf *);
 	}
-	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, alloc_size, 0, socket);
+	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+			   MLX5_MEM_FALLBACK_ANY_SOCKET, alloc_size, 0, socket);
 	if (!tmpl) {
 		rte_errno = ENOMEM;
 		return NULL;
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index bbaa7d2aa021..1a6abdf2b61d 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -55,7 +55,8 @@ mlx5_txq_start(struct rte_eth_dev *dev)
 	for (i = 0; i != priv->txqs_n; ++i) {
 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
 		struct mlx5_txq_data *txq_data = &txq_ctrl->txq;
-		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
+		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO |
+				 MLX5_MEM_FALLBACK_ANY_SOCKET;
 
 		if (!txq_ctrl)
 			continue;
@@ -180,7 +181,8 @@ mlx5_rxq_ctrl_prepare(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
 			return ret;
 	}
 	MLX5_ASSERT(!rxq_ctrl->obj);
-	rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
+	rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+				    MLX5_MEM_FALLBACK_ANY_SOCKET,
 				    sizeof(*rxq_ctrl->obj), 0,
 				    rxq_ctrl->socket);
 	if (!rxq_ctrl->obj) {
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 5a5df2d1bb16..c81ae7bfd328 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -394,7 +394,8 @@ mlx5_txpp_create_clock_queue(struct mlx5_dev_ctx_shared *sh)
 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
 	int ret;
 
-	sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
+	sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+				   MLX5_MEM_FALLBACK_ANY_SOCKET,
 				   MLX5_TXPP_REARM_SQ_SIZE *
 				   sizeof(struct mlx5_txpp_ts),
 				   0, sh->numa_node);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 8cb52b0f7d8e..b83e798544d2 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -1074,7 +1074,8 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_txq_ctrl *tmpl;
 
-	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
+	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
+			   MLX5_MEM_FALLBACK_ANY_SOCKET, sizeof(*tmpl) +
 			   desc * sizeof(struct rte_mbuf *), 0, socket);
 	if (!tmpl) {
 		rte_errno = ENOMEM;
-- 
2.39.2


^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: [PATCH] net/mlx5: support bus socket with no hugepages
  2023-06-21  7:30 [PATCH] net/mlx5: support bus socket with no hugepages Hari Sasank
@ 2023-10-30 10:43 ` Slava Ovsiienko
  2023-10-30 11:39   ` Hari Sasank
  0 siblings, 1 reply; 4+ messages in thread
From: Slava Ovsiienko @ 2023-10-30 10:43 UTC (permalink / raw)
  To: Hari Sasank, Matan Azrad, Ori Kam, Suanming Mou; +Cc: dev

Hi, Hari

As far as I can see, almost all of the updates using the newly introduced MLX5_MEM_FALLBACK_ANY_SOCKET flag
are related to memory mapped for hardware usage (the NIC accesses these areas with DMA
over the PCIe bus segment the NIC is attached to). It means the memory's affinity to the specific
socket is strong and might be critical for performance.

In general, the mlx5 PMD is designed to request memory on a specific socket
if, and only if, it is really needed.

Generally speaking, missing huge pages on the socket to which the NIC is attached
should be considered a misconfiguration.  Could you please elaborate a little bit more on what the use case
for this scenario is?

With best regards,
Slava

> -----Original Message-----
> From: Hari Sasank <harisasank@outlook.com>
> Sent: Wednesday, June 21, 2023 10:31 AM
> To: Matan Azrad <matan@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Ori Kam <orika@nvidia.com>; Suanming Mou
> <suanmingm@nvidia.com>
> Cc: dev@dpdk.org; Hari Sasank <harisasank@outlook.com>
> Subject: [PATCH] net/mlx5: support bus socket with no hugepages
> 
> When a Mellanox NIC is attached to a bus on a numa socket, it tries to
> allocate rte memory in that socket.
> If hugepages are not configured/available on that rte socket
> mlx5_common_pci_probe fails with ENOMEM.
> 
> In this patch, a memflag MLX5_MEM_FALLBACK_ANY_SOCKET is introduced
> which when set on mlx5_malloc, will allocate the memory using
> SOCKET_ID_ANY if it is not able to allocate memory on the specified socket.
> This allocates memory on any socket starting with the current thread's
> socket.
> 
> Signed-off-by: Hari Sasank <harisasank@outlook.com>
> ---
>  drivers/common/mlx5/mlx5_common_devx.c | 9 ++++++---
>  drivers/common/mlx5/mlx5_common_mr.c   | 5 +++--
>  drivers/common/mlx5/mlx5_malloc.c      | 7 +++++++
>  drivers/common/mlx5/mlx5_malloc.h      | 4 ++++
>  drivers/net/mlx5/mlx5.c                | 3 ++-
>  drivers/net/mlx5/mlx5_devx.c           | 3 ++-
>  drivers/net/mlx5/mlx5_rxq.c            | 3 ++-
>  drivers/net/mlx5/mlx5_trigger.c        | 6 ++++--
>  drivers/net/mlx5/mlx5_txpp.c           | 3 ++-
>  drivers/net/mlx5/mlx5_txq.c            | 3 ++-
>  10 files changed, 34 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/common/mlx5/mlx5_common_devx.c
> b/drivers/common/mlx5/mlx5_common_devx.c
> index 431d8361cebd..122f1c65eab6 100644
> --- a/drivers/common/mlx5/mlx5_common_devx.c
> +++ b/drivers/common/mlx5/mlx5_common_devx.c
> @@ -107,7 +107,8 @@ mlx5_devx_cq_create(void *ctx, struct
> mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
>  	umem_size = sizeof(struct mlx5_cqe) * num_of_cqes;
>  	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>  	umem_size += MLX5_DBR_SIZE;
> -	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +			       MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>  			       alignment, socket);
>  	if (!umem_buf) {
>  		DRV_LOG(ERR, "Failed to allocate memory for CQ."); @@ -
> 225,7 +226,8 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq
> *sq_obj, uint16_t log_wqbb_n,
>  	umem_size = MLX5_WQE_SIZE * num_of_wqbbs;
>  	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>  	umem_size += MLX5_DBR_SIZE;
> -	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +			       MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>  			       alignment, socket);
>  	if (!umem_buf) {
>  		DRV_LOG(ERR, "Failed to allocate memory for SQ."); @@ -
> 476,7 +478,8 @@ mlx5_devx_wq_init(void *ctx, uint32_t wqe_size, uint16_t
> log_wqbb_n, int socket,
>  	umem_size = wqe_size * (1 << log_wqbb_n);
>  	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>  	umem_size += MLX5_DBR_SIZE;
> -	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +	umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +			       MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>  			       alignment, socket);
>  	if (!umem_buf) {
>  		DRV_LOG(ERR, "Failed to allocate memory for RQ."); diff --
> git a/drivers/common/mlx5/mlx5_common_mr.c
> b/drivers/common/mlx5/mlx5_common_mr.c
> index 7b14b0c7bf1e..b2ad6a249732 100644
> --- a/drivers/common/mlx5/mlx5_common_mr.c
> +++ b/drivers/common/mlx5/mlx5_common_mr.c
> @@ -223,7 +223,8 @@ mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n,
> int socket)
>  	}
>  	MLX5_ASSERT(!bt->table && !bt->size);
>  	memset(bt, 0, sizeof(*bt));
> -	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +				MLX5_MEM_FALLBACK_ANY_SOCKET,
>  				sizeof(struct mr_cache_entry) * n,
>  				0, socket);
>  	if (bt->table == NULL) {
> @@ -767,7 +768,7 @@ mlx5_mr_create_primary(void *pd,
>  	      (void *)addr, data.start, data.end, msl->page_sz, ms_n);
>  	/* Size of memory for bitmap. */
>  	bmp_size = rte_bitmap_get_memory_footprint(ms_n);
> -	mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO,
> +	mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO |
> +MLX5_MEM_FALLBACK_ANY_SOCKET,
>  			 RTE_ALIGN_CEIL(sizeof(*mr),
> RTE_CACHE_LINE_SIZE) +
>  			 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
>  	if (mr == NULL) {
> diff --git a/drivers/common/mlx5/mlx5_malloc.c
> b/drivers/common/mlx5/mlx5_malloc.c
> index c58c41da9266..e109f1bfa994 100644
> --- a/drivers/common/mlx5/mlx5_malloc.c
> +++ b/drivers/common/mlx5/mlx5_malloc.c
> @@ -182,6 +182,13 @@ mlx5_malloc(uint32_t flags, size_t size, unsigned int
> align, int socket)
>  			addr = rte_zmalloc_socket(NULL, size, align, socket);
>  		else
>  			addr = rte_malloc_socket(NULL, size, align, socket);
> +		if (!addr && socket != SOCKET_ID_ANY &&
> +		    (flags & MLX5_MEM_FALLBACK_ANY_SOCKET)) {
> +			if (flags & MLX5_MEM_ZERO)
> +				addr = rte_zmalloc_socket(NULL, size, align,
> SOCKET_ID_ANY);
> +			else
> +				addr = rte_malloc_socket(NULL, size, align,
> SOCKET_ID_ANY);
> +		}
>  		mlx5_mem_update_msl(addr);
>  #ifdef RTE_LIBRTE_MLX5_DEBUG
>  		if (addr)
> diff --git a/drivers/common/mlx5/mlx5_malloc.h
> b/drivers/common/mlx5/mlx5_malloc.h
> index 9086a4f3f22e..cd57f95a629e 100644
> --- a/drivers/common/mlx5/mlx5_malloc.h
> +++ b/drivers/common/mlx5/mlx5_malloc.h
> @@ -28,6 +28,10 @@ enum mlx5_mem_flags {
>  	/* Memory should be allocated from rte hugepage. */
>  	MLX5_MEM_ZERO = 1 << 2,
>  	/* Memory should be cleared to zero. */
> +	MLX5_MEM_FALLBACK_ANY_SOCKET = 1 << 3,
> +	/* Memory can be allocated on any socket if
> +	 * it fails to allocate on the given socket.
> +	 */
>  };
> 
>  /**
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> f9aea1318736..5b520d468299 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -2063,7 +2063,8 @@ mlx5_proc_priv_init(struct rte_eth_dev *dev)
>  	 */
>  	ppriv_size = sizeof(struct mlx5_proc_priv) +
>  		     priv->txqs_n * sizeof(struct mlx5_uar_data);
> -	ppriv = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> ppriv_size,
> +	ppriv = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +			    MLX5_MEM_FALLBACK_ANY_SOCKET, ppriv_size,
>  			    RTE_CACHE_LINE_SIZE, dev->device-
> >numa_node);
>  	if (!ppriv) {
>  		rte_errno = ENOMEM;
> diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
> index 4369d2557e9e..47a925e5913a 100644
> --- a/drivers/net/mlx5/mlx5_devx.c
> +++ b/drivers/net/mlx5/mlx5_devx.c
> @@ -1285,7 +1285,8 @@ mlx5_txq_obj_hairpin_new(struct rte_eth_dev
> *dev, uint16_t idx)
> 
> 	RTE_BIT32(host_mem_attr.wq_attr.log_hairpin_num_packets);
>  		umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>  		umem_size += MLX5_DBR_SIZE;
> -		umem_buf = mlx5_malloc(MLX5_MEM_RTE |
> MLX5_MEM_ZERO, umem_size,
> +		umem_buf = mlx5_malloc(MLX5_MEM_RTE |
> MLX5_MEM_ZERO |
> +				       MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>  				       alignment, priv->sh->numa_node);
>  		if (umem_buf == NULL && txq_ctrl-
> >hairpin_conf.force_memory) {
>  			DRV_LOG(ERR, "Failed to allocate memory for
> hairpin TX queue"); diff --git a/drivers/net/mlx5/mlx5_rxq.c
> b/drivers/net/mlx5/mlx5_rxq.c index ad8fd13cbe8e..6bdf1678e499 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -1728,7 +1728,8 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>  		desc >>= mprq_log_actual_stride_num;
>  		alloc_size += desc * sizeof(struct mlx5_mprq_buf *);
>  	}
> -	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> alloc_size, 0, socket);
> +	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +			   MLX5_MEM_FALLBACK_ANY_SOCKET, alloc_size,
> 0, socket);
>  	if (!tmpl) {
>  		rte_errno = ENOMEM;
>  		return NULL;
> diff --git a/drivers/net/mlx5/mlx5_trigger.c
> b/drivers/net/mlx5/mlx5_trigger.c index bbaa7d2aa021..1a6abdf2b61d
> 100644
> --- a/drivers/net/mlx5/mlx5_trigger.c
> +++ b/drivers/net/mlx5/mlx5_trigger.c
> @@ -55,7 +55,8 @@ mlx5_txq_start(struct rte_eth_dev *dev)
>  	for (i = 0; i != priv->txqs_n; ++i) {
>  		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
>  		struct mlx5_txq_data *txq_data = &txq_ctrl->txq;
> -		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
> +		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +				 MLX5_MEM_FALLBACK_ANY_SOCKET;
> 
>  		if (!txq_ctrl)
>  			continue;
> @@ -180,7 +181,8 @@ mlx5_rxq_ctrl_prepare(struct rte_eth_dev *dev,
> struct mlx5_rxq_ctrl *rxq_ctrl,
>  			return ret;
>  	}
>  	MLX5_ASSERT(!rxq_ctrl->obj);
> -	rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +	rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +				    MLX5_MEM_FALLBACK_ANY_SOCKET,
>  				    sizeof(*rxq_ctrl->obj), 0,
>  				    rxq_ctrl->socket);
>  	if (!rxq_ctrl->obj) {
> diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
> index 5a5df2d1bb16..c81ae7bfd328 100644
> --- a/drivers/net/mlx5/mlx5_txpp.c
> +++ b/drivers/net/mlx5/mlx5_txpp.c
> @@ -394,7 +394,8 @@ mlx5_txpp_create_clock_queue(struct
> mlx5_dev_ctx_shared *sh)
>  	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
>  	int ret;
> 
> -	sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +	sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +				   MLX5_MEM_FALLBACK_ANY_SOCKET,
>  				   MLX5_TXPP_REARM_SQ_SIZE *
>  				   sizeof(struct mlx5_txpp_ts),
>  				   0, sh->numa_node);
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index
> 8cb52b0f7d8e..b83e798544d2 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -1074,7 +1074,8 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>  	struct mlx5_priv *priv = dev->data->dev_private;
>  	struct mlx5_txq_ctrl *tmpl;
> 
> -	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> sizeof(*tmpl) +
> +	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +			   MLX5_MEM_FALLBACK_ANY_SOCKET,
> sizeof(*tmpl) +
>  			   desc * sizeof(struct rte_mbuf *), 0, socket);
>  	if (!tmpl) {
>  		rte_errno = ENOMEM;
> --
> 2.39.2


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] net/mlx5: support bus socket with no hugepages
  2023-10-30 10:43 ` Slava Ovsiienko
@ 2023-10-30 11:39   ` Hari Sasank
  2023-10-30 15:45     ` Slava Ovsiienko
  0 siblings, 1 reply; 4+ messages in thread
From: Hari Sasank @ 2023-10-30 11:39 UTC (permalink / raw)
  To: Slava Ovsiienko, Matan Azrad, Ori Kam, Suanming Mou; +Cc: dev

[-- Attachment #1: Type: text/plain, Size: 14015 bytes --]

Hi Slava

We have a DPDK application running with all the CPU logical cores belonging to a single socket on a multi-socket system. But the NIC itself is on a bus that is attached to the CPU from a different socket.
In this case, the driver is allocating huge pages belonging to a socket that is different from the CPU.  But, since huge pages are not available on the socket of the NIC card, it crashes the application.
(We started this dpdk application saying not to use memory from the socket where this NIC card is present because that is one of the requirements. We also probably can't move this NIC from the PCIe bus from this socket as all the buses from the other socket are in use.).

I understand the performance implications of this, so we use this flag to permit the mlx5_malloc to allocate its huge pages from any socket (SOCKET_ID_ANY seems to try starting with the current cpu lcore socket) only if huge pages are not available on the NIC socket that the driver is originally called with.

I would be happy to discover if there is a better way to solve this.

Thanks
Hari
________________________________
From: Slava Ovsiienko <viacheslavo@nvidia.com>
Sent: Monday, October 30, 2023 11:43 AM
To: Hari Sasank <harisasank@outlook.com>; Matan Azrad <matan@nvidia.com>; Ori Kam <orika@nvidia.com>; Suanming Mou <suanmingm@nvidia.com>
Cc: dev@dpdk.org <dev@dpdk.org>
Subject: RE: [PATCH] net/mlx5: support bus socket with no hugepages

Hi, Hari

As I see almost all updates, using newly introduced MLX5_MEM_FALLBACK_ANY_SOCKET flag,
are related to the memory mapped for the hardware usage (NIC accesses these areas with DMA
over the PCIe bus segment NIC attached to). It means the memory allegiance to the specific
is strong and might be critical for the performance.

In general, mlx5 PMD is designed in way requesting the memory on the specific socket only
If, and only if it is really needed.

Generally speaking, missing huge pages on the socket to which the NIC is attached to
should be considered as misconfiguration.  Could you, please, elaborate a little bit more, what is a use case
for this scenario?

With best regards,
Slava

> -----Original Message-----
> From: Hari Sasank <harisasank@outlook.com>
> Sent: Wednesday, June 21, 2023 10:31 AM
> To: Matan Azrad <matan@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Ori Kam <orika@nvidia.com>; Suanming Mou
> <suanmingm@nvidia.com>
> Cc: dev@dpdk.org; Hari Sasank <harisasank@outlook.com>
> Subject: [PATCH] net/mlx5: support bus socket with no hugepages
>
> When a Mellanox NIC is attached to a bus on a numa socket, it tries to
> allocate rte memory in that socket.
> If hugepages are not configured/available on that rte socket
> mlx5_common_pci_probe fails with ENOMEM.
>
> In this patch, a memflag MLX5_MEM_FALLBACK_ANY_SOCKET is introduced
> which when set on mlx5_malloc, will allocate the memory using
> SOCKET_ID_ANY if it is not able to allocate memory on the specified socket.
> This allocates memory on any socket starting with the current thread's
> socket.
>
> Signed-off-by: Hari Sasank <harisasank@outlook.com>
> ---
>  drivers/common/mlx5/mlx5_common_devx.c | 9 ++++++---
>  drivers/common/mlx5/mlx5_common_mr.c   | 5 +++--
>  drivers/common/mlx5/mlx5_malloc.c      | 7 +++++++
>  drivers/common/mlx5/mlx5_malloc.h      | 4 ++++
>  drivers/net/mlx5/mlx5.c                | 3 ++-
>  drivers/net/mlx5/mlx5_devx.c           | 3 ++-
>  drivers/net/mlx5/mlx5_rxq.c            | 3 ++-
>  drivers/net/mlx5/mlx5_trigger.c        | 6 ++++--
>  drivers/net/mlx5/mlx5_txpp.c           | 3 ++-
>  drivers/net/mlx5/mlx5_txq.c            | 3 ++-
>  10 files changed, 34 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/common/mlx5/mlx5_common_devx.c
> b/drivers/common/mlx5/mlx5_common_devx.c
> index 431d8361cebd..122f1c65eab6 100644
> --- a/drivers/common/mlx5/mlx5_common_devx.c
> +++ b/drivers/common/mlx5/mlx5_common_devx.c
> @@ -107,7 +107,8 @@ mlx5_devx_cq_create(void *ctx, struct
> mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
>        umem_size = sizeof(struct mlx5_cqe) * num_of_cqes;
>        umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>        umem_size += MLX5_DBR_SIZE;
> -     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                            MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>                               alignment, socket);
>        if (!umem_buf) {
>                DRV_LOG(ERR, "Failed to allocate memory for CQ."); @@ -
> 225,7 +226,8 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq
> *sq_obj, uint16_t log_wqbb_n,
>        umem_size = MLX5_WQE_SIZE * num_of_wqbbs;
>        umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>        umem_size += MLX5_DBR_SIZE;
> -     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                            MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>                               alignment, socket);
>        if (!umem_buf) {
>                DRV_LOG(ERR, "Failed to allocate memory for SQ."); @@ -
> 476,7 +478,8 @@ mlx5_devx_wq_init(void *ctx, uint32_t wqe_size, uint16_t
> log_wqbb_n, int socket,
>        umem_size = wqe_size * (1 << log_wqbb_n);
>        umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>        umem_size += MLX5_DBR_SIZE;
> -     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                            MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>                               alignment, socket);
>        if (!umem_buf) {
>                DRV_LOG(ERR, "Failed to allocate memory for RQ."); diff --
> git a/drivers/common/mlx5/mlx5_common_mr.c
> b/drivers/common/mlx5/mlx5_common_mr.c
> index 7b14b0c7bf1e..b2ad6a249732 100644
> --- a/drivers/common/mlx5/mlx5_common_mr.c
> +++ b/drivers/common/mlx5/mlx5_common_mr.c
> @@ -223,7 +223,8 @@ mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n,
> int socket)
>        }
>        MLX5_ASSERT(!bt->table && !bt->size);
>        memset(bt, 0, sizeof(*bt));
> -     bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +     bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                             MLX5_MEM_FALLBACK_ANY_SOCKET,
>                                sizeof(struct mr_cache_entry) * n,
>                                0, socket);
>        if (bt->table == NULL) {
> @@ -767,7 +768,7 @@ mlx5_mr_create_primary(void *pd,
>              (void *)addr, data.start, data.end, msl->page_sz, ms_n);
>        /* Size of memory for bitmap. */
>        bmp_size = rte_bitmap_get_memory_footprint(ms_n);
> -     mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO,
> +     mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO |
> +MLX5_MEM_FALLBACK_ANY_SOCKET,
>                         RTE_ALIGN_CEIL(sizeof(*mr),
> RTE_CACHE_LINE_SIZE) +
>                         bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
>        if (mr == NULL) {
> diff --git a/drivers/common/mlx5/mlx5_malloc.c
> b/drivers/common/mlx5/mlx5_malloc.c
> index c58c41da9266..e109f1bfa994 100644
> --- a/drivers/common/mlx5/mlx5_malloc.c
> +++ b/drivers/common/mlx5/mlx5_malloc.c
> @@ -182,6 +182,13 @@ mlx5_malloc(uint32_t flags, size_t size, unsigned int
> align, int socket)
>                        addr = rte_zmalloc_socket(NULL, size, align, socket);
>                else
>                        addr = rte_malloc_socket(NULL, size, align, socket);
> +             if (!addr && socket != SOCKET_ID_ANY &&
> +                 (flags & MLX5_MEM_FALLBACK_ANY_SOCKET)) {
> +                     if (flags & MLX5_MEM_ZERO)
> +                             addr = rte_zmalloc_socket(NULL, size, align,
> SOCKET_ID_ANY);
> +                     else
> +                             addr = rte_malloc_socket(NULL, size, align,
> SOCKET_ID_ANY);
> +             }
>                mlx5_mem_update_msl(addr);
>  #ifdef RTE_LIBRTE_MLX5_DEBUG
>                if (addr)
> diff --git a/drivers/common/mlx5/mlx5_malloc.h
> b/drivers/common/mlx5/mlx5_malloc.h
> index 9086a4f3f22e..cd57f95a629e 100644
> --- a/drivers/common/mlx5/mlx5_malloc.h
> +++ b/drivers/common/mlx5/mlx5_malloc.h
> @@ -28,6 +28,10 @@ enum mlx5_mem_flags {
>        /* Memory should be allocated from rte hugepage. */
>        MLX5_MEM_ZERO = 1 << 2,
>        /* Memory should be cleared to zero. */
> +     MLX5_MEM_FALLBACK_ANY_SOCKET = 1 << 3,
> +     /* Memory can be allocated on any socket if
> +      * it fails to allocate on the given socket.
> +      */
>  };
>
>  /**
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> f9aea1318736..5b520d468299 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -2063,7 +2063,8 @@ mlx5_proc_priv_init(struct rte_eth_dev *dev)
>         */
>        ppriv_size = sizeof(struct mlx5_proc_priv) +
>                     priv->txqs_n * sizeof(struct mlx5_uar_data);
> -     ppriv = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> ppriv_size,
> +     ppriv = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                         MLX5_MEM_FALLBACK_ANY_SOCKET, ppriv_size,
>                            RTE_CACHE_LINE_SIZE, dev->device-
> >numa_node);
>        if (!ppriv) {
>                rte_errno = ENOMEM;
> diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
> index 4369d2557e9e..47a925e5913a 100644
> --- a/drivers/net/mlx5/mlx5_devx.c
> +++ b/drivers/net/mlx5/mlx5_devx.c
> @@ -1285,7 +1285,8 @@ mlx5_txq_obj_hairpin_new(struct rte_eth_dev
> *dev, uint16_t idx)
>
>        RTE_BIT32(host_mem_attr.wq_attr.log_hairpin_num_packets);
>                umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>                umem_size += MLX5_DBR_SIZE;
> -             umem_buf = mlx5_malloc(MLX5_MEM_RTE |
> MLX5_MEM_ZERO, umem_size,
> +             umem_buf = mlx5_malloc(MLX5_MEM_RTE |
> MLX5_MEM_ZERO |
> +                                    MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>                                       alignment, priv->sh->numa_node);
>                if (umem_buf == NULL && txq_ctrl-
> >hairpin_conf.force_memory) {
>                        DRV_LOG(ERR, "Failed to allocate memory for
> hairpin TX queue"); diff --git a/drivers/net/mlx5/mlx5_rxq.c
> b/drivers/net/mlx5/mlx5_rxq.c index ad8fd13cbe8e..6bdf1678e499 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -1728,7 +1728,8 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>                desc >>= mprq_log_actual_stride_num;
>                alloc_size += desc * sizeof(struct mlx5_mprq_buf *);
>        }
> -     tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> alloc_size, 0, socket);
> +     tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                        MLX5_MEM_FALLBACK_ANY_SOCKET, alloc_size,
> 0, socket);
>        if (!tmpl) {
>                rte_errno = ENOMEM;
>                return NULL;
> diff --git a/drivers/net/mlx5/mlx5_trigger.c
> b/drivers/net/mlx5/mlx5_trigger.c index bbaa7d2aa021..1a6abdf2b61d
> 100644
> --- a/drivers/net/mlx5/mlx5_trigger.c
> +++ b/drivers/net/mlx5/mlx5_trigger.c
> @@ -55,7 +55,8 @@ mlx5_txq_start(struct rte_eth_dev *dev)
>        for (i = 0; i != priv->txqs_n; ++i) {
>                struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
>                struct mlx5_txq_data *txq_data = &txq_ctrl->txq;
> -             uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
> +             uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                              MLX5_MEM_FALLBACK_ANY_SOCKET;
>
>                if (!txq_ctrl)
>                        continue;
> @@ -180,7 +181,8 @@ mlx5_rxq_ctrl_prepare(struct rte_eth_dev *dev,
> struct mlx5_rxq_ctrl *rxq_ctrl,
>                        return ret;
>        }
>        MLX5_ASSERT(!rxq_ctrl->obj);
> -     rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +     rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                                 MLX5_MEM_FALLBACK_ANY_SOCKET,
>                                    sizeof(*rxq_ctrl->obj), 0,
>                                    rxq_ctrl->socket);
>        if (!rxq_ctrl->obj) {
> diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
> index 5a5df2d1bb16..c81ae7bfd328 100644
> --- a/drivers/net/mlx5/mlx5_txpp.c
> +++ b/drivers/net/mlx5/mlx5_txpp.c
> @@ -394,7 +394,8 @@ mlx5_txpp_create_clock_queue(struct
> mlx5_dev_ctx_shared *sh)
>        struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
>        int ret;
>
> -     sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +     sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                                MLX5_MEM_FALLBACK_ANY_SOCKET,
>                                   MLX5_TXPP_REARM_SQ_SIZE *
>                                   sizeof(struct mlx5_txpp_ts),
>                                   0, sh->numa_node);
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index
> 8cb52b0f7d8e..b83e798544d2 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -1074,7 +1074,8 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>        struct mlx5_priv *priv = dev->data->dev_private;
>        struct mlx5_txq_ctrl *tmpl;
>
> -     tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> sizeof(*tmpl) +
> +     tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                        MLX5_MEM_FALLBACK_ANY_SOCKET,
> sizeof(*tmpl) +
>                           desc * sizeof(struct rte_mbuf *), 0, socket);
>        if (!tmpl) {
>                rte_errno = ENOMEM;
> --
> 2.39.2


[-- Attachment #2: Type: text/html, Size: 28745 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: [PATCH] net/mlx5: support bus socket with no hugepages
  2023-10-30 11:39   ` Hari Sasank
@ 2023-10-30 15:45     ` Slava Ovsiienko
  0 siblings, 0 replies; 4+ messages in thread
From: Slava Ovsiienko @ 2023-10-30 15:45 UTC (permalink / raw)
  To: Hari Sasank, Matan Azrad, Ori Kam, Suanming Mou; +Cc: dev

[-- Attachment #1: Type: text/plain, Size: 15731 bytes --]

Hi, Hari


  1.  >> We have a DPDK application running with all the CPU logical cores belonging to a single socket on a multi-socket system.
  2.  >> But the NIC itself is on a bus that is attached to the CPU from a different socket
  3.  >> socket where this NIC card is present because that is one of the requirements

This is not quite a typical use case - running the driver not close to the HW. All transactions between the NIC and remote (not on the same socket)
memory will cause memory subsystem overhead - QPI (or other inter-socket bus) will be highly loaded, MESI protocols will be engaged, and so on.
I believe you have strong reasons to choose this configuration, but this is certainly something the mlx5 driver was not designed for.
I would prefer to have a way to detect this socket misconfiguration for more common scenarios (now mlx5 just fails).
What about introducing a devarg "allow_any_socket", allowing "wrong socket" allocation?

With best regards,
Slava

From: Hari Sasank <harisasank@outlook.com>
Sent: Monday, October 30, 2023 1:40 PM
To: Slava Ovsiienko <viacheslavo@nvidia.com>; Matan Azrad <matan@nvidia.com>; Ori Kam <orika@nvidia.com>; Suanming Mou <suanmingm@nvidia.com>
Cc: dev@dpdk.org
Subject: Re: [PATCH] net/mlx5: support bus socket with no hugepages

Hi Slava

We have a DPDK application running with all the CPU logical cores belonging to a single socket on a multi-socket system. But the NIC itself is on a bus that is attached to the CPU from a different socket.
In this case, the driver is allocating huge pages belonging to a socket that is different from the CPU.  But, since huge pages are not available on the socket of the NIC card, it crashes the application.
(We started this dpdk application saying not to use memory from the socket where this NIC card is present because that is one of the requirements. We also probably can't move this NIC from the PCIe bus from this socket as all the buses from the other socket are in use.).

I understand the performance implications of this, so we use this flag to permit the mlx5_malloc to allocate its huge pages from any socket (SOCKET_ID_ANY seems to try starting with the current cpu lcore socket) only if huge pages are not available on the NIC socket that the driver is originally called with.

I would be happy to discover if there is a better way to solve this.

Thanks
Hari
________________________________
From: Slava Ovsiienko <viacheslavo@nvidia.com<mailto:viacheslavo@nvidia.com>>
Sent: Monday, October 30, 2023 11:43 AM
To: Hari Sasank <harisasank@outlook.com<mailto:harisasank@outlook.com>>; Matan Azrad <matan@nvidia.com<mailto:matan@nvidia.com>>; Ori Kam <orika@nvidia.com<mailto:orika@nvidia.com>>; Suanming Mou <suanmingm@nvidia.com<mailto:suanmingm@nvidia.com>>
Cc: dev@dpdk.org<mailto:dev@dpdk.org> <dev@dpdk.org<mailto:dev@dpdk.org>>
Subject: RE: [PATCH] net/mlx5: support bus socket with no hugepages

Hi, Hari

As I see it, almost all updates using the newly introduced MLX5_MEM_FALLBACK_ANY_SOCKET flag
are related to memory mapped for hardware usage (the NIC accesses these areas with DMA
over the PCIe bus segment the NIC is attached to). It means the memory affinity to the specific
socket is strong and might be critical for performance.

In general, the mlx5 PMD is designed in a way that requests memory on a specific socket
if, and only if, it is really needed.

Generally speaking, missing huge pages on the socket to which the NIC is attached
should be considered a misconfiguration.  Could you please elaborate a little bit more on what the use case
for this scenario is?

With best regards,
Slava

> -----Original Message-----
> From: Hari Sasank <harisasank@outlook.com<mailto:harisasank@outlook.com>>
> Sent: Wednesday, June 21, 2023 10:31 AM
> To: Matan Azrad <matan@nvidia.com<mailto:matan@nvidia.com>>; Slava Ovsiienko
> <viacheslavo@nvidia.com<mailto:viacheslavo@nvidia.com>>; Ori Kam <orika@nvidia.com<mailto:orika@nvidia.com>>; Suanming Mou
> <suanmingm@nvidia.com<mailto:suanmingm@nvidia.com>>
> Cc: dev@dpdk.org<mailto:dev@dpdk.org>; Hari Sasank <harisasank@outlook.com<mailto:harisasank@outlook.com>>
> Subject: [PATCH] net/mlx5: support bus socket with no hugepages
>
> When a Mellanox NIC is attached to a bus on a numa socket, it tries to
> allocate rte memory in that socket.
> If hugepages are not configured/available on that rte socket
> mlx5_common_pci_probe fails with ENOMEM.
>
> In this patch, a memflag MLX5_MEM_FALLBACK_ANY_SOCKET is introduced
> which when set on mlx5_malloc, will allocate the memory using
> SOCKET_ID_ANY if it is not able to allocate memory on the specified socket.
> This allocates memory on any socket starting with the current thread's
> socket.
>
> Signed-off-by: Hari Sasank <harisasank@outlook.com<mailto:harisasank@outlook.com>>
> ---
>  drivers/common/mlx5/mlx5_common_devx.c | 9 ++++++---
>  drivers/common/mlx5/mlx5_common_mr.c   | 5 +++--
>  drivers/common/mlx5/mlx5_malloc.c      | 7 +++++++
>  drivers/common/mlx5/mlx5_malloc.h      | 4 ++++
>  drivers/net/mlx5/mlx5.c                | 3 ++-
>  drivers/net/mlx5/mlx5_devx.c           | 3 ++-
>  drivers/net/mlx5/mlx5_rxq.c            | 3 ++-
>  drivers/net/mlx5/mlx5_trigger.c        | 6 ++++--
>  drivers/net/mlx5/mlx5_txpp.c           | 3 ++-
>  drivers/net/mlx5/mlx5_txq.c            | 3 ++-
>  10 files changed, 34 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/common/mlx5/mlx5_common_devx.c
> b/drivers/common/mlx5/mlx5_common_devx.c
> index 431d8361cebd..122f1c65eab6 100644
> --- a/drivers/common/mlx5/mlx5_common_devx.c
> +++ b/drivers/common/mlx5/mlx5_common_devx.c
> @@ -107,7 +107,8 @@ mlx5_devx_cq_create(void *ctx, struct
> mlx5_devx_cq *cq_obj, uint16_t log_desc_n,
>        umem_size = sizeof(struct mlx5_cqe) * num_of_cqes;
>        umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>        umem_size += MLX5_DBR_SIZE;
> -     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                            MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>                               alignment, socket);
>        if (!umem_buf) {
>                DRV_LOG(ERR, "Failed to allocate memory for CQ."); @@ -
> 225,7 +226,8 @@ mlx5_devx_sq_create(void *ctx, struct mlx5_devx_sq
> *sq_obj, uint16_t log_wqbb_n,
>        umem_size = MLX5_WQE_SIZE * num_of_wqbbs;
>        umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>        umem_size += MLX5_DBR_SIZE;
> -     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                            MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>                               alignment, socket);
>        if (!umem_buf) {
>                DRV_LOG(ERR, "Failed to allocate memory for SQ."); @@ -
> 476,7 +478,8 @@ mlx5_devx_wq_init(void *ctx, uint32_t wqe_size, uint16_t
> log_wqbb_n, int socket,
>        umem_size = wqe_size * (1 << log_wqbb_n);
>        umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>        umem_size += MLX5_DBR_SIZE;
> -     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> umem_size,
> +     umem_buf = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                            MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>                               alignment, socket);
>        if (!umem_buf) {
>                DRV_LOG(ERR, "Failed to allocate memory for RQ."); diff --
> git a/drivers/common/mlx5/mlx5_common_mr.c
> b/drivers/common/mlx5/mlx5_common_mr.c
> index 7b14b0c7bf1e..b2ad6a249732 100644
> --- a/drivers/common/mlx5/mlx5_common_mr.c
> +++ b/drivers/common/mlx5/mlx5_common_mr.c
> @@ -223,7 +223,8 @@ mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n,
> int socket)
>        }
>        MLX5_ASSERT(!bt->table && !bt->size);
>        memset(bt, 0, sizeof(*bt));
> -     bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +     bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                             MLX5_MEM_FALLBACK_ANY_SOCKET,
>                                sizeof(struct mr_cache_entry) * n,
>                                0, socket);
>        if (bt->table == NULL) {
> @@ -767,7 +768,7 @@ mlx5_mr_create_primary(void *pd,
>              (void *)addr, data.start, data.end, msl->page_sz, ms_n);
>        /* Size of memory for bitmap. */
>        bmp_size = rte_bitmap_get_memory_footprint(ms_n);
> -     mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO,
> +     mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO |
> +MLX5_MEM_FALLBACK_ANY_SOCKET,
>                         RTE_ALIGN_CEIL(sizeof(*mr),
> RTE_CACHE_LINE_SIZE) +
>                         bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
>        if (mr == NULL) {
> diff --git a/drivers/common/mlx5/mlx5_malloc.c
> b/drivers/common/mlx5/mlx5_malloc.c
> index c58c41da9266..e109f1bfa994 100644
> --- a/drivers/common/mlx5/mlx5_malloc.c
> +++ b/drivers/common/mlx5/mlx5_malloc.c
> @@ -182,6 +182,13 @@ mlx5_malloc(uint32_t flags, size_t size, unsigned int
> align, int socket)
>                        addr = rte_zmalloc_socket(NULL, size, align, socket);
>                else
>                        addr = rte_malloc_socket(NULL, size, align, socket);
> +             if (!addr && socket != SOCKET_ID_ANY &&
> +                 (flags & MLX5_MEM_FALLBACK_ANY_SOCKET)) {
> +                     if (flags & MLX5_MEM_ZERO)
> +                             addr = rte_zmalloc_socket(NULL, size, align,
> SOCKET_ID_ANY);
> +                     else
> +                             addr = rte_malloc_socket(NULL, size, align,
> SOCKET_ID_ANY);
> +             }
>                mlx5_mem_update_msl(addr);
>  #ifdef RTE_LIBRTE_MLX5_DEBUG
>                if (addr)
> diff --git a/drivers/common/mlx5/mlx5_malloc.h
> b/drivers/common/mlx5/mlx5_malloc.h
> index 9086a4f3f22e..cd57f95a629e 100644
> --- a/drivers/common/mlx5/mlx5_malloc.h
> +++ b/drivers/common/mlx5/mlx5_malloc.h
> @@ -28,6 +28,10 @@ enum mlx5_mem_flags {
>        /* Memory should be allocated from rte hugepage. */
>        MLX5_MEM_ZERO = 1 << 2,
>        /* Memory should be cleared to zero. */
> +     MLX5_MEM_FALLBACK_ANY_SOCKET = 1 << 3,
> +     /* Memory can be allocated on any socket if
> +      * it fails to allocate on the given socket.
> +      */
>  };
>
>  /**
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> f9aea1318736..5b520d468299 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -2063,7 +2063,8 @@ mlx5_proc_priv_init(struct rte_eth_dev *dev)
>         */
>        ppriv_size = sizeof(struct mlx5_proc_priv) +
>                     priv->txqs_n * sizeof(struct mlx5_uar_data);
> -     ppriv = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> ppriv_size,
> +     ppriv = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                         MLX5_MEM_FALLBACK_ANY_SOCKET, ppriv_size,
>                            RTE_CACHE_LINE_SIZE, dev->device-
> >numa_node);
>        if (!ppriv) {
>                rte_errno = ENOMEM;
> diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
> index 4369d2557e9e..47a925e5913a 100644
> --- a/drivers/net/mlx5/mlx5_devx.c
> +++ b/drivers/net/mlx5/mlx5_devx.c
> @@ -1285,7 +1285,8 @@ mlx5_txq_obj_hairpin_new(struct rte_eth_dev
> *dev, uint16_t idx)
>
>        RTE_BIT32(host_mem_attr.wq_attr.log_hairpin_num_packets);
>                umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
>                umem_size += MLX5_DBR_SIZE;
> -             umem_buf = mlx5_malloc(MLX5_MEM_RTE |
> MLX5_MEM_ZERO, umem_size,
> +             umem_buf = mlx5_malloc(MLX5_MEM_RTE |
> MLX5_MEM_ZERO |
> +                                    MLX5_MEM_FALLBACK_ANY_SOCKET,
> umem_size,
>                                       alignment, priv->sh->numa_node);
>                if (umem_buf == NULL && txq_ctrl-
> >hairpin_conf.force_memory) {
>                        DRV_LOG(ERR, "Failed to allocate memory for
> hairpin TX queue"); diff --git a/drivers/net/mlx5/mlx5_rxq.c
> b/drivers/net/mlx5/mlx5_rxq.c index ad8fd13cbe8e..6bdf1678e499 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -1728,7 +1728,8 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>                desc >>= mprq_log_actual_stride_num;
>                alloc_size += desc * sizeof(struct mlx5_mprq_buf *);
>        }
> -     tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> alloc_size, 0, socket);
> +     tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                        MLX5_MEM_FALLBACK_ANY_SOCKET, alloc_size,
> 0, socket);
>        if (!tmpl) {
>                rte_errno = ENOMEM;
>                return NULL;
> diff --git a/drivers/net/mlx5/mlx5_trigger.c
> b/drivers/net/mlx5/mlx5_trigger.c index bbaa7d2aa021..1a6abdf2b61d
> 100644
> --- a/drivers/net/mlx5/mlx5_trigger.c
> +++ b/drivers/net/mlx5/mlx5_trigger.c
> @@ -55,7 +55,8 @@ mlx5_txq_start(struct rte_eth_dev *dev)
>        for (i = 0; i != priv->txqs_n; ++i) {
>                struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
>                struct mlx5_txq_data *txq_data = &txq_ctrl->txq;
> -             uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
> +             uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                              MLX5_MEM_FALLBACK_ANY_SOCKET;
>
>                if (!txq_ctrl)
>                        continue;
> @@ -180,7 +181,8 @@ mlx5_rxq_ctrl_prepare(struct rte_eth_dev *dev,
> struct mlx5_rxq_ctrl *rxq_ctrl,
>                        return ret;
>        }
>        MLX5_ASSERT(!rxq_ctrl->obj);
> -     rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +     rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                                 MLX5_MEM_FALLBACK_ANY_SOCKET,
>                                    sizeof(*rxq_ctrl->obj), 0,
>                                    rxq_ctrl->socket);
>        if (!rxq_ctrl->obj) {
> diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
> index 5a5df2d1bb16..c81ae7bfd328 100644
> --- a/drivers/net/mlx5/mlx5_txpp.c
> +++ b/drivers/net/mlx5/mlx5_txpp.c
> @@ -394,7 +394,8 @@ mlx5_txpp_create_clock_queue(struct
> mlx5_dev_ctx_shared *sh)
>        struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
>        int ret;
>
> -     sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> +     sh->txpp.tsa = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                                MLX5_MEM_FALLBACK_ANY_SOCKET,
>                                   MLX5_TXPP_REARM_SQ_SIZE *
>                                   sizeof(struct mlx5_txpp_ts),
>                                   0, sh->numa_node);
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index
> 8cb52b0f7d8e..b83e798544d2 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -1074,7 +1074,8 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>        struct mlx5_priv *priv = dev->data->dev_private;
>        struct mlx5_txq_ctrl *tmpl;
>
> -     tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
> sizeof(*tmpl) +
> +     tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO |
> +                        MLX5_MEM_FALLBACK_ANY_SOCKET,
> sizeof(*tmpl) +
>                           desc * sizeof(struct rte_mbuf *), 0, socket);
>        if (!tmpl) {
>                rte_errno = ENOMEM;
> --
> 2.39.2

[-- Attachment #2: Type: text/html, Size: 35771 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-10-30 15:45 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-21  7:30 [PATCH] net/mlx5: support bus socket with no hugepages Hari Sasank
2023-10-30 10:43 ` Slava Ovsiienko
2023-10-30 11:39   ` Hari Sasank
2023-10-30 15:45     ` Slava Ovsiienko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).