DPDK patches and discussions
* [PATCH] common/mlx5: fix non-expandable global MR cache
@ 2022-06-24 20:35 Dmitry Kozlyuk
  2022-06-29 22:08 ` [PATCH v2] " Dmitry Kozlyuk
  0 siblings, 1 reply; 4+ messages in thread
From: Dmitry Kozlyuk @ 2022-06-24 20:35 UTC (permalink / raw)
  To: dev; +Cc: stable, Matan Azrad, Viacheslav Ovsiienko

The number of memory regions (MRs) that the MLX5 PMD can use
was limited to 512 per IB device, the size of the global MR cache,
which was fixed at compile time.
The cache allows searching for an MR LKey by address efficiently,
therefore it is the last place searched on the data path
(the global MR database is skipped because it would be too slow).
If the application logic caused the PMD to create more than 512 MRs,
which can be the case with external memory,
those MRs would never be found on the data path
and would later cause a HW failure.
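
For illustration only (the type and function names below are made up,
not taken from the tree), the lookup that the cache provides is
conceptually a binary search over address ranges sorted by start
address, returning the LKey of the range that contains the address:

    #include <stdint.h>

    struct range_entry { uintptr_t start; uintptr_t end; uint32_t lkey; };

    /* Return the LKey covering addr, or UINT32_MAX if none.
     * tbl[0..n-1] is sorted by start address and n must be >= 1.
     */
    static uint32_t
    range_lookup(const struct range_entry *tbl, uint32_t n, uintptr_t addr)
    {
            uint32_t base = 0;

            do {
                    uint32_t delta = n >> 1;

                    if (addr < tbl[base + delta].start) {
                            n = delta;
                    } else {
                            base += delta;
                            n -= delta;
                    }
            } while (n > 1);
            return (addr >= tbl[base].start && addr < tbl[base].end) ?
                   tbl[base].lkey : UINT32_MAX;
    }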

The cache size was fixed because at the time of overflow
the EAL memory hotplug lock may be held,
which prohibits allocating a larger cache
(it must reside in DPDK memory for multi-process support).
This patch adds logic to release the necessary locks,
extend the cache, and retry the insertion of new entries.
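
A minimal sketch of the resulting flow, modelled on the
mlx5_common_dev_dma_map() hunk below; the helper name
reserve_global_slot() is hypothetical (not part of the patch) and the
driver-internal mlx5_common_mr.h definitions are assumed:

    /* Make room in the global cache before inserting an entry,
     * never allocating while the MR lock is held.
     * On success, the MR write lock is left held for the caller.
     */
    static int
    reserve_global_slot(struct mlx5_mr_share_cache *scache, int socket)
    {
            for (;;) {
                    uint32_t want;

                    rte_rwlock_write_lock(&scache->rwlock);
                    if (scache->cache.len < scache->cache.size)
                            return 0; /* Insert, then unlock, in the caller. */
                    want = scache->cache.size + 1;
                    /* Drop the MR lock before allocating:
                     * mlx5_mr_create_primary() takes the EAL memory lock
                     * first and the MR lock second, so allocating (which may
                     * take the EAL memory lock) under the MR lock could
                     * deadlock.
                     */
                    rte_rwlock_write_unlock(&scache->rwlock);
                    if (mlx5_mr_expand_cache(scache, want, socket) < 0)
                            return -1;
            }
    }

The real code open-codes this retry with a goto in
mlx5_common_dev_dma_map(), and a similar retry appears in
mlx5_mr_create_primary().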

The `mlx5_mr_btree` structure had an `overflow` field
that was set when a cache (not only the global one)
could not accept new entries.
However, it was only checked for the global cache,
because the caches of upper layers were dynamically expandable.
With the global cache size limitation removed, this field is no longer needed.
The cache size was previously limited by its 16-bit indices.
Use the space in the structure previously filled by the `overflow` field
to extend the indices to 32 bits.
With this patch, only the HW and RAM limit the number of MRs.
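
As a quick size check (illustrative only; the struct names below are
stand-ins, not from the tree), the old fields, two 16-bit counters plus
an int `overflow` flag, occupy the same eight bytes as the two widened
32-bit counters, so the structure does not grow:

    #include <assert.h>
    #include <stdint.h>

    struct btree_hdr_old { uint16_t len; uint16_t size; int overflow; };
    struct btree_hdr_new { uint32_t len; uint32_t size; };

    static_assert(sizeof(struct btree_hdr_old) == sizeof(struct btree_hdr_new),
                  "widening the indices must not grow mlx5_mr_btree");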

Fixes: 974f1e7ef146 ("net/mlx5: add new memory region support")
Cc: stable@dpdk.org

Signed-off-by: Dmitry Kozlyuk <dkozlyuk@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/common/mlx5/mlx5_common.c    |  30 +++++
 drivers/common/mlx5/mlx5_common_mr.c | 158 ++++++++++++++++++++-------
 drivers/common/mlx5/mlx5_common_mr.h |   7 +-
 3 files changed, 150 insertions(+), 45 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_common.c b/drivers/common/mlx5/mlx5_common.c
index ef1604d223..89fef2b535 100644
--- a/drivers/common/mlx5/mlx5_common.c
+++ b/drivers/common/mlx5/mlx5_common.c
@@ -1082,6 +1082,7 @@ mlx5_common_dev_dma_map(struct rte_device *rte_dev, void *addr,
 			uint64_t iova __rte_unused, size_t len)
 {
 	struct mlx5_common_device *dev;
+	struct mlx5_mr_btree *bt;
 	struct mlx5_mr *mr;
 
 	dev = to_mlx5_device(rte_dev);
@@ -1099,7 +1100,36 @@ mlx5_common_dev_dma_map(struct rte_device *rte_dev, void *addr,
 		rte_errno = EINVAL;
 		return -1;
 	}
+try_insert:
 	rte_rwlock_write_lock(&dev->mr_scache.rwlock);
+	bt = &dev->mr_scache.cache;
+	if (bt->len == bt->size) {
+		uint32_t size;
+		int ret;
+
+		size = bt->size + 1;
+		MLX5_ASSERT(size > bt->size);
+		/*
+		 * Avoid deadlock (numbers show the sequence of events):
+		 *    mlx5_mr_create_primary():
+		 *        1) take EAL memory lock
+		 *        3) take MR lock
+		 *    this function:
+		 *        2) take MR lock
+		 *        4) take EAL memory lock while allocating the new cache
+		 * Releasing the MR lock before step 4
+		 * allows another thread to execute step 3.
+		 */
+		rte_rwlock_write_unlock(&dev->mr_scache.rwlock);
+		ret = mlx5_mr_expand_cache(&dev->mr_scache, size,
+					   rte_dev->numa_node);
+		if (ret < 0) {
+			mlx5_mr_free(mr, dev->mr_scache.dereg_mr_cb);
+			rte_errno = ret;
+			return -1;
+		}
+		goto try_insert;
+	}
 	LIST_INSERT_HEAD(&dev->mr_scache.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mlx5_mr_insert_cache(&dev->mr_scache, mr);
diff --git a/drivers/common/mlx5/mlx5_common_mr.c b/drivers/common/mlx5/mlx5_common_mr.c
index 06e4c8f187..0b43564bbb 100644
--- a/drivers/common/mlx5/mlx5_common_mr.c
+++ b/drivers/common/mlx5/mlx5_common_mr.c
@@ -78,7 +78,7 @@ mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
  *   0 on success, -1 on failure.
  */
 static int
-mr_btree_expand(struct mlx5_mr_btree *bt, int n)
+mr_btree_expand(struct mlx5_mr_btree *bt, uint32_t n)
 {
 	void *mem;
 	int ret = 0;
@@ -123,11 +123,11 @@ mr_btree_expand(struct mlx5_mr_btree *bt, int n)
  *   Searched LKey on success, UINT32_MAX on no match.
  */
 static uint32_t
-mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
+mr_btree_lookup(struct mlx5_mr_btree *bt, uint32_t *idx, uintptr_t addr)
 {
 	struct mr_cache_entry *lkp_tbl;
-	uint16_t n;
-	uint16_t base = 0;
+	uint32_t n;
+	uint32_t base = 0;
 
 	MLX5_ASSERT(bt != NULL);
 	lkp_tbl = *bt->table;
@@ -137,7 +137,7 @@ mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
 				    lkp_tbl[0].lkey == UINT32_MAX));
 	/* Binary search. */
 	do {
-		register uint16_t delta = n >> 1;
+		register uint32_t delta = n >> 1;
 
 		if (addr < lkp_tbl[base + delta].start) {
 			n = delta;
@@ -169,7 +169,7 @@ static int
 mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
 {
 	struct mr_cache_entry *lkp_tbl;
-	uint16_t idx = 0;
+	uint32_t idx = 0;
 	size_t shift;
 
 	MLX5_ASSERT(bt != NULL);
@@ -185,11 +185,8 @@ mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
 		/* Already exist, return. */
 		return 0;
 	}
-	/* If table is full, return error. */
-	if (unlikely(bt->len == bt->size)) {
-		bt->overflow = 1;
-		return -1;
-	}
+	/* Caller must ensure that there is enough place for a new entry. */
+	MLX5_ASSERT(bt->len < bt->size);
 	/* Insert entry. */
 	++idx;
 	shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
@@ -409,13 +406,8 @@ mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache,
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&share_cache->cache, &entry) < 0) {
-			/*
-			 * Overflowed, but the global table cannot be expanded
-			 * because of deadlock.
-			 */
+		if (mr_btree_insert(&share_cache->cache, &entry) < 0)
 			return -1;
-		}
 	}
 	return 0;
 }
@@ -477,26 +469,12 @@ static uint32_t
 mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
 		     struct mr_cache_entry *entry, uintptr_t addr)
 {
-	uint16_t idx;
-	uint32_t lkey = UINT32_MAX;
-	struct mlx5_mr *mr;
+	uint32_t idx;
+	uint32_t lkey;
 
-	/*
-	 * If the global cache has overflowed since it failed to expand the
-	 * B-tree table, it can't have all the existing MRs. Then, the address
-	 * has to be searched by traversing the original MR list instead, which
-	 * is very slow path. Otherwise, the global cache is all inclusive.
-	 */
-	if (!unlikely(share_cache->cache.overflow)) {
-		lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
-		if (lkey != UINT32_MAX)
-			*entry = (*share_cache->cache.table)[idx];
-	} else {
-		/* Falling back to the slowest path. */
-		mr = mlx5_mr_lookup_list(share_cache, entry, addr);
-		if (mr != NULL)
-			lkey = entry->lkey;
-	}
+	lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
+	if (lkey != UINT32_MAX)
+		*entry = (*share_cache->cache.table)[idx];
 	MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
 					   addr < entry->end));
 	return lkey;
@@ -528,7 +506,6 @@ mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache)
 	DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache);
 	/* Flush cache to rebuild. */
 	share_cache->cache.len = 1;
-	share_cache->cache.overflow = 0;
 	/* Iterate all the existing MRs. */
 	LIST_FOREACH(mr, &share_cache->mr_list, mr)
 		if (mlx5_mr_insert_cache(share_cache, mr) < 0)
@@ -584,6 +561,74 @@ mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
 	return 1;
 }
 
+/**
+ * Get the number of virtually-contiguous chunks in the MR.
+ * HW MR does not need to be already created to use this function.
+ *
+ * @param mr
+ *   Pointer to the MR.
+ *
+ * @return
+ *   Number of chunks.
+ */
+static uint32_t
+mr_get_chunk_count(const struct mlx5_mr *mr)
+{
+	uint32_t i, count = 0;
+	bool was_in_chunk = false;
+	bool is_in_chunk;
+
+	/* There is only one chunk in case of external memory. */
+	if (mr->msl == NULL)
+		return 1;
+	for (i = 0; i < mr->ms_bmp_n; i++) {
+		is_in_chunk = rte_bitmap_get(mr->ms_bmp, i);
+		if (!was_in_chunk && is_in_chunk)
+			count++;
+		was_in_chunk = is_in_chunk;
+	}
+	return count;
+}
+
+/**
+ * Thread-safely expand the global MR cache to at least @p new_size slots.
+ *
+ * @param share_cache
+ *  Shared MR cache for locking.
+ * @param new_size
+ *  Desired cache size.
+ * @param socket
+ *  NUMA node.
+ *
+ * @return
+ *  0 on success, negative on failure and rte_errno is set.
+ */
+int
+mlx5_mr_expand_cache(struct mlx5_mr_share_cache *share_cache,
+		     uint32_t size, int socket)
+{
+	struct mlx5_mr_btree cache;
+	struct mlx5_mr_btree *bt;
+	struct mr_cache_entry *lkp_tbl;
+	int ret;
+
+	size = rte_align32pow2(size);
+	ret = mlx5_mr_btree_init(&cache, size, socket);
+	if (ret < 0)
+		return ret;
+	rte_rwlock_write_lock(&share_cache->rwlock);
+	bt = &share_cache->cache;
+	lkp_tbl = *bt->table;
+	if (cache.size > bt->size) {
+		rte_memcpy(cache.table, lkp_tbl, bt->len * sizeof(lkp_tbl[0]));
+		RTE_SWAP(*bt, cache);
+		DRV_LOG(DEBUG, "Global MR cache expanded to %u slots", size);
+	}
+	rte_rwlock_write_unlock(&share_cache->rwlock);
+	mlx5_mr_btree_free(&cache);
+	return 0;
+}
+
 /**
  * Create a new global Memory Region (MR) for a missing virtual address.
  * This API should be called on a secondary process, then a request is sent to
@@ -659,12 +704,14 @@ mlx5_mr_create_primary(void *pd,
 	struct mr_find_contig_memsegs_data data_re;
 	const struct rte_memseg_list *msl;
 	const struct rte_memseg *ms;
+	struct mlx5_mr_btree *bt;
 	struct mlx5_mr *mr = NULL;
 	int ms_idx_shift = -1;
 	uint32_t bmp_size;
 	void *bmp_mem;
 	uint32_t ms_n;
 	uint32_t n;
+	uint32_t chunks_n;
 	size_t len;
 
 	DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr);
@@ -676,6 +723,7 @@ mlx5_mr_create_primary(void *pd,
 	 * is quite opportunistic.
 	 */
 	mlx5_mr_garbage_collect(share_cache);
+find_range:
 	/*
 	 * If enabled, find out a contiguous virtual address chunk in use, to
 	 * which the given address belongs, in order to register maximum range.
@@ -827,6 +875,33 @@ mlx5_mr_create_primary(void *pd,
 	len = data.end - data.start;
 	mr->ms_bmp_n = len / msl->page_sz;
 	MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
+	/*
+	 * It is now known how many entries will be used in the global cache.
+	 * If there is not enough, expand the cache.
+	 * This cannot be done while holding the memory hotplug lock.
+	 * While it is released, memory layout may change,
+	 * so the process must be repeated from the beginning.
+	 */
+	bt = &share_cache->cache;
+	chunks_n = mr_get_chunk_count(mr);
+	if (bt->len + chunks_n > bt->size) {
+		struct mlx5_common_device *cdev;
+		uint32_t size;
+
+		size = bt->size + chunks_n;
+		MLX5_ASSERT(size > bt->size);
+		cdev = container_of(share_cache, struct mlx5_common_device,
+				    mr_scache);
+		rte_rwlock_write_unlock(&share_cache->rwlock);
+		rte_mcfg_mem_read_unlock();
+		if (mlx5_mr_expand_cache(share_cache, size,
+					 cdev->dev->numa_node) < 0) {
+			DRV_LOG(ERR, "Failed to expand global MR cache to %u slots",
+				size);
+			goto err_nolock;
+		}
+		goto find_range;
+	}
 	/*
 	 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can
 	 * be called with holding the memory lock because it doesn't use
@@ -937,7 +1012,7 @@ mr_lookup_caches(struct mlx5_mr_ctrl *mr_ctrl,
 		container_of(share_cache, struct mlx5_common_device, mr_scache);
 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
 	uint32_t lkey;
-	uint16_t idx;
+	uint32_t idx;
 
 	/* If local cache table is full, try to double it. */
 	if (unlikely(bt->len == bt->size))
@@ -988,7 +1063,7 @@ static uint32_t
 mlx5_mr_addr2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr)
 {
 	uint32_t lkey;
-	uint16_t bh_idx = 0;
+	uint32_t bh_idx = 0;
 	/* Victim in top-half cache to replace with new entry. */
 	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
 
@@ -1085,7 +1160,6 @@ mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
 	memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
 	/* Reset the B-tree table. */
 	mr_ctrl->cache_bh.len = 1;
-	mr_ctrl->cache_bh.overflow = 0;
 	/* Update the generation number. */
 	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
 	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
@@ -1933,7 +2007,7 @@ mlx5_mr_mempool_populate_cache(struct mlx5_mr_ctrl *mr_ctrl,
 		struct mlx5_mempool_mr *mr = &mpr->mrs[i];
 		struct mr_cache_entry entry;
 		uint32_t lkey;
-		uint16_t idx;
+		uint32_t idx;
 
 		lkey = mr_btree_lookup(bt, &idx, (uintptr_t)mr->pmd_mr.addr);
 		if (lkey != UINT32_MAX)
@@ -1971,7 +2045,7 @@ mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl,
 {
 	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
 	uint32_t lkey;
-	uint16_t bh_idx = 0;
+	uint32_t bh_idx = 0;
 
 	/* Binary-search MR translation table. */
 	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
diff --git a/drivers/common/mlx5/mlx5_common_mr.h b/drivers/common/mlx5/mlx5_common_mr.h
index cf384b6748..213f5427cb 100644
--- a/drivers/common/mlx5/mlx5_common_mr.h
+++ b/drivers/common/mlx5/mlx5_common_mr.h
@@ -56,9 +56,8 @@ struct mr_cache_entry {
 
 /* MR Cache table for Binary search. */
 struct mlx5_mr_btree {
-	uint16_t len; /* Number of entries. */
-	uint16_t size; /* Total number of entries. */
-	int overflow; /* Mark failure of table expansion. */
+	uint32_t len; /* Number of entries. */
+	uint32_t size; /* Total number of entries. */
 	struct mr_cache_entry (*table)[];
 } __rte_packed;
 
@@ -218,6 +217,8 @@ void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused);
 __rte_internal
 uint32_t mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl,
 			       struct rte_mempool *mp, uintptr_t addr);
+int mlx5_mr_expand_cache(struct mlx5_mr_share_cache *share_cache,
+			 uint32_t new_size, int socket);
 void mlx5_mr_release_cache(struct mlx5_mr_share_cache *mr_cache);
 int mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket);
 void mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused);
-- 
2.25.1



* [PATCH v2] common/mlx5: fix non-expandable global MR cache
  2022-06-24 20:35 [PATCH] common/mlx5: fix non-expandable global MR cache Dmitry Kozlyuk
@ 2022-06-29 22:08 ` Dmitry Kozlyuk
  2022-07-04 14:22   ` Matan Azrad
  2022-07-04 16:20   ` Raslan Darawsheh
  0 siblings, 2 replies; 4+ messages in thread
From: Dmitry Kozlyuk @ 2022-06-29 22:08 UTC (permalink / raw)
  To: dev; +Cc: Raslan Darawsheh, stable, Matan Azrad, Viacheslav Ovsiienko

The number of memory regions (MRs) that the MLX5 PMD can use
was limited to 512 per IB device, the size of the global MR cache,
which was fixed at compile time.
The cache allows searching for an MR LKey by address efficiently,
therefore it is the last place searched on the data path
(the global MR database is skipped because it would be too slow).
If the application logic caused the PMD to create more than 512 MRs,
which can be the case with external memory,
those MRs would never be found on the data path
and would later cause a HW failure.
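
For context, this is roughly how an application ends up with that many
MRs (a rough, untested sketch: map_external_areas() is a hypothetical
helper, IOVA-as-VA mode is assumed, area_len must be a multiple of the
page size pg_sz, and the EAL external-memory documentation should be
consulted for the exact IOVA arguments):

    #include <rte_dev.h>
    #include <rte_memory.h>

    /* Register each externally allocated area with EAL and DMA-map it
     * to the mlx5 port's rte_device; every mapping creates at least one
     * MR, so more than 512 such areas used to overflow the fixed cache.
     */
    static int
    map_external_areas(struct rte_device *dev, void **va, size_t area_len,
                       size_t pg_sz, unsigned int areas_n)
    {
            unsigned int i;

            for (i = 0; i < areas_n; i++) {
                    if (rte_extmem_register(va[i], area_len, NULL, 0,
                                            pg_sz) < 0)
                            return -1;
                    if (rte_dev_dma_map(dev, va[i], (uintptr_t)va[i],
                                        area_len) < 0)
                            return -1;
            }
            return 0;
    }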

The cache size was fixed because at the time of overflow
the EAL memory hotplug lock may be held,
which prohibits allocating a larger cache
(it must reside in DPDK memory for multi-process support).
This patch adds logic to release the necessary locks,
extend the cache, and retry the insertion of new entries.
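
The expansion itself follows the usual allocate-copy-swap pattern
(sketched below with a hypothetical helper name and assuming the
driver-internal headers; the real code is mlx5_mr_expand_cache() in
the diff): build the larger B-tree outside the lock, publish it under
the write lock only if it is still larger than the current table, and
free whichever table ends up unused, so a concurrent expansion that
already grew the cache simply wins.

    /* Grow the shared cache to at least 'wanted' slots, thread-safely. */
    static int
    expand_shared_btree(struct mlx5_mr_share_cache *scache, uint32_t wanted,
                        int socket)
    {
            struct mlx5_mr_btree fresh = {0};

            if (mlx5_mr_btree_init(&fresh, rte_align32pow2(wanted),
                                   socket) < 0)
                    return -1;
            rte_rwlock_write_lock(&scache->rwlock);
            if (fresh.size > scache->cache.size) {
                    /* Copy the current entries, then publish the new table. */
                    rte_memcpy(fresh.table, scache->cache.table,
                               scache->cache.len *
                               sizeof((*scache->cache.table)[0]));
                    RTE_SWAP(scache->cache, fresh);
            }
            rte_rwlock_write_unlock(&scache->rwlock);
            /* 'fresh' is now either the unused new table or the old one. */
            mlx5_mr_btree_free(&fresh);
            return 0;
    }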

The `mlx5_mr_btree` structure had an `overflow` field
that was set when a cache (not only the global one)
could not accept new entries.
However, it was only checked for the global cache,
because the caches of upper layers were dynamically expandable.
With the global cache size limitation removed, this field is no longer needed.
The cache size was previously limited by its 16-bit indices.
Use the space in the structure previously filled by the `overflow` field
to extend the indices to 32 bits.
With this patch, only the HW and RAM limit the number of MRs.

Fixes: 974f1e7ef146 ("net/mlx5: add new memory region support")
Cc: stable@dpdk.org

Signed-off-by: Dmitry Kozlyuk <dkozlyuk@nvidia.com>
---
v2: fix warnings in debug mode and with assertions enabled (Raslan).

 drivers/common/mlx5/mlx5_common.c    |  30 +++++
 drivers/common/mlx5/mlx5_common_mr.c | 160 ++++++++++++++++++++-------
 drivers/common/mlx5/mlx5_common_mr.h |   7 +-
 3 files changed, 151 insertions(+), 46 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_common.c b/drivers/common/mlx5/mlx5_common.c
index ef1604d223..89fef2b535 100644
--- a/drivers/common/mlx5/mlx5_common.c
+++ b/drivers/common/mlx5/mlx5_common.c
@@ -1082,6 +1082,7 @@ mlx5_common_dev_dma_map(struct rte_device *rte_dev, void *addr,
 			uint64_t iova __rte_unused, size_t len)
 {
 	struct mlx5_common_device *dev;
+	struct mlx5_mr_btree *bt;
 	struct mlx5_mr *mr;
 
 	dev = to_mlx5_device(rte_dev);
@@ -1099,7 +1100,36 @@ mlx5_common_dev_dma_map(struct rte_device *rte_dev, void *addr,
 		rte_errno = EINVAL;
 		return -1;
 	}
+try_insert:
 	rte_rwlock_write_lock(&dev->mr_scache.rwlock);
+	bt = &dev->mr_scache.cache;
+	if (bt->len == bt->size) {
+		uint32_t size;
+		int ret;
+
+		size = bt->size + 1;
+		MLX5_ASSERT(size > bt->size);
+		/*
+		 * Avoid deadlock (numbers show the sequence of events):
+		 *    mlx5_mr_create_primary():
+		 *        1) take EAL memory lock
+		 *        3) take MR lock
+		 *    this function:
+		 *        2) take MR lock
+		 *        4) take EAL memory lock while allocating the new cache
+		 * Releasing the MR lock before step 4
+		 * allows another thread to execute step 3.
+		 */
+		rte_rwlock_write_unlock(&dev->mr_scache.rwlock);
+		ret = mlx5_mr_expand_cache(&dev->mr_scache, size,
+					   rte_dev->numa_node);
+		if (ret < 0) {
+			mlx5_mr_free(mr, dev->mr_scache.dereg_mr_cb);
+			rte_errno = ret;
+			return -1;
+		}
+		goto try_insert;
+	}
 	LIST_INSERT_HEAD(&dev->mr_scache.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mlx5_mr_insert_cache(&dev->mr_scache, mr);
diff --git a/drivers/common/mlx5/mlx5_common_mr.c b/drivers/common/mlx5/mlx5_common_mr.c
index 06e4c8f187..8d8bec99a9 100644
--- a/drivers/common/mlx5/mlx5_common_mr.c
+++ b/drivers/common/mlx5/mlx5_common_mr.c
@@ -78,7 +78,7 @@ mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
  *   0 on success, -1 on failure.
  */
 static int
-mr_btree_expand(struct mlx5_mr_btree *bt, int n)
+mr_btree_expand(struct mlx5_mr_btree *bt, uint32_t n)
 {
 	void *mem;
 	int ret = 0;
@@ -123,11 +123,11 @@ mr_btree_expand(struct mlx5_mr_btree *bt, int n)
  *   Searched LKey on success, UINT32_MAX on no match.
  */
 static uint32_t
-mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
+mr_btree_lookup(struct mlx5_mr_btree *bt, uint32_t *idx, uintptr_t addr)
 {
 	struct mr_cache_entry *lkp_tbl;
-	uint16_t n;
-	uint16_t base = 0;
+	uint32_t n;
+	uint32_t base = 0;
 
 	MLX5_ASSERT(bt != NULL);
 	lkp_tbl = *bt->table;
@@ -137,7 +137,7 @@ mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
 				    lkp_tbl[0].lkey == UINT32_MAX));
 	/* Binary search. */
 	do {
-		register uint16_t delta = n >> 1;
+		register uint32_t delta = n >> 1;
 
 		if (addr < lkp_tbl[base + delta].start) {
 			n = delta;
@@ -169,7 +169,7 @@ static int
 mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
 {
 	struct mr_cache_entry *lkp_tbl;
-	uint16_t idx = 0;
+	uint32_t idx = 0;
 	size_t shift;
 
 	MLX5_ASSERT(bt != NULL);
@@ -185,11 +185,8 @@ mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
 		/* Already exist, return. */
 		return 0;
 	}
-	/* If table is full, return error. */
-	if (unlikely(bt->len == bt->size)) {
-		bt->overflow = 1;
-		return -1;
-	}
+	/* Caller must ensure that there is enough place for a new entry. */
+	MLX5_ASSERT(bt->len < bt->size);
 	/* Insert entry. */
 	++idx;
 	shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
@@ -273,7 +270,7 @@ void
 mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
 {
 #ifdef RTE_LIBRTE_MLX5_DEBUG
-	int idx;
+	uint32_t idx;
 	struct mr_cache_entry *lkp_tbl;
 
 	if (bt == NULL)
@@ -409,13 +406,8 @@ mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache,
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&share_cache->cache, &entry) < 0) {
-			/*
-			 * Overflowed, but the global table cannot be expanded
-			 * because of deadlock.
-			 */
+		if (mr_btree_insert(&share_cache->cache, &entry) < 0)
 			return -1;
-		}
 	}
 	return 0;
 }
@@ -477,26 +469,12 @@ static uint32_t
 mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
 		     struct mr_cache_entry *entry, uintptr_t addr)
 {
-	uint16_t idx;
-	uint32_t lkey = UINT32_MAX;
-	struct mlx5_mr *mr;
+	uint32_t idx;
+	uint32_t lkey;
 
-	/*
-	 * If the global cache has overflowed since it failed to expand the
-	 * B-tree table, it can't have all the existing MRs. Then, the address
-	 * has to be searched by traversing the original MR list instead, which
-	 * is very slow path. Otherwise, the global cache is all inclusive.
-	 */
-	if (!unlikely(share_cache->cache.overflow)) {
-		lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
-		if (lkey != UINT32_MAX)
-			*entry = (*share_cache->cache.table)[idx];
-	} else {
-		/* Falling back to the slowest path. */
-		mr = mlx5_mr_lookup_list(share_cache, entry, addr);
-		if (mr != NULL)
-			lkey = entry->lkey;
-	}
+	lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
+	if (lkey != UINT32_MAX)
+		*entry = (*share_cache->cache.table)[idx];
 	MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
 					   addr < entry->end));
 	return lkey;
@@ -528,7 +506,6 @@ mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache)
 	DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache);
 	/* Flush cache to rebuild. */
 	share_cache->cache.len = 1;
-	share_cache->cache.overflow = 0;
 	/* Iterate all the existing MRs. */
 	LIST_FOREACH(mr, &share_cache->mr_list, mr)
 		if (mlx5_mr_insert_cache(share_cache, mr) < 0)
@@ -584,6 +561,74 @@ mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
 	return 1;
 }
 
+/**
+ * Get the number of virtually-contiguous chunks in the MR.
+ * HW MR does not need to be already created to use this function.
+ *
+ * @param mr
+ *   Pointer to the MR.
+ *
+ * @return
+ *   Number of chunks.
+ */
+static uint32_t
+mr_get_chunk_count(const struct mlx5_mr *mr)
+{
+	uint32_t i, count = 0;
+	bool was_in_chunk = false;
+	bool is_in_chunk;
+
+	/* There is only one chunk in case of external memory. */
+	if (mr->msl == NULL)
+		return 1;
+	for (i = 0; i < mr->ms_bmp_n; i++) {
+		is_in_chunk = rte_bitmap_get(mr->ms_bmp, i);
+		if (!was_in_chunk && is_in_chunk)
+			count++;
+		was_in_chunk = is_in_chunk;
+	}
+	return count;
+}
+
+/**
+ * Thread-safely expand the global MR cache to at least @p new_size slots.
+ *
+ * @param share_cache
+ *  Shared MR cache for locking.
+ * @param new_size
+ *  Desired cache size.
+ * @param socket
+ *  NUMA node.
+ *
+ * @return
+ *  0 on success, negative on failure and rte_errno is set.
+ */
+int
+mlx5_mr_expand_cache(struct mlx5_mr_share_cache *share_cache,
+		     uint32_t size, int socket)
+{
+	struct mlx5_mr_btree cache = {0};
+	struct mlx5_mr_btree *bt;
+	struct mr_cache_entry *lkp_tbl;
+	int ret;
+
+	size = rte_align32pow2(size);
+	ret = mlx5_mr_btree_init(&cache, size, socket);
+	if (ret < 0)
+		return ret;
+	rte_rwlock_write_lock(&share_cache->rwlock);
+	bt = &share_cache->cache;
+	lkp_tbl = *bt->table;
+	if (cache.size > bt->size) {
+		rte_memcpy(cache.table, lkp_tbl, bt->len * sizeof(lkp_tbl[0]));
+		RTE_SWAP(*bt, cache);
+		DRV_LOG(DEBUG, "Global MR cache expanded to %u slots", size);
+	}
+	rte_rwlock_write_unlock(&share_cache->rwlock);
+	mlx5_mr_btree_free(&cache);
+	return 0;
+}
+
 /**
  * Create a new global Memory Region (MR) for a missing virtual address.
  * This API should be called on a secondary process, then a request is sent to
@@ -659,12 +704,14 @@ mlx5_mr_create_primary(void *pd,
 	struct mr_find_contig_memsegs_data data_re;
 	const struct rte_memseg_list *msl;
 	const struct rte_memseg *ms;
+	struct mlx5_mr_btree *bt;
 	struct mlx5_mr *mr = NULL;
 	int ms_idx_shift = -1;
 	uint32_t bmp_size;
 	void *bmp_mem;
 	uint32_t ms_n;
 	uint32_t n;
+	uint32_t chunks_n;
 	size_t len;
 
 	DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr);
@@ -676,6 +723,7 @@ mlx5_mr_create_primary(void *pd,
 	 * is quite opportunistic.
 	 */
 	mlx5_mr_garbage_collect(share_cache);
+find_range:
 	/*
 	 * If enabled, find out a contiguous virtual address chunk in use, to
 	 * which the given address belongs, in order to register maximum range.
@@ -827,6 +875,33 @@ mlx5_mr_create_primary(void *pd,
 	len = data.end - data.start;
 	mr->ms_bmp_n = len / msl->page_sz;
 	MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
+	/*
+	 * It is now known how many entries will be used in the global cache.
+	 * If there is not enough, expand the cache.
+	 * This cannot be done while holding the memory hotplug lock.
+	 * While it is released, memory layout may change,
+	 * so the process must be repeated from the beginning.
+	 */
+	bt = &share_cache->cache;
+	chunks_n = mr_get_chunk_count(mr);
+	if (bt->len + chunks_n > bt->size) {
+		struct mlx5_common_device *cdev;
+		uint32_t size;
+
+		size = bt->size + chunks_n;
+		MLX5_ASSERT(size > bt->size);
+		cdev = container_of(share_cache, struct mlx5_common_device,
+				    mr_scache);
+		rte_rwlock_write_unlock(&share_cache->rwlock);
+		rte_mcfg_mem_read_unlock();
+		if (mlx5_mr_expand_cache(share_cache, size,
+					 cdev->dev->numa_node) < 0) {
+			DRV_LOG(ERR, "Failed to expand global MR cache to %u slots",
+				size);
+			goto err_nolock;
+		}
+		goto find_range;
+	}
 	/*
 	 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can
 	 * be called with holding the memory lock because it doesn't use
@@ -937,7 +1012,7 @@ mr_lookup_caches(struct mlx5_mr_ctrl *mr_ctrl,
 		container_of(share_cache, struct mlx5_common_device, mr_scache);
 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
 	uint32_t lkey;
-	uint16_t idx;
+	uint32_t idx;
 
 	/* If local cache table is full, try to double it. */
 	if (unlikely(bt->len == bt->size))
@@ -988,7 +1063,7 @@ static uint32_t
 mlx5_mr_addr2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr)
 {
 	uint32_t lkey;
-	uint16_t bh_idx = 0;
+	uint32_t bh_idx = 0;
 	/* Victim in top-half cache to replace with new entry. */
 	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
 
@@ -1085,7 +1160,6 @@ mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
 	memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
 	/* Reset the B-tree table. */
 	mr_ctrl->cache_bh.len = 1;
-	mr_ctrl->cache_bh.overflow = 0;
 	/* Update the generation number. */
 	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
 	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
@@ -1933,7 +2007,7 @@ mlx5_mr_mempool_populate_cache(struct mlx5_mr_ctrl *mr_ctrl,
 		struct mlx5_mempool_mr *mr = &mpr->mrs[i];
 		struct mr_cache_entry entry;
 		uint32_t lkey;
-		uint16_t idx;
+		uint32_t idx;
 
 		lkey = mr_btree_lookup(bt, &idx, (uintptr_t)mr->pmd_mr.addr);
 		if (lkey != UINT32_MAX)
@@ -1971,7 +2045,7 @@ mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl,
 {
 	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
 	uint32_t lkey;
-	uint16_t bh_idx = 0;
+	uint32_t bh_idx = 0;
 
 	/* Binary-search MR translation table. */
 	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
diff --git a/drivers/common/mlx5/mlx5_common_mr.h b/drivers/common/mlx5/mlx5_common_mr.h
index cf384b6748..213f5427cb 100644
--- a/drivers/common/mlx5/mlx5_common_mr.h
+++ b/drivers/common/mlx5/mlx5_common_mr.h
@@ -56,9 +56,8 @@ struct mr_cache_entry {
 
 /* MR Cache table for Binary search. */
 struct mlx5_mr_btree {
-	uint16_t len; /* Number of entries. */
-	uint16_t size; /* Total number of entries. */
-	int overflow; /* Mark failure of table expansion. */
+	uint32_t len; /* Number of entries. */
+	uint32_t size; /* Total number of entries. */
 	struct mr_cache_entry (*table)[];
 } __rte_packed;
 
@@ -218,6 +217,8 @@ void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused);
 __rte_internal
 uint32_t mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl,
 			       struct rte_mempool *mp, uintptr_t addr);
+int mlx5_mr_expand_cache(struct mlx5_mr_share_cache *share_cache,
+			 uint32_t new_size, int socket);
 void mlx5_mr_release_cache(struct mlx5_mr_share_cache *mr_cache);
 int mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket);
 void mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused);
-- 
2.25.1



* RE: [PATCH v2] common/mlx5: fix non-expandable global MR cache
  2022-06-29 22:08 ` [PATCH v2] " Dmitry Kozlyuk
@ 2022-07-04 14:22   ` Matan Azrad
  2022-07-04 16:20   ` Raslan Darawsheh
  1 sibling, 0 replies; 4+ messages in thread
From: Matan Azrad @ 2022-07-04 14:22 UTC (permalink / raw)
  To: Dmitry Kozlyuk, dev; +Cc: Raslan Darawsheh, stable, Slava Ovsiienko



From: Dmitry Kozlyuk
> Sent: Thursday, June 30, 2022 1:08 AM
> To: dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@nvidia.com>; stable@dpdk.org; Matan
> Azrad <matan@nvidia.com>; Slava Ovsiienko <viacheslavo@nvidia.com>
> Subject: [PATCH v2] common/mlx5: fix non-expandable global MR cache
> 
> The number of memory regions (MRs) that the MLX5 PMD can use was limited
> to 512 per IB device, the size of the global MR cache, which was fixed at
> compile time.
> The cache allows searching for an MR LKey by address efficiently, therefore
> it is the last place searched on the data path (the global MR database is
> skipped because it would be too slow).
> If the application logic caused the PMD to create more than 512 MRs, which
> can be the case with external memory, those MRs would never be found on
> the data path and would later cause a HW failure.
> 
> The cache size was fixed because at the time of overflow the EAL memory
> hotplug lock may be held, which prohibits allocating a larger cache (it must
> reside in DPDK memory for multi-process support).
> This patch adds logic to release the necessary locks, extend the cache, and
> retry the insertion of new entries.
> 
> The `mlx5_mr_btree` structure had an `overflow` field that was set when a
> cache (not only the global one) could not accept new entries.
> However, it was only checked for the global cache, because the caches of
> upper layers were dynamically expandable.
> With the global cache size limitation removed, this field is no longer needed.
> The cache size was previously limited by its 16-bit indices.
> Use the space in the structure previously filled by the `overflow` field to
> extend the indices to 32 bits.
> With this patch, only the HW and RAM limit the number of MRs.
> 
> Fixes: 974f1e7ef146 ("net/mlx5: add new memory region support")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Dmitry Kozlyuk <dkozlyuk@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>


* RE: [PATCH v2] common/mlx5: fix non-expandable global MR cache
  2022-06-29 22:08 ` [PATCH v2] " Dmitry Kozlyuk
  2022-07-04 14:22   ` Matan Azrad
@ 2022-07-04 16:20   ` Raslan Darawsheh
  1 sibling, 0 replies; 4+ messages in thread
From: Raslan Darawsheh @ 2022-07-04 16:20 UTC (permalink / raw)
  To: Dmitry Kozlyuk, dev; +Cc: stable, Matan Azrad, Slava Ovsiienko

Hi,

> -----Original Message-----
> From: Dmitry Kozlyuk <dkozlyuk@nvidia.com>
> Sent: Thursday, June 30, 2022 1:08 AM
> To: dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@nvidia.com>; stable@dpdk.org; Matan
> Azrad <matan@nvidia.com>; Slava Ovsiienko <viacheslavo@nvidia.com>
> Subject: [PATCH v2] common/mlx5: fix non-expandable global MR cache
> 
> The number of memory regions (MRs) that the MLX5 PMD can use
> was limited to 512 per IB device, the size of the global MR cache,
> which was fixed at compile time.
> The cache allows searching for an MR LKey by address efficiently,
> therefore it is the last place searched on the data path
> (the global MR database is skipped because it would be too slow).
> If the application logic caused the PMD to create more than 512 MRs,
> which can be the case with external memory,
> those MRs would never be found on the data path
> and would later cause a HW failure.
> 
> The cache size was fixed because at the time of overflow
> the EAL memory hotplug lock may be held,
> which prohibits allocating a larger cache
> (it must reside in DPDK memory for multi-process support).
> This patch adds logic to release the necessary locks,
> extend the cache, and retry the insertion of new entries.
> 
> The `mlx5_mr_btree` structure had an `overflow` field
> that was set when a cache (not only the global one)
> could not accept new entries.
> However, it was only checked for the global cache,
> because the caches of upper layers were dynamically expandable.
> With the global cache size limitation removed, this field is no longer needed.
> The cache size was previously limited by its 16-bit indices.
> Use the space in the structure previously filled by the `overflow` field
> to extend the indices to 32 bits.
> With this patch, only the HW and RAM limit the number of MRs.
> 
> Fixes: 974f1e7ef146 ("net/mlx5: add new memory region support")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Dmitry Kozlyuk <dkozlyuk@nvidia.com>
> ---
> v2: fix warnings in debug mode and with assertions enabled (Raslan).
> 

Patch applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh


end of thread (newest message: 2022-07-04 16:20 UTC)

Thread overview: 4 messages
2022-06-24 20:35 [PATCH] common/mlx5: fix non-expandable global MR cache Dmitry Kozlyuk
2022-06-29 22:08 ` [PATCH v2] " Dmitry Kozlyuk
2022-07-04 14:22   ` Matan Azrad
2022-07-04 16:20   ` Raslan Darawsheh
