DPDK patches and discussions
* [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device
@ 2019-04-12 15:45 Viacheslav Ovsiienko
  2019-04-12 15:45 ` Viacheslav Ovsiienko
                   ` (3 more replies)
  0 siblings, 4 replies; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-12 15:45 UTC (permalink / raw)
  To: dev; +Cc: shahafs

The multiport Infiniband device support was introduced in [1].
All active ports belonging to the same Infiniband device use a
single shared Infiniband context of that device and share its
resources:
  - QPs are created within the shared context
  - Verbs flows are also created with the port index specified
  - DV/DR resources
  - Protection Domain
  - Event Handlers

This patch adds support for sharing Memory Regions between the
ports created on the base of a multiport Infiniband device.
The mlx5 datapath uses a layered cache subsystem for
allocating/releasing Memory Regions; only the lowest layer, L3,
is shared, while the upper per-queue layers stay local for
performance reasons.

[1] http://patches.dpdk.org/cover/51800/
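
The per-queue caches are kept consistent with the shared L3 cache through
a generation counter (see the dev_gen/dev_gen_ptr handling in the diff
below). A self-contained sketch of that mechanism follows; the names are
illustrative only, not the driver's real API:

/* Generation-based invalidation between the shared cache and local ones. */
#include <stdint.h>
#include <stdio.h>

struct shared_mr {              /* per IB device, shared by all its ports */
    uint32_t dev_gen;           /* bumped whenever an MR is freed/rebuilt */
};

struct queue_mr_ctrl {          /* per Tx/Rx queue, never shared */
    uint32_t *dev_gen_ptr;      /* points at shared_mr.dev_gen */
    uint32_t cur_gen;           /* generation this queue has seen so far */
    int cached_entries;         /* stands in for the local L1/L2 caches */
};

static void queue_lookup(struct queue_mr_ctrl *ctrl)
{
    if (*ctrl->dev_gen_ptr != ctrl->cur_gen) {
        /* The shared cache changed: drop local entries before reuse. */
        ctrl->cached_entries = 0;
        ctrl->cur_gen = *ctrl->dev_gen_ptr;
        printf("local cache flushed, gen=%u\n", (unsigned int)ctrl->cur_gen);
    }
    /* A real lookup would continue in the local, then the global cache. */
}

int main(void)
{
    struct shared_mr sh = { .dev_gen = 0 };
    struct queue_mr_ctrl q = { &sh.dev_gen, 0, 3 };

    queue_lookup(&q);   /* generations match, local cache kept */
    sh.dev_gen++;       /* e.g. an MR was freed on another port */
    queue_lookup(&q);   /* mismatch detected, local cache dropped */
    return 0;
}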

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c     |  40 +++++++----
 drivers/net/mlx5/mlx5.h     |  15 ++--
 drivers/net/mlx5/mlx5_mr.c  | 163 ++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5_mr.h  |   5 +-
 drivers/net/mlx5/mlx5_txq.c |   2 +-
 5 files changed, 121 insertions(+), 104 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 9ff50df..7ff5c8b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -144,6 +144,7 @@ struct mlx5_dev_spawn_data {
 	struct mlx5_switch_info info; /**< Switch information. */
 	struct ibv_device *ibv_dev; /**< Associated IB device. */
 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
 };
 
 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
@@ -222,6 +223,7 @@ struct mlx5_dev_spawn_data {
 		sizeof(sh->ibdev_name));
 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
 		sizeof(sh->ibdev_path));
+	sh->pci_dev = spawn->pci_dev;
 	pthread_mutex_init(&sh->intr_mutex, NULL);
 	/*
 	 * Setting port_id to max unallowed value means
@@ -236,6 +238,22 @@ struct mlx5_dev_spawn_data {
 		err = ENOMEM;
 		goto error;
 	}
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 *
+	 * At this point the device is not added to the memory
+	 * event list yet, context is just being created.
+	 */
+	err = mlx5_mr_btree_init(&sh->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 sh->pci_dev->device.numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
 exit:
 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
@@ -283,6 +301,8 @@ struct mlx5_dev_spawn_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	if (--sh->refcnt)
 		goto exit;
+	/* Release created Memory Regions. */
+	mlx5_mr_release(sh);
 	LIST_REMOVE(sh, next);
 	/*
 	 *  Ensure there is no async event handler installed.
@@ -615,7 +635,10 @@ struct mlx5_dev_spawn_data {
 	}
 	mlx5_proc_priv_uninit(dev);
 	mlx5_mprq_free_mp(dev);
-	mlx5_mr_release(dev);
+	/* Remove from memory callback device list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_REMOVE(priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
@@ -1493,19 +1516,6 @@ struct mlx5_dev_spawn_data {
 		goto error;
 	}
 	priv->config.flow_prio = err;
-	/*
-	 * Once the device is added to the list of memory event
-	 * callback, its global MR cache table cannot be expanded
-	 * on the fly because of deadlock. If it overflows, lookup
-	 * should be done by searching MR list linearly, which is slow.
-	 */
-	err = mlx5_mr_btree_init(&priv->mr.cache,
-				 MLX5_MR_BTREE_CACHE_N * 2,
-				 eth_dev->device->numa_node);
-	if (err) {
-		err = rte_errno;
-		goto error;
-	}
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
@@ -1702,6 +1712,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = i;
 			list[ns].ibv_dev = ibv_match[0];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = mlx5_nl_ifindex
 					(nl_rdma, list[ns].ibv_dev->name, i);
 			if (!list[ns].ifindex) {
@@ -1768,6 +1779,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = 1;
 			list[ns].ibv_dev = ibv_match[i];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = 0;
 			if (nl_rdma >= 0)
 				list[ns].ifindex = mlx5_nl_ifindex
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 14c7f3c..8eb1019 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -257,6 +257,14 @@ struct mlx5_ibv_shared {
 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
+	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	struct { /* Global device shared L3 MR cache. */
+		uint32_t dev_gen; /* Generation number to flush local caches. */
+		rte_rwlock_t rwlock; /* MR Lock. */
+		struct mlx5_mr_btree cache; /* Global MR cache table. */
+		struct mlx5_mr_list mr_list; /* Registered MR list. */
+		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
+	} mr;
 	/* Shared DV/DR flow data section. */
 	pthread_mutex_t dv_mutex; /* DV context mutex. */
 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
@@ -323,13 +331,6 @@ struct mlx5_priv {
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
 	/* Flow counters. */
-	struct {
-		uint32_t dev_gen; /* Generation number to flush local caches. */
-		rte_rwlock_t rwlock; /* MR Lock. */
-		struct mlx5_mr_btree cache; /* Global MR cache table. */
-		struct mlx5_mr_list mr_list; /* Registered MR list. */
-		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
-	} mr;
 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index a3732d4..5140cc2 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -36,7 +36,7 @@ struct mr_update_mp_data {
 
 /**
  * Expand B-tree table to a given size. Can't be called with holding
- * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
+ * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
  *
  * @param bt
  *   Pointer to B-tree structure.
@@ -350,7 +350,7 @@ struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -382,7 +382,7 @@ struct mr_update_mp_data {
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -420,6 +420,7 @@ struct mr_update_mp_data {
 	      uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -430,10 +431,10 @@ struct mr_update_mp_data {
 	 * has to be searched by traversing the original MR list instead, which
 	 * is very slow path. Otherwise, the global cache is all inclusive.
 	 */
-	if (!unlikely(priv->mr.cache.overflow)) {
-		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	if (!unlikely(sh->mr.cache.overflow)) {
+		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 		if (lkey != UINT32_MAX)
-			*entry = (*priv->mr.cache.table)[idx];
+			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
 		mr = mr_lookup_dev_list(dev, entry, addr);
@@ -468,13 +469,12 @@ struct mr_update_mp_data {
 /**
  * Release resources of detached MR having no online entry.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
+mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 
@@ -484,11 +484,11 @@ struct mr_update_mp_data {
 	 * MR can't be freed with holding the lock because rte_free() could call
 	 * memory free callback function. This will be a deadlock situation.
 	 */
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach the whole free list and release it after unlocking. */
-	free_list = priv->mr.mr_free_list;
-	LIST_INIT(&priv->mr.mr_free_list);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	free_list = sh->mr.mr_free_list;
+	LIST_INIT(&sh->mr.mr_free_list);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Release resources. */
 	mr_next = LIST_FIRST(&free_list);
 	while (mr_next != NULL) {
@@ -548,12 +548,12 @@ struct mr_update_mp_data {
 		      dev->data->port_id, (void *)addr);
 		return UINT32_MAX;
 	}
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
 	DEBUG("port %u MR CREATED by primary process for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
 	      dev->data->port_id, (void *)addr,
@@ -582,6 +582,7 @@ struct mr_update_mp_data {
 		       uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_dev_config *config = &priv->config;
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	const struct rte_memseg_list *msl;
@@ -602,12 +603,12 @@ struct mr_update_mp_data {
 		dev->data->port_id, (void *)addr);
 	/*
 	 * Release detached MRs if any. This can't be called with holding either
-	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
+	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
 	 * been detached by the memory free event but it couldn't be released
 	 * inside the callback due to deadlock. As a result, releasing resources
 	 * is quite opportunistic.
 	 */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 	/*
 	 * If enabled, find out a contiguous virtual address chunk in use, to
 	 * which the given address belongs, in order to register maximum range.
@@ -710,7 +711,7 @@ struct mr_update_mp_data {
 		goto alloc_resources;
 	}
 	assert(data.msl == data_re.msl);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/*
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
@@ -721,10 +722,10 @@ struct mr_update_mp_data {
 		 * low-on-memory. Then, this entry will have to be searched
 		 * here again.
 		 */
-		mr_btree_insert(&priv->mr.cache, entry);
+		mr_btree_insert(&sh->mr.cache, entry);
 		DEBUG("port %u found MR for %p on final lookup, abort",
 		      dev->data->port_id, (void *)addr);
-		rte_rwlock_write_unlock(&priv->mr.rwlock);
+		rte_rwlock_write_unlock(&sh->mr.rwlock);
 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 		/*
 		 * Must be unlocked before calling rte_free() because
@@ -769,7 +770,7 @@ struct mr_update_mp_data {
 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
 	 * through mlx5_alloc_verbs_buf().
 	 */
-	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
+	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
 				       IBV_ACCESS_LOCAL_WRITE);
 	if (mr->ibv_mr == NULL) {
 		DEBUG("port %u fail to create a verbs MR for address (%p)",
@@ -779,7 +780,7 @@ struct mr_update_mp_data {
 	}
 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
 	assert(mr->ibv_mr->length == len);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	DEBUG("port %u MR CREATED (%p) for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
@@ -792,11 +793,11 @@ struct mr_update_mp_data {
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 	return entry->lkey;
 err_mrlock:
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 err_memlock:
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 err_nolock:
@@ -854,14 +855,15 @@ struct mr_update_mp_data {
 mr_rebuild_dev_cache(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
 	/* Flush cache to rebuild. */
-	priv->mr.cache.len = 1;
-	priv->mr.cache.overflow = 0;
+	sh->mr.cache.len = 1;
+	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
 		if (mr_insert_dev_cache(dev, mr) < 0)
 			return;
 }
@@ -888,6 +890,7 @@ struct mr_update_mp_data {
 mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
@@ -901,7 +904,7 @@ struct mr_update_mp_data {
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
 	assert(len == RTE_ALIGN(len, msl->page_sz));
 	ms_n = len / msl->page_sz;
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Clear bits of freed memsegs from MR. */
 	for (i = 0; i < ms_n; ++i) {
 		const struct rte_memseg *ms;
@@ -928,7 +931,7 @@ struct mr_update_mp_data {
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
-			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 			DEBUG("port %u remove MR(%p) from list",
 			      dev->data->port_id, (void *)mr);
 		}
@@ -949,12 +952,12 @@ struct mr_update_mp_data {
 		 * generation below) will be guaranteed to be seen by other core
 		 * before the core sees the newly allocated memory.
 		 */
-		++priv->mr.dev_gen;
+		++sh->mr.dev_gen;
 		DEBUG("broadcasting local cache flush, gen=%d",
-		      priv->mr.dev_gen);
+		      sh->mr.dev_gen);
 		rte_smp_wmb();
 	}
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 }
 
 /**
@@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
 		   struct mlx5_mr_cache *entry, uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
 	uint16_t idx;
 	uint32_t lkey;
@@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
 	if (unlikely(bt->len == bt->size))
 		mr_btree_expand(bt, bt->size << 1);
 	/* Look up in the global cache. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
-	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
+	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 	if (lkey != UINT32_MAX) {
 		/* Found. */
-		*entry = (*priv->mr.cache.table)[idx];
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		*entry = (*sh->mr.cache.table)[idx];
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		/*
 		 * Update local cache. Even if it fails, return the found entry
 		 * to update top-half cache. Next time, this entry will be found
@@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
 		mr_btree_insert(bt, entry);
 		return lkey;
 	}
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	/* First time to see the address? Create a new MR. */
 	lkey = mlx5_mr_create(dev, entry, addr);
 	/*
@@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
 	struct mr_update_mp_data *data = opaque;
 	struct rte_eth_dev *dev = data->dev;
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
 	struct mlx5_mr *mr = NULL;
 	uintptr_t addr = (uintptr_t)memhdr->addr;
@@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
 
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	lkey = mr_lookup_dev(dev, &entry, addr);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
@@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
 		data->ret = -1;
 		return;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
 }
@@ -1345,6 +1350,7 @@ struct mr_update_mp_data {
 	struct rte_eth_dev *dev;
 	struct mlx5_mr *mr;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 
 	dev = pci_dev_to_eth_dev(pdev);
 	if (!dev) {
@@ -1361,11 +1367,12 @@ struct mr_update_mp_data {
 		rte_errno = EINVAL;
 		return -1;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	sh = priv->sh;
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1390,6 +1397,7 @@ struct mr_update_mp_data {
 {
 	struct rte_eth_dev *dev;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_mr *mr;
 	struct mlx5_mr_cache entry;
 
@@ -1401,10 +1409,11 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	priv = dev->data->dev_private;
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	sh = priv->sh;
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
 	if (!mr) {
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
 				 "to PCI device %p", (uintptr_t)addr,
 				 (void *)pdev);
@@ -1412,7 +1421,7 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	LIST_REMOVE(mr, mr);
-	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
 	mr_rebuild_dev_cache(dev);
@@ -1425,11 +1434,11 @@ struct mr_update_mp_data {
 	 * generation below) will be guaranteed to be seen by other core
 	 * before the core sees the newly allocated memory.
 	 */
-	++priv->mr.dev_gen;
+	++sh->mr.dev_gen;
 	DEBUG("broadcasting local cache flush, gen=%d",
-			priv->mr.dev_gen);
+			sh->mr.dev_gen);
 	rte_smp_wmb();
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1550,25 +1559,24 @@ struct mr_update_mp_data {
 /**
  * Dump all the created MRs and the global cache entries.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
+mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
 {
 #ifndef NDEBUG
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 	int mr_n = 0;
 	int chunk_n = 0;
 
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
-		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
-		      dev->data->port_id, mr_n++,
+		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
+		      sh->ibdev_name, mr_n++,
 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
 		      mr->ms_n, mr->ms_bmp_n);
 		if (mr->ms_n == 0)
@@ -1583,45 +1591,40 @@ struct mr_update_mp_data {
 			      chunk_n++, ret.start, ret.end);
 		}
 	}
-	DEBUG("port %u dumping global cache", dev->data->port_id);
-	mlx5_mr_btree_dump(&priv->mr.cache);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	DEBUG("device %s dumping global cache", sh->ibdev_name);
+	mlx5_mr_btree_dump(&sh->mr.cache);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 #endif
 }
 
 /**
- * Release all the created MRs and resources. Remove device from memory callback
+ * Release all the created MRs and resources for shared device context.
  * list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_release(struct rte_eth_dev *dev)
+mlx5_mr_release(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 
-	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
-		mlx5_mr_dump_dev(dev);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+		mlx5_mr_dump_dev(sh);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach from MR list and move to free list. */
-	mr_next = LIST_FIRST(&priv->mr.mr_list);
+	mr_next = LIST_FIRST(&sh->mr.mr_list);
 	while (mr_next != NULL) {
 		struct mlx5_mr *mr = mr_next;
 
 		mr_next = LIST_NEXT(mr, mr);
 		LIST_REMOVE(mr, mr);
-		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	}
-	LIST_INIT(&priv->mr.mr_list);
+	LIST_INIT(&sh->mr.mr_list);
 	/* Free global cache. */
-	mlx5_mr_btree_free(&priv->mr.cache);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	mlx5_mr_btree_free(&sh->mr.cache);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Free all remaining MRs. */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 }
diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
index 786f6a3..89e89b7 100644
--- a/drivers/net/mlx5/mlx5_mr.h
+++ b/drivers/net/mlx5/mlx5_mr.h
@@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
 } __rte_packed;
 
+struct mlx5_ibv_shared;
 extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
 extern rte_rwlock_t mlx5_mem_event_rwlock;
 
@@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 			  size_t len, void *arg);
 int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
 		      struct rte_mempool *mp);
-void mlx5_mr_release(struct rte_eth_dev *dev);
+void mlx5_mr_release(struct mlx5_ibv_shared *sh);
 
 /* Debug purpose functions. */
 void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
-void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
 
 /**
  * Look up LKey from given lookup table by linear search. Firstly look up the
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9965b2b..ed82594 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
 		goto error;
 	}
 	/* Save pointer of global generation number to check memory event. */
-	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
+	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
 	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl->txq.offloads = conf->offloads |
 			     dev->data->dev_conf.txmode.offloads;
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device
  2019-04-12 15:45 [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device Viacheslav Ovsiienko
  2019-04-12 15:45 ` Viacheslav Ovsiienko
@ 2019-04-12 19:22 ` Yongseok Koh
  2019-04-12 19:22   ` Yongseok Koh
  2019-04-15  6:42   ` Slava Ovsiienko
  2019-04-24  8:44 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
  2019-04-24  8:48 ` [dpdk-dev] [PATCH] net/mlx5: update memory event callback for shared context Viacheslav Ovsiienko
  3 siblings, 2 replies; 20+ messages in thread
From: Yongseok Koh @ 2019-04-12 19:22 UTC (permalink / raw)
  To: Slava Ovsiienko; +Cc: dev, Shahaf Shuler

On Fri, Apr 12, 2019 at 03:45:40PM +0000, Viacheslav Ovsiienko wrote:
> The multiport Infiniband device support was introduced in [1].
> All active ports belonging to the same Infiniband device use a
> single shared Infiniband context of that device and share its
> resources:
>   - QPs are created within the shared context
>   - Verbs flows are also created with the port index specified
>   - DV/DR resources
>   - Protection Domain
>   - Event Handlers
> 
> This patch adds support for sharing Memory Regions between the
> ports created on the base of a multiport Infiniband device.
> The mlx5 datapath uses a layered cache subsystem for
> allocating/releasing Memory Regions; only the lowest layer, L3,
> is shared, while the upper per-queue layers stay local for
> performance reasons.
> 
> [1] http://patches.dpdk.org/cover/51800/
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c     |  40 +++++++----
>  drivers/net/mlx5/mlx5.h     |  15 ++--
>  drivers/net/mlx5/mlx5_mr.c  | 163 ++++++++++++++++++++++----------------------
>  drivers/net/mlx5/mlx5_mr.h  |   5 +-
>  drivers/net/mlx5/mlx5_txq.c |   2 +-
>  5 files changed, 121 insertions(+), 104 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index 9ff50df..7ff5c8b 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -144,6 +144,7 @@ struct mlx5_dev_spawn_data {
>  	struct mlx5_switch_info info; /**< Switch information. */
>  	struct ibv_device *ibv_dev; /**< Associated IB device. */
>  	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
> +	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
>  };
>  
>  static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
> @@ -222,6 +223,7 @@ struct mlx5_dev_spawn_data {
>  		sizeof(sh->ibdev_name));
>  	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
>  		sizeof(sh->ibdev_path));
> +	sh->pci_dev = spawn->pci_dev;
>  	pthread_mutex_init(&sh->intr_mutex, NULL);
>  	/*
>  	 * Setting port_id to max unallowed value means
> @@ -236,6 +238,22 @@ struct mlx5_dev_spawn_data {
>  		err = ENOMEM;
>  		goto error;
>  	}
> +	/*
> +	 * Once the device is added to the list of memory event
> +	 * callback, its global MR cache table cannot be expanded
> +	 * on the fly because of deadlock. If it overflows, lookup
> +	 * should be done by searching MR list linearly, which is slow.
> +	 *
> +	 * At this point the device is not added to the memory
> +	 * event list yet, context is just being created.
> +	 */
> +	err = mlx5_mr_btree_init(&sh->mr.cache,
> +				 MLX5_MR_BTREE_CACHE_N * 2,
> +				 sh->pci_dev->device.numa_node);
> +	if (err) {
> +		err = rte_errno;
> +		goto error;
> +	}
>  	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
>  exit:
>  	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
> @@ -283,6 +301,8 @@ struct mlx5_dev_spawn_data {
>  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
>  	if (--sh->refcnt)
>  		goto exit;
> +	/* Release created Memory Regions. */
> +	mlx5_mr_release(sh);
>  	LIST_REMOVE(sh, next);
>  	/*
>  	 *  Ensure there is no async event handler installed.
> @@ -615,7 +635,10 @@ struct mlx5_dev_spawn_data {
>  	}
>  	mlx5_proc_priv_uninit(dev);
>  	mlx5_mprq_free_mp(dev);
> -	mlx5_mr_release(dev);
> +	/* Remove from memory callback device list. */
> +	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> +	LIST_REMOVE(priv, mem_event_cb);
> +	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);

I'm almost okay with this patch but I'm concerned about this part. As the
devices sharing the same context are each added to the callback list,
mlx5_mr_mem_event_free_cb() will be called multiple times with the same
addr/len. That is redundant and unnecessary. How about adding the shared
ctx to the list instead?
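
Something along these lines (an illustrative sketch only, assuming
mem_event_cb is moved from struct mlx5_priv to mlx5_ibv_shared and the
free helper is reworked to take the shared context):

static void
mlx5_mr_mem_event_cb_sketch(enum rte_mem_event event_type,
                            const void *addr, size_t len,
                            void *arg __rte_unused)
{
    struct mlx5_ibv_shared *sh;

    if (event_type != RTE_MEM_EVENT_FREE)
        return;
    rte_rwlock_read_lock(&mlx5_shared_data->mem_event_rwlock);
    /* One entry per shared IB context, so each range is freed only once. */
    LIST_FOREACH(sh, &mlx5_shared_data->mem_event_cb_list, mem_event_cb)
        mlx5_mr_mem_event_free_cb(sh, addr, len);
    rte_rwlock_read_unlock(&mlx5_shared_data->mem_event_rwlock);
}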

>  	assert(priv->sh);
>  	mlx5_free_shared_dr(priv);
>  	if (priv->rss_conf.rss_key != NULL)
> @@ -1493,19 +1516,6 @@ struct mlx5_dev_spawn_data {
>  		goto error;
>  	}
>  	priv->config.flow_prio = err;
> -	/*
> -	 * Once the device is added to the list of memory event
> -	 * callback, its global MR cache table cannot be expanded
> -	 * on the fly because of deadlock. If it overflows, lookup
> -	 * should be done by searching MR list linearly, which is slow.
> -	 */
> -	err = mlx5_mr_btree_init(&priv->mr.cache,
> -				 MLX5_MR_BTREE_CACHE_N * 2,
> -				 eth_dev->device->numa_node);
> -	if (err) {
> -		err = rte_errno;
> -		goto error;
> -	}
>  	/* Add device to memory callback list. */
>  	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
>  	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
> @@ -1702,6 +1712,7 @@ struct mlx5_dev_spawn_data {
>  			list[ns].ibv_port = i;
>  			list[ns].ibv_dev = ibv_match[0];
>  			list[ns].eth_dev = NULL;
> +			list[ns].pci_dev = pci_dev;
>  			list[ns].ifindex = mlx5_nl_ifindex
>  					(nl_rdma, list[ns].ibv_dev->name, i);
>  			if (!list[ns].ifindex) {
> @@ -1768,6 +1779,7 @@ struct mlx5_dev_spawn_data {
>  			list[ns].ibv_port = 1;
>  			list[ns].ibv_dev = ibv_match[i];
>  			list[ns].eth_dev = NULL;
> +			list[ns].pci_dev = pci_dev;
>  			list[ns].ifindex = 0;
>  			if (nl_rdma >= 0)
>  				list[ns].ifindex = mlx5_nl_ifindex
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 14c7f3c..8eb1019 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -257,6 +257,14 @@ struct mlx5_ibv_shared {
>  	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
>  	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
>  	struct ibv_device_attr_ex device_attr; /* Device properties. */
> +	struct rte_pci_device *pci_dev; /* Backend PCI device. */
> +	struct { /* Global device shared L3 MR cache. */

The comment sounds redundant, and this structure holds not only the global
cache but also the MR lists.

> +		uint32_t dev_gen; /* Generation number to flush local caches. */
> +		rte_rwlock_t rwlock; /* MR Lock. */
> +		struct mlx5_mr_btree cache; /* Global MR cache table. */
> +		struct mlx5_mr_list mr_list; /* Registered MR list. */
> +		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> +	} mr;
>  	/* Shared DV/DR flow data section. */
>  	pthread_mutex_t dv_mutex; /* DV context mutex. */
>  	uint32_t dv_refcnt; /* DV/DR data reference counter. */
> @@ -323,13 +331,6 @@ struct mlx5_priv {
>  	struct mlx5_flows ctrl_flows; /* Control flow rules. */
>  	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
>  	/* Flow counters. */
> -	struct {
> -		uint32_t dev_gen; /* Generation number to flush local caches. */
> -		rte_rwlock_t rwlock; /* MR Lock. */
> -		struct mlx5_mr_btree cache; /* Global MR cache table. */
> -		struct mlx5_mr_list mr_list; /* Registered MR list. */
> -		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> -	} mr;
>  	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
>  	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
>  	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
> diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
> index a3732d4..5140cc2 100644
> --- a/drivers/net/mlx5/mlx5_mr.c
> +++ b/drivers/net/mlx5/mlx5_mr.c
> @@ -36,7 +36,7 @@ struct mr_update_mp_data {
>  
>  /**
>   * Expand B-tree table to a given size. Can't be called with holding
> - * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
> + * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
>   *
>   * @param bt
>   *   Pointer to B-tree structure.
> @@ -350,7 +350,7 @@ struct mr_update_mp_data {
>  		n = mr_find_next_chunk(mr, &entry, n);
>  		if (!entry.end)
>  			break;
> -		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
> +		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
>  			/*
>  			 * Overflowed, but the global table cannot be expanded
>  			 * because of deadlock.
> @@ -382,7 +382,7 @@ struct mr_update_mp_data {
>  	struct mlx5_mr *mr;
>  
>  	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> +	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
>  		unsigned int n;
>  
>  		if (mr->ms_n == 0)
> @@ -420,6 +420,7 @@ struct mr_update_mp_data {
>  	      uintptr_t addr)
>  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
>  	uint16_t idx;
>  	uint32_t lkey = UINT32_MAX;
>  	struct mlx5_mr *mr;
> @@ -430,10 +431,10 @@ struct mr_update_mp_data {
>  	 * has to be searched by traversing the original MR list instead, which
>  	 * is very slow path. Otherwise, the global cache is all inclusive.
>  	 */
> -	if (!unlikely(priv->mr.cache.overflow)) {
> -		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> +	if (!unlikely(sh->mr.cache.overflow)) {
> +		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
>  		if (lkey != UINT32_MAX)
> -			*entry = (*priv->mr.cache.table)[idx];
> +			*entry = (*sh->mr.cache.table)[idx];
>  	} else {
>  		/* Falling back to the slowest path. */
>  		mr = mr_lookup_dev_list(dev, entry, addr);
> @@ -468,13 +469,12 @@ struct mr_update_mp_data {
>  /**
>   * Release resources of detached MR having no online entry.
>   *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>   */
>  static void
> -mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
> +mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
>  {
> -	struct mlx5_priv *priv = dev->data->dev_private;
>  	struct mlx5_mr *mr_next;
>  	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
>  
> @@ -484,11 +484,11 @@ struct mr_update_mp_data {
>  	 * MR can't be freed with holding the lock because rte_free() could call
>  	 * memory free callback function. This will be a deadlock situation.
>  	 */
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
>  	/* Detach the whole free list and release it after unlocking. */
> -	free_list = priv->mr.mr_free_list;
> -	LIST_INIT(&priv->mr.mr_free_list);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	free_list = sh->mr.mr_free_list;
> +	LIST_INIT(&sh->mr.mr_free_list);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
>  	/* Release resources. */
>  	mr_next = LIST_FIRST(&free_list);
>  	while (mr_next != NULL) {
> @@ -548,12 +548,12 @@ struct mr_update_mp_data {
>  		      dev->data->port_id, (void *)addr);
>  		return UINT32_MAX;
>  	}
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
>  	/* Fill in output data. */
>  	mr_lookup_dev(dev, entry, addr);
>  	/* Lookup can't fail. */
>  	assert(entry->lkey != UINT32_MAX);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
>  	DEBUG("port %u MR CREATED by primary process for %p:\n"
>  	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
>  	      dev->data->port_id, (void *)addr,
> @@ -582,6 +582,7 @@ struct mr_update_mp_data {
>  		       uintptr_t addr)
>  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
>  	struct mlx5_dev_config *config = &priv->config;
>  	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
>  	const struct rte_memseg_list *msl;
> @@ -602,12 +603,12 @@ struct mr_update_mp_data {
>  		dev->data->port_id, (void *)addr);
>  	/*
>  	 * Release detached MRs if any. This can't be called with holding either
> -	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
> +	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
>  	 * been detached by the memory free event but it couldn't be released
>  	 * inside the callback due to deadlock. As a result, releasing resources
>  	 * is quite opportunistic.
>  	 */
> -	mlx5_mr_garbage_collect(dev);
> +	mlx5_mr_garbage_collect(sh);
>  	/*
>  	 * If enabled, find out a contiguous virtual address chunk in use, to
>  	 * which the given address belongs, in order to register maximum range.
> @@ -710,7 +711,7 @@ struct mr_update_mp_data {
>  		goto alloc_resources;
>  	}
>  	assert(data.msl == data_re.msl);
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
>  	/*
>  	 * Check the address is really missing. If other thread already created
>  	 * one or it is not found due to overflow, abort and return.
> @@ -721,10 +722,10 @@ struct mr_update_mp_data {
>  		 * low-on-memory. Then, this entry will have to be searched
>  		 * here again.
>  		 */
> -		mr_btree_insert(&priv->mr.cache, entry);
> +		mr_btree_insert(&sh->mr.cache, entry);
>  		DEBUG("port %u found MR for %p on final lookup, abort",
>  		      dev->data->port_id, (void *)addr);
> -		rte_rwlock_write_unlock(&priv->mr.rwlock);
> +		rte_rwlock_write_unlock(&sh->mr.rwlock);
>  		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
>  		/*
>  		 * Must be unlocked before calling rte_free() because
> @@ -769,7 +770,7 @@ struct mr_update_mp_data {
>  	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
>  	 * through mlx5_alloc_verbs_buf().
>  	 */
> -	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
> +	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
>  				       IBV_ACCESS_LOCAL_WRITE);
>  	if (mr->ibv_mr == NULL) {
>  		DEBUG("port %u fail to create a verbs MR for address (%p)",
> @@ -779,7 +780,7 @@ struct mr_update_mp_data {
>  	}
>  	assert((uintptr_t)mr->ibv_mr->addr == data.start);
>  	assert(mr->ibv_mr->length == len);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
>  	DEBUG("port %u MR CREATED (%p) for %p:\n"
>  	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
>  	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
> @@ -792,11 +793,11 @@ struct mr_update_mp_data {
>  	mr_lookup_dev(dev, entry, addr);
>  	/* Lookup can't fail. */
>  	assert(entry->lkey != UINT32_MAX);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
>  	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
>  	return entry->lkey;
>  err_mrlock:
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
>  err_memlock:
>  	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
>  err_nolock:
> @@ -854,14 +855,15 @@ struct mr_update_mp_data {
>  mr_rebuild_dev_cache(struct rte_eth_dev *dev)
>  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
>  	struct mlx5_mr *mr;
>  
>  	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
>  	/* Flush cache to rebuild. */
> -	priv->mr.cache.len = 1;
> -	priv->mr.cache.overflow = 0;
> +	sh->mr.cache.len = 1;
> +	sh->mr.cache.overflow = 0;
>  	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
> +	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
>  		if (mr_insert_dev_cache(dev, mr) < 0)
>  			return;
>  }
> @@ -888,6 +890,7 @@ struct mr_update_mp_data {
>  mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
>  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
>  	const struct rte_memseg_list *msl;
>  	struct mlx5_mr *mr;
>  	int ms_n;
> @@ -901,7 +904,7 @@ struct mr_update_mp_data {
>  	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
>  	assert(len == RTE_ALIGN(len, msl->page_sz));
>  	ms_n = len / msl->page_sz;
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
>  	/* Clear bits of freed memsegs from MR. */
>  	for (i = 0; i < ms_n; ++i) {
>  		const struct rte_memseg *ms;
> @@ -928,7 +931,7 @@ struct mr_update_mp_data {
>  		rte_bitmap_clear(mr->ms_bmp, pos);
>  		if (--mr->ms_n == 0) {
>  			LIST_REMOVE(mr, mr);
> -			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
>  			DEBUG("port %u remove MR(%p) from list",
>  			      dev->data->port_id, (void *)mr);
>  		}
> @@ -949,12 +952,12 @@ struct mr_update_mp_data {
>  		 * generation below) will be guaranteed to be seen by other core
>  		 * before the core sees the newly allocated memory.
>  		 */
> -		++priv->mr.dev_gen;
> +		++sh->mr.dev_gen;
>  		DEBUG("broadcasting local cache flush, gen=%d",
> -		      priv->mr.dev_gen);
> +		      sh->mr.dev_gen);
>  		rte_smp_wmb();
>  	}
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
>  }
>  
>  /**
> @@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
>  		   struct mlx5_mr_cache *entry, uintptr_t addr)
>  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
>  	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
>  	uint16_t idx;
>  	uint32_t lkey;
> @@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
>  	if (unlikely(bt->len == bt->size))
>  		mr_btree_expand(bt, bt->size << 1);
>  	/* Look up in the global cache. */
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> -	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> +	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
>  	if (lkey != UINT32_MAX) {
>  		/* Found. */
> -		*entry = (*priv->mr.cache.table)[idx];
> -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> +		*entry = (*sh->mr.cache.table)[idx];
> +		rte_rwlock_read_unlock(&sh->mr.rwlock);
>  		/*
>  		 * Update local cache. Even if it fails, return the found entry
>  		 * to update top-half cache. Next time, this entry will be found
> @@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
>  		mr_btree_insert(bt, entry);
>  		return lkey;
>  	}
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
>  	/* First time to see the address? Create a new MR. */
>  	lkey = mlx5_mr_create(dev, entry, addr);
>  	/*
> @@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
>  	struct mr_update_mp_data *data = opaque;
>  	struct rte_eth_dev *dev = data->dev;
>  	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
>  	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
>  	struct mlx5_mr *mr = NULL;
>  	uintptr_t addr = (uintptr_t)memhdr->addr;
> @@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
>  
>  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
>  	/* If already registered, it should return. */
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
>  	lkey = mr_lookup_dev(dev, &entry, addr);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
>  	if (lkey != UINT32_MAX)
>  		return;
>  	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
> @@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
>  		data->ret = -1;
>  		return;
>  	}
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
>  	/* Insert to the global cache table. */
>  	mr_insert_dev_cache(dev, mr);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
>  	/* Insert to the local cache table */
>  	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
>  }
> @@ -1345,6 +1350,7 @@ struct mr_update_mp_data {
>  	struct rte_eth_dev *dev;
>  	struct mlx5_mr *mr;
>  	struct mlx5_priv *priv;
> +	struct mlx5_ibv_shared *sh;
>  
>  	dev = pci_dev_to_eth_dev(pdev);
>  	if (!dev) {
> @@ -1361,11 +1367,12 @@ struct mr_update_mp_data {
>  		rte_errno = EINVAL;
>  		return -1;
>  	}
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	sh = priv->sh;
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
>  	/* Insert to the global cache table. */
>  	mr_insert_dev_cache(dev, mr);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
>  	return 0;
>  }
>  
> @@ -1390,6 +1397,7 @@ struct mr_update_mp_data {
>  {
>  	struct rte_eth_dev *dev;
>  	struct mlx5_priv *priv;
> +	struct mlx5_ibv_shared *sh;
>  	struct mlx5_mr *mr;
>  	struct mlx5_mr_cache entry;
>  
> @@ -1401,10 +1409,11 @@ struct mr_update_mp_data {
>  		return -1;
>  	}
>  	priv = dev->data->dev_private;
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	sh = priv->sh;
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
>  	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
>  	if (!mr) {
> -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> +		rte_rwlock_read_unlock(&sh->mr.rwlock);
>  		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
>  				 "to PCI device %p", (uintptr_t)addr,
>  				 (void *)pdev);
> @@ -1412,7 +1421,7 @@ struct mr_update_mp_data {
>  		return -1;
>  	}
>  	LIST_REMOVE(mr, mr);
> -	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
>  	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
>  	      (void *)mr);
>  	mr_rebuild_dev_cache(dev);
> @@ -1425,11 +1434,11 @@ struct mr_update_mp_data {
>  	 * generation below) will be guaranteed to be seen by other core
>  	 * before the core sees the newly allocated memory.
>  	 */
> -	++priv->mr.dev_gen;
> +	++sh->mr.dev_gen;
>  	DEBUG("broadcasting local cache flush, gen=%d",
> -			priv->mr.dev_gen);
> +			sh->mr.dev_gen);

Looks like the indentation was wrong even before your change.
Please correct it.
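
For reference, the continuation line would be aligned under the format
string, as in the other occurrence in this file:

    DEBUG("broadcasting local cache flush, gen=%d",
          sh->mr.dev_gen);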

Thanks,
Yongseok

>  	rte_smp_wmb();
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
>  	return 0;
>  }
>  
> @@ -1550,25 +1559,24 @@ struct mr_update_mp_data {
>  /**
>   * Dump all the created MRs and the global cache entries.
>   *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>   */
>  void
> -mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
> +mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
>  {
>  #ifndef NDEBUG
> -	struct mlx5_priv *priv = dev->data->dev_private;
>  	struct mlx5_mr *mr;
>  	int mr_n = 0;
>  	int chunk_n = 0;
>  
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
>  	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> +	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
>  		unsigned int n;
>  
> -		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
> -		      dev->data->port_id, mr_n++,
> +		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
> +		      sh->ibdev_name, mr_n++,
>  		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
>  		      mr->ms_n, mr->ms_bmp_n);
>  		if (mr->ms_n == 0)
> @@ -1583,45 +1591,40 @@ struct mr_update_mp_data {
>  			      chunk_n++, ret.start, ret.end);
>  		}
>  	}
> -	DEBUG("port %u dumping global cache", dev->data->port_id);
> -	mlx5_mr_btree_dump(&priv->mr.cache);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	DEBUG("device %s dumping global cache", sh->ibdev_name);
> +	mlx5_mr_btree_dump(&sh->mr.cache);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
>  #endif
>  }
>  
>  /**
> - * Release all the created MRs and resources. Remove device from memory callback
> + * Release all the created MRs and resources for shared device context.
>   * list.
>   *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>   */
>  void
> -mlx5_mr_release(struct rte_eth_dev *dev)
> +mlx5_mr_release(struct mlx5_ibv_shared *sh)
>  {
> -	struct mlx5_priv *priv = dev->data->dev_private;
>  	struct mlx5_mr *mr_next;
>  
> -	/* Remove from memory callback device list. */
> -	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> -	LIST_REMOVE(priv, mem_event_cb);
> -	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
> -		mlx5_mr_dump_dev(dev);
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +		mlx5_mr_dump_dev(sh);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
>  	/* Detach from MR list and move to free list. */
> -	mr_next = LIST_FIRST(&priv->mr.mr_list);
> +	mr_next = LIST_FIRST(&sh->mr.mr_list);
>  	while (mr_next != NULL) {
>  		struct mlx5_mr *mr = mr_next;
>  
>  		mr_next = LIST_NEXT(mr, mr);
>  		LIST_REMOVE(mr, mr);
> -		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
>  	}
> -	LIST_INIT(&priv->mr.mr_list);
> +	LIST_INIT(&sh->mr.mr_list);
>  	/* Free global cache. */
> -	mlx5_mr_btree_free(&priv->mr.cache);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	mlx5_mr_btree_free(&sh->mr.cache);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
>  	/* Free all remaining MRs. */
> -	mlx5_mr_garbage_collect(dev);
> +	mlx5_mr_garbage_collect(sh);
>  }
> diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
> index 786f6a3..89e89b7 100644
> --- a/drivers/net/mlx5/mlx5_mr.h
> +++ b/drivers/net/mlx5/mlx5_mr.h
> @@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
>  	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
>  } __rte_packed;
>  
> +struct mlx5_ibv_shared;
>  extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
>  extern rte_rwlock_t mlx5_mem_event_rwlock;
>  
> @@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
>  			  size_t len, void *arg);
>  int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
>  		      struct rte_mempool *mp);
> -void mlx5_mr_release(struct rte_eth_dev *dev);
> +void mlx5_mr_release(struct mlx5_ibv_shared *sh);
>  
>  /* Debug purpose functions. */
>  void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
> -void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
> +void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
>  
>  /**
>   * Look up LKey from given lookup table by linear search. Firstly look up the
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index 9965b2b..ed82594 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
>  		goto error;
>  	}
>  	/* Save pointer of global generation number to check memory event. */
> -	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
> +	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
>  	assert(desc > MLX5_TX_COMP_THRESH);
>  	tmpl->txq.offloads = conf->offloads |
>  			     dev->data->dev_conf.txmode.offloads;
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device
  2019-04-12 19:22 ` Yongseok Koh
  2019-04-12 19:22   ` Yongseok Koh
@ 2019-04-15  6:42   ` Slava Ovsiienko
  2019-04-15  6:42     ` Slava Ovsiienko
  1 sibling, 1 reply; 20+ messages in thread
From: Slava Ovsiienko @ 2019-04-15  6:42 UTC (permalink / raw)
  To: Yongseok Koh; +Cc: dev, Shahaf Shuler

> -----Original Message-----
> From: Yongseok Koh
> Sent: Friday, April 12, 2019 22:22
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: dev@dpdk.org; Shahaf Shuler <shahafs@mellanox.com>
> Subject: Re: [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for
> multiport device
> 
> On Fri, Apr 12, 2019 at 03:45:40PM +0000, Viacheslav Ovsiienko wrote:
> > The multiport Infiniband device support was introduced [1].
> > All active ports, belonging to the same Infiniband device use the
> > single shared Infiniband context of that device and share the resources:
> >   - QPs are created within shared context
> >   - Verbs flows are also created with specifying port index
> >   - DV/DR resources
> >   - Protection Domain
> >   - Event Handlers
> >
> > This patchset adds support for Memory Regions sharing between ports,
> > created on the base of multiport Infiniband device.
> > The datapath of mlx5 uses the layered cache subsystem for
> > allocating/releasing Memory Regions, only the lowest layer L3 is
> > subject to share due to performance issues.
> >
> > [1] http://patches.dpdk.org/cover/51800/
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> > ---
> >  drivers/net/mlx5/mlx5.c     |  40 +++++++----
> >  drivers/net/mlx5/mlx5.h     |  15 ++--
> >  drivers/net/mlx5/mlx5_mr.c  | 163 ++++++++++++++++++++++-----------------
> -----
> >  drivers/net/mlx5/mlx5_mr.h  |   5 +-
> >  drivers/net/mlx5/mlx5_txq.c |   2 +-
> >  5 files changed, 121 insertions(+), 104 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > 9ff50df..7ff5c8b 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -144,6 +144,7 @@ struct mlx5_dev_spawn_data {
> >  	struct mlx5_switch_info info; /**< Switch information. */
> >  	struct ibv_device *ibv_dev; /**< Associated IB device. */
> >  	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
> > +	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
> >  };
> >
> >  static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list =
> > LIST_HEAD_INITIALIZER(); @@ -222,6 +223,7 @@ struct
> mlx5_dev_spawn_data {
> >  		sizeof(sh->ibdev_name));
> >  	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
> >  		sizeof(sh->ibdev_path));
> > +	sh->pci_dev = spawn->pci_dev;
> >  	pthread_mutex_init(&sh->intr_mutex, NULL);
> >  	/*
> >  	 * Setting port_id to max unallowed value means @@ -236,6 +238,22
> @@
> > struct mlx5_dev_spawn_data {
> >  		err = ENOMEM;
> >  		goto error;
> >  	}
> > +	/*
> > +	 * Once the device is added to the list of memory event
> > +	 * callback, its global MR cache table cannot be expanded
> > +	 * on the fly because of deadlock. If it overflows, lookup
> > +	 * should be done by searching MR list linearly, which is slow.
> > +	 *
> > +	 * At this point the device is not added to the memory
> > +	 * event list yet, context is just being created.
> > +	 */
> > +	err = mlx5_mr_btree_init(&sh->mr.cache,
> > +				 MLX5_MR_BTREE_CACHE_N * 2,
> > +				 sh->pci_dev->device.numa_node);
> > +	if (err) {
> > +		err = rte_errno;
> > +		goto error;
> > +	}
> >  	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
> >  exit:
> >  	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
> > @@ -283,6 +301,8 @@ struct mlx5_dev_spawn_data {
> >  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> >  	if (--sh->refcnt)
> >  		goto exit;
> > +	/* Release created Memory Regions. */
> > +	mlx5_mr_release(sh);
> >  	LIST_REMOVE(sh, next);
> >  	/*
> >  	 *  Ensure there is no async event handler installed.
> > @@ -615,7 +635,10 @@ struct mlx5_dev_spawn_data {
> >  	}
> >  	mlx5_proc_priv_uninit(dev);
> >  	mlx5_mprq_free_mp(dev);
> > -	mlx5_mr_release(dev);
> > +	/* Remove from memory callback device list. */
> > +	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> > +	LIST_REMOVE(priv, mem_event_cb);
> > +	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> 
> I'm almost okay with this patch, but I'm concerned about this part. As the
> devices sharing the same context are each added to the callback list,
> mlx5_mr_mem_event_free_cb() will be called multiple times with the same
> addr/len. Those extra calls are redundant and unnecessary. How about adding
> the shared ctx to the list instead?

Yes, you are quite right. Moving mem_event_cb to the shared context (i.e.
adding the shared context to the list instead of priv) was meant to be part
of the patch. It seems I accidentally dropped the last commit (instead of
squashing it) while rebasing onto the new branch.
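
For the release path the rough idea is the sketch below (assuming the list
entry moves into the shared context; the exact placement may differ in the
respin):

    /* When the last reference to the shared IB context is dropped: */
    rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
    LIST_REMOVE(sh, mem_event_cb);
    rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
    /* Release created Memory Regions. */
    mlx5_mr_release(sh);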

> 
> >  	assert(priv->sh);
> >  	mlx5_free_shared_dr(priv);
> >  	if (priv->rss_conf.rss_key != NULL)
> > @@ -1493,19 +1516,6 @@ struct mlx5_dev_spawn_data {
> >  		goto error;
> >  	}
> >  	priv->config.flow_prio = err;
> > -	/*
> > -	 * Once the device is added to the list of memory event
> > -	 * callback, its global MR cache table cannot be expanded
> > -	 * on the fly because of deadlock. If it overflows, lookup
> > -	 * should be done by searching MR list linearly, which is slow.
> > -	 */
> > -	err = mlx5_mr_btree_init(&priv->mr.cache,
> > -				 MLX5_MR_BTREE_CACHE_N * 2,
> > -				 eth_dev->device->numa_node);
> > -	if (err) {
> > -		err = rte_errno;
> > -		goto error;
> > -	}
> >  	/* Add device to memory callback list. */
> >  	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> >  	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
> > @@ -1702,6 +1712,7 @@ struct mlx5_dev_spawn_data {
> >  			list[ns].ibv_port = i;
> >  			list[ns].ibv_dev = ibv_match[0];
> >  			list[ns].eth_dev = NULL;
> > +			list[ns].pci_dev = pci_dev;
> >  			list[ns].ifindex = mlx5_nl_ifindex
> >  					(nl_rdma, list[ns].ibv_dev->name, i);
> >  			if (!list[ns].ifindex) {
> > @@ -1768,6 +1779,7 @@ struct mlx5_dev_spawn_data {
> >  			list[ns].ibv_port = 1;
> >  			list[ns].ibv_dev = ibv_match[i];
> >  			list[ns].eth_dev = NULL;
> > +			list[ns].pci_dev = pci_dev;
> >  			list[ns].ifindex = 0;
> >  			if (nl_rdma >= 0)
> >  				list[ns].ifindex = mlx5_nl_ifindex diff --git
> > a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > 14c7f3c..8eb1019 100644
> > --- a/drivers/net/mlx5/mlx5.h
> > +++ b/drivers/net/mlx5/mlx5.h
> > @@ -257,6 +257,14 @@ struct mlx5_ibv_shared {
> >  	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> >  	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for
> secondary */
> >  	struct ibv_device_attr_ex device_attr; /* Device properties. */
> > +	struct rte_pci_device *pci_dev; /* Backend PCI device. */
> > +	struct { /* Global device shared L3 MR cache. */
> 
> The comment sounds redundant, and this structure holds not only the global
> cache but also the MR lists.
OK, I'll try to make the comments more concise.

> 
> > +		uint32_t dev_gen; /* Generation number to flush local
> caches. */
> > +		rte_rwlock_t rwlock; /* MR Lock. */
> > +		struct mlx5_mr_btree cache; /* Global MR cache table. */
> > +		struct mlx5_mr_list mr_list; /* Registered MR list. */
> > +		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> > +	} mr;
> >  	/* Shared DV/DR flow data section. */
> >  	pthread_mutex_t dv_mutex; /* DV context mutex. */
> >  	uint32_t dv_refcnt; /* DV/DR data reference counter. */ @@ -323,13
> > +331,6 @@ struct mlx5_priv {
> >  	struct mlx5_flows ctrl_flows; /* Control flow rules. */
> >  	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
> >  	/* Flow counters. */
> > -	struct {
> > -		uint32_t dev_gen; /* Generation number to flush local
> caches. */
> > -		rte_rwlock_t rwlock; /* MR Lock. */
> > -		struct mlx5_mr_btree cache; /* Global MR cache table. */
> > -		struct mlx5_mr_list mr_list; /* Registered MR list. */
> > -		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> > -	} mr;
> >  	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
> >  	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
> >  	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */ diff
> > --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index
> > a3732d4..5140cc2 100644
> > --- a/drivers/net/mlx5/mlx5_mr.c
> > +++ b/drivers/net/mlx5/mlx5_mr.c
> > @@ -36,7 +36,7 @@ struct mr_update_mp_data {
> >
> >  /**
> >   * Expand B-tree table to a given size. Can't be called with holding
> > - * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
> > + * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
> >   *
> >   * @param bt
> >   *   Pointer to B-tree structure.
> > @@ -350,7 +350,7 @@ struct mr_update_mp_data {
> >  		n = mr_find_next_chunk(mr, &entry, n);
> >  		if (!entry.end)
> >  			break;
> > -		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
> > +		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
> >  			/*
> >  			 * Overflowed, but the global table cannot be
> expanded
> >  			 * because of deadlock.
> > @@ -382,7 +382,7 @@ struct mr_update_mp_data {
> >  	struct mlx5_mr *mr;
> >
> >  	/* Iterate all the existing MRs. */
> > -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> > +	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
> >  		unsigned int n;
> >
> >  		if (mr->ms_n == 0)
> > @@ -420,6 +420,7 @@ struct mr_update_mp_data {
> >  	      uintptr_t addr)
> >  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	uint16_t idx;
> >  	uint32_t lkey = UINT32_MAX;
> >  	struct mlx5_mr *mr;
> > @@ -430,10 +431,10 @@ struct mr_update_mp_data {
> >  	 * has to be searched by traversing the original MR list instead, which
> >  	 * is very slow path. Otherwise, the global cache is all inclusive.
> >  	 */
> > -	if (!unlikely(priv->mr.cache.overflow)) {
> > -		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> > +	if (!unlikely(sh->mr.cache.overflow)) {
> > +		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> >  		if (lkey != UINT32_MAX)
> > -			*entry = (*priv->mr.cache.table)[idx];
> > +			*entry = (*sh->mr.cache.table)[idx];
> >  	} else {
> >  		/* Falling back to the slowest path. */
> >  		mr = mr_lookup_dev_list(dev, entry, addr); @@ -468,13
> +469,12 @@
> > struct mr_update_mp_data {
> >  /**
> >   * Release resources of detached MR having no online entry.
> >   *
> > - * @param dev
> > - *   Pointer to Ethernet device.
> > + * @param sh
> > + *   Pointer to Ethernet device shared context.
> >   */
> >  static void
> > -mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
> > +mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
> >  {
> > -	struct mlx5_priv *priv = dev->data->dev_private;
> >  	struct mlx5_mr *mr_next;
> >  	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
> >
> > @@ -484,11 +484,11 @@ struct mr_update_mp_data {
> >  	 * MR can't be freed with holding the lock because rte_free() could
> call
> >  	 * memory free callback function. This will be a deadlock situation.
> >  	 */
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> >  	/* Detach the whole free list and release it after unlocking. */
> > -	free_list = priv->mr.mr_free_list;
> > -	LIST_INIT(&priv->mr.mr_free_list);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	free_list = sh->mr.mr_free_list;
> > +	LIST_INIT(&sh->mr.mr_free_list);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	/* Release resources. */
> >  	mr_next = LIST_FIRST(&free_list);
> >  	while (mr_next != NULL) {
> > @@ -548,12 +548,12 @@ struct mr_update_mp_data {
> >  		      dev->data->port_id, (void *)addr);
> >  		return UINT32_MAX;
> >  	}
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > +	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
> >  	/* Fill in output data. */
> >  	mr_lookup_dev(dev, entry, addr);
> >  	/* Lookup can't fail. */
> >  	assert(entry->lkey != UINT32_MAX);
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
> >  	DEBUG("port %u MR CREATED by primary process for %p:\n"
> >  	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
> >  	      dev->data->port_id, (void *)addr, @@ -582,6 +582,7 @@ struct
> > mr_update_mp_data {
> >  		       uintptr_t addr)
> >  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	struct mlx5_dev_config *config = &priv->config;
> >  	struct rte_mem_config *mcfg = rte_eal_get_configuration()-
> >mem_config;
> >  	const struct rte_memseg_list *msl;
> > @@ -602,12 +603,12 @@ struct mr_update_mp_data {
> >  		dev->data->port_id, (void *)addr);
> >  	/*
> >  	 * Release detached MRs if any. This can't be called with holding
> either
> > -	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list
> have
> > +	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
> >  	 * been detached by the memory free event but it couldn't be
> released
> >  	 * inside the callback due to deadlock. As a result, releasing resources
> >  	 * is quite opportunistic.
> >  	 */
> > -	mlx5_mr_garbage_collect(dev);
> > +	mlx5_mr_garbage_collect(sh);
> >  	/*
> >  	 * If enabled, find out a contiguous virtual address chunk in use, to
> >  	 * which the given address belongs, in order to register maximum
> range.
> > @@ -710,7 +711,7 @@ struct mr_update_mp_data {
> >  		goto alloc_resources;
> >  	}
> >  	assert(data.msl == data_re.msl);
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> >  	/*
> >  	 * Check the address is really missing. If other thread already created
> >  	 * one or it is not found due to overflow, abort and return.
> > @@ -721,10 +722,10 @@ struct mr_update_mp_data {
> >  		 * low-on-memory. Then, this entry will have to be searched
> >  		 * here again.
> >  		 */
> > -		mr_btree_insert(&priv->mr.cache, entry);
> > +		mr_btree_insert(&sh->mr.cache, entry);
> >  		DEBUG("port %u found MR for %p on final lookup, abort",
> >  		      dev->data->port_id, (void *)addr);
> > -		rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +		rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> >  		/*
> >  		 * Must be unlocked before calling rte_free() because @@ -
> 769,7
> > +770,7 @@ struct mr_update_mp_data {
> >  	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
> >  	 * through mlx5_alloc_verbs_buf().
> >  	 */
> > -	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start,
> len,
> > +	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
> >  				       IBV_ACCESS_LOCAL_WRITE);
> >  	if (mr->ibv_mr == NULL) {
> >  		DEBUG("port %u fail to create a verbs MR for address (%p)",
> @@
> > -779,7 +780,7 @@ struct mr_update_mp_data {
> >  	}
> >  	assert((uintptr_t)mr->ibv_mr->addr == data.start);
> >  	assert(mr->ibv_mr->length == len);
> > -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> > +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> >  	DEBUG("port %u MR CREATED (%p) for %p:\n"
> >  	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
> >  	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", @@ -
> 792,11
> > +793,11 @@ struct mr_update_mp_data {
> >  	mr_lookup_dev(dev, entry, addr);
> >  	/* Lookup can't fail. */
> >  	assert(entry->lkey != UINT32_MAX);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> >  	return entry->lkey;
> >  err_mrlock:
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  err_memlock:
> >  	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> >  err_nolock:
> > @@ -854,14 +855,15 @@ struct mr_update_mp_data {
> > mr_rebuild_dev_cache(struct rte_eth_dev *dev)  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	struct mlx5_mr *mr;
> >
> >  	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data-
> >port_id);
> >  	/* Flush cache to rebuild. */
> > -	priv->mr.cache.len = 1;
> > -	priv->mr.cache.overflow = 0;
> > +	sh->mr.cache.len = 1;
> > +	sh->mr.cache.overflow = 0;
> >  	/* Iterate all the existing MRs. */
> > -	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
> > +	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
> >  		if (mr_insert_dev_cache(dev, mr) < 0)
> >  			return;
> >  }
> > @@ -888,6 +890,7 @@ struct mr_update_mp_data {
> > mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr,
> > size_t len)  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	const struct rte_memseg_list *msl;
> >  	struct mlx5_mr *mr;
> >  	int ms_n;
> > @@ -901,7 +904,7 @@ struct mr_update_mp_data {
> >  	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
> >  	assert(len == RTE_ALIGN(len, msl->page_sz));
> >  	ms_n = len / msl->page_sz;
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> >  	/* Clear bits of freed memsegs from MR. */
> >  	for (i = 0; i < ms_n; ++i) {
> >  		const struct rte_memseg *ms;
> > @@ -928,7 +931,7 @@ struct mr_update_mp_data {
> >  		rte_bitmap_clear(mr->ms_bmp, pos);
> >  		if (--mr->ms_n == 0) {
> >  			LIST_REMOVE(mr, mr);
> > -			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> > +			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> >  			DEBUG("port %u remove MR(%p) from list",
> >  			      dev->data->port_id, (void *)mr);
> >  		}
> > @@ -949,12 +952,12 @@ struct mr_update_mp_data {
> >  		 * generation below) will be guaranteed to be seen by other
> core
> >  		 * before the core sees the newly allocated memory.
> >  		 */
> > -		++priv->mr.dev_gen;
> > +		++sh->mr.dev_gen;
> >  		DEBUG("broadcasting local cache flush, gen=%d",
> > -		      priv->mr.dev_gen);
> > +		      sh->mr.dev_gen);
> >  		rte_smp_wmb();
> >  	}
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  }
> >
> >  /**
> > @@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
> >  		   struct mlx5_mr_cache *entry, uintptr_t addr)  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
> >  	uint16_t idx;
> >  	uint32_t lkey;
> > @@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
> >  	if (unlikely(bt->len == bt->size))
> >  		mr_btree_expand(bt, bt->size << 1);
> >  	/* Look up in the global cache. */
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > -	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> > +	rte_rwlock_read_lock(&sh->mr.rwlock);
> > +	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> >  	if (lkey != UINT32_MAX) {
> >  		/* Found. */
> > -		*entry = (*priv->mr.cache.table)[idx];
> > -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +		*entry = (*sh->mr.cache.table)[idx];
> > +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  		/*
> >  		 * Update local cache. Even if it fails, return the found entry
> >  		 * to update top-half cache. Next time, this entry will be
> found @@
> > -1035,7 +1039,7 @@ struct mr_update_mp_data {
> >  		mr_btree_insert(bt, entry);
> >  		return lkey;
> >  	}
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  	/* First time to see the address? Create a new MR. */
> >  	lkey = mlx5_mr_create(dev, entry, addr);
> >  	/*
> > @@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
> >  	struct mr_update_mp_data *data = opaque;
> >  	struct rte_eth_dev *dev = data->dev;
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
> >  	struct mlx5_mr *mr = NULL;
> >  	uintptr_t addr = (uintptr_t)memhdr->addr; @@ -1270,9 +1275,9 @@
> > struct mr_update_mp_data {
> >
> >  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> >  	/* If already registered, it should return. */
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > +	rte_rwlock_read_lock(&sh->mr.rwlock);
> >  	lkey = mr_lookup_dev(dev, &entry, addr);
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  	if (lkey != UINT32_MAX)
> >  		return;
> >  	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool
> (%s)",
> > @@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
> >  		data->ret = -1;
> >  		return;
> >  	}
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> > +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> >  	/* Insert to the global cache table. */
> >  	mr_insert_dev_cache(dev, mr);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	/* Insert to the local cache table */
> >  	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);  } @@ -1345,6 +1350,7
> @@
> > struct mr_update_mp_data {
> >  	struct rte_eth_dev *dev;
> >  	struct mlx5_mr *mr;
> >  	struct mlx5_priv *priv;
> > +	struct mlx5_ibv_shared *sh;
> >
> >  	dev = pci_dev_to_eth_dev(pdev);
> >  	if (!dev) {
> > @@ -1361,11 +1367,12 @@ struct mr_update_mp_data {
> >  		rte_errno = EINVAL;
> >  		return -1;
> >  	}
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> > +	sh = priv->sh;
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> > +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> >  	/* Insert to the global cache table. */
> >  	mr_insert_dev_cache(dev, mr);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	return 0;
> >  }
> >
> > @@ -1390,6 +1397,7 @@ struct mr_update_mp_data {  {
> >  	struct rte_eth_dev *dev;
> >  	struct mlx5_priv *priv;
> > +	struct mlx5_ibv_shared *sh;
> >  	struct mlx5_mr *mr;
> >  	struct mlx5_mr_cache entry;
> >
> > @@ -1401,10 +1409,11 @@ struct mr_update_mp_data {
> >  		return -1;
> >  	}
> >  	priv = dev->data->dev_private;
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > +	sh = priv->sh;
> > +	rte_rwlock_read_lock(&sh->mr.rwlock);
> >  	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
> >  	if (!mr) {
> > -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't
> registered "
> >  				 "to PCI device %p", (uintptr_t)addr,
> >  				 (void *)pdev);
> > @@ -1412,7 +1421,7 @@ struct mr_update_mp_data {
> >  		return -1;
> >  	}
> >  	LIST_REMOVE(mr, mr);
> > -	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> > +	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> >  	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
> >  	      (void *)mr);
> >  	mr_rebuild_dev_cache(dev);
> > @@ -1425,11 +1434,11 @@ struct mr_update_mp_data {
> >  	 * generation below) will be guaranteed to be seen by other core
> >  	 * before the core sees the newly allocated memory.
> >  	 */
> > -	++priv->mr.dev_gen;
> > +	++sh->mr.dev_gen;
> >  	DEBUG("broadcasting local cache flush, gen=%d",
> > -			priv->mr.dev_gen);
> > +			sh->mr.dev_gen);
> 
> Looks like the indentation was wrong even before your change.
> Please correct it.
OK.

> 
> Thanks,
> Yongseok
> 
Thank you for the review,
Best Regards,
Slava

> >  	rte_smp_wmb();
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  	return 0;
> >  }
> >
> > @@ -1550,25 +1559,24 @@ struct mr_update_mp_data {
> >  /**
> >   * Dump all the created MRs and the global cache entries.
> >   *
> > - * @param dev
> > - *   Pointer to Ethernet device.
> > + * @param sh
> > + *   Pointer to Ethernet device shared context.
> >   */
> >  void
> > -mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
> > +mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
> >  {
> >  #ifndef NDEBUG
> > -	struct mlx5_priv *priv = dev->data->dev_private;
> >  	struct mlx5_mr *mr;
> >  	int mr_n = 0;
> >  	int chunk_n = 0;
> >
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > +	rte_rwlock_read_lock(&sh->mr.rwlock);
> >  	/* Iterate all the existing MRs. */
> > -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> > +	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
> >  		unsigned int n;
> >
> > -		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u,
> ms_bmp_n = %u",
> > -		      dev->data->port_id, mr_n++,
> > +		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u,
> ms_bmp_n = %u",
> > +		      sh->ibdev_name, mr_n++,
> >  		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
> >  		      mr->ms_n, mr->ms_bmp_n);
> >  		if (mr->ms_n == 0)
> > @@ -1583,45 +1591,40 @@ struct mr_update_mp_data {
> >  			      chunk_n++, ret.start, ret.end);
> >  		}
> >  	}
> > -	DEBUG("port %u dumping global cache", dev->data->port_id);
> > -	mlx5_mr_btree_dump(&priv->mr.cache);
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	DEBUG("device %s dumping global cache", sh->ibdev_name);
> > +	mlx5_mr_btree_dump(&sh->mr.cache);
> > +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  #endif
> >  }
> >
> >  /**
> > - * Release all the created MRs and resources. Remove device from
> > memory callback
> > + * Release all the created MRs and resources for shared device context.
> >   * list.
> >   *
> > - * @param dev
> > - *   Pointer to Ethernet device.
> > + * @param sh
> > + *   Pointer to Ethernet device shared context.
> >   */
> >  void
> > -mlx5_mr_release(struct rte_eth_dev *dev)
> > +mlx5_mr_release(struct mlx5_ibv_shared *sh)
> >  {
> > -	struct mlx5_priv *priv = dev->data->dev_private;
> >  	struct mlx5_mr *mr_next;
> >
> > -	/* Remove from memory callback device list. */
> > -	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> > -	LIST_REMOVE(priv, mem_event_cb);
> > -	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> >  	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
> > -		mlx5_mr_dump_dev(dev);
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > +		mlx5_mr_dump_dev(sh);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> >  	/* Detach from MR list and move to free list. */
> > -	mr_next = LIST_FIRST(&priv->mr.mr_list);
> > +	mr_next = LIST_FIRST(&sh->mr.mr_list);
> >  	while (mr_next != NULL) {
> >  		struct mlx5_mr *mr = mr_next;
> >
> >  		mr_next = LIST_NEXT(mr, mr);
> >  		LIST_REMOVE(mr, mr);
> > -		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> > +		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> >  	}
> > -	LIST_INIT(&priv->mr.mr_list);
> > +	LIST_INIT(&sh->mr.mr_list);
> >  	/* Free global cache. */
> > -	mlx5_mr_btree_free(&priv->mr.cache);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	mlx5_mr_btree_free(&sh->mr.cache);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	/* Free all remaining MRs. */
> > -	mlx5_mr_garbage_collect(dev);
> > +	mlx5_mr_garbage_collect(sh);
> >  }
> > diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
> > index 786f6a3..89e89b7 100644
> > --- a/drivers/net/mlx5/mlx5_mr.h
> > +++ b/drivers/net/mlx5/mlx5_mr.h
> > @@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
> >  	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */  }
> > __rte_packed;
> >
> > +struct mlx5_ibv_shared;
> >  extern struct mlx5_dev_list  mlx5_mem_event_cb_list;  extern
> > rte_rwlock_t mlx5_mem_event_rwlock;
> >
> > @@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum
> rte_mem_event event_type, const void *addr,
> >  			  size_t len, void *arg);
> >  int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl
> *mr_ctrl,
> >  		      struct rte_mempool *mp);
> > -void mlx5_mr_release(struct rte_eth_dev *dev);
> > +void mlx5_mr_release(struct mlx5_ibv_shared *sh);
> >
> >  /* Debug purpose functions. */
> >  void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt); -void
> > mlx5_mr_dump_dev(struct rte_eth_dev *dev);
> > +void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
> >
> >  /**
> >   * Look up LKey from given lookup table by linear search. Firstly
> > look up the diff --git a/drivers/net/mlx5/mlx5_txq.c
> > b/drivers/net/mlx5/mlx5_txq.c index 9965b2b..ed82594 100644
> > --- a/drivers/net/mlx5/mlx5_txq.c
> > +++ b/drivers/net/mlx5/mlx5_txq.c
> > @@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
> >  		goto error;
> >  	}
> >  	/* Save pointer of global generation number to check memory event.
> */
> > -	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
> > +	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
> >  	assert(desc > MLX5_TX_COMP_THRESH);
> >  	tmpl->txq.offloads = conf->offloads |
> >  			     dev->data->dev_conf.txmode.offloads;
> > --
> > 1.8.3.1
> >

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device
  2019-04-15  6:42   ` Slava Ovsiienko
@ 2019-04-15  6:42     ` Slava Ovsiienko
  0 siblings, 0 replies; 20+ messages in thread
From: Slava Ovsiienko @ 2019-04-15  6:42 UTC (permalink / raw)
  To: Yongseok Koh; +Cc: dev, Shahaf Shuler

> -----Original Message-----
> From: Yongseok Koh
> Sent: Friday, April 12, 2019 22:22
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: dev@dpdk.org; Shahaf Shuler <shahafs@mellanox.com>
> Subject: Re: [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for
> multiport device
> 
> On Fri, Apr 12, 2019 at 03:45:40PM +0000, Viacheslav Ovsiienko wrote:
> > The multiport Infiniband device support was introduced [1].
> > All active ports, belonging to the same Infiniband device use the
> > signle shared Infiniband context of that device and share the resources:
> >   - QPs are created within shared context
> >   - Verbs flows are also created with specifying port index
> >   - DV/DR resources
> >   - Protection Domain
> >   - Event Handlers
> >
> > This patchset adds support for Memory Regions sharing between portes,
> > created on the base of multiport Infiniban device.
> > The datapath of mlx5 uses the layered cache subsystem for
> > allocating/releasing Memory Regions, only the lowest layer L3 is
> > subject to share due to performance issues.
> >
> > [1] http://patches.dpdk.org/cover/51800/
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> > ---
> >  drivers/net/mlx5/mlx5.c     |  40 +++++++----
> >  drivers/net/mlx5/mlx5.h     |  15 ++--
> >  drivers/net/mlx5/mlx5_mr.c  | 163 ++++++++++++++++++++++-----------------
> -----
> >  drivers/net/mlx5/mlx5_mr.h  |   5 +-
> >  drivers/net/mlx5/mlx5_txq.c |   2 +-
> >  5 files changed, 121 insertions(+), 104 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > 9ff50df..7ff5c8b 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -144,6 +144,7 @@ struct mlx5_dev_spawn_data {
> >  	struct mlx5_switch_info info; /**< Switch information. */
> >  	struct ibv_device *ibv_dev; /**< Associated IB device. */
> >  	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
> > +	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
> >  };
> >
> >  static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list =
> > LIST_HEAD_INITIALIZER(); @@ -222,6 +223,7 @@ struct
> mlx5_dev_spawn_data {
> >  		sizeof(sh->ibdev_name));
> >  	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
> >  		sizeof(sh->ibdev_path));
> > +	sh->pci_dev = spawn->pci_dev;
> >  	pthread_mutex_init(&sh->intr_mutex, NULL);
> >  	/*
> >  	 * Setting port_id to max unallowed value means @@ -236,6 +238,22
> @@
> > struct mlx5_dev_spawn_data {
> >  		err = ENOMEM;
> >  		goto error;
> >  	}
> > +	/*
> > +	 * Once the device is added to the list of memory event
> > +	 * callback, its global MR cache table cannot be expanded
> > +	 * on the fly because of deadlock. If it overflows, lookup
> > +	 * should be done by searching MR list linearly, which is slow.
> > +	 *
> > +	 * At this point the device is not added to the memory
> > +	 * event list yet, context is just being created.
> > +	 */
> > +	err = mlx5_mr_btree_init(&sh->mr.cache,
> > +				 MLX5_MR_BTREE_CACHE_N * 2,
> > +				 sh->pci_dev->device.numa_node);
> > +	if (err) {
> > +		err = rte_errno;
> > +		goto error;
> > +	}
> >  	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
> >  exit:
> >  	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
> > @@ -283,6 +301,8 @@ struct mlx5_dev_spawn_data {
> >  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> >  	if (--sh->refcnt)
> >  		goto exit;
> > +	/* Release created Memory Regions. */
> > +	mlx5_mr_release(sh);
> >  	LIST_REMOVE(sh, next);
> >  	/*
> >  	 *  Ensure there is no async event handler installed.
> > @@ -615,7 +635,10 @@ struct mlx5_dev_spawn_data {
> >  	}
> >  	mlx5_proc_priv_uninit(dev);
> >  	mlx5_mprq_free_mp(dev);
> > -	mlx5_mr_release(dev);
> > +	/* Remove from memory callback device list. */
> > +	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> > +	LIST_REMOVE(priv, mem_event_cb);
> > +	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> 
> I'm almost okay with this patch, but I'm concerned about this part. As devices
> sharing the same context are added to the callback list,
> mlx5_mr_mem_event_free_cb() will be called multiple times with the same
> addr/len. This should be redundant and unnecessary. How about adding the
> shared ctx to the list instead?

Yes, you are quite right. Moving mem_event_cb to the shared context (i.e.
adding the shared context to the list instead of priv) was intended to be part of the patch.
It seems I accidentally dropped the last commit (instead of squashing it) while
rebasing to the new branch.
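For reference, the planned follow-up is expected to look roughly like the
sketch below: the shared IB context gets its own entry in the global memory
event callback list, so mlx5_mr_mem_event_free_cb() runs once per Infiniband
device rather than once per port. This is only an illustration; the entry and
list names mirror the existing per-priv registration code (the callback list
would then be declared to hold mlx5_ibv_shared entries), and the final field
layout and callback signature may differ in the actual patch.

	/* Sketch only -- names follow the current per-priv registration. */
	struct mlx5_ibv_shared {
		LIST_ENTRY(mlx5_ibv_shared) mem_event_cb;
		/* Entry in the global memory event callback list. */
		/* ... other shared context fields ... */
	};

	/* When the shared context is created (primary process only). */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
			 sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);

	/* When the last reference to the shared context is released. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_REMOVE(sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);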

> 
> >  	assert(priv->sh);
> >  	mlx5_free_shared_dr(priv);
> >  	if (priv->rss_conf.rss_key != NULL)
> > @@ -1493,19 +1516,6 @@ struct mlx5_dev_spawn_data {
> >  		goto error;
> >  	}
> >  	priv->config.flow_prio = err;
> > -	/*
> > -	 * Once the device is added to the list of memory event
> > -	 * callback, its global MR cache table cannot be expanded
> > -	 * on the fly because of deadlock. If it overflows, lookup
> > -	 * should be done by searching MR list linearly, which is slow.
> > -	 */
> > -	err = mlx5_mr_btree_init(&priv->mr.cache,
> > -				 MLX5_MR_BTREE_CACHE_N * 2,
> > -				 eth_dev->device->numa_node);
> > -	if (err) {
> > -		err = rte_errno;
> > -		goto error;
> > -	}
> >  	/* Add device to memory callback list. */
> >  	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> >  	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
> > @@ -1702,6 +1712,7 @@ struct mlx5_dev_spawn_data {
> >  			list[ns].ibv_port = i;
> >  			list[ns].ibv_dev = ibv_match[0];
> >  			list[ns].eth_dev = NULL;
> > +			list[ns].pci_dev = pci_dev;
> >  			list[ns].ifindex = mlx5_nl_ifindex
> >  					(nl_rdma, list[ns].ibv_dev->name, i);
> >  			if (!list[ns].ifindex) {
> > @@ -1768,6 +1779,7 @@ struct mlx5_dev_spawn_data {
> >  			list[ns].ibv_port = 1;
> >  			list[ns].ibv_dev = ibv_match[i];
> >  			list[ns].eth_dev = NULL;
> > +			list[ns].pci_dev = pci_dev;
> >  			list[ns].ifindex = 0;
> >  			if (nl_rdma >= 0)
> >  				list[ns].ifindex = mlx5_nl_ifindex diff --git
> > a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > 14c7f3c..8eb1019 100644
> > --- a/drivers/net/mlx5/mlx5.h
> > +++ b/drivers/net/mlx5/mlx5.h
> > @@ -257,6 +257,14 @@ struct mlx5_ibv_shared {
> >  	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> >  	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for
> secondary */
> >  	struct ibv_device_attr_ex device_attr; /* Device properties. */
> > +	struct rte_pci_device *pci_dev; /* Backend PCI device. */
> > +	struct { /* Global device shared L3 MR cache. */
> 
> The comment sounds redundant, and this structure has not only the global
> cache but also the MR lists.
OK, I'll try to make the comments more concise.

> 
> > +		uint32_t dev_gen; /* Generation number to flush local
> caches. */
> > +		rte_rwlock_t rwlock; /* MR Lock. */
> > +		struct mlx5_mr_btree cache; /* Global MR cache table. */
> > +		struct mlx5_mr_list mr_list; /* Registered MR list. */
> > +		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> > +	} mr;
> >  	/* Shared DV/DR flow data section. */
> >  	pthread_mutex_t dv_mutex; /* DV context mutex. */
> >  	uint32_t dv_refcnt; /* DV/DR data reference counter. */ @@ -323,13
> > +331,6 @@ struct mlx5_priv {
> >  	struct mlx5_flows ctrl_flows; /* Control flow rules. */
> >  	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
> >  	/* Flow counters. */
> > -	struct {
> > -		uint32_t dev_gen; /* Generation number to flush local
> caches. */
> > -		rte_rwlock_t rwlock; /* MR Lock. */
> > -		struct mlx5_mr_btree cache; /* Global MR cache table. */
> > -		struct mlx5_mr_list mr_list; /* Registered MR list. */
> > -		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> > -	} mr;
> >  	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
> >  	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
> >  	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */ diff
> > --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index
> > a3732d4..5140cc2 100644
> > --- a/drivers/net/mlx5/mlx5_mr.c
> > +++ b/drivers/net/mlx5/mlx5_mr.c
> > @@ -36,7 +36,7 @@ struct mr_update_mp_data {
> >
> >  /**
> >   * Expand B-tree table to a given size. Can't be called with holding
> > - * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
> > + * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
> >   *
> >   * @param bt
> >   *   Pointer to B-tree structure.
> > @@ -350,7 +350,7 @@ struct mr_update_mp_data {
> >  		n = mr_find_next_chunk(mr, &entry, n);
> >  		if (!entry.end)
> >  			break;
> > -		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
> > +		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
> >  			/*
> >  			 * Overflowed, but the global table cannot be
> expanded
> >  			 * because of deadlock.
> > @@ -382,7 +382,7 @@ struct mr_update_mp_data {
> >  	struct mlx5_mr *mr;
> >
> >  	/* Iterate all the existing MRs. */
> > -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> > +	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
> >  		unsigned int n;
> >
> >  		if (mr->ms_n == 0)
> > @@ -420,6 +420,7 @@ struct mr_update_mp_data {
> >  	      uintptr_t addr)
> >  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	uint16_t idx;
> >  	uint32_t lkey = UINT32_MAX;
> >  	struct mlx5_mr *mr;
> > @@ -430,10 +431,10 @@ struct mr_update_mp_data {
> >  	 * has to be searched by traversing the original MR list instead, which
> >  	 * is very slow path. Otherwise, the global cache is all inclusive.
> >  	 */
> > -	if (!unlikely(priv->mr.cache.overflow)) {
> > -		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> > +	if (!unlikely(sh->mr.cache.overflow)) {
> > +		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> >  		if (lkey != UINT32_MAX)
> > -			*entry = (*priv->mr.cache.table)[idx];
> > +			*entry = (*sh->mr.cache.table)[idx];
> >  	} else {
> >  		/* Falling back to the slowest path. */
> >  		mr = mr_lookup_dev_list(dev, entry, addr); @@ -468,13
> +469,12 @@
> > struct mr_update_mp_data {
> >  /**
> >   * Release resources of detached MR having no online entry.
> >   *
> > - * @param dev
> > - *   Pointer to Ethernet device.
> > + * @param sh
> > + *   Pointer to Ethernet device shared context.
> >   */
> >  static void
> > -mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
> > +mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
> >  {
> > -	struct mlx5_priv *priv = dev->data->dev_private;
> >  	struct mlx5_mr *mr_next;
> >  	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
> >
> > @@ -484,11 +484,11 @@ struct mr_update_mp_data {
> >  	 * MR can't be freed with holding the lock because rte_free() could
> call
> >  	 * memory free callback function. This will be a deadlock situation.
> >  	 */
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> >  	/* Detach the whole free list and release it after unlocking. */
> > -	free_list = priv->mr.mr_free_list;
> > -	LIST_INIT(&priv->mr.mr_free_list);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	free_list = sh->mr.mr_free_list;
> > +	LIST_INIT(&sh->mr.mr_free_list);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	/* Release resources. */
> >  	mr_next = LIST_FIRST(&free_list);
> >  	while (mr_next != NULL) {
> > @@ -548,12 +548,12 @@ struct mr_update_mp_data {
> >  		      dev->data->port_id, (void *)addr);
> >  		return UINT32_MAX;
> >  	}
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > +	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
> >  	/* Fill in output data. */
> >  	mr_lookup_dev(dev, entry, addr);
> >  	/* Lookup can't fail. */
> >  	assert(entry->lkey != UINT32_MAX);
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
> >  	DEBUG("port %u MR CREATED by primary process for %p:\n"
> >  	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
> >  	      dev->data->port_id, (void *)addr, @@ -582,6 +582,7 @@ struct
> > mr_update_mp_data {
> >  		       uintptr_t addr)
> >  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	struct mlx5_dev_config *config = &priv->config;
> >  	struct rte_mem_config *mcfg = rte_eal_get_configuration()-
> >mem_config;
> >  	const struct rte_memseg_list *msl;
> > @@ -602,12 +603,12 @@ struct mr_update_mp_data {
> >  		dev->data->port_id, (void *)addr);
> >  	/*
> >  	 * Release detached MRs if any. This can't be called with holding
> either
> > -	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list
> have
> > +	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
> >  	 * been detached by the memory free event but it couldn't be
> released
> >  	 * inside the callback due to deadlock. As a result, releasing resources
> >  	 * is quite opportunistic.
> >  	 */
> > -	mlx5_mr_garbage_collect(dev);
> > +	mlx5_mr_garbage_collect(sh);
> >  	/*
> >  	 * If enabled, find out a contiguous virtual address chunk in use, to
> >  	 * which the given address belongs, in order to register maximum
> range.
> > @@ -710,7 +711,7 @@ struct mr_update_mp_data {
> >  		goto alloc_resources;
> >  	}
> >  	assert(data.msl == data_re.msl);
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> >  	/*
> >  	 * Check the address is really missing. If other thread already created
> >  	 * one or it is not found due to overflow, abort and return.
> > @@ -721,10 +722,10 @@ struct mr_update_mp_data {
> >  		 * low-on-memory. Then, this entry will have to be searched
> >  		 * here again.
> >  		 */
> > -		mr_btree_insert(&priv->mr.cache, entry);
> > +		mr_btree_insert(&sh->mr.cache, entry);
> >  		DEBUG("port %u found MR for %p on final lookup, abort",
> >  		      dev->data->port_id, (void *)addr);
> > -		rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +		rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> >  		/*
> >  		 * Must be unlocked before calling rte_free() because @@ -
> 769,7
> > +770,7 @@ struct mr_update_mp_data {
> >  	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
> >  	 * through mlx5_alloc_verbs_buf().
> >  	 */
> > -	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start,
> len,
> > +	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
> >  				       IBV_ACCESS_LOCAL_WRITE);
> >  	if (mr->ibv_mr == NULL) {
> >  		DEBUG("port %u fail to create a verbs MR for address (%p)",
> @@
> > -779,7 +780,7 @@ struct mr_update_mp_data {
> >  	}
> >  	assert((uintptr_t)mr->ibv_mr->addr == data.start);
> >  	assert(mr->ibv_mr->length == len);
> > -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> > +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> >  	DEBUG("port %u MR CREATED (%p) for %p:\n"
> >  	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
> >  	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", @@ -
> 792,11
> > +793,11 @@ struct mr_update_mp_data {
> >  	mr_lookup_dev(dev, entry, addr);
> >  	/* Lookup can't fail. */
> >  	assert(entry->lkey != UINT32_MAX);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> >  	return entry->lkey;
> >  err_mrlock:
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  err_memlock:
> >  	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> >  err_nolock:
> > @@ -854,14 +855,15 @@ struct mr_update_mp_data {
> > mr_rebuild_dev_cache(struct rte_eth_dev *dev)  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	struct mlx5_mr *mr;
> >
> >  	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data-
> >port_id);
> >  	/* Flush cache to rebuild. */
> > -	priv->mr.cache.len = 1;
> > -	priv->mr.cache.overflow = 0;
> > +	sh->mr.cache.len = 1;
> > +	sh->mr.cache.overflow = 0;
> >  	/* Iterate all the existing MRs. */
> > -	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
> > +	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
> >  		if (mr_insert_dev_cache(dev, mr) < 0)
> >  			return;
> >  }
> > @@ -888,6 +890,7 @@ struct mr_update_mp_data {
> > mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr,
> > size_t len)  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	const struct rte_memseg_list *msl;
> >  	struct mlx5_mr *mr;
> >  	int ms_n;
> > @@ -901,7 +904,7 @@ struct mr_update_mp_data {
> >  	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
> >  	assert(len == RTE_ALIGN(len, msl->page_sz));
> >  	ms_n = len / msl->page_sz;
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> >  	/* Clear bits of freed memsegs from MR. */
> >  	for (i = 0; i < ms_n; ++i) {
> >  		const struct rte_memseg *ms;
> > @@ -928,7 +931,7 @@ struct mr_update_mp_data {
> >  		rte_bitmap_clear(mr->ms_bmp, pos);
> >  		if (--mr->ms_n == 0) {
> >  			LIST_REMOVE(mr, mr);
> > -			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> > +			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> >  			DEBUG("port %u remove MR(%p) from list",
> >  			      dev->data->port_id, (void *)mr);
> >  		}
> > @@ -949,12 +952,12 @@ struct mr_update_mp_data {
> >  		 * generation below) will be guaranteed to be seen by other
> core
> >  		 * before the core sees the newly allocated memory.
> >  		 */
> > -		++priv->mr.dev_gen;
> > +		++sh->mr.dev_gen;
> >  		DEBUG("broadcasting local cache flush, gen=%d",
> > -		      priv->mr.dev_gen);
> > +		      sh->mr.dev_gen);
> >  		rte_smp_wmb();
> >  	}
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  }
> >
> >  /**
> > @@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
> >  		   struct mlx5_mr_cache *entry, uintptr_t addr)  {
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
> >  	uint16_t idx;
> >  	uint32_t lkey;
> > @@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
> >  	if (unlikely(bt->len == bt->size))
> >  		mr_btree_expand(bt, bt->size << 1);
> >  	/* Look up in the global cache. */
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > -	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> > +	rte_rwlock_read_lock(&sh->mr.rwlock);
> > +	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> >  	if (lkey != UINT32_MAX) {
> >  		/* Found. */
> > -		*entry = (*priv->mr.cache.table)[idx];
> > -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +		*entry = (*sh->mr.cache.table)[idx];
> > +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  		/*
> >  		 * Update local cache. Even if it fails, return the found entry
> >  		 * to update top-half cache. Next time, this entry will be
> found @@
> > -1035,7 +1039,7 @@ struct mr_update_mp_data {
> >  		mr_btree_insert(bt, entry);
> >  		return lkey;
> >  	}
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  	/* First time to see the address? Create a new MR. */
> >  	lkey = mlx5_mr_create(dev, entry, addr);
> >  	/*
> > @@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
> >  	struct mr_update_mp_data *data = opaque;
> >  	struct rte_eth_dev *dev = data->dev;
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> > +	struct mlx5_ibv_shared *sh = priv->sh;
> >  	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
> >  	struct mlx5_mr *mr = NULL;
> >  	uintptr_t addr = (uintptr_t)memhdr->addr; @@ -1270,9 +1275,9 @@
> > struct mr_update_mp_data {
> >
> >  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> >  	/* If already registered, it should return. */
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > +	rte_rwlock_read_lock(&sh->mr.rwlock);
> >  	lkey = mr_lookup_dev(dev, &entry, addr);
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  	if (lkey != UINT32_MAX)
> >  		return;
> >  	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool
> (%s)",
> > @@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
> >  		data->ret = -1;
> >  		return;
> >  	}
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> > +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> >  	/* Insert to the global cache table. */
> >  	mr_insert_dev_cache(dev, mr);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	/* Insert to the local cache table */
> >  	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);  } @@ -1345,6 +1350,7
> @@
> > struct mr_update_mp_data {
> >  	struct rte_eth_dev *dev;
> >  	struct mlx5_mr *mr;
> >  	struct mlx5_priv *priv;
> > +	struct mlx5_ibv_shared *sh;
> >
> >  	dev = pci_dev_to_eth_dev(pdev);
> >  	if (!dev) {
> > @@ -1361,11 +1367,12 @@ struct mr_update_mp_data {
> >  		rte_errno = EINVAL;
> >  		return -1;
> >  	}
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> > +	sh = priv->sh;
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> > +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> >  	/* Insert to the global cache table. */
> >  	mr_insert_dev_cache(dev, mr);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	return 0;
> >  }
> >
> > @@ -1390,6 +1397,7 @@ struct mr_update_mp_data {  {
> >  	struct rte_eth_dev *dev;
> >  	struct mlx5_priv *priv;
> > +	struct mlx5_ibv_shared *sh;
> >  	struct mlx5_mr *mr;
> >  	struct mlx5_mr_cache entry;
> >
> > @@ -1401,10 +1409,11 @@ struct mr_update_mp_data {
> >  		return -1;
> >  	}
> >  	priv = dev->data->dev_private;
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > +	sh = priv->sh;
> > +	rte_rwlock_read_lock(&sh->mr.rwlock);
> >  	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
> >  	if (!mr) {
> > -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't
> registered "
> >  				 "to PCI device %p", (uintptr_t)addr,
> >  				 (void *)pdev);
> > @@ -1412,7 +1421,7 @@ struct mr_update_mp_data {
> >  		return -1;
> >  	}
> >  	LIST_REMOVE(mr, mr);
> > -	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> > +	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> >  	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
> >  	      (void *)mr);
> >  	mr_rebuild_dev_cache(dev);
> > @@ -1425,11 +1434,11 @@ struct mr_update_mp_data {
> >  	 * generation below) will be guaranteed to be seen by other core
> >  	 * before the core sees the newly allocated memory.
> >  	 */
> > -	++priv->mr.dev_gen;
> > +	++sh->mr.dev_gen;
> >  	DEBUG("broadcasting local cache flush, gen=%d",
> > -			priv->mr.dev_gen);
> > +			sh->mr.dev_gen);
> 
> Looks like the indentation was wrong even before your change.
> Please correct it.
OK.

> 
> Thanks,
> Yongseok
> 
Thank you for the review,
Best Regards,
Slava

> >  	rte_smp_wmb();
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  	return 0;
> >  }
> >
> > @@ -1550,25 +1559,24 @@ struct mr_update_mp_data {
> >  /**
> >   * Dump all the created MRs and the global cache entries.
> >   *
> > - * @param dev
> > - *   Pointer to Ethernet device.
> > + * @param sh
> > + *   Pointer to Ethernet device shared context.
> >   */
> >  void
> > -mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
> > +mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
> >  {
> >  #ifndef NDEBUG
> > -	struct mlx5_priv *priv = dev->data->dev_private;
> >  	struct mlx5_mr *mr;
> >  	int mr_n = 0;
> >  	int chunk_n = 0;
> >
> > -	rte_rwlock_read_lock(&priv->mr.rwlock);
> > +	rte_rwlock_read_lock(&sh->mr.rwlock);
> >  	/* Iterate all the existing MRs. */
> > -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> > +	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
> >  		unsigned int n;
> >
> > -		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u,
> ms_bmp_n = %u",
> > -		      dev->data->port_id, mr_n++,
> > +		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u,
> ms_bmp_n = %u",
> > +		      sh->ibdev_name, mr_n++,
> >  		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
> >  		      mr->ms_n, mr->ms_bmp_n);
> >  		if (mr->ms_n == 0)
> > @@ -1583,45 +1591,40 @@ struct mr_update_mp_data {
> >  			      chunk_n++, ret.start, ret.end);
> >  		}
> >  	}
> > -	DEBUG("port %u dumping global cache", dev->data->port_id);
> > -	mlx5_mr_btree_dump(&priv->mr.cache);
> > -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> > +	DEBUG("device %s dumping global cache", sh->ibdev_name);
> > +	mlx5_mr_btree_dump(&sh->mr.cache);
> > +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> >  #endif
> >  }
> >
> >  /**
> > - * Release all the created MRs and resources. Remove device from
> > memory callback
> > + * Release all the created MRs and resources for shared device context.
> >   * list.
> >   *
> > - * @param dev
> > - *   Pointer to Ethernet device.
> > + * @param sh
> > + *   Pointer to Ethernet device shared context.
> >   */
> >  void
> > -mlx5_mr_release(struct rte_eth_dev *dev)
> > +mlx5_mr_release(struct mlx5_ibv_shared *sh)
> >  {
> > -	struct mlx5_priv *priv = dev->data->dev_private;
> >  	struct mlx5_mr *mr_next;
> >
> > -	/* Remove from memory callback device list. */
> > -	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> > -	LIST_REMOVE(priv, mem_event_cb);
> > -	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> >  	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
> > -		mlx5_mr_dump_dev(dev);
> > -	rte_rwlock_write_lock(&priv->mr.rwlock);
> > +		mlx5_mr_dump_dev(sh);
> > +	rte_rwlock_write_lock(&sh->mr.rwlock);
> >  	/* Detach from MR list and move to free list. */
> > -	mr_next = LIST_FIRST(&priv->mr.mr_list);
> > +	mr_next = LIST_FIRST(&sh->mr.mr_list);
> >  	while (mr_next != NULL) {
> >  		struct mlx5_mr *mr = mr_next;
> >
> >  		mr_next = LIST_NEXT(mr, mr);
> >  		LIST_REMOVE(mr, mr);
> > -		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> > +		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> >  	}
> > -	LIST_INIT(&priv->mr.mr_list);
> > +	LIST_INIT(&sh->mr.mr_list);
> >  	/* Free global cache. */
> > -	mlx5_mr_btree_free(&priv->mr.cache);
> > -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> > +	mlx5_mr_btree_free(&sh->mr.cache);
> > +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> >  	/* Free all remaining MRs. */
> > -	mlx5_mr_garbage_collect(dev);
> > +	mlx5_mr_garbage_collect(sh);
> >  }
> > diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
> > index 786f6a3..89e89b7 100644
> > --- a/drivers/net/mlx5/mlx5_mr.h
> > +++ b/drivers/net/mlx5/mlx5_mr.h
> > @@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
> >  	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */  }
> > __rte_packed;
> >
> > +struct mlx5_ibv_shared;
> >  extern struct mlx5_dev_list  mlx5_mem_event_cb_list;  extern
> > rte_rwlock_t mlx5_mem_event_rwlock;
> >
> > @@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum
> rte_mem_event event_type, const void *addr,
> >  			  size_t len, void *arg);
> >  int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl
> *mr_ctrl,
> >  		      struct rte_mempool *mp);
> > -void mlx5_mr_release(struct rte_eth_dev *dev);
> > +void mlx5_mr_release(struct mlx5_ibv_shared *sh);
> >
> >  /* Debug purpose functions. */
> >  void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt); -void
> > mlx5_mr_dump_dev(struct rte_eth_dev *dev);
> > +void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
> >
> >  /**
> >   * Look up LKey from given lookup table by linear search. Firstly
> > look up the diff --git a/drivers/net/mlx5/mlx5_txq.c
> > b/drivers/net/mlx5/mlx5_txq.c index 9965b2b..ed82594 100644
> > --- a/drivers/net/mlx5/mlx5_txq.c
> > +++ b/drivers/net/mlx5/mlx5_txq.c
> > @@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
> >  		goto error;
> >  	}
> >  	/* Save pointer of global generation number to check memory event.
> */
> > -	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
> > +	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
> >  	assert(desc > MLX5_TX_COMP_THRESH);
> >  	tmpl->txq.offloads = conf->offloads |
> >  			     dev->data->dev_conf.txmode.offloads;
> > --
> > 1.8.3.1
> >

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [PATCH v2] net/mlx5: share Memory Regions for multiport device
  2019-04-12 15:45 [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device Viacheslav Ovsiienko
  2019-04-12 15:45 ` Viacheslav Ovsiienko
  2019-04-12 19:22 ` Yongseok Koh
@ 2019-04-24  8:44 ` Viacheslav Ovsiienko
  2019-04-24  8:44   ` Viacheslav Ovsiienko
                     ` (2 more replies)
  2019-04-24  8:48 ` [dpdk-dev] [PATCH] net/mlx5: update memory event callback for shared context Viacheslav Ovsiienko
  3 siblings, 3 replies; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-24  8:44 UTC (permalink / raw)
  To: shahafs; +Cc: dev, yskoh

The multiport Infiniband device support was introduced [1].
All active ports belonging to the same Infiniband device use the single
shared Infiniband context of that device and share the resources:
  - QPs are created within shared context
  - Verbs flows are also created with specifying port index
  - DV/DR resources
  - Protection Domain
  - Event Handlers

This patchset adds support for Memory Region sharing between
ports created on the base of a multiport Infiniband device.
The mlx5 datapath uses a layered cache subsystem for
allocating/releasing Memory Regions; only the lowest layer L3
is shared, for performance reasons.

[1] http://patches.dpdk.org/cover/51800/

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
v2:
  - indentation issues
  - comments cleanup

v1:
  http://patches.dpdk.org/patch/52723/

 drivers/net/mlx5/mlx5.c     |  40 +++++++----
 drivers/net/mlx5/mlx5.h     |  15 ++--
 drivers/net/mlx5/mlx5_mr.c  | 164 ++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5_mr.h  |   5 +-
 drivers/net/mlx5/mlx5_txq.c |   2 +-
 5 files changed, 121 insertions(+), 105 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1bb58b1..b563e0f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -147,6 +147,7 @@ struct mlx5_dev_spawn_data {
 	struct mlx5_switch_info info; /**< Switch information. */
 	struct ibv_device *ibv_dev; /**< Associated IB device. */
 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
 };
 
 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
@@ -225,6 +226,7 @@ struct mlx5_dev_spawn_data {
 		sizeof(sh->ibdev_name));
 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
 		sizeof(sh->ibdev_path));
+	sh->pci_dev = spawn->pci_dev;
 	pthread_mutex_init(&sh->intr_mutex, NULL);
 	/*
 	 * Setting port_id to max unallowed value means
@@ -239,6 +241,22 @@ struct mlx5_dev_spawn_data {
 		err = ENOMEM;
 		goto error;
 	}
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 *
+	 * At this point the device is not added to the memory
+	 * event list yet, context is just being created.
+	 */
+	err = mlx5_mr_btree_init(&sh->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 sh->pci_dev->device.numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
 exit:
 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
@@ -286,6 +304,8 @@ struct mlx5_dev_spawn_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	if (--sh->refcnt)
 		goto exit;
+	/* Release created Memory Regions. */
+	mlx5_mr_release(sh);
 	LIST_REMOVE(sh, next);
 	/*
 	 *  Ensure there is no async event handler installed.
@@ -651,7 +671,10 @@ struct mlx5_dev_spawn_data {
 	}
 	mlx5_proc_priv_uninit(dev);
 	mlx5_mprq_free_mp(dev);
-	mlx5_mr_release(dev);
+	/* Remove from memory callback device list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_REMOVE(priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
@@ -1548,19 +1571,6 @@ struct mlx5_dev_spawn_data {
 		goto error;
 	}
 	priv->config.flow_prio = err;
-	/*
-	 * Once the device is added to the list of memory event
-	 * callback, its global MR cache table cannot be expanded
-	 * on the fly because of deadlock. If it overflows, lookup
-	 * should be done by searching MR list linearly, which is slow.
-	 */
-	err = mlx5_mr_btree_init(&priv->mr.cache,
-				 MLX5_MR_BTREE_CACHE_N * 2,
-				 eth_dev->device->numa_node);
-	if (err) {
-		err = rte_errno;
-		goto error;
-	}
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
@@ -1757,6 +1767,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = i;
 			list[ns].ibv_dev = ibv_match[0];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = mlx5_nl_ifindex
 					(nl_rdma, list[ns].ibv_dev->name, i);
 			if (!list[ns].ifindex) {
@@ -1823,6 +1834,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = 1;
 			list[ns].ibv_dev = ibv_match[i];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = 0;
 			if (nl_rdma >= 0)
 				list[ns].ifindex = mlx5_nl_ifindex
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 0a6d7f1..2575732 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -275,6 +275,14 @@ struct mlx5_ibv_shared {
 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
+	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	struct {
+		uint32_t dev_gen; /* Generation number to flush local caches. */
+		rte_rwlock_t rwlock; /* MR Lock. */
+		struct mlx5_mr_btree cache; /* Global MR cache table. */
+		struct mlx5_mr_list mr_list; /* Registered MR list. */
+		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
+	} mr;
 	/* Shared DV/DR flow data section. */
 	pthread_mutex_t dv_mutex; /* DV context mutex. */
 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
@@ -347,13 +355,6 @@ struct mlx5_priv {
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
 	/* Flow counters. */
-	struct {
-		uint32_t dev_gen; /* Generation number to flush local caches. */
-		rte_rwlock_t rwlock; /* MR Lock. */
-		struct mlx5_mr_btree cache; /* Global MR cache table. */
-		struct mlx5_mr_list mr_list; /* Registered MR list. */
-		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
-	} mr;
 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index e4957a4..a7a63b1 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -36,7 +36,7 @@ struct mr_update_mp_data {
 
 /**
  * Expand B-tree table to a given size. Can't be called with holding
- * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
+ * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
  *
  * @param bt
  *   Pointer to B-tree structure.
@@ -350,7 +350,7 @@ struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -382,7 +382,7 @@ struct mr_update_mp_data {
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -420,6 +420,7 @@ struct mr_update_mp_data {
 	      uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -430,10 +431,10 @@ struct mr_update_mp_data {
 	 * has to be searched by traversing the original MR list instead, which
 	 * is very slow path. Otherwise, the global cache is all inclusive.
 	 */
-	if (!unlikely(priv->mr.cache.overflow)) {
-		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	if (!unlikely(sh->mr.cache.overflow)) {
+		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 		if (lkey != UINT32_MAX)
-			*entry = (*priv->mr.cache.table)[idx];
+			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
 		mr = mr_lookup_dev_list(dev, entry, addr);
@@ -468,13 +469,12 @@ struct mr_update_mp_data {
 /**
  * Release resources of detached MR having no online entry.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
+mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 
@@ -484,11 +484,11 @@ struct mr_update_mp_data {
 	 * MR can't be freed with holding the lock because rte_free() could call
 	 * memory free callback function. This will be a deadlock situation.
 	 */
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach the whole free list and release it after unlocking. */
-	free_list = priv->mr.mr_free_list;
-	LIST_INIT(&priv->mr.mr_free_list);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	free_list = sh->mr.mr_free_list;
+	LIST_INIT(&sh->mr.mr_free_list);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Release resources. */
 	mr_next = LIST_FIRST(&free_list);
 	while (mr_next != NULL) {
@@ -548,12 +548,12 @@ struct mr_update_mp_data {
 		      dev->data->port_id, (void *)addr);
 		return UINT32_MAX;
 	}
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
 	DEBUG("port %u MR CREATED by primary process for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
 	      dev->data->port_id, (void *)addr,
@@ -582,6 +582,7 @@ struct mr_update_mp_data {
 		       uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_dev_config *config = &priv->config;
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	const struct rte_memseg_list *msl;
@@ -602,12 +603,12 @@ struct mr_update_mp_data {
 		dev->data->port_id, (void *)addr);
 	/*
 	 * Release detached MRs if any. This can't be called with holding either
-	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
+	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
 	 * been detached by the memory free event but it couldn't be released
 	 * inside the callback due to deadlock. As a result, releasing resources
 	 * is quite opportunistic.
 	 */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 	/*
 	 * If enabled, find out a contiguous virtual address chunk in use, to
 	 * which the given address belongs, in order to register maximum range.
@@ -710,7 +711,7 @@ struct mr_update_mp_data {
 		goto alloc_resources;
 	}
 	assert(data.msl == data_re.msl);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/*
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
@@ -721,10 +722,10 @@ struct mr_update_mp_data {
 		 * low-on-memory. Then, this entry will have to be searched
 		 * here again.
 		 */
-		mr_btree_insert(&priv->mr.cache, entry);
+		mr_btree_insert(&sh->mr.cache, entry);
 		DEBUG("port %u found MR for %p on final lookup, abort",
 		      dev->data->port_id, (void *)addr);
-		rte_rwlock_write_unlock(&priv->mr.rwlock);
+		rte_rwlock_write_unlock(&sh->mr.rwlock);
 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 		/*
 		 * Must be unlocked before calling rte_free() because
@@ -769,7 +770,7 @@ struct mr_update_mp_data {
 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
 	 * through mlx5_alloc_verbs_buf().
 	 */
-	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
+	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
 				       IBV_ACCESS_LOCAL_WRITE);
 	if (mr->ibv_mr == NULL) {
 		DEBUG("port %u fail to create a verbs MR for address (%p)",
@@ -779,7 +780,7 @@ struct mr_update_mp_data {
 	}
 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
 	assert(mr->ibv_mr->length == len);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	DEBUG("port %u MR CREATED (%p) for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
@@ -792,11 +793,11 @@ struct mr_update_mp_data {
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 	return entry->lkey;
 err_mrlock:
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 err_memlock:
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 err_nolock:
@@ -854,14 +855,15 @@ struct mr_update_mp_data {
 mr_rebuild_dev_cache(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
 	/* Flush cache to rebuild. */
-	priv->mr.cache.len = 1;
-	priv->mr.cache.overflow = 0;
+	sh->mr.cache.len = 1;
+	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
 		if (mr_insert_dev_cache(dev, mr) < 0)
 			return;
 }
@@ -888,6 +890,7 @@ struct mr_update_mp_data {
 mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
@@ -901,7 +904,7 @@ struct mr_update_mp_data {
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
 	assert(len == RTE_ALIGN(len, msl->page_sz));
 	ms_n = len / msl->page_sz;
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Clear bits of freed memsegs from MR. */
 	for (i = 0; i < ms_n; ++i) {
 		const struct rte_memseg *ms;
@@ -928,7 +931,7 @@ struct mr_update_mp_data {
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
-			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 			DEBUG("port %u remove MR(%p) from list",
 			      dev->data->port_id, (void *)mr);
 		}
@@ -949,12 +952,12 @@ struct mr_update_mp_data {
 		 * generation below) will be guaranteed to be seen by other core
 		 * before the core sees the newly allocated memory.
 		 */
-		++priv->mr.dev_gen;
+		++sh->mr.dev_gen;
 		DEBUG("broadcasting local cache flush, gen=%d",
-		      priv->mr.dev_gen);
+		      sh->mr.dev_gen);
 		rte_smp_wmb();
 	}
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 }
 
 /**
@@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
 		   struct mlx5_mr_cache *entry, uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
 	uint16_t idx;
 	uint32_t lkey;
@@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
 	if (unlikely(bt->len == bt->size))
 		mr_btree_expand(bt, bt->size << 1);
 	/* Look up in the global cache. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
-	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
+	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 	if (lkey != UINT32_MAX) {
 		/* Found. */
-		*entry = (*priv->mr.cache.table)[idx];
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		*entry = (*sh->mr.cache.table)[idx];
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		/*
 		 * Update local cache. Even if it fails, return the found entry
 		 * to update top-half cache. Next time, this entry will be found
@@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
 		mr_btree_insert(bt, entry);
 		return lkey;
 	}
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	/* First time to see the address? Create a new MR. */
 	lkey = mlx5_mr_create(dev, entry, addr);
 	/*
@@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
 	struct mr_update_mp_data *data = opaque;
 	struct rte_eth_dev *dev = data->dev;
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
 	struct mlx5_mr *mr = NULL;
 	uintptr_t addr = (uintptr_t)memhdr->addr;
@@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
 
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	lkey = mr_lookup_dev(dev, &entry, addr);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
@@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
 		data->ret = -1;
 		return;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
 }
@@ -1339,6 +1344,7 @@ struct mr_update_mp_data {
 	struct rte_eth_dev *dev;
 	struct mlx5_mr *mr;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 
 	dev = pci_dev_to_eth_dev(pdev);
 	if (!dev) {
@@ -1355,11 +1361,12 @@ struct mr_update_mp_data {
 		rte_errno = EINVAL;
 		return -1;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	sh = priv->sh;
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1384,6 +1391,7 @@ struct mr_update_mp_data {
 {
 	struct rte_eth_dev *dev;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_mr *mr;
 	struct mlx5_mr_cache entry;
 
@@ -1395,10 +1403,11 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	priv = dev->data->dev_private;
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	sh = priv->sh;
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
 	if (!mr) {
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
 				 "to PCI device %p", (uintptr_t)addr,
 				 (void *)pdev);
@@ -1406,7 +1415,7 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	LIST_REMOVE(mr, mr);
-	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
 	mr_rebuild_dev_cache(dev);
@@ -1419,11 +1428,10 @@ struct mr_update_mp_data {
 	 * generation below) will be guaranteed to be seen by other core
 	 * before the core sees the newly allocated memory.
 	 */
-	++priv->mr.dev_gen;
-	DEBUG("broadcasting local cache flush, gen=%d",
-			priv->mr.dev_gen);
+	++sh->mr.dev_gen;
+	DEBUG("broadcasting local cache flush, gen=%d", sh->mr.dev_gen);
 	rte_smp_wmb();
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1544,25 +1552,24 @@ struct mr_update_mp_data {
 /**
  * Dump all the created MRs and the global cache entries.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
+mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
 {
 #ifndef NDEBUG
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 	int mr_n = 0;
 	int chunk_n = 0;
 
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
-		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
-		      dev->data->port_id, mr_n++,
+		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
+		      sh->ibdev_name, mr_n++,
 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
 		      mr->ms_n, mr->ms_bmp_n);
 		if (mr->ms_n == 0)
@@ -1577,45 +1584,40 @@ struct mr_update_mp_data {
 			      chunk_n++, ret.start, ret.end);
 		}
 	}
-	DEBUG("port %u dumping global cache", dev->data->port_id);
-	mlx5_mr_btree_dump(&priv->mr.cache);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	DEBUG("device %s dumping global cache", sh->ibdev_name);
+	mlx5_mr_btree_dump(&sh->mr.cache);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 #endif
 }
 
 /**
- * Release all the created MRs and resources. Remove device from memory callback
+ * Release all the created MRs and resources for shared device context.
  * list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_release(struct rte_eth_dev *dev)
+mlx5_mr_release(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 
-	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
-		mlx5_mr_dump_dev(dev);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+		mlx5_mr_dump_dev(sh);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach from MR list and move to free list. */
-	mr_next = LIST_FIRST(&priv->mr.mr_list);
+	mr_next = LIST_FIRST(&sh->mr.mr_list);
 	while (mr_next != NULL) {
 		struct mlx5_mr *mr = mr_next;
 
 		mr_next = LIST_NEXT(mr, mr);
 		LIST_REMOVE(mr, mr);
-		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	}
-	LIST_INIT(&priv->mr.mr_list);
+	LIST_INIT(&sh->mr.mr_list);
 	/* Free global cache. */
-	mlx5_mr_btree_free(&priv->mr.cache);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	mlx5_mr_btree_free(&sh->mr.cache);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Free all remaining MRs. */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 }
diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
index 786f6a3..89e89b7 100644
--- a/drivers/net/mlx5/mlx5_mr.h
+++ b/drivers/net/mlx5/mlx5_mr.h
@@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
 } __rte_packed;
 
+struct mlx5_ibv_shared;
 extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
 extern rte_rwlock_t mlx5_mem_event_rwlock;
 
@@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 			  size_t len, void *arg);
 int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
 		      struct rte_mempool *mp);
-void mlx5_mr_release(struct rte_eth_dev *dev);
+void mlx5_mr_release(struct mlx5_ibv_shared *sh);
 
 /* Debug purpose functions. */
 void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
-void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
 
 /**
  * Look up LKey from given lookup table by linear search. Firstly look up the
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index dbe074f..4d55fd4 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
 		goto error;
 	}
 	/* Save pointer of global generation number to check memory event. */
-	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
+	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
 	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl->txq.offloads = conf->offloads |
 			     dev->data->dev_conf.txmode.offloads;
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 20+ messages in thread
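
A note on the dev_gen handshake used in the hunks above: the control path bumps the
shared generation number under the MR write lock and issues rte_smp_wmb(), while each
queue keeps a pointer to it (mr_ctrl.dev_gen_ptr) and flushes its local caches when the
value changes. The following is a minimal, self-contained sketch of that pattern only;
the names (shared_ctx, queue_mr_ctrl, broadcast_cache_flush, local_cache_is_stale) are
illustrative assumptions, not driver symbols, and C11 atomics stand in for the driver's
rte_smp_wmb().

#include <stdint.h>
#include <stdatomic.h>

/* Simplified stand-ins for the shared IB context and a per-queue MR control. */
struct shared_ctx {
	atomic_uint_fast32_t dev_gen; /* bumped whenever the global MR cache changes */
};

struct queue_mr_ctrl {
	uint_fast32_t cur_gen;             /* generation this queue has seen */
	atomic_uint_fast32_t *dev_gen_ptr; /* points at shared_ctx.dev_gen */
	/* ... per-queue linear cache and B-tree would live here ... */
};

/* Control path: after rebuilding the global cache, broadcast the change.
 * Release ordering plays the role of rte_smp_wmb() in the driver: the
 * cache update must become visible before the new generation number. */
static void broadcast_cache_flush(struct shared_ctx *sh)
{
	atomic_fetch_add_explicit(&sh->dev_gen, 1, memory_order_release);
}

/* Datapath: one cheap load per check; flush local caches only on change. */
static int local_cache_is_stale(struct queue_mr_ctrl *ctrl)
{
	uint_fast32_t gen = atomic_load_explicit(ctrl->dev_gen_ptr,
						 memory_order_acquire);

	if (gen == ctrl->cur_gen)
		return 0;        /* nothing changed, keep the local cache */
	ctrl->cur_gen = gen;     /* record it; the caller flushes its caches */
	return 1;
}

The cheap equality check keeps the per-packet cost to a single load; the expensive
rebuild of the per-queue caches happens only after a registration change.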

* [dpdk-dev] [PATCH v2] net/mlx5: share Memory Regions for multiport device
  2019-04-24  8:44 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
@ 2019-04-24  8:44   ` Viacheslav Ovsiienko
  2019-04-24  8:52   ` Yongseok Koh
  2019-04-27  4:32   ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Viacheslav Ovsiienko
  2 siblings, 0 replies; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-24  8:44 UTC (permalink / raw)
  To: shahafs; +Cc: dev, yskoh

The multiport Infiniband device support was introduced [1].
All active ports belonging to the same Infiniband device use the single
shared Infiniband context of that device and share its resources:
  - QPs are created within the shared context
  - Verbs flows are also created with the port index specified
  - DV/DR resources
  - Protection Domain
  - Event Handlers

This patchset adds support for sharing Memory Regions between the
ports created on the base of a multiport Infiniband device.
The mlx5 datapath uses a layered cache subsystem for
allocating/releasing Memory Regions; only the lowest layer, L3,
is shared, for performance reasons.

[1] http://patches.dpdk.org/cover/51800/

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
v2:
  - indentation issues
  - comments cleanup

v1:
  http://patches.dpdk.org/patch/52723/

 drivers/net/mlx5/mlx5.c     |  40 +++++++----
 drivers/net/mlx5/mlx5.h     |  15 ++--
 drivers/net/mlx5/mlx5_mr.c  | 164 ++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5_mr.h  |   5 +-
 drivers/net/mlx5/mlx5_txq.c |   2 +-
 5 files changed, 121 insertions(+), 105 deletions(-)
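
As a reading aid for the diff below, the lookup order of the layered cache mentioned
in the commit message can be sketched as follows: lookups go from the cheapest
per-queue level to the shared global level, and only the last level moves into the
shared Infiniband context with this patch. The sketch is illustrative only; the helper
names (local_linear_lookup, local_btree_lookup, global_lookup_or_create) are
assumptions and do not match driver symbols.

#include <stdint.h>

#define LKEY_INVALID UINT32_MAX

/* Stubs standing in for the real per-level caches; each returns
 * LKEY_INVALID on a miss. */
static uint32_t
local_linear_lookup(void *mr_ctrl, uintptr_t addr)   /* L1: per queue, MRU array */
{ (void)mr_ctrl; (void)addr; return LKEY_INVALID; }

static uint32_t
local_btree_lookup(void *mr_ctrl, uintptr_t addr)    /* L2: per queue, B-tree */
{ (void)mr_ctrl; (void)addr; return LKEY_INVALID; }

static uint32_t
global_lookup_or_create(void *sh, uintptr_t addr)    /* L3: per IB context */
{ (void)sh; (void)addr; return LKEY_INVALID; }

/*
 * Datapath address-to-lkey translation: cheapest level first. After this
 * patch only the last level (global cache, MR lists, dev_gen and the
 * rwlock protecting them) is shared by all ports of one IB device.
 */
static uint32_t
addr2lkey(void *mr_ctrl, void *sh, uintptr_t addr)
{
	uint32_t lkey;

	lkey = local_linear_lookup(mr_ctrl, addr);
	if (lkey != LKEY_INVALID)
		return lkey;
	lkey = local_btree_lookup(mr_ctrl, addr);
	if (lkey != LKEY_INVALID)
		return lkey;
	/* Slow path: takes the shared rwlock and may register a new MR. */
	return global_lookup_or_create(sh, addr);
}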

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1bb58b1..b563e0f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -147,6 +147,7 @@ struct mlx5_dev_spawn_data {
 	struct mlx5_switch_info info; /**< Switch information. */
 	struct ibv_device *ibv_dev; /**< Associated IB device. */
 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
 };
 
 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
@@ -225,6 +226,7 @@ struct mlx5_dev_spawn_data {
 		sizeof(sh->ibdev_name));
 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
 		sizeof(sh->ibdev_path));
+	sh->pci_dev = spawn->pci_dev;
 	pthread_mutex_init(&sh->intr_mutex, NULL);
 	/*
 	 * Setting port_id to max unallowed value means
@@ -239,6 +241,22 @@ struct mlx5_dev_spawn_data {
 		err = ENOMEM;
 		goto error;
 	}
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 *
+	 * At this point the device is not added to the memory
+	 * event list yet, context is just being created.
+	 */
+	err = mlx5_mr_btree_init(&sh->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 sh->pci_dev->device.numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
 exit:
 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
@@ -286,6 +304,8 @@ struct mlx5_dev_spawn_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	if (--sh->refcnt)
 		goto exit;
+	/* Release created Memory Regions. */
+	mlx5_mr_release(sh);
 	LIST_REMOVE(sh, next);
 	/*
 	 *  Ensure there is no async event handler installed.
@@ -651,7 +671,10 @@ struct mlx5_dev_spawn_data {
 	}
 	mlx5_proc_priv_uninit(dev);
 	mlx5_mprq_free_mp(dev);
-	mlx5_mr_release(dev);
+	/* Remove from memory callback device list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_REMOVE(priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
@@ -1548,19 +1571,6 @@ struct mlx5_dev_spawn_data {
 		goto error;
 	}
 	priv->config.flow_prio = err;
-	/*
-	 * Once the device is added to the list of memory event
-	 * callback, its global MR cache table cannot be expanded
-	 * on the fly because of deadlock. If it overflows, lookup
-	 * should be done by searching MR list linearly, which is slow.
-	 */
-	err = mlx5_mr_btree_init(&priv->mr.cache,
-				 MLX5_MR_BTREE_CACHE_N * 2,
-				 eth_dev->device->numa_node);
-	if (err) {
-		err = rte_errno;
-		goto error;
-	}
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
@@ -1757,6 +1767,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = i;
 			list[ns].ibv_dev = ibv_match[0];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = mlx5_nl_ifindex
 					(nl_rdma, list[ns].ibv_dev->name, i);
 			if (!list[ns].ifindex) {
@@ -1823,6 +1834,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = 1;
 			list[ns].ibv_dev = ibv_match[i];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = 0;
 			if (nl_rdma >= 0)
 				list[ns].ifindex = mlx5_nl_ifindex
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 0a6d7f1..2575732 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -275,6 +275,14 @@ struct mlx5_ibv_shared {
 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
+	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	struct {
+		uint32_t dev_gen; /* Generation number to flush local caches. */
+		rte_rwlock_t rwlock; /* MR Lock. */
+		struct mlx5_mr_btree cache; /* Global MR cache table. */
+		struct mlx5_mr_list mr_list; /* Registered MR list. */
+		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
+	} mr;
 	/* Shared DV/DR flow data section. */
 	pthread_mutex_t dv_mutex; /* DV context mutex. */
 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
@@ -347,13 +355,6 @@ struct mlx5_priv {
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
 	/* Flow counters. */
-	struct {
-		uint32_t dev_gen; /* Generation number to flush local caches. */
-		rte_rwlock_t rwlock; /* MR Lock. */
-		struct mlx5_mr_btree cache; /* Global MR cache table. */
-		struct mlx5_mr_list mr_list; /* Registered MR list. */
-		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
-	} mr;
 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index e4957a4..a7a63b1 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -36,7 +36,7 @@ struct mr_update_mp_data {
 
 /**
  * Expand B-tree table to a given size. Can't be called with holding
- * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
+ * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
  *
  * @param bt
  *   Pointer to B-tree structure.
@@ -350,7 +350,7 @@ struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -382,7 +382,7 @@ struct mr_update_mp_data {
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -420,6 +420,7 @@ struct mr_update_mp_data {
 	      uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -430,10 +431,10 @@ struct mr_update_mp_data {
 	 * has to be searched by traversing the original MR list instead, which
 	 * is very slow path. Otherwise, the global cache is all inclusive.
 	 */
-	if (!unlikely(priv->mr.cache.overflow)) {
-		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	if (!unlikely(sh->mr.cache.overflow)) {
+		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 		if (lkey != UINT32_MAX)
-			*entry = (*priv->mr.cache.table)[idx];
+			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
 		mr = mr_lookup_dev_list(dev, entry, addr);
@@ -468,13 +469,12 @@ struct mr_update_mp_data {
 /**
  * Release resources of detached MR having no online entry.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
+mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 
@@ -484,11 +484,11 @@ struct mr_update_mp_data {
 	 * MR can't be freed with holding the lock because rte_free() could call
 	 * memory free callback function. This will be a deadlock situation.
 	 */
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach the whole free list and release it after unlocking. */
-	free_list = priv->mr.mr_free_list;
-	LIST_INIT(&priv->mr.mr_free_list);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	free_list = sh->mr.mr_free_list;
+	LIST_INIT(&sh->mr.mr_free_list);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Release resources. */
 	mr_next = LIST_FIRST(&free_list);
 	while (mr_next != NULL) {
@@ -548,12 +548,12 @@ struct mr_update_mp_data {
 		      dev->data->port_id, (void *)addr);
 		return UINT32_MAX;
 	}
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
 	DEBUG("port %u MR CREATED by primary process for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
 	      dev->data->port_id, (void *)addr,
@@ -582,6 +582,7 @@ struct mr_update_mp_data {
 		       uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_dev_config *config = &priv->config;
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	const struct rte_memseg_list *msl;
@@ -602,12 +603,12 @@ struct mr_update_mp_data {
 		dev->data->port_id, (void *)addr);
 	/*
 	 * Release detached MRs if any. This can't be called with holding either
-	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
+	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
 	 * been detached by the memory free event but it couldn't be released
 	 * inside the callback due to deadlock. As a result, releasing resources
 	 * is quite opportunistic.
 	 */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 	/*
 	 * If enabled, find out a contiguous virtual address chunk in use, to
 	 * which the given address belongs, in order to register maximum range.
@@ -710,7 +711,7 @@ struct mr_update_mp_data {
 		goto alloc_resources;
 	}
 	assert(data.msl == data_re.msl);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/*
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
@@ -721,10 +722,10 @@ struct mr_update_mp_data {
 		 * low-on-memory. Then, this entry will have to be searched
 		 * here again.
 		 */
-		mr_btree_insert(&priv->mr.cache, entry);
+		mr_btree_insert(&sh->mr.cache, entry);
 		DEBUG("port %u found MR for %p on final lookup, abort",
 		      dev->data->port_id, (void *)addr);
-		rte_rwlock_write_unlock(&priv->mr.rwlock);
+		rte_rwlock_write_unlock(&sh->mr.rwlock);
 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 		/*
 		 * Must be unlocked before calling rte_free() because
@@ -769,7 +770,7 @@ struct mr_update_mp_data {
 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
 	 * through mlx5_alloc_verbs_buf().
 	 */
-	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
+	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
 				       IBV_ACCESS_LOCAL_WRITE);
 	if (mr->ibv_mr == NULL) {
 		DEBUG("port %u fail to create a verbs MR for address (%p)",
@@ -779,7 +780,7 @@ struct mr_update_mp_data {
 	}
 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
 	assert(mr->ibv_mr->length == len);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	DEBUG("port %u MR CREATED (%p) for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
@@ -792,11 +793,11 @@ struct mr_update_mp_data {
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 	return entry->lkey;
 err_mrlock:
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 err_memlock:
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 err_nolock:
@@ -854,14 +855,15 @@ struct mr_update_mp_data {
 mr_rebuild_dev_cache(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
 	/* Flush cache to rebuild. */
-	priv->mr.cache.len = 1;
-	priv->mr.cache.overflow = 0;
+	sh->mr.cache.len = 1;
+	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
 		if (mr_insert_dev_cache(dev, mr) < 0)
 			return;
 }
@@ -888,6 +890,7 @@ struct mr_update_mp_data {
 mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
@@ -901,7 +904,7 @@ struct mr_update_mp_data {
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
 	assert(len == RTE_ALIGN(len, msl->page_sz));
 	ms_n = len / msl->page_sz;
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Clear bits of freed memsegs from MR. */
 	for (i = 0; i < ms_n; ++i) {
 		const struct rte_memseg *ms;
@@ -928,7 +931,7 @@ struct mr_update_mp_data {
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
-			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 			DEBUG("port %u remove MR(%p) from list",
 			      dev->data->port_id, (void *)mr);
 		}
@@ -949,12 +952,12 @@ struct mr_update_mp_data {
 		 * generation below) will be guaranteed to be seen by other core
 		 * before the core sees the newly allocated memory.
 		 */
-		++priv->mr.dev_gen;
+		++sh->mr.dev_gen;
 		DEBUG("broadcasting local cache flush, gen=%d",
-		      priv->mr.dev_gen);
+		      sh->mr.dev_gen);
 		rte_smp_wmb();
 	}
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 }
 
 /**
@@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
 		   struct mlx5_mr_cache *entry, uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
 	uint16_t idx;
 	uint32_t lkey;
@@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
 	if (unlikely(bt->len == bt->size))
 		mr_btree_expand(bt, bt->size << 1);
 	/* Look up in the global cache. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
-	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
+	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 	if (lkey != UINT32_MAX) {
 		/* Found. */
-		*entry = (*priv->mr.cache.table)[idx];
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		*entry = (*sh->mr.cache.table)[idx];
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		/*
 		 * Update local cache. Even if it fails, return the found entry
 		 * to update top-half cache. Next time, this entry will be found
@@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
 		mr_btree_insert(bt, entry);
 		return lkey;
 	}
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	/* First time to see the address? Create a new MR. */
 	lkey = mlx5_mr_create(dev, entry, addr);
 	/*
@@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
 	struct mr_update_mp_data *data = opaque;
 	struct rte_eth_dev *dev = data->dev;
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
 	struct mlx5_mr *mr = NULL;
 	uintptr_t addr = (uintptr_t)memhdr->addr;
@@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
 
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	lkey = mr_lookup_dev(dev, &entry, addr);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
@@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
 		data->ret = -1;
 		return;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
 }
@@ -1339,6 +1344,7 @@ struct mr_update_mp_data {
 	struct rte_eth_dev *dev;
 	struct mlx5_mr *mr;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 
 	dev = pci_dev_to_eth_dev(pdev);
 	if (!dev) {
@@ -1355,11 +1361,12 @@ struct mr_update_mp_data {
 		rte_errno = EINVAL;
 		return -1;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	sh = priv->sh;
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1384,6 +1391,7 @@ struct mr_update_mp_data {
 {
 	struct rte_eth_dev *dev;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_mr *mr;
 	struct mlx5_mr_cache entry;
 
@@ -1395,10 +1403,11 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	priv = dev->data->dev_private;
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	sh = priv->sh;
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
 	if (!mr) {
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
 				 "to PCI device %p", (uintptr_t)addr,
 				 (void *)pdev);
@@ -1406,7 +1415,7 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	LIST_REMOVE(mr, mr);
-	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
 	mr_rebuild_dev_cache(dev);
@@ -1419,11 +1428,10 @@ struct mr_update_mp_data {
 	 * generation below) will be guaranteed to be seen by other core
 	 * before the core sees the newly allocated memory.
 	 */
-	++priv->mr.dev_gen;
-	DEBUG("broadcasting local cache flush, gen=%d",
-			priv->mr.dev_gen);
+	++sh->mr.dev_gen;
+	DEBUG("broadcasting local cache flush, gen=%d", sh->mr.dev_gen);
 	rte_smp_wmb();
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1544,25 +1552,24 @@ struct mr_update_mp_data {
 /**
  * Dump all the created MRs and the global cache entries.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
+mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
 {
 #ifndef NDEBUG
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 	int mr_n = 0;
 	int chunk_n = 0;
 
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
-		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
-		      dev->data->port_id, mr_n++,
+		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
+		      sh->ibdev_name, mr_n++,
 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
 		      mr->ms_n, mr->ms_bmp_n);
 		if (mr->ms_n == 0)
@@ -1577,45 +1584,40 @@ struct mr_update_mp_data {
 			      chunk_n++, ret.start, ret.end);
 		}
 	}
-	DEBUG("port %u dumping global cache", dev->data->port_id);
-	mlx5_mr_btree_dump(&priv->mr.cache);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	DEBUG("device %s dumping global cache", sh->ibdev_name);
+	mlx5_mr_btree_dump(&sh->mr.cache);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 #endif
 }
 
 /**
- * Release all the created MRs and resources. Remove device from memory callback
+ * Release all the created MRs and resources for shared device context.
  * list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_release(struct rte_eth_dev *dev)
+mlx5_mr_release(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 
-	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
-		mlx5_mr_dump_dev(dev);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+		mlx5_mr_dump_dev(sh);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach from MR list and move to free list. */
-	mr_next = LIST_FIRST(&priv->mr.mr_list);
+	mr_next = LIST_FIRST(&sh->mr.mr_list);
 	while (mr_next != NULL) {
 		struct mlx5_mr *mr = mr_next;
 
 		mr_next = LIST_NEXT(mr, mr);
 		LIST_REMOVE(mr, mr);
-		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	}
-	LIST_INIT(&priv->mr.mr_list);
+	LIST_INIT(&sh->mr.mr_list);
 	/* Free global cache. */
-	mlx5_mr_btree_free(&priv->mr.cache);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	mlx5_mr_btree_free(&sh->mr.cache);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Free all remaining MRs. */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 }
diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
index 786f6a3..89e89b7 100644
--- a/drivers/net/mlx5/mlx5_mr.h
+++ b/drivers/net/mlx5/mlx5_mr.h
@@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
 } __rte_packed;
 
+struct mlx5_ibv_shared;
 extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
 extern rte_rwlock_t mlx5_mem_event_rwlock;
 
@@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 			  size_t len, void *arg);
 int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
 		      struct rte_mempool *mp);
-void mlx5_mr_release(struct rte_eth_dev *dev);
+void mlx5_mr_release(struct mlx5_ibv_shared *sh);
 
 /* Debug purpose functions. */
 void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
-void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
 
 /**
  * Look up LKey from given lookup table by linear search. Firstly look up the
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index dbe074f..4d55fd4 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
 		goto error;
 	}
 	/* Save pointer of global generation number to check memory event. */
-	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
+	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
 	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl->txq.offloads = conf->offloads |
 			     dev->data->dev_conf.txmode.offloads;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 20+ messages in thread
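
A side note on why the global MR cache above is pre-sized at context creation
(mlx5_mr_btree_init with MLX5_MR_BTREE_CACHE_N * 2) and never grown afterwards:
reallocating it could re-enter the memory callbacks that take the very same lock.
The sketch below shows only that overflow-instead-of-grow idea, with invented names
(fixed_cache, cache_insert); it is not the driver's B-tree code.

#include <stdint.h>

/* Minimal stand-in for the pre-sized global MR cache: a fixed array that
 * flags overflow instead of growing. Growing would need rte_realloc(),
 * which may trigger the memory callbacks taking the same lock. */
struct cache_entry {
	uintptr_t start;
	uintptr_t end;
	uint32_t lkey;
};

struct fixed_cache {
	struct cache_entry *tbl; /* allocated once, at context creation */
	uint16_t size;           /* capacity, e.g. MLX5_MR_BTREE_CACHE_N * 2 */
	uint16_t len;            /* entries in use */
	int overflow;            /* set when full; forces linear MR list search */
};

static int
cache_insert(struct fixed_cache *c, const struct cache_entry *e)
{
	if (c->len == c->size) {
		c->overflow = 1;   /* never grown after registration */
		return -1;
	}
	c->tbl[c->len++] = *e;     /* the real code keeps the table sorted */
	return 0;
}

Once the overflow flag is set, lookups fall back to walking the registered MR list,
which is slower but avoids any allocation under the lock.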

* [dpdk-dev] [PATCH] net/mlx5: update memory event callback for shared context
  2019-04-12 15:45 [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device Viacheslav Ovsiienko
                   ` (2 preceding siblings ...)
  2019-04-24  8:44 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
@ 2019-04-24  8:48 ` Viacheslav Ovsiienko
  2019-04-24  8:48   ` Viacheslav Ovsiienko
  3 siblings, 1 reply; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-24  8:48 UTC (permalink / raw)
  To: shahafs; +Cc: dev, yskoh

Mellanox mlx5 PMD maintains a list of devices to process the memory
free events and reflect the actual memory state in the Memory Regions.
Because this list contains devices and several devices may share the
same Infiniband context, the callback routine may be called multiple
times with the same parameter, which is not optimal. This patch modifies
the list to contain device contexts instead of device objects, so a
shared context is included in the list only once.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Yongseok Koh <yskoh@mellanox.com>

---
 drivers/net/mlx5/mlx5.c    |  6 +--
 drivers/net/mlx5/mlx5.h    |  6 +--
 drivers/net/mlx5/mlx5_mr.c | 91 +++++++++++++++++++++-------------------------
 3 files changed, 48 insertions(+), 55 deletions(-)
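
For illustration, the effect of keeping shared contexts (rather than per-port private
structures) on the callback list can be sketched with the standard <sys/queue.h> macros
the driver already relies on. Names such as shared_ctx and mem_event_free_cb below are
simplified stand-ins, not the actual driver code.

#include <stdio.h>
#include <stddef.h>
#include <sys/queue.h>

/* Simplified: one shared context per Infiniband device, many ports. */
struct shared_ctx {
	const char *ibdev_name;
	LIST_ENTRY(shared_ctx) mem_event_cb; /* list linkage, as in the patch */
};

LIST_HEAD(ctx_list, shared_ctx);
static struct ctx_list mem_event_cb_list =
	LIST_HEAD_INITIALIZER(mem_event_cb_list);

/* Called on RTE_MEM_EVENT_FREE: now once per shared context, not per port. */
static void
mem_event_free_cb(void *addr, size_t len)
{
	struct shared_ctx *sh;

	LIST_FOREACH(sh, &mem_event_cb_list, mem_event_cb)
		printf("device %s free callback: addr=%p, len=%zu\n",
		       sh->ibdev_name, addr, len);
}

int
main(void)
{
	static struct shared_ctx sh = { .ibdev_name = "mlx5_0" };
	static char page[4096];

	LIST_INSERT_HEAD(&mem_event_cb_list, &sh, mem_event_cb);
	mem_event_free_cb(page, sizeof(page)); /* fires once for mlx5_0 */
	LIST_REMOVE(&sh, mem_event_cb);
	return 0;
}

With the list keyed by shared context, two ports of the same Infiniband device no
longer cause the free callback to scan the same MR lists twice.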

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index b563e0f..de85e85 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -673,9 +673,9 @@ struct mlx5_dev_spawn_data {
 	mlx5_mprq_free_mp(dev);
 	/* Remove from memory callback device list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
+	LIST_REMOVE(priv->sh, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
 		rte_free(priv->rss_conf.rss_key);
@@ -1574,7 +1574,7 @@ struct mlx5_dev_spawn_data {
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
-			 priv, mem_event_cb);
+			 sh, mem_event_cb);
 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	return eth_dev;
 error:
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 2575732..82fcb29 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -99,7 +99,7 @@ struct mlx5_switch_info {
 	uint64_t switch_id; /**< Switch identifier. */
 };
 
-LIST_HEAD(mlx5_dev_list, mlx5_priv);
+LIST_HEAD(mlx5_dev_list, mlx5_ibv_shared);
 
 /* Shared data between primary and secondary processes. */
 struct mlx5_shared_data {
@@ -276,6 +276,8 @@ struct mlx5_ibv_shared {
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
 	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	LIST_ENTRY(mlx5_ibv_shared) mem_event_cb;
+	/**< Called by memory event callback. */
 	struct {
 		uint32_t dev_gen; /* Generation number to flush local caches. */
 		rte_rwlock_t rwlock; /* MR Lock. */
@@ -322,8 +324,6 @@ struct mlx5_proc_priv {
 	((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private)
 
 struct mlx5_priv {
-	LIST_ENTRY(mlx5_priv) mem_event_cb;
-	/**< Called by memory event callback. */
 	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
 	struct mlx5_ibv_shared *sh; /* Shared IB device context. */
 	uint32_t ibv_port; /* IB device port number. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index a7a63b1..66e8e87 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -327,7 +327,7 @@ struct mr_update_mp_data {
  * mlx5_mr_create() on miss.
  *
  * @param dev
- *   Pointer to Ethernet device.
+ *   Pointer to Ethernet device shared context.
  * @param mr
  *   Pointer to MR to insert.
  *
@@ -335,13 +335,12 @@ struct mr_update_mp_data {
  *   0 on success, -1 on failure.
  */
 static int
-mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx5_mr *mr)
+mr_insert_dev_cache(struct mlx5_ibv_shared *sh, struct mlx5_mr *mr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	unsigned int n;
 
-	DRV_LOG(DEBUG, "port %u inserting MR(%p) to global cache",
-		dev->data->port_id, (void *)mr);
+	DRV_LOG(DEBUG, "device %s inserting MR(%p) to global cache",
+		sh->ibdev_name, (void *)mr);
 	for (n = 0; n < mr->ms_bmp_n; ) {
 		struct mlx5_mr_cache entry;
 
@@ -350,7 +349,7 @@ struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -364,8 +363,8 @@ struct mr_update_mp_data {
 /**
  * Look up address in the original global MR list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  * @param[out] entry
  *   Pointer to returning MR cache entry. If no match, this will not be updated.
  * @param addr
@@ -375,14 +374,13 @@ struct mr_update_mp_data {
  *   Found MR on match, NULL otherwise.
  */
 static struct mlx5_mr *
-mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry,
+mr_lookup_dev_list(struct mlx5_ibv_shared *sh, struct mlx5_mr_cache *entry,
 		   uintptr_t addr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -406,7 +404,7 @@ struct mr_update_mp_data {
  * Look up address on device.
  *
  * @param dev
- *   Pointer to Ethernet device.
+ *   Pointer to Ethernet device shared context.
  * @param[out] entry
  *   Pointer to returning MR cache entry. If no match, this will not be updated.
  * @param addr
@@ -416,11 +414,9 @@ struct mr_update_mp_data {
  *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
  */
 static uint32_t
-mr_lookup_dev(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry,
+mr_lookup_dev(struct mlx5_ibv_shared *sh, struct mlx5_mr_cache *entry,
 	      uintptr_t addr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -437,7 +433,7 @@ struct mr_update_mp_data {
 			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
-		mr = mr_lookup_dev_list(dev, entry, addr);
+		mr = mr_lookup_dev_list(sh, entry, addr);
 		if (mr != NULL)
 			lkey = entry->lkey;
 	}
@@ -550,7 +546,7 @@ struct mr_update_mp_data {
 	}
 	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
-	mr_lookup_dev(dev, entry, addr);
+	mr_lookup_dev(priv->sh, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
 	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
@@ -716,7 +712,7 @@ struct mr_update_mp_data {
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
 	 */
-	if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) {
+	if (mr_lookup_dev(sh, entry, addr) != UINT32_MAX) {
 		/*
 		 * Insert to the global cache table. It may fail due to
 		 * low-on-memory. Then, this entry will have to be searched
@@ -746,7 +742,7 @@ struct mr_update_mp_data {
 		memset(&ret, 0, sizeof(ret));
 		start = data_re.start + n * msl->page_sz;
 		/* Exclude memsegs already registered by other MRs. */
-		if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) {
+		if (mr_lookup_dev(sh, &ret, start) == UINT32_MAX) {
 			/*
 			 * Start from the first unregistered memseg in the
 			 * extended range.
@@ -788,9 +784,9 @@ struct mr_update_mp_data {
 	      data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey),
 	      mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	/* Fill in output data. */
-	mr_lookup_dev(dev, entry, addr);
+	mr_lookup_dev(sh, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
@@ -848,23 +844,21 @@ struct mr_update_mp_data {
 /**
  * Rebuild the global B-tree cache of device from the original MR list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mr_rebuild_dev_cache(struct rte_eth_dev *dev)
+mr_rebuild_dev_cache(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
-	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
+	DRV_LOG(DEBUG, "device %s rebuild dev cache[]", sh->ibdev_name);
 	/* Flush cache to rebuild. */
 	sh->mr.cache.len = 1;
 	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
 	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
-		if (mr_insert_dev_cache(dev, mr) < 0)
+		if (mr_insert_dev_cache(sh, mr) < 0)
 			return;
 }
 
@@ -879,26 +873,25 @@ struct mr_update_mp_data {
  * The global cache must be rebuilt if there's any change and this event has to
  * be propagated to dataplane threads to flush the local caches.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to the Ethernet device shared context.
  * @param addr
  *   Address of freed memory.
  * @param len
  *   Size of freed memory.
  */
 static void
-mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
+mlx5_mr_mem_event_free_cb(struct mlx5_ibv_shared *sh,
+			  const void *addr, size_t len)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
 	int i;
 	int rebuild = 0;
 
-	DEBUG("port %u free callback: addr=%p, len=%zu",
-	      dev->data->port_id, addr, len);
+	DEBUG("device %s free callback: addr=%p, len=%zu",
+	      sh->ibdev_name, addr, len);
 	msl = rte_mem_virt2memseg_list(addr);
 	/* addr and len must be page-aligned. */
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
@@ -915,7 +908,7 @@ struct mr_update_mp_data {
 
 		/* Find MR having this memseg. */
 		start = (uintptr_t)addr + i * msl->page_sz;
-		mr = mr_lookup_dev_list(dev, &entry, start);
+		mr = mr_lookup_dev_list(sh, &entry, start);
 		if (mr == NULL)
 			continue;
 		assert(mr->msl); /* Can't be external memory. */
@@ -926,14 +919,14 @@ struct mr_update_mp_data {
 		pos = ms_idx - mr->ms_base_idx;
 		assert(rte_bitmap_get(mr->ms_bmp, pos));
 		assert(pos < mr->ms_bmp_n);
-		DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p",
-		      dev->data->port_id, (void *)mr, pos, (void *)start);
+		DEBUG("device %s MR(%p): clear bitmap[%u] for addr %p",
+		      sh->ibdev_name, (void *)mr, pos, (void *)start);
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
 			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
-			DEBUG("port %u remove MR(%p) from list",
-			      dev->data->port_id, (void *)mr);
+			DEBUG("device %s remove MR(%p) from list",
+			      sh->ibdev_name, (void *)mr);
 		}
 		/*
 		 * MR is fragmented or will be freed. the global cache must be
@@ -942,7 +935,7 @@ struct mr_update_mp_data {
 		rebuild = 1;
 	}
 	if (rebuild) {
-		mr_rebuild_dev_cache(dev);
+		mr_rebuild_dev_cache(sh);
 		/*
 		 * Flush local caches by propagating invalidation across cores.
 		 * rte_smp_wmb() is enough to synchronize this event. If one of
@@ -975,7 +968,7 @@ struct mr_update_mp_data {
 mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 		     size_t len, void *arg __rte_unused)
 {
-	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list;
 
 	/* Must be called from the primary process. */
@@ -984,8 +977,8 @@ struct mr_update_mp_data {
 	case RTE_MEM_EVENT_FREE:
 		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 		/* Iterate all the existing mlx5 devices. */
-		LIST_FOREACH(priv, dev_list, mem_event_cb)
-			mlx5_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
+		LIST_FOREACH(sh, dev_list, mem_event_cb)
+			mlx5_mr_mem_event_free_cb(sh, addr, len);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		break;
 	case RTE_MEM_EVENT_ALLOC:
@@ -1276,7 +1269,7 @@ struct mr_update_mp_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
 	rte_rwlock_read_lock(&sh->mr.rwlock);
-	lkey = mr_lookup_dev(dev, &entry, addr);
+	lkey = mr_lookup_dev(sh, &entry, addr);
 	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
@@ -1294,7 +1287,7 @@ struct mr_update_mp_data {
 	rte_rwlock_write_lock(&sh->mr.rwlock);
 	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
@@ -1365,7 +1358,7 @@ struct mr_update_mp_data {
 	rte_rwlock_write_lock(&sh->mr.rwlock);
 	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
@@ -1405,7 +1398,7 @@ struct mr_update_mp_data {
 	priv = dev->data->dev_private;
 	sh = priv->sh;
 	rte_rwlock_read_lock(&sh->mr.rwlock);
-	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
+	mr = mr_lookup_dev_list(sh, &entry, (uintptr_t)addr);
 	if (!mr) {
 		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
@@ -1418,7 +1411,7 @@ struct mr_update_mp_data {
 	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
-	mr_rebuild_dev_cache(dev);
+	mr_rebuild_dev_cache(sh);
 	/*
 	 * Flush local caches by propagating invalidation across cores.
 	 * rte_smp_wmb() is enough to synchronize this event. If one of
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [PATCH] net/mlx5: update memory event callback for shared context
  2019-04-24  8:48 ` [dpdk-dev] [PATCH] net/mlx5: update memory event callback for shared context Viacheslav Ovsiienko
@ 2019-04-24  8:48   ` Viacheslav Ovsiienko
  0 siblings, 0 replies; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-24  8:48 UTC (permalink / raw)
  To: shahafs; +Cc: dev, yskoh

Mellanox mlx5 PMD maintains a list of devices to process the memory
free events and reflect the actual memory state in the Memory Regions.
Because this list contains devices and several devices may share the
same Infiniband context, the callback routine may be called multiple
times with the same parameter, which is not optimal. This patch modifies
the list to contain device contexts instead of device objects, so a
shared context is included in the list only once.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Yongseok Koh <yskoh@mellanox.com>

---
 drivers/net/mlx5/mlx5.c    |  6 +--
 drivers/net/mlx5/mlx5.h    |  6 +--
 drivers/net/mlx5/mlx5_mr.c | 91 +++++++++++++++++++++-------------------------
 3 files changed, 48 insertions(+), 55 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index b563e0f..de85e85 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -673,9 +673,9 @@ struct mlx5_dev_spawn_data {
 	mlx5_mprq_free_mp(dev);
 	/* Remove from memory callback device list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
+	LIST_REMOVE(priv->sh, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
 		rte_free(priv->rss_conf.rss_key);
@@ -1574,7 +1574,7 @@ struct mlx5_dev_spawn_data {
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
-			 priv, mem_event_cb);
+			 sh, mem_event_cb);
 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	return eth_dev;
 error:
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 2575732..82fcb29 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -99,7 +99,7 @@ struct mlx5_switch_info {
 	uint64_t switch_id; /**< Switch identifier. */
 };
 
-LIST_HEAD(mlx5_dev_list, mlx5_priv);
+LIST_HEAD(mlx5_dev_list, mlx5_ibv_shared);
 
 /* Shared data between primary and secondary processes. */
 struct mlx5_shared_data {
@@ -276,6 +276,8 @@ struct mlx5_ibv_shared {
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
 	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	LIST_ENTRY(mlx5_ibv_shared) mem_event_cb;
+	/**< Called by memory event callback. */
 	struct {
 		uint32_t dev_gen; /* Generation number to flush local caches. */
 		rte_rwlock_t rwlock; /* MR Lock. */
@@ -322,8 +324,6 @@ struct mlx5_proc_priv {
 	((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private)
 
 struct mlx5_priv {
-	LIST_ENTRY(mlx5_priv) mem_event_cb;
-	/**< Called by memory event callback. */
 	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
 	struct mlx5_ibv_shared *sh; /* Shared IB device context. */
 	uint32_t ibv_port; /* IB device port number. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index a7a63b1..66e8e87 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -327,7 +327,7 @@ struct mr_update_mp_data {
  * mlx5_mr_create() on miss.
  *
  * @param dev
- *   Pointer to Ethernet device.
+ *   Pointer to Ethernet device shared context.
  * @param mr
  *   Pointer to MR to insert.
  *
@@ -335,13 +335,12 @@ struct mr_update_mp_data {
  *   0 on success, -1 on failure.
  */
 static int
-mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx5_mr *mr)
+mr_insert_dev_cache(struct mlx5_ibv_shared *sh, struct mlx5_mr *mr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	unsigned int n;
 
-	DRV_LOG(DEBUG, "port %u inserting MR(%p) to global cache",
-		dev->data->port_id, (void *)mr);
+	DRV_LOG(DEBUG, "device %s inserting MR(%p) to global cache",
+		sh->ibdev_name, (void *)mr);
 	for (n = 0; n < mr->ms_bmp_n; ) {
 		struct mlx5_mr_cache entry;
 
@@ -350,7 +349,7 @@ struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -364,8 +363,8 @@ struct mr_update_mp_data {
 /**
  * Look up address in the original global MR list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  * @param[out] entry
  *   Pointer to returning MR cache entry. If no match, this will not be updated.
  * @param addr
@@ -375,14 +374,13 @@ struct mr_update_mp_data {
  *   Found MR on match, NULL otherwise.
  */
 static struct mlx5_mr *
-mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry,
+mr_lookup_dev_list(struct mlx5_ibv_shared *sh, struct mlx5_mr_cache *entry,
 		   uintptr_t addr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -406,7 +404,7 @@ struct mr_update_mp_data {
  * Look up address on device.
  *
  * @param dev
- *   Pointer to Ethernet device.
+ *   Pointer to Ethernet device shared context.
  * @param[out] entry
  *   Pointer to returning MR cache entry. If no match, this will not be updated.
  * @param addr
@@ -416,11 +414,9 @@ struct mr_update_mp_data {
  *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
  */
 static uint32_t
-mr_lookup_dev(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry,
+mr_lookup_dev(struct mlx5_ibv_shared *sh, struct mlx5_mr_cache *entry,
 	      uintptr_t addr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -437,7 +433,7 @@ struct mr_update_mp_data {
 			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
-		mr = mr_lookup_dev_list(dev, entry, addr);
+		mr = mr_lookup_dev_list(sh, entry, addr);
 		if (mr != NULL)
 			lkey = entry->lkey;
 	}
@@ -550,7 +546,7 @@ struct mr_update_mp_data {
 	}
 	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
-	mr_lookup_dev(dev, entry, addr);
+	mr_lookup_dev(priv->sh, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
 	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
@@ -716,7 +712,7 @@ struct mr_update_mp_data {
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
 	 */
-	if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) {
+	if (mr_lookup_dev(sh, entry, addr) != UINT32_MAX) {
 		/*
 		 * Insert to the global cache table. It may fail due to
 		 * low-on-memory. Then, this entry will have to be searched
@@ -746,7 +742,7 @@ struct mr_update_mp_data {
 		memset(&ret, 0, sizeof(ret));
 		start = data_re.start + n * msl->page_sz;
 		/* Exclude memsegs already registered by other MRs. */
-		if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) {
+		if (mr_lookup_dev(sh, &ret, start) == UINT32_MAX) {
 			/*
 			 * Start from the first unregistered memseg in the
 			 * extended range.
@@ -788,9 +784,9 @@ struct mr_update_mp_data {
 	      data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey),
 	      mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	/* Fill in output data. */
-	mr_lookup_dev(dev, entry, addr);
+	mr_lookup_dev(sh, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
@@ -848,23 +844,21 @@ struct mr_update_mp_data {
 /**
  * Rebuild the global B-tree cache of device from the original MR list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mr_rebuild_dev_cache(struct rte_eth_dev *dev)
+mr_rebuild_dev_cache(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
-	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
+	DRV_LOG(DEBUG, "device %s rebuild dev cache[]", sh->ibdev_name);
 	/* Flush cache to rebuild. */
 	sh->mr.cache.len = 1;
 	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
 	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
-		if (mr_insert_dev_cache(dev, mr) < 0)
+		if (mr_insert_dev_cache(sh, mr) < 0)
 			return;
 }
 
@@ -879,26 +873,25 @@ struct mr_update_mp_data {
  * The global cache must be rebuilt if there's any change and this event has to
  * be propagated to dataplane threads to flush the local caches.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to the Ethernet device shared context.
  * @param addr
  *   Address of freed memory.
  * @param len
  *   Size of freed memory.
  */
 static void
-mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
+mlx5_mr_mem_event_free_cb(struct mlx5_ibv_shared *sh,
+			  const void *addr, size_t len)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
 	int i;
 	int rebuild = 0;
 
-	DEBUG("port %u free callback: addr=%p, len=%zu",
-	      dev->data->port_id, addr, len);
+	DEBUG("device %s free callback: addr=%p, len=%zu",
+	      sh->ibdev_name, addr, len);
 	msl = rte_mem_virt2memseg_list(addr);
 	/* addr and len must be page-aligned. */
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
@@ -915,7 +908,7 @@ struct mr_update_mp_data {
 
 		/* Find MR having this memseg. */
 		start = (uintptr_t)addr + i * msl->page_sz;
-		mr = mr_lookup_dev_list(dev, &entry, start);
+		mr = mr_lookup_dev_list(sh, &entry, start);
 		if (mr == NULL)
 			continue;
 		assert(mr->msl); /* Can't be external memory. */
@@ -926,14 +919,14 @@ struct mr_update_mp_data {
 		pos = ms_idx - mr->ms_base_idx;
 		assert(rte_bitmap_get(mr->ms_bmp, pos));
 		assert(pos < mr->ms_bmp_n);
-		DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p",
-		      dev->data->port_id, (void *)mr, pos, (void *)start);
+		DEBUG("device %s MR(%p): clear bitmap[%u] for addr %p",
+		      sh->ibdev_name, (void *)mr, pos, (void *)start);
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
 			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
-			DEBUG("port %u remove MR(%p) from list",
-			      dev->data->port_id, (void *)mr);
+			DEBUG("device %s remove MR(%p) from list",
+			      sh->ibdev_name, (void *)mr);
 		}
 		/*
 		 * MR is fragmented or will be freed. the global cache must be
@@ -942,7 +935,7 @@ struct mr_update_mp_data {
 		rebuild = 1;
 	}
 	if (rebuild) {
-		mr_rebuild_dev_cache(dev);
+		mr_rebuild_dev_cache(sh);
 		/*
 		 * Flush local caches by propagating invalidation across cores.
 		 * rte_smp_wmb() is enough to synchronize this event. If one of
@@ -975,7 +968,7 @@ struct mr_update_mp_data {
 mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 		     size_t len, void *arg __rte_unused)
 {
-	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list;
 
 	/* Must be called from the primary process. */
@@ -984,8 +977,8 @@ struct mr_update_mp_data {
 	case RTE_MEM_EVENT_FREE:
 		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 		/* Iterate all the existing mlx5 devices. */
-		LIST_FOREACH(priv, dev_list, mem_event_cb)
-			mlx5_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
+		LIST_FOREACH(sh, dev_list, mem_event_cb)
+			mlx5_mr_mem_event_free_cb(sh, addr, len);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		break;
 	case RTE_MEM_EVENT_ALLOC:
@@ -1276,7 +1269,7 @@ struct mr_update_mp_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
 	rte_rwlock_read_lock(&sh->mr.rwlock);
-	lkey = mr_lookup_dev(dev, &entry, addr);
+	lkey = mr_lookup_dev(sh, &entry, addr);
 	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
@@ -1294,7 +1287,7 @@ struct mr_update_mp_data {
 	rte_rwlock_write_lock(&sh->mr.rwlock);
 	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
@@ -1365,7 +1358,7 @@ struct mr_update_mp_data {
 	rte_rwlock_write_lock(&sh->mr.rwlock);
 	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
@@ -1405,7 +1398,7 @@ struct mr_update_mp_data {
 	priv = dev->data->dev_private;
 	sh = priv->sh;
 	rte_rwlock_read_lock(&sh->mr.rwlock);
-	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
+	mr = mr_lookup_dev_list(sh, &entry, (uintptr_t)addr);
 	if (!mr) {
 		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
@@ -1418,7 +1411,7 @@ struct mr_update_mp_data {
 	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
-	mr_rebuild_dev_cache(dev);
+	mr_rebuild_dev_cache(sh);
 	/*
 	 * Flush local caches by propagating invalidation across cores.
 	 * rte_smp_wmb() is enough to synchronize this event. If one of
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/mlx5: share Memory Regions for multiport device
  2019-04-24  8:44 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
  2019-04-24  8:44   ` Viacheslav Ovsiienko
@ 2019-04-24  8:52   ` Yongseok Koh
  2019-04-24  8:52     ` Yongseok Koh
  2019-04-27  4:32   ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Viacheslav Ovsiienko
  2 siblings, 1 reply; 20+ messages in thread
From: Yongseok Koh @ 2019-04-24  8:52 UTC (permalink / raw)
  To: Slava Ovsiienko; +Cc: Shahaf Shuler, dev


> On Apr 24, 2019, at 1:44 AM, Viacheslav Ovsiienko <viacheslavo@mellanox.com> wrote:
> 
> The multiport Infiniband device support was introduced [1].
> All active ports belonging to the same Infiniband device use the single
> shared Infiniband context of that device and share the resources:
>  - QPs are created within shared context
>  - Verbs flows are also created with specifying port index
>  - DV/DR resources
>  - Protection Domain
>  - Event Handlers
> 
> This patchset adds support for Memory Region sharing between
> ports created on the base of a multiport Infiniband device.
> The datapath of mlx5 uses a layered cache subsystem for
> allocating/releasing Memory Regions; only the lowest layer L3
> is subject to sharing, for performance reasons.
> 
> [1] http://patches.dpdk.org/cover/51800/
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
Acked-by: Yongseok Koh <yskoh@mellanox.com>

> v2:
>  - indentation issues
>  - comments cleanup
> 
> v1:
>  http://patches.dpdk.org/patch/52723/
> 
> drivers/net/mlx5/mlx5.c     |  40 +++++++----
> drivers/net/mlx5/mlx5.h     |  15 ++--
> drivers/net/mlx5/mlx5_mr.c  | 164 ++++++++++++++++++++++----------------------
> drivers/net/mlx5/mlx5_mr.h  |   5 +-
> drivers/net/mlx5/mlx5_txq.c |   2 +-
> 5 files changed, 121 insertions(+), 105 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index 1bb58b1..b563e0f 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -147,6 +147,7 @@ struct mlx5_dev_spawn_data {
> 	struct mlx5_switch_info info; /**< Switch information. */
> 	struct ibv_device *ibv_dev; /**< Associated IB device. */
> 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
> +	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
> };
> 
> static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
> @@ -225,6 +226,7 @@ struct mlx5_dev_spawn_data {
> 		sizeof(sh->ibdev_name));
> 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
> 		sizeof(sh->ibdev_path));
> +	sh->pci_dev = spawn->pci_dev;
> 	pthread_mutex_init(&sh->intr_mutex, NULL);
> 	/*
> 	 * Setting port_id to max unallowed value means
> @@ -239,6 +241,22 @@ struct mlx5_dev_spawn_data {
> 		err = ENOMEM;
> 		goto error;
> 	}
> +	/*
> +	 * Once the device is added to the list of memory event
> +	 * callback, its global MR cache table cannot be expanded
> +	 * on the fly because of deadlock. If it overflows, lookup
> +	 * should be done by searching MR list linearly, which is slow.
> +	 *
> +	 * At this point the device is not added to the memory
> +	 * event list yet, context is just being created.
> +	 */
> +	err = mlx5_mr_btree_init(&sh->mr.cache,
> +				 MLX5_MR_BTREE_CACHE_N * 2,
> +				 sh->pci_dev->device.numa_node);
> +	if (err) {
> +		err = rte_errno;
> +		goto error;
> +	}
> 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
> exit:
> 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
> @@ -286,6 +304,8 @@ struct mlx5_dev_spawn_data {
> 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> 	if (--sh->refcnt)
> 		goto exit;
> +	/* Release created Memory Regions. */
> +	mlx5_mr_release(sh);
> 	LIST_REMOVE(sh, next);
> 	/*
> 	 *  Ensure there is no async event handler installed.
> @@ -651,7 +671,10 @@ struct mlx5_dev_spawn_data {
> 	}
> 	mlx5_proc_priv_uninit(dev);
> 	mlx5_mprq_free_mp(dev);
> -	mlx5_mr_release(dev);
> +	/* Remove from memory callback device list. */
> +	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> +	LIST_REMOVE(priv, mem_event_cb);
> +	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> 	assert(priv->sh);
> 	mlx5_free_shared_dr(priv);
> 	if (priv->rss_conf.rss_key != NULL)
> @@ -1548,19 +1571,6 @@ struct mlx5_dev_spawn_data {
> 		goto error;
> 	}
> 	priv->config.flow_prio = err;
> -	/*
> -	 * Once the device is added to the list of memory event
> -	 * callback, its global MR cache table cannot be expanded
> -	 * on the fly because of deadlock. If it overflows, lookup
> -	 * should be done by searching MR list linearly, which is slow.
> -	 */
> -	err = mlx5_mr_btree_init(&priv->mr.cache,
> -				 MLX5_MR_BTREE_CACHE_N * 2,
> -				 eth_dev->device->numa_node);
> -	if (err) {
> -		err = rte_errno;
> -		goto error;
> -	}
> 	/* Add device to memory callback list. */
> 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
> @@ -1757,6 +1767,7 @@ struct mlx5_dev_spawn_data {
> 			list[ns].ibv_port = i;
> 			list[ns].ibv_dev = ibv_match[0];
> 			list[ns].eth_dev = NULL;
> +			list[ns].pci_dev = pci_dev;
> 			list[ns].ifindex = mlx5_nl_ifindex
> 					(nl_rdma, list[ns].ibv_dev->name, i);
> 			if (!list[ns].ifindex) {
> @@ -1823,6 +1834,7 @@ struct mlx5_dev_spawn_data {
> 			list[ns].ibv_port = 1;
> 			list[ns].ibv_dev = ibv_match[i];
> 			list[ns].eth_dev = NULL;
> +			list[ns].pci_dev = pci_dev;
> 			list[ns].ifindex = 0;
> 			if (nl_rdma >= 0)
> 				list[ns].ifindex = mlx5_nl_ifindex
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 0a6d7f1..2575732 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -275,6 +275,14 @@ struct mlx5_ibv_shared {
> 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
> 	struct ibv_device_attr_ex device_attr; /* Device properties. */
> +	struct rte_pci_device *pci_dev; /* Backend PCI device. */
> +	struct {
> +		uint32_t dev_gen; /* Generation number to flush local caches. */
> +		rte_rwlock_t rwlock; /* MR Lock. */
> +		struct mlx5_mr_btree cache; /* Global MR cache table. */
> +		struct mlx5_mr_list mr_list; /* Registered MR list. */
> +		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> +	} mr;
> 	/* Shared DV/DR flow data section. */
> 	pthread_mutex_t dv_mutex; /* DV context mutex. */
> 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
> @@ -347,13 +355,6 @@ struct mlx5_priv {
> 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
> 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
> 	/* Flow counters. */
> -	struct {
> -		uint32_t dev_gen; /* Generation number to flush local caches. */
> -		rte_rwlock_t rwlock; /* MR Lock. */
> -		struct mlx5_mr_btree cache; /* Global MR cache table. */
> -		struct mlx5_mr_list mr_list; /* Registered MR list. */
> -		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> -	} mr;
> 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
> 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
> 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
> diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
> index e4957a4..a7a63b1 100644
> --- a/drivers/net/mlx5/mlx5_mr.c
> +++ b/drivers/net/mlx5/mlx5_mr.c
> @@ -36,7 +36,7 @@ struct mr_update_mp_data {
> 
> /**
>  * Expand B-tree table to a given size. Can't be called with holding
> - * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
> + * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
>  *
>  * @param bt
>  *   Pointer to B-tree structure.
> @@ -350,7 +350,7 @@ struct mr_update_mp_data {
> 		n = mr_find_next_chunk(mr, &entry, n);
> 		if (!entry.end)
> 			break;
> -		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
> +		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
> 			/*
> 			 * Overflowed, but the global table cannot be expanded
> 			 * because of deadlock.
> @@ -382,7 +382,7 @@ struct mr_update_mp_data {
> 	struct mlx5_mr *mr;
> 
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> +	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
> 		unsigned int n;
> 
> 		if (mr->ms_n == 0)
> @@ -420,6 +420,7 @@ struct mr_update_mp_data {
> 	      uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	uint16_t idx;
> 	uint32_t lkey = UINT32_MAX;
> 	struct mlx5_mr *mr;
> @@ -430,10 +431,10 @@ struct mr_update_mp_data {
> 	 * has to be searched by traversing the original MR list instead, which
> 	 * is very slow path. Otherwise, the global cache is all inclusive.
> 	 */
> -	if (!unlikely(priv->mr.cache.overflow)) {
> -		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> +	if (!unlikely(sh->mr.cache.overflow)) {
> +		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> 		if (lkey != UINT32_MAX)
> -			*entry = (*priv->mr.cache.table)[idx];
> +			*entry = (*sh->mr.cache.table)[idx];
> 	} else {
> 		/* Falling back to the slowest path. */
> 		mr = mr_lookup_dev_list(dev, entry, addr);
> @@ -468,13 +469,12 @@ struct mr_update_mp_data {
> /**
>  * Release resources of detached MR having no online entry.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> static void
> -mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
> +mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
> {
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr_next;
> 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
> 
> @@ -484,11 +484,11 @@ struct mr_update_mp_data {
> 	 * MR can't be freed with holding the lock because rte_free() could call
> 	 * memory free callback function. This will be a deadlock situation.
> 	 */
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Detach the whole free list and release it after unlocking. */
> -	free_list = priv->mr.mr_free_list;
> -	LIST_INIT(&priv->mr.mr_free_list);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	free_list = sh->mr.mr_free_list;
> +	LIST_INIT(&sh->mr.mr_free_list);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Release resources. */
> 	mr_next = LIST_FIRST(&free_list);
> 	while (mr_next != NULL) {
> @@ -548,12 +548,12 @@ struct mr_update_mp_data {
> 		      dev->data->port_id, (void *)addr);
> 		return UINT32_MAX;
> 	}
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
> 	/* Fill in output data. */
> 	mr_lookup_dev(dev, entry, addr);
> 	/* Lookup can't fail. */
> 	assert(entry->lkey != UINT32_MAX);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
> 	DEBUG("port %u MR CREATED by primary process for %p:\n"
> 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
> 	      dev->data->port_id, (void *)addr,
> @@ -582,6 +582,7 @@ struct mr_update_mp_data {
> 		       uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_dev_config *config = &priv->config;
> 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
> 	const struct rte_memseg_list *msl;
> @@ -602,12 +603,12 @@ struct mr_update_mp_data {
> 		dev->data->port_id, (void *)addr);
> 	/*
> 	 * Release detached MRs if any. This can't be called with holding either
> -	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
> +	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
> 	 * been detached by the memory free event but it couldn't be released
> 	 * inside the callback due to deadlock. As a result, releasing resources
> 	 * is quite opportunistic.
> 	 */
> -	mlx5_mr_garbage_collect(dev);
> +	mlx5_mr_garbage_collect(sh);
> 	/*
> 	 * If enabled, find out a contiguous virtual address chunk in use, to
> 	 * which the given address belongs, in order to register maximum range.
> @@ -710,7 +711,7 @@ struct mr_update_mp_data {
> 		goto alloc_resources;
> 	}
> 	assert(data.msl == data_re.msl);
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/*
> 	 * Check the address is really missing. If other thread already created
> 	 * one or it is not found due to overflow, abort and return.
> @@ -721,10 +722,10 @@ struct mr_update_mp_data {
> 		 * low-on-memory. Then, this entry will have to be searched
> 		 * here again.
> 		 */
> -		mr_btree_insert(&priv->mr.cache, entry);
> +		mr_btree_insert(&sh->mr.cache, entry);
> 		DEBUG("port %u found MR for %p on final lookup, abort",
> 		      dev->data->port_id, (void *)addr);
> -		rte_rwlock_write_unlock(&priv->mr.rwlock);
> +		rte_rwlock_write_unlock(&sh->mr.rwlock);
> 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> 		/*
> 		 * Must be unlocked before calling rte_free() because
> @@ -769,7 +770,7 @@ struct mr_update_mp_data {
> 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
> 	 * through mlx5_alloc_verbs_buf().
> 	 */
> -	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
> +	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
> 				       IBV_ACCESS_LOCAL_WRITE);
> 	if (mr->ibv_mr == NULL) {
> 		DEBUG("port %u fail to create a verbs MR for address (%p)",
> @@ -779,7 +780,7 @@ struct mr_update_mp_data {
> 	}
> 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
> 	assert(mr->ibv_mr->length == len);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	DEBUG("port %u MR CREATED (%p) for %p:\n"
> 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
> 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
> @@ -792,11 +793,11 @@ struct mr_update_mp_data {
> 	mr_lookup_dev(dev, entry, addr);
> 	/* Lookup can't fail. */
> 	assert(entry->lkey != UINT32_MAX);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> 	return entry->lkey;
> err_mrlock:
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> err_memlock:
> 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> err_nolock:
> @@ -854,14 +855,15 @@ struct mr_update_mp_data {
> mr_rebuild_dev_cache(struct rte_eth_dev *dev)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr *mr;
> 
> 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
> 	/* Flush cache to rebuild. */
> -	priv->mr.cache.len = 1;
> -	priv->mr.cache.overflow = 0;
> +	sh->mr.cache.len = 1;
> +	sh->mr.cache.overflow = 0;
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
> +	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
> 		if (mr_insert_dev_cache(dev, mr) < 0)
> 			return;
> }
> @@ -888,6 +890,7 @@ struct mr_update_mp_data {
> mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	const struct rte_memseg_list *msl;
> 	struct mlx5_mr *mr;
> 	int ms_n;
> @@ -901,7 +904,7 @@ struct mr_update_mp_data {
> 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
> 	assert(len == RTE_ALIGN(len, msl->page_sz));
> 	ms_n = len / msl->page_sz;
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Clear bits of freed memsegs from MR. */
> 	for (i = 0; i < ms_n; ++i) {
> 		const struct rte_memseg *ms;
> @@ -928,7 +931,7 @@ struct mr_update_mp_data {
> 		rte_bitmap_clear(mr->ms_bmp, pos);
> 		if (--mr->ms_n == 0) {
> 			LIST_REMOVE(mr, mr);
> -			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 			DEBUG("port %u remove MR(%p) from list",
> 			      dev->data->port_id, (void *)mr);
> 		}
> @@ -949,12 +952,12 @@ struct mr_update_mp_data {
> 		 * generation below) will be guaranteed to be seen by other core
> 		 * before the core sees the newly allocated memory.
> 		 */
> -		++priv->mr.dev_gen;
> +		++sh->mr.dev_gen;
> 		DEBUG("broadcasting local cache flush, gen=%d",
> -		      priv->mr.dev_gen);
> +		      sh->mr.dev_gen);
> 		rte_smp_wmb();
> 	}
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> }
> 
> /**
> @@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
> 		   struct mlx5_mr_cache *entry, uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
> 	uint16_t idx;
> 	uint32_t lkey;
> @@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
> 	if (unlikely(bt->len == bt->size))
> 		mr_btree_expand(bt, bt->size << 1);
> 	/* Look up in the global cache. */
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> -	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> +	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> 	if (lkey != UINT32_MAX) {
> 		/* Found. */
> -		*entry = (*priv->mr.cache.table)[idx];
> -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> +		*entry = (*sh->mr.cache.table)[idx];
> +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> 		/*
> 		 * Update local cache. Even if it fails, return the found entry
> 		 * to update top-half cache. Next time, this entry will be found
> @@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
> 		mr_btree_insert(bt, entry);
> 		return lkey;
> 	}
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	/* First time to see the address? Create a new MR. */
> 	lkey = mlx5_mr_create(dev, entry, addr);
> 	/*
> @@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
> 	struct mr_update_mp_data *data = opaque;
> 	struct rte_eth_dev *dev = data->dev;
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
> 	struct mlx5_mr *mr = NULL;
> 	uintptr_t addr = (uintptr_t)memhdr->addr;
> @@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
> 
> 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> 	/* If already registered, it should return. */
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	lkey = mr_lookup_dev(dev, &entry, addr);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	if (lkey != UINT32_MAX)
> 		return;
> 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
> @@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
> 		data->ret = -1;
> 		return;
> 	}
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	/* Insert to the global cache table. */
> 	mr_insert_dev_cache(dev, mr);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Insert to the local cache table */
> 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
> }
> @@ -1339,6 +1344,7 @@ struct mr_update_mp_data {
> 	struct rte_eth_dev *dev;
> 	struct mlx5_mr *mr;
> 	struct mlx5_priv *priv;
> +	struct mlx5_ibv_shared *sh;
> 
> 	dev = pci_dev_to_eth_dev(pdev);
> 	if (!dev) {
> @@ -1355,11 +1361,12 @@ struct mr_update_mp_data {
> 		rte_errno = EINVAL;
> 		return -1;
> 	}
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	sh = priv->sh;
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	/* Insert to the global cache table. */
> 	mr_insert_dev_cache(dev, mr);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	return 0;
> }
> 
> @@ -1384,6 +1391,7 @@ struct mr_update_mp_data {
> {
> 	struct rte_eth_dev *dev;
> 	struct mlx5_priv *priv;
> +	struct mlx5_ibv_shared *sh;
> 	struct mlx5_mr *mr;
> 	struct mlx5_mr_cache entry;
> 
> @@ -1395,10 +1403,11 @@ struct mr_update_mp_data {
> 		return -1;
> 	}
> 	priv = dev->data->dev_private;
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	sh = priv->sh;
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
> 	if (!mr) {
> -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
> 				 "to PCI device %p", (uintptr_t)addr,
> 				 (void *)pdev);
> @@ -1406,7 +1415,7 @@ struct mr_update_mp_data {
> 		return -1;
> 	}
> 	LIST_REMOVE(mr, mr);
> -	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
> 	      (void *)mr);
> 	mr_rebuild_dev_cache(dev);
> @@ -1419,11 +1428,10 @@ struct mr_update_mp_data {
> 	 * generation below) will be guaranteed to be seen by other core
> 	 * before the core sees the newly allocated memory.
> 	 */
> -	++priv->mr.dev_gen;
> -	DEBUG("broadcasting local cache flush, gen=%d",
> -			priv->mr.dev_gen);
> +	++sh->mr.dev_gen;
> +	DEBUG("broadcasting local cache flush, gen=%d",	sh->mr.dev_gen);
> 	rte_smp_wmb();
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	return 0;
> }
> 
> @@ -1544,25 +1552,24 @@ struct mr_update_mp_data {
> /**
>  * Dump all the created MRs and the global cache entries.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> void
> -mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
> +mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
> {
> #ifndef NDEBUG
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr;
> 	int mr_n = 0;
> 	int chunk_n = 0;
> 
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> +	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
> 		unsigned int n;
> 
> -		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
> -		      dev->data->port_id, mr_n++,
> +		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
> +		      sh->ibdev_name, mr_n++,
> 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
> 		      mr->ms_n, mr->ms_bmp_n);
> 		if (mr->ms_n == 0)
> @@ -1577,45 +1584,40 @@ struct mr_update_mp_data {
> 			      chunk_n++, ret.start, ret.end);
> 		}
> 	}
> -	DEBUG("port %u dumping global cache", dev->data->port_id);
> -	mlx5_mr_btree_dump(&priv->mr.cache);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	DEBUG("device %s dumping global cache", sh->ibdev_name);
> +	mlx5_mr_btree_dump(&sh->mr.cache);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> #endif
> }
> 
> /**
> - * Release all the created MRs and resources. Remove device from memory callback
> + * Release all the created MRs and resources for shared device context.
>  * list.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> void
> -mlx5_mr_release(struct rte_eth_dev *dev)
> +mlx5_mr_release(struct mlx5_ibv_shared *sh)
> {
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr_next;
> 
> -	/* Remove from memory callback device list. */
> -	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> -	LIST_REMOVE(priv, mem_event_cb);
> -	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
> -		mlx5_mr_dump_dev(dev);
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +		mlx5_mr_dump_dev(sh);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Detach from MR list and move to free list. */
> -	mr_next = LIST_FIRST(&priv->mr.mr_list);
> +	mr_next = LIST_FIRST(&sh->mr.mr_list);
> 	while (mr_next != NULL) {
> 		struct mlx5_mr *mr = mr_next;
> 
> 		mr_next = LIST_NEXT(mr, mr);
> 		LIST_REMOVE(mr, mr);
> -		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 	}
> -	LIST_INIT(&priv->mr.mr_list);
> +	LIST_INIT(&sh->mr.mr_list);
> 	/* Free global cache. */
> -	mlx5_mr_btree_free(&priv->mr.cache);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	mlx5_mr_btree_free(&sh->mr.cache);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Free all remaining MRs. */
> -	mlx5_mr_garbage_collect(dev);
> +	mlx5_mr_garbage_collect(sh);
> }
> diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
> index 786f6a3..89e89b7 100644
> --- a/drivers/net/mlx5/mlx5_mr.h
> +++ b/drivers/net/mlx5/mlx5_mr.h
> @@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
> 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
> } __rte_packed;
> 
> +struct mlx5_ibv_shared;
> extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
> extern rte_rwlock_t mlx5_mem_event_rwlock;
> 
> @@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
> 			  size_t len, void *arg);
> int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
> 		      struct rte_mempool *mp);
> -void mlx5_mr_release(struct rte_eth_dev *dev);
> +void mlx5_mr_release(struct mlx5_ibv_shared *sh);
> 
> /* Debug purpose functions. */
> void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
> -void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
> +void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
> 
> /**
>  * Look up LKey from given lookup table by linear search. Firstly look up the
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index dbe074f..4d55fd4 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
> 		goto error;
> 	}
> 	/* Save pointer of global generation number to check memory event. */
> -	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
> +	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
> 	assert(desc > MLX5_TX_COMP_THRESH);
> 	tmpl->txq.offloads = conf->offloads |
> 			     dev->data->dev_conf.txmode.offloads;
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/mlx5: share Memory Regions for multiport device
  2019-04-24  8:52   ` Yongseok Koh
@ 2019-04-24  8:52     ` Yongseok Koh
  0 siblings, 0 replies; 20+ messages in thread
From: Yongseok Koh @ 2019-04-24  8:52 UTC (permalink / raw)
  To: Slava Ovsiienko; +Cc: Shahaf Shuler, dev


> On Apr 24, 2019, at 1:44 AM, Viacheslav Ovsiienko <viacheslavo@mellanox.com> wrote:
> 
> The multiport Infiniband device support was introduced [1].
> All active ports belonging to the same Infiniband device use the single
> shared Infiniband context of that device and share the resources:
>  - QPs are created within shared context
>  - Verbs flows are also created with specifying port index
>  - DV/DR resources
>  - Protection Domain
>  - Event Handlers
> 
> This patchset adds support for Memory Region sharing between
> ports created on the base of a multiport Infiniband device.
> The datapath of mlx5 uses a layered cache subsystem for
> allocating/releasing Memory Regions; only the lowest layer L3
> is subject to sharing, for performance reasons.
> 
> [1] http://patches.dpdk.org/cover/51800/
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
Acked-by: Yongseok Koh <yskoh@mellanox.com>

> v2:
>  - indentation issues
>  - comments cleanup
> 
> v1:
>  http://patches.dpdk.org/patch/52723/
> 
> drivers/net/mlx5/mlx5.c     |  40 +++++++----
> drivers/net/mlx5/mlx5.h     |  15 ++--
> drivers/net/mlx5/mlx5_mr.c  | 164 ++++++++++++++++++++++----------------------
> drivers/net/mlx5/mlx5_mr.h  |   5 +-
> drivers/net/mlx5/mlx5_txq.c |   2 +-
> 5 files changed, 121 insertions(+), 105 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index 1bb58b1..b563e0f 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -147,6 +147,7 @@ struct mlx5_dev_spawn_data {
> 	struct mlx5_switch_info info; /**< Switch information. */
> 	struct ibv_device *ibv_dev; /**< Associated IB device. */
> 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
> +	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
> };
> 
> static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
> @@ -225,6 +226,7 @@ struct mlx5_dev_spawn_data {
> 		sizeof(sh->ibdev_name));
> 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
> 		sizeof(sh->ibdev_path));
> +	sh->pci_dev = spawn->pci_dev;
> 	pthread_mutex_init(&sh->intr_mutex, NULL);
> 	/*
> 	 * Setting port_id to max unallowed value means
> @@ -239,6 +241,22 @@ struct mlx5_dev_spawn_data {
> 		err = ENOMEM;
> 		goto error;
> 	}
> +	/*
> +	 * Once the device is added to the list of memory event
> +	 * callback, its global MR cache table cannot be expanded
> +	 * on the fly because of deadlock. If it overflows, lookup
> +	 * should be done by searching MR list linearly, which is slow.
> +	 *
> +	 * At this point the device is not added to the memory
> +	 * event list yet, context is just being created.
> +	 */
> +	err = mlx5_mr_btree_init(&sh->mr.cache,
> +				 MLX5_MR_BTREE_CACHE_N * 2,
> +				 sh->pci_dev->device.numa_node);
> +	if (err) {
> +		err = rte_errno;
> +		goto error;
> +	}
> 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
> exit:
> 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
> @@ -286,6 +304,8 @@ struct mlx5_dev_spawn_data {
> 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> 	if (--sh->refcnt)
> 		goto exit;
> +	/* Release created Memory Regions. */
> +	mlx5_mr_release(sh);
> 	LIST_REMOVE(sh, next);
> 	/*
> 	 *  Ensure there is no async event handler installed.
> @@ -651,7 +671,10 @@ struct mlx5_dev_spawn_data {
> 	}
> 	mlx5_proc_priv_uninit(dev);
> 	mlx5_mprq_free_mp(dev);
> -	mlx5_mr_release(dev);
> +	/* Remove from memory callback device list. */
> +	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> +	LIST_REMOVE(priv, mem_event_cb);
> +	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> 	assert(priv->sh);
> 	mlx5_free_shared_dr(priv);
> 	if (priv->rss_conf.rss_key != NULL)
> @@ -1548,19 +1571,6 @@ struct mlx5_dev_spawn_data {
> 		goto error;
> 	}
> 	priv->config.flow_prio = err;
> -	/*
> -	 * Once the device is added to the list of memory event
> -	 * callback, its global MR cache table cannot be expanded
> -	 * on the fly because of deadlock. If it overflows, lookup
> -	 * should be done by searching MR list linearly, which is slow.
> -	 */
> -	err = mlx5_mr_btree_init(&priv->mr.cache,
> -				 MLX5_MR_BTREE_CACHE_N * 2,
> -				 eth_dev->device->numa_node);
> -	if (err) {
> -		err = rte_errno;
> -		goto error;
> -	}
> 	/* Add device to memory callback list. */
> 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
> @@ -1757,6 +1767,7 @@ struct mlx5_dev_spawn_data {
> 			list[ns].ibv_port = i;
> 			list[ns].ibv_dev = ibv_match[0];
> 			list[ns].eth_dev = NULL;
> +			list[ns].pci_dev = pci_dev;
> 			list[ns].ifindex = mlx5_nl_ifindex
> 					(nl_rdma, list[ns].ibv_dev->name, i);
> 			if (!list[ns].ifindex) {
> @@ -1823,6 +1834,7 @@ struct mlx5_dev_spawn_data {
> 			list[ns].ibv_port = 1;
> 			list[ns].ibv_dev = ibv_match[i];
> 			list[ns].eth_dev = NULL;
> +			list[ns].pci_dev = pci_dev;
> 			list[ns].ifindex = 0;
> 			if (nl_rdma >= 0)
> 				list[ns].ifindex = mlx5_nl_ifindex
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 0a6d7f1..2575732 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -275,6 +275,14 @@ struct mlx5_ibv_shared {
> 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
> 	struct ibv_device_attr_ex device_attr; /* Device properties. */
> +	struct rte_pci_device *pci_dev; /* Backend PCI device. */
> +	struct {
> +		uint32_t dev_gen; /* Generation number to flush local caches. */
> +		rte_rwlock_t rwlock; /* MR Lock. */
> +		struct mlx5_mr_btree cache; /* Global MR cache table. */
> +		struct mlx5_mr_list mr_list; /* Registered MR list. */
> +		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> +	} mr;
> 	/* Shared DV/DR flow data section. */
> 	pthread_mutex_t dv_mutex; /* DV context mutex. */
> 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
> @@ -347,13 +355,6 @@ struct mlx5_priv {
> 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
> 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
> 	/* Flow counters. */
> -	struct {
> -		uint32_t dev_gen; /* Generation number to flush local caches. */
> -		rte_rwlock_t rwlock; /* MR Lock. */
> -		struct mlx5_mr_btree cache; /* Global MR cache table. */
> -		struct mlx5_mr_list mr_list; /* Registered MR list. */
> -		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> -	} mr;
> 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
> 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
> 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
> diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
> index e4957a4..a7a63b1 100644
> --- a/drivers/net/mlx5/mlx5_mr.c
> +++ b/drivers/net/mlx5/mlx5_mr.c
> @@ -36,7 +36,7 @@ struct mr_update_mp_data {
> 
> /**
>  * Expand B-tree table to a given size. Can't be called with holding
> - * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
> + * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
>  *
>  * @param bt
>  *   Pointer to B-tree structure.
> @@ -350,7 +350,7 @@ struct mr_update_mp_data {
> 		n = mr_find_next_chunk(mr, &entry, n);
> 		if (!entry.end)
> 			break;
> -		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
> +		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
> 			/*
> 			 * Overflowed, but the global table cannot be expanded
> 			 * because of deadlock.
> @@ -382,7 +382,7 @@ struct mr_update_mp_data {
> 	struct mlx5_mr *mr;
> 
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> +	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
> 		unsigned int n;
> 
> 		if (mr->ms_n == 0)
> @@ -420,6 +420,7 @@ struct mr_update_mp_data {
> 	      uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	uint16_t idx;
> 	uint32_t lkey = UINT32_MAX;
> 	struct mlx5_mr *mr;
> @@ -430,10 +431,10 @@ struct mr_update_mp_data {
> 	 * has to be searched by traversing the original MR list instead, which
> 	 * is very slow path. Otherwise, the global cache is all inclusive.
> 	 */
> -	if (!unlikely(priv->mr.cache.overflow)) {
> -		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> +	if (!unlikely(sh->mr.cache.overflow)) {
> +		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> 		if (lkey != UINT32_MAX)
> -			*entry = (*priv->mr.cache.table)[idx];
> +			*entry = (*sh->mr.cache.table)[idx];
> 	} else {
> 		/* Falling back to the slowest path. */
> 		mr = mr_lookup_dev_list(dev, entry, addr);
> @@ -468,13 +469,12 @@ struct mr_update_mp_data {
> /**
>  * Release resources of detached MR having no online entry.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> static void
> -mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
> +mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
> {
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr_next;
> 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
> 
> @@ -484,11 +484,11 @@ struct mr_update_mp_data {
> 	 * MR can't be freed with holding the lock because rte_free() could call
> 	 * memory free callback function. This will be a deadlock situation.
> 	 */
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Detach the whole free list and release it after unlocking. */
> -	free_list = priv->mr.mr_free_list;
> -	LIST_INIT(&priv->mr.mr_free_list);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	free_list = sh->mr.mr_free_list;
> +	LIST_INIT(&sh->mr.mr_free_list);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Release resources. */
> 	mr_next = LIST_FIRST(&free_list);
> 	while (mr_next != NULL) {
> @@ -548,12 +548,12 @@ struct mr_update_mp_data {
> 		      dev->data->port_id, (void *)addr);
> 		return UINT32_MAX;
> 	}
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
> 	/* Fill in output data. */
> 	mr_lookup_dev(dev, entry, addr);
> 	/* Lookup can't fail. */
> 	assert(entry->lkey != UINT32_MAX);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
> 	DEBUG("port %u MR CREATED by primary process for %p:\n"
> 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
> 	      dev->data->port_id, (void *)addr,
> @@ -582,6 +582,7 @@ struct mr_update_mp_data {
> 		       uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_dev_config *config = &priv->config;
> 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
> 	const struct rte_memseg_list *msl;
> @@ -602,12 +603,12 @@ struct mr_update_mp_data {
> 		dev->data->port_id, (void *)addr);
> 	/*
> 	 * Release detached MRs if any. This can't be called with holding either
> -	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
> +	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
> 	 * been detached by the memory free event but it couldn't be released
> 	 * inside the callback due to deadlock. As a result, releasing resources
> 	 * is quite opportunistic.
> 	 */
> -	mlx5_mr_garbage_collect(dev);
> +	mlx5_mr_garbage_collect(sh);
> 	/*
> 	 * If enabled, find out a contiguous virtual address chunk in use, to
> 	 * which the given address belongs, in order to register maximum range.
> @@ -710,7 +711,7 @@ struct mr_update_mp_data {
> 		goto alloc_resources;
> 	}
> 	assert(data.msl == data_re.msl);
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/*
> 	 * Check the address is really missing. If other thread already created
> 	 * one or it is not found due to overflow, abort and return.
> @@ -721,10 +722,10 @@ struct mr_update_mp_data {
> 		 * low-on-memory. Then, this entry will have to be searched
> 		 * here again.
> 		 */
> -		mr_btree_insert(&priv->mr.cache, entry);
> +		mr_btree_insert(&sh->mr.cache, entry);
> 		DEBUG("port %u found MR for %p on final lookup, abort",
> 		      dev->data->port_id, (void *)addr);
> -		rte_rwlock_write_unlock(&priv->mr.rwlock);
> +		rte_rwlock_write_unlock(&sh->mr.rwlock);
> 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> 		/*
> 		 * Must be unlocked before calling rte_free() because
> @@ -769,7 +770,7 @@ struct mr_update_mp_data {
> 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
> 	 * through mlx5_alloc_verbs_buf().
> 	 */
> -	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
> +	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
> 				       IBV_ACCESS_LOCAL_WRITE);
> 	if (mr->ibv_mr == NULL) {
> 		DEBUG("port %u fail to create a verbs MR for address (%p)",
> @@ -779,7 +780,7 @@ struct mr_update_mp_data {
> 	}
> 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
> 	assert(mr->ibv_mr->length == len);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	DEBUG("port %u MR CREATED (%p) for %p:\n"
> 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
> 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
> @@ -792,11 +793,11 @@ struct mr_update_mp_data {
> 	mr_lookup_dev(dev, entry, addr);
> 	/* Lookup can't fail. */
> 	assert(entry->lkey != UINT32_MAX);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> 	return entry->lkey;
> err_mrlock:
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> err_memlock:
> 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> err_nolock:
> @@ -854,14 +855,15 @@ struct mr_update_mp_data {
> mr_rebuild_dev_cache(struct rte_eth_dev *dev)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr *mr;
> 
> 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
> 	/* Flush cache to rebuild. */
> -	priv->mr.cache.len = 1;
> -	priv->mr.cache.overflow = 0;
> +	sh->mr.cache.len = 1;
> +	sh->mr.cache.overflow = 0;
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
> +	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
> 		if (mr_insert_dev_cache(dev, mr) < 0)
> 			return;
> }
> @@ -888,6 +890,7 @@ struct mr_update_mp_data {
> mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	const struct rte_memseg_list *msl;
> 	struct mlx5_mr *mr;
> 	int ms_n;
> @@ -901,7 +904,7 @@ struct mr_update_mp_data {
> 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
> 	assert(len == RTE_ALIGN(len, msl->page_sz));
> 	ms_n = len / msl->page_sz;
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Clear bits of freed memsegs from MR. */
> 	for (i = 0; i < ms_n; ++i) {
> 		const struct rte_memseg *ms;
> @@ -928,7 +931,7 @@ struct mr_update_mp_data {
> 		rte_bitmap_clear(mr->ms_bmp, pos);
> 		if (--mr->ms_n == 0) {
> 			LIST_REMOVE(mr, mr);
> -			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 			DEBUG("port %u remove MR(%p) from list",
> 			      dev->data->port_id, (void *)mr);
> 		}
> @@ -949,12 +952,12 @@ struct mr_update_mp_data {
> 		 * generation below) will be guaranteed to be seen by other core
> 		 * before the core sees the newly allocated memory.
> 		 */
> -		++priv->mr.dev_gen;
> +		++sh->mr.dev_gen;
> 		DEBUG("broadcasting local cache flush, gen=%d",
> -		      priv->mr.dev_gen);
> +		      sh->mr.dev_gen);
> 		rte_smp_wmb();
> 	}
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> }
> 
> /**
> @@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
> 		   struct mlx5_mr_cache *entry, uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
> 	uint16_t idx;
> 	uint32_t lkey;
> @@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
> 	if (unlikely(bt->len == bt->size))
> 		mr_btree_expand(bt, bt->size << 1);
> 	/* Look up in the global cache. */
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> -	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> +	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> 	if (lkey != UINT32_MAX) {
> 		/* Found. */
> -		*entry = (*priv->mr.cache.table)[idx];
> -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> +		*entry = (*sh->mr.cache.table)[idx];
> +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> 		/*
> 		 * Update local cache. Even if it fails, return the found entry
> 		 * to update top-half cache. Next time, this entry will be found
> @@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
> 		mr_btree_insert(bt, entry);
> 		return lkey;
> 	}
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	/* First time to see the address? Create a new MR. */
> 	lkey = mlx5_mr_create(dev, entry, addr);
> 	/*
> @@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
> 	struct mr_update_mp_data *data = opaque;
> 	struct rte_eth_dev *dev = data->dev;
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
> 	struct mlx5_mr *mr = NULL;
> 	uintptr_t addr = (uintptr_t)memhdr->addr;
> @@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
> 
> 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> 	/* If already registered, it should return. */
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	lkey = mr_lookup_dev(dev, &entry, addr);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	if (lkey != UINT32_MAX)
> 		return;
> 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
> @@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
> 		data->ret = -1;
> 		return;
> 	}
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	/* Insert to the global cache table. */
> 	mr_insert_dev_cache(dev, mr);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Insert to the local cache table */
> 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
> }
> @@ -1339,6 +1344,7 @@ struct mr_update_mp_data {
> 	struct rte_eth_dev *dev;
> 	struct mlx5_mr *mr;
> 	struct mlx5_priv *priv;
> +	struct mlx5_ibv_shared *sh;
> 
> 	dev = pci_dev_to_eth_dev(pdev);
> 	if (!dev) {
> @@ -1355,11 +1361,12 @@ struct mr_update_mp_data {
> 		rte_errno = EINVAL;
> 		return -1;
> 	}
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	sh = priv->sh;
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	/* Insert to the global cache table. */
> 	mr_insert_dev_cache(dev, mr);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	return 0;
> }
> 
> @@ -1384,6 +1391,7 @@ struct mr_update_mp_data {
> {
> 	struct rte_eth_dev *dev;
> 	struct mlx5_priv *priv;
> +	struct mlx5_ibv_shared *sh;
> 	struct mlx5_mr *mr;
> 	struct mlx5_mr_cache entry;
> 
> @@ -1395,10 +1403,11 @@ struct mr_update_mp_data {
> 		return -1;
> 	}
> 	priv = dev->data->dev_private;
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	sh = priv->sh;
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
> 	if (!mr) {
> -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
> 				 "to PCI device %p", (uintptr_t)addr,
> 				 (void *)pdev);
> @@ -1406,7 +1415,7 @@ struct mr_update_mp_data {
> 		return -1;
> 	}
> 	LIST_REMOVE(mr, mr);
> -	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
> 	      (void *)mr);
> 	mr_rebuild_dev_cache(dev);
> @@ -1419,11 +1428,10 @@ struct mr_update_mp_data {
> 	 * generation below) will be guaranteed to be seen by other core
> 	 * before the core sees the newly allocated memory.
> 	 */
> -	++priv->mr.dev_gen;
> -	DEBUG("broadcasting local cache flush, gen=%d",
> -			priv->mr.dev_gen);
> +	++sh->mr.dev_gen;
> +	DEBUG("broadcasting local cache flush, gen=%d",	sh->mr.dev_gen);
> 	rte_smp_wmb();
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	return 0;
> }
> 
> @@ -1544,25 +1552,24 @@ struct mr_update_mp_data {
> /**
>  * Dump all the created MRs and the global cache entries.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> void
> -mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
> +mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
> {
> #ifndef NDEBUG
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr;
> 	int mr_n = 0;
> 	int chunk_n = 0;
> 
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> +	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
> 		unsigned int n;
> 
> -		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
> -		      dev->data->port_id, mr_n++,
> +		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
> +		      sh->ibdev_name, mr_n++,
> 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
> 		      mr->ms_n, mr->ms_bmp_n);
> 		if (mr->ms_n == 0)
> @@ -1577,45 +1584,40 @@ struct mr_update_mp_data {
> 			      chunk_n++, ret.start, ret.end);
> 		}
> 	}
> -	DEBUG("port %u dumping global cache", dev->data->port_id);
> -	mlx5_mr_btree_dump(&priv->mr.cache);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	DEBUG("device %s dumping global cache", sh->ibdev_name);
> +	mlx5_mr_btree_dump(&sh->mr.cache);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> #endif
> }
> 
> /**
> - * Release all the created MRs and resources. Remove device from memory callback
> + * Release all the created MRs and resources for shared device context.
>  * list.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> void
> -mlx5_mr_release(struct rte_eth_dev *dev)
> +mlx5_mr_release(struct mlx5_ibv_shared *sh)
> {
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr_next;
> 
> -	/* Remove from memory callback device list. */
> -	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> -	LIST_REMOVE(priv, mem_event_cb);
> -	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
> -		mlx5_mr_dump_dev(dev);
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +		mlx5_mr_dump_dev(sh);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Detach from MR list and move to free list. */
> -	mr_next = LIST_FIRST(&priv->mr.mr_list);
> +	mr_next = LIST_FIRST(&sh->mr.mr_list);
> 	while (mr_next != NULL) {
> 		struct mlx5_mr *mr = mr_next;
> 
> 		mr_next = LIST_NEXT(mr, mr);
> 		LIST_REMOVE(mr, mr);
> -		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 	}
> -	LIST_INIT(&priv->mr.mr_list);
> +	LIST_INIT(&sh->mr.mr_list);
> 	/* Free global cache. */
> -	mlx5_mr_btree_free(&priv->mr.cache);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	mlx5_mr_btree_free(&sh->mr.cache);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Free all remaining MRs. */
> -	mlx5_mr_garbage_collect(dev);
> +	mlx5_mr_garbage_collect(sh);
> }
> diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
> index 786f6a3..89e89b7 100644
> --- a/drivers/net/mlx5/mlx5_mr.h
> +++ b/drivers/net/mlx5/mlx5_mr.h
> @@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
> 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
> } __rte_packed;
> 
> +struct mlx5_ibv_shared;
> extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
> extern rte_rwlock_t mlx5_mem_event_rwlock;
> 
> @@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
> 			  size_t len, void *arg);
> int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
> 		      struct rte_mempool *mp);
> -void mlx5_mr_release(struct rte_eth_dev *dev);
> +void mlx5_mr_release(struct mlx5_ibv_shared *sh);
> 
> /* Debug purpose functions. */
> void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
> -void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
> +void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
> 
> /**
>  * Look up LKey from given lookup table by linear search. Firstly look up the
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index dbe074f..4d55fd4 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
> 		goto error;
> 	}
> 	/* Save pointer of global generation number to check memory event. */
> -	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
> +	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
> 	assert(desc > MLX5_TX_COMP_THRESH);
> 	tmpl->txq.offloads = conf->offloads |
> 			     dev->data->dev_conf.txmode.offloads;
> -- 
> 1.8.3.1
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices
  2019-04-24  8:44 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
  2019-04-24  8:44   ` Viacheslav Ovsiienko
  2019-04-24  8:52   ` Yongseok Koh
@ 2019-04-27  4:32   ` Viacheslav Ovsiienko
  2019-04-27  4:32     ` Viacheslav Ovsiienko
                       ` (3 more replies)
  2 siblings, 4 replies; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-27  4:32 UTC (permalink / raw)
  To: shahafs; +Cc: dev, yskoh

The patches [1] and [2] are both related to Memory Regions sharing and their
application order matters; this series simply combines them.

The multiport Infiniband device support was introduced [3].
All active ports belonging to the same Infiniband device use the single
shared Infiniband context of that device and share its resources:
  - QPs are created within the shared context
  - Verbs flows are also created with the port index specified
  - DV/DR resources
  - Protection Domain
  - Event Handlers

This patchset adds support for Memory Regions sharing between
the ports created on top of a multiport Infiniband device.
The datapath of mlx5 uses a layered cache subsystem for
allocating/releasing Memory Regions; only the lowest layer (L3)
is shared, for performance reasons.
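
Below is a rough sketch of where the shared MR section ends up
(illustrative only; the authoritative definition is the mlx5.h hunk in
patch 1/2):

    /* Per-IB-device context, shared by all ports spawned from it. */
    struct mlx5_ibv_shared {
            /* ... PD, event handlers, DV/DR data ... */
            struct {
                    uint32_t dev_gen;    /* Generation number to flush local caches. */
                    rte_rwlock_t rwlock; /* MR lock. */
                    struct mlx5_mr_btree cache;       /* Global MR cache table. */
                    struct mlx5_mr_list mr_list;      /* Registered MR list. */
                    struct mlx5_mr_list mr_free_list; /* Freed MR list. */
            } mr;
            /* ... */
    };

    /* Per-port private data reaches the MR cache only through sh. */
    struct mlx5_priv {
            struct mlx5_ibv_shared *sh; /* Shared Infiniband device context. */
            /* ... the per-port mr section is gone ... */
    };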

[1] http://patches.dpdk.org/patch/53040/
[2] http://patches.dpdk.org/patch/53041/
[3] http://patches.dpdk.org/cover/51800/

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
v3:
  - combine the two patches into one patchset
  - commit messages spellcheck

v2:
  - indentation issues
  - comments cleanup

v1:
  http://patches.dpdk.org/patch/52723/

Viacheslav Ovsiienko (2):
  net/mlx5: share Memory Regions for multiport device
  net/mlx5: update memory event callback for shared context

 drivers/net/mlx5/mlx5.c     |  42 +++++---
 drivers/net/mlx5/mlx5.h     |  21 ++--
 drivers/net/mlx5/mlx5_mr.c  | 245 ++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5_mr.h  |   5 +-
 drivers/net/mlx5/mlx5_txq.c |   2 +-
 5 files changed, 162 insertions(+), 153 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [PATCH v3 1/2] net/mlx5: share Memory Regions for multiport device
  2019-04-27  4:32   ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Viacheslav Ovsiienko
  2019-04-27  4:32     ` Viacheslav Ovsiienko
@ 2019-04-27  4:32     ` Viacheslav Ovsiienko
  2019-04-27  4:32       ` Viacheslav Ovsiienko
  2019-04-27  4:32     ` [dpdk-dev] [PATCH v3 2/2] net/mlx5: update memory event callback for shared context Viacheslav Ovsiienko
  2019-04-29  8:54     ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Shahaf Shuler
  3 siblings, 1 reply; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-27  4:32 UTC (permalink / raw)
  To: shahafs; +Cc: dev, yskoh

The multiport Infiniband device support was introduced [1].
All active ports belonging to the same Infiniband device use the single
shared Infiniband context of that device and share its resources:
  - QPs are created within the shared context
  - Verbs flows are also created with the port index specified
  - DV/DR resources
  - Protection Domain
  - Event Handlers

This patchset adds support for Memory Regions sharing between
the ports created on top of a multiport Infiniband device.
The datapath of mlx5 uses a layered cache subsystem for
allocating/releasing Memory Regions; only the lowest layer (L3)
is shared, for performance reasons.
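
A rough sketch of the resulting lookup layering (illustrative only and
simplified; the function name below is invented, the real logic lives in
the mlx5_mr.c hunks of this patch):

    static uint32_t
    lkey_lookup_sketch(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
                       struct mlx5_mr_cache *entry, uintptr_t addr)
    {
            struct mlx5_priv *priv = dev->data->dev_private;
            struct mlx5_ibv_shared *sh = priv->sh;
            uint16_t idx;
            uint32_t lkey;

            /* Upper layers: per-queue bottom-half B-tree, still not shared. */
            lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &idx, addr);
            if (lkey != UINT32_MAX)
                    return lkey;
            /* Lowest layer (L3): global cache, now per shared IB context. */
            rte_rwlock_read_lock(&sh->mr.rwlock);
            lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
            if (lkey != UINT32_MAX)
                    *entry = (*sh->mr.cache.table)[idx];
            rte_rwlock_read_unlock(&sh->mr.rwlock);
            if (lkey != UINT32_MAX)
                    return lkey;
            /* Miss everywhere: register a new MR under the shared context. */
            return mlx5_mr_create(dev, entry, addr);
    }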

[1] http://patches.dpdk.org/cover/51800/

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c     |  40 +++++++----
 drivers/net/mlx5/mlx5.h     |  15 ++--
 drivers/net/mlx5/mlx5_mr.c  | 164 ++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5_mr.h  |   5 +-
 drivers/net/mlx5/mlx5_txq.c |   2 +-
 5 files changed, 121 insertions(+), 105 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1bb58b1..b563e0f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -147,6 +147,7 @@ struct mlx5_dev_spawn_data {
 	struct mlx5_switch_info info; /**< Switch information. */
 	struct ibv_device *ibv_dev; /**< Associated IB device. */
 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
 };
 
 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
@@ -225,6 +226,7 @@ struct mlx5_dev_spawn_data {
 		sizeof(sh->ibdev_name));
 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
 		sizeof(sh->ibdev_path));
+	sh->pci_dev = spawn->pci_dev;
 	pthread_mutex_init(&sh->intr_mutex, NULL);
 	/*
 	 * Setting port_id to max unallowed value means
@@ -239,6 +241,22 @@ struct mlx5_dev_spawn_data {
 		err = ENOMEM;
 		goto error;
 	}
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 *
+	 * At this point the device is not added to the memory
+	 * event list yet, context is just being created.
+	 */
+	err = mlx5_mr_btree_init(&sh->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 sh->pci_dev->device.numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
 exit:
 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
@@ -286,6 +304,8 @@ struct mlx5_dev_spawn_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	if (--sh->refcnt)
 		goto exit;
+	/* Release created Memory Regions. */
+	mlx5_mr_release(sh);
 	LIST_REMOVE(sh, next);
 	/*
 	 *  Ensure there is no async event handler installed.
@@ -651,7 +671,10 @@ struct mlx5_dev_spawn_data {
 	}
 	mlx5_proc_priv_uninit(dev);
 	mlx5_mprq_free_mp(dev);
-	mlx5_mr_release(dev);
+	/* Remove from memory callback device list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_REMOVE(priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
@@ -1548,19 +1571,6 @@ struct mlx5_dev_spawn_data {
 		goto error;
 	}
 	priv->config.flow_prio = err;
-	/*
-	 * Once the device is added to the list of memory event
-	 * callback, its global MR cache table cannot be expanded
-	 * on the fly because of deadlock. If it overflows, lookup
-	 * should be done by searching MR list linearly, which is slow.
-	 */
-	err = mlx5_mr_btree_init(&priv->mr.cache,
-				 MLX5_MR_BTREE_CACHE_N * 2,
-				 eth_dev->device->numa_node);
-	if (err) {
-		err = rte_errno;
-		goto error;
-	}
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
@@ -1757,6 +1767,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = i;
 			list[ns].ibv_dev = ibv_match[0];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = mlx5_nl_ifindex
 					(nl_rdma, list[ns].ibv_dev->name, i);
 			if (!list[ns].ifindex) {
@@ -1823,6 +1834,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = 1;
 			list[ns].ibv_dev = ibv_match[i];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = 0;
 			if (nl_rdma >= 0)
 				list[ns].ifindex = mlx5_nl_ifindex
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 0a6d7f1..2575732 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -275,6 +275,14 @@ struct mlx5_ibv_shared {
 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
+	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	struct {
+		uint32_t dev_gen; /* Generation number to flush local caches. */
+		rte_rwlock_t rwlock; /* MR Lock. */
+		struct mlx5_mr_btree cache; /* Global MR cache table. */
+		struct mlx5_mr_list mr_list; /* Registered MR list. */
+		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
+	} mr;
 	/* Shared DV/DR flow data section. */
 	pthread_mutex_t dv_mutex; /* DV context mutex. */
 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
@@ -347,13 +355,6 @@ struct mlx5_priv {
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
 	/* Flow counters. */
-	struct {
-		uint32_t dev_gen; /* Generation number to flush local caches. */
-		rte_rwlock_t rwlock; /* MR Lock. */
-		struct mlx5_mr_btree cache; /* Global MR cache table. */
-		struct mlx5_mr_list mr_list; /* Registered MR list. */
-		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
-	} mr;
 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index e4957a4..a7a63b1 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -36,7 +36,7 @@ struct mr_update_mp_data {
 
 /**
  * Expand B-tree table to a given size. Can't be called with holding
- * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
+ * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
  *
  * @param bt
  *   Pointer to B-tree structure.
@@ -350,7 +350,7 @@ struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -382,7 +382,7 @@ struct mr_update_mp_data {
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -420,6 +420,7 @@ struct mr_update_mp_data {
 	      uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -430,10 +431,10 @@ struct mr_update_mp_data {
 	 * has to be searched by traversing the original MR list instead, which
 	 * is very slow path. Otherwise, the global cache is all inclusive.
 	 */
-	if (!unlikely(priv->mr.cache.overflow)) {
-		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	if (!unlikely(sh->mr.cache.overflow)) {
+		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 		if (lkey != UINT32_MAX)
-			*entry = (*priv->mr.cache.table)[idx];
+			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
 		mr = mr_lookup_dev_list(dev, entry, addr);
@@ -468,13 +469,12 @@ struct mr_update_mp_data {
 /**
  * Release resources of detached MR having no online entry.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
+mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 
@@ -484,11 +484,11 @@ struct mr_update_mp_data {
 	 * MR can't be freed with holding the lock because rte_free() could call
 	 * memory free callback function. This will be a deadlock situation.
 	 */
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach the whole free list and release it after unlocking. */
-	free_list = priv->mr.mr_free_list;
-	LIST_INIT(&priv->mr.mr_free_list);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	free_list = sh->mr.mr_free_list;
+	LIST_INIT(&sh->mr.mr_free_list);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Release resources. */
 	mr_next = LIST_FIRST(&free_list);
 	while (mr_next != NULL) {
@@ -548,12 +548,12 @@ struct mr_update_mp_data {
 		      dev->data->port_id, (void *)addr);
 		return UINT32_MAX;
 	}
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
 	DEBUG("port %u MR CREATED by primary process for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
 	      dev->data->port_id, (void *)addr,
@@ -582,6 +582,7 @@ struct mr_update_mp_data {
 		       uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_dev_config *config = &priv->config;
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	const struct rte_memseg_list *msl;
@@ -602,12 +603,12 @@ struct mr_update_mp_data {
 		dev->data->port_id, (void *)addr);
 	/*
 	 * Release detached MRs if any. This can't be called with holding either
-	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
+	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
 	 * been detached by the memory free event but it couldn't be released
 	 * inside the callback due to deadlock. As a result, releasing resources
 	 * is quite opportunistic.
 	 */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 	/*
 	 * If enabled, find out a contiguous virtual address chunk in use, to
 	 * which the given address belongs, in order to register maximum range.
@@ -710,7 +711,7 @@ struct mr_update_mp_data {
 		goto alloc_resources;
 	}
 	assert(data.msl == data_re.msl);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/*
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
@@ -721,10 +722,10 @@ struct mr_update_mp_data {
 		 * low-on-memory. Then, this entry will have to be searched
 		 * here again.
 		 */
-		mr_btree_insert(&priv->mr.cache, entry);
+		mr_btree_insert(&sh->mr.cache, entry);
 		DEBUG("port %u found MR for %p on final lookup, abort",
 		      dev->data->port_id, (void *)addr);
-		rte_rwlock_write_unlock(&priv->mr.rwlock);
+		rte_rwlock_write_unlock(&sh->mr.rwlock);
 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 		/*
 		 * Must be unlocked before calling rte_free() because
@@ -769,7 +770,7 @@ struct mr_update_mp_data {
 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
 	 * through mlx5_alloc_verbs_buf().
 	 */
-	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
+	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
 				       IBV_ACCESS_LOCAL_WRITE);
 	if (mr->ibv_mr == NULL) {
 		DEBUG("port %u fail to create a verbs MR for address (%p)",
@@ -779,7 +780,7 @@ struct mr_update_mp_data {
 	}
 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
 	assert(mr->ibv_mr->length == len);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	DEBUG("port %u MR CREATED (%p) for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
@@ -792,11 +793,11 @@ struct mr_update_mp_data {
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 	return entry->lkey;
 err_mrlock:
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 err_memlock:
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 err_nolock:
@@ -854,14 +855,15 @@ struct mr_update_mp_data {
 mr_rebuild_dev_cache(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
 	/* Flush cache to rebuild. */
-	priv->mr.cache.len = 1;
-	priv->mr.cache.overflow = 0;
+	sh->mr.cache.len = 1;
+	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
 		if (mr_insert_dev_cache(dev, mr) < 0)
 			return;
 }
@@ -888,6 +890,7 @@ struct mr_update_mp_data {
 mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
@@ -901,7 +904,7 @@ struct mr_update_mp_data {
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
 	assert(len == RTE_ALIGN(len, msl->page_sz));
 	ms_n = len / msl->page_sz;
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Clear bits of freed memsegs from MR. */
 	for (i = 0; i < ms_n; ++i) {
 		const struct rte_memseg *ms;
@@ -928,7 +931,7 @@ struct mr_update_mp_data {
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
-			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 			DEBUG("port %u remove MR(%p) from list",
 			      dev->data->port_id, (void *)mr);
 		}
@@ -949,12 +952,12 @@ struct mr_update_mp_data {
 		 * generation below) will be guaranteed to be seen by other core
 		 * before the core sees the newly allocated memory.
 		 */
-		++priv->mr.dev_gen;
+		++sh->mr.dev_gen;
 		DEBUG("broadcasting local cache flush, gen=%d",
-		      priv->mr.dev_gen);
+		      sh->mr.dev_gen);
 		rte_smp_wmb();
 	}
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 }
 
 /**
@@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
 		   struct mlx5_mr_cache *entry, uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
 	uint16_t idx;
 	uint32_t lkey;
@@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
 	if (unlikely(bt->len == bt->size))
 		mr_btree_expand(bt, bt->size << 1);
 	/* Look up in the global cache. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
-	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
+	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 	if (lkey != UINT32_MAX) {
 		/* Found. */
-		*entry = (*priv->mr.cache.table)[idx];
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		*entry = (*sh->mr.cache.table)[idx];
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		/*
 		 * Update local cache. Even if it fails, return the found entry
 		 * to update top-half cache. Next time, this entry will be found
@@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
 		mr_btree_insert(bt, entry);
 		return lkey;
 	}
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	/* First time to see the address? Create a new MR. */
 	lkey = mlx5_mr_create(dev, entry, addr);
 	/*
@@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
 	struct mr_update_mp_data *data = opaque;
 	struct rte_eth_dev *dev = data->dev;
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
 	struct mlx5_mr *mr = NULL;
 	uintptr_t addr = (uintptr_t)memhdr->addr;
@@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
 
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	lkey = mr_lookup_dev(dev, &entry, addr);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
@@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
 		data->ret = -1;
 		return;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
 }
@@ -1339,6 +1344,7 @@ struct mr_update_mp_data {
 	struct rte_eth_dev *dev;
 	struct mlx5_mr *mr;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 
 	dev = pci_dev_to_eth_dev(pdev);
 	if (!dev) {
@@ -1355,11 +1361,12 @@ struct mr_update_mp_data {
 		rte_errno = EINVAL;
 		return -1;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	sh = priv->sh;
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1384,6 +1391,7 @@ struct mr_update_mp_data {
 {
 	struct rte_eth_dev *dev;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_mr *mr;
 	struct mlx5_mr_cache entry;
 
@@ -1395,10 +1403,11 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	priv = dev->data->dev_private;
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	sh = priv->sh;
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
 	if (!mr) {
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
 				 "to PCI device %p", (uintptr_t)addr,
 				 (void *)pdev);
@@ -1406,7 +1415,7 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	LIST_REMOVE(mr, mr);
-	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
 	mr_rebuild_dev_cache(dev);
@@ -1419,11 +1428,10 @@ struct mr_update_mp_data {
 	 * generation below) will be guaranteed to be seen by other core
 	 * before the core sees the newly allocated memory.
 	 */
-	++priv->mr.dev_gen;
-	DEBUG("broadcasting local cache flush, gen=%d",
-			priv->mr.dev_gen);
+	++sh->mr.dev_gen;
+	DEBUG("broadcasting local cache flush, gen=%d",	sh->mr.dev_gen);
 	rte_smp_wmb();
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1544,25 +1552,24 @@ struct mr_update_mp_data {
 /**
  * Dump all the created MRs and the global cache entries.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
+mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
 {
 #ifndef NDEBUG
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 	int mr_n = 0;
 	int chunk_n = 0;
 
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
-		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
-		      dev->data->port_id, mr_n++,
+		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
+		      sh->ibdev_name, mr_n++,
 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
 		      mr->ms_n, mr->ms_bmp_n);
 		if (mr->ms_n == 0)
@@ -1577,45 +1584,40 @@ struct mr_update_mp_data {
 			      chunk_n++, ret.start, ret.end);
 		}
 	}
-	DEBUG("port %u dumping global cache", dev->data->port_id);
-	mlx5_mr_btree_dump(&priv->mr.cache);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	DEBUG("device %s dumping global cache", sh->ibdev_name);
+	mlx5_mr_btree_dump(&sh->mr.cache);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 #endif
 }
 
 /**
- * Release all the created MRs and resources. Remove device from memory callback
+ * Release all the created MRs and resources for shared device context.
  * list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_release(struct rte_eth_dev *dev)
+mlx5_mr_release(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 
-	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
-		mlx5_mr_dump_dev(dev);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+		mlx5_mr_dump_dev(sh);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach from MR list and move to free list. */
-	mr_next = LIST_FIRST(&priv->mr.mr_list);
+	mr_next = LIST_FIRST(&sh->mr.mr_list);
 	while (mr_next != NULL) {
 		struct mlx5_mr *mr = mr_next;
 
 		mr_next = LIST_NEXT(mr, mr);
 		LIST_REMOVE(mr, mr);
-		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	}
-	LIST_INIT(&priv->mr.mr_list);
+	LIST_INIT(&sh->mr.mr_list);
 	/* Free global cache. */
-	mlx5_mr_btree_free(&priv->mr.cache);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	mlx5_mr_btree_free(&sh->mr.cache);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Free all remaining MRs. */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 }
diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
index 786f6a3..89e89b7 100644
--- a/drivers/net/mlx5/mlx5_mr.h
+++ b/drivers/net/mlx5/mlx5_mr.h
@@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
 } __rte_packed;
 
+struct mlx5_ibv_shared;
 extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
 extern rte_rwlock_t mlx5_mem_event_rwlock;
 
@@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 			  size_t len, void *arg);
 int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
 		      struct rte_mempool *mp);
-void mlx5_mr_release(struct rte_eth_dev *dev);
+void mlx5_mr_release(struct mlx5_ibv_shared *sh);
 
 /* Debug purpose functions. */
 void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
-void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
 
 /**
  * Look up LKey from given lookup table by linear search. Firstly look up the
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index dbe074f..4d55fd4 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
 		goto error;
 	}
 	/* Save pointer of global generation number to check memory event. */
-	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
+	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
 	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl->txq.offloads = conf->offloads |
 			     dev->data->dev_conf.txmode.offloads;
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [PATCH v3 1/2] net/mlx5: share Memory Regions for multiport device
  2019-04-27  4:32     ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: share Memory Regions for multiport device Viacheslav Ovsiienko
@ 2019-04-27  4:32       ` Viacheslav Ovsiienko
  0 siblings, 0 replies; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-27  4:32 UTC (permalink / raw)
  To: shahafs; +Cc: dev, yskoh

The multiport Infiniband device support was introduced [1].
All active ports belonging to the same Infiniband device use the single
shared Infiniband context of that device and share its resources:
  - QPs are created within the shared context
  - Verbs flows are also created with the port index specified
  - DV/DR resources
  - Protection Domain
  - Event Handlers

This patchset adds support for Memory Regions sharing between
the ports created on top of a multiport Infiniband device.
The datapath of mlx5 uses a layered cache subsystem for
allocating/releasing Memory Regions; only the lowest layer (L3)
is shared, for performance reasons.

[1] http://patches.dpdk.org/cover/51800/

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c     |  40 +++++++----
 drivers/net/mlx5/mlx5.h     |  15 ++--
 drivers/net/mlx5/mlx5_mr.c  | 164 ++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5_mr.h  |   5 +-
 drivers/net/mlx5/mlx5_txq.c |   2 +-
 5 files changed, 121 insertions(+), 105 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1bb58b1..b563e0f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -147,6 +147,7 @@ struct mlx5_dev_spawn_data {
 	struct mlx5_switch_info info; /**< Switch information. */
 	struct ibv_device *ibv_dev; /**< Associated IB device. */
 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
 };
 
 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
@@ -225,6 +226,7 @@ struct mlx5_dev_spawn_data {
 		sizeof(sh->ibdev_name));
 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
 		sizeof(sh->ibdev_path));
+	sh->pci_dev = spawn->pci_dev;
 	pthread_mutex_init(&sh->intr_mutex, NULL);
 	/*
 	 * Setting port_id to max unallowed value means
@@ -239,6 +241,22 @@ struct mlx5_dev_spawn_data {
 		err = ENOMEM;
 		goto error;
 	}
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 *
+	 * At this point the device is not added to the memory
+	 * event list yet, context is just being created.
+	 */
+	err = mlx5_mr_btree_init(&sh->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 sh->pci_dev->device.numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
 exit:
 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
@@ -286,6 +304,8 @@ struct mlx5_dev_spawn_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	if (--sh->refcnt)
 		goto exit;
+	/* Release created Memory Regions. */
+	mlx5_mr_release(sh);
 	LIST_REMOVE(sh, next);
 	/*
 	 *  Ensure there is no async event handler installed.
@@ -651,7 +671,10 @@ struct mlx5_dev_spawn_data {
 	}
 	mlx5_proc_priv_uninit(dev);
 	mlx5_mprq_free_mp(dev);
-	mlx5_mr_release(dev);
+	/* Remove from memory callback device list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_REMOVE(priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
@@ -1548,19 +1571,6 @@ struct mlx5_dev_spawn_data {
 		goto error;
 	}
 	priv->config.flow_prio = err;
-	/*
-	 * Once the device is added to the list of memory event
-	 * callback, its global MR cache table cannot be expanded
-	 * on the fly because of deadlock. If it overflows, lookup
-	 * should be done by searching MR list linearly, which is slow.
-	 */
-	err = mlx5_mr_btree_init(&priv->mr.cache,
-				 MLX5_MR_BTREE_CACHE_N * 2,
-				 eth_dev->device->numa_node);
-	if (err) {
-		err = rte_errno;
-		goto error;
-	}
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
@@ -1757,6 +1767,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = i;
 			list[ns].ibv_dev = ibv_match[0];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = mlx5_nl_ifindex
 					(nl_rdma, list[ns].ibv_dev->name, i);
 			if (!list[ns].ifindex) {
@@ -1823,6 +1834,7 @@ struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = 1;
 			list[ns].ibv_dev = ibv_match[i];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = 0;
 			if (nl_rdma >= 0)
 				list[ns].ifindex = mlx5_nl_ifindex
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 0a6d7f1..2575732 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -275,6 +275,14 @@ struct mlx5_ibv_shared {
 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
+	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	struct {
+		uint32_t dev_gen; /* Generation number to flush local caches. */
+		rte_rwlock_t rwlock; /* MR Lock. */
+		struct mlx5_mr_btree cache; /* Global MR cache table. */
+		struct mlx5_mr_list mr_list; /* Registered MR list. */
+		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
+	} mr;
 	/* Shared DV/DR flow data section. */
 	pthread_mutex_t dv_mutex; /* DV context mutex. */
 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
@@ -347,13 +355,6 @@ struct mlx5_priv {
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
 	/* Flow counters. */
-	struct {
-		uint32_t dev_gen; /* Generation number to flush local caches. */
-		rte_rwlock_t rwlock; /* MR Lock. */
-		struct mlx5_mr_btree cache; /* Global MR cache table. */
-		struct mlx5_mr_list mr_list; /* Registered MR list. */
-		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
-	} mr;
 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index e4957a4..a7a63b1 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -36,7 +36,7 @@ struct mr_update_mp_data {
 
 /**
  * Expand B-tree table to a given size. Can't be called with holding
- * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
+ * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
  *
  * @param bt
  *   Pointer to B-tree structure.
@@ -350,7 +350,7 @@ struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -382,7 +382,7 @@ struct mr_update_mp_data {
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -420,6 +420,7 @@ struct mr_update_mp_data {
 	      uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -430,10 +431,10 @@ struct mr_update_mp_data {
 	 * has to be searched by traversing the original MR list instead, which
 	 * is very slow path. Otherwise, the global cache is all inclusive.
 	 */
-	if (!unlikely(priv->mr.cache.overflow)) {
-		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	if (!unlikely(sh->mr.cache.overflow)) {
+		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 		if (lkey != UINT32_MAX)
-			*entry = (*priv->mr.cache.table)[idx];
+			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
 		mr = mr_lookup_dev_list(dev, entry, addr);
@@ -468,13 +469,12 @@ struct mr_update_mp_data {
 /**
  * Release resources of detached MR having no online entry.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
+mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 
@@ -484,11 +484,11 @@ struct mr_update_mp_data {
 	 * MR can't be freed with holding the lock because rte_free() could call
 	 * memory free callback function. This will be a deadlock situation.
 	 */
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach the whole free list and release it after unlocking. */
-	free_list = priv->mr.mr_free_list;
-	LIST_INIT(&priv->mr.mr_free_list);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	free_list = sh->mr.mr_free_list;
+	LIST_INIT(&sh->mr.mr_free_list);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Release resources. */
 	mr_next = LIST_FIRST(&free_list);
 	while (mr_next != NULL) {
@@ -548,12 +548,12 @@ struct mr_update_mp_data {
 		      dev->data->port_id, (void *)addr);
 		return UINT32_MAX;
 	}
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
 	DEBUG("port %u MR CREATED by primary process for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
 	      dev->data->port_id, (void *)addr,
@@ -582,6 +582,7 @@ struct mr_update_mp_data {
 		       uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_dev_config *config = &priv->config;
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	const struct rte_memseg_list *msl;
@@ -602,12 +603,12 @@ struct mr_update_mp_data {
 		dev->data->port_id, (void *)addr);
 	/*
 	 * Release detached MRs if any. This can't be called with holding either
-	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
+	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
 	 * been detached by the memory free event but it couldn't be released
 	 * inside the callback due to deadlock. As a result, releasing resources
 	 * is quite opportunistic.
 	 */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 	/*
 	 * If enabled, find out a contiguous virtual address chunk in use, to
 	 * which the given address belongs, in order to register maximum range.
@@ -710,7 +711,7 @@ struct mr_update_mp_data {
 		goto alloc_resources;
 	}
 	assert(data.msl == data_re.msl);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/*
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
@@ -721,10 +722,10 @@ struct mr_update_mp_data {
 		 * low-on-memory. Then, this entry will have to be searched
 		 * here again.
 		 */
-		mr_btree_insert(&priv->mr.cache, entry);
+		mr_btree_insert(&sh->mr.cache, entry);
 		DEBUG("port %u found MR for %p on final lookup, abort",
 		      dev->data->port_id, (void *)addr);
-		rte_rwlock_write_unlock(&priv->mr.rwlock);
+		rte_rwlock_write_unlock(&sh->mr.rwlock);
 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 		/*
 		 * Must be unlocked before calling rte_free() because
@@ -769,7 +770,7 @@ struct mr_update_mp_data {
 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
 	 * through mlx5_alloc_verbs_buf().
 	 */
-	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
+	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
 				       IBV_ACCESS_LOCAL_WRITE);
 	if (mr->ibv_mr == NULL) {
 		DEBUG("port %u fail to create a verbs MR for address (%p)",
@@ -779,7 +780,7 @@ struct mr_update_mp_data {
 	}
 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
 	assert(mr->ibv_mr->length == len);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	DEBUG("port %u MR CREATED (%p) for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
@@ -792,11 +793,11 @@ struct mr_update_mp_data {
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 	return entry->lkey;
 err_mrlock:
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 err_memlock:
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 err_nolock:
@@ -854,14 +855,15 @@ struct mr_update_mp_data {
 mr_rebuild_dev_cache(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
 	/* Flush cache to rebuild. */
-	priv->mr.cache.len = 1;
-	priv->mr.cache.overflow = 0;
+	sh->mr.cache.len = 1;
+	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
 		if (mr_insert_dev_cache(dev, mr) < 0)
 			return;
 }
@@ -888,6 +890,7 @@ struct mr_update_mp_data {
 mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
@@ -901,7 +904,7 @@ struct mr_update_mp_data {
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
 	assert(len == RTE_ALIGN(len, msl->page_sz));
 	ms_n = len / msl->page_sz;
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Clear bits of freed memsegs from MR. */
 	for (i = 0; i < ms_n; ++i) {
 		const struct rte_memseg *ms;
@@ -928,7 +931,7 @@ struct mr_update_mp_data {
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
-			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 			DEBUG("port %u remove MR(%p) from list",
 			      dev->data->port_id, (void *)mr);
 		}
@@ -949,12 +952,12 @@ struct mr_update_mp_data {
 		 * generation below) will be guaranteed to be seen by other core
 		 * before the core sees the newly allocated memory.
 		 */
-		++priv->mr.dev_gen;
+		++sh->mr.dev_gen;
 		DEBUG("broadcasting local cache flush, gen=%d",
-		      priv->mr.dev_gen);
+		      sh->mr.dev_gen);
 		rte_smp_wmb();
 	}
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 }
 
 /**
@@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
 		   struct mlx5_mr_cache *entry, uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
 	uint16_t idx;
 	uint32_t lkey;
@@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
 	if (unlikely(bt->len == bt->size))
 		mr_btree_expand(bt, bt->size << 1);
 	/* Look up in the global cache. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
-	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
+	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 	if (lkey != UINT32_MAX) {
 		/* Found. */
-		*entry = (*priv->mr.cache.table)[idx];
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		*entry = (*sh->mr.cache.table)[idx];
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		/*
 		 * Update local cache. Even if it fails, return the found entry
 		 * to update top-half cache. Next time, this entry will be found
@@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
 		mr_btree_insert(bt, entry);
 		return lkey;
 	}
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	/* First time to see the address? Create a new MR. */
 	lkey = mlx5_mr_create(dev, entry, addr);
 	/*
@@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
 	struct mr_update_mp_data *data = opaque;
 	struct rte_eth_dev *dev = data->dev;
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
 	struct mlx5_mr *mr = NULL;
 	uintptr_t addr = (uintptr_t)memhdr->addr;
@@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
 
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	lkey = mr_lookup_dev(dev, &entry, addr);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
@@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
 		data->ret = -1;
 		return;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
 }
@@ -1339,6 +1344,7 @@ struct mr_update_mp_data {
 	struct rte_eth_dev *dev;
 	struct mlx5_mr *mr;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 
 	dev = pci_dev_to_eth_dev(pdev);
 	if (!dev) {
@@ -1355,11 +1361,12 @@ struct mr_update_mp_data {
 		rte_errno = EINVAL;
 		return -1;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	sh = priv->sh;
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1384,6 +1391,7 @@ struct mr_update_mp_data {
 {
 	struct rte_eth_dev *dev;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_mr *mr;
 	struct mlx5_mr_cache entry;
 
@@ -1395,10 +1403,11 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	priv = dev->data->dev_private;
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	sh = priv->sh;
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
 	if (!mr) {
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
 				 "to PCI device %p", (uintptr_t)addr,
 				 (void *)pdev);
@@ -1406,7 +1415,7 @@ struct mr_update_mp_data {
 		return -1;
 	}
 	LIST_REMOVE(mr, mr);
-	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
 	mr_rebuild_dev_cache(dev);
@@ -1419,11 +1428,10 @@ struct mr_update_mp_data {
 	 * generation below) will be guaranteed to be seen by other core
 	 * before the core sees the newly allocated memory.
 	 */
-	++priv->mr.dev_gen;
-	DEBUG("broadcasting local cache flush, gen=%d",
-			priv->mr.dev_gen);
+	++sh->mr.dev_gen;
+	DEBUG("broadcasting local cache flush, gen=%d", sh->mr.dev_gen);
 	rte_smp_wmb();
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1544,25 +1552,24 @@ struct mr_update_mp_data {
 /**
  * Dump all the created MRs and the global cache entries.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
+mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
 {
 #ifndef NDEBUG
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 	int mr_n = 0;
 	int chunk_n = 0;
 
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
-		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
-		      dev->data->port_id, mr_n++,
+		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
+		      sh->ibdev_name, mr_n++,
 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
 		      mr->ms_n, mr->ms_bmp_n);
 		if (mr->ms_n == 0)
@@ -1577,45 +1584,40 @@ struct mr_update_mp_data {
 			      chunk_n++, ret.start, ret.end);
 		}
 	}
-	DEBUG("port %u dumping global cache", dev->data->port_id);
-	mlx5_mr_btree_dump(&priv->mr.cache);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	DEBUG("device %s dumping global cache", sh->ibdev_name);
+	mlx5_mr_btree_dump(&sh->mr.cache);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 #endif
 }
 
 /**
- * Release all the created MRs and resources. Remove device from memory callback
+ * Release all the created MRs and resources for shared device context.
  * list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_release(struct rte_eth_dev *dev)
+mlx5_mr_release(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 
-	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
-		mlx5_mr_dump_dev(dev);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+		mlx5_mr_dump_dev(sh);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach from MR list and move to free list. */
-	mr_next = LIST_FIRST(&priv->mr.mr_list);
+	mr_next = LIST_FIRST(&sh->mr.mr_list);
 	while (mr_next != NULL) {
 		struct mlx5_mr *mr = mr_next;
 
 		mr_next = LIST_NEXT(mr, mr);
 		LIST_REMOVE(mr, mr);
-		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	}
-	LIST_INIT(&priv->mr.mr_list);
+	LIST_INIT(&sh->mr.mr_list);
 	/* Free global cache. */
-	mlx5_mr_btree_free(&priv->mr.cache);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	mlx5_mr_btree_free(&sh->mr.cache);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Free all remaining MRs. */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 }
diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
index 786f6a3..89e89b7 100644
--- a/drivers/net/mlx5/mlx5_mr.h
+++ b/drivers/net/mlx5/mlx5_mr.h
@@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
 } __rte_packed;
 
+struct mlx5_ibv_shared;
 extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
 extern rte_rwlock_t mlx5_mem_event_rwlock;
 
@@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 			  size_t len, void *arg);
 int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
 		      struct rte_mempool *mp);
-void mlx5_mr_release(struct rte_eth_dev *dev);
+void mlx5_mr_release(struct mlx5_ibv_shared *sh);
 
 /* Debug purpose functions. */
 void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
-void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
 
 /**
  * Look up LKey from given lookup table by linear search. Firstly look up the
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index dbe074f..4d55fd4 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
 		goto error;
 	}
 	/* Save pointer of global generation number to check memory event. */
-	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
+	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
 	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl->txq.offloads = conf->offloads |
 			     dev->data->dev_conf.txmode.offloads;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [PATCH v3 2/2] net/mlx5: update memory event callback for shared context
  2019-04-27  4:32   ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Viacheslav Ovsiienko
  2019-04-27  4:32     ` Viacheslav Ovsiienko
  2019-04-27  4:32     ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: share Memory Regions for multiport device Viacheslav Ovsiienko
@ 2019-04-27  4:32     ` Viacheslav Ovsiienko
  2019-04-27  4:32       ` Viacheslav Ovsiienko
  2019-04-29  8:54     ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Shahaf Shuler
  3 siblings, 1 reply; 20+ messages in thread
From: Viacheslav Ovsiienko @ 2019-04-27  4:32 UTC (permalink / raw)
  To: shahafs; +Cc: dev, yskoh

The Mellanox mlx5 PMD maintains a list of devices to process memory
free events and keep the Memory Regions consistent with the actual
memory state. Because this list holds device objects, and several
devices may share the same Infiniband context, the callback routine
may be invoked multiple times with the same parameter, which is not
optimal. This patch changes the list to hold the shared device
contexts instead of the device objects, so each shared context is
included in the list only once.
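
Below is a minimal, self-contained sketch of this list change, for
illustration only; the type and helper names are simplified stand-ins,
not the exact driver code:

#include <stddef.h>
#include <sys/queue.h>

/* Simplified stand-in for the shared Infiniband device context. */
struct ibv_shared_ctx {
	LIST_ENTRY(ibv_shared_ctx) mem_event_cb; /* one list node per context */
	/* ... shared MR cache, rwlock, generation number ... */
};

/* After the change the callback list links shared contexts, not ports. */
LIST_HEAD(shared_ctx_list, ibv_shared_ctx);
static struct shared_ctx_list mem_event_cb_list =
	LIST_HEAD_INITIALIZER(mem_event_cb_list);

/* Placeholder for the per-context MR invalidation work. */
static void
mr_free_cb(struct ibv_shared_ctx *sh, const void *addr, size_t len)
{
	(void)sh; (void)addr; (void)len;
}

/* Memory free event: each shared context is visited exactly once,
 * no matter how many ports were spawned on top of it. */
static void
on_mem_free(const void *addr, size_t len)
{
	struct ibv_shared_ctx *sh;

	LIST_FOREACH(sh, &mem_event_cb_list, mem_event_cb)
		mr_free_cb(sh, addr, len);
}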

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c    |  6 +--
 drivers/net/mlx5/mlx5.h    |  6 +--
 drivers/net/mlx5/mlx5_mr.c | 91 +++++++++++++++++++++-------------------------
 3 files changed, 48 insertions(+), 55 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index b563e0f..de85e85 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -673,9 +673,9 @@ struct mlx5_dev_spawn_data {
 	mlx5_mprq_free_mp(dev);
 	/* Remove from memory callback device list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
+	LIST_REMOVE(priv->sh, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
 		rte_free(priv->rss_conf.rss_key);
@@ -1574,7 +1574,7 @@ struct mlx5_dev_spawn_data {
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
-			 priv, mem_event_cb);
+			 sh, mem_event_cb);
 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	return eth_dev;
 error:
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 2575732..82fcb29 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -99,7 +99,7 @@ struct mlx5_switch_info {
 	uint64_t switch_id; /**< Switch identifier. */
 };
 
-LIST_HEAD(mlx5_dev_list, mlx5_priv);
+LIST_HEAD(mlx5_dev_list, mlx5_ibv_shared);
 
 /* Shared data between primary and secondary processes. */
 struct mlx5_shared_data {
@@ -276,6 +276,8 @@ struct mlx5_ibv_shared {
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
 	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	LIST_ENTRY(mlx5_ibv_shared) mem_event_cb;
+	/**< Called by memory event callback. */
 	struct {
 		uint32_t dev_gen; /* Generation number to flush local caches. */
 		rte_rwlock_t rwlock; /* MR Lock. */
@@ -322,8 +324,6 @@ struct mlx5_proc_priv {
 	((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private)
 
 struct mlx5_priv {
-	LIST_ENTRY(mlx5_priv) mem_event_cb;
-	/**< Called by memory event callback. */
 	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
 	struct mlx5_ibv_shared *sh; /* Shared IB device context. */
 	uint32_t ibv_port; /* IB device port number. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index a7a63b1..66e8e87 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -327,7 +327,7 @@ struct mr_update_mp_data {
  * mlx5_mr_create() on miss.
  *
  * @param dev
- *   Pointer to Ethernet device.
+ *   Pointer to Ethernet device shared context.
  * @param mr
  *   Pointer to MR to insert.
  *
@@ -335,13 +335,12 @@ struct mr_update_mp_data {
  *   0 on success, -1 on failure.
  */
 static int
-mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx5_mr *mr)
+mr_insert_dev_cache(struct mlx5_ibv_shared *sh, struct mlx5_mr *mr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	unsigned int n;
 
-	DRV_LOG(DEBUG, "port %u inserting MR(%p) to global cache",
-		dev->data->port_id, (void *)mr);
+	DRV_LOG(DEBUG, "device %s inserting MR(%p) to global cache",
+		sh->ibdev_name, (void *)mr);
 	for (n = 0; n < mr->ms_bmp_n; ) {
 		struct mlx5_mr_cache entry;
 
@@ -350,7 +349,7 @@ struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -364,8 +363,8 @@ struct mr_update_mp_data {
 /**
  * Look up address in the original global MR list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  * @param[out] entry
  *   Pointer to returning MR cache entry. If no match, this will not be updated.
  * @param addr
@@ -375,14 +374,13 @@ struct mr_update_mp_data {
  *   Found MR on match, NULL otherwise.
  */
 static struct mlx5_mr *
-mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry,
+mr_lookup_dev_list(struct mlx5_ibv_shared *sh, struct mlx5_mr_cache *entry,
 		   uintptr_t addr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -406,7 +404,7 @@ struct mr_update_mp_data {
  * Look up address on device.
  *
  * @param dev
- *   Pointer to Ethernet device.
+ *   Pointer to Ethernet device shared context.
  * @param[out] entry
  *   Pointer to returning MR cache entry. If no match, this will not be updated.
  * @param addr
@@ -416,11 +414,9 @@ struct mr_update_mp_data {
  *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
  */
 static uint32_t
-mr_lookup_dev(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry,
+mr_lookup_dev(struct mlx5_ibv_shared *sh, struct mlx5_mr_cache *entry,
 	      uintptr_t addr)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -437,7 +433,7 @@ struct mr_update_mp_data {
 			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
-		mr = mr_lookup_dev_list(dev, entry, addr);
+		mr = mr_lookup_dev_list(sh, entry, addr);
 		if (mr != NULL)
 			lkey = entry->lkey;
 	}
@@ -550,7 +546,7 @@ struct mr_update_mp_data {
 	}
 	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
-	mr_lookup_dev(dev, entry, addr);
+	mr_lookup_dev(priv->sh, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
 	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
@@ -716,7 +712,7 @@ struct mr_update_mp_data {
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
 	 */
-	if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) {
+	if (mr_lookup_dev(sh, entry, addr) != UINT32_MAX) {
 		/*
 		 * Insert to the global cache table. It may fail due to
 		 * low-on-memory. Then, this entry will have to be searched
@@ -746,7 +742,7 @@ struct mr_update_mp_data {
 		memset(&ret, 0, sizeof(ret));
 		start = data_re.start + n * msl->page_sz;
 		/* Exclude memsegs already registered by other MRs. */
-		if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) {
+		if (mr_lookup_dev(sh, &ret, start) == UINT32_MAX) {
 			/*
 			 * Start from the first unregistered memseg in the
 			 * extended range.
@@ -788,9 +784,9 @@ struct mr_update_mp_data {
 	      data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey),
 	      mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	/* Fill in output data. */
-	mr_lookup_dev(dev, entry, addr);
+	mr_lookup_dev(sh, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
@@ -848,23 +844,21 @@ struct mr_update_mp_data {
 /**
  * Rebuild the global B-tree cache of device from the original MR list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mr_rebuild_dev_cache(struct rte_eth_dev *dev)
+mr_rebuild_dev_cache(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
-	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
+	DRV_LOG(DEBUG, "device %s rebuild dev cache[]", sh->ibdev_name);
 	/* Flush cache to rebuild. */
 	sh->mr.cache.len = 1;
 	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
 	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
-		if (mr_insert_dev_cache(dev, mr) < 0)
+		if (mr_insert_dev_cache(sh, mr) < 0)
 			return;
 }
 
@@ -879,26 +873,25 @@ struct mr_update_mp_data {
  * The global cache must be rebuilt if there's any change and this event has to
  * be propagated to dataplane threads to flush the local caches.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to the Ethernet device shared context.
  * @param addr
  *   Address of freed memory.
  * @param len
  *   Size of freed memory.
  */
 static void
-mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
+mlx5_mr_mem_event_free_cb(struct mlx5_ibv_shared *sh,
+			  const void *addr, size_t len)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
 	int i;
 	int rebuild = 0;
 
-	DEBUG("port %u free callback: addr=%p, len=%zu",
-	      dev->data->port_id, addr, len);
+	DEBUG("device %s free callback: addr=%p, len=%zu",
+	      sh->ibdev_name, addr, len);
 	msl = rte_mem_virt2memseg_list(addr);
 	/* addr and len must be page-aligned. */
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
@@ -915,7 +908,7 @@ struct mr_update_mp_data {
 
 		/* Find MR having this memseg. */
 		start = (uintptr_t)addr + i * msl->page_sz;
-		mr = mr_lookup_dev_list(dev, &entry, start);
+		mr = mr_lookup_dev_list(sh, &entry, start);
 		if (mr == NULL)
 			continue;
 		assert(mr->msl); /* Can't be external memory. */
@@ -926,14 +919,14 @@ struct mr_update_mp_data {
 		pos = ms_idx - mr->ms_base_idx;
 		assert(rte_bitmap_get(mr->ms_bmp, pos));
 		assert(pos < mr->ms_bmp_n);
-		DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p",
-		      dev->data->port_id, (void *)mr, pos, (void *)start);
+		DEBUG("device %s MR(%p): clear bitmap[%u] for addr %p",
+		      sh->ibdev_name, (void *)mr, pos, (void *)start);
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
 			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
-			DEBUG("port %u remove MR(%p) from list",
-			      dev->data->port_id, (void *)mr);
+			DEBUG("device %s remove MR(%p) from list",
+			      sh->ibdev_name, (void *)mr);
 		}
 		/*
 		 * MR is fragmented or will be freed. the global cache must be
@@ -942,7 +935,7 @@ struct mr_update_mp_data {
 		rebuild = 1;
 	}
 	if (rebuild) {
-		mr_rebuild_dev_cache(dev);
+		mr_rebuild_dev_cache(sh);
 		/*
 		 * Flush local caches by propagating invalidation across cores.
 		 * rte_smp_wmb() is enough to synchronize this event. If one of
@@ -975,7 +968,7 @@ struct mr_update_mp_data {
 mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 		     size_t len, void *arg __rte_unused)
 {
-	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list;
 
 	/* Must be called from the primary process. */
@@ -984,8 +977,8 @@ struct mr_update_mp_data {
 	case RTE_MEM_EVENT_FREE:
 		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 		/* Iterate all the existing mlx5 devices. */
-		LIST_FOREACH(priv, dev_list, mem_event_cb)
-			mlx5_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
+		LIST_FOREACH(sh, dev_list, mem_event_cb)
+			mlx5_mr_mem_event_free_cb(sh, addr, len);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		break;
 	case RTE_MEM_EVENT_ALLOC:
@@ -1276,7 +1269,7 @@ struct mr_update_mp_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
 	rte_rwlock_read_lock(&sh->mr.rwlock);
-	lkey = mr_lookup_dev(dev, &entry, addr);
+	lkey = mr_lookup_dev(sh, &entry, addr);
 	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
@@ -1294,7 +1287,7 @@ struct mr_update_mp_data {
 	rte_rwlock_write_lock(&sh->mr.rwlock);
 	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
@@ -1365,7 +1358,7 @@ struct mr_update_mp_data {
 	rte_rwlock_write_lock(&sh->mr.rwlock);
 	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
-	mr_insert_dev_cache(dev, mr);
+	mr_insert_dev_cache(sh, mr);
 	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
@@ -1405,7 +1398,7 @@ struct mr_update_mp_data {
 	priv = dev->data->dev_private;
 	sh = priv->sh;
 	rte_rwlock_read_lock(&sh->mr.rwlock);
-	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
+	mr = mr_lookup_dev_list(sh, &entry, (uintptr_t)addr);
 	if (!mr) {
 		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
@@ -1418,7 +1411,7 @@ struct mr_update_mp_data {
 	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
-	mr_rebuild_dev_cache(dev);
+	mr_rebuild_dev_cache(sh);
 	/*
 	 * Flush local caches by propagating invalidation across cores.
 	 * rte_smp_wmb() is enough to synchronize this event. If one of
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 20+ messages in thread


* Re: [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices
  2019-04-27  4:32   ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Viacheslav Ovsiienko
                       ` (2 preceding siblings ...)
  2019-04-27  4:32     ` [dpdk-dev] [PATCH v3 2/2] net/mlx5: update memory event callback for shared context Viacheslav Ovsiienko
@ 2019-04-29  8:54     ` Shahaf Shuler
  2019-04-29  8:54       ` Shahaf Shuler
  3 siblings, 1 reply; 20+ messages in thread
From: Shahaf Shuler @ 2019-04-29  8:54 UTC (permalink / raw)
  To: Slava Ovsiienko; +Cc: dev, Yongseok Koh

Saturday, April 27, 2019 7:33 AM, Viacheslav Ovsiienko:
> Subject: [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport
> devices
> 
> The patches [1] and [2] are both related to Memory Regions sharing and their
> applying order matters, this series just combines ones.
> 
> The multiport Infiniband device support was introduced [3].
> All active ports, belonging to the same Infiniband device use the single
> shared Infiniband context of that device and share the resources:
>   - QPs are created within shared context
>   - Verbs flows are also created with specifying port index
>   - DV/DR resources
>   - Protection Domain
>   - Event Handlers
> 
> This patchset adds support for Memory Regions sharing between ports,
> created on the base of multiport Infiniband device.
> The datapath of mlx5 uses the layered cache subsystem for
> allocating/releasing Memory Regions, only the lowest layer L3 is subject to
> share due to performance issues.
> 
> [1] http://patches.dpdk.org/patch/53040/
> [2] http://patches.dpdk.org/patch/53041/
> [3] http://patches.dpdk.org/cover/51800/
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

Series applied to next-net-mlx, thanks. 
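
A rough sketch of the layered lookup mentioned in the cover letter
above, with illustrative names only (the real driver code differs);
the point is that only the bottom, per-context L3 layer takes a lock
and is shared between ports:

#include <stdint.h>

#define LKEY_MISS UINT32_MAX

struct mr_btree { int dummy; /* ... */ };

/* L1: per-queue linear cache, lock-free, checked first on the datapath. */
static uint32_t l1_linear_lookup(uintptr_t addr)
{ (void)addr; return LKEY_MISS; }

/* L2: per-queue B-tree cache, still private to the datapath thread. */
static uint32_t l2_btree_lookup(struct mr_btree *bt, uintptr_t addr)
{ (void)bt; (void)addr; return LKEY_MISS; }

/* L3: global cache; with this series it lives in the shared IB context,
 * is protected by its rwlock and reused by all ports of the device.
 * On a miss a new Memory Region would be created and registered. */
static uint32_t l3_shared_lookup_or_create(uintptr_t addr)
{ (void)addr; return 0x1234; }

static uint32_t
addr_to_lkey(struct mr_btree *local_bt, uintptr_t addr)
{
	uint32_t lkey = l1_linear_lookup(addr);

	if (lkey != LKEY_MISS)
		return lkey;
	lkey = l2_btree_lookup(local_bt, addr);
	if (lkey != LKEY_MISS)
		return lkey;
	return l3_shared_lookup_or_create(addr);
}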

^ permalink raw reply	[flat|nested] 20+ messages in thread


end of thread, other threads:[~2019-04-29  8:54 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-12 15:45 [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device Viacheslav Ovsiienko
2019-04-12 15:45 ` Viacheslav Ovsiienko
2019-04-12 19:22 ` Yongseok Koh
2019-04-12 19:22   ` Yongseok Koh
2019-04-15  6:42   ` Slava Ovsiienko
2019-04-15  6:42     ` Slava Ovsiienko
2019-04-24  8:44 ` [dpdk-dev] [PATCH v2] " Viacheslav Ovsiienko
2019-04-24  8:44   ` Viacheslav Ovsiienko
2019-04-24  8:52   ` Yongseok Koh
2019-04-24  8:52     ` Yongseok Koh
2019-04-27  4:32   ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Viacheslav Ovsiienko
2019-04-27  4:32     ` Viacheslav Ovsiienko
2019-04-27  4:32     ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: share Memory Regions for multiport device Viacheslav Ovsiienko
2019-04-27  4:32       ` Viacheslav Ovsiienko
2019-04-27  4:32     ` [dpdk-dev] [PATCH v3 2/2] net/mlx5: update memory event callback for shared context Viacheslav Ovsiienko
2019-04-27  4:32       ` Viacheslav Ovsiienko
2019-04-29  8:54     ` [dpdk-dev] [PATCH v3 0/2] net/mlx5: share Memory Regions for multiport devices Shahaf Shuler
2019-04-29  8:54       ` Shahaf Shuler
2019-04-24  8:48 ` [dpdk-dev] [PATCH] net/mlx5: update memory event callback for shared context Viacheslav Ovsiienko
2019-04-24  8:48   ` Viacheslav Ovsiienko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).