From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129]) by dpdk.org (Postfix) with ESMTP id ECA821B1FE for ; Fri, 12 Apr 2019 17:45:43 +0200 (CEST) Received: from Internal Mail-Server by MTLPINE1 (envelope-from viacheslavo@mellanox.com) with ESMTPS (AES256-SHA encrypted); 12 Apr 2019 18:45:43 +0300 Received: from pegasus12.mtr.labs.mlnx. (pegasus12.mtr.labs.mlnx [10.210.17.40]) by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id x3CFjhMd011645; Fri, 12 Apr 2019 18:45:43 +0300 From: Viacheslav Ovsiienko To: dev@dpdk.org Cc: shahafs@mellanox.com Date: Fri, 12 Apr 2019 15:45:40 +0000 Message-Id: <1555083940-24539-1-git-send-email-viacheslavo@mellanox.com> X-Mailer: git-send-email 1.8.3.1 Subject: [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 12 Apr 2019 15:45:44 -0000 The multiport Infiniband device support was introduced [1]. All active ports, belonging to the same Infiniband device use the signle shared Infiniband context of that device and share the resources: - QPs are created within shared context - Verbs flows are also created with specifying port index - DV/DR resources - Protection Domain - Event Handlers This patchset adds support for Memory Regions sharing between portes, created on the base of multiport Infiniban device. The datapath of mlx5 uses the layered cache subsystem for allocating/releasing Memory Regions, only the lowest layer L3 is subject to share due to performance issues. [1] http://patches.dpdk.org/cover/51800/ Signed-off-by: Viacheslav Ovsiienko --- drivers/net/mlx5/mlx5.c | 40 +++++++---- drivers/net/mlx5/mlx5.h | 15 ++-- drivers/net/mlx5/mlx5_mr.c | 163 ++++++++++++++++++++++---------------------- drivers/net/mlx5/mlx5_mr.h | 5 +- drivers/net/mlx5/mlx5_txq.c | 2 +- 5 files changed, 121 insertions(+), 104 deletions(-) diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 9ff50df..7ff5c8b 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -144,6 +144,7 @@ struct mlx5_dev_spawn_data { struct mlx5_switch_info info; /**< Switch information. */ struct ibv_device *ibv_dev; /**< Associated IB device. */ struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */ + struct rte_pci_device *pci_dev; /**< Backend PCI device. */ }; static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER(); @@ -222,6 +223,7 @@ struct mlx5_dev_spawn_data { sizeof(sh->ibdev_name)); strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path, sizeof(sh->ibdev_path)); + sh->pci_dev = spawn->pci_dev; pthread_mutex_init(&sh->intr_mutex, NULL); /* * Setting port_id to max unallowed value means @@ -236,6 +238,22 @@ struct mlx5_dev_spawn_data { err = ENOMEM; goto error; } + /* + * Once the device is added to the list of memory event + * callback, its global MR cache table cannot be expanded + * on the fly because of deadlock. If it overflows, lookup + * should be done by searching MR list linearly, which is slow. + * + * At this point the device is not added to the memory + * event list yet, context is just being created. + */ + err = mlx5_mr_btree_init(&sh->mr.cache, + MLX5_MR_BTREE_CACHE_N * 2, + sh->pci_dev->device.numa_node); + if (err) { + err = rte_errno; + goto error; + } LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next); exit: pthread_mutex_unlock(&mlx5_ibv_list_mutex); @@ -283,6 +301,8 @@ struct mlx5_dev_spawn_data { assert(rte_eal_process_type() == RTE_PROC_PRIMARY); if (--sh->refcnt) goto exit; + /* Release created Memory Regions. */ + mlx5_mr_release(sh); LIST_REMOVE(sh, next); /* * Ensure there is no async event handler installed. @@ -615,7 +635,10 @@ struct mlx5_dev_spawn_data { } mlx5_proc_priv_uninit(dev); mlx5_mprq_free_mp(dev); - mlx5_mr_release(dev); + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_REMOVE(priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); assert(priv->sh); mlx5_free_shared_dr(priv); if (priv->rss_conf.rss_key != NULL) @@ -1493,19 +1516,6 @@ struct mlx5_dev_spawn_data { goto error; } priv->config.flow_prio = err; - /* - * Once the device is added to the list of memory event - * callback, its global MR cache table cannot be expanded - * on the fly because of deadlock. If it overflows, lookup - * should be done by searching MR list linearly, which is slow. - */ - err = mlx5_mr_btree_init(&priv->mr.cache, - MLX5_MR_BTREE_CACHE_N * 2, - eth_dev->device->numa_node); - if (err) { - err = rte_errno; - goto error; - } /* Add device to memory callback list. */ rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, @@ -1702,6 +1712,7 @@ struct mlx5_dev_spawn_data { list[ns].ibv_port = i; list[ns].ibv_dev = ibv_match[0]; list[ns].eth_dev = NULL; + list[ns].pci_dev = pci_dev; list[ns].ifindex = mlx5_nl_ifindex (nl_rdma, list[ns].ibv_dev->name, i); if (!list[ns].ifindex) { @@ -1768,6 +1779,7 @@ struct mlx5_dev_spawn_data { list[ns].ibv_port = 1; list[ns].ibv_dev = ibv_match[i]; list[ns].eth_dev = NULL; + list[ns].pci_dev = pci_dev; list[ns].ifindex = 0; if (nl_rdma >= 0) list[ns].ifindex = mlx5_nl_ifindex diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 14c7f3c..8eb1019 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -257,6 +257,14 @@ struct mlx5_ibv_shared { char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */ char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */ struct ibv_device_attr_ex device_attr; /* Device properties. */ + struct rte_pci_device *pci_dev; /* Backend PCI device. */ + struct { /* Global device shared L3 MR cache. */ + uint32_t dev_gen; /* Generation number to flush local caches. */ + rte_rwlock_t rwlock; /* MR Lock. */ + struct mlx5_mr_btree cache; /* Global MR cache table. */ + struct mlx5_mr_list mr_list; /* Registered MR list. */ + struct mlx5_mr_list mr_free_list; /* Freed MR list. */ + } mr; /* Shared DV/DR flow data section. */ pthread_mutex_t dv_mutex; /* DV context mutex. */ uint32_t dv_refcnt; /* DV/DR data reference counter. */ @@ -323,13 +331,6 @@ struct mlx5_priv { struct mlx5_flows ctrl_flows; /* Control flow rules. */ LIST_HEAD(counters, mlx5_flow_counter) flow_counters; /* Flow counters. */ - struct { - uint32_t dev_gen; /* Generation number to flush local caches. */ - rte_rwlock_t rwlock; /* MR Lock. */ - struct mlx5_mr_btree cache; /* Global MR cache table. */ - struct mlx5_mr_list mr_list; /* Registered MR list. */ - struct mlx5_mr_list mr_free_list; /* Freed MR list. */ - } mr; LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */ LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */ LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */ diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index a3732d4..5140cc2 100644 --- a/drivers/net/mlx5/mlx5_mr.c +++ b/drivers/net/mlx5/mlx5_mr.c @@ -36,7 +36,7 @@ struct mr_update_mp_data { /** * Expand B-tree table to a given size. Can't be called with holding - * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc(). + * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc(). * * @param bt * Pointer to B-tree structure. @@ -350,7 +350,7 @@ struct mr_update_mp_data { n = mr_find_next_chunk(mr, &entry, n); if (!entry.end) break; - if (mr_btree_insert(&priv->mr.cache, &entry) < 0) { + if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) { /* * Overflowed, but the global table cannot be expanded * because of deadlock. @@ -382,7 +382,7 @@ struct mr_update_mp_data { struct mlx5_mr *mr; /* Iterate all the existing MRs. */ - LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) { unsigned int n; if (mr->ms_n == 0) @@ -420,6 +420,7 @@ struct mr_update_mp_data { uintptr_t addr) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; uint16_t idx; uint32_t lkey = UINT32_MAX; struct mlx5_mr *mr; @@ -430,10 +431,10 @@ struct mr_update_mp_data { * has to be searched by traversing the original MR list instead, which * is very slow path. Otherwise, the global cache is all inclusive. */ - if (!unlikely(priv->mr.cache.overflow)) { - lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (!unlikely(sh->mr.cache.overflow)) { + lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr); if (lkey != UINT32_MAX) - *entry = (*priv->mr.cache.table)[idx]; + *entry = (*sh->mr.cache.table)[idx]; } else { /* Falling back to the slowest path. */ mr = mr_lookup_dev_list(dev, entry, addr); @@ -468,13 +469,12 @@ struct mr_update_mp_data { /** * Release resources of detached MR having no online entry. * - * @param dev - * Pointer to Ethernet device. + * @param sh + * Pointer to Ethernet device shared context. */ static void -mlx5_mr_garbage_collect(struct rte_eth_dev *dev) +mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh) { - struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_mr *mr_next; struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); @@ -484,11 +484,11 @@ struct mr_update_mp_data { * MR can't be freed with holding the lock because rte_free() could call * memory free callback function. This will be a deadlock situation. */ - rte_rwlock_write_lock(&priv->mr.rwlock); + rte_rwlock_write_lock(&sh->mr.rwlock); /* Detach the whole free list and release it after unlocking. */ - free_list = priv->mr.mr_free_list; - LIST_INIT(&priv->mr.mr_free_list); - rte_rwlock_write_unlock(&priv->mr.rwlock); + free_list = sh->mr.mr_free_list; + LIST_INIT(&sh->mr.mr_free_list); + rte_rwlock_write_unlock(&sh->mr.rwlock); /* Release resources. */ mr_next = LIST_FIRST(&free_list); while (mr_next != NULL) { @@ -548,12 +548,12 @@ struct mr_update_mp_data { dev->data->port_id, (void *)addr); return UINT32_MAX; } - rte_rwlock_read_lock(&priv->mr.rwlock); + rte_rwlock_read_lock(&priv->sh->mr.rwlock); /* Fill in output data. */ mr_lookup_dev(dev, entry, addr); /* Lookup can't fail. */ assert(entry->lkey != UINT32_MAX); - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&priv->sh->mr.rwlock); DEBUG("port %u MR CREATED by primary process for %p:\n" " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x", dev->data->port_id, (void *)addr, @@ -582,6 +582,7 @@ struct mr_update_mp_data { uintptr_t addr) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; struct mlx5_dev_config *config = &priv->config; struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; const struct rte_memseg_list *msl; @@ -602,12 +603,12 @@ struct mr_update_mp_data { dev->data->port_id, (void *)addr); /* * Release detached MRs if any. This can't be called with holding either - * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have + * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have * been detached by the memory free event but it couldn't be released * inside the callback due to deadlock. As a result, releasing resources * is quite opportunistic. */ - mlx5_mr_garbage_collect(dev); + mlx5_mr_garbage_collect(sh); /* * If enabled, find out a contiguous virtual address chunk in use, to * which the given address belongs, in order to register maximum range. @@ -710,7 +711,7 @@ struct mr_update_mp_data { goto alloc_resources; } assert(data.msl == data_re.msl); - rte_rwlock_write_lock(&priv->mr.rwlock); + rte_rwlock_write_lock(&sh->mr.rwlock); /* * Check the address is really missing. If other thread already created * one or it is not found due to overflow, abort and return. @@ -721,10 +722,10 @@ struct mr_update_mp_data { * low-on-memory. Then, this entry will have to be searched * here again. */ - mr_btree_insert(&priv->mr.cache, entry); + mr_btree_insert(&sh->mr.cache, entry); DEBUG("port %u found MR for %p on final lookup, abort", dev->data->port_id, (void *)addr); - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); /* * Must be unlocked before calling rte_free() because @@ -769,7 +770,7 @@ struct mr_update_mp_data { * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket() * through mlx5_alloc_verbs_buf(). */ - mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len, + mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len, IBV_ACCESS_LOCAL_WRITE); if (mr->ibv_mr == NULL) { DEBUG("port %u fail to create a verbs MR for address (%p)", @@ -779,7 +780,7 @@ struct mr_update_mp_data { } assert((uintptr_t)mr->ibv_mr->addr == data.start); assert(mr->ibv_mr->length == len); - LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr); DEBUG("port %u MR CREATED (%p) for %p:\n" " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", @@ -792,11 +793,11 @@ struct mr_update_mp_data { mr_lookup_dev(dev, entry, addr); /* Lookup can't fail. */ assert(entry->lkey != UINT32_MAX); - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); return entry->lkey; err_mrlock: - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); err_memlock: rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); err_nolock: @@ -854,14 +855,15 @@ struct mr_update_mp_data { mr_rebuild_dev_cache(struct rte_eth_dev *dev) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; struct mlx5_mr *mr; DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id); /* Flush cache to rebuild. */ - priv->mr.cache.len = 1; - priv->mr.cache.overflow = 0; + sh->mr.cache.len = 1; + sh->mr.cache.overflow = 0; /* Iterate all the existing MRs. */ - LIST_FOREACH(mr, &priv->mr.mr_list, mr) + LIST_FOREACH(mr, &sh->mr.mr_list, mr) if (mr_insert_dev_cache(dev, mr) < 0) return; } @@ -888,6 +890,7 @@ struct mr_update_mp_data { mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; const struct rte_memseg_list *msl; struct mlx5_mr *mr; int ms_n; @@ -901,7 +904,7 @@ struct mr_update_mp_data { assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz)); assert(len == RTE_ALIGN(len, msl->page_sz)); ms_n = len / msl->page_sz; - rte_rwlock_write_lock(&priv->mr.rwlock); + rte_rwlock_write_lock(&sh->mr.rwlock); /* Clear bits of freed memsegs from MR. */ for (i = 0; i < ms_n; ++i) { const struct rte_memseg *ms; @@ -928,7 +931,7 @@ struct mr_update_mp_data { rte_bitmap_clear(mr->ms_bmp, pos); if (--mr->ms_n == 0) { LIST_REMOVE(mr, mr); - LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr); DEBUG("port %u remove MR(%p) from list", dev->data->port_id, (void *)mr); } @@ -949,12 +952,12 @@ struct mr_update_mp_data { * generation below) will be guaranteed to be seen by other core * before the core sees the newly allocated memory. */ - ++priv->mr.dev_gen; + ++sh->mr.dev_gen; DEBUG("broadcasting local cache flush, gen=%d", - priv->mr.dev_gen); + sh->mr.dev_gen); rte_smp_wmb(); } - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); } /** @@ -1013,6 +1016,7 @@ struct mr_update_mp_data { struct mlx5_mr_cache *entry, uintptr_t addr) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; uint16_t idx; uint32_t lkey; @@ -1021,12 +1025,12 @@ struct mr_update_mp_data { if (unlikely(bt->len == bt->size)) mr_btree_expand(bt, bt->size << 1); /* Look up in the global cache. */ - rte_rwlock_read_lock(&priv->mr.rwlock); - lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + rte_rwlock_read_lock(&sh->mr.rwlock); + lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr); if (lkey != UINT32_MAX) { /* Found. */ - *entry = (*priv->mr.cache.table)[idx]; - rte_rwlock_read_unlock(&priv->mr.rwlock); + *entry = (*sh->mr.cache.table)[idx]; + rte_rwlock_read_unlock(&sh->mr.rwlock); /* * Update local cache. Even if it fails, return the found entry * to update top-half cache. Next time, this entry will be found @@ -1035,7 +1039,7 @@ struct mr_update_mp_data { mr_btree_insert(bt, entry); return lkey; } - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&sh->mr.rwlock); /* First time to see the address? Create a new MR. */ lkey = mlx5_mr_create(dev, entry, addr); /* @@ -1261,6 +1265,7 @@ struct mr_update_mp_data { struct mr_update_mp_data *data = opaque; struct rte_eth_dev *dev = data->dev; struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl; struct mlx5_mr *mr = NULL; uintptr_t addr = (uintptr_t)memhdr->addr; @@ -1270,9 +1275,9 @@ struct mr_update_mp_data { assert(rte_eal_process_type() == RTE_PROC_PRIMARY); /* If already registered, it should return. */ - rte_rwlock_read_lock(&priv->mr.rwlock); + rte_rwlock_read_lock(&sh->mr.rwlock); lkey = mr_lookup_dev(dev, &entry, addr); - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&sh->mr.rwlock); if (lkey != UINT32_MAX) return; DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)", @@ -1286,11 +1291,11 @@ struct mr_update_mp_data { data->ret = -1; return; } - rte_rwlock_write_lock(&priv->mr.rwlock); - LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + rte_rwlock_write_lock(&sh->mr.rwlock); + LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr); /* Insert to the global cache table. */ mr_insert_dev_cache(dev, mr); - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); /* Insert to the local cache table */ mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr); } @@ -1345,6 +1350,7 @@ struct mr_update_mp_data { struct rte_eth_dev *dev; struct mlx5_mr *mr; struct mlx5_priv *priv; + struct mlx5_ibv_shared *sh; dev = pci_dev_to_eth_dev(pdev); if (!dev) { @@ -1361,11 +1367,12 @@ struct mr_update_mp_data { rte_errno = EINVAL; return -1; } - rte_rwlock_write_lock(&priv->mr.rwlock); - LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + sh = priv->sh; + rte_rwlock_write_lock(&sh->mr.rwlock); + LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr); /* Insert to the global cache table. */ mr_insert_dev_cache(dev, mr); - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); return 0; } @@ -1390,6 +1397,7 @@ struct mr_update_mp_data { { struct rte_eth_dev *dev; struct mlx5_priv *priv; + struct mlx5_ibv_shared *sh; struct mlx5_mr *mr; struct mlx5_mr_cache entry; @@ -1401,10 +1409,11 @@ struct mr_update_mp_data { return -1; } priv = dev->data->dev_private; - rte_rwlock_read_lock(&priv->mr.rwlock); + sh = priv->sh; + rte_rwlock_read_lock(&sh->mr.rwlock); mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr); if (!mr) { - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&sh->mr.rwlock); DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered " "to PCI device %p", (uintptr_t)addr, (void *)pdev); @@ -1412,7 +1421,7 @@ struct mr_update_mp_data { return -1; } LIST_REMOVE(mr, mr); - LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr); DEBUG("port %u remove MR(%p) from list", dev->data->port_id, (void *)mr); mr_rebuild_dev_cache(dev); @@ -1425,11 +1434,11 @@ struct mr_update_mp_data { * generation below) will be guaranteed to be seen by other core * before the core sees the newly allocated memory. */ - ++priv->mr.dev_gen; + ++sh->mr.dev_gen; DEBUG("broadcasting local cache flush, gen=%d", - priv->mr.dev_gen); + sh->mr.dev_gen); rte_smp_wmb(); - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&sh->mr.rwlock); return 0; } @@ -1550,25 +1559,24 @@ struct mr_update_mp_data { /** * Dump all the created MRs and the global cache entries. * - * @param dev - * Pointer to Ethernet device. + * @param sh + * Pointer to Ethernet device shared context. */ void -mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused) +mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused) { #ifndef NDEBUG - struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_mr *mr; int mr_n = 0; int chunk_n = 0; - rte_rwlock_read_lock(&priv->mr.rwlock); + rte_rwlock_read_lock(&sh->mr.rwlock); /* Iterate all the existing MRs. */ - LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + LIST_FOREACH(mr, &sh->mr.mr_list, mr) { unsigned int n; - DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", - dev->data->port_id, mr_n++, + DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", + sh->ibdev_name, mr_n++, rte_cpu_to_be_32(mr->ibv_mr->lkey), mr->ms_n, mr->ms_bmp_n); if (mr->ms_n == 0) @@ -1583,45 +1591,40 @@ struct mr_update_mp_data { chunk_n++, ret.start, ret.end); } } - DEBUG("port %u dumping global cache", dev->data->port_id); - mlx5_mr_btree_dump(&priv->mr.cache); - rte_rwlock_read_unlock(&priv->mr.rwlock); + DEBUG("device %s dumping global cache", sh->ibdev_name); + mlx5_mr_btree_dump(&sh->mr.cache); + rte_rwlock_read_unlock(&sh->mr.rwlock); #endif } /** - * Release all the created MRs and resources. Remove device from memory callback + * Release all the created MRs and resources for shared device context. * list. * - * @param dev - * Pointer to Ethernet device. + * @param sh + * Pointer to Ethernet device shared context. */ void -mlx5_mr_release(struct rte_eth_dev *dev) +mlx5_mr_release(struct mlx5_ibv_shared *sh) { - struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_mr *mr_next; - /* Remove from memory callback device list. */ - rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); - LIST_REMOVE(priv, mem_event_cb); - rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG) - mlx5_mr_dump_dev(dev); - rte_rwlock_write_lock(&priv->mr.rwlock); + mlx5_mr_dump_dev(sh); + rte_rwlock_write_lock(&sh->mr.rwlock); /* Detach from MR list and move to free list. */ - mr_next = LIST_FIRST(&priv->mr.mr_list); + mr_next = LIST_FIRST(&sh->mr.mr_list); while (mr_next != NULL) { struct mlx5_mr *mr = mr_next; mr_next = LIST_NEXT(mr, mr); LIST_REMOVE(mr, mr); - LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr); } - LIST_INIT(&priv->mr.mr_list); + LIST_INIT(&sh->mr.mr_list); /* Free global cache. */ - mlx5_mr_btree_free(&priv->mr.cache); - rte_rwlock_write_unlock(&priv->mr.rwlock); + mlx5_mr_btree_free(&sh->mr.cache); + rte_rwlock_write_unlock(&sh->mr.rwlock); /* Free all remaining MRs. */ - mlx5_mr_garbage_collect(dev); + mlx5_mr_garbage_collect(sh); } diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h index 786f6a3..89e89b7 100644 --- a/drivers/net/mlx5/mlx5_mr.h +++ b/drivers/net/mlx5/mlx5_mr.h @@ -62,6 +62,7 @@ struct mlx5_mr_ctrl { struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */ } __rte_packed; +struct mlx5_ibv_shared; extern struct mlx5_dev_list mlx5_mem_event_cb_list; extern rte_rwlock_t mlx5_mem_event_rwlock; @@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, size_t len, void *arg); int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, struct rte_mempool *mp); -void mlx5_mr_release(struct rte_eth_dev *dev); +void mlx5_mr_release(struct mlx5_ibv_shared *sh); /* Debug purpose functions. */ void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt); -void mlx5_mr_dump_dev(struct rte_eth_dev *dev); +void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh); /** * Look up LKey from given lookup table by linear search. Firstly look up the diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index 9965b2b..ed82594 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -814,7 +814,7 @@ struct mlx5_txq_ctrl * goto error; } /* Save pointer of global generation number to check memory event. */ - tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen; + tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen; assert(desc > MLX5_TX_COMP_THRESH); tmpl->txq.offloads = conf->offloads | dev->data->dev_conf.txmode.offloads; -- 1.8.3.1 From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by dpdk.space (Postfix) with ESMTP id CB385A0096 for ; Fri, 12 Apr 2019 17:45:51 +0200 (CEST) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id 866EC1B227; Fri, 12 Apr 2019 17:45:45 +0200 (CEST) Received: from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129]) by dpdk.org (Postfix) with ESMTP id ECA821B1FE for ; Fri, 12 Apr 2019 17:45:43 +0200 (CEST) Received: from Internal Mail-Server by MTLPINE1 (envelope-from viacheslavo@mellanox.com) with ESMTPS (AES256-SHA encrypted); 12 Apr 2019 18:45:43 +0300 Received: from pegasus12.mtr.labs.mlnx. (pegasus12.mtr.labs.mlnx [10.210.17.40]) by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id x3CFjhMd011645; Fri, 12 Apr 2019 18:45:43 +0300 From: Viacheslav Ovsiienko To: dev@dpdk.org Cc: shahafs@mellanox.com Date: Fri, 12 Apr 2019 15:45:40 +0000 Message-Id: <1555083940-24539-1-git-send-email-viacheslavo@mellanox.com> X-Mailer: git-send-email 1.8.3.1 Subject: [dpdk-dev] [PATCH 1/1] net/mlx5: share Memory Regions for multiport device X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Content-Type: text/plain; charset="UTF-8" Message-ID: <20190412154540.LyDnlhiq07yKcl7hpDPxb5H-Ml1WjIg5rSKYS3i4ij4@z> The multiport Infiniband device support was introduced [1]. All active ports, belonging to the same Infiniband device use the signle shared Infiniband context of that device and share the resources: - QPs are created within shared context - Verbs flows are also created with specifying port index - DV/DR resources - Protection Domain - Event Handlers This patchset adds support for Memory Regions sharing between portes, created on the base of multiport Infiniban device. The datapath of mlx5 uses the layered cache subsystem for allocating/releasing Memory Regions, only the lowest layer L3 is subject to share due to performance issues. [1] http://patches.dpdk.org/cover/51800/ Signed-off-by: Viacheslav Ovsiienko --- drivers/net/mlx5/mlx5.c | 40 +++++++---- drivers/net/mlx5/mlx5.h | 15 ++-- drivers/net/mlx5/mlx5_mr.c | 163 ++++++++++++++++++++++---------------------- drivers/net/mlx5/mlx5_mr.h | 5 +- drivers/net/mlx5/mlx5_txq.c | 2 +- 5 files changed, 121 insertions(+), 104 deletions(-) diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 9ff50df..7ff5c8b 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -144,6 +144,7 @@ struct mlx5_dev_spawn_data { struct mlx5_switch_info info; /**< Switch information. */ struct ibv_device *ibv_dev; /**< Associated IB device. */ struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */ + struct rte_pci_device *pci_dev; /**< Backend PCI device. */ }; static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER(); @@ -222,6 +223,7 @@ struct mlx5_dev_spawn_data { sizeof(sh->ibdev_name)); strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path, sizeof(sh->ibdev_path)); + sh->pci_dev = spawn->pci_dev; pthread_mutex_init(&sh->intr_mutex, NULL); /* * Setting port_id to max unallowed value means @@ -236,6 +238,22 @@ struct mlx5_dev_spawn_data { err = ENOMEM; goto error; } + /* + * Once the device is added to the list of memory event + * callback, its global MR cache table cannot be expanded + * on the fly because of deadlock. If it overflows, lookup + * should be done by searching MR list linearly, which is slow. + * + * At this point the device is not added to the memory + * event list yet, context is just being created. + */ + err = mlx5_mr_btree_init(&sh->mr.cache, + MLX5_MR_BTREE_CACHE_N * 2, + sh->pci_dev->device.numa_node); + if (err) { + err = rte_errno; + goto error; + } LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next); exit: pthread_mutex_unlock(&mlx5_ibv_list_mutex); @@ -283,6 +301,8 @@ struct mlx5_dev_spawn_data { assert(rte_eal_process_type() == RTE_PROC_PRIMARY); if (--sh->refcnt) goto exit; + /* Release created Memory Regions. */ + mlx5_mr_release(sh); LIST_REMOVE(sh, next); /* * Ensure there is no async event handler installed. @@ -615,7 +635,10 @@ struct mlx5_dev_spawn_data { } mlx5_proc_priv_uninit(dev); mlx5_mprq_free_mp(dev); - mlx5_mr_release(dev); + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_REMOVE(priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); assert(priv->sh); mlx5_free_shared_dr(priv); if (priv->rss_conf.rss_key != NULL) @@ -1493,19 +1516,6 @@ struct mlx5_dev_spawn_data { goto error; } priv->config.flow_prio = err; - /* - * Once the device is added to the list of memory event - * callback, its global MR cache table cannot be expanded - * on the fly because of deadlock. If it overflows, lookup - * should be done by searching MR list linearly, which is slow. - */ - err = mlx5_mr_btree_init(&priv->mr.cache, - MLX5_MR_BTREE_CACHE_N * 2, - eth_dev->device->numa_node); - if (err) { - err = rte_errno; - goto error; - } /* Add device to memory callback list. */ rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, @@ -1702,6 +1712,7 @@ struct mlx5_dev_spawn_data { list[ns].ibv_port = i; list[ns].ibv_dev = ibv_match[0]; list[ns].eth_dev = NULL; + list[ns].pci_dev = pci_dev; list[ns].ifindex = mlx5_nl_ifindex (nl_rdma, list[ns].ibv_dev->name, i); if (!list[ns].ifindex) { @@ -1768,6 +1779,7 @@ struct mlx5_dev_spawn_data { list[ns].ibv_port = 1; list[ns].ibv_dev = ibv_match[i]; list[ns].eth_dev = NULL; + list[ns].pci_dev = pci_dev; list[ns].ifindex = 0; if (nl_rdma >= 0) list[ns].ifindex = mlx5_nl_ifindex diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 14c7f3c..8eb1019 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -257,6 +257,14 @@ struct mlx5_ibv_shared { char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */ char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */ struct ibv_device_attr_ex device_attr; /* Device properties. */ + struct rte_pci_device *pci_dev; /* Backend PCI device. */ + struct { /* Global device shared L3 MR cache. */ + uint32_t dev_gen; /* Generation number to flush local caches. */ + rte_rwlock_t rwlock; /* MR Lock. */ + struct mlx5_mr_btree cache; /* Global MR cache table. */ + struct mlx5_mr_list mr_list; /* Registered MR list. */ + struct mlx5_mr_list mr_free_list; /* Freed MR list. */ + } mr; /* Shared DV/DR flow data section. */ pthread_mutex_t dv_mutex; /* DV context mutex. */ uint32_t dv_refcnt; /* DV/DR data reference counter. */ @@ -323,13 +331,6 @@ struct mlx5_priv { struct mlx5_flows ctrl_flows; /* Control flow rules. */ LIST_HEAD(counters, mlx5_flow_counter) flow_counters; /* Flow counters. */ - struct { - uint32_t dev_gen; /* Generation number to flush local caches. */ - rte_rwlock_t rwlock; /* MR Lock. */ - struct mlx5_mr_btree cache; /* Global MR cache table. */ - struct mlx5_mr_list mr_list; /* Registered MR list. */ - struct mlx5_mr_list mr_free_list; /* Freed MR list. */ - } mr; LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */ LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */ LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */ diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index a3732d4..5140cc2 100644 --- a/drivers/net/mlx5/mlx5_mr.c +++ b/drivers/net/mlx5/mlx5_mr.c @@ -36,7 +36,7 @@ struct mr_update_mp_data { /** * Expand B-tree table to a given size. Can't be called with holding - * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc(). + * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc(). * * @param bt * Pointer to B-tree structure. @@ -350,7 +350,7 @@ struct mr_update_mp_data { n = mr_find_next_chunk(mr, &entry, n); if (!entry.end) break; - if (mr_btree_insert(&priv->mr.cache, &entry) < 0) { + if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) { /* * Overflowed, but the global table cannot be expanded * because of deadlock. @@ -382,7 +382,7 @@ struct mr_update_mp_data { struct mlx5_mr *mr; /* Iterate all the existing MRs. */ - LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) { unsigned int n; if (mr->ms_n == 0) @@ -420,6 +420,7 @@ struct mr_update_mp_data { uintptr_t addr) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; uint16_t idx; uint32_t lkey = UINT32_MAX; struct mlx5_mr *mr; @@ -430,10 +431,10 @@ struct mr_update_mp_data { * has to be searched by traversing the original MR list instead, which * is very slow path. Otherwise, the global cache is all inclusive. */ - if (!unlikely(priv->mr.cache.overflow)) { - lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (!unlikely(sh->mr.cache.overflow)) { + lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr); if (lkey != UINT32_MAX) - *entry = (*priv->mr.cache.table)[idx]; + *entry = (*sh->mr.cache.table)[idx]; } else { /* Falling back to the slowest path. */ mr = mr_lookup_dev_list(dev, entry, addr); @@ -468,13 +469,12 @@ struct mr_update_mp_data { /** * Release resources of detached MR having no online entry. * - * @param dev - * Pointer to Ethernet device. + * @param sh + * Pointer to Ethernet device shared context. */ static void -mlx5_mr_garbage_collect(struct rte_eth_dev *dev) +mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh) { - struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_mr *mr_next; struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); @@ -484,11 +484,11 @@ struct mr_update_mp_data { * MR can't be freed with holding the lock because rte_free() could call * memory free callback function. This will be a deadlock situation. */ - rte_rwlock_write_lock(&priv->mr.rwlock); + rte_rwlock_write_lock(&sh->mr.rwlock); /* Detach the whole free list and release it after unlocking. */ - free_list = priv->mr.mr_free_list; - LIST_INIT(&priv->mr.mr_free_list); - rte_rwlock_write_unlock(&priv->mr.rwlock); + free_list = sh->mr.mr_free_list; + LIST_INIT(&sh->mr.mr_free_list); + rte_rwlock_write_unlock(&sh->mr.rwlock); /* Release resources. */ mr_next = LIST_FIRST(&free_list); while (mr_next != NULL) { @@ -548,12 +548,12 @@ struct mr_update_mp_data { dev->data->port_id, (void *)addr); return UINT32_MAX; } - rte_rwlock_read_lock(&priv->mr.rwlock); + rte_rwlock_read_lock(&priv->sh->mr.rwlock); /* Fill in output data. */ mr_lookup_dev(dev, entry, addr); /* Lookup can't fail. */ assert(entry->lkey != UINT32_MAX); - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&priv->sh->mr.rwlock); DEBUG("port %u MR CREATED by primary process for %p:\n" " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x", dev->data->port_id, (void *)addr, @@ -582,6 +582,7 @@ struct mr_update_mp_data { uintptr_t addr) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; struct mlx5_dev_config *config = &priv->config; struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; const struct rte_memseg_list *msl; @@ -602,12 +603,12 @@ struct mr_update_mp_data { dev->data->port_id, (void *)addr); /* * Release detached MRs if any. This can't be called with holding either - * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have + * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have * been detached by the memory free event but it couldn't be released * inside the callback due to deadlock. As a result, releasing resources * is quite opportunistic. */ - mlx5_mr_garbage_collect(dev); + mlx5_mr_garbage_collect(sh); /* * If enabled, find out a contiguous virtual address chunk in use, to * which the given address belongs, in order to register maximum range. @@ -710,7 +711,7 @@ struct mr_update_mp_data { goto alloc_resources; } assert(data.msl == data_re.msl); - rte_rwlock_write_lock(&priv->mr.rwlock); + rte_rwlock_write_lock(&sh->mr.rwlock); /* * Check the address is really missing. If other thread already created * one or it is not found due to overflow, abort and return. @@ -721,10 +722,10 @@ struct mr_update_mp_data { * low-on-memory. Then, this entry will have to be searched * here again. */ - mr_btree_insert(&priv->mr.cache, entry); + mr_btree_insert(&sh->mr.cache, entry); DEBUG("port %u found MR for %p on final lookup, abort", dev->data->port_id, (void *)addr); - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); /* * Must be unlocked before calling rte_free() because @@ -769,7 +770,7 @@ struct mr_update_mp_data { * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket() * through mlx5_alloc_verbs_buf(). */ - mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len, + mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len, IBV_ACCESS_LOCAL_WRITE); if (mr->ibv_mr == NULL) { DEBUG("port %u fail to create a verbs MR for address (%p)", @@ -779,7 +780,7 @@ struct mr_update_mp_data { } assert((uintptr_t)mr->ibv_mr->addr == data.start); assert(mr->ibv_mr->length == len); - LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr); DEBUG("port %u MR CREATED (%p) for %p:\n" " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", @@ -792,11 +793,11 @@ struct mr_update_mp_data { mr_lookup_dev(dev, entry, addr); /* Lookup can't fail. */ assert(entry->lkey != UINT32_MAX); - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); return entry->lkey; err_mrlock: - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); err_memlock: rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); err_nolock: @@ -854,14 +855,15 @@ struct mr_update_mp_data { mr_rebuild_dev_cache(struct rte_eth_dev *dev) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; struct mlx5_mr *mr; DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id); /* Flush cache to rebuild. */ - priv->mr.cache.len = 1; - priv->mr.cache.overflow = 0; + sh->mr.cache.len = 1; + sh->mr.cache.overflow = 0; /* Iterate all the existing MRs. */ - LIST_FOREACH(mr, &priv->mr.mr_list, mr) + LIST_FOREACH(mr, &sh->mr.mr_list, mr) if (mr_insert_dev_cache(dev, mr) < 0) return; } @@ -888,6 +890,7 @@ struct mr_update_mp_data { mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; const struct rte_memseg_list *msl; struct mlx5_mr *mr; int ms_n; @@ -901,7 +904,7 @@ struct mr_update_mp_data { assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz)); assert(len == RTE_ALIGN(len, msl->page_sz)); ms_n = len / msl->page_sz; - rte_rwlock_write_lock(&priv->mr.rwlock); + rte_rwlock_write_lock(&sh->mr.rwlock); /* Clear bits of freed memsegs from MR. */ for (i = 0; i < ms_n; ++i) { const struct rte_memseg *ms; @@ -928,7 +931,7 @@ struct mr_update_mp_data { rte_bitmap_clear(mr->ms_bmp, pos); if (--mr->ms_n == 0) { LIST_REMOVE(mr, mr); - LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr); DEBUG("port %u remove MR(%p) from list", dev->data->port_id, (void *)mr); } @@ -949,12 +952,12 @@ struct mr_update_mp_data { * generation below) will be guaranteed to be seen by other core * before the core sees the newly allocated memory. */ - ++priv->mr.dev_gen; + ++sh->mr.dev_gen; DEBUG("broadcasting local cache flush, gen=%d", - priv->mr.dev_gen); + sh->mr.dev_gen); rte_smp_wmb(); } - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); } /** @@ -1013,6 +1016,7 @@ struct mr_update_mp_data { struct mlx5_mr_cache *entry, uintptr_t addr) { struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; uint16_t idx; uint32_t lkey; @@ -1021,12 +1025,12 @@ struct mr_update_mp_data { if (unlikely(bt->len == bt->size)) mr_btree_expand(bt, bt->size << 1); /* Look up in the global cache. */ - rte_rwlock_read_lock(&priv->mr.rwlock); - lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + rte_rwlock_read_lock(&sh->mr.rwlock); + lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr); if (lkey != UINT32_MAX) { /* Found. */ - *entry = (*priv->mr.cache.table)[idx]; - rte_rwlock_read_unlock(&priv->mr.rwlock); + *entry = (*sh->mr.cache.table)[idx]; + rte_rwlock_read_unlock(&sh->mr.rwlock); /* * Update local cache. Even if it fails, return the found entry * to update top-half cache. Next time, this entry will be found @@ -1035,7 +1039,7 @@ struct mr_update_mp_data { mr_btree_insert(bt, entry); return lkey; } - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&sh->mr.rwlock); /* First time to see the address? Create a new MR. */ lkey = mlx5_mr_create(dev, entry, addr); /* @@ -1261,6 +1265,7 @@ struct mr_update_mp_data { struct mr_update_mp_data *data = opaque; struct rte_eth_dev *dev = data->dev; struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl; struct mlx5_mr *mr = NULL; uintptr_t addr = (uintptr_t)memhdr->addr; @@ -1270,9 +1275,9 @@ struct mr_update_mp_data { assert(rte_eal_process_type() == RTE_PROC_PRIMARY); /* If already registered, it should return. */ - rte_rwlock_read_lock(&priv->mr.rwlock); + rte_rwlock_read_lock(&sh->mr.rwlock); lkey = mr_lookup_dev(dev, &entry, addr); - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&sh->mr.rwlock); if (lkey != UINT32_MAX) return; DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)", @@ -1286,11 +1291,11 @@ struct mr_update_mp_data { data->ret = -1; return; } - rte_rwlock_write_lock(&priv->mr.rwlock); - LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + rte_rwlock_write_lock(&sh->mr.rwlock); + LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr); /* Insert to the global cache table. */ mr_insert_dev_cache(dev, mr); - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); /* Insert to the local cache table */ mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr); } @@ -1345,6 +1350,7 @@ struct mr_update_mp_data { struct rte_eth_dev *dev; struct mlx5_mr *mr; struct mlx5_priv *priv; + struct mlx5_ibv_shared *sh; dev = pci_dev_to_eth_dev(pdev); if (!dev) { @@ -1361,11 +1367,12 @@ struct mr_update_mp_data { rte_errno = EINVAL; return -1; } - rte_rwlock_write_lock(&priv->mr.rwlock); - LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + sh = priv->sh; + rte_rwlock_write_lock(&sh->mr.rwlock); + LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr); /* Insert to the global cache table. */ mr_insert_dev_cache(dev, mr); - rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_write_unlock(&sh->mr.rwlock); return 0; } @@ -1390,6 +1397,7 @@ struct mr_update_mp_data { { struct rte_eth_dev *dev; struct mlx5_priv *priv; + struct mlx5_ibv_shared *sh; struct mlx5_mr *mr; struct mlx5_mr_cache entry; @@ -1401,10 +1409,11 @@ struct mr_update_mp_data { return -1; } priv = dev->data->dev_private; - rte_rwlock_read_lock(&priv->mr.rwlock); + sh = priv->sh; + rte_rwlock_read_lock(&sh->mr.rwlock); mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr); if (!mr) { - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&sh->mr.rwlock); DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered " "to PCI device %p", (uintptr_t)addr, (void *)pdev); @@ -1412,7 +1421,7 @@ struct mr_update_mp_data { return -1; } LIST_REMOVE(mr, mr); - LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr); DEBUG("port %u remove MR(%p) from list", dev->data->port_id, (void *)mr); mr_rebuild_dev_cache(dev); @@ -1425,11 +1434,11 @@ struct mr_update_mp_data { * generation below) will be guaranteed to be seen by other core * before the core sees the newly allocated memory. */ - ++priv->mr.dev_gen; + ++sh->mr.dev_gen; DEBUG("broadcasting local cache flush, gen=%d", - priv->mr.dev_gen); + sh->mr.dev_gen); rte_smp_wmb(); - rte_rwlock_read_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&sh->mr.rwlock); return 0; } @@ -1550,25 +1559,24 @@ struct mr_update_mp_data { /** * Dump all the created MRs and the global cache entries. * - * @param dev - * Pointer to Ethernet device. + * @param sh + * Pointer to Ethernet device shared context. */ void -mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused) +mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused) { #ifndef NDEBUG - struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_mr *mr; int mr_n = 0; int chunk_n = 0; - rte_rwlock_read_lock(&priv->mr.rwlock); + rte_rwlock_read_lock(&sh->mr.rwlock); /* Iterate all the existing MRs. */ - LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + LIST_FOREACH(mr, &sh->mr.mr_list, mr) { unsigned int n; - DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", - dev->data->port_id, mr_n++, + DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", + sh->ibdev_name, mr_n++, rte_cpu_to_be_32(mr->ibv_mr->lkey), mr->ms_n, mr->ms_bmp_n); if (mr->ms_n == 0) @@ -1583,45 +1591,40 @@ struct mr_update_mp_data { chunk_n++, ret.start, ret.end); } } - DEBUG("port %u dumping global cache", dev->data->port_id); - mlx5_mr_btree_dump(&priv->mr.cache); - rte_rwlock_read_unlock(&priv->mr.rwlock); + DEBUG("device %s dumping global cache", sh->ibdev_name); + mlx5_mr_btree_dump(&sh->mr.cache); + rte_rwlock_read_unlock(&sh->mr.rwlock); #endif } /** - * Release all the created MRs and resources. Remove device from memory callback + * Release all the created MRs and resources for shared device context. * list. * - * @param dev - * Pointer to Ethernet device. + * @param sh + * Pointer to Ethernet device shared context. */ void -mlx5_mr_release(struct rte_eth_dev *dev) +mlx5_mr_release(struct mlx5_ibv_shared *sh) { - struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_mr *mr_next; - /* Remove from memory callback device list. */ - rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); - LIST_REMOVE(priv, mem_event_cb); - rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG) - mlx5_mr_dump_dev(dev); - rte_rwlock_write_lock(&priv->mr.rwlock); + mlx5_mr_dump_dev(sh); + rte_rwlock_write_lock(&sh->mr.rwlock); /* Detach from MR list and move to free list. */ - mr_next = LIST_FIRST(&priv->mr.mr_list); + mr_next = LIST_FIRST(&sh->mr.mr_list); while (mr_next != NULL) { struct mlx5_mr *mr = mr_next; mr_next = LIST_NEXT(mr, mr); LIST_REMOVE(mr, mr); - LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr); } - LIST_INIT(&priv->mr.mr_list); + LIST_INIT(&sh->mr.mr_list); /* Free global cache. */ - mlx5_mr_btree_free(&priv->mr.cache); - rte_rwlock_write_unlock(&priv->mr.rwlock); + mlx5_mr_btree_free(&sh->mr.cache); + rte_rwlock_write_unlock(&sh->mr.rwlock); /* Free all remaining MRs. */ - mlx5_mr_garbage_collect(dev); + mlx5_mr_garbage_collect(sh); } diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h index 786f6a3..89e89b7 100644 --- a/drivers/net/mlx5/mlx5_mr.h +++ b/drivers/net/mlx5/mlx5_mr.h @@ -62,6 +62,7 @@ struct mlx5_mr_ctrl { struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */ } __rte_packed; +struct mlx5_ibv_shared; extern struct mlx5_dev_list mlx5_mem_event_cb_list; extern rte_rwlock_t mlx5_mem_event_rwlock; @@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, size_t len, void *arg); int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, struct rte_mempool *mp); -void mlx5_mr_release(struct rte_eth_dev *dev); +void mlx5_mr_release(struct mlx5_ibv_shared *sh); /* Debug purpose functions. */ void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt); -void mlx5_mr_dump_dev(struct rte_eth_dev *dev); +void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh); /** * Look up LKey from given lookup table by linear search. Firstly look up the diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index 9965b2b..ed82594 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -814,7 +814,7 @@ struct mlx5_txq_ctrl * goto error; } /* Save pointer of global generation number to check memory event. */ - tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen; + tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen; assert(desc > MLX5_TX_COMP_THRESH); tmpl->txq.offloads = conf->offloads | dev->data->dev_conf.txmode.offloads; -- 1.8.3.1