From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 8CD75A0548; Thu, 8 Sep 2022 23:59:02 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 767EE4281E; Thu, 8 Sep 2022 23:59:02 +0200 (CEST) Received: from linux.microsoft.com (linux.microsoft.com [13.77.154.182]) by mails.dpdk.org (Postfix) with ESMTP id 6AD3540DDC for ; Thu, 8 Sep 2022 23:59:00 +0200 (CEST) Received: by linux.microsoft.com (Postfix, from userid 1004) id C625920B929C; Thu, 8 Sep 2022 14:58:59 -0700 (PDT) DKIM-Filter: OpenDKIM Filter v2.11.0 linux.microsoft.com C625920B929C DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linuxonhyperv.com; s=default; t=1662674339; bh=L3Pjw2rjQI7L0CaAsJ4deRlEoPdVhRW9bOMVvCCi0/s=; h=From:To:Cc:Subject:Date:In-Reply-To:References:Reply-To:From; b=W/t/SPEQLd9VqUGf4ecbQmq4lDm27n/QFtIn+8apuFE3UadGzbOjEYtLfv7zibzZV tlcL+3RPInMIPxTEbgU4Sl/m8nJo+sn5KKSex8PQIOByWx6Y80FEsGLClLI33lilnh 9lMB3mPQnzLLnQ8ZmNlJWJjzUNKUKz6cAyOmegNA= From: longli@linuxonhyperv.com To: Ferruh Yigit Cc: dev@dpdk.org, Ajay Sharma , Stephen Hemminger , Long Li Subject: [Patch v8 10/18] net/mana: implement memory registration Date: Thu, 8 Sep 2022 14:58:58 -0700 Message-Id: <1662674338-30425-1-git-send-email-longli@linuxonhyperv.com> X-Mailer: git-send-email 1.8.3.1 In-Reply-To: <1662169260-4953-11-git-send-email-longli@linuxonhyperv.com> References: <1662169260-4953-11-git-send-email-longli@linuxonhyperv.com> X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: longli@microsoft.com Errors-To: dev-bounces@dpdk.org From: Long Li MANA hardware has a built-in IOMMU that provides hardware-safe access to user memory through memory registration. 
Since memory registration is an expensive operation, this patch implements a two-level memory registration cache mechanism for each queue and for each port. Signed-off-by: Long Li --- Change log: v2: Change all header file functions to start with mana_. Use spinlock in place of rwlock to memory cache access. Remove unused header files. v4: Remove extra "\n" in logging function. v8: Fix Coding style to function definitions. drivers/net/mana/mana.c | 20 ++ drivers/net/mana/mana.h | 39 ++++ drivers/net/mana/meson.build | 1 + drivers/net/mana/mp.c | 92 +++++++++ drivers/net/mana/mr.c | 348 +++++++++++++++++++++++++++++++++++ 5 files changed, 500 insertions(+) create mode 100644 drivers/net/mana/mr.c diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c index bc8238a02b..67bef6bd32 100644 --- a/drivers/net/mana/mana.c +++ b/drivers/net/mana/mana.c @@ -111,6 +111,8 @@ mana_dev_close(struct rte_eth_dev *dev) struct mana_priv *priv = dev->data->dev_private; int ret; + mana_remove_all_mr(priv); + ret = mana_intr_uninstall(priv); if (ret) return ret; @@ -331,6 +333,13 @@ mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx, goto fail; } + ret = mana_mr_btree_init(&txq->mr_btree, + MANA_MR_BTREE_PER_QUEUE_N, socket_id); + if (ret) { + DRV_LOG(ERR, "Failed to init TXQ MR btree"); + goto fail; + } + DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p", queue_idx, nb_desc, socket_id, txq->desc_ring); @@ -353,6 +362,8 @@ mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid) { struct mana_txq *txq = dev->data->tx_queues[qid]; + mana_mr_btree_free(&txq->mr_btree); + rte_free(txq->desc_ring); rte_free(txq); } @@ -389,6 +400,13 @@ mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx, goto fail; } + ret = mana_mr_btree_init(&rxq->mr_btree, + MANA_MR_BTREE_PER_QUEUE_N, socket_id); + if (ret) { + DRV_LOG(ERR, "Failed to init RXQ MR btree"); + goto fail; + } + rxq->num_desc = nb_desc; rxq->priv = priv; @@ -409,6 +427,8 @@ 
mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid) { struct mana_rxq *rxq = dev->data->rx_queues[qid]; + mana_mr_btree_free(&rxq->mr_btree); + rte_free(rxq->desc_ring); rte_free(rxq); } diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h index 04ccdfa0d1..964c30551b 100644 --- a/drivers/net/mana/mana.h +++ b/drivers/net/mana/mana.h @@ -49,6 +49,22 @@ struct mana_shared_data { #define MAX_RECEIVE_BUFFERS_PER_QUEUE 256 #define MAX_SEND_BUFFERS_PER_QUEUE 256 +struct mana_mr_cache { + uint32_t lkey; + uintptr_t addr; + size_t len; + void *verb_obj; +}; + +#define MANA_MR_BTREE_CACHE_N 512 +struct mana_mr_btree { + uint16_t len; /* Used entries */ + uint16_t size; /* Total entries */ + int overflow; + int socket; + struct mana_mr_cache *table; +}; + struct mana_process_priv { void *db_page; }; @@ -81,6 +97,8 @@ struct mana_priv { int max_recv_sge; int max_mr; uint64_t max_mr_size; + struct mana_mr_btree mr_btree; + rte_spinlock_t mr_btree_lock; }; struct mana_txq_desc { @@ -130,6 +148,7 @@ struct mana_txq { uint32_t desc_ring_head, desc_ring_tail; struct mana_stats stats; + struct mana_mr_btree mr_btree; unsigned int socket; }; @@ -152,6 +171,7 @@ struct mana_rxq { struct mana_gdma_queue gdma_cq; struct mana_stats stats; + struct mana_mr_btree mr_btree; unsigned int socket; }; @@ -175,6 +195,24 @@ uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); +struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree, + struct mana_priv *priv, + struct rte_mbuf *mbuf); +int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv, + struct rte_mempool *pool); +void mana_remove_all_mr(struct mana_priv *priv); +void mana_del_pmd_mr(struct mana_mr_cache *mr); + +void mana_mempool_chunk_cb(struct rte_mempool *mp, void *opaque, + struct rte_mempool_memhdr *memhdr, unsigned int idx); + +struct mana_mr_cache 
*mana_mr_btree_lookup(struct mana_mr_btree *bt, + uint16_t *idx, + uintptr_t addr, size_t len); +int mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry); +int mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket); +void mana_mr_btree_free(struct mana_mr_btree *bt); + /** Request timeout for IPC. */ #define MANA_MP_REQ_TIMEOUT_SEC 5 @@ -203,6 +241,7 @@ int mana_mp_init_secondary(void); void mana_mp_uninit_primary(void); void mana_mp_uninit_secondary(void); int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev); +int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len); void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type); diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build index 81c4118f53..9771394370 100644 --- a/drivers/net/mana/meson.build +++ b/drivers/net/mana/meson.build @@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs'] sources += files( 'mana.c', + 'mr.c', 'mp.c', ) diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c index 4a3826755c..a3b5ede559 100644 --- a/drivers/net/mana/mp.c +++ b/drivers/net/mana/mp.c @@ -12,6 +12,55 @@ extern struct mana_shared_data *mana_shared_data; +/* + * Process MR request from secondary process. 
+ */ +static int +mana_mp_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len) +{ + struct ibv_mr *ibv_mr; + int ret; + struct mana_mr_cache *mr; + + ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)addr, len, + IBV_ACCESS_LOCAL_WRITE); + + if (!ibv_mr) + return -errno; + + DRV_LOG(DEBUG, "MR (2nd) lkey %u addr %p len %zu", + ibv_mr->lkey, ibv_mr->addr, ibv_mr->length); + + mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0); + if (!mr) { + DRV_LOG(ERR, "(2nd) Failed to allocate MR"); + ret = -ENOMEM; + goto fail_alloc; + } + mr->lkey = ibv_mr->lkey; + mr->addr = (uintptr_t)ibv_mr->addr; + mr->len = ibv_mr->length; + mr->verb_obj = ibv_mr; + + rte_spinlock_lock(&priv->mr_btree_lock); + ret = mana_mr_btree_insert(&priv->mr_btree, mr); + rte_spinlock_unlock(&priv->mr_btree_lock); + if (ret) { + DRV_LOG(ERR, "(2nd) Failed to add to global MR btree"); + goto fail_btree; + } + + return 0; + +fail_btree: + rte_free(mr); + +fail_alloc: + ibv_dereg_mr(ibv_mr); + + return ret; +} + static void mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type, int port_id) { @@ -47,6 +96,12 @@ mana_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer) mp_init_msg(&mp_res, param->type, param->port_id); switch (param->type) { + case MANA_MP_REQ_CREATE_MR: + ret = mana_mp_mr_create(priv, param->addr, param->len); + res->result = ret; + ret = rte_mp_reply(&mp_res, peer); + break; + case MANA_MP_REQ_VERBS_CMD_FD: mp_res.num_fds = 1; mp_res.fds[0] = priv->ib_ctx->cmd_fd; @@ -194,6 +249,43 @@ mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev) return ret; } +/* + * Request the primary process to register a MR. 
+ */ +int +mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len) +{ + struct rte_mp_msg mp_req = {0}; + struct rte_mp_msg *mp_res; + struct rte_mp_reply mp_rep; + struct mana_mp_param *req = (struct mana_mp_param *)mp_req.param; + struct mana_mp_param *res; + struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0}; + int ret; + + mp_init_msg(&mp_req, MANA_MP_REQ_CREATE_MR, priv->port_id); + req->addr = addr; + req->len = len; + + ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts); + if (ret) { + DRV_LOG(ERR, "Port %u request to primary failed", + req->port_id); + return ret; + } + + if (mp_rep.nb_received != 1) + return -EPROTO; + + mp_res = &mp_rep.msgs[0]; + res = (struct mana_mp_param *)mp_res->param; + ret = res->result; + + free(mp_rep.msgs); + + return ret; +} + void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type) { diff --git a/drivers/net/mana/mr.c b/drivers/net/mana/mr.c new file mode 100644 index 0000000000..22df0917bb --- /dev/null +++ b/drivers/net/mana/mr.c @@ -0,0 +1,348 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2022 Microsoft Corporation + */ + +#include +#include +#include + +#include + +#include "mana.h" + +struct mana_range { + uintptr_t start; + uintptr_t end; + uint32_t len; +}; + +void +mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque, + struct rte_mempool_memhdr *memhdr, unsigned int idx) +{ + struct mana_range *ranges = opaque; + struct mana_range *range = &ranges[idx]; + uint64_t page_size = rte_mem_page_size(); + + range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size); + range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len, + page_size); + range->len = range->end - range->start; +} + +/* + * Register all memory regions from pool. 
+ */ +int +mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv, + struct rte_mempool *pool) +{ + struct ibv_mr *ibv_mr; + struct mana_range ranges[pool->nb_mem_chunks]; + uint32_t i; + struct mana_mr_cache *mr; + int ret; + + rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges); + + for (i = 0; i < pool->nb_mem_chunks; i++) { + if (ranges[i].len > priv->max_mr_size) { + DRV_LOG(ERR, "memory chunk size %u exceeding max MR", + ranges[i].len); + return -ENOMEM; + } + + DRV_LOG(DEBUG, + "registering memory chunk start 0x%" PRIx64 " len %u", + ranges[i].start, ranges[i].len); + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + /* Send a message to the primary to do MR */ + ret = mana_mp_req_mr_create(priv, ranges[i].start, + ranges[i].len); + if (ret) { + DRV_LOG(ERR, + "MR failed start 0x%" PRIx64 " len %u", + ranges[i].start, ranges[i].len); + return ret; + } + continue; + } + + ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start, + ranges[i].len, IBV_ACCESS_LOCAL_WRITE); + if (ibv_mr) { + DRV_LOG(DEBUG, "MR lkey %u addr %p len %" PRIu64, + ibv_mr->lkey, ibv_mr->addr, ibv_mr->length); + + mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0); + mr->lkey = ibv_mr->lkey; + mr->addr = (uintptr_t)ibv_mr->addr; + mr->len = ibv_mr->length; + mr->verb_obj = ibv_mr; + + rte_spinlock_lock(&priv->mr_btree_lock); + ret = mana_mr_btree_insert(&priv->mr_btree, mr); + rte_spinlock_unlock(&priv->mr_btree_lock); + if (ret) { + ibv_dereg_mr(ibv_mr); + DRV_LOG(ERR, "Failed to add to global MR btree"); + return ret; + } + + ret = mana_mr_btree_insert(local_tree, mr); + if (ret) { + /* Don't need to clean up MR as it's already + * in the global tree + */ + DRV_LOG(ERR, "Failed to add to local MR btree"); + return ret; + } + } else { + DRV_LOG(ERR, "MR failed at 0x%" PRIx64 " len %u", + ranges[i].start, ranges[i].len); + return -errno; + } + } + return 0; +} + +/* + * Deregister a MR. 
+ */ +void +mana_del_pmd_mr(struct mana_mr_cache *mr) +{ + int ret; + struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj; + + ret = ibv_dereg_mr(ibv_mr); + if (ret) + DRV_LOG(ERR, "dereg MR failed ret %d", ret); +} + +/* + * Find a MR from cache. If not found, register a new MR. + */ +struct mana_mr_cache * +mana_find_pmd_mr(struct mana_mr_btree *local_mr_btree, struct mana_priv *priv, + struct rte_mbuf *mbuf) +{ + struct rte_mempool *pool = mbuf->pool; + int ret, second_try = 0; + struct mana_mr_cache *mr; + uint16_t idx; + + DRV_LOG(DEBUG, "finding mr for mbuf addr %p len %d", + mbuf->buf_addr, mbuf->buf_len); + +try_again: + /* First try to find the MR in local queue tree */ + mr = mana_mr_btree_lookup(local_mr_btree, &idx, + (uintptr_t)mbuf->buf_addr, mbuf->buf_len); + if (mr) { + DRV_LOG(DEBUG, + "Local mr lkey %u addr 0x%" PRIx64 " len %" PRIu64, + mr->lkey, mr->addr, mr->len); + return mr; + } + + /* If not found, try to find the MR in global tree */ + rte_spinlock_lock(&priv->mr_btree_lock); + mr = mana_mr_btree_lookup(&priv->mr_btree, &idx, + (uintptr_t)mbuf->buf_addr, + mbuf->buf_len); + rte_spinlock_unlock(&priv->mr_btree_lock); + + /* If found in the global tree, add it to the local tree */ + if (mr) { + ret = mana_mr_btree_insert(local_mr_btree, mr); + if (ret) { + DRV_LOG(DEBUG, "Failed to add MR to local tree."); + return NULL; + } + + DRV_LOG(DEBUG, + "Added local MR key %u addr 0x%" PRIx64 " len %" PRIu64, + mr->lkey, mr->addr, mr->len); + return mr; + } + + if (second_try) { + DRV_LOG(ERR, "Internal error second try failed"); + return NULL; + } + + ret = mana_new_pmd_mr(local_mr_btree, priv, pool); + if (ret) { + DRV_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d", + ret, mbuf->buf_addr, mbuf->buf_len); + return NULL; + } + + second_try = 1; + goto try_again; +} + +void +mana_remove_all_mr(struct mana_priv *priv) +{ + struct mana_mr_btree *bt = &priv->mr_btree; + struct mana_mr_cache *mr; + struct ibv_mr *ibv_mr; + uint16_t i; + + 
rte_spinlock_lock(&priv->mr_btree_lock); + /* Start with index 1 as the 1st entry is always NULL */ + for (i = 1; i < bt->len; i++) { + mr = &bt->table[i]; + ibv_mr = mr->verb_obj; + ibv_dereg_mr(ibv_mr); + } + bt->len = 1; + rte_spinlock_unlock(&priv->mr_btree_lock); +} + +/* + * Expand the MR cache. + * MR cache is maintained as a btree and expand on demand. + */ +static int +mana_mr_btree_expand(struct mana_mr_btree *bt, int n) +{ + void *mem; + + mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache), + 0, bt->socket); + if (!mem) { + DRV_LOG(ERR, "Failed to expand btree size %d", n); + return -1; + } + + DRV_LOG(ERR, "Expanded btree to size %d", n); + bt->table = mem; + bt->size = n; + + return 0; +} + +/* + * Look for a region of memory in MR cache. + */ +struct mana_mr_cache * +mana_mr_btree_lookup(struct mana_mr_btree *bt, uint16_t *idx, + uintptr_t addr, size_t len) +{ + struct mana_mr_cache *table; + uint16_t n; + uint16_t base = 0; + int ret; + + n = bt->len; + + /* Try to double the cache if it's full */ + if (n == bt->size) { + ret = mana_mr_btree_expand(bt, bt->size << 1); + if (ret) + return NULL; + } + + table = bt->table; + + /* Do binary search on addr */ + do { + uint16_t delta = n >> 1; + + if (addr < table[base + delta].addr) { + n = delta; + } else { + base += delta; + n -= delta; + } + } while (n > 1); + + *idx = base; + + if (addr + len <= table[base].addr + table[base].len) + return &table[base]; + + DRV_LOG(DEBUG, + "addr 0x%" PRIx64 " len %zu idx %u sum 0x%" PRIx64 " not found", + addr, len, *idx, addr + len); + + return NULL; +} + +int +mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket) +{ + memset(bt, 0, sizeof(*bt)); + bt->table = rte_calloc_socket("MANA B-tree table", + n, + sizeof(struct mana_mr_cache), + 0, socket); + if (!bt->table) { + DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d", + n, socket); + return -ENOMEM; + } + + bt->socket = socket; + bt->size = n; + + /* First entry must be NULL for 
binary search to work */ + bt->table[0] = (struct mana_mr_cache) { + .lkey = UINT32_MAX, + }; + bt->len = 1; + + DRV_LOG(ERR, "B-tree initialized table %p size %d len %d", + bt->table, n, bt->len); + + return 0; +} + +void +mana_mr_btree_free(struct mana_mr_btree *bt) +{ + rte_free(bt->table); + memset(bt, 0, sizeof(*bt)); +} + +int +mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry) +{ + struct mana_mr_cache *table; + uint16_t idx = 0; + uint16_t shift; + + if (mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len)) { + DRV_LOG(DEBUG, "Addr 0x%" PRIx64 " len %zu exists in btree", + entry->addr, entry->len); + return 0; + } + + if (bt->len >= bt->size) { + bt->overflow = 1; + return -1; + } + + table = bt->table; + + idx++; + shift = (bt->len - idx) * sizeof(struct mana_mr_cache); + if (shift) { + DRV_LOG(DEBUG, "Moving %u bytes from idx %u to %u", + shift, idx, idx + 1); + memmove(&table[idx + 1], &table[idx], shift); + } + + table[idx] = *entry; + bt->len++; + + DRV_LOG(DEBUG, + "Inserted MR b-tree table %p idx %d addr 0x%" PRIx64 " len %zu", + table, idx, entry->addr, entry->len); + + return 0; +} -- 2.17.1