From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id A13B643DE5; Wed, 3 Apr 2024 11:25:37 +0200 (CEST) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 0E39A40A72; Wed, 3 Apr 2024 11:25:12 +0200 (CEST) Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) by mails.dpdk.org (Postfix) with ESMTP id 8C94D40A6F for ; Wed, 3 Apr 2024 11:25:10 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1712136310; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=UkehagZcbZwuThpik0rBz4rDUxpt0YcZCblGv9du2LI=; b=gKKM023HhEwwdzQsSTahmQeP6XlrzB23q3sO5C9yIdgOZA3kBEFkmGMXurQi69v1XJ0WN2 HcMWaeR4jH7d7sLc8PwFmJQwkT35nMnhLgl2KW58/AlVRuLeUl0HMDdEItNx2wTqhPWRht bk0fhvdgizX/mTpUlPvbwoWyFGAuoY8= Received: from mimecast-mx02.redhat.com (mx-ext.redhat.com [66.187.233.73]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-686-QthxwNsIO1yM0-bYgHvNGQ-1; Wed, 03 Apr 2024 05:25:06 -0400 X-MC-Unique: QthxwNsIO1yM0-bYgHvNGQ-1 Received: from smtp.corp.redhat.com (int-mx09.intmail.prod.int.rdu2.redhat.com [10.11.54.9]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id 5AA9B3C29854; Wed, 3 Apr 2024 09:25:06 +0000 (UTC) Received: from max-p1.redhat.com (unknown [10.39.208.23]) by smtp.corp.redhat.com (Postfix) with ESMTP id ECB3C492BCC; Wed, 3 Apr 2024 09:25:04 +0000 (UTC) From: Maxime Coquelin To: dev@dpdk.org, david.marchand@redhat.com, chenbox@nvidia.com Cc: Maxime Coquelin Subject: [PATCH v2 5/5] vhost: manage FD with epoll Date: Wed, 3 Apr 2024 11:24:48 +0200 Message-ID: <20240403092448.1361820-6-maxime.coquelin@redhat.com> In-Reply-To: <20240403092448.1361820-1-maxime.coquelin@redhat.com> References: <20240403092448.1361820-1-maxime.coquelin@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.9 X-Mimecast-Spam-Score: 0 X-Mimecast-Originator: redhat.com Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset="US-ASCII"; x-default=true X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org From: David Marchand Switch to epoll so that the concern over the poll() fd array is removed. Add a simple list of used entries and track the next free entry. epoll() is thread safe, we no more need a synchronization mechanism and so can remove the notification pipe. Signed-off-by: David Marchand Signed-off-by: Maxime Coquelin --- lib/vhost/fd_man.c | 399 ++++++++++++--------------------------------- lib/vhost/fd_man.h | 5 +- 2 files changed, 106 insertions(+), 298 deletions(-) diff --git a/lib/vhost/fd_man.c b/lib/vhost/fd_man.c index 8b47c97d45..a4a2965da1 100644 --- a/lib/vhost/fd_man.c +++ b/lib/vhost/fd_man.c @@ -3,9 +3,9 @@ */ #include -#include #include #include +#include #include #include @@ -21,49 +21,34 @@ RTE_LOG_REGISTER_SUFFIX(vhost_fdset_logtype, fdset, INFO); #define VHOST_FDMAN_LOG(level, ...) \ RTE_LOG_LINE(level, VHOST_FDMAN, "" __VA_ARGS__) -#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) - struct fdentry { int fd; /* -1 indicates this entry is empty */ fd_cb rcb; /* callback when this fd is readable. */ fd_cb wcb; /* callback when this fd is writeable.*/ void *dat; /* fd context */ int busy; /* whether this entry is being used in cb. */ + LIST_ENTRY(fdentry) next; }; struct fdset { char name[RTE_THREAD_NAME_SIZE]; - struct pollfd rwfds[MAX_FDS]; + int epfd; struct fdentry fd[MAX_FDS]; + LIST_HEAD(, fdentry) fdlist; + int next_free_idx; rte_thread_t tid; pthread_mutex_t fd_mutex; - pthread_mutex_t fd_polling_mutex; - int num; /* current fd number of this fdset */ - - union pipefds { - struct { - int pipefd[2]; - }; - struct { - int readfd; - int writefd; - }; - } u; - - pthread_mutex_t sync_mutex; - pthread_cond_t sync_cond; - bool sync; + bool destroy; }; -static int fdset_add_no_sync(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat); -static uint32_t fdset_event_dispatch(void *arg); - #define MAX_FDSETS 8 static struct fdset *fdsets[MAX_FDSETS]; pthread_mutex_t fdsets_mutex = PTHREAD_MUTEX_INITIALIZER; +static uint32_t fdset_event_dispatch(void *arg); + static struct fdset * fdset_lookup(const char *name) { @@ -96,166 +81,6 @@ fdset_insert(struct fdset *fdset) return -1; } -static void -fdset_pipe_read_cb(int readfd, void *dat, - int *remove __rte_unused) -{ - char charbuf[16]; - struct fdset *fdset = dat; - int r = read(readfd, charbuf, sizeof(charbuf)); - /* - * Just an optimization, we don't care if read() failed - * so ignore explicitly its return value to make the - * compiler happy - */ - RTE_SET_USED(r); - - pthread_mutex_lock(&fdset->sync_mutex); - fdset->sync = true; - pthread_cond_broadcast(&fdset->sync_cond); - pthread_mutex_unlock(&fdset->sync_mutex); -} - -static void -fdset_pipe_uninit(struct fdset *fdset) -{ - fdset_del(fdset, fdset->u.readfd); - close(fdset->u.readfd); - fdset->u.readfd = -1; - close(fdset->u.writefd); - fdset->u.writefd = -1; -} - -static int -fdset_pipe_init(struct fdset *fdset) -{ - int ret; - - pthread_mutex_init(&fdset->sync_mutex, NULL); - pthread_cond_init(&fdset->sync_cond, NULL); - - if (pipe(fdset->u.pipefd) < 0) { - VHOST_FDMAN_LOG(ERR, - "failed to create pipe for vhost fdset"); - return -1; - } - - ret = fdset_add_no_sync(fdset, fdset->u.readfd, - fdset_pipe_read_cb, NULL, fdset); - if (ret < 0) { - VHOST_FDMAN_LOG(ERR, - "failed to add pipe readfd %d into vhost server fdset", - fdset->u.readfd); - - fdset_pipe_uninit(fdset); - return -1; - } - - return 0; -} - -static void -fdset_sync(struct fdset *fdset) -{ - int ret; - - pthread_mutex_lock(&fdset->sync_mutex); - - fdset->sync = false; - ret = write(fdset->u.writefd, "1", 1); - if (ret < 0) { - VHOST_FDMAN_LOG(ERR, - "Failed to write to notification pipe: %s", - strerror(errno)); - goto out_unlock; - } - - while (!fdset->sync) - pthread_cond_wait(&fdset->sync_cond, &fdset->sync_mutex); - -out_unlock: - pthread_mutex_unlock(&fdset->sync_mutex); -} - -static int -get_last_valid_idx(struct fdset *pfdset, int last_valid_idx) -{ - int i; - - for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--) - ; - - return i; -} - -static void -fdset_move(struct fdset *pfdset, int dst, int src) -{ - pfdset->fd[dst] = pfdset->fd[src]; - pfdset->rwfds[dst] = pfdset->rwfds[src]; -} - -static void -fdset_shrink_nolock(struct fdset *pfdset) -{ - int i; - int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); - - for (i = 0; i < last_valid_idx; i++) { - if (pfdset->fd[i].fd != -1) - continue; - - fdset_move(pfdset, i, last_valid_idx); - last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); - } - pfdset->num = last_valid_idx + 1; -} - -/* - * Find deleted fd entries and remove them - */ -static void -fdset_shrink(struct fdset *pfdset) -{ - pthread_mutex_lock(&pfdset->fd_mutex); - fdset_shrink_nolock(pfdset); - pthread_mutex_unlock(&pfdset->fd_mutex); -} - -/** - * Returns the index in the fdset for a given fd. - * @return - * index for the fd, or -1 if fd isn't in the fdset. - */ -static int -fdset_find_fd(struct fdset *pfdset, int fd) -{ - int i; - - for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++) - ; - - return i == pfdset->num ? -1 : i; -} - -static void -fdset_add_fd(struct fdset *pfdset, int idx, int fd, - fd_cb rcb, fd_cb wcb, void *dat) -{ - struct fdentry *pfdentry = &pfdset->fd[idx]; - struct pollfd *pfd = &pfdset->rwfds[idx]; - - pfdentry->fd = fd; - pfdentry->rcb = rcb; - pfdentry->wcb = wcb; - pfdentry->dat = dat; - - pfd->fd = fd; - pfd->events = rcb ? POLLIN : 0; - pfd->events |= wcb ? POLLOUT : 0; - pfd->revents = 0; -} - struct fdset * fdset_init(const char *name) { @@ -284,16 +109,20 @@ fdset_init(const char *name) rte_strscpy(fdset->name, name, RTE_THREAD_NAME_SIZE); pthread_mutex_init(&fdset->fd_mutex, NULL); - pthread_mutex_init(&fdset->fd_polling_mutex, NULL); - for (i = 0; i < MAX_FDS; i++) { + for (i = 0; i < (int)RTE_DIM(fdset->fd); i++) { fdset->fd[i].fd = -1; fdset->fd[i].dat = NULL; } - fdset->num = 0; + LIST_INIT(&fdset->fdlist); - if (fdset_pipe_init(fdset)) { - VHOST_FDMAN_LOG(ERR, "Failed to init pipe for %s", name); + /* + * Any non-zero value would work (see man epoll_create), + * but pass MAX_FDS for consistency. + */ + fdset->epfd = epoll_create(MAX_FDS); + if (fdset->epfd < 0) { + VHOST_FDMAN_LOG(ERR, "failed to create epoll for %s fdset", name); goto err_free; } @@ -301,7 +130,7 @@ fdset_init(const char *name) fdset_event_dispatch, fdset)) { VHOST_FDMAN_LOG(ERR, "Failed to create %s event dispatch thread", fdset->name); - goto err_pipe; + goto err_epoll; } if (fdset_insert(fdset)) { @@ -315,10 +144,9 @@ fdset_init(const char *name) err_thread: fdset->destroy = true; - fdset_sync(fdset); rte_thread_join(fdset->tid, &val); -err_pipe: - fdset_pipe_uninit(fdset); +err_epoll: + close(fdset->epfd); err_free: rte_free(fdset); err_unlock: @@ -330,78 +158,99 @@ fdset_init(const char *name) /** * Register the fd in the fdset with read/write handler and context. */ -static int -fdset_add_no_sync(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) { - int i; + struct fdentry *pfdentry; + struct epoll_event ev; if (pfdset == NULL || fd == -1) return -1; pthread_mutex_lock(&pfdset->fd_mutex); - i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; - if (i == -1) { - pthread_mutex_lock(&pfdset->fd_polling_mutex); - fdset_shrink_nolock(pfdset); - pthread_mutex_unlock(&pfdset->fd_polling_mutex); - i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; - if (i == -1) { - pthread_mutex_unlock(&pfdset->fd_mutex); - return -2; - } + if (pfdset->next_free_idx >= (int)RTE_DIM(pfdset->fd)) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; } - fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); + pfdentry = &pfdset->fd[pfdset->next_free_idx]; + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + LIST_INSERT_HEAD(&pfdset->fdlist, pfdentry, next); + + /* Find next free slot */ + pfdset->next_free_idx++; + for (; pfdset->next_free_idx < (int)RTE_DIM(pfdset->fd); pfdset->next_free_idx++) { + if (pfdset->fd[pfdset->next_free_idx].fd != -1) + continue; + break; + } pthread_mutex_unlock(&pfdset->fd_mutex); + ev.events = EPOLLERR; + ev.events |= rcb ? EPOLLIN : 0; + ev.events |= wcb ? EPOLLOUT : 0; + ev.data.fd = fd; + + if (epoll_ctl(pfdset->epfd, EPOLL_CTL_ADD, fd, &ev) == -1) + VHOST_FDMAN_LOG(ERR, "could not add %d fd to %d epfd: %s", + fd, pfdset->epfd, strerror(errno)); + return 0; } -int -fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +static struct fdentry * +fdset_find_entry_locked(struct fdset *pfdset, int fd) { - int ret; + struct fdentry *pfdentry; - ret = fdset_add_no_sync(pfdset, fd, rcb, wcb, dat); - if (ret < 0) - return ret; + LIST_FOREACH(pfdentry, &pfdset->fdlist, next) { + if (pfdentry->fd != fd) + continue; + return pfdentry; + } - fdset_sync(pfdset); + return NULL; +} - return 0; +static void +fdset_del_locked(struct fdset *pfdset, struct fdentry *pfdentry) +{ + int entry_idx; + + if (epoll_ctl(pfdset->epfd, EPOLL_CTL_DEL, pfdentry->fd, NULL) == -1) + VHOST_FDMAN_LOG(ERR, "could not remove %d fd from %d epfd: %s", + pfdentry->fd, pfdset->epfd, strerror(errno)); + + pfdentry->fd = -1; + pfdentry->rcb = pfdentry->wcb = NULL; + pfdentry->dat = NULL; + entry_idx = pfdentry - pfdset->fd; + if (entry_idx < pfdset->next_free_idx) + pfdset->next_free_idx = entry_idx; + LIST_REMOVE(pfdentry, next); } -/** - * Unregister the fd from the fdset. - * Returns context of a given fd or NULL. - */ -void * +void fdset_del(struct fdset *pfdset, int fd) { - int i; - void *dat = NULL; + struct fdentry *pfdentry; if (pfdset == NULL || fd == -1) - return NULL; + return; do { pthread_mutex_lock(&pfdset->fd_mutex); - - i = fdset_find_fd(pfdset, fd); - if (i != -1 && pfdset->fd[i].busy == 0) { - /* busy indicates r/wcb is executing! */ - dat = pfdset->fd[i].dat; - pfdset->fd[i].fd = -1; - pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; - pfdset->fd[i].dat = NULL; - i = -1; + pfdentry = fdset_find_entry_locked(pfdset, fd); + if (pfdentry != NULL && pfdentry->busy == 0) { + fdset_del_locked(pfdset, pfdentry); + pfdentry = NULL; } pthread_mutex_unlock(&pfdset->fd_mutex); - } while (i != -1); - - fdset_sync(pfdset); - - return dat; + } while (pfdentry != NULL); } /** @@ -415,28 +264,22 @@ fdset_del(struct fdset *pfdset, int fd) int fdset_try_del(struct fdset *pfdset, int fd) { - int i; + struct fdentry *pfdentry; if (pfdset == NULL || fd == -1) return -2; pthread_mutex_lock(&pfdset->fd_mutex); - i = fdset_find_fd(pfdset, fd); - if (i != -1 && pfdset->fd[i].busy) { + pfdentry = fdset_find_entry_locked(pfdset, fd); + if (pfdentry != NULL && pfdentry->busy != 0) { pthread_mutex_unlock(&pfdset->fd_mutex); return -1; } - if (i != -1) { - pfdset->fd[i].fd = -1; - pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; - pfdset->fd[i].dat = NULL; - } + if (pfdentry != NULL) + fdset_del_locked(pfdset, pfdentry); pthread_mutex_unlock(&pfdset->fd_mutex); - - fdset_sync(pfdset); - return 0; } @@ -453,53 +296,29 @@ static uint32_t fdset_event_dispatch(void *arg) { int i; - struct pollfd *pfd; - struct fdentry *pfdentry; fd_cb rcb, wcb; void *dat; int fd, numfds; int remove1, remove2; - int need_shrink; struct fdset *pfdset = arg; - int val; if (pfdset == NULL) return 0; while (1) { + struct epoll_event events[MAX_FDS]; + struct fdentry *pfdentry; - /* - * When poll is blocked, other threads might unregister - * listenfds from and register new listenfds into fdset. - * When poll returns, the entries for listenfds in the fdset - * might have been updated. It is ok if there is unwanted call - * for new listenfds. - */ - pthread_mutex_lock(&pfdset->fd_mutex); - numfds = pfdset->num; - pthread_mutex_unlock(&pfdset->fd_mutex); - - pthread_mutex_lock(&pfdset->fd_polling_mutex); - val = poll(pfdset->rwfds, numfds, 1000 /* millisecs */); - pthread_mutex_unlock(&pfdset->fd_polling_mutex); - if (val < 0) + numfds = epoll_wait(pfdset->epfd, events, RTE_DIM(events), 1000); + if (numfds < 0) continue; - need_shrink = 0; for (i = 0; i < numfds; i++) { pthread_mutex_lock(&pfdset->fd_mutex); - pfdentry = &pfdset->fd[i]; - fd = pfdentry->fd; - pfd = &pfdset->rwfds[i]; - - if (fd < 0) { - need_shrink = 1; - pthread_mutex_unlock(&pfdset->fd_mutex); - continue; - } - - if (!pfd->revents) { + fd = events[i].data.fd; + pfdentry = fdset_find_entry_locked(pfdset, fd); + if (pfdentry == NULL) { pthread_mutex_unlock(&pfdset->fd_mutex); continue; } @@ -513,9 +332,9 @@ fdset_event_dispatch(void *arg) pthread_mutex_unlock(&pfdset->fd_mutex); - if (rcb && pfd->revents & (POLLIN | FDPOLLERR)) + if (rcb && events[i].events & (EPOLLIN | EPOLLERR | EPOLLHUP)) rcb(fd, dat, &remove1); - if (wcb && pfd->revents & (POLLOUT | FDPOLLERR)) + if (wcb && events[i].events & (EPOLLOUT | EPOLLERR | EPOLLHUP)) wcb(fd, dat, &remove2); pfdentry->busy = 0; /* @@ -524,23 +343,13 @@ fdset_event_dispatch(void *arg) * directly. */ /* - * When we are to clean up the fd from fdset, - * because the fd is closed in the cb, - * the old fd val could be reused by when creates new - * listen fd in another thread, we couldn't call - * fdset_del. + * A concurrent fdset_del may have been waiting for the + * fdentry not to be busy, so we can't call + * fdset_del_locked(). */ - if (remove1 || remove2) { - pfdentry->fd = -1; - need_shrink = 1; - } + if (remove1 || remove2) + fdset_del(pfdset, fd); } - - if (need_shrink) - fdset_shrink(pfdset); - - if (pfdset->destroy) - break; } return 0; diff --git a/lib/vhost/fd_man.h b/lib/vhost/fd_man.h index 079fa0155f..6398343a6a 100644 --- a/lib/vhost/fd_man.h +++ b/lib/vhost/fd_man.h @@ -6,7 +6,7 @@ #define _FD_MAN_H_ #include #include -#include +#include struct fdset; @@ -19,8 +19,7 @@ struct fdset *fdset_init(const char *name); int fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat); -void *fdset_del(struct fdset *pfdset, int fd); - +void fdset_del(struct fdset *pfdset, int fd); int fdset_try_del(struct fdset *pfdset, int fd); #endif -- 2.44.0