* [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-26 9:40 ` [dpdk-dev] [PATCH v3] " Li Lin
@ 2019-04-26 9:40 ` Li Lin
2019-04-28 11:17 ` Tiwei Bie
2019-05-05 9:02 ` [dpdk-dev] [PATCH v4] " Li Lin
2 siblings, 0 replies; 25+ messages in thread
From: Li Lin @ 2019-04-26 9:40 UTC (permalink / raw)
To: tiwei.bie, maxime.coquelin, zhihong.wang
Cc: dev, dariusz.stojaczyk, changpeng.liu, james.r.harris, lilin24,
Ni Xun, Zhang Yu
From: lilin24 <lilin24@baidu.com>
This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
buffer between qemu and backend.
Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
shared buffer from backend. Then qemu should send it back
through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
This shared buffer is used to process inflight I/O when backend
reconnect.
Signed-off-by: lilin24 <lilin24@baidu.com>
Signed-off-by: Ni Xun <nixun@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
v2:
- add set&clr inflight entry function
v3:
- simplified some function implementations
---
lib/librte_vhost/rte_vhost.h | 53 ++++++++++
lib/librte_vhost/vhost.c | 42 ++++++++
lib/librte_vhost/vhost.h | 11 ++
lib/librte_vhost/vhost_user.c | 231 ++++++++++++++++++++++++++++++++++++++++++
lib/librte_vhost/vhost_user.h | 16 ++-
5 files changed, 351 insertions(+), 2 deletions(-)
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d2c0c93f4..bc25591a8 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -71,6 +71,10 @@ extern "C" {
#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
#endif
+#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
+#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
+#endif
+
/** Indicate whether protocol features negotiation is supported. */
#ifndef VHOST_USER_F_PROTOCOL_FEATURES
#define VHOST_USER_F_PROTOCOL_FEATURES 30
@@ -98,12 +102,26 @@ struct rte_vhost_memory {
struct rte_vhost_mem_region regions[];
};
+typedef struct VhostUserInflightEntry {
+ uint8_t inflight;
+} VhostUserInflightEntry;
+
+typedef struct VhostInflightInfo {
+ uint16_t version;
+ uint16_t last_inflight_io;
+ uint16_t used_idx;
+ VhostUserInflightEntry desc[0];
+} VhostInflightInfo;
+
struct rte_vhost_vring {
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
uint64_t log_guest_addr;
+ VhostInflightInfo *inflight;
+ int inflight_flag;
+
/** Deprecated, use rte_vhost_vring_call() instead. */
int callfd;
@@ -632,6 +650,41 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
int rte_vhost_vring_call(int vid, uint16_t vring_idx);
/**
+ * set inflight flag for a entry.
+ *
+ * @param vring
+ * the structure to hold the requested vring info
+ * @param idx
+ * inflight entry index
+ */
+void rte_vhost_set_inflight(struct rte_vhost_vring *vring,
+ uint16_t idx);
+
+/**
+ * clear inflight flag for a entry.
+ *
+ * @param vring
+ * the structure to hold the requested vring info
+ * @param last_used_idx
+ * next free used_idx
+ * @param idx
+ * inflight entry index
+ */
+void rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
+ uint16_t last_used_idx, uint16_t idx);
+
+/**
+ * set last inflight io index.
+ *
+ * @param vring
+ * the structure to hold the requested vring info
+ * @param idx
+ * inflight entry index
+ */
+void rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring,
+ uint16_t idx);
+
+/**
* Get vhost RX queue avail count.
*
* @param vid
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 163f4595e..9ba692935 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -76,6 +76,8 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
close(vq->callfd);
if (vq->kickfd >= 0)
close(vq->kickfd);
+ if (vq->inflight)
+ vq->inflight = NULL;
}
/*
@@ -589,6 +591,11 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
vring->kickfd = vq->kickfd;
vring->size = vq->size;
+ vring->inflight = vq->inflight;
+
+ vring->inflight_flag = vq->inflight_flag;
+ vq->inflight_flag = 0;
+
return 0;
}
@@ -617,6 +624,41 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
return 0;
}
+void
+rte_vhost_set_inflight(struct rte_vhost_vring *vring, uint16_t idx)
+{
+ VhostInflightInfo *inflight = vring->inflight;
+ if (unlikely(!inflight))
+ return;
+ inflight->desc[idx].inflight = 1;
+}
+
+void
+rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
+ uint16_t last_used_idx, uint16_t idx)
+{
+ VhostInflightInfo *inflight = vring->inflight;
+
+ if (unlikely(!inflight))
+ return;
+
+ rte_compiler_barrier();
+ inflight->desc[idx].inflight = 0;
+ rte_compiler_barrier();
+ inflight->used_idx = last_used_idx;
+}
+
+void
+rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring, uint16_t idx)
+{
+ VhostInflightInfo *inflight = vring->inflight;
+
+ if (unlikely(!inflight))
+ return;
+
+ inflight->last_inflight_io = idx;
+}
+
uint16_t
rte_vhost_avail_entries(int vid, uint16_t queue_id)
{
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index e9138dfab..537d74c71 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -128,6 +128,10 @@ struct vhost_virtqueue {
/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+ /* Inflight share memory info */
+ VhostInflightInfo *inflight;
+ bool inflight_flag;
+
uint16_t nr_zmbuf;
uint16_t zmbuf_size;
uint16_t last_zmbuf_idx;
@@ -286,6 +290,12 @@ struct guest_page {
uint64_t size;
};
+typedef struct VuDevInflightInfo {
+ int fd;
+ void *addr;
+ uint64_t size;
+} VuDevInflightInfo;
+
/**
* Device structure contains all configuration information relating
* to the device.
@@ -303,6 +313,7 @@ struct virtio_net {
uint32_t nr_vring;
int dequeue_zero_copy;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+ VuDevInflightInfo inflight_info;
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
char ifname[IF_NAME_SZ];
uint64_t log_size;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index c9e29ece8..92b878874 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -31,6 +31,8 @@
#include <sys/stat.h>
#include <sys/syscall.h>
#include <assert.h>
+#include <sys/syscall.h>
+#include <asm/unistd.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif
@@ -49,6 +51,14 @@
#define VIRTIO_MIN_MTU 68
#define VIRTIO_MAX_MTU 65535
+#define INFLIGHT_ALIGNMENT 64
+#define INFLIGHT_VERSION 0xabcd
+
+#define CLOEXEC 0x0001U
+
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
static const char *vhost_message_str[VHOST_USER_MAX] = {
[VHOST_USER_NONE] = "VHOST_USER_NONE",
[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
@@ -78,6 +88,8 @@ static const char *vhost_message_str[VHOST_USER_MAX] = {
[VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE",
[VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN",
[VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END",
+ [VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD",
+ [VHOST_USER_SET_INFLIGHT_FD] = "VHOST_USER_SET_INFLIGHT_FD",
};
static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg);
@@ -160,6 +172,16 @@ vhost_backend_cleanup(struct virtio_net *dev)
dev->log_addr = 0;
}
+ if (dev->inflight_info.addr) {
+ munmap(dev->inflight_info.addr, dev->inflight_info.size);
+ dev->inflight_info.addr = NULL;
+ }
+
+ if (dev->inflight_info.fd > 0) {
+ close(dev->inflight_info.fd);
+ dev->inflight_info.fd = -1;
+ }
+
if (dev->slave_req_fd >= 0) {
close(dev->slave_req_fd);
dev->slave_req_fd = -1;
@@ -1165,6 +1187,184 @@ virtio_is_ready(struct virtio_net *dev)
return 1;
}
+static int mem_create(const char *name, unsigned int flags)
+{
+#ifdef __NR_memfd_create
+ return syscall(__NR_memfd_create, name, flags);
+#else
+ return -1;
+#endif
+}
+
+void *inflight_mem_alloc(const char *name, size_t size, int *fd)
+{
+ void *ptr;
+ int mfd = -1;
+ char fname[20] = "/tmp/memfd-XXXXXX";
+
+ *fd = -1;
+ mfd = mem_create(name, CLOEXEC);
+ if (mfd != -1) {
+ if (ftruncate(mfd, size) == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "ftruncate fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+ } else {
+ mfd = mkstemp(fname);
+ unlink(fname);
+
+ if (mfd == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mkstemp fail for alloc inflight buffer\n");
+ return NULL;
+ }
+
+ if (ftruncate(mfd, size) == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "ftruncate fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+ }
+
+ ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
+ if (ptr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+
+ *fd = mfd;
+ return ptr;
+}
+
+static uint32_t get_pervq_shm_size(uint16_t queue_size)
+{
+ return ALIGN_UP(sizeof(VhostUserInflightEntry) * queue_size +
+ sizeof(uint16_t) * 3, INFLIGHT_ALIGNMENT);
+}
+
+static int
+vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ int fd;
+ uint64_t mmap_size;
+ void *addr;
+ uint16_t num_queues, queue_size;
+ struct virtio_net *dev = *pdev;
+
+ if (msg->size != sizeof(msg->payload.inflight)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Invalid get_inflight_fd message size is %d",
+ msg->size);
+ msg->payload.inflight.mmap_size = 0;
+ return 0;
+ }
+
+ num_queues = msg->payload.inflight.num_queues;
+ queue_size = msg->payload.inflight.queue_size;
+
+ RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
+ msg->payload.inflight.num_queues);
+ RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
+ msg->payload.inflight.queue_size);
+ mmap_size = num_queues * get_pervq_shm_size(queue_size);
+
+ addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
+ if (!addr) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
+ msg->payload.inflight.mmap_size = 0;
+ return 0;
+ }
+ memset(addr, 0, mmap_size);
+
+ dev->inflight_info.addr = addr;
+ dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
+ dev->inflight_info.fd = msg->fds[0] = fd;
+ msg->payload.inflight.mmap_offset = 0;
+ msg->fd_num = 1;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight mmap_size: %lu\n",
+ msg->payload.inflight.mmap_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight mmap_offset: %lu\n",
+ msg->payload.inflight.mmap_offset);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight fd: %d\n", msg->fds[0]);
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+static int
+vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ int fd, i;
+ uint64_t mmap_size, mmap_offset;
+ uint16_t num_queues, queue_size;
+ uint32_t pervq_inflight_size;
+ void *rc;
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev = *pdev;
+
+ fd = msg->fds[0];
+ if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
+ msg->size, fd);
+ return -1;
+ }
+
+ mmap_size = msg->payload.inflight.mmap_size;
+ mmap_offset = msg->payload.inflight.mmap_offset;
+ num_queues = msg->payload.inflight.num_queues;
+ queue_size = msg->payload.inflight.queue_size;
+ pervq_inflight_size = get_pervq_shm_size(queue_size);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd mmap_size: %lu\n", mmap_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd num_queues: %u\n", num_queues);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd queue_size: %u\n", queue_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd fd: %d\n", fd);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd pervq_inflight_size: %d\n",
+ pervq_inflight_size);
+
+ if (dev->inflight_info.addr)
+ munmap(dev->inflight_info.addr, dev->inflight_info.size);
+
+ rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd, mmap_offset);
+ if (rc == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
+ return -1;
+ }
+
+ if (dev->inflight_info.fd)
+ close(dev->inflight_info.fd);
+
+ dev->inflight_info.fd = fd;
+ dev->inflight_info.addr = rc;
+ dev->inflight_info.size = mmap_size;
+
+ for (i = 0; i < num_queues; i++) {
+ vq = dev->virtqueue[i];
+ vq->inflight = (VhostInflightInfo *)rc;
+ rc = (void *)((char *)rc + pervq_inflight_size);
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
static int
vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
int main_fd __rte_unused)
@@ -1202,6 +1402,29 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
}
static int
+vhost_check_queue_inflights(struct vhost_virtqueue *vq)
+{
+ uint16_t i = 0;
+
+ if ((!vq->inflight))
+ return RTE_VHOST_MSG_RESULT_ERR;
+
+ if (!vq->inflight->version) {
+ vq->inflight->version = INFLIGHT_VERSION;
+ return RTE_VHOST_MSG_RESULT_OK;
+ }
+
+ for (i = 0; i < vq->size; i++) {
+ if (vq->inflight->desc[i].inflight == 1) {
+ vq->inflight_flag = 1;
+ break;
+ }
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
int main_fd __rte_unused)
{
@@ -1242,6 +1465,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
close(vq->kickfd);
vq->kickfd = file.fd;
+ if (vhost_check_queue_inflights(vq)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to inflights for vq: %d\n", file.index);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
return RTE_VHOST_MSG_RESULT_OK;
}
@@ -1762,6 +1991,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
[VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
[VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
[VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
+ [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
+ [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
};
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 2a650fe4b..35f969b1b 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -23,7 +23,8 @@
(1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
(1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
(1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
- (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
+ (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
typedef enum VhostUserRequest {
VHOST_USER_NONE = 0,
@@ -54,7 +55,9 @@ typedef enum VhostUserRequest {
VHOST_USER_POSTCOPY_ADVISE = 28,
VHOST_USER_POSTCOPY_LISTEN = 29,
VHOST_USER_POSTCOPY_END = 30,
- VHOST_USER_MAX = 31
+ VHOST_USER_GET_INFLIGHT_FD = 31,
+ VHOST_USER_SET_INFLIGHT_FD = 32,
+ VHOST_USER_MAX = 33
} VhostUserRequest;
typedef enum VhostUserSlaveRequest {
@@ -112,6 +115,13 @@ typedef struct VhostUserVringArea {
uint64_t offset;
} VhostUserVringArea;
+typedef struct VhostUserInflight {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+ uint16_t num_queues;
+ uint16_t queue_size;
+} VhostUserInflight;
+
typedef struct VhostUserMsg {
union {
uint32_t master; /* a VhostUserRequest value */
@@ -131,6 +141,7 @@ typedef struct VhostUserMsg {
struct vhost_vring_addr addr;
VhostUserMemory memory;
VhostUserLog log;
+ VhostUserInflight inflight;
struct vhost_iotlb_msg iotlb;
VhostUserCryptoSessionParam crypto_session;
VhostUserVringArea area;
@@ -148,6 +159,7 @@ typedef struct VhostUserMsg {
/* vhost_user.c */
int vhost_user_msg_handler(int vid, int fd);
int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
+void *inflight_mem_alloc(const char *name, size_t size, int *fd);
/* socket.c */
int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
--
2.11.0
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-26 9:40 ` [dpdk-dev] [PATCH v3] " Li Lin
2019-04-26 9:40 ` Li Lin
@ 2019-04-28 11:17 ` Tiwei Bie
2019-04-28 11:17 ` Tiwei Bie
2019-04-29 4:07 ` lin li
2019-05-05 9:02 ` [dpdk-dev] [PATCH v4] " Li Lin
2 siblings, 2 replies; 25+ messages in thread
From: Tiwei Bie @ 2019-04-28 11:17 UTC (permalink / raw)
To: Li Lin
Cc: maxime.coquelin, zhihong.wang, dev, dariusz.stojaczyk,
changpeng.liu, james.r.harris, lilin24, Ni Xun, Zhang Yu
On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
> From: lilin24 <lilin24@baidu.com>
>
> This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
> buffer between qemu and backend.
>
> Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
> shared buffer from backend. Then qemu should send it back
> through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
>
> This shared buffer is used to process inflight I/O when backend
> reconnect.
>
> Signed-off-by: lilin24 <lilin24@baidu.com>
You need to use your real name here.
> Signed-off-by: Ni Xun <nixun@baidu.com>
> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
>
> v2:
> - add set&clr inflight entry function
> v3:
> - simplified some function implementations
>
> ---
You can put change logs here (i.e. after ---).
> lib/librte_vhost/rte_vhost.h | 53 ++++++++++
> lib/librte_vhost/vhost.c | 42 ++++++++
> lib/librte_vhost/vhost.h | 11 ++
> lib/librte_vhost/vhost_user.c | 231 ++++++++++++++++++++++++++++++++++++++++++
> lib/librte_vhost/vhost_user.h | 16 ++-
> 5 files changed, 351 insertions(+), 2 deletions(-)
>
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index d2c0c93f4..bc25591a8 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -71,6 +71,10 @@ extern "C" {
> #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
> #endif
>
> +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
> +#endif
> +
> /** Indicate whether protocol features negotiation is supported. */
> #ifndef VHOST_USER_F_PROTOCOL_FEATURES
> #define VHOST_USER_F_PROTOCOL_FEATURES 30
> @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> struct rte_vhost_mem_region regions[];
> };
>
> +typedef struct VhostUserInflightEntry {
> + uint8_t inflight;
> +} VhostUserInflightEntry;
> +
> +typedef struct VhostInflightInfo {
> + uint16_t version;
> + uint16_t last_inflight_io;
> + uint16_t used_idx;
> + VhostUserInflightEntry desc[0];
> +} VhostInflightInfo;
Is there any details on above structure? Why does it not match
QueueRegionSplit or QueueRegionPacked structures described in
qemu/docs/interop/vhost-user.txt?
> +
> struct rte_vhost_vring {
> struct vring_desc *desc;
> struct vring_avail *avail;
> struct vring_used *used;
> uint64_t log_guest_addr;
>
> + VhostInflightInfo *inflight;
> + int inflight_flag;
> +
> /** Deprecated, use rte_vhost_vring_call() instead. */
> int callfd;
>
> @@ -632,6 +650,41 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> int rte_vhost_vring_call(int vid, uint16_t vring_idx);
>
> /**
> + * set inflight flag for a entry.
> + *
> + * @param vring
> + * the structure to hold the requested vring info
> + * @param idx
> + * inflight entry index
> + */
> +void rte_vhost_set_inflight(struct rte_vhost_vring *vring,
> + uint16_t idx);
> +
> +/**
> + * clear inflight flag for a entry.
> + *
> + * @param vring
> + * the structure to hold the requested vring info
> + * @param last_used_idx
> + * next free used_idx
> + * @param idx
> + * inflight entry index
> + */
> +void rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> + uint16_t last_used_idx, uint16_t idx);
> +
> +/**
> + * set last inflight io index.
> + *
> + * @param vring
> + * the structure to hold the requested vring info
> + * @param idx
> + * inflight entry index
> + */
> +void rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring,
> + uint16_t idx);
New APIs should be experimental and also need to be
added in rte_vhost_version.map file.
> +
> +/**
> * Get vhost RX queue avail count.
> *
> * @param vid
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 163f4595e..9ba692935 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -76,6 +76,8 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
> close(vq->callfd);
> if (vq->kickfd >= 0)
> close(vq->kickfd);
> + if (vq->inflight)
> + vq->inflight = NULL;
> }
>
> /*
> @@ -589,6 +591,11 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> vring->kickfd = vq->kickfd;
> vring->size = vq->size;
>
> + vring->inflight = vq->inflight;
> +
> + vring->inflight_flag = vq->inflight_flag;
> + vq->inflight_flag = 0;
This will break the ABI. Better to introduce a new API to do this.
> +
> return 0;
> }
>
> @@ -617,6 +624,41 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
> return 0;
> }
>
> +void
> +rte_vhost_set_inflight(struct rte_vhost_vring *vring, uint16_t idx)
> +{
> + VhostInflightInfo *inflight = vring->inflight;
> + if (unlikely(!inflight))
> + return;
> + inflight->desc[idx].inflight = 1;
> +}
> +
> +void
> +rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> + uint16_t last_used_idx, uint16_t idx)
> +{
> + VhostInflightInfo *inflight = vring->inflight;
> +
> + if (unlikely(!inflight))
> + return;
> +
> + rte_compiler_barrier();
> + inflight->desc[idx].inflight = 0;
> + rte_compiler_barrier();
> + inflight->used_idx = last_used_idx;
> +}
> +
> +void
> +rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring, uint16_t idx)
> +{
> + VhostInflightInfo *inflight = vring->inflight;
> +
> + if (unlikely(!inflight))
> + return;
> +
> + inflight->last_inflight_io = idx;
> +}
The rte_vhost_vring is used to return information to apps.
IIUC, you want to update vq->inflight. So the rte_vhost_vring
param should be replaced by vid + vring_idx.
> +
> uint16_t
> rte_vhost_avail_entries(int vid, uint16_t queue_id)
> {
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index e9138dfab..537d74c71 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -128,6 +128,10 @@ struct vhost_virtqueue {
> /* Physical address of used ring, for logging */
> uint64_t log_guest_addr;
>
> + /* Inflight share memory info */
> + VhostInflightInfo *inflight;
> + bool inflight_flag;
> +
> uint16_t nr_zmbuf;
> uint16_t zmbuf_size;
> uint16_t last_zmbuf_idx;
> @@ -286,6 +290,12 @@ struct guest_page {
> uint64_t size;
> };
>
> +typedef struct VuDevInflightInfo {
> + int fd;
> + void *addr;
> + uint64_t size;
> +} VuDevInflightInfo;
Please follow DPDK's coding style when defining internal structure.
> +
> /**
> * Device structure contains all configuration information relating
> * to the device.
> @@ -303,6 +313,7 @@ struct virtio_net {
> uint32_t nr_vring;
> int dequeue_zero_copy;
> struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> + VuDevInflightInfo inflight_info;
> #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> char ifname[IF_NAME_SZ];
> uint64_t log_size;
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
[...]
> +static uint32_t get_pervq_shm_size(uint16_t queue_size)
> +{
> + return ALIGN_UP(sizeof(VhostUserInflightEntry) * queue_size +
> + sizeof(uint16_t) * 3, INFLIGHT_ALIGNMENT);
> +}
> +
> +static int
> +vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> + int main_fd __rte_unused)
> +{
> + int fd;
> + uint64_t mmap_size;
> + void *addr;
> + uint16_t num_queues, queue_size;
> + struct virtio_net *dev = *pdev;
> +
> + if (msg->size != sizeof(msg->payload.inflight)) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "Invalid get_inflight_fd message size is %d",
> + msg->size);
> + msg->payload.inflight.mmap_size = 0;
In this case, you can't touch the payload.
> + return 0;
And an error should be returned.
> + }
> +
> + num_queues = msg->payload.inflight.num_queues;
> + queue_size = msg->payload.inflight.queue_size;
> +
> + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
> + msg->payload.inflight.num_queues);
> + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
> + msg->payload.inflight.queue_size);
> + mmap_size = num_queues * get_pervq_shm_size(queue_size);
> +
> + addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
> + if (!addr) {
> + RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
> + msg->payload.inflight.mmap_size = 0;
> + return 0;
Should always return RTE_VHOST_MSG_RESULT_* in
message handler.
> + }
> + memset(addr, 0, mmap_size);
> +
> + dev->inflight_info.addr = addr;
> + dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
> + dev->inflight_info.fd = msg->fds[0] = fd;
> + msg->payload.inflight.mmap_offset = 0;
> + msg->fd_num = 1;
> +
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "send inflight mmap_size: %lu\n",
> + msg->payload.inflight.mmap_size);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "send inflight mmap_offset: %lu\n",
> + msg->payload.inflight.mmap_offset);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "send inflight fd: %d\n", msg->fds[0]);
> +
> + return RTE_VHOST_MSG_RESULT_REPLY;
> +}
> +
> +static int
> +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> + int main_fd __rte_unused)
> +{
> + int fd, i;
> + uint64_t mmap_size, mmap_offset;
> + uint16_t num_queues, queue_size;
> + uint32_t pervq_inflight_size;
> + void *rc;
> + struct vhost_virtqueue *vq;
> + struct virtio_net *dev = *pdev;
> +
> + fd = msg->fds[0];
> + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> + msg->size, fd);
> + return -1;
Ditto.
> + }
> +
> + mmap_size = msg->payload.inflight.mmap_size;
> + mmap_offset = msg->payload.inflight.mmap_offset;
> + num_queues = msg->payload.inflight.num_queues;
> + queue_size = msg->payload.inflight.queue_size;
> + pervq_inflight_size = get_pervq_shm_size(queue_size);
> +
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd mmap_size: %lu\n", mmap_size);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd num_queues: %u\n", num_queues);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd queue_size: %u\n", queue_size);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd fd: %d\n", fd);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd pervq_inflight_size: %d\n",
> + pervq_inflight_size);
> +
> + if (dev->inflight_info.addr)
> + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +
> + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> + fd, mmap_offset);
Why call it rc? Maybe addr is a better name?
> + if (rc == MAP_FAILED) {
> + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> + return -1;
Should always return RTE_VHOST_MSG_RESULT_* in
message handler.
> + }
> +
> + if (dev->inflight_info.fd)
> + close(dev->inflight_info.fd);
> +
> + dev->inflight_info.fd = fd;
> + dev->inflight_info.addr = rc;
> + dev->inflight_info.size = mmap_size;
> +
> + for (i = 0; i < num_queues; i++) {
> + vq = dev->virtqueue[i];
> + vq->inflight = (VhostInflightInfo *)rc;
> + rc = (void *)((char *)rc + pervq_inflight_size);
> + }
> +
> + return RTE_VHOST_MSG_RESULT_OK;
> +}
> +
> static int
> vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
> int main_fd __rte_unused)
> @@ -1202,6 +1402,29 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
> }
>
> static int
> +vhost_check_queue_inflights(struct vhost_virtqueue *vq)
> +{
> + uint16_t i = 0;
> +
> + if ((!vq->inflight))
> + return RTE_VHOST_MSG_RESULT_ERR;
Should check whether the feature is negotiated first.
> +
> + if (!vq->inflight->version) {
> + vq->inflight->version = INFLIGHT_VERSION;
> + return RTE_VHOST_MSG_RESULT_OK;
> + }
> +
> + for (i = 0; i < vq->size; i++) {
> + if (vq->inflight->desc[i].inflight == 1) {
> + vq->inflight_flag = 1;
> + break;
> + }
> + }
> +
> + return RTE_VHOST_MSG_RESULT_OK;
> +}
> +
> +static int
> vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> int main_fd __rte_unused)
> {
> @@ -1242,6 +1465,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> close(vq->kickfd);
> vq->kickfd = file.fd;
>
> + if (vhost_check_queue_inflights(vq)) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "Failed to inflights for vq: %d\n", file.index);
> + return RTE_VHOST_MSG_RESULT_ERR;
> + }
> +
> return RTE_VHOST_MSG_RESULT_OK;
> }
>
> @@ -1762,6 +1991,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
> [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
> [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
> [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
> + [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
> + [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
> };
>
>
> diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> index 2a650fe4b..35f969b1b 100644
> --- a/lib/librte_vhost/vhost_user.h
> +++ b/lib/librte_vhost/vhost_user.h
> @@ -23,7 +23,8 @@
> (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> - (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> + (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
It will advertise this feature for builtin net and crypto
backends. It's probably not what you intended.
>
> typedef enum VhostUserRequest {
> VHOST_USER_NONE = 0,
> @@ -54,7 +55,9 @@ typedef enum VhostUserRequest {
> VHOST_USER_POSTCOPY_ADVISE = 28,
> VHOST_USER_POSTCOPY_LISTEN = 29,
> VHOST_USER_POSTCOPY_END = 30,
> - VHOST_USER_MAX = 31
> + VHOST_USER_GET_INFLIGHT_FD = 31,
> + VHOST_USER_SET_INFLIGHT_FD = 32,
> + VHOST_USER_MAX = 33
> } VhostUserRequest;
>
> typedef enum VhostUserSlaveRequest {
> @@ -112,6 +115,13 @@ typedef struct VhostUserVringArea {
> uint64_t offset;
> } VhostUserVringArea;
>
> +typedef struct VhostUserInflight {
> + uint64_t mmap_size;
> + uint64_t mmap_offset;
> + uint16_t num_queues;
> + uint16_t queue_size;
> +} VhostUserInflight;
> +
> typedef struct VhostUserMsg {
> union {
> uint32_t master; /* a VhostUserRequest value */
> @@ -131,6 +141,7 @@ typedef struct VhostUserMsg {
> struct vhost_vring_addr addr;
> VhostUserMemory memory;
> VhostUserLog log;
> + VhostUserInflight inflight;
> struct vhost_iotlb_msg iotlb;
> VhostUserCryptoSessionParam crypto_session;
> VhostUserVringArea area;
> @@ -148,6 +159,7 @@ typedef struct VhostUserMsg {
> /* vhost_user.c */
> int vhost_user_msg_handler(int vid, int fd);
> int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
> +void *inflight_mem_alloc(const char *name, size_t size, int *fd);
>
> /* socket.c */
> int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
> --
> 2.11.0
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-28 11:17 ` Tiwei Bie
@ 2019-04-28 11:17 ` Tiwei Bie
2019-04-29 4:07 ` lin li
1 sibling, 0 replies; 25+ messages in thread
From: Tiwei Bie @ 2019-04-28 11:17 UTC (permalink / raw)
To: Li Lin
Cc: maxime.coquelin, zhihong.wang, dev, dariusz.stojaczyk,
changpeng.liu, james.r.harris, lilin24, Ni Xun, Zhang Yu
On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
> From: lilin24 <lilin24@baidu.com>
>
> This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
> buffer between qemu and backend.
>
> Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
> shared buffer from backend. Then qemu should send it back
> through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
>
> This shared buffer is used to process inflight I/O when backend
> reconnect.
>
> Signed-off-by: lilin24 <lilin24@baidu.com>
You need to use your real name here.
> Signed-off-by: Ni Xun <nixun@baidu.com>
> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
>
> v2:
> - add set&clr inflight entry function
> v3:
> - simplified some function implementations
>
> ---
You can put change logs here (i.e. after ---).
> lib/librte_vhost/rte_vhost.h | 53 ++++++++++
> lib/librte_vhost/vhost.c | 42 ++++++++
> lib/librte_vhost/vhost.h | 11 ++
> lib/librte_vhost/vhost_user.c | 231 ++++++++++++++++++++++++++++++++++++++++++
> lib/librte_vhost/vhost_user.h | 16 ++-
> 5 files changed, 351 insertions(+), 2 deletions(-)
>
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index d2c0c93f4..bc25591a8 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -71,6 +71,10 @@ extern "C" {
> #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
> #endif
>
> +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
> +#endif
> +
> /** Indicate whether protocol features negotiation is supported. */
> #ifndef VHOST_USER_F_PROTOCOL_FEATURES
> #define VHOST_USER_F_PROTOCOL_FEATURES 30
> @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> struct rte_vhost_mem_region regions[];
> };
>
> +typedef struct VhostUserInflightEntry {
> + uint8_t inflight;
> +} VhostUserInflightEntry;
> +
> +typedef struct VhostInflightInfo {
> + uint16_t version;
> + uint16_t last_inflight_io;
> + uint16_t used_idx;
> + VhostUserInflightEntry desc[0];
> +} VhostInflightInfo;
Is there any details on above structure? Why does it not match
QueueRegionSplit or QueueRegionPacked structures described in
qemu/docs/interop/vhost-user.txt?
> +
> struct rte_vhost_vring {
> struct vring_desc *desc;
> struct vring_avail *avail;
> struct vring_used *used;
> uint64_t log_guest_addr;
>
> + VhostInflightInfo *inflight;
> + int inflight_flag;
> +
> /** Deprecated, use rte_vhost_vring_call() instead. */
> int callfd;
>
> @@ -632,6 +650,41 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> int rte_vhost_vring_call(int vid, uint16_t vring_idx);
>
> /**
> + * set inflight flag for a entry.
> + *
> + * @param vring
> + * the structure to hold the requested vring info
> + * @param idx
> + * inflight entry index
> + */
> +void rte_vhost_set_inflight(struct rte_vhost_vring *vring,
> + uint16_t idx);
> +
> +/**
> + * clear inflight flag for a entry.
> + *
> + * @param vring
> + * the structure to hold the requested vring info
> + * @param last_used_idx
> + * next free used_idx
> + * @param idx
> + * inflight entry index
> + */
> +void rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> + uint16_t last_used_idx, uint16_t idx);
> +
> +/**
> + * set last inflight io index.
> + *
> + * @param vring
> + * the structure to hold the requested vring info
> + * @param idx
> + * inflight entry index
> + */
> +void rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring,
> + uint16_t idx);
New APIs should be experimental and also need to be
added in rte_vhost_version.map file.
> +
> +/**
> * Get vhost RX queue avail count.
> *
> * @param vid
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 163f4595e..9ba692935 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -76,6 +76,8 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
> close(vq->callfd);
> if (vq->kickfd >= 0)
> close(vq->kickfd);
> + if (vq->inflight)
> + vq->inflight = NULL;
> }
>
> /*
> @@ -589,6 +591,11 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> vring->kickfd = vq->kickfd;
> vring->size = vq->size;
>
> + vring->inflight = vq->inflight;
> +
> + vring->inflight_flag = vq->inflight_flag;
> + vq->inflight_flag = 0;
This will break the ABI. Better to introduce a new API to do this.
> +
> return 0;
> }
>
> @@ -617,6 +624,41 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
> return 0;
> }
>
> +void
> +rte_vhost_set_inflight(struct rte_vhost_vring *vring, uint16_t idx)
> +{
> + VhostInflightInfo *inflight = vring->inflight;
> + if (unlikely(!inflight))
> + return;
> + inflight->desc[idx].inflight = 1;
> +}
> +
> +void
> +rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> + uint16_t last_used_idx, uint16_t idx)
> +{
> + VhostInflightInfo *inflight = vring->inflight;
> +
> + if (unlikely(!inflight))
> + return;
> +
> + rte_compiler_barrier();
> + inflight->desc[idx].inflight = 0;
> + rte_compiler_barrier();
> + inflight->used_idx = last_used_idx;
> +}
> +
> +void
> +rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring, uint16_t idx)
> +{
> + VhostInflightInfo *inflight = vring->inflight;
> +
> + if (unlikely(!inflight))
> + return;
> +
> + inflight->last_inflight_io = idx;
> +}
The rte_vhost_vring is used to return information to apps.
IIUC, you want to update vq->inflight. So the rte_vhost_vring
param should be replaced by vid + vring_idx.
> +
> uint16_t
> rte_vhost_avail_entries(int vid, uint16_t queue_id)
> {
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index e9138dfab..537d74c71 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -128,6 +128,10 @@ struct vhost_virtqueue {
> /* Physical address of used ring, for logging */
> uint64_t log_guest_addr;
>
> + /* Inflight share memory info */
> + VhostInflightInfo *inflight;
> + bool inflight_flag;
> +
> uint16_t nr_zmbuf;
> uint16_t zmbuf_size;
> uint16_t last_zmbuf_idx;
> @@ -286,6 +290,12 @@ struct guest_page {
> uint64_t size;
> };
>
> +typedef struct VuDevInflightInfo {
> + int fd;
> + void *addr;
> + uint64_t size;
> +} VuDevInflightInfo;
Please follow DPDK's coding style when defining internal structure.
> +
> /**
> * Device structure contains all configuration information relating
> * to the device.
> @@ -303,6 +313,7 @@ struct virtio_net {
> uint32_t nr_vring;
> int dequeue_zero_copy;
> struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> + VuDevInflightInfo inflight_info;
> #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> char ifname[IF_NAME_SZ];
> uint64_t log_size;
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
[...]
> +static uint32_t get_pervq_shm_size(uint16_t queue_size)
> +{
> + return ALIGN_UP(sizeof(VhostUserInflightEntry) * queue_size +
> + sizeof(uint16_t) * 3, INFLIGHT_ALIGNMENT);
> +}
> +
> +static int
> +vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> + int main_fd __rte_unused)
> +{
> + int fd;
> + uint64_t mmap_size;
> + void *addr;
> + uint16_t num_queues, queue_size;
> + struct virtio_net *dev = *pdev;
> +
> + if (msg->size != sizeof(msg->payload.inflight)) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "Invalid get_inflight_fd message size is %d",
> + msg->size);
> + msg->payload.inflight.mmap_size = 0;
In this case, you can't touch the payload.
> + return 0;
And an error should be returned.
> + }
> +
> + num_queues = msg->payload.inflight.num_queues;
> + queue_size = msg->payload.inflight.queue_size;
> +
> + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
> + msg->payload.inflight.num_queues);
> + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
> + msg->payload.inflight.queue_size);
> + mmap_size = num_queues * get_pervq_shm_size(queue_size);
> +
> + addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
> + if (!addr) {
> + RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
> + msg->payload.inflight.mmap_size = 0;
> + return 0;
Should always return RTE_VHOST_MSG_RESULT_* in
message handler.
> + }
> + memset(addr, 0, mmap_size);
> +
> + dev->inflight_info.addr = addr;
> + dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
> + dev->inflight_info.fd = msg->fds[0] = fd;
> + msg->payload.inflight.mmap_offset = 0;
> + msg->fd_num = 1;
> +
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "send inflight mmap_size: %lu\n",
> + msg->payload.inflight.mmap_size);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "send inflight mmap_offset: %lu\n",
> + msg->payload.inflight.mmap_offset);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "send inflight fd: %d\n", msg->fds[0]);
> +
> + return RTE_VHOST_MSG_RESULT_REPLY;
> +}
> +
> +static int
> +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> + int main_fd __rte_unused)
> +{
> + int fd, i;
> + uint64_t mmap_size, mmap_offset;
> + uint16_t num_queues, queue_size;
> + uint32_t pervq_inflight_size;
> + void *rc;
> + struct vhost_virtqueue *vq;
> + struct virtio_net *dev = *pdev;
> +
> + fd = msg->fds[0];
> + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> + msg->size, fd);
> + return -1;
Ditto.
> + }
> +
> + mmap_size = msg->payload.inflight.mmap_size;
> + mmap_offset = msg->payload.inflight.mmap_offset;
> + num_queues = msg->payload.inflight.num_queues;
> + queue_size = msg->payload.inflight.queue_size;
> + pervq_inflight_size = get_pervq_shm_size(queue_size);
> +
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd mmap_size: %lu\n", mmap_size);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd num_queues: %u\n", num_queues);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd queue_size: %u\n", queue_size);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd fd: %d\n", fd);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "set_inflight_fd pervq_inflight_size: %d\n",
> + pervq_inflight_size);
> +
> + if (dev->inflight_info.addr)
> + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +
> + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> + fd, mmap_offset);
Why call it rc? Maybe addr is a better name?
> + if (rc == MAP_FAILED) {
> + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> + return -1;
Should always return RTE_VHOST_MSG_RESULT_* in
message handler.
> + }
> +
> + if (dev->inflight_info.fd)
> + close(dev->inflight_info.fd);
> +
> + dev->inflight_info.fd = fd;
> + dev->inflight_info.addr = rc;
> + dev->inflight_info.size = mmap_size;
> +
> + for (i = 0; i < num_queues; i++) {
> + vq = dev->virtqueue[i];
> + vq->inflight = (VhostInflightInfo *)rc;
> + rc = (void *)((char *)rc + pervq_inflight_size);
> + }
> +
> + return RTE_VHOST_MSG_RESULT_OK;
> +}
> +
> static int
> vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
> int main_fd __rte_unused)
> @@ -1202,6 +1402,29 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
> }
>
> static int
> +vhost_check_queue_inflights(struct vhost_virtqueue *vq)
> +{
> + uint16_t i = 0;
> +
> + if ((!vq->inflight))
> + return RTE_VHOST_MSG_RESULT_ERR;
Should check whether the feature is negotiated first.
> +
> + if (!vq->inflight->version) {
> + vq->inflight->version = INFLIGHT_VERSION;
> + return RTE_VHOST_MSG_RESULT_OK;
> + }
> +
> + for (i = 0; i < vq->size; i++) {
> + if (vq->inflight->desc[i].inflight == 1) {
> + vq->inflight_flag = 1;
> + break;
> + }
> + }
> +
> + return RTE_VHOST_MSG_RESULT_OK;
> +}
> +
> +static int
> vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> int main_fd __rte_unused)
> {
> @@ -1242,6 +1465,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> close(vq->kickfd);
> vq->kickfd = file.fd;
>
> + if (vhost_check_queue_inflights(vq)) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "Failed to inflights for vq: %d\n", file.index);
> + return RTE_VHOST_MSG_RESULT_ERR;
> + }
> +
> return RTE_VHOST_MSG_RESULT_OK;
> }
>
> @@ -1762,6 +1991,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
> [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
> [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
> [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
> + [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
> + [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
> };
>
>
> diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> index 2a650fe4b..35f969b1b 100644
> --- a/lib/librte_vhost/vhost_user.h
> +++ b/lib/librte_vhost/vhost_user.h
> @@ -23,7 +23,8 @@
> (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> - (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> + (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
It will advertise this feature for builtin net and crypto
backends. It's probably not what you intended.
>
> typedef enum VhostUserRequest {
> VHOST_USER_NONE = 0,
> @@ -54,7 +55,9 @@ typedef enum VhostUserRequest {
> VHOST_USER_POSTCOPY_ADVISE = 28,
> VHOST_USER_POSTCOPY_LISTEN = 29,
> VHOST_USER_POSTCOPY_END = 30,
> - VHOST_USER_MAX = 31
> + VHOST_USER_GET_INFLIGHT_FD = 31,
> + VHOST_USER_SET_INFLIGHT_FD = 32,
> + VHOST_USER_MAX = 33
> } VhostUserRequest;
>
> typedef enum VhostUserSlaveRequest {
> @@ -112,6 +115,13 @@ typedef struct VhostUserVringArea {
> uint64_t offset;
> } VhostUserVringArea;
>
> +typedef struct VhostUserInflight {
> + uint64_t mmap_size;
> + uint64_t mmap_offset;
> + uint16_t num_queues;
> + uint16_t queue_size;
> +} VhostUserInflight;
> +
> typedef struct VhostUserMsg {
> union {
> uint32_t master; /* a VhostUserRequest value */
> @@ -131,6 +141,7 @@ typedef struct VhostUserMsg {
> struct vhost_vring_addr addr;
> VhostUserMemory memory;
> VhostUserLog log;
> + VhostUserInflight inflight;
> struct vhost_iotlb_msg iotlb;
> VhostUserCryptoSessionParam crypto_session;
> VhostUserVringArea area;
> @@ -148,6 +159,7 @@ typedef struct VhostUserMsg {
> /* vhost_user.c */
> int vhost_user_msg_handler(int vid, int fd);
> int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
> +void *inflight_mem_alloc(const char *name, size_t size, int *fd);
>
> /* socket.c */
> int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
> --
> 2.11.0
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-28 11:17 ` Tiwei Bie
2019-04-28 11:17 ` Tiwei Bie
@ 2019-04-29 4:07 ` lin li
2019-04-29 4:07 ` lin li
2019-04-29 10:54 ` Tiwei Bie
1 sibling, 2 replies; 25+ messages in thread
From: lin li @ 2019-04-29 4:07 UTC (permalink / raw)
To: Tiwei Bie
Cc: maxime.coquelin, zhihong.wang, dev, dariusz.stojaczyk,
changpeng.liu, james.r.harris, lilin24, Ni Xun, Zhang Yu
Tiwei Bie <tiwei.bie@intel.com> 于2019年4月28日周日 下午7:18写道:
>
> On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
> > From: lilin24 <lilin24@baidu.com>
> >
> > This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> > and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
> > buffer between qemu and backend.
> >
> > Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
> > shared buffer from backend. Then qemu should send it back
> > through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
> >
> > This shared buffer is used to process inflight I/O when backend
> > reconnect.
> >
> > Signed-off-by: lilin24 <lilin24@baidu.com>
>
> You need to use your real name here.
>
> > Signed-off-by: Ni Xun <nixun@baidu.com>
> > Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> >
> > v2:
> > - add set&clr inflight entry function
> > v3:
> > - simplified some function implementations
> >
> > ---
>
> You can put change logs here (i.e. after ---).
>
> > lib/librte_vhost/rte_vhost.h | 53 ++++++++++
> > lib/librte_vhost/vhost.c | 42 ++++++++
> > lib/librte_vhost/vhost.h | 11 ++
> > lib/librte_vhost/vhost_user.c | 231 ++++++++++++++++++++++++++++++++++++++++++
> > lib/librte_vhost/vhost_user.h | 16 ++-
> > 5 files changed, 351 insertions(+), 2 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> > index d2c0c93f4..bc25591a8 100644
> > --- a/lib/librte_vhost/rte_vhost.h
> > +++ b/lib/librte_vhost/rte_vhost.h
> > @@ -71,6 +71,10 @@ extern "C" {
> > #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
> > #endif
> >
> > +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> > +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
> > +#endif
> > +
> > /** Indicate whether protocol features negotiation is supported. */
> > #ifndef VHOST_USER_F_PROTOCOL_FEATURES
> > #define VHOST_USER_F_PROTOCOL_FEATURES 30
> > @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> > struct rte_vhost_mem_region regions[];
> > };
> >
> > +typedef struct VhostUserInflightEntry {
> > + uint8_t inflight;
> > +} VhostUserInflightEntry;
> > +
> > +typedef struct VhostInflightInfo {
> > + uint16_t version;
> > + uint16_t last_inflight_io;
> > + uint16_t used_idx;
> > + VhostUserInflightEntry desc[0];
> > +} VhostInflightInfo;
>
> Is there any details on above structure? Why does it not match
> QueueRegionSplit or QueueRegionPacked structures described in
> qemu/docs/interop/vhost-user.txt?
Qemu have its vhost-user backend,qemu did the submission of IO in it.
The implementation of dpdk is more general. It is just to mark inflight entry.
The submission of inflight entry is handle over to different backends.
They have their own ways to handle it, such as spdk.
So there are some differences in data structure.
>
> > +
> > struct rte_vhost_vring {
> > struct vring_desc *desc;
> > struct vring_avail *avail;
> > struct vring_used *used;
> > uint64_t log_guest_addr;
> >
> > + VhostInflightInfo *inflight;
> > + int inflight_flag;
> > +
> > /** Deprecated, use rte_vhost_vring_call() instead. */
> > int callfd;
> >
> > @@ -632,6 +650,41 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> > int rte_vhost_vring_call(int vid, uint16_t vring_idx);
> >
> > /**
> > + * set inflight flag for a entry.
> > + *
> > + * @param vring
> > + * the structure to hold the requested vring info
> > + * @param idx
> > + * inflight entry index
> > + */
> > +void rte_vhost_set_inflight(struct rte_vhost_vring *vring,
> > + uint16_t idx);
> > +
> > +/**
> > + * clear inflight flag for a entry.
> > + *
> > + * @param vring
> > + * the structure to hold the requested vring info
> > + * @param last_used_idx
> > + * next free used_idx
> > + * @param idx
> > + * inflight entry index
> > + */
> > +void rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> > + uint16_t last_used_idx, uint16_t idx);
> > +
> > +/**
> > + * set last inflight io index.
> > + *
> > + * @param vring
> > + * the structure to hold the requested vring info
> > + * @param idx
> > + * inflight entry index
> > + */
> > +void rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring,
> > + uint16_t idx);
>
> New APIs should be experimental and also need to be
> added in rte_vhost_version.map file.
>
> > +
> > +/**
> > * Get vhost RX queue avail count.
> > *
> > * @param vid
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > index 163f4595e..9ba692935 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -76,6 +76,8 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
> > close(vq->callfd);
> > if (vq->kickfd >= 0)
> > close(vq->kickfd);
> > + if (vq->inflight)
> > + vq->inflight = NULL;
> > }
> >
> > /*
> > @@ -589,6 +591,11 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> > vring->kickfd = vq->kickfd;
> > vring->size = vq->size;
> >
> > + vring->inflight = vq->inflight;
> > +
> > + vring->inflight_flag = vq->inflight_flag;
> > + vq->inflight_flag = 0;
>
> This will break the ABI. Better to introduce a new API to do this.
>
> > +
> > return 0;
> > }
> >
> > @@ -617,6 +624,41 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
> > return 0;
> > }
> >
> > +void
> > +rte_vhost_set_inflight(struct rte_vhost_vring *vring, uint16_t idx)
> > +{
> > + VhostInflightInfo *inflight = vring->inflight;
> > + if (unlikely(!inflight))
> > + return;
> > + inflight->desc[idx].inflight = 1;
> > +}
> > +
> > +void
> > +rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> > + uint16_t last_used_idx, uint16_t idx)
> > +{
> > + VhostInflightInfo *inflight = vring->inflight;
> > +
> > + if (unlikely(!inflight))
> > + return;
> > +
> > + rte_compiler_barrier();
> > + inflight->desc[idx].inflight = 0;
> > + rte_compiler_barrier();
> > + inflight->used_idx = last_used_idx;
> > +}
> > +
> > +void
> > +rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring, uint16_t idx)
> > +{
> > + VhostInflightInfo *inflight = vring->inflight;
> > +
> > + if (unlikely(!inflight))
> > + return;
> > +
> > + inflight->last_inflight_io = idx;
> > +}
>
> The rte_vhost_vring is used to return information to apps.
>
> IIUC, you want to update vq->inflight. So the rte_vhost_vring
> param should be replaced by vid + vring_idx.
>
> > +
> > uint16_t
> > rte_vhost_avail_entries(int vid, uint16_t queue_id)
> > {
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index e9138dfab..537d74c71 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -128,6 +128,10 @@ struct vhost_virtqueue {
> > /* Physical address of used ring, for logging */
> > uint64_t log_guest_addr;
> >
> > + /* Inflight share memory info */
> > + VhostInflightInfo *inflight;
> > + bool inflight_flag;
> > +
> > uint16_t nr_zmbuf;
> > uint16_t zmbuf_size;
> > uint16_t last_zmbuf_idx;
> > @@ -286,6 +290,12 @@ struct guest_page {
> > uint64_t size;
> > };
> >
> > +typedef struct VuDevInflightInfo {
> > + int fd;
> > + void *addr;
> > + uint64_t size;
> > +} VuDevInflightInfo;
>
> Please follow DPDK's coding style when defining internal structure.
>
> > +
> > /**
> > * Device structure contains all configuration information relating
> > * to the device.
> > @@ -303,6 +313,7 @@ struct virtio_net {
> > uint32_t nr_vring;
> > int dequeue_zero_copy;
> > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > + VuDevInflightInfo inflight_info;
> > #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> > char ifname[IF_NAME_SZ];
> > uint64_t log_size;
> > diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> [...]
> > +static uint32_t get_pervq_shm_size(uint16_t queue_size)
> > +{
> > + return ALIGN_UP(sizeof(VhostUserInflightEntry) * queue_size +
> > + sizeof(uint16_t) * 3, INFLIGHT_ALIGNMENT);
> > +}
> > +
> > +static int
> > +vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > + int main_fd __rte_unused)
> > +{
> > + int fd;
> > + uint64_t mmap_size;
> > + void *addr;
> > + uint16_t num_queues, queue_size;
> > + struct virtio_net *dev = *pdev;
> > +
> > + if (msg->size != sizeof(msg->payload.inflight)) {
> > + RTE_LOG(ERR, VHOST_CONFIG,
> > + "Invalid get_inflight_fd message size is %d",
> > + msg->size);
> > + msg->payload.inflight.mmap_size = 0;
>
> In this case, you can't touch the payload.
>
> > + return 0;
>
> And an error should be returned.
>
> > + }
> > +
> > + num_queues = msg->payload.inflight.num_queues;
> > + queue_size = msg->payload.inflight.queue_size;
> > +
> > + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
> > + msg->payload.inflight.num_queues);
> > + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
> > + msg->payload.inflight.queue_size);
> > + mmap_size = num_queues * get_pervq_shm_size(queue_size);
> > +
> > + addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
> > + if (!addr) {
> > + RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
> > + msg->payload.inflight.mmap_size = 0;
> > + return 0;
>
> Should always return RTE_VHOST_MSG_RESULT_* in
> message handler.
>
> > + }
> > + memset(addr, 0, mmap_size);
> > +
> > + dev->inflight_info.addr = addr;
> > + dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
> > + dev->inflight_info.fd = msg->fds[0] = fd;
> > + msg->payload.inflight.mmap_offset = 0;
> > + msg->fd_num = 1;
> > +
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "send inflight mmap_size: %lu\n",
> > + msg->payload.inflight.mmap_size);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "send inflight mmap_offset: %lu\n",
> > + msg->payload.inflight.mmap_offset);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "send inflight fd: %d\n", msg->fds[0]);
> > +
> > + return RTE_VHOST_MSG_RESULT_REPLY;
> > +}
> > +
> > +static int
> > +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > + int main_fd __rte_unused)
> > +{
> > + int fd, i;
> > + uint64_t mmap_size, mmap_offset;
> > + uint16_t num_queues, queue_size;
> > + uint32_t pervq_inflight_size;
> > + void *rc;
> > + struct vhost_virtqueue *vq;
> > + struct virtio_net *dev = *pdev;
> > +
> > + fd = msg->fds[0];
> > + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> > + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> > + msg->size, fd);
> > + return -1;
>
> Ditto.
>
> > + }
> > +
> > + mmap_size = msg->payload.inflight.mmap_size;
> > + mmap_offset = msg->payload.inflight.mmap_offset;
> > + num_queues = msg->payload.inflight.num_queues;
> > + queue_size = msg->payload.inflight.queue_size;
> > + pervq_inflight_size = get_pervq_shm_size(queue_size);
> > +
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd mmap_size: %lu\n", mmap_size);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd num_queues: %u\n", num_queues);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd queue_size: %u\n", queue_size);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd fd: %d\n", fd);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd pervq_inflight_size: %d\n",
> > + pervq_inflight_size);
> > +
> > + if (dev->inflight_info.addr)
> > + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > +
> > + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > + fd, mmap_offset);
>
> Why call it rc? Maybe addr is a better name?
In some scenarios, shared memory is reallocated or resized by qemu, so
again mmap is needed.
>
> > + if (rc == MAP_FAILED) {
> > + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> > + return -1;
>
> Should always return RTE_VHOST_MSG_RESULT_* in
> message handler.
>
> > + }
> > +
> > + if (dev->inflight_info.fd)
> > + close(dev->inflight_info.fd);
> > +
> > + dev->inflight_info.fd = fd;
> > + dev->inflight_info.addr = rc;
> > + dev->inflight_info.size = mmap_size;
> > +
> > + for (i = 0; i < num_queues; i++) {
> > + vq = dev->virtqueue[i];
> > + vq->inflight = (VhostInflightInfo *)rc;
> > + rc = (void *)((char *)rc + pervq_inflight_size);
> > + }
> > +
> > + return RTE_VHOST_MSG_RESULT_OK;
> > +}
> > +
> > static int
> > vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
> > int main_fd __rte_unused)
> > @@ -1202,6 +1402,29 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
> > }
> >
> > static int
> > +vhost_check_queue_inflights(struct vhost_virtqueue *vq)
> > +{
> > + uint16_t i = 0;
> > +
> > + if ((!vq->inflight))
> > + return RTE_VHOST_MSG_RESULT_ERR;
>
> Should check whether the feature is negotiated first.
>
> > +
> > + if (!vq->inflight->version) {
> > + vq->inflight->version = INFLIGHT_VERSION;
> > + return RTE_VHOST_MSG_RESULT_OK;
> > + }
> > +
> > + for (i = 0; i < vq->size; i++) {
> > + if (vq->inflight->desc[i].inflight == 1) {
> > + vq->inflight_flag = 1;
> > + break;
> > + }
> > + }
> > +
> > + return RTE_VHOST_MSG_RESULT_OK;
> > +}
> > +
> > +static int
> > vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> > int main_fd __rte_unused)
> > {
> > @@ -1242,6 +1465,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> > close(vq->kickfd);
> > vq->kickfd = file.fd;
> >
> > + if (vhost_check_queue_inflights(vq)) {
> > + RTE_LOG(ERR, VHOST_CONFIG,
> > + "Failed to inflights for vq: %d\n", file.index);
> > + return RTE_VHOST_MSG_RESULT_ERR;
> > + }
> > +
> > return RTE_VHOST_MSG_RESULT_OK;
> > }
> >
> > @@ -1762,6 +1991,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
> > [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
> > [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
> > [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
> > + [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
> > + [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
> > };
> >
> >
> > diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> > index 2a650fe4b..35f969b1b 100644
> > --- a/lib/librte_vhost/vhost_user.h
> > +++ b/lib/librte_vhost/vhost_user.h
> > @@ -23,7 +23,8 @@
> > (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> > (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> > (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> > - (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> > + (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
>
> It will advertise this feature for builtin net and crypto
> backends. It's probably not what you intended.
>
Indeed, this feature is mainly used for spdk-like backends. You mean
this function is disabled by default?
> >
> > typedef enum VhostUserRequest {
> > VHOST_USER_NONE = 0,
> > @@ -54,7 +55,9 @@ typedef enum VhostUserRequest {
> > VHOST_USER_POSTCOPY_ADVISE = 28,
> > VHOST_USER_POSTCOPY_LISTEN = 29,
> > VHOST_USER_POSTCOPY_END = 30,
> > - VHOST_USER_MAX = 31
> > + VHOST_USER_GET_INFLIGHT_FD = 31,
> > + VHOST_USER_SET_INFLIGHT_FD = 32,
> > + VHOST_USER_MAX = 33
> > } VhostUserRequest;
> >
> > typedef enum VhostUserSlaveRequest {
> > @@ -112,6 +115,13 @@ typedef struct VhostUserVringArea {
> > uint64_t offset;
> > } VhostUserVringArea;
> >
> > +typedef struct VhostUserInflight {
> > + uint64_t mmap_size;
> > + uint64_t mmap_offset;
> > + uint16_t num_queues;
> > + uint16_t queue_size;
> > +} VhostUserInflight;
> > +
> > typedef struct VhostUserMsg {
> > union {
> > uint32_t master; /* a VhostUserRequest value */
> > @@ -131,6 +141,7 @@ typedef struct VhostUserMsg {
> > struct vhost_vring_addr addr;
> > VhostUserMemory memory;
> > VhostUserLog log;
> > + VhostUserInflight inflight;
> > struct vhost_iotlb_msg iotlb;
> > VhostUserCryptoSessionParam crypto_session;
> > VhostUserVringArea area;
> > @@ -148,6 +159,7 @@ typedef struct VhostUserMsg {
> > /* vhost_user.c */
> > int vhost_user_msg_handler(int vid, int fd);
> > int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
> > +void *inflight_mem_alloc(const char *name, size_t size, int *fd);
> >
> > /* socket.c */
> > int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
> > --
> > 2.11.0
> >
Thank you for your comments. They will be revised in the next version.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-29 4:07 ` lin li
@ 2019-04-29 4:07 ` lin li
2019-04-29 10:54 ` Tiwei Bie
1 sibling, 0 replies; 25+ messages in thread
From: lin li @ 2019-04-29 4:07 UTC (permalink / raw)
To: Tiwei Bie
Cc: maxime.coquelin, zhihong.wang, dev, dariusz.stojaczyk,
changpeng.liu, james.r.harris, lilin24, Ni Xun, Zhang Yu
Tiwei Bie <tiwei.bie@intel.com> 于2019年4月28日周日 下午7:18写道:
>
> On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
> > From: lilin24 <lilin24@baidu.com>
> >
> > This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> > and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
> > buffer between qemu and backend.
> >
> > Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
> > shared buffer from backend. Then qemu should send it back
> > through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
> >
> > This shared buffer is used to process inflight I/O when backend
> > reconnect.
> >
> > Signed-off-by: lilin24 <lilin24@baidu.com>
>
> You need to use your real name here.
>
> > Signed-off-by: Ni Xun <nixun@baidu.com>
> > Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> >
> > v2:
> > - add set&clr inflight entry function
> > v3:
> > - simplified some function implementations
> >
> > ---
>
> You can put change logs here (i.e. after ---).
>
> > lib/librte_vhost/rte_vhost.h | 53 ++++++++++
> > lib/librte_vhost/vhost.c | 42 ++++++++
> > lib/librte_vhost/vhost.h | 11 ++
> > lib/librte_vhost/vhost_user.c | 231 ++++++++++++++++++++++++++++++++++++++++++
> > lib/librte_vhost/vhost_user.h | 16 ++-
> > 5 files changed, 351 insertions(+), 2 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> > index d2c0c93f4..bc25591a8 100644
> > --- a/lib/librte_vhost/rte_vhost.h
> > +++ b/lib/librte_vhost/rte_vhost.h
> > @@ -71,6 +71,10 @@ extern "C" {
> > #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
> > #endif
> >
> > +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> > +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
> > +#endif
> > +
> > /** Indicate whether protocol features negotiation is supported. */
> > #ifndef VHOST_USER_F_PROTOCOL_FEATURES
> > #define VHOST_USER_F_PROTOCOL_FEATURES 30
> > @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> > struct rte_vhost_mem_region regions[];
> > };
> >
> > +typedef struct VhostUserInflightEntry {
> > + uint8_t inflight;
> > +} VhostUserInflightEntry;
> > +
> > +typedef struct VhostInflightInfo {
> > + uint16_t version;
> > + uint16_t last_inflight_io;
> > + uint16_t used_idx;
> > + VhostUserInflightEntry desc[0];
> > +} VhostInflightInfo;
>
> Is there any details on above structure? Why does it not match
> QueueRegionSplit or QueueRegionPacked structures described in
> qemu/docs/interop/vhost-user.txt?
Qemu have its vhost-user backend,qemu did the submission of IO in it.
The implementation of dpdk is more general. It is just to mark inflight entry.
The submission of inflight entry is handle over to different backends.
They have their own ways to handle it, such as spdk.
So there are some differences in data structure.
>
> > +
> > struct rte_vhost_vring {
> > struct vring_desc *desc;
> > struct vring_avail *avail;
> > struct vring_used *used;
> > uint64_t log_guest_addr;
> >
> > + VhostInflightInfo *inflight;
> > + int inflight_flag;
> > +
> > /** Deprecated, use rte_vhost_vring_call() instead. */
> > int callfd;
> >
> > @@ -632,6 +650,41 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> > int rte_vhost_vring_call(int vid, uint16_t vring_idx);
> >
> > /**
> > + * set inflight flag for a entry.
> > + *
> > + * @param vring
> > + * the structure to hold the requested vring info
> > + * @param idx
> > + * inflight entry index
> > + */
> > +void rte_vhost_set_inflight(struct rte_vhost_vring *vring,
> > + uint16_t idx);
> > +
> > +/**
> > + * clear inflight flag for a entry.
> > + *
> > + * @param vring
> > + * the structure to hold the requested vring info
> > + * @param last_used_idx
> > + * next free used_idx
> > + * @param idx
> > + * inflight entry index
> > + */
> > +void rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> > + uint16_t last_used_idx, uint16_t idx);
> > +
> > +/**
> > + * set last inflight io index.
> > + *
> > + * @param vring
> > + * the structure to hold the requested vring info
> > + * @param idx
> > + * inflight entry index
> > + */
> > +void rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring,
> > + uint16_t idx);
>
> New APIs should be experimental and also need to be
> added in rte_vhost_version.map file.
>
> > +
> > +/**
> > * Get vhost RX queue avail count.
> > *
> > * @param vid
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > index 163f4595e..9ba692935 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -76,6 +76,8 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
> > close(vq->callfd);
> > if (vq->kickfd >= 0)
> > close(vq->kickfd);
> > + if (vq->inflight)
> > + vq->inflight = NULL;
> > }
> >
> > /*
> > @@ -589,6 +591,11 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> > vring->kickfd = vq->kickfd;
> > vring->size = vq->size;
> >
> > + vring->inflight = vq->inflight;
> > +
> > + vring->inflight_flag = vq->inflight_flag;
> > + vq->inflight_flag = 0;
>
> This will break the ABI. Better to introduce a new API to do this.
>
> > +
> > return 0;
> > }
> >
> > @@ -617,6 +624,41 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
> > return 0;
> > }
> >
> > +void
> > +rte_vhost_set_inflight(struct rte_vhost_vring *vring, uint16_t idx)
> > +{
> > + VhostInflightInfo *inflight = vring->inflight;
> > + if (unlikely(!inflight))
> > + return;
> > + inflight->desc[idx].inflight = 1;
> > +}
> > +
> > +void
> > +rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> > + uint16_t last_used_idx, uint16_t idx)
> > +{
> > + VhostInflightInfo *inflight = vring->inflight;
> > +
> > + if (unlikely(!inflight))
> > + return;
> > +
> > + rte_compiler_barrier();
> > + inflight->desc[idx].inflight = 0;
> > + rte_compiler_barrier();
> > + inflight->used_idx = last_used_idx;
> > +}
> > +
> > +void
> > +rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring, uint16_t idx)
> > +{
> > + VhostInflightInfo *inflight = vring->inflight;
> > +
> > + if (unlikely(!inflight))
> > + return;
> > +
> > + inflight->last_inflight_io = idx;
> > +}
>
> The rte_vhost_vring is used to return information to apps.
>
> IIUC, you want to update vq->inflight. So the rte_vhost_vring
> param should be replaced by vid + vring_idx.
>
> > +
> > uint16_t
> > rte_vhost_avail_entries(int vid, uint16_t queue_id)
> > {
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index e9138dfab..537d74c71 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -128,6 +128,10 @@ struct vhost_virtqueue {
> > /* Physical address of used ring, for logging */
> > uint64_t log_guest_addr;
> >
> > + /* Inflight share memory info */
> > + VhostInflightInfo *inflight;
> > + bool inflight_flag;
> > +
> > uint16_t nr_zmbuf;
> > uint16_t zmbuf_size;
> > uint16_t last_zmbuf_idx;
> > @@ -286,6 +290,12 @@ struct guest_page {
> > uint64_t size;
> > };
> >
> > +typedef struct VuDevInflightInfo {
> > + int fd;
> > + void *addr;
> > + uint64_t size;
> > +} VuDevInflightInfo;
>
> Please follow DPDK's coding style when defining internal structure.
>
> > +
> > /**
> > * Device structure contains all configuration information relating
> > * to the device.
> > @@ -303,6 +313,7 @@ struct virtio_net {
> > uint32_t nr_vring;
> > int dequeue_zero_copy;
> > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > + VuDevInflightInfo inflight_info;
> > #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> > char ifname[IF_NAME_SZ];
> > uint64_t log_size;
> > diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> [...]
> > +static uint32_t get_pervq_shm_size(uint16_t queue_size)
> > +{
> > + return ALIGN_UP(sizeof(VhostUserInflightEntry) * queue_size +
> > + sizeof(uint16_t) * 3, INFLIGHT_ALIGNMENT);
> > +}
> > +
> > +static int
> > +vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > + int main_fd __rte_unused)
> > +{
> > + int fd;
> > + uint64_t mmap_size;
> > + void *addr;
> > + uint16_t num_queues, queue_size;
> > + struct virtio_net *dev = *pdev;
> > +
> > + if (msg->size != sizeof(msg->payload.inflight)) {
> > + RTE_LOG(ERR, VHOST_CONFIG,
> > + "Invalid get_inflight_fd message size is %d",
> > + msg->size);
> > + msg->payload.inflight.mmap_size = 0;
>
> In this case, you can't touch the payload.
>
> > + return 0;
>
> And an error should be returned.
>
> > + }
> > +
> > + num_queues = msg->payload.inflight.num_queues;
> > + queue_size = msg->payload.inflight.queue_size;
> > +
> > + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
> > + msg->payload.inflight.num_queues);
> > + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
> > + msg->payload.inflight.queue_size);
> > + mmap_size = num_queues * get_pervq_shm_size(queue_size);
> > +
> > + addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
> > + if (!addr) {
> > + RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
> > + msg->payload.inflight.mmap_size = 0;
> > + return 0;
>
> Should always return RTE_VHOST_MSG_RESULT_* in
> message handler.
>
> > + }
> > + memset(addr, 0, mmap_size);
> > +
> > + dev->inflight_info.addr = addr;
> > + dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
> > + dev->inflight_info.fd = msg->fds[0] = fd;
> > + msg->payload.inflight.mmap_offset = 0;
> > + msg->fd_num = 1;
> > +
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "send inflight mmap_size: %lu\n",
> > + msg->payload.inflight.mmap_size);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "send inflight mmap_offset: %lu\n",
> > + msg->payload.inflight.mmap_offset);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "send inflight fd: %d\n", msg->fds[0]);
> > +
> > + return RTE_VHOST_MSG_RESULT_REPLY;
> > +}
> > +
> > +static int
> > +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > + int main_fd __rte_unused)
> > +{
> > + int fd, i;
> > + uint64_t mmap_size, mmap_offset;
> > + uint16_t num_queues, queue_size;
> > + uint32_t pervq_inflight_size;
> > + void *rc;
> > + struct vhost_virtqueue *vq;
> > + struct virtio_net *dev = *pdev;
> > +
> > + fd = msg->fds[0];
> > + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> > + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> > + msg->size, fd);
> > + return -1;
>
> Ditto.
>
> > + }
> > +
> > + mmap_size = msg->payload.inflight.mmap_size;
> > + mmap_offset = msg->payload.inflight.mmap_offset;
> > + num_queues = msg->payload.inflight.num_queues;
> > + queue_size = msg->payload.inflight.queue_size;
> > + pervq_inflight_size = get_pervq_shm_size(queue_size);
> > +
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd mmap_size: %lu\n", mmap_size);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd num_queues: %u\n", num_queues);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd queue_size: %u\n", queue_size);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd fd: %d\n", fd);
> > + RTE_LOG(INFO, VHOST_CONFIG,
> > + "set_inflight_fd pervq_inflight_size: %d\n",
> > + pervq_inflight_size);
> > +
> > + if (dev->inflight_info.addr)
> > + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > +
> > + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > + fd, mmap_offset);
>
> Why call it rc? Maybe addr is a better name?
In some scenarios, shared memory is reallocated or resized by qemu, so
again mmap is needed.
>
> > + if (rc == MAP_FAILED) {
> > + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> > + return -1;
>
> Should always return RTE_VHOST_MSG_RESULT_* in
> message handler.
>
> > + }
> > +
> > + if (dev->inflight_info.fd)
> > + close(dev->inflight_info.fd);
> > +
> > + dev->inflight_info.fd = fd;
> > + dev->inflight_info.addr = rc;
> > + dev->inflight_info.size = mmap_size;
> > +
> > + for (i = 0; i < num_queues; i++) {
> > + vq = dev->virtqueue[i];
> > + vq->inflight = (VhostInflightInfo *)rc;
> > + rc = (void *)((char *)rc + pervq_inflight_size);
> > + }
> > +
> > + return RTE_VHOST_MSG_RESULT_OK;
> > +}
> > +
> > static int
> > vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
> > int main_fd __rte_unused)
> > @@ -1202,6 +1402,29 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
> > }
> >
> > static int
> > +vhost_check_queue_inflights(struct vhost_virtqueue *vq)
> > +{
> > + uint16_t i = 0;
> > +
> > + if ((!vq->inflight))
> > + return RTE_VHOST_MSG_RESULT_ERR;
>
> Should check whether the feature is negotiated first.
>
> > +
> > + if (!vq->inflight->version) {
> > + vq->inflight->version = INFLIGHT_VERSION;
> > + return RTE_VHOST_MSG_RESULT_OK;
> > + }
> > +
> > + for (i = 0; i < vq->size; i++) {
> > + if (vq->inflight->desc[i].inflight == 1) {
> > + vq->inflight_flag = 1;
> > + break;
> > + }
> > + }
> > +
> > + return RTE_VHOST_MSG_RESULT_OK;
> > +}
> > +
> > +static int
> > vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> > int main_fd __rte_unused)
> > {
> > @@ -1242,6 +1465,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> > close(vq->kickfd);
> > vq->kickfd = file.fd;
> >
> > + if (vhost_check_queue_inflights(vq)) {
> > + RTE_LOG(ERR, VHOST_CONFIG,
> > + "Failed to inflights for vq: %d\n", file.index);
> > + return RTE_VHOST_MSG_RESULT_ERR;
> > + }
> > +
> > return RTE_VHOST_MSG_RESULT_OK;
> > }
> >
> > @@ -1762,6 +1991,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
> > [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
> > [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
> > [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
> > + [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
> > + [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
> > };
> >
> >
> > diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> > index 2a650fe4b..35f969b1b 100644
> > --- a/lib/librte_vhost/vhost_user.h
> > +++ b/lib/librte_vhost/vhost_user.h
> > @@ -23,7 +23,8 @@
> > (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> > (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> > (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> > - (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> > + (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
>
> It will advertise this feature for builtin net and crypto
> backends. It's probably not what you intended.
>
Indeed, this feature is mainly used for spdk-like backends. You mean
this function is disabled by default?
> >
> > typedef enum VhostUserRequest {
> > VHOST_USER_NONE = 0,
> > @@ -54,7 +55,9 @@ typedef enum VhostUserRequest {
> > VHOST_USER_POSTCOPY_ADVISE = 28,
> > VHOST_USER_POSTCOPY_LISTEN = 29,
> > VHOST_USER_POSTCOPY_END = 30,
> > - VHOST_USER_MAX = 31
> > + VHOST_USER_GET_INFLIGHT_FD = 31,
> > + VHOST_USER_SET_INFLIGHT_FD = 32,
> > + VHOST_USER_MAX = 33
> > } VhostUserRequest;
> >
> > typedef enum VhostUserSlaveRequest {
> > @@ -112,6 +115,13 @@ typedef struct VhostUserVringArea {
> > uint64_t offset;
> > } VhostUserVringArea;
> >
> > +typedef struct VhostUserInflight {
> > + uint64_t mmap_size;
> > + uint64_t mmap_offset;
> > + uint16_t num_queues;
> > + uint16_t queue_size;
> > +} VhostUserInflight;
> > +
> > typedef struct VhostUserMsg {
> > union {
> > uint32_t master; /* a VhostUserRequest value */
> > @@ -131,6 +141,7 @@ typedef struct VhostUserMsg {
> > struct vhost_vring_addr addr;
> > VhostUserMemory memory;
> > VhostUserLog log;
> > + VhostUserInflight inflight;
> > struct vhost_iotlb_msg iotlb;
> > VhostUserCryptoSessionParam crypto_session;
> > VhostUserVringArea area;
> > @@ -148,6 +159,7 @@ typedef struct VhostUserMsg {
> > /* vhost_user.c */
> > int vhost_user_msg_handler(int vid, int fd);
> > int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
> > +void *inflight_mem_alloc(const char *name, size_t size, int *fd);
> >
> > /* socket.c */
> > int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
> > --
> > 2.11.0
> >
Thank you for your comments. They will be revised in the next version.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-29 4:07 ` lin li
2019-04-29 4:07 ` lin li
@ 2019-04-29 10:54 ` Tiwei Bie
2019-04-29 10:54 ` Tiwei Bie
2019-04-30 8:37 ` lin li
1 sibling, 2 replies; 25+ messages in thread
From: Tiwei Bie @ 2019-04-29 10:54 UTC (permalink / raw)
To: lin li
Cc: maxime.coquelin, zhihong.wang, dev, dariusz.stojaczyk,
changpeng.liu, james.r.harris, lilin24, Ni Xun, Zhang Yu, mst
On Mon, Apr 29, 2019 at 12:07:05PM +0800, lin li wrote:
> Tiwei Bie <tiwei.bie@intel.com> 于2019年4月28日周日 下午7:18写道:
> > On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
[...]
> > > @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> > > struct rte_vhost_mem_region regions[];
> > > };
> > >
> > > +typedef struct VhostUserInflightEntry {
> > > + uint8_t inflight;
> > > +} VhostUserInflightEntry;
> > > +
> > > +typedef struct VhostInflightInfo {
> > > + uint16_t version;
> > > + uint16_t last_inflight_io;
> > > + uint16_t used_idx;
> > > + VhostUserInflightEntry desc[0];
> > > +} VhostInflightInfo;
> >
> > Is there any details on above structure? Why does it not match
> > QueueRegionSplit or QueueRegionPacked structures described in
> > qemu/docs/interop/vhost-user.txt?
>
> Qemu have its vhost-user backend,
Do you mean contrib/libvhost-user in QEMU?
> qemu did the submission of IO in it.
What does this mean?
> The implementation of dpdk is more general. It is just to mark inflight entry.
Based on the discussion in below threads:
https://patchwork.kernel.org/patch/10753893/
https://patchwork.kernel.org/patch/10817735/
IIUC, above structure is the extra buffer allocated by slave to
store the information of inflight descriptors and share with master
for persistent. All vhost-user backends' implementation should
follow the vhost-user protocol (including the format of above
structure) defined in qemu/docs/interop/vhost-user.txt. Right?
> The submission of inflight entry is handle over to different backends.
> They have their own ways to handle it, such as spdk.
> So there are some differences in data structure.
>
> >
[...]
> > > +static int
> > > +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > > + int main_fd __rte_unused)
> > > +{
> > > + int fd, i;
> > > + uint64_t mmap_size, mmap_offset;
> > > + uint16_t num_queues, queue_size;
> > > + uint32_t pervq_inflight_size;
> > > + void *rc;
> > > + struct vhost_virtqueue *vq;
> > > + struct virtio_net *dev = *pdev;
> > > +
> > > + fd = msg->fds[0];
> > > + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> > > + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> > > + msg->size, fd);
> > > + return -1;
> >
> > Ditto.
> >
> > > + }
> > > +
> > > + mmap_size = msg->payload.inflight.mmap_size;
> > > + mmap_offset = msg->payload.inflight.mmap_offset;
> > > + num_queues = msg->payload.inflight.num_queues;
> > > + queue_size = msg->payload.inflight.queue_size;
> > > + pervq_inflight_size = get_pervq_shm_size(queue_size);
> > > +
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd mmap_size: %lu\n", mmap_size);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd num_queues: %u\n", num_queues);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd queue_size: %u\n", queue_size);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd fd: %d\n", fd);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd pervq_inflight_size: %d\n",
> > > + pervq_inflight_size);
> > > +
> > > + if (dev->inflight_info.addr)
> > > + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > > +
> > > + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > > + fd, mmap_offset);
> >
> > Why call it rc? Maybe addr is a better name?
>
> In some scenarios, shared memory is reallocated or resized by qemu, so
> again mmap is needed.
In which case the shared memory will be reallocated or resized by QEMU?
>
> >
> > > + if (rc == MAP_FAILED) {
> > > + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> > > + return -1;
> >
> > Should always return RTE_VHOST_MSG_RESULT_* in
> > message handler.
> >
> > > + }
> > > +
> > > + if (dev->inflight_info.fd)
> > > + close(dev->inflight_info.fd);
> > > +
> > > + dev->inflight_info.fd = fd;
> > > + dev->inflight_info.addr = rc;
> > > + dev->inflight_info.size = mmap_size;
> > > +
> > > + for (i = 0; i < num_queues; i++) {
> > > + vq = dev->virtqueue[i];
> > > + vq->inflight = (VhostInflightInfo *)rc;
> > > + rc = (void *)((char *)rc + pervq_inflight_size);
> > > + }
> > > +
> > > + return RTE_VHOST_MSG_RESULT_OK;
> > > +}
[...]
> > > diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> > > index 2a650fe4b..35f969b1b 100644
> > > --- a/lib/librte_vhost/vhost_user.h
> > > +++ b/lib/librte_vhost/vhost_user.h
> > > @@ -23,7 +23,8 @@
> > > (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> > > (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> > > (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> > > - (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> > > + (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> > > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
> >
> > It will advertise this feature for builtin net and crypto
> > backends. It's probably not what you intended.
> >
>
> Indeed, this feature is mainly used for spdk-like backends. You mean
> this function is disabled by default?
External backends should use rte_vhost_driver_set_protocol_features()
to advertise the protocol features they support.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-29 10:54 ` Tiwei Bie
@ 2019-04-29 10:54 ` Tiwei Bie
2019-04-30 8:37 ` lin li
1 sibling, 0 replies; 25+ messages in thread
From: Tiwei Bie @ 2019-04-29 10:54 UTC (permalink / raw)
To: lin li
Cc: maxime.coquelin, zhihong.wang, dev, dariusz.stojaczyk,
changpeng.liu, james.r.harris, lilin24, Ni Xun, Zhang Yu, mst
On Mon, Apr 29, 2019 at 12:07:05PM +0800, lin li wrote:
> Tiwei Bie <tiwei.bie@intel.com> 于2019年4月28日周日 下午7:18写道:
> > On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
[...]
> > > @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> > > struct rte_vhost_mem_region regions[];
> > > };
> > >
> > > +typedef struct VhostUserInflightEntry {
> > > + uint8_t inflight;
> > > +} VhostUserInflightEntry;
> > > +
> > > +typedef struct VhostInflightInfo {
> > > + uint16_t version;
> > > + uint16_t last_inflight_io;
> > > + uint16_t used_idx;
> > > + VhostUserInflightEntry desc[0];
> > > +} VhostInflightInfo;
> >
> > Is there any details on above structure? Why does it not match
> > QueueRegionSplit or QueueRegionPacked structures described in
> > qemu/docs/interop/vhost-user.txt?
>
> Qemu have its vhost-user backend,
Do you mean contrib/libvhost-user in QEMU?
> qemu did the submission of IO in it.
What does this mean?
> The implementation of dpdk is more general. It is just to mark inflight entry.
Based on the discussion in below threads:
https://patchwork.kernel.org/patch/10753893/
https://patchwork.kernel.org/patch/10817735/
IIUC, above structure is the extra buffer allocated by slave to
store the information of inflight descriptors and share with master
for persistent. All vhost-user backends' implementation should
follow the vhost-user protocol (including the format of above
structure) defined in qemu/docs/interop/vhost-user.txt. Right?
> The submission of inflight entry is handle over to different backends.
> They have their own ways to handle it, such as spdk.
> So there are some differences in data structure.
>
> >
[...]
> > > +static int
> > > +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > > + int main_fd __rte_unused)
> > > +{
> > > + int fd, i;
> > > + uint64_t mmap_size, mmap_offset;
> > > + uint16_t num_queues, queue_size;
> > > + uint32_t pervq_inflight_size;
> > > + void *rc;
> > > + struct vhost_virtqueue *vq;
> > > + struct virtio_net *dev = *pdev;
> > > +
> > > + fd = msg->fds[0];
> > > + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> > > + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> > > + msg->size, fd);
> > > + return -1;
> >
> > Ditto.
> >
> > > + }
> > > +
> > > + mmap_size = msg->payload.inflight.mmap_size;
> > > + mmap_offset = msg->payload.inflight.mmap_offset;
> > > + num_queues = msg->payload.inflight.num_queues;
> > > + queue_size = msg->payload.inflight.queue_size;
> > > + pervq_inflight_size = get_pervq_shm_size(queue_size);
> > > +
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd mmap_size: %lu\n", mmap_size);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd num_queues: %u\n", num_queues);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd queue_size: %u\n", queue_size);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd fd: %d\n", fd);
> > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > + "set_inflight_fd pervq_inflight_size: %d\n",
> > > + pervq_inflight_size);
> > > +
> > > + if (dev->inflight_info.addr)
> > > + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > > +
> > > + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > > + fd, mmap_offset);
> >
> > Why call it rc? Maybe addr is a better name?
>
> In some scenarios, shared memory is reallocated or resized by qemu, so
> again mmap is needed.
In which case the shared memory will be reallocated or resized by QEMU?
>
> >
> > > + if (rc == MAP_FAILED) {
> > > + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> > > + return -1;
> >
> > Should always return RTE_VHOST_MSG_RESULT_* in
> > message handler.
> >
> > > + }
> > > +
> > > + if (dev->inflight_info.fd)
> > > + close(dev->inflight_info.fd);
> > > +
> > > + dev->inflight_info.fd = fd;
> > > + dev->inflight_info.addr = rc;
> > > + dev->inflight_info.size = mmap_size;
> > > +
> > > + for (i = 0; i < num_queues; i++) {
> > > + vq = dev->virtqueue[i];
> > > + vq->inflight = (VhostInflightInfo *)rc;
> > > + rc = (void *)((char *)rc + pervq_inflight_size);
> > > + }
> > > +
> > > + return RTE_VHOST_MSG_RESULT_OK;
> > > +}
[...]
> > > diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> > > index 2a650fe4b..35f969b1b 100644
> > > --- a/lib/librte_vhost/vhost_user.h
> > > +++ b/lib/librte_vhost/vhost_user.h
> > > @@ -23,7 +23,8 @@
> > > (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> > > (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> > > (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> > > - (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> > > + (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> > > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
> >
> > It will advertise this feature for builtin net and crypto
> > backends. It's probably not what you intended.
> >
>
> Indeed, this feature is mainly used for spdk-like backends. You mean
> this function is disabled by default?
External backends should use rte_vhost_driver_set_protocol_features()
to advertise the protocol features they support.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-29 10:54 ` Tiwei Bie
2019-04-29 10:54 ` Tiwei Bie
@ 2019-04-30 8:37 ` lin li
2019-04-30 8:37 ` lin li
1 sibling, 1 reply; 25+ messages in thread
From: lin li @ 2019-04-30 8:37 UTC (permalink / raw)
To: Tiwei Bie
Cc: maxime.coquelin, zhihong.wang, dev, dariusz.stojaczyk,
changpeng.liu, james.r.harris, lilin24, Ni Xun, Zhang Yu, mst,
xieyongji
Tiwei Bie <tiwei.bie@intel.com> 于2019年4月29日周一 下午6:55写道:
>
> On Mon, Apr 29, 2019 at 12:07:05PM +0800, lin li wrote:
> > Tiwei Bie <tiwei.bie@intel.com> 于2019年4月28日周日 下午7:18写道:
> > > On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
> [...]
> > > > @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> > > > struct rte_vhost_mem_region regions[];
> > > > };
> > > >
> > > > +typedef struct VhostUserInflightEntry {
> > > > + uint8_t inflight;
> > > > +} VhostUserInflightEntry;
> > > > +
> > > > +typedef struct VhostInflightInfo {
> > > > + uint16_t version;
> > > > + uint16_t last_inflight_io;
> > > > + uint16_t used_idx;
> > > > + VhostUserInflightEntry desc[0];
> > > > +} VhostInflightInfo;
> > >
> > > Is there any details on above structure? Why does it not match
> > > QueueRegionSplit or QueueRegionPacked structures described in
> > > qemu/docs/interop/vhost-user.txt?
> >
> > Qemu have its vhost-user backend,
>
> Do you mean contrib/libvhost-user in QEMU?
>
> > qemu did the submission of IO in it.
>
> What does this mean?
>
> > The implementation of dpdk is more general. It is just to mark inflight entry.
>
> Based on the discussion in below threads:
>
> https://patchwork.kernel.org/patch/10753893/
> https://patchwork.kernel.org/patch/10817735/
>
> IIUC, above structure is the extra buffer allocated by slave to
> store the information of inflight descriptors and share with master
> for persistent. All vhost-user backends' implementation should
> follow the vhost-user protocol (including the format of above
> structure) defined in qemu/docs/interop/vhost-user.txt. Right?
Yes you're right. I've confirmed with QEMU that these data structures
are consistent with qemu.
It will be consistent with QEMU in the next version.
>
> > The submission of inflight entry is handle over to different backends.
> > They have their own ways to handle it, such as spdk.
> > So there are some differences in data structure.
> >
> > >
> [...]
> > > > +static int
> > > > +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > > > + int main_fd __rte_unused)
> > > > +{
> > > > + int fd, i;
> > > > + uint64_t mmap_size, mmap_offset;
> > > > + uint16_t num_queues, queue_size;
> > > > + uint32_t pervq_inflight_size;
> > > > + void *rc;
> > > > + struct vhost_virtqueue *vq;
> > > > + struct virtio_net *dev = *pdev;
> > > > +
> > > > + fd = msg->fds[0];
> > > > + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> > > > + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> > > > + msg->size, fd);
> > > > + return -1;
> > >
> > > Ditto.
> > >
> > > > + }
> > > > +
> > > > + mmap_size = msg->payload.inflight.mmap_size;
> > > > + mmap_offset = msg->payload.inflight.mmap_offset;
> > > > + num_queues = msg->payload.inflight.num_queues;
> > > > + queue_size = msg->payload.inflight.queue_size;
> > > > + pervq_inflight_size = get_pervq_shm_size(queue_size);
> > > > +
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd mmap_size: %lu\n", mmap_size);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd num_queues: %u\n", num_queues);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd queue_size: %u\n", queue_size);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd fd: %d\n", fd);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd pervq_inflight_size: %d\n",
> > > > + pervq_inflight_size);
> > > > +
> > > > + if (dev->inflight_info.addr)
> > > > + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > > > +
> > > > + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > > > + fd, mmap_offset);
> > >
> > > Why call it rc? Maybe addr is a better name?
> >
> > In some scenarios, shared memory is reallocated or resized by qemu, so
> > again mmap is needed.
>
> In which case the shared memory will be reallocated or resized by QEMU?
QEMU replies that it needs to copy inflight share memory data when
live migration occurs.
If the size of inflight buffer on two machines is not the same, QEMU
will reallocate memory according to the old inflight buffer size.
>
> >
> > >
> > > > + if (rc == MAP_FAILED) {
> > > > + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> > > > + return -1;
> > >
> > > Should always return RTE_VHOST_MSG_RESULT_* in
> > > message handler.
> > >
> > > > + }
> > > > +
> > > > + if (dev->inflight_info.fd)
> > > > + close(dev->inflight_info.fd);
> > > > +
> > > > + dev->inflight_info.fd = fd;
> > > > + dev->inflight_info.addr = rc;
> > > > + dev->inflight_info.size = mmap_size;
> > > > +
> > > > + for (i = 0; i < num_queues; i++) {
> > > > + vq = dev->virtqueue[i];
> > > > + vq->inflight = (VhostInflightInfo *)rc;
> > > > + rc = (void *)((char *)rc + pervq_inflight_size);
> > > > + }
> > > > +
> > > > + return RTE_VHOST_MSG_RESULT_OK;
> > > > +}
> [...]
> > > > diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> > > > index 2a650fe4b..35f969b1b 100644
> > > > --- a/lib/librte_vhost/vhost_user.h
> > > > +++ b/lib/librte_vhost/vhost_user.h
> > > > @@ -23,7 +23,8 @@
> > > > (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> > > > (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> > > > (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> > > > - (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> > > > + (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> > > > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
> > >
> > > It will advertise this feature for builtin net and crypto
> > > backends. It's probably not what you intended.
> > >
> >
> > Indeed, this feature is mainly used for spdk-like backends. You mean
> > this function is disabled by default?
>
> External backends should use rte_vhost_driver_set_protocol_features()
> to advertise the protocol features they support.
Thanks,It will be modified in the next version.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
2019-04-30 8:37 ` lin li
@ 2019-04-30 8:37 ` lin li
0 siblings, 0 replies; 25+ messages in thread
From: lin li @ 2019-04-30 8:37 UTC (permalink / raw)
To: Tiwei Bie
Cc: maxime.coquelin, zhihong.wang, dev, dariusz.stojaczyk,
changpeng.liu, james.r.harris, lilin24, Ni Xun, Zhang Yu, mst,
xieyongji
Tiwei Bie <tiwei.bie@intel.com> 于2019年4月29日周一 下午6:55写道:
>
> On Mon, Apr 29, 2019 at 12:07:05PM +0800, lin li wrote:
> > Tiwei Bie <tiwei.bie@intel.com> 于2019年4月28日周日 下午7:18写道:
> > > On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
> [...]
> > > > @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> > > > struct rte_vhost_mem_region regions[];
> > > > };
> > > >
> > > > +typedef struct VhostUserInflightEntry {
> > > > + uint8_t inflight;
> > > > +} VhostUserInflightEntry;
> > > > +
> > > > +typedef struct VhostInflightInfo {
> > > > + uint16_t version;
> > > > + uint16_t last_inflight_io;
> > > > + uint16_t used_idx;
> > > > + VhostUserInflightEntry desc[0];
> > > > +} VhostInflightInfo;
> > >
> > > Is there any details on above structure? Why does it not match
> > > QueueRegionSplit or QueueRegionPacked structures described in
> > > qemu/docs/interop/vhost-user.txt?
> >
> > Qemu have its vhost-user backend,
>
> Do you mean contrib/libvhost-user in QEMU?
>
> > qemu did the submission of IO in it.
>
> What does this mean?
>
> > The implementation of dpdk is more general. It is just to mark inflight entry.
>
> Based on the discussion in below threads:
>
> https://patchwork.kernel.org/patch/10753893/
> https://patchwork.kernel.org/patch/10817735/
>
> IIUC, above structure is the extra buffer allocated by slave to
> store the information of inflight descriptors and share with master
> for persistent. All vhost-user backends' implementation should
> follow the vhost-user protocol (including the format of above
> structure) defined in qemu/docs/interop/vhost-user.txt. Right?
Yes you're right. I've confirmed with QEMU that these data structures
are consistent with qemu.
It will be consistent with QEMU in the next version.
>
> > The submission of inflight entry is handle over to different backends.
> > They have their own ways to handle it, such as spdk.
> > So there are some differences in data structure.
> >
> > >
> [...]
> > > > +static int
> > > > +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > > > + int main_fd __rte_unused)
> > > > +{
> > > > + int fd, i;
> > > > + uint64_t mmap_size, mmap_offset;
> > > > + uint16_t num_queues, queue_size;
> > > > + uint32_t pervq_inflight_size;
> > > > + void *rc;
> > > > + struct vhost_virtqueue *vq;
> > > > + struct virtio_net *dev = *pdev;
> > > > +
> > > > + fd = msg->fds[0];
> > > > + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> > > > + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> > > > + msg->size, fd);
> > > > + return -1;
> > >
> > > Ditto.
> > >
> > > > + }
> > > > +
> > > > + mmap_size = msg->payload.inflight.mmap_size;
> > > > + mmap_offset = msg->payload.inflight.mmap_offset;
> > > > + num_queues = msg->payload.inflight.num_queues;
> > > > + queue_size = msg->payload.inflight.queue_size;
> > > > + pervq_inflight_size = get_pervq_shm_size(queue_size);
> > > > +
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd mmap_size: %lu\n", mmap_size);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd num_queues: %u\n", num_queues);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd queue_size: %u\n", queue_size);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd fd: %d\n", fd);
> > > > + RTE_LOG(INFO, VHOST_CONFIG,
> > > > + "set_inflight_fd pervq_inflight_size: %d\n",
> > > > + pervq_inflight_size);
> > > > +
> > > > + if (dev->inflight_info.addr)
> > > > + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > > > +
> > > > + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > > > + fd, mmap_offset);
> > >
> > > Why call it rc? Maybe addr is a better name?
> >
> > In some scenarios, shared memory is reallocated or resized by qemu, so
> > again mmap is needed.
>
> In which case the shared memory will be reallocated or resized by QEMU?
QEMU replies that it needs to copy inflight share memory data when
live migration occurs.
If the size of inflight buffer on two machines is not the same, QEMU
will reallocate memory according to the old inflight buffer size.
>
> >
> > >
> > > > + if (rc == MAP_FAILED) {
> > > > + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> > > > + return -1;
> > >
> > > Should always return RTE_VHOST_MSG_RESULT_* in
> > > message handler.
> > >
> > > > + }
> > > > +
> > > > + if (dev->inflight_info.fd)
> > > > + close(dev->inflight_info.fd);
> > > > +
> > > > + dev->inflight_info.fd = fd;
> > > > + dev->inflight_info.addr = rc;
> > > > + dev->inflight_info.size = mmap_size;
> > > > +
> > > > + for (i = 0; i < num_queues; i++) {
> > > > + vq = dev->virtqueue[i];
> > > > + vq->inflight = (VhostInflightInfo *)rc;
> > > > + rc = (void *)((char *)rc + pervq_inflight_size);
> > > > + }
> > > > +
> > > > + return RTE_VHOST_MSG_RESULT_OK;
> > > > +}
> [...]
> > > > diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> > > > index 2a650fe4b..35f969b1b 100644
> > > > --- a/lib/librte_vhost/vhost_user.h
> > > > +++ b/lib/librte_vhost/vhost_user.h
> > > > @@ -23,7 +23,8 @@
> > > > (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> > > > (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> > > > (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> > > > - (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> > > > + (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> > > > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
> > >
> > > It will advertise this feature for builtin net and crypto
> > > backends. It's probably not what you intended.
> > >
> >
> > Indeed, this feature is mainly used for spdk-like backends. You mean
> > this function is disabled by default?
>
> External backends should use rte_vhost_driver_set_protocol_features()
> to advertise the protocol features they support.
Thanks,It will be modified in the next version.
^ permalink raw reply [flat|nested] 25+ messages in thread
* [dpdk-dev] [PATCH v4] vhost: support inflight share memory protocol feature
2019-04-26 9:40 ` [dpdk-dev] [PATCH v3] " Li Lin
2019-04-26 9:40 ` Li Lin
2019-04-28 11:17 ` Tiwei Bie
@ 2019-05-05 9:02 ` Li Lin
2019-05-05 9:02 ` Li Lin
2019-05-17 15:47 ` Maxime Coquelin
2 siblings, 2 replies; 25+ messages in thread
From: Li Lin @ 2019-05-05 9:02 UTC (permalink / raw)
To: tiwei.bie, maxime.coquelin, zhihong.wang
Cc: dev, dariusz.stojaczyk, changpeng.liu, james.r.harris, xieyongji,
Li Lin, Ni Xun, Zhang Yu
From: Li Lin <lilin24@baidu.com>
This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
buffer between qemu and backend.
Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
shared buffer from backend. Then qemu should send it back
through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
This shared buffer is used to process inflight I/O when backend
reconnect.
Signed-off-by: Li Lin <lilin24@baidu.com>
Signed-off-by: Ni Xun <nixun@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
---
v2 - add set&clr inflight desc functions
v3 - simplified some functions implementation
v4 - rework some data structures according to qemu doc
lib/librte_vhost/rte_vhost.h | 83 ++++++++++
lib/librte_vhost/rte_vhost_version.map | 3 +
lib/librte_vhost/vhost.c | 105 ++++++++++++
lib/librte_vhost/vhost.h | 12 ++
lib/librte_vhost/vhost_user.c | 292 +++++++++++++++++++++++++++++++++
lib/librte_vhost/vhost_user.h | 13 +-
6 files changed, 507 insertions(+), 1 deletion(-)
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d2c0c93f4..d4d709717 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -71,6 +71,10 @@ extern "C" {
#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
#endif
+#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
+#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
+#endif
+
/** Indicate whether protocol features negotiation is supported. */
#ifndef VHOST_USER_F_PROTOCOL_FEATURES
#define VHOST_USER_F_PROTOCOL_FEATURES 30
@@ -98,12 +102,41 @@ struct rte_vhost_memory {
struct rte_vhost_mem_region regions[];
};
+struct inflight_desc {
+ uint8_t inflight;
+ uint8_t padding[5];
+ uint16_t next;
+ uint64_t counter;
+};
+
+struct inflight_info {
+ uint64_t features;
+ uint16_t version;
+ uint16_t desc_num;
+ uint16_t last_inflight_io;
+ uint16_t used_idx;
+ struct inflight_desc desc[0];
+};
+
+struct resubmit_desc {
+ uint16_t index;
+ uint64_t counter;
+};
+
+struct resubmit_info {
+ struct resubmit_desc *resubmit_list;
+ uint16_t resubmit_num;
+};
+
struct rte_vhost_vring {
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
uint64_t log_guest_addr;
+ struct inflight_info *inflight;
+ struct resubmit_info *resubmit;
+
/** Deprecated, use rte_vhost_vring_call() instead. */
int callfd;
@@ -632,6 +665,56 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
int rte_vhost_vring_call(int vid, uint16_t vring_idx);
/**
+ * set inflight flag for a desc.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param idx
+ * inflight entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx,
+ uint16_t idx);
+
+/**
+ * clear inflight flag for a desc.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param last_used_idx
+ * next free used_idx
+ * @param idx
+ * inflight entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
+ uint16_t last_used_idx, uint16_t idx);
+
+/**
+ * set last inflight io index.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param idx
+ * inflight entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx,
+ uint16_t idx);
+
+/**
* Get vhost RX queue avail count.
*
* @param vid
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 5f1d4a75c..a992b3935 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -87,4 +87,7 @@ EXPERIMENTAL {
rte_vdpa_relay_vring_used;
rte_vhost_extern_callback_register;
rte_vhost_driver_set_protocol_features;
+ rte_vhost_set_inflight_desc;
+ rte_vhost_clr_inflight_desc;
+ rte_vhost_set_last_inflight_io;
};
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 163f4595e..03eab3551 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -76,6 +76,16 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
close(vq->callfd);
if (vq->kickfd >= 0)
close(vq->kickfd);
+ if (vq->inflight)
+ vq->inflight = NULL;
+ if (vq->resubmit->resubmit_list) {
+ free(vq->resubmit->resubmit_list);
+ vq->resubmit->resubmit_list = NULL;
+ }
+ if (vq->resubmit) {
+ free(vq->resubmit);
+ vq->resubmit = NULL;
+ }
}
/*
@@ -589,6 +599,9 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
vring->kickfd = vq->kickfd;
vring->size = vq->size;
+ vring->inflight = vq->inflight;
+ vring->resubmit = vq->resubmit;
+
return 0;
}
@@ -617,6 +630,98 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
return 0;
}
+int
+rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx, uint16_t idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (!(dev->features &
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
+ return 0;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ if (unlikely(!vq->inflight))
+ return -1;
+
+ vq->inflight->desc[idx].counter = vq->counter++;
+ vq->inflight->desc[idx].inflight = 1;
+ return 0;
+}
+
+int
+rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
+ uint16_t last_used_idx, uint16_t idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (!(dev->features &
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
+ return 0;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ if (unlikely(!vq->inflight))
+ return -1;
+
+ rte_compiler_barrier();
+
+ vq->inflight->desc[idx].inflight = 0;
+
+ rte_compiler_barrier();
+
+ vq->inflight->used_idx = last_used_idx;
+ return 0;
+}
+
+int
+rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx, uint16_t idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (!(dev->features &
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
+ return 0;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ if (unlikely(!vq->inflight))
+ return -1;
+
+ vq->inflight->last_inflight_io = idx;
+ return 0;
+}
+
uint16_t
rte_vhost_avail_entries(int vid, uint16_t queue_id)
{
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index e9138dfab..3dace2ec3 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -128,6 +128,11 @@ struct vhost_virtqueue {
/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+ /* Inflight share memory info */
+ struct inflight_info *inflight;
+ struct resubmit_info *resubmit;
+ uint64_t counter;
+
uint16_t nr_zmbuf;
uint16_t zmbuf_size;
uint16_t last_zmbuf_idx;
@@ -286,6 +291,12 @@ struct guest_page {
uint64_t size;
};
+struct inflight_mem_info {
+ int fd;
+ void *addr;
+ uint64_t size;
+};
+
/**
* Device structure contains all configuration information relating
* to the device.
@@ -303,6 +314,7 @@ struct virtio_net {
uint32_t nr_vring;
int dequeue_zero_copy;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+ struct inflight_mem_info inflight_info;
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
char ifname[IF_NAME_SZ];
uint64_t log_size;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index c9e29ece8..eb41b9045 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -31,6 +31,8 @@
#include <sys/stat.h>
#include <sys/syscall.h>
#include <assert.h>
+#include <sys/syscall.h>
+#include <asm/unistd.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif
@@ -49,6 +51,15 @@
#define VIRTIO_MIN_MTU 68
#define VIRTIO_MAX_MTU 65535
+#define INFLIGHT_ALIGNMENT 64
+#define INFLIGHT_VERSION 0xabcd
+#define VIRTQUEUE_MAX_SIZE 1024
+
+#define CLOEXEC 0x0001U
+
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
static const char *vhost_message_str[VHOST_USER_MAX] = {
[VHOST_USER_NONE] = "VHOST_USER_NONE",
[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
@@ -78,6 +89,8 @@ static const char *vhost_message_str[VHOST_USER_MAX] = {
[VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE",
[VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN",
[VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END",
+ [VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD",
+ [VHOST_USER_SET_INFLIGHT_FD] = "VHOST_USER_SET_INFLIGHT_FD",
};
static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg);
@@ -160,6 +173,16 @@ vhost_backend_cleanup(struct virtio_net *dev)
dev->log_addr = 0;
}
+ if (dev->inflight_info.addr) {
+ munmap(dev->inflight_info.addr, dev->inflight_info.size);
+ dev->inflight_info.addr = NULL;
+ }
+
+ if (dev->inflight_info.fd > 0) {
+ close(dev->inflight_info.fd);
+ dev->inflight_info.fd = -1;
+ }
+
if (dev->slave_req_fd >= 0) {
close(dev->slave_req_fd);
dev->slave_req_fd = -1;
@@ -1165,6 +1188,184 @@ virtio_is_ready(struct virtio_net *dev)
return 1;
}
+static int mem_create(const char *name, unsigned int flags)
+{
+#ifdef __NR_memfd_create
+ return syscall(__NR_memfd_create, name, flags);
+#else
+ return -1;
+#endif
+}
+
+void *inflight_mem_alloc(const char *name, size_t size, int *fd)
+{
+ void *ptr;
+ int mfd = -1;
+ char fname[20] = "/tmp/memfd-XXXXXX";
+
+ *fd = -1;
+ mfd = mem_create(name, CLOEXEC);
+ if (mfd != -1) {
+ if (ftruncate(mfd, size) == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "ftruncate fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+ } else {
+ mfd = mkstemp(fname);
+ unlink(fname);
+
+ if (mfd == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mkstemp fail for alloc inflight buffer\n");
+ return NULL;
+ }
+
+ if (ftruncate(mfd, size) == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "ftruncate fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+ }
+
+ ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
+ if (ptr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+
+ *fd = mfd;
+ return ptr;
+}
+
+static uint32_t get_pervq_shm_size(uint16_t queue_size)
+{
+ return ALIGN_UP(sizeof(struct inflight_desc) * queue_size +
+ sizeof(uint64_t) * 1 + sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT);
+}
+
+static int
+vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ int fd;
+ uint64_t mmap_size;
+ void *addr;
+ uint16_t num_queues, queue_size;
+ struct virtio_net *dev = *pdev;
+
+ if (msg->size != sizeof(msg->payload.inflight)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Invalid get_inflight_fd message size is %d",
+ msg->size);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ num_queues = msg->payload.inflight.num_queues;
+ queue_size = msg->payload.inflight.queue_size;
+
+ RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
+ msg->payload.inflight.num_queues);
+ RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
+ msg->payload.inflight.queue_size);
+ mmap_size = num_queues * get_pervq_shm_size(queue_size);
+
+ addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
+ if (!addr) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
+ msg->payload.inflight.mmap_size = 0;
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+ memset(addr, 0, mmap_size);
+
+ dev->inflight_info.addr = addr;
+ dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
+ dev->inflight_info.fd = msg->fds[0] = fd;
+ msg->payload.inflight.mmap_offset = 0;
+ msg->fd_num = 1;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight mmap_size: %lu\n",
+ msg->payload.inflight.mmap_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight mmap_offset: %lu\n",
+ msg->payload.inflight.mmap_offset);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight fd: %d\n", msg->fds[0]);
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+static int
+vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ int fd, i;
+ uint64_t mmap_size, mmap_offset;
+ uint16_t num_queues, queue_size;
+ uint32_t pervq_inflight_size;
+ void *rc;
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev = *pdev;
+
+ fd = msg->fds[0];
+ if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
+ msg->size, fd);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ mmap_size = msg->payload.inflight.mmap_size;
+ mmap_offset = msg->payload.inflight.mmap_offset;
+ num_queues = msg->payload.inflight.num_queues;
+ queue_size = msg->payload.inflight.queue_size;
+ pervq_inflight_size = get_pervq_shm_size(queue_size);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd mmap_size: %lu\n", mmap_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd num_queues: %u\n", num_queues);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd queue_size: %u\n", queue_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd fd: %d\n", fd);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd pervq_inflight_size: %d\n",
+ pervq_inflight_size);
+
+ if (dev->inflight_info.addr)
+ munmap(dev->inflight_info.addr, dev->inflight_info.size);
+
+ rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd, mmap_offset);
+ if (rc == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ if (dev->inflight_info.fd)
+ close(dev->inflight_info.fd);
+
+ dev->inflight_info.fd = fd;
+ dev->inflight_info.addr = rc;
+ dev->inflight_info.size = mmap_size;
+
+ for (i = 0; i < num_queues; i++) {
+ vq = dev->virtqueue[i];
+ vq->inflight = (struct inflight_info *)rc;
+ vq->inflight->desc_num = queue_size;
+ rc = (void *)((char *)rc + pervq_inflight_size);
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
static int
vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
int main_fd __rte_unused)
@@ -1202,6 +1403,89 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
}
static int
+resubmit_desc_compare(const void *a, const void *b)
+{
+ const struct resubmit_desc *desc0 = (const struct resubmit_desc *)a;
+ const struct resubmit_desc *desc1 = (const struct resubmit_desc *)b;
+
+ if (desc1->counter > desc0->counter &&
+ (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2)
+ return 1;
+
+ return -1;
+}
+
+static int
+vhost_check_queue_inflights(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ struct vring_used *used = vq->used;
+ uint16_t i = 0;
+ uint16_t resubmit_num = 0;
+ struct resubmit_info *resubmit = NULL;
+
+ if (!(dev->features &
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
+ return RTE_VHOST_MSG_RESULT_OK;
+
+ if ((!vq->inflight))
+ return RTE_VHOST_MSG_RESULT_ERR;
+
+ if (!vq->inflight->version) {
+ vq->inflight->version = INFLIGHT_VERSION;
+ return RTE_VHOST_MSG_RESULT_OK;
+ }
+
+ vq->resubmit = NULL;
+ vq->counter = 0;
+
+ if (vq->inflight->used_idx != used->idx) {
+ vq->inflight->desc[vq->inflight->last_inflight_io].inflight = 0;
+ rte_compiler_barrier();
+ vq->inflight->used_idx = used->idx;
+ }
+
+ for (i = 0; i < vq->inflight->desc_num; i++) {
+ if (vq->inflight->desc[i].inflight == 1)
+ resubmit_num++;
+ }
+
+ vq->last_avail_idx += resubmit_num;
+
+ if (resubmit_num) {
+ resubmit = calloc(1, sizeof(struct resubmit_info));
+ if (!resubmit) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for resubmit info.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ resubmit->resubmit_list = calloc(resubmit_num,
+ sizeof(struct resubmit_desc));
+ if (!resubmit->resubmit_list) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for inflight desc.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ for (i = 0; i < vq->inflight->desc_num; i++) {
+ if (vq->inflight->desc[i].inflight == 1) {
+ resubmit->resubmit_list[resubmit->resubmit_num].index = i;
+ resubmit->resubmit_list[resubmit->resubmit_num].counter =
+ vq->inflight->desc[i].counter;
+ resubmit->resubmit_num++;
+ }
+ }
+
+ if (resubmit->resubmit_num > 1)
+ qsort(resubmit->resubmit_list, resubmit->resubmit_num,
+ sizeof(struct resubmit_desc), resubmit_desc_compare);
+
+ vq->counter = resubmit->resubmit_list[0].counter + 1;
+ vq->resubmit = resubmit;
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
int main_fd __rte_unused)
{
@@ -1242,6 +1526,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
close(vq->kickfd);
vq->kickfd = file.fd;
+ if (vhost_check_queue_inflights(dev, vq)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to inflights for vq: %d\n", file.index);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
return RTE_VHOST_MSG_RESULT_OK;
}
@@ -1762,6 +2052,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
[VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
[VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
[VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
+ [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
+ [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
};
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 2a650fe4b..99a773910 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -54,7 +54,9 @@ typedef enum VhostUserRequest {
VHOST_USER_POSTCOPY_ADVISE = 28,
VHOST_USER_POSTCOPY_LISTEN = 29,
VHOST_USER_POSTCOPY_END = 30,
- VHOST_USER_MAX = 31
+ VHOST_USER_GET_INFLIGHT_FD = 31,
+ VHOST_USER_SET_INFLIGHT_FD = 32,
+ VHOST_USER_MAX = 33
} VhostUserRequest;
typedef enum VhostUserSlaveRequest {
@@ -112,6 +114,13 @@ typedef struct VhostUserVringArea {
uint64_t offset;
} VhostUserVringArea;
+typedef struct VhostUserInflight {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+ uint16_t num_queues;
+ uint16_t queue_size;
+} VhostUserInflight;
+
typedef struct VhostUserMsg {
union {
uint32_t master; /* a VhostUserRequest value */
@@ -131,6 +140,7 @@ typedef struct VhostUserMsg {
struct vhost_vring_addr addr;
VhostUserMemory memory;
VhostUserLog log;
+ VhostUserInflight inflight;
struct vhost_iotlb_msg iotlb;
VhostUserCryptoSessionParam crypto_session;
VhostUserVringArea area;
@@ -148,6 +158,7 @@ typedef struct VhostUserMsg {
/* vhost_user.c */
int vhost_user_msg_handler(int vid, int fd);
int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
+void *inflight_mem_alloc(const char *name, size_t size, int *fd);
/* socket.c */
int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
--
2.11.0
^ permalink raw reply [flat|nested] 25+ messages in thread
* [dpdk-dev] [PATCH v4] vhost: support inflight share memory protocol feature
2019-05-05 9:02 ` [dpdk-dev] [PATCH v4] " Li Lin
@ 2019-05-05 9:02 ` Li Lin
2019-05-17 15:47 ` Maxime Coquelin
1 sibling, 0 replies; 25+ messages in thread
From: Li Lin @ 2019-05-05 9:02 UTC (permalink / raw)
To: tiwei.bie, maxime.coquelin, zhihong.wang
Cc: dev, dariusz.stojaczyk, changpeng.liu, james.r.harris, xieyongji,
Li Lin, Ni Xun, Zhang Yu
From: Li Lin <lilin24@baidu.com>
This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
buffer between qemu and backend.
Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
shared buffer from backend. Then qemu should send it back
through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
This shared buffer is used to process inflight I/O when backend
reconnect.
Signed-off-by: Li Lin <lilin24@baidu.com>
Signed-off-by: Ni Xun <nixun@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
---
v2 - add set&clr inflight desc functions
v3 - simplified some functions implementation
v4 - rework some data structures according to qemu doc
lib/librte_vhost/rte_vhost.h | 83 ++++++++++
lib/librte_vhost/rte_vhost_version.map | 3 +
lib/librte_vhost/vhost.c | 105 ++++++++++++
lib/librte_vhost/vhost.h | 12 ++
lib/librte_vhost/vhost_user.c | 292 +++++++++++++++++++++++++++++++++
lib/librte_vhost/vhost_user.h | 13 +-
6 files changed, 507 insertions(+), 1 deletion(-)
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d2c0c93f4..d4d709717 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -71,6 +71,10 @@ extern "C" {
#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
#endif
+#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
+#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
+#endif
+
/** Indicate whether protocol features negotiation is supported. */
#ifndef VHOST_USER_F_PROTOCOL_FEATURES
#define VHOST_USER_F_PROTOCOL_FEATURES 30
@@ -98,12 +102,41 @@ struct rte_vhost_memory {
struct rte_vhost_mem_region regions[];
};
+struct inflight_desc {
+ uint8_t inflight;
+ uint8_t padding[5];
+ uint16_t next;
+ uint64_t counter;
+};
+
+struct inflight_info {
+ uint64_t features;
+ uint16_t version;
+ uint16_t desc_num;
+ uint16_t last_inflight_io;
+ uint16_t used_idx;
+ struct inflight_desc desc[0];
+};
+
+struct resubmit_desc {
+ uint16_t index;
+ uint64_t counter;
+};
+
+struct resubmit_info {
+ struct resubmit_desc *resubmit_list;
+ uint16_t resubmit_num;
+};
+
struct rte_vhost_vring {
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
uint64_t log_guest_addr;
+ struct inflight_info *inflight;
+ struct resubmit_info *resubmit;
+
/** Deprecated, use rte_vhost_vring_call() instead. */
int callfd;
@@ -632,6 +665,56 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
int rte_vhost_vring_call(int vid, uint16_t vring_idx);
/**
+ * set inflight flag for a desc.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param idx
+ * inflight entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx,
+ uint16_t idx);
+
+/**
+ * clear inflight flag for a desc.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param last_used_idx
+ * next free used_idx
+ * @param idx
+ * inflight entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
+ uint16_t last_used_idx, uint16_t idx);
+
+/**
+ * set last inflight io index.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param idx
+ * inflight entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx,
+ uint16_t idx);
+
+/**
* Get vhost RX queue avail count.
*
* @param vid
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 5f1d4a75c..a992b3935 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -87,4 +87,7 @@ EXPERIMENTAL {
rte_vdpa_relay_vring_used;
rte_vhost_extern_callback_register;
rte_vhost_driver_set_protocol_features;
+ rte_vhost_set_inflight_desc;
+ rte_vhost_clr_inflight_desc;
+ rte_vhost_set_last_inflight_io;
};
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 163f4595e..03eab3551 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -76,6 +76,16 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
close(vq->callfd);
if (vq->kickfd >= 0)
close(vq->kickfd);
+ if (vq->inflight)
+ vq->inflight = NULL;
+ if (vq->resubmit->resubmit_list) {
+ free(vq->resubmit->resubmit_list);
+ vq->resubmit->resubmit_list = NULL;
+ }
+ if (vq->resubmit) {
+ free(vq->resubmit);
+ vq->resubmit = NULL;
+ }
}
/*
@@ -589,6 +599,9 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
vring->kickfd = vq->kickfd;
vring->size = vq->size;
+ vring->inflight = vq->inflight;
+ vring->resubmit = vq->resubmit;
+
return 0;
}
@@ -617,6 +630,98 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
return 0;
}
+int
+rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx, uint16_t idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (!(dev->features &
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
+ return 0;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ if (unlikely(!vq->inflight))
+ return -1;
+
+ vq->inflight->desc[idx].counter = vq->counter++;
+ vq->inflight->desc[idx].inflight = 1;
+ return 0;
+}
+
+int
+rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
+ uint16_t last_used_idx, uint16_t idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (!(dev->features &
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
+ return 0;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ if (unlikely(!vq->inflight))
+ return -1;
+
+ rte_compiler_barrier();
+
+ vq->inflight->desc[idx].inflight = 0;
+
+ rte_compiler_barrier();
+
+ vq->inflight->used_idx = last_used_idx;
+ return 0;
+}
+
+int
+rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx, uint16_t idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (!(dev->features &
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
+ return 0;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ if (unlikely(!vq->inflight))
+ return -1;
+
+ vq->inflight->last_inflight_io = idx;
+ return 0;
+}
+
uint16_t
rte_vhost_avail_entries(int vid, uint16_t queue_id)
{
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index e9138dfab..3dace2ec3 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -128,6 +128,11 @@ struct vhost_virtqueue {
/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+ /* Inflight share memory info */
+ struct inflight_info *inflight;
+ struct resubmit_info *resubmit;
+ uint64_t counter;
+
uint16_t nr_zmbuf;
uint16_t zmbuf_size;
uint16_t last_zmbuf_idx;
@@ -286,6 +291,12 @@ struct guest_page {
uint64_t size;
};
+struct inflight_mem_info {
+ int fd;
+ void *addr;
+ uint64_t size;
+};
+
/**
* Device structure contains all configuration information relating
* to the device.
@@ -303,6 +314,7 @@ struct virtio_net {
uint32_t nr_vring;
int dequeue_zero_copy;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+ struct inflight_mem_info inflight_info;
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
char ifname[IF_NAME_SZ];
uint64_t log_size;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index c9e29ece8..eb41b9045 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -31,6 +31,8 @@
#include <sys/stat.h>
#include <sys/syscall.h>
#include <assert.h>
+#include <sys/syscall.h>
+#include <asm/unistd.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif
@@ -49,6 +51,15 @@
#define VIRTIO_MIN_MTU 68
#define VIRTIO_MAX_MTU 65535
+#define INFLIGHT_ALIGNMENT 64
+#define INFLIGHT_VERSION 0xabcd
+#define VIRTQUEUE_MAX_SIZE 1024
+
+#define CLOEXEC 0x0001U
+
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
static const char *vhost_message_str[VHOST_USER_MAX] = {
[VHOST_USER_NONE] = "VHOST_USER_NONE",
[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
@@ -78,6 +89,8 @@ static const char *vhost_message_str[VHOST_USER_MAX] = {
[VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE",
[VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN",
[VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END",
+ [VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD",
+ [VHOST_USER_SET_INFLIGHT_FD] = "VHOST_USER_SET_INFLIGHT_FD",
};
static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg);
@@ -160,6 +173,16 @@ vhost_backend_cleanup(struct virtio_net *dev)
dev->log_addr = 0;
}
+ if (dev->inflight_info.addr) {
+ munmap(dev->inflight_info.addr, dev->inflight_info.size);
+ dev->inflight_info.addr = NULL;
+ }
+
+ if (dev->inflight_info.fd > 0) {
+ close(dev->inflight_info.fd);
+ dev->inflight_info.fd = -1;
+ }
+
if (dev->slave_req_fd >= 0) {
close(dev->slave_req_fd);
dev->slave_req_fd = -1;
@@ -1165,6 +1188,184 @@ virtio_is_ready(struct virtio_net *dev)
return 1;
}
+static int mem_create(const char *name, unsigned int flags)
+{
+#ifdef __NR_memfd_create
+ return syscall(__NR_memfd_create, name, flags);
+#else
+ return -1;
+#endif
+}
+
+void *inflight_mem_alloc(const char *name, size_t size, int *fd)
+{
+ void *ptr;
+ int mfd = -1;
+ char fname[20] = "/tmp/memfd-XXXXXX";
+
+ *fd = -1;
+ mfd = mem_create(name, CLOEXEC);
+ if (mfd != -1) {
+ if (ftruncate(mfd, size) == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "ftruncate fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+ } else {
+ mfd = mkstemp(fname);
+ unlink(fname);
+
+ if (mfd == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mkstemp fail for alloc inflight buffer\n");
+ return NULL;
+ }
+
+ if (ftruncate(mfd, size) == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "ftruncate fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+ }
+
+ ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
+ if (ptr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap fail for alloc inflight buffer\n");
+ close(mfd);
+ return NULL;
+ }
+
+ *fd = mfd;
+ return ptr;
+}
+
+static uint32_t get_pervq_shm_size(uint16_t queue_size)
+{
+ return ALIGN_UP(sizeof(struct inflight_desc) * queue_size +
+ sizeof(uint64_t) * 1 + sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT);
+}
+
+static int
+vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ int fd;
+ uint64_t mmap_size;
+ void *addr;
+ uint16_t num_queues, queue_size;
+ struct virtio_net *dev = *pdev;
+
+ if (msg->size != sizeof(msg->payload.inflight)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Invalid get_inflight_fd message size is %d",
+ msg->size);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ num_queues = msg->payload.inflight.num_queues;
+ queue_size = msg->payload.inflight.queue_size;
+
+ RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
+ msg->payload.inflight.num_queues);
+ RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
+ msg->payload.inflight.queue_size);
+ mmap_size = num_queues * get_pervq_shm_size(queue_size);
+
+ addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
+ if (!addr) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
+ msg->payload.inflight.mmap_size = 0;
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+ memset(addr, 0, mmap_size);
+
+ dev->inflight_info.addr = addr;
+ dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
+ dev->inflight_info.fd = msg->fds[0] = fd;
+ msg->payload.inflight.mmap_offset = 0;
+ msg->fd_num = 1;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight mmap_size: %lu\n",
+ msg->payload.inflight.mmap_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight mmap_offset: %lu\n",
+ msg->payload.inflight.mmap_offset);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "send inflight fd: %d\n", msg->fds[0]);
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+static int
+vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ int fd, i;
+ uint64_t mmap_size, mmap_offset;
+ uint16_t num_queues, queue_size;
+ uint32_t pervq_inflight_size;
+ void *rc;
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev = *pdev;
+
+ fd = msg->fds[0];
+ if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
+ msg->size, fd);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ mmap_size = msg->payload.inflight.mmap_size;
+ mmap_offset = msg->payload.inflight.mmap_offset;
+ num_queues = msg->payload.inflight.num_queues;
+ queue_size = msg->payload.inflight.queue_size;
+ pervq_inflight_size = get_pervq_shm_size(queue_size);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd mmap_size: %lu\n", mmap_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd num_queues: %u\n", num_queues);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd queue_size: %u\n", queue_size);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd fd: %d\n", fd);
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set_inflight_fd pervq_inflight_size: %d\n",
+ pervq_inflight_size);
+
+ if (dev->inflight_info.addr)
+ munmap(dev->inflight_info.addr, dev->inflight_info.size);
+
+ rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd, mmap_offset);
+ if (rc == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ if (dev->inflight_info.fd)
+ close(dev->inflight_info.fd);
+
+ dev->inflight_info.fd = fd;
+ dev->inflight_info.addr = rc;
+ dev->inflight_info.size = mmap_size;
+
+ for (i = 0; i < num_queues; i++) {
+ vq = dev->virtqueue[i];
+ vq->inflight = (struct inflight_info *)rc;
+ vq->inflight->desc_num = queue_size;
+ rc = (void *)((char *)rc + pervq_inflight_size);
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
static int
vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
int main_fd __rte_unused)
@@ -1202,6 +1403,89 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
}
static int
+resubmit_desc_compare(const void *a, const void *b)
+{
+ const struct resubmit_desc *desc0 = (const struct resubmit_desc *)a;
+ const struct resubmit_desc *desc1 = (const struct resubmit_desc *)b;
+
+ if (desc1->counter > desc0->counter &&
+ (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2)
+ return 1;
+
+ return -1;
+}
+
+static int
+vhost_check_queue_inflights(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ struct vring_used *used = vq->used;
+ uint16_t i = 0;
+ uint16_t resubmit_num = 0;
+ struct resubmit_info *resubmit = NULL;
+
+ if (!(dev->features &
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
+ return RTE_VHOST_MSG_RESULT_OK;
+
+ if ((!vq->inflight))
+ return RTE_VHOST_MSG_RESULT_ERR;
+
+ if (!vq->inflight->version) {
+ vq->inflight->version = INFLIGHT_VERSION;
+ return RTE_VHOST_MSG_RESULT_OK;
+ }
+
+ vq->resubmit = NULL;
+ vq->counter = 0;
+
+ if (vq->inflight->used_idx != used->idx) {
+ vq->inflight->desc[vq->inflight->last_inflight_io].inflight = 0;
+ rte_compiler_barrier();
+ vq->inflight->used_idx = used->idx;
+ }
+
+ for (i = 0; i < vq->inflight->desc_num; i++) {
+ if (vq->inflight->desc[i].inflight == 1)
+ resubmit_num++;
+ }
+
+ vq->last_avail_idx += resubmit_num;
+
+ if (resubmit_num) {
+ resubmit = calloc(1, sizeof(struct resubmit_info));
+ if (!resubmit) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for resubmit info.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ resubmit->resubmit_list = calloc(resubmit_num,
+ sizeof(struct resubmit_desc));
+ if (!resubmit->resubmit_list) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for inflight desc.\n");
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ for (i = 0; i < vq->inflight->desc_num; i++) {
+ if (vq->inflight->desc[i].inflight == 1) {
+ resubmit->resubmit_list[resubmit->resubmit_num].index = i;
+ resubmit->resubmit_list[resubmit->resubmit_num].counter =
+ vq->inflight->desc[i].counter;
+ resubmit->resubmit_num++;
+ }
+ }
+
+ if (resubmit->resubmit_num > 1)
+ qsort(resubmit->resubmit_list, resubmit->resubmit_num,
+ sizeof(struct resubmit_desc), resubmit_desc_compare);
+
+ vq->counter = resubmit->resubmit_list[0].counter + 1;
+ vq->resubmit = resubmit;
+ }
+
+ return RTE_VHOST_MSG_RESULT_OK;
+}
+
+static int
vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
int main_fd __rte_unused)
{
@@ -1242,6 +1526,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
close(vq->kickfd);
vq->kickfd = file.fd;
+ if (vhost_check_queue_inflights(dev, vq)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to inflights for vq: %d\n", file.index);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
return RTE_VHOST_MSG_RESULT_OK;
}
@@ -1762,6 +2052,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
[VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
[VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
[VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
+ [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
+ [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
};
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 2a650fe4b..99a773910 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -54,7 +54,9 @@ typedef enum VhostUserRequest {
VHOST_USER_POSTCOPY_ADVISE = 28,
VHOST_USER_POSTCOPY_LISTEN = 29,
VHOST_USER_POSTCOPY_END = 30,
- VHOST_USER_MAX = 31
+ VHOST_USER_GET_INFLIGHT_FD = 31,
+ VHOST_USER_SET_INFLIGHT_FD = 32,
+ VHOST_USER_MAX = 33
} VhostUserRequest;
typedef enum VhostUserSlaveRequest {
@@ -112,6 +114,13 @@ typedef struct VhostUserVringArea {
uint64_t offset;
} VhostUserVringArea;
+typedef struct VhostUserInflight {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+ uint16_t num_queues;
+ uint16_t queue_size;
+} VhostUserInflight;
+
typedef struct VhostUserMsg {
union {
uint32_t master; /* a VhostUserRequest value */
@@ -131,6 +140,7 @@ typedef struct VhostUserMsg {
struct vhost_vring_addr addr;
VhostUserMemory memory;
VhostUserLog log;
+ VhostUserInflight inflight;
struct vhost_iotlb_msg iotlb;
VhostUserCryptoSessionParam crypto_session;
VhostUserVringArea area;
@@ -148,6 +158,7 @@ typedef struct VhostUserMsg {
/* vhost_user.c */
int vhost_user_msg_handler(int vid, int fd);
int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
+void *inflight_mem_alloc(const char *name, size_t size, int *fd);
/* socket.c */
int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
--
2.11.0
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v4] vhost: support inflight share memory protocol feature
2019-05-05 9:02 ` [dpdk-dev] [PATCH v4] " Li Lin
2019-05-05 9:02 ` Li Lin
@ 2019-05-17 15:47 ` Maxime Coquelin
2019-05-20 2:13 ` Tiwei Bie
2019-05-22 11:04 ` Li,Lin(ACG Cloud)
1 sibling, 2 replies; 25+ messages in thread
From: Maxime Coquelin @ 2019-05-17 15:47 UTC (permalink / raw)
To: Li Lin, tiwei.bie, zhihong.wang
Cc: dev, dariusz.stojaczyk, changpeng.liu, james.r.harris, xieyongji,
Li Lin, Ni Xun, Zhang Yu
On 5/5/19 11:02 AM, Li Lin wrote:
> From: Li Lin <lilin24@baidu.com>
>
> This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
> buffer between qemu and backend.
>
> Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
> shared buffer from backend. Then qemu should send it back
> through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
>
> This shared buffer is used to process inflight I/O when backend
> reconnect.
>
> Signed-off-by: Li Lin <lilin24@baidu.com>
> Signed-off-by: Ni Xun <nixun@baidu.com>
> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> ---
> v2 - add set&clr inflight desc functions
> v3 - simplified some functions implementation
> v4 - rework some data structures according to qemu doc
>
> lib/librte_vhost/rte_vhost.h | 83 ++++++++++
> lib/librte_vhost/rte_vhost_version.map | 3 +
> lib/librte_vhost/vhost.c | 105 ++++++++++++
> lib/librte_vhost/vhost.h | 12 ++
> lib/librte_vhost/vhost_user.c | 292 +++++++++++++++++++++++++++++++++
> lib/librte_vhost/vhost_user.h | 13 +-
> 6 files changed, 507 insertions(+), 1 deletion(-)
>
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index d2c0c93f4..d4d709717 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -71,6 +71,10 @@ extern "C" {
> #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
> #endif
>
> +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
> +#endif
> +
> /** Indicate whether protocol features negotiation is supported. */
> #ifndef VHOST_USER_F_PROTOCOL_FEATURES
> #define VHOST_USER_F_PROTOCOL_FEATURES 30
> @@ -98,12 +102,41 @@ struct rte_vhost_memory {
> struct rte_vhost_mem_region regions[];
> };
>
> +struct inflight_desc {
> + uint8_t inflight;
> + uint8_t padding[5];
> + uint16_t next;
> + uint64_t counter;
> +};
> +
> +struct inflight_info {
> + uint64_t features;
> + uint16_t version;
> + uint16_t desc_num;
> + uint16_t last_inflight_io;
> + uint16_t used_idx;
> + struct inflight_desc desc[0];
> +};
> +
> +struct resubmit_desc {
> + uint16_t index;
> + uint64_t counter;
> +};
> +
> +struct resubmit_info {
> + struct resubmit_desc *resubmit_list;
> + uint16_t resubmit_num;
> +};
> +
> struct rte_vhost_vring {
> struct vring_desc *desc;
> struct vring_avail *avail;
> struct vring_used *used;
> uint64_t log_guest_addr;
>
> + struct inflight_info *inflight;
> + struct resubmit_info *resubmit;
> +
> /** Deprecated, use rte_vhost_vring_call() instead. */
> int callfd;
>
> @@ -632,6 +665,56 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> int rte_vhost_vring_call(int vid, uint16_t vring_idx);
>
> /**
> + * set inflight flag for a desc.
> + *
> + * @param vid
> + * vhost device ID
> + * @param vring_idx
> + * vring index
> + * @param idx
> + * inflight entry index
> + * @return
> + * 0 on success, -1 on failure
> + */
> +int __rte_experimental
> +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx,
> + uint16_t idx);
> +
> +/**
> + * clear inflight flag for a desc.
> + *
> + * @param vid
> + * vhost device ID
> + * @param vring_idx
> + * vring index
> + * @param last_used_idx
> + * next free used_idx
> + * @param idx
> + * inflight entry index
> + * @return
> + * 0 on success, -1 on failure
> + */
> +int __rte_experimental
> +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
> + uint16_t last_used_idx, uint16_t idx);
> +
> +/**
> + * set last inflight io index.
> + *
> + * @param vid
> + * vhost device ID
> + * @param vring_idx
> + * vring index
> + * @param idx
> + * inflight entry index
> + * @return
> + * 0 on success, -1 on failure
> + */
> +int __rte_experimental
> +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx,
> + uint16_t idx);
> +
> +/**
> * Get vhost RX queue avail count.
> *
> * @param vid
> diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
> index 5f1d4a75c..a992b3935 100644
> --- a/lib/librte_vhost/rte_vhost_version.map
> +++ b/lib/librte_vhost/rte_vhost_version.map
> @@ -87,4 +87,7 @@ EXPERIMENTAL {
> rte_vdpa_relay_vring_used;
> rte_vhost_extern_callback_register;
> rte_vhost_driver_set_protocol_features;
> + rte_vhost_set_inflight_desc;
> + rte_vhost_clr_inflight_desc;
> + rte_vhost_set_last_inflight_io;
> };
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 163f4595e..03eab3551 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -76,6 +76,16 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
> close(vq->callfd);
> if (vq->kickfd >= 0)
> close(vq->kickfd);
> + if (vq->inflight)
> + vq->inflight = NULL;
> + if (vq->resubmit->resubmit_list) {
> + free(vq->resubmit->resubmit_list);
> + vq->resubmit->resubmit_list = NULL;
> + }
> + if (vq->resubmit) {
> + free(vq->resubmit);
> + vq->resubmit = NULL;
> + }
> }
>
> /*
> @@ -589,6 +599,9 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> vring->kickfd = vq->kickfd;
> vring->size = vq->size;
>
> + vring->inflight = vq->inflight;
> + vring->resubmit = vq->resubmit;
> +
> return 0;
> }
>
> @@ -617,6 +630,98 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
> return 0;
> }
>
> +int
> +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx, uint16_t idx)
> +{
> + struct virtio_net *dev;
> + struct vhost_virtqueue *vq;
> +
> + dev = get_device(vid);
> + if (!dev)
> + return -1;
> +
> + if (!(dev->features &
> + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> + return 0;
Shouldn't -1 be returned here also?
I think this function should be called only if the feature negotiation
succeeded.
Returning an error here would help the calling application to know
something went wrong.
Same comment applies for the clear function.
> +
> + if (vring_idx >= VHOST_MAX_VRING)
> + return -1;
> +
> + vq = dev->virtqueue[vring_idx];
> + if (!vq)
> + return -1;
> +
> + if (unlikely(!vq->inflight))
> + return -1;
> +
> + vq->inflight->desc[idx].counter = vq->counter++;
> + vq->inflight->desc[idx].inflight = 1;
> + return 0;
> +}
> +
> +int
> +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
> + uint16_t last_used_idx, uint16_t idx)
> +{
> + struct virtio_net *dev;
> + struct vhost_virtqueue *vq;
> +
> + dev = get_device(vid);
> + if (!dev)
> + return -1;
> +
> + if (!(dev->features &
> + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> + return 0;
> +
> + if (vring_idx >= VHOST_MAX_VRING)
> + return -1;
> +
> + vq = dev->virtqueue[vring_idx];
> + if (!vq)
> + return -1;
> +
> + if (unlikely(!vq->inflight))
> + return -1;
> +
> + rte_compiler_barrier();
> +
> + vq->inflight->desc[idx].inflight = 0;
> +
> + rte_compiler_barrier();
> +
> + vq->inflight->used_idx = last_used_idx;
> + return 0;
> +}
> +
> +int
> +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx, uint16_t idx)
> +{
> + struct virtio_net *dev;
> + struct vhost_virtqueue *vq;
> +
> + dev = get_device(vid);
> + if (!dev)
> + return -1;
> +
> + if (!(dev->features &
> + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> + return 0;
> +
> + if (vring_idx >= VHOST_MAX_VRING)
> + return -1;
> +
> + vq = dev->virtqueue[vring_idx];
> + if (!vq)
> + return -1;
> +
> + if (unlikely(!vq->inflight))
> + return -1;
> +
> + vq->inflight->last_inflight_io = idx;
> + return 0;
> +}
Are above functions supposed to be called in the hot path?
If so, you might want to use unlikely for the error checks
everywhere.
> +
> uint16_t
> rte_vhost_avail_entries(int vid, uint16_t queue_id)
> {
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index e9138dfab..3dace2ec3 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -128,6 +128,11 @@ struct vhost_virtqueue {
> /* Physical address of used ring, for logging */
> uint64_t log_guest_addr;
>
> + /* Inflight share memory info */
> + struct inflight_info *inflight;
> + struct resubmit_info *resubmit;
> + uint64_t counter;
counter and resubmit are too generic names.
> +
> uint16_t nr_zmbuf;
> uint16_t zmbuf_size;
> uint16_t last_zmbuf_idx;
> @@ -286,6 +291,12 @@ struct guest_page {
> uint64_t size;
> };
>
> +struct inflight_mem_info {
> + int fd;
> + void *addr;
> + uint64_t size;
> +};
> +
> /**
> * Device structure contains all configuration information relating
> * to the device.
> @@ -303,6 +314,7 @@ struct virtio_net {
> uint32_t nr_vring;
> int dequeue_zero_copy;
> struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> + struct inflight_mem_info inflight_info;
> #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> char ifname[IF_NAME_SZ];
> uint64_t log_size;
Do you have some code example using these new APIs?
It would help for reviewing the patch.
Thanks,
Maxime
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v4] vhost: support inflight share memory protocol feature
2019-05-17 15:47 ` Maxime Coquelin
@ 2019-05-20 2:13 ` Tiwei Bie
2019-05-21 8:29 ` lin li
2019-05-22 11:04 ` Li,Lin(ACG Cloud)
1 sibling, 1 reply; 25+ messages in thread
From: Tiwei Bie @ 2019-05-20 2:13 UTC (permalink / raw)
To: Maxime Coquelin
Cc: Li Lin, zhihong.wang, dev, dariusz.stojaczyk, changpeng.liu,
james.r.harris, xieyongji, Li Lin, Ni Xun, Zhang Yu
On Fri, May 17, 2019 at 05:47:07PM +0200, Maxime Coquelin wrote:
> On 5/5/19 11:02 AM, Li Lin wrote:
[...]
> > /**
> > * Device structure contains all configuration information relating
> > * to the device.
> > @@ -303,6 +314,7 @@ struct virtio_net {
> > uint32_t nr_vring;
> > int dequeue_zero_copy;
> > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > + struct inflight_mem_info inflight_info;
> > #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> > char ifname[IF_NAME_SZ];
> > uint64_t log_size;
>
> Do you have some code example using these new APIs?
> It would help for reviewing the patch.
+1
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v4] vhost: support inflight share memory protocol feature
2019-05-20 2:13 ` Tiwei Bie
@ 2019-05-21 8:29 ` lin li
2019-05-21 8:46 ` Tiwei Bie
0 siblings, 1 reply; 25+ messages in thread
From: lin li @ 2019-05-21 8:29 UTC (permalink / raw)
To: Tiwei Bie; +Cc: dev, dariusz.stojaczyk, changpeng.liu, Li Lin
Tiwei Bie <tiwei.bie@intel.com> 于2019年5月20日周一 上午10:14写道:
>
> On Fri, May 17, 2019 at 05:47:07PM +0200, Maxime Coquelin wrote:
> > On 5/5/19 11:02 AM, Li Lin wrote:
> [...]
> > > /**
> > > * Device structure contains all configuration information relating
> > > * to the device.
> > > @@ -303,6 +314,7 @@ struct virtio_net {
> > > uint32_t nr_vring;
> > > int dequeue_zero_copy;
> > > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > > + struct inflight_mem_info inflight_info;
> > > #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> > > char ifname[IF_NAME_SZ];
> > > uint64_t log_size;
> >
> > Do you have some code example using these new APIs?
> > It would help for reviewing the patch.
>
> +1
I just submitted SPDK-related patches, and you will see how to use new API.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] [PATCH v4] vhost: support inflight share memory protocol feature
2019-05-21 8:29 ` lin li
@ 2019-05-21 8:46 ` Tiwei Bie
2019-05-22 10:15 ` [dpdk-dev] 答复: " Li,Lin(ACG Cloud)
0 siblings, 1 reply; 25+ messages in thread
From: Tiwei Bie @ 2019-05-21 8:46 UTC (permalink / raw)
To: lin li; +Cc: dev, dariusz.stojaczyk, changpeng.liu, Li Lin, maxime.coquelin
On Tue, May 21, 2019 at 04:29:31PM +0800, lin li wrote:
> Tiwei Bie <tiwei.bie@intel.com> 于2019年5月20日周一上午10:14写道:
> >
> > On Fri, May 17, 2019 at 05:47:07PM +0200, Maxime Coquelin wrote:
> > > On 5/5/19 11:02 AM, Li Lin wrote:
> > [...]
> > > > /**
> > > > * Device structure contains all configuration information relating
> > > > * to the device.
> > > > @@ -303,6 +314,7 @@ struct virtio_net {
> > > > uint32_t nr_vring;
> > > > int dequeue_zero_copy;
> > > > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > > > + struct inflight_mem_info inflight_info;
> > > > #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> > > > char ifname[IF_NAME_SZ];
> > > > uint64_t log_size;
> > >
> > > Do you have some code example using these new APIs?
> > > It would help for reviewing the patch.
> >
> > +1
> I just submitted SPDK-related patches, and you will see how to use new API.
SPDK is another separate project. For the DPDK APIs introduced
in this patch, it would be great to have an example in DPDK to
demonstrate their usage, e.g. examples/vhost_scsi. Besides, the
new APIs also need to take the packed ring into consideration.
Thanks,
Tiwei
^ permalink raw reply [flat|nested] 25+ messages in thread
* [dpdk-dev] 答复: [PATCH v4] vhost: support inflight share memory protocol feature
2019-05-21 8:46 ` Tiwei Bie
@ 2019-05-22 10:15 ` Li,Lin(ACG Cloud)
0 siblings, 0 replies; 25+ messages in thread
From: Li,Lin(ACG Cloud) @ 2019-05-22 10:15 UTC (permalink / raw)
To: Tiwei Bie, lin li; +Cc: dev, dariusz.stojaczyk, changpeng.liu, maxime.coquelin
> -----邮件原件-----
> 发件人: Tiwei Bie [mailto:tiwei.bie@intel.com]
> 发送时间: 2019年5月21日 16:47
> 收件人: lin li <chuckylinchuckylin@gmail.com>
> 抄送: dev@dpdk.org; dariusz.stojaczyk@intel.com; changpeng.liu@intel.com;
> Li,Lin(ACG Cloud) <lilin24@baidu.com>; maxime.coquelin@redhat.com
> 主题: Re: [PATCH v4] vhost: support inflight share memory protocol feature
>
> On Tue, May 21, 2019 at 04:29:31PM +0800, lin li wrote:
> > Tiwei Bie <tiwei.bie@intel.com> 于2019年5月20日周一上午10:14写道:
> > >
> > > On Fri, May 17, 2019 at 05:47:07PM +0200, Maxime Coquelin wrote:
> > > > On 5/5/19 11:02 AM, Li Lin wrote:
> > > [...]
> > > > > /**
> > > > > * Device structure contains all configuration information relating
> > > > > * to the device.
> > > > > @@ -303,6 +314,7 @@ struct virtio_net {
> > > > > uint32_t nr_vring;
> > > > > int dequeue_zero_copy;
> > > > > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS
> *
> > > > > 2];
> > > > > + struct inflight_mem_info inflight_info;
> > > > > #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX :
> IFNAMSIZ)
> > > > > char ifname[IF_NAME_SZ];
> > > > > uint64_t log_size;
> > > >
> > > > Do you have some code example using these new APIs?
> > > > It would help for reviewing the patch.
> > >
> > > +1
> > I just submitted SPDK-related patches, and you will see how to use new API.
>
> SPDK is another separate project. For the DPDK APIs introduced in this patch, it
> would be great to have an example in DPDK to demonstrate their usage, e.g.
> examples/vhost_scsi. Besides, the new APIs also need to take the packed ring
> into consideration.
The patch does not apply to SCSI ,and disconnection can cause a
timeout problem in SCSI.
To disconnection, scsi driver have a different process.
This patch is only valid for vhost blk.
The use of new two messages and the implementation of packed queue are
described in docs/vhost-user.txt.
>
> Thanks,
> Tiwei
^ permalink raw reply [flat|nested] 25+ messages in thread
* [dpdk-dev] 答复: [PATCH v4] vhost: support inflight share memory protocol feature
2019-05-17 15:47 ` Maxime Coquelin
2019-05-20 2:13 ` Tiwei Bie
@ 2019-05-22 11:04 ` Li,Lin(ACG Cloud)
2019-06-12 8:07 ` Maxime Coquelin
1 sibling, 1 reply; 25+ messages in thread
From: Li,Lin(ACG Cloud) @ 2019-05-22 11:04 UTC (permalink / raw)
To: Maxime Coquelin, Li Lin, tiwei.bie, zhihong.wang
Cc: dev, dariusz.stojaczyk, changpeng.liu, james.r.harris,
Xie,Yongji, Ni,Xun, Zhang,Yu(ACG Cloud)
> -----邮件原件-----
> 发件人: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> 发送时间: 2019年5月17日 23:47
> 收件人: Li Lin <chuckylinchuckylin@gmail.com>; tiwei.bie@intel.com;
> zhihong.wang@intel.com
> 抄送: dev@dpdk.org; dariusz.stojaczyk@intel.com; changpeng.liu@intel.com;
> james.r.harris@intel.com; Xie,Yongji <xieyongji@baidu.com>; Li,Lin(ACG Cloud)
> <lilin24@baidu.com>; Ni,Xun <nixun@baidu.com>; Zhang,Yu(ACG Cloud)
> <zhangyu31@baidu.com>
> 主题: Re: [PATCH v4] vhost: support inflight share memory protocol feature
>
>
>
> On 5/5/19 11:02 AM, Li Lin wrote:
> > From: Li Lin <lilin24@baidu.com>
> >
> > This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> and
> > VHOST_USER_SET_INFLIGHT_FD to support transferring a shared buffer
> > between qemu and backend.
> >
> > Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the shared buffer
> > from backend. Then qemu should send it back through
> > VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
> >
> > This shared buffer is used to process inflight I/O when backend
> > reconnect.
> >
> > Signed-off-by: Li Lin <lilin24@baidu.com>
> > Signed-off-by: Ni Xun <nixun@baidu.com>
> > Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> > ---
> > v2 - add set&clr inflight desc functions
> > v3 - simplified some functions implementation
> > v4 - rework some data structures according to qemu doc
> >
> > lib/librte_vhost/rte_vhost.h | 83 ++++++++++
> > lib/librte_vhost/rte_vhost_version.map | 3 +
> > lib/librte_vhost/vhost.c | 105 ++++++++++++
> > lib/librte_vhost/vhost.h | 12 ++
> > lib/librte_vhost/vhost_user.c | 292
> +++++++++++++++++++++++++++++++++
> > lib/librte_vhost/vhost_user.h | 13 +-
> > 6 files changed, 507 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost.h
> > b/lib/librte_vhost/rte_vhost.h index d2c0c93f4..d4d709717 100644
> > --- a/lib/librte_vhost/rte_vhost.h
> > +++ b/lib/librte_vhost/rte_vhost.h
> > @@ -71,6 +71,10 @@ extern "C" {
> > #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
> > #endif
> >
> > +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> > +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 #endif
> > +
> > /** Indicate whether protocol features negotiation is supported. */
> > #ifndef VHOST_USER_F_PROTOCOL_FEATURES
> > #define VHOST_USER_F_PROTOCOL_FEATURES 30
> > @@ -98,12 +102,41 @@ struct rte_vhost_memory {
> > struct rte_vhost_mem_region regions[];
> > };
> >
> > +struct inflight_desc {
> > + uint8_t inflight;
> > + uint8_t padding[5];
> > + uint16_t next;
> > + uint64_t counter;
> > +};
> > +
> > +struct inflight_info {
> > + uint64_t features;
> > + uint16_t version;
> > + uint16_t desc_num;
> > + uint16_t last_inflight_io;
> > + uint16_t used_idx;
> > + struct inflight_desc desc[0];
> > +};
> > +
> > +struct resubmit_desc {
> > + uint16_t index;
> > + uint64_t counter;
> > +};
> > +
> > +struct resubmit_info {
> > + struct resubmit_desc *resubmit_list;
> > + uint16_t resubmit_num;
> > +};
> > +
> > struct rte_vhost_vring {
> > struct vring_desc *desc;
> > struct vring_avail *avail;
> > struct vring_used *used;
> > uint64_t log_guest_addr;
> >
> > + struct inflight_info *inflight;
> > + struct resubmit_info *resubmit;
> > +
> > /** Deprecated, use rte_vhost_vring_call() instead. */
> > int callfd;
> >
> > @@ -632,6 +665,56 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t
> vring_idx,
> > int rte_vhost_vring_call(int vid, uint16_t vring_idx);
> >
> > /**
> > + * set inflight flag for a desc.
> > + *
> > + * @param vid
> > + * vhost device ID
> > + * @param vring_idx
> > + * vring index
> > + * @param idx
> > + * inflight entry index
> > + * @return
> > + * 0 on success, -1 on failure
> > + */
> > +int __rte_experimental
> > +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx,
> > + uint16_t idx);
> > +
> > +/**
> > + * clear inflight flag for a desc.
> > + *
> > + * @param vid
> > + * vhost device ID
> > + * @param vring_idx
> > + * vring index
> > + * @param last_used_idx
> > + * next free used_idx
> > + * @param idx
> > + * inflight entry index
> > + * @return
> > + * 0 on success, -1 on failure
> > + */
> > +int __rte_experimental
> > +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
> > + uint16_t last_used_idx, uint16_t idx);
> > +
> > +/**
> > + * set last inflight io index.
> > + *
> > + * @param vid
> > + * vhost device ID
> > + * @param vring_idx
> > + * vring index
> > + * @param idx
> > + * inflight entry index
> > + * @return
> > + * 0 on success, -1 on failure
> > + */
> > +int __rte_experimental
> > +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx,
> > + uint16_t idx);
> > +
> > +/**
> > * Get vhost RX queue avail count.
> > *
> > * @param vid
> > diff --git a/lib/librte_vhost/rte_vhost_version.map
> > b/lib/librte_vhost/rte_vhost_version.map
> > index 5f1d4a75c..a992b3935 100644
> > --- a/lib/librte_vhost/rte_vhost_version.map
> > +++ b/lib/librte_vhost/rte_vhost_version.map
> > @@ -87,4 +87,7 @@ EXPERIMENTAL {
> > rte_vdpa_relay_vring_used;
> > rte_vhost_extern_callback_register;
> > rte_vhost_driver_set_protocol_features;
> > + rte_vhost_set_inflight_desc;
> > + rte_vhost_clr_inflight_desc;
> > + rte_vhost_set_last_inflight_io;
> > };
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> > 163f4595e..03eab3551 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -76,6 +76,16 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
> > close(vq->callfd);
> > if (vq->kickfd >= 0)
> > close(vq->kickfd);
> > + if (vq->inflight)
> > + vq->inflight = NULL;
> > + if (vq->resubmit->resubmit_list) {
> > + free(vq->resubmit->resubmit_list);
> > + vq->resubmit->resubmit_list = NULL;
> > + }
> > + if (vq->resubmit) {
> > + free(vq->resubmit);
> > + vq->resubmit = NULL;
> > + }
> > }
> >
> > /*
> > @@ -589,6 +599,9 @@ rte_vhost_get_vhost_vring(int vid, uint16_t
> vring_idx,
> > vring->kickfd = vq->kickfd;
> > vring->size = vq->size;
> >
> > + vring->inflight = vq->inflight;
> > + vring->resubmit = vq->resubmit;
> > +
> > return 0;
> > }
> >
> > @@ -617,6 +630,98 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
> > return 0;
> > }
> >
> > +int
> > +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx, uint16_t
> > +idx) {
> > + struct virtio_net *dev;
> > + struct vhost_virtqueue *vq;
> > +
> > + dev = get_device(vid);
> > + if (!dev)
> > + return -1;
> > +
> > + if (!(dev->features &
> > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> > + return 0;
>
> Shouldn't -1 be returned here also?
> I think this function should be called only if the feature negotiation succeeded.
>
> Returning an error here would help the calling application to know something
> went wrong.
>
> Same comment applies for the clear function.
If the feature is not set,set & clr function returns directly.
They are similar to empty functions.
The return value is - 1, which represents an error in function execution
and unexpected behavior.
> > +
> > + if (vring_idx >= VHOST_MAX_VRING)
> > + return -1;
> > +
> > + vq = dev->virtqueue[vring_idx];
> > + if (!vq)
> > + return -1;
> > +
> > + if (unlikely(!vq->inflight))
> > + return -1;
> > +
> > + vq->inflight->desc[idx].counter = vq->counter++;
> > + vq->inflight->desc[idx].inflight = 1;
> > + return 0;
> > +}
> > +
> > +int
> > +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
> > + uint16_t last_used_idx, uint16_t idx) {
> > + struct virtio_net *dev;
> > + struct vhost_virtqueue *vq;
> > +
> > + dev = get_device(vid);
> > + if (!dev)
> > + return -1;
> > +
> > + if (!(dev->features &
> > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> > + return 0;
> > +
> > + if (vring_idx >= VHOST_MAX_VRING)
> > + return -1;
> > +
> > + vq = dev->virtqueue[vring_idx];
> > + if (!vq)
> > + return -1;
> > +
> > + if (unlikely(!vq->inflight))
> > + return -1;
> > +
> > + rte_compiler_barrier();
> > +
> > + vq->inflight->desc[idx].inflight = 0;
> > +
> > + rte_compiler_barrier();
> > +
> > + vq->inflight->used_idx = last_used_idx;
> > + return 0;
> > +}
> > +
> > +int
> > +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx, uint16_t
> > +idx) {
> > + struct virtio_net *dev;
> > + struct vhost_virtqueue *vq;
> > +
> > + dev = get_device(vid);
> > + if (!dev)
> > + return -1;
> > +
> > + if (!(dev->features &
> > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> > + return 0;
> > +
> > + if (vring_idx >= VHOST_MAX_VRING)
> > + return -1;
> > +
> > + vq = dev->virtqueue[vring_idx];
> > + if (!vq)
> > + return -1;
> > +
> > + if (unlikely(!vq->inflight))
> > + return -1;
> > +
> > + vq->inflight->last_inflight_io = idx;
> > + return 0;
> > +}
>
> Are above functions supposed to be called in the hot path?
> If so, you might want to use unlikely for the error checks everywhere.
You're right. I'll modify it in the next version.
>
> > +
> > uint16_t
> > rte_vhost_avail_entries(int vid, uint16_t queue_id)
> > {
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
> > e9138dfab..3dace2ec3 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -128,6 +128,11 @@ struct vhost_virtqueue {
> > /* Physical address of used ring, for logging */
> > uint64_t log_guest_addr;
> >
> > + /* Inflight share memory info */
> > + struct inflight_info *inflight;
> > + struct resubmit_info *resubmit;
> > + uint64_t counter;
>
> counter and resubmit are too generic names.
I'll modify the names in the next version.
>
> > +
> > uint16_t nr_zmbuf;
> > uint16_t zmbuf_size;
> > uint16_t last_zmbuf_idx;
> > @@ -286,6 +291,12 @@ struct guest_page {
> > uint64_t size;
> > };
> >
> > +struct inflight_mem_info {
> > + int fd;
> > + void *addr;
> > + uint64_t size;
> > +};
> > +
> > /**
> > * Device structure contains all configuration information relating
> > * to the device.
> > @@ -303,6 +314,7 @@ struct virtio_net {
> > uint32_t nr_vring;
> > int dequeue_zero_copy;
> > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > + struct inflight_mem_info inflight_info;
> > #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> > char ifname[IF_NAME_SZ];
> > uint64_t log_size;
>
> Do you have some code example using these new APIs?
> It would help for reviewing the patch.
This patch is only valid for vhost blk.
The use of new two messages and the implementation of packed queue are
described in docs/vhost-user.txt.
I have submitted SPDK-related patches to use new API.
>
> Thanks,
> Maxime
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [dpdk-dev] 答复: [PATCH v4] vhost: support inflight share memory protocol feature
2019-05-22 11:04 ` Li,Lin(ACG Cloud)
@ 2019-06-12 8:07 ` Maxime Coquelin
0 siblings, 0 replies; 25+ messages in thread
From: Maxime Coquelin @ 2019-06-12 8:07 UTC (permalink / raw)
To: Li,Lin(ACG Cloud), Li Lin, tiwei.bie, zhihong.wang
Cc: dev, dariusz.stojaczyk, changpeng.liu, james.r.harris,
Xie,Yongji, Ni,Xun, Zhang,Yu(ACG Cloud)
On 5/22/19 1:04 PM, Li,Lin(ACG Cloud) wrote:
>
>
>> -----邮件原件-----
>> 发件人: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>> 发送时间: 2019年5月17日 23:47
>> 收件人: Li Lin <chuckylinchuckylin@gmail.com>; tiwei.bie@intel.com;
>> zhihong.wang@intel.com
>> 抄送: dev@dpdk.org; dariusz.stojaczyk@intel.com; changpeng.liu@intel.com;
>> james.r.harris@intel.com; Xie,Yongji <xieyongji@baidu.com>; Li,Lin(ACG Cloud)
>> <lilin24@baidu.com>; Ni,Xun <nixun@baidu.com>; Zhang,Yu(ACG Cloud)
>> <zhangyu31@baidu.com>
>> 主题: Re: [PATCH v4] vhost: support inflight share memory protocol feature
>>
>>
>>
>> On 5/5/19 11:02 AM, Li Lin wrote:
>>> From: Li Lin <lilin24@baidu.com>
>>>
>>> This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
>> and
>>> VHOST_USER_SET_INFLIGHT_FD to support transferring a shared buffer
>>> between qemu and backend.
>>>
>>> Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the shared buffer
>>> from backend. Then qemu should send it back through
>>> VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
>>>
>>> This shared buffer is used to process inflight I/O when backend
>>> reconnect.
>>>
>>> Signed-off-by: Li Lin <lilin24@baidu.com>
>>> Signed-off-by: Ni Xun <nixun@baidu.com>
>>> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
>>> ---
>>> v2 - add set&clr inflight desc functions
>>> v3 - simplified some functions implementation
>>> v4 - rework some data structures according to qemu doc
>>>
>>> lib/librte_vhost/rte_vhost.h | 83 ++++++++++
>>> lib/librte_vhost/rte_vhost_version.map | 3 +
>>> lib/librte_vhost/vhost.c | 105 ++++++++++++
>>> lib/librte_vhost/vhost.h | 12 ++
>>> lib/librte_vhost/vhost_user.c | 292
>> +++++++++++++++++++++++++++++++++
>>> lib/librte_vhost/vhost_user.h | 13 +-
>>> 6 files changed, 507 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/lib/librte_vhost/rte_vhost.h
>>> b/lib/librte_vhost/rte_vhost.h index d2c0c93f4..d4d709717 100644
>>> --- a/lib/librte_vhost/rte_vhost.h
>>> +++ b/lib/librte_vhost/rte_vhost.h
>>> @@ -71,6 +71,10 @@ extern "C" {
>>> #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
>>> #endif
>>>
>>> +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
>>> +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 #endif
>>> +
>>> /** Indicate whether protocol features negotiation is supported. */
>>> #ifndef VHOST_USER_F_PROTOCOL_FEATURES
>>> #define VHOST_USER_F_PROTOCOL_FEATURES 30
>>> @@ -98,12 +102,41 @@ struct rte_vhost_memory {
>>> struct rte_vhost_mem_region regions[];
>>> };
>>>
>>> +struct inflight_desc {
>>> + uint8_t inflight;
>>> + uint8_t padding[5];
>>> + uint16_t next;
>>> + uint64_t counter;
>>> +};
>>> +
>>> +struct inflight_info {
>>> + uint64_t features;
>>> + uint16_t version;
>>> + uint16_t desc_num;
>>> + uint16_t last_inflight_io;
>>> + uint16_t used_idx;
>>> + struct inflight_desc desc[0];
>>> +};
>>> +
>>> +struct resubmit_desc {
>>> + uint16_t index;
>>> + uint64_t counter;
>>> +};
>>> +
>>> +struct resubmit_info {
>>> + struct resubmit_desc *resubmit_list;
>>> + uint16_t resubmit_num;
>>> +};
>>> +
>>> struct rte_vhost_vring {
>>> struct vring_desc *desc;
>>> struct vring_avail *avail;
>>> struct vring_used *used;
>>> uint64_t log_guest_addr;
>>>
>>> + struct inflight_info *inflight;
>>> + struct resubmit_info *resubmit;
>>> +
>>> /** Deprecated, use rte_vhost_vring_call() instead. */
>>> int callfd;
>>>
>>> @@ -632,6 +665,56 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t
>> vring_idx,
>>> int rte_vhost_vring_call(int vid, uint16_t vring_idx);
>>>
>>> /**
>>> + * set inflight flag for a desc.
>>> + *
>>> + * @param vid
>>> + * vhost device ID
>>> + * @param vring_idx
>>> + * vring index
>>> + * @param idx
>>> + * inflight entry index
>>> + * @return
>>> + * 0 on success, -1 on failure
>>> + */
>>> +int __rte_experimental
>>> +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx,
>>> + uint16_t idx);
>>> +
>>> +/**
>>> + * clear inflight flag for a desc.
>>> + *
>>> + * @param vid
>>> + * vhost device ID
>>> + * @param vring_idx
>>> + * vring index
>>> + * @param last_used_idx
>>> + * next free used_idx
>>> + * @param idx
>>> + * inflight entry index
>>> + * @return
>>> + * 0 on success, -1 on failure
>>> + */
>>> +int __rte_experimental
>>> +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
>>> + uint16_t last_used_idx, uint16_t idx);
>>> +
>>> +/**
>>> + * set last inflight io index.
>>> + *
>>> + * @param vid
>>> + * vhost device ID
>>> + * @param vring_idx
>>> + * vring index
>>> + * @param idx
>>> + * inflight entry index
>>> + * @return
>>> + * 0 on success, -1 on failure
>>> + */
>>> +int __rte_experimental
>>> +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx,
>>> + uint16_t idx);
>>> +
>>> +/**
>>> * Get vhost RX queue avail count.
>>> *
>>> * @param vid
>>> diff --git a/lib/librte_vhost/rte_vhost_version.map
>>> b/lib/librte_vhost/rte_vhost_version.map
>>> index 5f1d4a75c..a992b3935 100644
>>> --- a/lib/librte_vhost/rte_vhost_version.map
>>> +++ b/lib/librte_vhost/rte_vhost_version.map
>>> @@ -87,4 +87,7 @@ EXPERIMENTAL {
>>> rte_vdpa_relay_vring_used;
>>> rte_vhost_extern_callback_register;
>>> rte_vhost_driver_set_protocol_features;
>>> + rte_vhost_set_inflight_desc;
>>> + rte_vhost_clr_inflight_desc;
>>> + rte_vhost_set_last_inflight_io;
>>> };
>>> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
>>> 163f4595e..03eab3551 100644
>>> --- a/lib/librte_vhost/vhost.c
>>> +++ b/lib/librte_vhost/vhost.c
>>> @@ -76,6 +76,16 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
>>> close(vq->callfd);
>>> if (vq->kickfd >= 0)
>>> close(vq->kickfd);
>>> + if (vq->inflight)
>>> + vq->inflight = NULL;
>>> + if (vq->resubmit->resubmit_list) {
>>> + free(vq->resubmit->resubmit_list);
>>> + vq->resubmit->resubmit_list = NULL;
>>> + }
>>> + if (vq->resubmit) {
>>> + free(vq->resubmit);
>>> + vq->resubmit = NULL;
>>> + }
>>> }
>>>
>>> /*
>>> @@ -589,6 +599,9 @@ rte_vhost_get_vhost_vring(int vid, uint16_t
>> vring_idx,
>>> vring->kickfd = vq->kickfd;
>>> vring->size = vq->size;
>>>
>>> + vring->inflight = vq->inflight;
>>> + vring->resubmit = vq->resubmit;
>>> +
>>> return 0;
>>> }
>>>
>>> @@ -617,6 +630,98 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
>>> return 0;
>>> }
>>>
>>> +int
>>> +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx, uint16_t
>>> +idx) {
>>> + struct virtio_net *dev;
>>> + struct vhost_virtqueue *vq;
>>> +
>>> + dev = get_device(vid);
>>> + if (!dev)
>>> + return -1;
>>> +
>>> + if (!(dev->features &
>>> + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
>>> + return 0;
>>
>> Shouldn't -1 be returned here also?
>> I think this function should be called only if the feature negotiation succeeded.
>>
>> Returning an error here would help the calling application to know something
>> went wrong.
>>
>> Same comment applies for the clear function.
>
> If the feature is not set,set & clr function returns directly.
> They are similar to empty functions.
> The return value is - 1, which represents an error in function execution
> and unexpected behavior.
>
>>> +
>>> + if (vring_idx >= VHOST_MAX_VRING)
>>> + return -1;
>>> +
>>> + vq = dev->virtqueue[vring_idx];
>>> + if (!vq)
>>> + return -1;
>>> +
>>> + if (unlikely(!vq->inflight))
>>> + return -1;
>>> +
>>> + vq->inflight->desc[idx].counter = vq->counter++;
>>> + vq->inflight->desc[idx].inflight = 1;
>>> + return 0;
>>> +}
>>> +
>>> +int
>>> +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
>>> + uint16_t last_used_idx, uint16_t idx) {
>>> + struct virtio_net *dev;
>>> + struct vhost_virtqueue *vq;
>>> +
>>> + dev = get_device(vid);
>>> + if (!dev)
>>> + return -1;
>>> +
>>> + if (!(dev->features &
>>> + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
>>> + return 0;
>>> +
>>> + if (vring_idx >= VHOST_MAX_VRING)
>>> + return -1;
>>> +
>>> + vq = dev->virtqueue[vring_idx];
>>> + if (!vq)
>>> + return -1;
>>> +
>>> + if (unlikely(!vq->inflight))
>>> + return -1;
>>> +
>>> + rte_compiler_barrier();
>>> +
>>> + vq->inflight->desc[idx].inflight = 0;
>>> +
>>> + rte_compiler_barrier();
>>> +
>>> + vq->inflight->used_idx = last_used_idx;
>>> + return 0;
>>> +}
>>> +
>>> +int
>>> +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx, uint16_t
>>> +idx) {
>>> + struct virtio_net *dev;
>>> + struct vhost_virtqueue *vq;
>>> +
>>> + dev = get_device(vid);
>>> + if (!dev)
>>> + return -1;
>>> +
>>> + if (!(dev->features &
>>> + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
>>> + return 0;
>>> +
>>> + if (vring_idx >= VHOST_MAX_VRING)
>>> + return -1;
>>> +
>>> + vq = dev->virtqueue[vring_idx];
>>> + if (!vq)
>>> + return -1;
>>> +
>>> + if (unlikely(!vq->inflight))
>>> + return -1;
>>> +
>>> + vq->inflight->last_inflight_io = idx;
>>> + return 0;
>>> +}
>>
>> Are above functions supposed to be called in the hot path?
>> If so, you might want to use unlikely for the error checks everywhere.
>
> You're right. I'll modify it in the next version.
>
>>
>>> +
>>> uint16_t
>>> rte_vhost_avail_entries(int vid, uint16_t queue_id)
>>> {
>>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
>>> e9138dfab..3dace2ec3 100644
>>> --- a/lib/librte_vhost/vhost.h
>>> +++ b/lib/librte_vhost/vhost.h
>>> @@ -128,6 +128,11 @@ struct vhost_virtqueue {
>>> /* Physical address of used ring, for logging */
>>> uint64_t log_guest_addr;
>>>
>>> + /* Inflight share memory info */
>>> + struct inflight_info *inflight;
>>> + struct resubmit_info *resubmit;
>>> + uint64_t counter;
>>
>> counter and resubmit are too generic names.
>
> I'll modify the names in the next version.
>
>>
>>> +
>>> uint16_t nr_zmbuf;
>>> uint16_t zmbuf_size;
>>> uint16_t last_zmbuf_idx;
>>> @@ -286,6 +291,12 @@ struct guest_page {
>>> uint64_t size;
>>> };
>>>
>>> +struct inflight_mem_info {
>>> + int fd;
>>> + void *addr;
>>> + uint64_t size;
>>> +};
>>> +
>>> /**
>>> * Device structure contains all configuration information relating
>>> * to the device.
>>> @@ -303,6 +314,7 @@ struct virtio_net {
>>> uint32_t nr_vring;
>>> int dequeue_zero_copy;
>>> struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
>>> + struct inflight_mem_info inflight_info;
>>> #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
>>> char ifname[IF_NAME_SZ];
>>> uint64_t log_size;
>>
>> Do you have some code example using these new APIs?
>> It would help for reviewing the patch.
>
> This patch is only valid for vhost blk.
This does not seem vhost-blk specifics, otherwise the requests names
would reflect that and it should live in spdk only as we have worked
with the spdk team so that they can handle their specific requests
directly in their lib.
This new feature is for backends handling the ring out-of-order,
including vhost-blk.
> The use of new two messages and the implementation of packed queue are
> described in docs/vhost-user.txt.
Thanks, that's now in QEMU's docs/interop/vhost-user.rst
Regarding packed ring, I understand you don't want to implement it now.
However, the new rte_vhost APIs you introduce need to specify it is
split-ring only in their names, or find a way to to make the API generic
but it seems difficult.
Regarding the feature, I wonder whether it could also be useful in case
of in-order handling to recover from crashes. Maybe not for split ring
where we have some workarounds since we can get the info from the rings,
but more for packed ring, where the wrap counter values aren't saved
anywhere except the backend.
> I have submitted SPDK-related patches to use new API.
Here is the link:
https://review.gerrithub.io/c/spdk/spdk/+/449418/1
>
>>
>> Thanks,
>> Maxime
>>
>
^ permalink raw reply [flat|nested] 25+ messages in thread