Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature

DPDK patches and discussions
 help / color / mirror / Atom feed

From: Tiwei Bie <tiwei.bie@intel.com>
To: Li Lin <chuckylinchuckylin@gmail.com>
Cc: maxime.coquelin@redhat.com, zhihong.wang@intel.com, dev@dpdk.org,
	dariusz.stojaczyk@intel.com, changpeng.liu@intel.com,
	james.r.harris@intel.com, lilin24 <lilin24@baidu.com>,
	Ni Xun <nixun@baidu.com>, Zhang Yu <zhangyu31@baidu.com>
Subject: Re: [dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature
Date: Sun, 28 Apr 2019 19:17:48 +0800	[thread overview]
Message-ID: <20190428111748.GA16470@___> (raw)
In-Reply-To: <1556271621-8594-1-git-send-email-chuckylinchuckylin@gmail.com>

On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
> From: lilin24 <lilin24@baidu.com>
> 
> This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
> buffer between qemu and backend.
> 
> Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
> shared buffer from backend. Then qemu should send it back
> through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
> 
> This shared buffer is used to process inflight I/O when backend
> reconnect.
> 
> Signed-off-by: lilin24 <lilin24@baidu.com>

You need to use your real name here.

> Signed-off-by: Ni Xun <nixun@baidu.com>
> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> 
> v2:
> - add set&clr inflight entry function
> v3:
> - simplified some function implementations
> 
> ---

You can put change logs here (i.e. after ---).

>  lib/librte_vhost/rte_vhost.h  |  53 ++++++++++
>  lib/librte_vhost/vhost.c      |  42 ++++++++
>  lib/librte_vhost/vhost.h      |  11 ++
>  lib/librte_vhost/vhost_user.c | 231 ++++++++++++++++++++++++++++++++++++++++++
>  lib/librte_vhost/vhost_user.h |  16 ++-
>  5 files changed, 351 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index d2c0c93f4..bc25591a8 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -71,6 +71,10 @@ extern "C" {
>  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
>  #endif
>  
> +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
> +#endif
> +
>  /** Indicate whether protocol features negotiation is supported. */
>  #ifndef VHOST_USER_F_PROTOCOL_FEATURES
>  #define VHOST_USER_F_PROTOCOL_FEATURES	30
> @@ -98,12 +102,26 @@ struct rte_vhost_memory {
>  	struct rte_vhost_mem_region regions[];
>  };
>  
> +typedef struct VhostUserInflightEntry {
> +	uint8_t inflight;
> +} VhostUserInflightEntry;
> +
> +typedef struct VhostInflightInfo {
> +	uint16_t version;
> +	uint16_t last_inflight_io;
> +	uint16_t used_idx;
> +	VhostUserInflightEntry desc[0];
> +} VhostInflightInfo;

Is there any details on above structure? Why does it not match
QueueRegionSplit or QueueRegionPacked structures described in
qemu/docs/interop/vhost-user.txt?

> +
>  struct rte_vhost_vring {
>  	struct vring_desc	*desc;
>  	struct vring_avail	*avail;
>  	struct vring_used	*used;
>  	uint64_t		log_guest_addr;
>  
> +	VhostInflightInfo       *inflight;
> +	int                inflight_flag;
> +
>  	/** Deprecated, use rte_vhost_vring_call() instead. */
>  	int			callfd;
>  
> @@ -632,6 +650,41 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
>  int rte_vhost_vring_call(int vid, uint16_t vring_idx);
>  
>  /**
> + * set inflight flag for a entry.
> + *
> + * @param vring
> + *  the structure to hold the requested vring info
> + * @param idx
> + *  inflight entry index
> + */
> +void rte_vhost_set_inflight(struct rte_vhost_vring *vring,
> +		uint16_t idx);
> +
> +/**
> + * clear inflight flag for a entry.
> + *
> + * @param vring
> + *  the structure to hold the requested vring info
> + * @param last_used_idx
> + *  next free used_idx
> + * @param idx
> + *  inflight entry index
> + */
> +void rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> +		uint16_t last_used_idx, uint16_t idx);
> +
> +/**
> + * set last inflight io index.
> + *
> + * @param vring
> + *  the structure to hold the requested vring info
> + * @param idx
> + *  inflight entry index
> + */
> +void rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring,
> +		uint16_t idx);

New APIs should be experimental and also need to be
added in rte_vhost_version.map file.

> +
> +/**
>   * Get vhost RX queue avail count.
>   *
>   * @param vid
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 163f4595e..9ba692935 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -76,6 +76,8 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
>  		close(vq->callfd);
>  	if (vq->kickfd >= 0)
>  		close(vq->kickfd);
> +	if (vq->inflight)
> +		vq->inflight = NULL;
>  }
>  
>  /*
> @@ -589,6 +591,11 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
>  	vring->kickfd  = vq->kickfd;
>  	vring->size    = vq->size;
>  
> +	vring->inflight = vq->inflight;
> +
> +	vring->inflight_flag = vq->inflight_flag;
> +	vq->inflight_flag = 0;

This will break the ABI. Better to introduce a new API to do this.

> +
>  	return 0;
>  }
>  
> @@ -617,6 +624,41 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
>  	return 0;
>  }
>  
> +void
> +rte_vhost_set_inflight(struct rte_vhost_vring *vring, uint16_t idx)
> +{
> +	VhostInflightInfo *inflight = vring->inflight;
> +	if (unlikely(!inflight))
> +		return;
> +	inflight->desc[idx].inflight = 1;
> +}
> +
> +void
> +rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> +	uint16_t last_used_idx, uint16_t idx)
> +{
> +	VhostInflightInfo *inflight = vring->inflight;
> +
> +	if (unlikely(!inflight))
> +		return;
> +
> +	rte_compiler_barrier();
> +	inflight->desc[idx].inflight = 0;
> +	rte_compiler_barrier();
> +	inflight->used_idx = last_used_idx;
> +}
> +
> +void
> +rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring, uint16_t idx)
> +{
> +	VhostInflightInfo *inflight = vring->inflight;
> +
> +	if (unlikely(!inflight))
> +		return;
> +
> +	inflight->last_inflight_io = idx;
> +}

The rte_vhost_vring is used to return information to apps.

IIUC, you want to update vq->inflight. So the rte_vhost_vring
param should be replaced by vid + vring_idx.

> +
>  uint16_t
>  rte_vhost_avail_entries(int vid, uint16_t queue_id)
>  {
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index e9138dfab..537d74c71 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -128,6 +128,10 @@ struct vhost_virtqueue {
>  	/* Physical address of used ring, for logging */
>  	uint64_t		log_guest_addr;
>  
> +	/* Inflight share memory info */
> +	VhostInflightInfo       *inflight;
> +	bool                inflight_flag;
> +
>  	uint16_t		nr_zmbuf;
>  	uint16_t		zmbuf_size;
>  	uint16_t		last_zmbuf_idx;
> @@ -286,6 +290,12 @@ struct guest_page {
>  	uint64_t size;
>  };
>  
> +typedef struct VuDevInflightInfo {
> +	int fd;
> +	void *addr;
> +	uint64_t size;
> +} VuDevInflightInfo;

Please follow DPDK's coding style when defining internal structure.

> +
>  /**
>   * Device structure contains all configuration information relating
>   * to the device.
> @@ -303,6 +313,7 @@ struct virtio_net {
>  	uint32_t		nr_vring;
>  	int			dequeue_zero_copy;
>  	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> +	VuDevInflightInfo inflight_info;
>  #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
>  	char			ifname[IF_NAME_SZ];
>  	uint64_t		log_size;
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
[...]
> +static uint32_t get_pervq_shm_size(uint16_t queue_size)
> +{
> +	return ALIGN_UP(sizeof(VhostUserInflightEntry) * queue_size +
> +		sizeof(uint16_t) * 3, INFLIGHT_ALIGNMENT);
> +}
> +
> +static int
> +vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> +		int main_fd __rte_unused)
> +{
> +	int fd;
> +	uint64_t mmap_size;
> +	void *addr;
> +	uint16_t num_queues, queue_size;
> +	struct virtio_net *dev = *pdev;
> +
> +	if (msg->size != sizeof(msg->payload.inflight)) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"Invalid get_inflight_fd message size is %d",
> +			msg->size);
> +		msg->payload.inflight.mmap_size = 0;

In this case, you can't touch the payload.

> +		return 0;

And an error should be returned.

> +	}
> +
> +	num_queues = msg->payload.inflight.num_queues;
> +	queue_size = msg->payload.inflight.queue_size;
> +
> +	RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
> +			msg->payload.inflight.num_queues);
> +	RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
> +			msg->payload.inflight.queue_size);
> +	mmap_size = num_queues * get_pervq_shm_size(queue_size);
> +
> +	addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
> +	if (!addr) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
> +			msg->payload.inflight.mmap_size = 0;
> +		return 0;

Should always return RTE_VHOST_MSG_RESULT_* in
message handler.

> +	}
> +	memset(addr, 0, mmap_size);
> +
> +	dev->inflight_info.addr = addr;
> +	dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
> +	dev->inflight_info.fd = msg->fds[0] = fd;
> +	msg->payload.inflight.mmap_offset = 0;
> +	msg->fd_num = 1;
> +
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +			"send inflight mmap_size: %lu\n",
> +			msg->payload.inflight.mmap_size);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +			"send inflight mmap_offset: %lu\n",
> +			msg->payload.inflight.mmap_offset);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +			"send inflight fd: %d\n", msg->fds[0]);
> +
> +	return RTE_VHOST_MSG_RESULT_REPLY;
> +}
> +
> +static int
> +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> +		int main_fd __rte_unused)
> +{
> +	int fd, i;
> +	uint64_t mmap_size, mmap_offset;
> +	uint16_t num_queues, queue_size;
> +	uint32_t pervq_inflight_size;
> +	void *rc;
> +	struct vhost_virtqueue *vq;
> +	struct virtio_net *dev = *pdev;
> +
> +	fd = msg->fds[0];
> +	if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> +			msg->size, fd);
> +		return -1;

Ditto.

> +	}
> +
> +	mmap_size = msg->payload.inflight.mmap_size;
> +	mmap_offset = msg->payload.inflight.mmap_offset;
> +	num_queues = msg->payload.inflight.num_queues;
> +	queue_size = msg->payload.inflight.queue_size;
> +	pervq_inflight_size = get_pervq_shm_size(queue_size);
> +
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"set_inflight_fd mmap_size: %lu\n", mmap_size);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"set_inflight_fd num_queues: %u\n", num_queues);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"set_inflight_fd queue_size: %u\n", queue_size);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"set_inflight_fd fd: %d\n", fd);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"set_inflight_fd pervq_inflight_size: %d\n",
> +		pervq_inflight_size);
> +
> +	if (dev->inflight_info.addr)
> +		munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +
> +	rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +			fd, mmap_offset);

Why call it rc? Maybe addr is a better name?

> +	if (rc == MAP_FAILED) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> +		return -1;

Should always return RTE_VHOST_MSG_RESULT_* in
message handler.

> +	}
> +
> +	if (dev->inflight_info.fd)
> +		close(dev->inflight_info.fd);
> +
> +	dev->inflight_info.fd = fd;
> +	dev->inflight_info.addr = rc;
> +	dev->inflight_info.size = mmap_size;
> +
> +	for (i = 0; i < num_queues; i++) {
> +		vq = dev->virtqueue[i];
> +		vq->inflight = (VhostInflightInfo *)rc;
> +		rc = (void *)((char *)rc + pervq_inflight_size);
> +	}
> +
> +	return RTE_VHOST_MSG_RESULT_OK;
> +}
> +
>  static int
>  vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
>  			int main_fd __rte_unused)
> @@ -1202,6 +1402,29 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
>  }
>  
>  static int
> +vhost_check_queue_inflights(struct vhost_virtqueue *vq)
> +{
> +	uint16_t i = 0;
> +
> +	if ((!vq->inflight))
> +		return RTE_VHOST_MSG_RESULT_ERR;

Should check whether the feature is negotiated first.

> +
> +	if (!vq->inflight->version) {
> +		vq->inflight->version = INFLIGHT_VERSION;
> +		return RTE_VHOST_MSG_RESULT_OK;
> +	}
> +
> +	for (i = 0; i < vq->size; i++) {
> +		if (vq->inflight->desc[i].inflight == 1) {
> +			vq->inflight_flag = 1;
> +			break;
> +		}
> +	}
> +
> +	return RTE_VHOST_MSG_RESULT_OK;
> +}
> +
> +static int
>  vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
>  			int main_fd __rte_unused)
>  {
> @@ -1242,6 +1465,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
>  		close(vq->kickfd);
>  	vq->kickfd = file.fd;
>  
> +	if (vhost_check_queue_inflights(vq)) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"Failed to inflights for vq: %d\n", file.index);
> +		return RTE_VHOST_MSG_RESULT_ERR;
> +	}
> +
>  	return RTE_VHOST_MSG_RESULT_OK;
>  }
>  
> @@ -1762,6 +1991,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
>  	[VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
>  	[VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
>  	[VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
> +	[VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
> +	[VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
>  };
>  
>  
> diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> index 2a650fe4b..35f969b1b 100644
> --- a/lib/librte_vhost/vhost_user.h
> +++ b/lib/librte_vhost/vhost_user.h
> @@ -23,7 +23,8 @@
>  					 (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
>  					 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
>  					 (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> -					 (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> +					(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> +					(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))

It will advertise this feature for builtin net and crypto
backends. It's probably not what you intended.

>  
>  typedef enum VhostUserRequest {
>  	VHOST_USER_NONE = 0,
> @@ -54,7 +55,9 @@ typedef enum VhostUserRequest {
>  	VHOST_USER_POSTCOPY_ADVISE = 28,
>  	VHOST_USER_POSTCOPY_LISTEN = 29,
>  	VHOST_USER_POSTCOPY_END = 30,
> -	VHOST_USER_MAX = 31
> +	VHOST_USER_GET_INFLIGHT_FD = 31,
> +	VHOST_USER_SET_INFLIGHT_FD = 32,
> +	VHOST_USER_MAX = 33
>  } VhostUserRequest;
>  
>  typedef enum VhostUserSlaveRequest {
> @@ -112,6 +115,13 @@ typedef struct VhostUserVringArea {
>  	uint64_t offset;
>  } VhostUserVringArea;
>  
> +typedef struct VhostUserInflight {
> +	uint64_t mmap_size;
> +	uint64_t mmap_offset;
> +	uint16_t num_queues;
> +	uint16_t queue_size;
> +} VhostUserInflight;
> +
>  typedef struct VhostUserMsg {
>  	union {
>  		uint32_t master; /* a VhostUserRequest value */
> @@ -131,6 +141,7 @@ typedef struct VhostUserMsg {
>  		struct vhost_vring_addr addr;
>  		VhostUserMemory memory;
>  		VhostUserLog    log;
> +		VhostUserInflight inflight;
>  		struct vhost_iotlb_msg iotlb;
>  		VhostUserCryptoSessionParam crypto_session;
>  		VhostUserVringArea area;
> @@ -148,6 +159,7 @@ typedef struct VhostUserMsg {
>  /* vhost_user.c */
>  int vhost_user_msg_handler(int vid, int fd);
>  int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
> +void *inflight_mem_alloc(const char *name, size_t size, int *fd);
>  
>  /* socket.c */
>  int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
> -- 
> 2.11.0
>

next prev parent reply	other threads:[~2019-04-28 11:18 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-25  7:56 [dpdk-dev] [PATCH] [resend] " Li Lin
2019-04-25  7:56 ` Li Lin
2019-04-25 10:56 ` lin li
2019-04-25 10:56   ` lin li
2019-04-25 11:10 ` [dpdk-dev] [PATCH v2] " Li Lin
2019-04-25 11:10   ` Li Lin
2019-04-26  9:40   ` [dpdk-dev] [PATCH v3] " Li Lin
2019-04-26  9:40     ` Li Lin
2019-04-28 11:17     ` Tiwei Bie [this message]
2019-04-28 11:17       ` Tiwei Bie
2019-04-29  4:07       ` lin li
2019-04-29  4:07         ` lin li
2019-04-29 10:54         ` Tiwei Bie
2019-04-29 10:54           ` Tiwei Bie
2019-04-30  8:37           ` lin li
2019-04-30  8:37             ` lin li
2019-05-05  9:02     ` [dpdk-dev] [PATCH v4] " Li Lin
2019-05-05  9:02       ` Li Lin
2019-05-17 15:47       ` Maxime Coquelin
2019-05-20  2:13         ` Tiwei Bie
2019-05-21  8:29           ` lin li
2019-05-21  8:46             ` Tiwei Bie
2019-05-22 10:15               ` [dpdk-dev] 答复: " Li,Lin(ACG Cloud)
2019-05-22 11:04         ` Li,Lin(ACG Cloud)
2019-06-12  8:07           ` Maxime Coquelin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190428111748.GA16470@___ \
    --to=tiwei.bie@intel.com \
    --cc=changpeng.liu@intel.com \
    --cc=chuckylinchuckylin@gmail.com \
    --cc=dariusz.stojaczyk@intel.com \
    --cc=dev@dpdk.org \
    --cc=james.r.harris@intel.com \
    --cc=lilin24@baidu.com \
    --cc=maxime.coquelin@redhat.com \
    --cc=nixun@baidu.com \
    --cc=zhangyu31@baidu.com \
    --cc=zhihong.wang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).