* [dpdk-dev] [PATCH v4 0/2] *** vhost support inflight share memory protocol feature ***
       [not found] <20190715202858.49624>
@ 2019-07-25 21:23 ` JinYu
  2019-07-25 21:23 ` [dpdk-dev] [PATCH v4 1/2] vhost: support inflight share memory protocol feature JinYu
  2019-07-25 21:23 ` [dpdk-dev] [PATCH v4 2/2] vhost: Add vhost-user-blk example which support inflight JinYu
  0 siblings, 2 replies; 4+ messages in thread
From: JinYu @ 2019-07-25 21:23 UTC (permalink / raw)
  To: dev; +Cc: changpeng.liu, maxime.coquelin, tiwei.bie, zhihong.wang, JinYu

This patch set introduces two new messages, VHOST_USER_GET_INFLIGHT_FD and
VHOST_USER_SET_INFLIGHT_FD, to support transferring a shared buffer between
qemu and the backend. Both split and packed rings are now supported. The
example code shows how these APIs work.

JinYu (2):
  vhost: support inflight share memory protocol feature
  vhost: Add vhost-user-blk example which support inflight

 examples/vhost_blk/Makefile            |   67 ++
 examples/vhost_blk/blk.c               |  122 +++
 examples/vhost_blk/blk_spec.h          |   95 +++
 examples/vhost_blk/meson.build         |   20 +
 examples/vhost_blk/vhost_blk.c         | 1054 ++++++++++++++++++++++++
 examples/vhost_blk/vhost_blk.h         |  114 +++
 examples/vhost_blk/vhost_blk_compat.c  |  193 +++++
 lib/librte_vhost/rte_vhost.h           |  301 ++++++-
 lib/librte_vhost/rte_vhost_version.map |   12 +
 lib/librte_vhost/vhost.c               |  398 ++++++++-
 lib/librte_vhost/vhost.h               |   54 +-
 lib/librte_vhost/vhost_user.c          |  418 +++++++++-
 lib/librte_vhost/vhost_user.h          |   13 +-
 13 files changed, 2832 insertions(+), 29 deletions(-)
 create mode 100644 examples/vhost_blk/Makefile
 create mode 100644 examples/vhost_blk/blk.c
 create mode 100644 examples/vhost_blk/blk_spec.h
 create mode 100644 examples/vhost_blk/meson.build
 create mode 100644 examples/vhost_blk/vhost_blk.c
 create mode 100644 examples/vhost_blk/vhost_blk.h
 create mode 100644 examples/vhost_blk/vhost_blk_compat.c

-- 
2.17.2

^ permalink raw reply	[flat|nested] 4+ messages in thread
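As a point of reference, below is a minimal sketch of how a backend could
advertise the new protocol feature bit when registering its vhost-user
socket. The socket path, error handling and the explicit OR-in of the bit
are illustrative assumptions, not code taken from this series:

/* Sketch only: opt a vhost-user socket in to the inflight protocol
 * feature (bit 12, defined by this series in rte_vhost.h). */
#include <rte_vhost.h>

static int
register_inflight_backend(const char *path)
{
        uint64_t protocol_features = 0;

        if (rte_vhost_driver_register(path, 0) != 0)
                return -1;

        if (rte_vhost_driver_get_protocol_features(path,
                        &protocol_features) != 0)
                return -1;

        /* assumption: add the bit on top of whatever the library reports */
        protocol_features |= 1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;

        if (rte_vhost_driver_set_protocol_features(path,
                        protocol_features) != 0)
                return -1;

        return rte_vhost_driver_start(path);
}

Once the feature is negotiated, qemu fetches the shared buffer with
VHOST_USER_GET_INFLIGHT_FD and hands it back with VHOST_USER_SET_INFLIGHT_FD
on every (re)start of vhost-user.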
* [dpdk-dev] [PATCH v4 1/2] vhost: support inflight share memory protocol feature
  2019-07-25 21:23 ` [dpdk-dev] [PATCH v4 0/2] *** vhost support inflight share memory protocol feature *** JinYu
@ 2019-07-25 21:23 ` JinYu
  2019-07-25 21:23 ` [dpdk-dev] [PATCH v4 2/2] vhost: Add vhost-user-blk example which support inflight JinYu
  1 sibling, 0 replies; 4+ messages in thread
From: JinYu @ 2019-07-25 21:23 UTC (permalink / raw)
  To: dev
  Cc: changpeng.liu, maxime.coquelin, tiwei.bie, zhihong.wang, JinYu,
      Lin Li, Xun Ni, Yu Zhang

This patch introduces two new messages, VHOST_USER_GET_INFLIGHT_FD and
VHOST_USER_SET_INFLIGHT_FD, to support transferring a shared buffer
between qemu and the backend.

First, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the shared buffer from
the backend. Qemu then sends it back through VHOST_USER_SET_INFLIGHT_FD
each time vhost-user is started. This shared buffer is used to process
inflight I/O when the backend reconnects.

Signed-off-by: Lin Li <lilin24@baidu.com>
Signed-off-by: Xun Ni <nixun@baidu.com>
Signed-off-by: Yu Zhang <zhangyu31@baidu.com>
Signed-off-by: Jin Yu <jin.yu@intel.com>
---
v1 - specify the APIs are split-ring only
v2 - fix APIs and judge split or packed
v3 - add rte_vhost_ prefix and fix one issue
v4 - add the packed ring support
---
 lib/librte_vhost/rte_vhost.h           | 301 +++++++++++++++++-
 lib/librte_vhost/rte_vhost_version.map |  12 +
 lib/librte_vhost/vhost.c               | 398 ++++++++++++++++++++++-
 lib/librte_vhost/vhost.h               |  54 ++--
 lib/librte_vhost/vhost_user.c          | 418 ++++++++++++++++++++++++-
 lib/librte_vhost/vhost_user.h          |  13 +-
 6 files changed, 1167 insertions(+), 29 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 0226b3eff..417ad8b16 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -11,6 +11,7 @@
  */
 
 #include <stdint.h>
+#include <stdbool.h>
 #include <sys/eventfd.h>
 
 #include <rte_memory.h>
@@ -71,6 +72,10 @@ extern "C" {
 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
 #endif
 
+#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
+#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
+#endif
+
 /** Indicate whether protocol features negotiation is supported.
*/ #ifndef VHOST_USER_F_PROTOCOL_FEATURES #define VHOST_USER_F_PROTOCOL_FEATURES 30 @@ -98,10 +103,85 @@ struct rte_vhost_memory { struct rte_vhost_mem_region regions[]; }; +struct inflight_desc_packed { + uint8_t inflight; + uint8_t padding; + uint16_t next; + uint16_t last; + uint16_t num; + uint64_t counter; + uint16_t id; + uint16_t flags; + uint32_t len; + uint64_t addr; +}; + +struct inflight_info_packed { + uint64_t features; + uint16_t version; + uint16_t desc_num; + uint16_t free_head; + uint16_t old_free_head; + uint16_t used_idx; + uint16_t old_used_idx; + uint8_t used_wrap_counter; + uint8_t old_used_wrap_counter; + uint8_t padding[7]; + struct inflight_desc_packed desc[0]; +}; + +struct rte_vhost_resubmit_desc { + uint16_t index; + uint64_t counter; +}; + +struct rte_vhost_resubmit_info { + struct rte_vhost_resubmit_desc *resubmit_list; + uint16_t resubmit_num; +}; + +struct rte_vhost_ring_inflight{ + union { + struct inflight_info_split *inflight_split; + struct inflight_info_packed *inflight_packed; + }; + + struct rte_vhost_resubmit_info *resubmit_inflight; +}; + +struct vring_packed_desc { + uint64_t addr; + uint32_t len; + uint16_t id; + uint16_t flags; +}; + +struct vring_split_desc { + uint64_t addr; + uint32_t len; + uint16_t flags; + uint16_t next; +}; + +struct vring_packed_desc_event { + uint16_t off_wrap; + uint16_t flags; +}; + struct rte_vhost_vring { - struct vring_desc *desc; - struct vring_avail *avail; - struct vring_used *used; + union { + struct vring_desc *desc; + struct vring_packed_desc *desc_packed; + }; + union { + struct vring_avail *avail; + struct vring_packed_desc_event *driver_event; + }; + union { + struct vring_used *used; + struct vring_packed_desc_event *device_event; + }; + uint64_t log_guest_addr; /** Deprecated, use rte_vhost_vring_call() instead. */ @@ -603,6 +683,33 @@ uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, */ int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); +/** + * Get vq is packed + * + * @param vid + * vhost device ID + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_vq_is_packed(int vid); + +/** + * Get guest inflight vring info, including inflight ring and resubmit list. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested inflight vring info + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx, + struct rte_vhost_ring_inflight *vring); + /** * Get guest vring info, including the vring address, vring size, etc. * @@ -616,7 +723,7 @@ int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); * 0 on success, -1 on failure */ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, - struct rte_vhost_vring *vring); + struct rte_vhost_vring *vring); /** * Notify the guest that used descriptors have been added to the vring. This @@ -631,6 +738,112 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, */ int rte_vhost_vring_call(int vid, uint16_t vring_idx); +/** + * set split inflight descriptor. 
+ * + * This function save descriptors that has been comsumed in available + * ring + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t idx); + +/** + * set packed inflight descriptor and get corresponding inflight entry + * + * This function save descriptors that has been comsumed + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_set_inflight_desc_packed(int vid, uint16_t vring_idx, + uint16_t head, uint16_t last, uint16_t *inflight_entry); + +/** + * save the head of list that the last batch of used descriptors. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * descriptor entry index + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_set_last_inflight_io_split(int vid, uint16_t vring_idx, + uint16_t idx); + +/** + * update the inflight free_head, used_idx and used_wrap_counter. + * + * This function will update status first before updating descriptors + * to used + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_set_last_inflight_io_packed(int vid, uint16_t vring_idx, + uint16_t head); + +/** + * clear the split inflight status. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_used_idx + * last used idx of used ring + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t last_used_idx, uint16_t idx); + +/** + * clear the packed inflight status. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param head + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_clr_inflight_desc_packed(int vid, uint16_t vring_idx, + uint16_t head); + /** * Get vhost RX queue avail count. 
* @@ -676,6 +889,68 @@ int __rte_experimental rte_vhost_get_vring_base(int vid, uint16_t queue_id, uint16_t *last_avail_idx, uint16_t *last_used_idx); +/** + * Get avail_wrap_counter/used_wrap_counter of the vhost virtqueue + * + * @param vid + * vhost device ID + * @param queue_id + * vhost queue index + * @param avail_wrap_counter + * vhost avail_wrap_counter to get + * @param used_wrap_counter + * vhost used_wrap_counter to get + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_get_vring_base_counter(int vid, uint16_t queue_id, + bool *avail_wrap_counter, bool *used_wrap_counter); + +/** + * Get last_avail/last_used of the vhost virtqueue + * + * This function is designed for the reconnection and it's specific for + * the packed ring as we can get the two parameters from the inflight + * queueregion + * + * @param vid + * vhost device ID + * @param queue_id + * vhost queue index + * @param last_avail_idx + * vhost last_avail_idx to get + * @param last_used_idx + * vhost last_used_idx to get + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_get_vring_base_from_inflight(int vid, uint16_t queue_id, + uint16_t *last_avail_idx, uint16_t *last_used_idx); + +/** + * Get avail_wrap_counter/used_wrap_counter of the vhost virtqueue + * + * This function is designed for the reconnection and it's specific for + * the packed ring as we can get the two parameters from the inflight + * queueregion + * + * @param vid + * vhost device ID + * @param queue_id + * vhost queue index + * @param avail_wrap_counter + * vhost avail_wrap_counter to get + * @param used_wrap_counter + * vhost used_wrap_counter to get + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_get_vring_base_counter_from_inflight(int vid, + uint16_t queue_id, bool *avail_wrap_counter, bool *used_wrap_counter); + /** * Set last_avail/used_idx of the vhost virtqueue * @@ -694,6 +969,24 @@ int __rte_experimental rte_vhost_set_vring_base(int vid, uint16_t queue_id, uint16_t last_avail_idx, uint16_t last_used_idx); +/** + * Set avail_wrap_counter/used_wrap_counter of the vhost virtqueue + * + * @param vid + * vhost device ID + * @param queue_id + * vhost queue index + * @param avail_wrap_counter + * avail_wrap_counter to set + * @param used_wrap_counter + * used_wrap_counter to set + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_set_vring_base_counter(int vid, uint16_t queue_id, + bool avail_wrap_counter, bool used_wrap_counter); + /** * Register external message handling callbacks * diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map index 5f1d4a75c..99f1134ea 100644 --- a/lib/librte_vhost/rte_vhost_version.map +++ b/lib/librte_vhost/rte_vhost_version.map @@ -87,4 +87,16 @@ EXPERIMENTAL { rte_vdpa_relay_vring_used; rte_vhost_extern_callback_register; rte_vhost_driver_set_protocol_features; + rte_vhost_set_inflight_desc_split; + rte_vhost_clr_inflight_desc_split; + rte_vhost_set_last_inflight_io_split; + rte_vhost_get_vhost_ring_inflight; + rte_vhost_vq_is_packed; + rte_vhost_set_inflight_desc_packed; + rte_vhost_clr_inflight_desc_packed; + rte_vhost_set_last_inflight_io_packed; + rte_vhost_get_vring_base_counter; + rte_vhost_get_vring_base_from_inflight; + rte_vhost_get_vring_base_counter_from_inflight; + rte_vhost_set_vring_base_counter; }; diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index 981837b5d..2d958adc9 100644 --- a/lib/librte_vhost/vhost.c 
+++ b/lib/librte_vhost/vhost.c @@ -242,6 +242,31 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy) close(vq->kickfd); } +void +cleanup_vq_inflight(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) + return; + + if (vq_is_packed(dev)) { + if (vq->inflight_packed) + vq->inflight_packed = NULL; + } else { + if (vq->inflight_split) + vq->inflight_split = NULL; + } + + if (vq->resubmit_inflight) { + if (vq->resubmit_inflight->resubmit_list) { + free(vq->resubmit_inflight->resubmit_list); + vq->resubmit_inflight->resubmit_list = NULL; + } + free(vq->resubmit_inflight); + vq->resubmit_inflight = NULL; + } +} + /* * Unmap any memory, close any file descriptors and * free any memory owned by a device. @@ -253,8 +278,10 @@ cleanup_device(struct virtio_net *dev, int destroy) vhost_backend_cleanup(dev); - for (i = 0; i < dev->nr_vring; i++) + for (i = 0; i < dev->nr_vring; i++) { cleanup_vq(dev->virtqueue[i], destroy); + cleanup_vq_inflight(dev, dev->virtqueue[i]); + } } void @@ -726,6 +753,41 @@ rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) return 0; } +int +rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx, + struct rte_vhost_ring_inflight *vring) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL || vring == NULL) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + if (vq_is_packed(dev)) { + if (!vq->inflight_packed) + return -1; + + vring->inflight_packed = vq->inflight_packed; + } else { + if (!vq->inflight_split) + return -1; + + vring->inflight_split = vq->inflight_split; + } + + vring->resubmit_inflight = vq->resubmit_inflight; + + return 0; +} + int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, struct rte_vhost_vring *vring) @@ -744,11 +806,17 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, if (!vq) return -1; - vring->desc = vq->desc; - vring->avail = vq->avail; - vring->used = vq->used; - vring->log_guest_addr = vq->log_guest_addr; + if (vq_is_packed(dev)) { + vring->desc_packed = vq->desc_packed; + vring->driver_event = vq->driver_event; + vring->device_event = vq->device_event; + } else { + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + } + vring->log_guest_addr = vq->log_guest_addr; vring->callfd = vq->callfd; vring->kickfd = vq->kickfd; vring->size = vq->size; @@ -781,6 +849,257 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx) return 0; } +int +rte_vhost_vq_is_packed(int vid) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (unlikely(!dev)) + return -1; + + return vq_is_packed(dev); +} + +int +rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx, uint16_t idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (unlikely(!dev)) + return -1; + + if (unlikely(!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))) + return 0; + + if (unlikely(vq_is_packed(dev))) + return -1; + + if (unlikely(vring_idx >= VHOST_MAX_VRING)) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (unlikely(!vq)) + return -1; + + if (unlikely(!vq->inflight_split)) + return -1; + + vq->inflight_split->desc[idx].counter = vq->global_counter++; + vq->inflight_split->desc[idx].inflight = 1; + return 0; +} + +int rte_vhost_set_inflight_desc_packed(int vid, uint16_t vring_idx, + uint16_t head, uint16_t last, uint16_t 
*inflight_entry) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + uint16_t old_free_head, free_head; + + dev = get_device(vid); + if (unlikely(!dev)) + return -1; + + if (unlikely(!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))) + return 0; + + if (unlikely(!vq_is_packed(dev))) + return -1; + + if (unlikely(vring_idx >= VHOST_MAX_VRING)) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (unlikely(!vq)) + return -1; + + if (unlikely(!vq->inflight_split)) + return -1; + + free_head = vq->inflight_packed->free_head; + old_free_head = vq->inflight_packed->old_free_head; + + /* init header descriptor */ + vq->inflight_packed->desc[old_free_head].num = 0; + vq->inflight_packed->desc[old_free_head].counter = vq->global_counter++; + vq->inflight_packed->desc[old_free_head].inflight = 1; + + while (head != ((last + 1) & (vq->size - 1))) { + vq->inflight_packed->desc[old_free_head].num++; + vq->inflight_packed->desc[free_head].addr = vq->desc_packed[head].addr; + vq->inflight_packed->desc[free_head].len = vq->desc_packed[head].len; + vq->inflight_packed->desc[free_head].flags = vq->desc_packed[head].flags; + vq->inflight_packed->desc[free_head].id = vq->desc_packed[head].id; + vq->inflight_packed->desc[old_free_head].last = free_head; + free_head = vq->inflight_packed->desc[free_head].next; + vq->inflight_packed->free_head = free_head; + head = (head + 1) & (vq->size - 1); + } + + vq->inflight_packed->old_free_head = free_head; + *inflight_entry = old_free_head; + + return 0; +} + +int +rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t last_used_idx, uint16_t idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (unlikely(!dev)) + return -1; + + if (unlikely(!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))) + return 0; + + if (unlikely(vq_is_packed(dev))) + return -1; + + if (unlikely(vring_idx >= VHOST_MAX_VRING)) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (unlikely(!vq)) + return -1; + + if (unlikely(!vq->inflight_split)) + return -1; + + rte_compiler_barrier(); + + vq->inflight_split->desc[idx].inflight = 0; + + rte_compiler_barrier(); + + vq->inflight_split->used_idx = last_used_idx; + return 0; +} + +int +rte_vhost_clr_inflight_desc_packed(int vid, uint16_t vring_idx, + uint16_t head) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (unlikely(!dev)) + return -1; + + if (unlikely(!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))) + return 0; + + if (unlikely(!vq_is_packed(dev))) + return -1; + + if (unlikely(vring_idx >= VHOST_MAX_VRING)) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (unlikely(!vq)) + return -1; + + if (unlikely(!vq->inflight_packed)) + return -1; + + rte_compiler_barrier(); + + vq->inflight_packed->desc[head].inflight = 0; + + rte_compiler_barrier(); + + vq->inflight_packed->old_free_head = vq->inflight_packed->free_head; + vq->inflight_packed->old_used_idx = vq->inflight_packed->used_idx; + vq->inflight_packed->old_used_wrap_counter = + vq->inflight_packed->used_wrap_counter; + + return 0; +} + +int +rte_vhost_set_last_inflight_io_split(int vid, uint16_t vring_idx, + uint16_t idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (unlikely(!dev)) + return -1; + + if (unlikely(!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))) + return 0; + + if (unlikely(vq_is_packed(dev))) + return 
-1; + + if (unlikely(vring_idx >= VHOST_MAX_VRING)) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + if (unlikely(!vq->inflight_split)) + return -1; + + vq->inflight_split->last_inflight_io = idx; + return 0; +} + +int +rte_vhost_set_last_inflight_io_packed(int vid, uint16_t vring_idx, + uint16_t head) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (unlikely(!dev)) + return -1; + + if (unlikely(!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))) + return 0; + + if (unlikely(!vq_is_packed(dev))) + return -1; + + if (unlikely(vring_idx >= VHOST_MAX_VRING)) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + if (unlikely(!vq->inflight_packed)) + return -1; + + vq->inflight_packed->desc[vq->inflight_packed->desc[head].last].next = + vq->inflight_packed->free_head; + vq->inflight_packed->free_head = head; + vq->inflight_packed->used_idx += vq->inflight_packed->desc[head].num; + if (vq->inflight_packed->used_idx >= vq->inflight_packed->desc_num) { + vq->inflight_packed->used_idx &= vq->inflight_packed->desc_num - 1; + vq->inflight_packed->used_wrap_counter = + !vq->inflight_packed->used_wrap_counter; + } + + return 0; +} + uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id) { @@ -950,6 +1269,61 @@ int rte_vhost_get_vring_base(int vid, uint16_t queue_id, return 0; } +int rte_vhost_get_vring_base_counter(int vid, uint16_t queue_id, + bool *avail_wrap_counter, bool *used_wrap_counter) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL || avail_wrap_counter == NULL || used_wrap_counter == NULL) + return -1; + + *avail_wrap_counter = dev->virtqueue[queue_id]->avail_wrap_counter; + *used_wrap_counter = dev->virtqueue[queue_id]->used_wrap_counter; + + return 0; +} + +int rte_vhost_get_vring_base_from_inflight(int vid, uint16_t queue_id, + uint16_t *last_avail_idx, uint16_t *last_used_idx) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL || last_avail_idx == NULL || last_used_idx == NULL) + return -1; + + if (!vq_is_packed(dev)) + return -1; + + if (!dev->virtqueue[queue_id]->inflight_packed) + return -1; + + *last_avail_idx = dev->virtqueue[queue_id]->inflight_packed->old_used_idx; + *last_used_idx = *last_avail_idx; + + return 0; +} + +int rte_vhost_get_vring_base_counter_from_inflight(int vid, + uint16_t queue_id, bool *avail_wrap_counter, bool *used_wrap_counter) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL || avail_wrap_counter == NULL || used_wrap_counter == NULL) + return -1; + + if (!vq_is_packed(dev)) + return -1; + + if (!dev->virtqueue[queue_id]->inflight_packed) + return -1; + + *avail_wrap_counter = + dev->virtqueue[queue_id]->inflight_packed->old_used_wrap_counter; + *used_wrap_counter = *avail_wrap_counter; + + return 0; +} + int rte_vhost_set_vring_base(int vid, uint16_t queue_id, uint16_t last_avail_idx, uint16_t last_used_idx) { @@ -964,6 +1338,20 @@ int rte_vhost_set_vring_base(int vid, uint16_t queue_id, return 0; } +int rte_vhost_set_vring_base_counter(int vid, uint16_t queue_id, + bool avail_wrap_counter, bool used_wrap_counter) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -1; + + dev->virtqueue[queue_id]->avail_wrap_counter = avail_wrap_counter; + dev->virtqueue[queue_id]->used_wrap_counter = used_wrap_counter; + + return 0; +} + int rte_vhost_extern_callback_register(int vid, struct rte_vhost_user_extern_ops const * const ops, void *ctx) { diff --git 
a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 884befa85..30af53feb 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -88,6 +88,22 @@ struct vring_used_elem_packed { uint32_t count; }; +struct inflight_desc_split { + uint8_t inflight; + uint8_t padding[5]; + uint16_t next; + uint64_t counter; +}; + +struct inflight_info_split { + uint64_t features; + uint16_t version; + uint16_t desc_num; + uint16_t last_inflight_io; + uint16_t used_idx; + struct inflight_desc_split desc[0]; +}; + /** * Structure contains variables relevant to RX/TX virtqueues. */ @@ -128,6 +144,14 @@ struct vhost_virtqueue { /* Physical address of used ring, for logging */ uint64_t log_guest_addr; + /* inflight share memory info */ + union { + struct inflight_info_split *inflight_split; + struct inflight_info_packed *inflight_packed; + }; + struct rte_vhost_resubmit_info *resubmit_inflight; + uint64_t global_counter; + uint16_t nr_zmbuf; uint16_t zmbuf_size; uint16_t last_zmbuf_idx; @@ -215,24 +239,6 @@ struct vhost_msg { #define VIRTIO_F_VERSION_1 32 #endif -/* Declare packed ring related bits for older kernels */ -#ifndef VIRTIO_F_RING_PACKED - -#define VIRTIO_F_RING_PACKED 34 - -struct vring_packed_desc { - uint64_t addr; - uint32_t len; - uint16_t id; - uint16_t flags; -}; - -struct vring_packed_desc_event { - uint16_t off_wrap; - uint16_t flags; -}; -#endif - /* * Declare below packed ring defines unconditionally * as Kernel header might use different names. @@ -244,6 +250,10 @@ struct vring_packed_desc_event { #define VRING_EVENT_F_DISABLE 0x1 #define VRING_EVENT_F_DESC 0x2 +#ifndef VIRTIO_F_RING_PACKED +#define VIRTIO_F_RING_PACKED 34 +#endif + /* * Available and used descs are in same order */ @@ -286,6 +296,12 @@ struct guest_page { uint64_t size; }; +struct inflight_mem_info { + int fd; + void *addr; + uint64_t size; +}; + /** * Device structure contains all configuration information relating * to the device. @@ -303,6 +319,7 @@ struct virtio_net { uint32_t nr_vring; int dequeue_zero_copy; struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; + struct inflight_mem_info inflight_info; #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) char ifname[IF_NAME_SZ]; uint64_t log_size; @@ -467,6 +484,7 @@ void vhost_destroy_device(int); void vhost_destroy_device_notify(struct virtio_net *dev); void cleanup_vq(struct vhost_virtqueue *vq, int destroy); +void cleanup_vq_inflight(struct virtio_net *dev, struct vhost_virtqueue *vq); void free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq); int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c index c9e29ece8..1cf2edf93 100644 --- a/lib/librte_vhost/vhost_user.c +++ b/lib/librte_vhost/vhost_user.c @@ -31,6 +31,8 @@ #include <sys/stat.h> #include <sys/syscall.h> #include <assert.h> +#include <sys/syscall.h> +#include <asm/unistd.h> #ifdef RTE_LIBRTE_VHOST_NUMA #include <numaif.h> #endif @@ -49,6 +51,15 @@ #define VIRTIO_MIN_MTU 68 #define VIRTIO_MAX_MTU 65535 +#define INFLIGHT_ALIGNMENT 64 +#define INFLIGHT_VERSION 0xabcd +#define VIRTQUEUE_MAX_SIZE 1024 + +#define CLOEXEC 0x0001U + +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_NONE] = "VHOST_USER_NONE", [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", @@ -78,6 +89,8 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE", [VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN", [VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END", + [VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD", + [VHOST_USER_SET_INFLIGHT_FD] = "VHOST_USER_SET_INFLIGHT_FD", }; static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg); @@ -160,6 +173,16 @@ vhost_backend_cleanup(struct virtio_net *dev) dev->log_addr = 0; } + if (dev->inflight_info.addr) { + munmap(dev->inflight_info.addr, dev->inflight_info.size); + dev->inflight_info.addr = NULL; + } + + if (dev->inflight_info.fd > 0) { + close(dev->inflight_info.fd); + dev->inflight_info.fd = -1; + } + if (dev->slave_req_fd >= 0) { close(dev->slave_req_fd); dev->slave_req_fd = -1; @@ -306,6 +329,7 @@ vhost_user_set_features(struct virtio_net **pdev, struct VhostUserMsg *msg, dev->virtqueue[dev->nr_vring] = NULL; cleanup_vq(vq, 1); + cleanup_vq_inflight(dev, vq); free_vq(dev, vq); } } @@ -616,7 +640,6 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index) dev->vid); return dev; } - return dev; } @@ -1165,6 +1188,218 @@ virtio_is_ready(struct virtio_net *dev) return 1; } +static int mem_create(const char *name, unsigned int flags) +{ +#ifdef __NR_memfd_create + return syscall(__NR_memfd_create, name, flags); +#else + return -1; +#endif +} + +void *inflight_mem_alloc(const char *name, size_t size, int *fd) +{ + void *ptr; + int mfd = -1; + char fname[20] = "/tmp/memfd-XXXXXX"; + + *fd = -1; + mfd = mem_create(name, CLOEXEC); + if (mfd != -1) { + if (ftruncate(mfd, size) == -1) { + RTE_LOG(ERR, VHOST_CONFIG, + "ftruncate fail for alloc inflight buffer\n"); + close(mfd); + return NULL; + } + } else { + mfd = mkstemp(fname); + unlink(fname); + + if (mfd == -1) { + RTE_LOG(ERR, VHOST_CONFIG, + "mkstemp fail for alloc inflight buffer\n"); + return NULL; + } + + if (ftruncate(mfd, size) == -1) { + RTE_LOG(ERR, VHOST_CONFIG, + "ftruncate fail for alloc inflight buffer\n"); + close(mfd); + return NULL; + } + } + + ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0); + if (ptr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap fail for alloc inflight buffer\n"); + 
close(mfd); + return NULL; + } + + *fd = mfd; + return ptr; +} + +static uint32_t get_pervq_shm_size_split(uint16_t queue_size) +{ + return ALIGN_UP(sizeof(struct inflight_desc_split) * queue_size + + sizeof(uint64_t) * 1 + sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT); +} + +static uint32_t get_pervq_shm_size_packed(uint16_t queue_size) +{ + return ALIGN_UP(sizeof(struct inflight_desc_packed) * queue_size + + sizeof(uint64_t) * 1 + sizeof(uint16_t) * 6 + sizeof(uint8_t) * 9, + INFLIGHT_ALIGNMENT); +} + +static int +vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg, + int main_fd __rte_unused) +{ + int fd, i, j; + uint64_t pervq_inflight_size, mmap_size; + void *addr; + uint16_t num_queues, queue_size; + struct virtio_net *dev = *pdev; + struct inflight_info_packed *inflight_packed = NULL; + + if (msg->size != sizeof(msg->payload.inflight)) { + RTE_LOG(ERR, VHOST_CONFIG, + "Invalid get_inflight_fd message size is %d", + msg->size); + return RTE_VHOST_MSG_RESULT_ERR; + } + + num_queues = msg->payload.inflight.num_queues; + queue_size = msg->payload.inflight.queue_size; + + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n", + msg->payload.inflight.num_queues); + RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n", + msg->payload.inflight.queue_size); + + if (vq_is_packed(dev)) + pervq_inflight_size = get_pervq_shm_size_packed(queue_size); + else + pervq_inflight_size = get_pervq_shm_size_split(queue_size); + + mmap_size = num_queues * pervq_inflight_size; + addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd); + if (!addr) { + RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area"); + msg->payload.inflight.mmap_size = 0; + return RTE_VHOST_MSG_RESULT_ERR; + } + memset(addr, 0, mmap_size); + + dev->inflight_info.addr = addr; + dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size; + dev->inflight_info.fd = msg->fds[0] = fd; + msg->payload.inflight.mmap_offset = 0; + msg->fd_num = 1; + + if (vq_is_packed(dev)) { + for (i = 0; i < num_queues; i++) { + inflight_packed = (struct inflight_info_packed *)addr; + inflight_packed->used_wrap_counter = 1; + inflight_packed->old_used_wrap_counter = 1; + for (j = 0; j < queue_size; j++) + inflight_packed->desc[j].next = j + 1; + addr = (void *)((char *)addr + pervq_inflight_size); + } + } + + RTE_LOG(INFO, VHOST_CONFIG, + "send inflight mmap_size: %lu\n", + msg->payload.inflight.mmap_size); + RTE_LOG(INFO, VHOST_CONFIG, + "send inflight mmap_offset: %lu\n", + msg->payload.inflight.mmap_offset); + RTE_LOG(INFO, VHOST_CONFIG, + "send inflight fd: %d\n", msg->fds[0]); + + return RTE_VHOST_MSG_RESULT_REPLY; +} + +static int +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg, + int main_fd __rte_unused) +{ + int fd, i; + uint64_t mmap_size, mmap_offset; + uint16_t num_queues, queue_size; + uint32_t pervq_inflight_size; + void *addr; + struct vhost_virtqueue *vq; + struct virtio_net *dev = *pdev; + + fd = msg->fds[0]; + if (msg->size != sizeof(msg->payload.inflight) || fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n", + msg->size, fd); + return RTE_VHOST_MSG_RESULT_ERR; + } + + mmap_size = msg->payload.inflight.mmap_size; + mmap_offset = msg->payload.inflight.mmap_offset; + num_queues = msg->payload.inflight.num_queues; + queue_size = msg->payload.inflight.queue_size; + + if (vq_is_packed(dev)) { + pervq_inflight_size = get_pervq_shm_size_packed(queue_size); + } else { + pervq_inflight_size = 
get_pervq_shm_size_split(queue_size); + } + + RTE_LOG(INFO, VHOST_CONFIG, + "set_inflight_fd mmap_size: %lu\n", mmap_size); + RTE_LOG(INFO, VHOST_CONFIG, + "set_inflight_fd mmap_offset: %lu\n", mmap_offset); + RTE_LOG(INFO, VHOST_CONFIG, + "set_inflight_fd num_queues: %u\n", num_queues); + RTE_LOG(INFO, VHOST_CONFIG, + "set_inflight_fd queue_size: %u\n", queue_size); + RTE_LOG(INFO, VHOST_CONFIG, + "set_inflight_fd fd: %d\n", fd); + RTE_LOG(INFO, VHOST_CONFIG, + "set_inflight_fd pervq_inflight_size: %d\n", + pervq_inflight_size); + + if (dev->inflight_info.addr) + munmap(dev->inflight_info.addr, dev->inflight_info.size); + + addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, mmap_offset); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n"); + return RTE_VHOST_MSG_RESULT_ERR; + } + + if (dev->inflight_info.fd) + close(dev->inflight_info.fd); + + dev->inflight_info.fd = fd; + dev->inflight_info.addr = addr; + dev->inflight_info.size = mmap_size; + + for (i = 0; i < num_queues; i++) { + vq = dev->virtqueue[i]; + if (vq_is_packed(dev)) { + vq->inflight_packed = (struct inflight_info_packed *)addr; + vq->inflight_packed->desc_num = queue_size; + } else { + vq->inflight_split = (struct inflight_info_split *)addr; + vq->inflight_split->desc_num = queue_size; + } + addr = (void *)((char *)addr + pervq_inflight_size); + } + + return RTE_VHOST_MSG_RESULT_OK; +} + static int vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg, int main_fd __rte_unused) @@ -1201,6 +1436,171 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused, return RTE_VHOST_MSG_RESULT_OK; } +static int +resubmit_desc_compare(const void *a, const void *b) +{ + const struct rte_vhost_resubmit_desc *desc0 = + (const struct rte_vhost_resubmit_desc *)a; + const struct rte_vhost_resubmit_desc *desc1 = + (const struct rte_vhost_resubmit_desc *)b; + + if (desc1->counter > desc0->counter && + (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) + return 1; + + return -1; +} + +static int +vhost_check_queue_inflights_split(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + struct vring_used *used = vq->used; + uint16_t i = 0; + uint16_t resubmit_num = 0; + struct rte_vhost_resubmit_info *resubmit = NULL; + struct inflight_info_split *inflight_split; + + if (!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) + return RTE_VHOST_MSG_RESULT_OK; + + if ((!vq->inflight_split)) + return RTE_VHOST_MSG_RESULT_ERR; + + if (!vq->inflight_split->version) { + vq->inflight_split->version = INFLIGHT_VERSION; + return RTE_VHOST_MSG_RESULT_OK; + } + + inflight_split = vq->inflight_split; + vq->resubmit_inflight = NULL; + vq->global_counter = 0; + + if (inflight_split->used_idx != used->idx) { + inflight_split->desc[inflight_split->last_inflight_io].inflight = 0; + rte_compiler_barrier(); + inflight_split->used_idx = used->idx; + } + + for (i = 0; i < inflight_split->desc_num; i++) { + if (inflight_split->desc[i].inflight == 1) + resubmit_num++; + } + + vq->last_avail_idx += resubmit_num; + + if (resubmit_num) { + resubmit = calloc(1, sizeof(struct rte_vhost_resubmit_info)); + if (!resubmit) { + RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for resubmit info.\n"); + return RTE_VHOST_MSG_RESULT_ERR; + } + + resubmit->resubmit_list = calloc(resubmit_num, + sizeof(struct rte_vhost_resubmit_desc)); + if (!resubmit->resubmit_list) { + RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for inflight desc.\n"); + 
return RTE_VHOST_MSG_RESULT_ERR; + } + + for (i = 0; i < vq->inflight_split->desc_num; i++) { + if (vq->inflight_split->desc[i].inflight == 1) { + resubmit->resubmit_list[resubmit->resubmit_num].index = i; + resubmit->resubmit_list[resubmit->resubmit_num].counter = + inflight_split->desc[i].counter; + resubmit->resubmit_num++; + } + } + + if (resubmit->resubmit_num > 1) + qsort(resubmit->resubmit_list, resubmit->resubmit_num, + sizeof(struct rte_vhost_resubmit_desc), + resubmit_desc_compare); + + vq->global_counter = resubmit->resubmit_list[0].counter + 1; + vq->resubmit_inflight = resubmit; + } + + return RTE_VHOST_MSG_RESULT_OK; +} + +static int +vhost_check_queue_inflights_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + uint16_t i = 0; + uint16_t resubmit_num = 0, counter = 0; + struct rte_vhost_resubmit_info *resubmit = NULL; + struct inflight_info_packed *inflight_packed; + + if (!(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) + return RTE_VHOST_MSG_RESULT_OK; + + if (!vq->inflight_packed->version) { + vq->inflight_packed->version = INFLIGHT_VERSION; + return RTE_VHOST_MSG_RESULT_OK; + } + + if ((!vq->inflight_packed)) + return RTE_VHOST_MSG_RESULT_ERR; + + inflight_packed = vq->inflight_packed; + vq->resubmit_inflight = NULL; + vq->global_counter = 0; + + if (inflight_packed->used_idx != inflight_packed->old_used_idx) { + inflight_packed->used_idx = inflight_packed->old_used_idx; + inflight_packed->used_wrap_counter = + inflight_packed->old_used_wrap_counter; + inflight_packed->free_head = inflight_packed->old_free_head; + } + + for (i = 0; i < inflight_packed->desc_num; i++) { + if (inflight_packed->desc[i].inflight == 1) { + resubmit_num++; + counter += inflight_packed->desc[i].num; + } + } + + vq->last_avail_idx += counter; + + if (resubmit_num) { + resubmit = calloc(1, sizeof(struct rte_vhost_resubmit_info)); + if (resubmit == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for resubmit info.\n"); + return RTE_VHOST_MSG_RESULT_ERR; + } + + resubmit->resubmit_list = calloc(resubmit_num, + sizeof(struct rte_vhost_resubmit_desc)); + if (resubmit->resubmit_list == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for resubmit desc.\n"); + return RTE_VHOST_MSG_RESULT_ERR; + } + + for (i = 0; i < inflight_packed->desc_num; i++) { + if (vq->inflight_packed->desc[i].inflight == 1) { + resubmit->resubmit_list[resubmit->resubmit_num].index = i; + resubmit->resubmit_list[resubmit->resubmit_num].counter = + inflight_packed->desc[i].counter; + resubmit->resubmit_num++; + } + } + + if (resubmit->resubmit_num > 1) + qsort(resubmit->resubmit_list, resubmit->resubmit_num, + sizeof(struct rte_vhost_resubmit_desc), + resubmit_desc_compare); + + vq->global_counter = resubmit->resubmit_list[0].counter + 1; + vq->resubmit_inflight = resubmit; + } + + return RTE_VHOST_MSG_RESULT_OK; +} + + static int vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg, int main_fd __rte_unused) @@ -1242,6 +1642,20 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg, close(vq->kickfd); vq->kickfd = file.fd; + if (vq_is_packed(dev)) { + if (vhost_check_queue_inflights_packed(dev, vq)) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to inflights for vq: %d\n", file.index); + return RTE_VHOST_MSG_RESULT_ERR; + } + } else { + if (vhost_check_queue_inflights_split(dev, vq)) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to inflights for vq: %d\n", file.index); + return RTE_VHOST_MSG_RESULT_ERR; + } + } + 
return RTE_VHOST_MSG_RESULT_OK; } @@ -1762,6 +2176,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = { [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise, [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen, [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end, + [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd, + [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd, }; diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h index 2a650fe4b..99a773910 100644 --- a/lib/librte_vhost/vhost_user.h +++ b/lib/librte_vhost/vhost_user.h @@ -54,7 +54,9 @@ typedef enum VhostUserRequest { VHOST_USER_POSTCOPY_ADVISE = 28, VHOST_USER_POSTCOPY_LISTEN = 29, VHOST_USER_POSTCOPY_END = 30, - VHOST_USER_MAX = 31 + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_MAX = 33 } VhostUserRequest; typedef enum VhostUserSlaveRequest { @@ -112,6 +114,13 @@ typedef struct VhostUserVringArea { uint64_t offset; } VhostUserVringArea; +typedef struct VhostUserInflight { + uint64_t mmap_size; + uint64_t mmap_offset; + uint16_t num_queues; + uint16_t queue_size; +} VhostUserInflight; + typedef struct VhostUserMsg { union { uint32_t master; /* a VhostUserRequest value */ @@ -131,6 +140,7 @@ typedef struct VhostUserMsg { struct vhost_vring_addr addr; VhostUserMemory memory; VhostUserLog log; + VhostUserInflight inflight; struct vhost_iotlb_msg iotlb; VhostUserCryptoSessionParam crypto_session; VhostUserVringArea area; @@ -148,6 +158,7 @@ typedef struct VhostUserMsg { /* vhost_user.c */ int vhost_user_msg_handler(int vid, int fd); int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm); +void *inflight_mem_alloc(const char *name, size_t size, int *fd); /* socket.c */ int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds, -- 2.17.2 ^ permalink raw reply [flat|nested] 4+ messages in thread
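For a split ring, the per-request call sequence around the new inflight
APIs is expected to look roughly like the sketch below. vid, q_idx,
head_idx and len are placeholders that would come from the backend's own
request handling; the used-ring update follows the usual split-ring layout:

#include <stdint.h>
#include <linux/virtio_ring.h>
#include <rte_vhost.h>

/* Sketch, not the library implementation: complete one split-ring request
 * while keeping the shared inflight region consistent. */
static void
complete_request_split(int vid, uint16_t q_idx, struct rte_vhost_vring *vq,
                       uint16_t head_idx, uint32_t len)
{
        struct vring_used *used = vq->used;

        /* 1. mark the descriptor chain inflight as soon as it is taken
         *    from the avail ring */
        rte_vhost_set_inflight_desc_split(vid, q_idx, head_idx);

        /* ... process the I/O here ... */

        /* 2. record the last inflight entry before touching the used ring */
        rte_vhost_set_last_inflight_io_split(vid, q_idx, head_idx);

        /* 3. publish the completion in the used ring */
        used->ring[used->idx & (vq->size - 1)].id = head_idx;
        used->ring[used->idx & (vq->size - 1)].len = len;
        used->idx++;

        /* 4. clear the inflight entry, passing the new used index */
        rte_vhost_clr_inflight_desc_split(vid, q_idx, used->idx, head_idx);

        /* notify the guest */
        rte_vhost_vring_call(vid, q_idx);
}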
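After a backend restart, vhost_check_queue_inflights_split() above rebuilds
a resubmit list from the shared inflight region; the backend should drain
that list before serving new requests. A sketch under the assumption of a
split ring follows. process_one() is a hypothetical stand-in for the
backend's own handler, and the tail-first walk assumes the list is sorted
newest-first by the saved counter (see resubmit_desc_compare above), so the
oldest request is replayed first:

#include <stdint.h>
#include <rte_vhost.h>

/* placeholder for the backend's own request handler (assumption) */
extern void process_one(int vid, uint16_t q_idx, uint16_t desc_idx);

/* Sketch only: replay requests that were still inflight when the backend
 * went away, using the list rebuilt by the library on reconnect. */
static void
drain_resubmit_split(int vid, uint16_t q_idx)
{
        struct rte_vhost_ring_inflight ring;
        struct rte_vhost_resubmit_info *info;
        uint16_t i;

        if (rte_vhost_get_vhost_ring_inflight(vid, q_idx, &ring) != 0)
                return;

        info = ring.resubmit_inflight;
        if (info == NULL || info->resubmit_num == 0)
                return;

        /* walk from the tail so the oldest (smallest counter) entry is
         * resubmitted first */
        for (i = info->resubmit_num; i > 0; i--)
                process_one(vid, q_idx, info->resubmit_list[i - 1].index);
}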
* [dpdk-dev] [PATCH v4 2/2] vhost: Add vhost-user-blk example which support inflight 2019-07-25 21:23 ` [dpdk-dev] [PATCH v4 0/2] *** vhost support inflight share memory protocol feature *** JinYu 2019-07-25 21:23 ` [dpdk-dev] [PATCH v4 1/2] vhost: support inflight share memory protocol feature JinYu @ 2019-07-25 21:23 ` JinYu 1 sibling, 0 replies; 4+ messages in thread From: JinYu @ 2019-07-25 21:23 UTC (permalink / raw) To: dev; +Cc: changpeng.liu, maxime.coquelin, tiwei.bie, zhihong.wang, JinYu A vhost-user-blk example that support inflight feature. It uses the new APIs that introduced in the first patch, so It can show how there APIs work to support inflight feature. Signed-off-by: Jin Yu <jin.yu@intel.com> --- V1 - add the case. V2 - add the rte_vhost prefix. V3 - add packed ring support --- examples/vhost_blk/Makefile | 67 ++ examples/vhost_blk/blk.c | 122 +++ examples/vhost_blk/blk_spec.h | 95 +++ examples/vhost_blk/meson.build | 20 + examples/vhost_blk/vhost_blk.c | 1054 +++++++++++++++++++++++++ examples/vhost_blk/vhost_blk.h | 114 +++ examples/vhost_blk/vhost_blk_compat.c | 193 +++++ 7 files changed, 1665 insertions(+) create mode 100644 examples/vhost_blk/Makefile create mode 100644 examples/vhost_blk/blk.c create mode 100644 examples/vhost_blk/blk_spec.h create mode 100644 examples/vhost_blk/meson.build create mode 100644 examples/vhost_blk/vhost_blk.c create mode 100644 examples/vhost_blk/vhost_blk.h create mode 100644 examples/vhost_blk/vhost_blk_compat.c diff --git a/examples/vhost_blk/Makefile b/examples/vhost_blk/Makefile new file mode 100644 index 000000000..52e9befd8 --- /dev/null +++ b/examples/vhost_blk/Makefile @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2017 Intel Corporation + +# binary name +APP = vhost-blk + +# all source are stored in SRCS-y +SRCS-y := blk.c vhost_blk.c vhost_blk_compat.c + +# Build using pkg-config variables if possible +$(shell pkg-config --exists libdpdk) +ifeq ($(.SHELLSTATUS),0) + +all: shared +.PHONY: shared static +shared: build/$(APP)-shared + ln -sf $(APP)-shared build/$(APP) +static: build/$(APP)-static + ln -sf $(APP)-static build/$(APP) + +CFLAGS += -D_FILE_OFFSET_BITS=64 +LDFLAGS += -pthread + +PC_FILE := $(shell pkg-config --path libdpdk) +CFLAGS += -O3 $(shell pkg-config --cflags libdpdk) +LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk) +LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk) + +build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build + $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED) + +build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build + $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC) + +build: + @mkdir -p $@ + +.PHONY: clean +clean: + rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared + test -d build && rmdir -p build || true + +else # Build using legacy build system + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +# Default target, detect a build directory, by looking for a path with a .config +RTE_TARGET ?= $(notdir $(abspath $(dir $(firstword $(wildcard $(RTE_SDK)/*/.config))))) + +include $(RTE_SDK)/mk/rte.vars.mk + +ifneq ($(CONFIG_RTE_EXEC_ENV_LINUX),y) +$(info This application can only operate in a linux environment, \ +please change the definition of the RTE_TARGET environment variable) +all: +else + +CFLAGS += -D_FILE_OFFSET_BITS=64 +CFLAGS += -O2 +#CFLAGS += $(WERROR_FLAGS) + +include $(RTE_SDK)/mk/rte.extapp.mk + +endif +endif diff --git a/examples/vhost_blk/blk.c 
b/examples/vhost_blk/blk.c new file mode 100644 index 000000000..3ecd0e206 --- /dev/null +++ b/examples/vhost_blk/blk.c @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +/** + * This work is largely based on the "vhost-user-blk" implementation by + * SPDK(https://github.com/spdk/spdk). + */ + +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <assert.h> +#include <ctype.h> +#include <string.h> +#include <stddef.h> + +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_log.h> +#include <rte_malloc.h> +#include <rte_byteorder.h> +#include <rte_string_fns.h> + +#include "vhost_blk.h" +#include "blk_spec.h" + +static void +vhost_strcpy_pad(void *dst, const char *src, size_t size, int pad) +{ + size_t len; + + len = strlen(src); + if (len < size) { + memcpy(dst, src, len); + memset((char *)dst + len, pad, size - len); + } else { + memcpy(dst, src, size); + } +} + +static int +vhost_bdev_blk_readwrite(struct vhost_block_dev *bdev, + struct vhost_blk_task *task, + uint64_t lba_512, __rte_unused uint32_t xfer_len) +{ + uint32_t i; + uint64_t offset; + uint32_t nbytes = 0; + + offset = lba_512 * 512; + + for (i = 0; i < task->iovs_cnt; i++) { + if (task->dxfer_dir == BLK_DIR_TO_DEV) + memcpy(bdev->data + offset, task->iovs[i].iov_base, + task->iovs[i].iov_len); + else + memcpy(task->iovs[i].iov_base, bdev->data + offset, + task->iovs[i].iov_len); + offset += task->iovs[i].iov_len; + nbytes += task->iovs[i].iov_len; + } + + return nbytes; +} + +int +vhost_bdev_process_blk_commands(struct vhost_block_dev *bdev, + struct vhost_blk_task *task) +{ + int used_len; + + if (unlikely(task->data_len > (bdev->blockcnt * bdev->blocklen))) { + fprintf(stderr, "read or write beyond capacity\n"); + return VIRTIO_BLK_S_UNSUPP; + } + + switch (task->req->type) { + case VIRTIO_BLK_T_IN: + if (unlikely(task->data_len == 0 || (task->data_len & (512 - 1)) != 0)) { + fprintf(stderr, + "%s - passed IO buffer is not multiple of 512b" + "(req_idx = %"PRIu16").\n", + task->req->type ? "WRITE" : "READ", task->head_idx); + return VIRTIO_BLK_S_UNSUPP; + } + + task->dxfer_dir = BLK_DIR_FROM_DEV; + vhost_bdev_blk_readwrite(bdev, task, + task->req->sector, task->data_len); + break; + case VIRTIO_BLK_T_OUT: + if (unlikely(task->data_len == 0 || (task->data_len & (512 - 1)) != 0)) { + fprintf(stderr, + "%s - passed IO buffer is not multiple of 512b" + "(req_idx = %"PRIu16").\n", + task->req->type ? 
"WRITE" : "READ", task->head_idx); + return VIRTIO_BLK_S_UNSUPP; + } + + if (task->readtype) { + fprintf(stderr, "type isn't right\n"); + return VIRTIO_BLK_S_IOERR; + } + task->dxfer_dir = BLK_DIR_TO_DEV; + vhost_bdev_blk_readwrite(bdev, task, + task->req->sector, task->data_len); + break; + case VIRTIO_BLK_T_GET_ID: + if (!task->iovs_cnt || task->data_len) + return VIRTIO_BLK_S_UNSUPP; + used_len = min(VIRTIO_BLK_ID_BYTES, task->data_len); + vhost_strcpy_pad(task->iovs[0].iov_base, + bdev->product_name, used_len, ' '); + break; + default: + fprintf(stderr, "unsupported cmd\n"); + return VIRTIO_BLK_S_UNSUPP; + } + + return VIRTIO_BLK_S_OK; +} diff --git a/examples/vhost_blk/blk_spec.h b/examples/vhost_blk/blk_spec.h new file mode 100644 index 000000000..5875e2f86 --- /dev/null +++ b/examples/vhost_blk/blk_spec.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _BLK_SPEC_H +#define _BLK_SPEC_H + +#include <stdint.h> + +#ifndef VHOST_USER_MEMORY_MAX_NREGIONS +#define VHOST_USER_MEMORY_MAX_NREGIONS 8 +#endif + +#ifndef VHOST_USER_MAX_CONFIG_SIZE +#define VHOST_USER_MAX_CONFIG_SIZE 256 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_CONFIG +#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 +#endif + +#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ + +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 +#define VIRTIO_BLK_T_FLUSH 4 +#define VIRTIO_BLK_T_GET_ID 8 +#define VIRTIO_BLK_T_DISCARD 11 +#define VIRTIO_BLK_T_WRITE_ZEROES 13 + +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_MAX +}; + +/** Get/set config msg payload */ +struct vhost_user_config { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +}; + +/** Fixed-size vhost_memory struct */ +struct vhost_memory_padded { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[VHOST_USER_MEMORY_MAX_NREGIONS]; +}; + +struct vhost_user_msg { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /**< the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_memory_padded memory; + struct vhost_user_config cfg; + } payload; +} __attribute((packed)); + +#endif diff --git a/examples/vhost_blk/meson.build b/examples/vhost_blk/meson.build new file mode 100644 index 000000000..028aa4f62 --- /dev/null +++ b/examples/vhost_blk/meson.build @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +# meson file, for building this example 
as part of a main DPDK build. +# +# To build this example as a standalone application with an already-installed +# DPDK instance, use 'make' + +if not is_linux + build = false +endif + +if not cc.has_header('linux/virtio_blk.h') + build = false +endif + +deps += 'vhost' +sources = files( + 'blk.c', 'vhost_blk.c', 'vhost_blk_compat.c' +) diff --git a/examples/vhost_blk/vhost_blk.c b/examples/vhost_blk/vhost_blk.c new file mode 100644 index 000000000..a03b6036f --- /dev/null +++ b/examples/vhost_blk/vhost_blk.c @@ -0,0 +1,1054 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include <stdint.h> +#include <unistd.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <semaphore.h> +#include <linux/virtio_blk.h> +#include <linux/virtio_ring.h> + +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_log.h> +#include <rte_malloc.h> +#include <rte_vhost.h> + +#include "vhost_blk.h" +#include "blk_spec.h" + +#define VIRTQ_DESC_F_NEXT 1 +#define VIRTQ_DESC_F_AVAIL (1 << 7) +#define VIRTQ_DESC_F_USED (1 << 15) + +#define VHOST_BLK_FEATURES ((1ULL << VIRTIO_F_RING_PACKED) | \ + (1ULL << VIRTIO_F_VERSION_1) |\ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) + +/* Path to folder where character device will be created. Can be set by user. */ +static char dev_pathname[PATH_MAX] = ""; +static sem_t exit_sem; + +struct vhost_blk_ctrlr * +vhost_blk_ctrlr_find(const char *ctrlr_name) +{ + /* currently we only support 1 socket file fd */ + return g_vhost_ctrlr; +} + +static uint64_t gpa_to_vva(int vid, uint64_t gpa, uint64_t *len) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + int ret = 0; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + assert(ret != 0); + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + assert(ctrlr != NULL); + } + + assert(ctrlr->mem != NULL); + + return rte_vhost_va_from_guest_pa(ctrlr->mem, gpa, len); +} + +static struct vring_packed_desc * +descriptor_get_next_packed(struct rte_vhost_vring *vq, + uint16_t *idx) +{ + if (vq->desc_packed[*idx & (vq->size - 1)].flags & VIRTQ_DESC_F_NEXT) { + *idx += 1; + return &vq->desc_packed[*idx & (vq->size - 1)]; + } + + return NULL; +} + +static bool +descriptor_has_next_packed(struct vring_packed_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_NEXT); +} + +static bool +descriptor_is_wr_packed(struct vring_packed_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +static struct inflight_desc_packed * +inflight_desc_get_next(struct inflight_info_packed *inflight_packed, + struct inflight_desc_packed *cur_desc) +{ + if (cur_desc->flags & VIRTQ_DESC_F_NEXT) { + return &inflight_packed->desc[cur_desc->next]; + } + + return NULL; +} + +static bool +inflight_desc_has_next(struct inflight_desc_packed *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_NEXT); +} + +static bool +inflight_desc_is_wr(struct inflight_desc_packed *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +static void +inflight_process_payload_chain_packed(struct inflight_blk_task *task) +{ + void *data; + uint64_t chunck_len; + + task->blk_task.iovs_cnt = 0; + + do { + chunck_len = task->inflight_desc->len; + data = (void *)(uintptr_t)gpa_to_vva(task->blk_task.bdev->vid, + task->inflight_desc->addr, + &chunck_len); + if (!data || chunck_len != task->inflight_desc->len) { + 
fprintf(stderr, "failed to translate desc address.\n"); + return; + } + + task->blk_task.iovs[task->blk_task.iovs_cnt].iov_base = data; + task->blk_task.iovs[task->blk_task.iovs_cnt].iov_len = + task->inflight_desc->len; + task->blk_task.data_len += task->inflight_desc->len; + task->blk_task.iovs_cnt++; + task->inflight_desc = inflight_desc_get_next(task->inflight_packed, + task->inflight_desc); + } while (inflight_desc_has_next(task->inflight_desc)); + + chunck_len = task->inflight_desc->len; + task->blk_task.status = (void *)(uintptr_t)gpa_to_vva(task->blk_task.bdev->vid, + task->inflight_desc->addr, + &chunck_len); + if (!task->blk_task.status || chunck_len != task->inflight_desc->len) + fprintf(stderr, "failed to translate desc address.\n"); +} + +static void +inflight_submit_completion_packed(struct inflight_blk_task *task, + uint32_t q_idx, uint16_t *used_id, + bool *used_wrap_counter) +{ + struct vhost_blk_ctrlr *ctrlr; + struct rte_vhost_vring *vq; + struct vring_packed_desc *desc; + uint16_t flags; + uint16_t entry_num; + int ret; + + ctrlr = vhost_blk_ctrlr_find(dev_pathname); + vq = task->blk_task.vq; + + ret = rte_vhost_set_last_inflight_io_packed(ctrlr->bdev->vid, q_idx, + task->blk_task.head_idx); + if (ret != 0) + fprintf(stderr, "fail to set last inflight io\n"); + + desc = &vq->desc_packed[*used_id]; + desc->id = task->blk_task.buffer_id; + rte_compiler_barrier(); + if (*used_wrap_counter) { + desc->flags = desc->flags | VIRTQ_DESC_F_AVAIL | + VIRTQ_DESC_F_USED; + } else { + desc->flags = desc->flags & ~( VIRTQ_DESC_F_AVAIL | + VIRTQ_DESC_F_USED); + } + rte_compiler_barrier(); + + *used_id += task->blk_task.iovs_cnt + 2; + if (*used_id > vq->size) { + *used_id &= vq->size - 1; + *used_wrap_counter = !(*used_wrap_counter); + } + + ret = rte_vhost_clr_inflight_desc_packed(ctrlr->bdev->vid, q_idx, + task->blk_task.head_idx); + if (ret != 0) + fprintf(stderr, "fail to clear inflight io\n"); + + /* Send an interrupt back to the guest VM so that it knows + * a completion is ready to be processed. + */ + rte_vhost_vring_call(task->blk_task.bdev->vid, q_idx); +} + +static void +submit_completion_packed(struct vhost_blk_task *task, uint32_t q_idx, + uint16_t *used_id, bool *used_wrap_counter) +{ + struct vhost_blk_ctrlr *ctrlr; + struct rte_vhost_vring *vq; + struct vring_packed_desc *desc; + uint16_t entry_num; + int ret; + + ctrlr = vhost_blk_ctrlr_find(dev_pathname); + vq = task->vq;; + + ret = rte_vhost_set_last_inflight_io_packed(ctrlr->bdev->vid, q_idx, + task->inflight_idx); + if (ret != 0) + fprintf(stderr, "fail to set last inflight io\n"); + + + desc = &vq->desc_packed[*used_id]; + desc->id = task->buffer_id; + rte_compiler_barrier(); + if (*used_wrap_counter) { + desc->flags = desc->flags | VIRTQ_DESC_F_AVAIL | + VIRTQ_DESC_F_USED; + } else { + desc->flags = desc->flags & ~( VIRTQ_DESC_F_AVAIL | + VIRTQ_DESC_F_USED); + } + rte_compiler_barrier(); + + *used_id += task->iovs_cnt + 2; + if (*used_id >= vq->size) { + *used_id &= vq->size - 1; + *used_wrap_counter = !(*used_wrap_counter); + } + + ret = rte_vhost_clr_inflight_desc_packed(ctrlr->bdev->vid, q_idx, + task->inflight_idx); + if (ret != 0) + fprintf(stderr, "fail to clear inflight io\n"); + + /* Send an interrupt back to the guest VM so that it knows + * a completion is ready to be processed. 
+ */ + rte_vhost_vring_call(task->bdev->vid, q_idx); +} + +static void +vhost_process_payload_chain_packed(struct vhost_blk_task *task, uint16_t *idx) +{ + void *data; + uint64_t chunck_len; + + task->iovs_cnt = 0; + + do { + chunck_len = task->desc_packed->len; + data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_packed->addr, + &chunck_len); + if (!data || chunck_len != task->desc_packed->len) { + fprintf(stderr, "failed to translate desc address.\n"); + return; + } + + task->iovs[task->iovs_cnt].iov_base = data; + task->iovs[task->iovs_cnt].iov_len = task->desc_packed->len; + task->data_len += task->desc_packed->len; + task->iovs_cnt++; + task->desc_packed = descriptor_get_next_packed(task->vq, idx); + } while (descriptor_has_next_packed(task->desc_packed)); + + task->last_idx = *idx & (task->vq->size - 1); + chunck_len = task->desc_packed->len; + task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_packed->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_packed->len) + fprintf(stderr, "failed to translate desc address.\n"); +} + + +static int +descriptor_is_available(struct rte_vhost_vring *vring, uint16_t idx, + bool avail_wrap_counter) +{ + uint16_t flags = vring->desc_packed[idx].flags; + + return ((!!(flags & VIRTQ_DESC_F_AVAIL) == avail_wrap_counter) && + (!!(flags & VIRTQ_DESC_F_USED) != avail_wrap_counter)); +} + +static int +descriptor_is_used(struct rte_vhost_vring *vring, uint16_t idx, + bool used_wrap_counter) +{ + uint16_t flags = vring->desc_packed[idx].flags; + + return ((!!(flags & VIRTQ_DESC_F_AVAIL) == used_wrap_counter) && + (!!(flags & VIRTQ_DESC_F_USED) == used_wrap_counter)); +} + +static void +process_requestq_packed(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + bool avail_wrap_counter, used_wrap_counter; + uint16_t avail_idx, used_idx; + int ret; + uint64_t chunck_len; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + struct vhost_blk_task *task; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + vq = &blk_vq->vq; + + avail_idx = blk_vq->last_avail_idx; + avail_wrap_counter = blk_vq->avail_wrap_counter; + used_idx = blk_vq->last_used_idx; + used_wrap_counter = blk_vq->used_wrap_counter; + + task = rte_zmalloc(NULL, sizeof(*task), 0); + assert(task != NULL); + task->vq = vq; + task->bdev = ctrlr->bdev; + + while (descriptor_is_available(vq, avail_idx, avail_wrap_counter)) { + task->head_idx = avail_idx; + task->desc_packed = &task->vq->desc_packed[task->head_idx]; + task->iovs_cnt = 0; + task->data_len = 0; + task->req = NULL; + task->status = NULL; + + /* does not support indirect descriptors */ + assert((task->desc_packed->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->desc_packed->len; + task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_packed->addr, + &chunck_len); + if (!task->req || chunck_len != task->desc_packed->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + + task->desc_packed = descriptor_get_next_packed(task->vq, &avail_idx); + assert(task->desc_packed != NULL); + if (!descriptor_has_next_packed(task->desc_packed)) { + task->dxfer_dir = BLK_DIR_NONE; + task->last_idx = avail_idx & (vq->size - 1); + chunck_len = task->desc_packed->len; + task->status = (void *)(uintptr_t) + gpa_to_vva(task->bdev->vid, + task->desc_packed->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_packed->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } 
+ } else { + task->readtype = descriptor_is_wr_packed(task->desc_packed); + vhost_process_payload_chain_packed(task, &avail_idx); + } + task->buffer_id = vq->desc_packed[task->last_idx].id; + rte_vhost_set_inflight_desc_packed(ctrlr->bdev->vid, q_idx, + task->head_idx, + task->last_idx, + &task->inflight_idx); + + if (++avail_idx >= vq->size) { + avail_idx &= vq->size - 1; + avail_wrap_counter = !avail_wrap_counter; + } + blk_vq->last_avail_idx = avail_idx; + blk_vq->avail_wrap_counter = avail_wrap_counter; + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task); + if (ret) { + /* invalid response */ + *task->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->status = VIRTIO_BLK_S_OK; + } + + submit_completion_packed(task, q_idx, &used_idx, &used_wrap_counter); + blk_vq->last_used_idx = used_idx; + blk_vq->used_wrap_counter = used_wrap_counter; + } + + rte_free(task); +} + +static void +submit_inflight_vq_packed(struct vhost_blk_ctrlr *ctrlr, uint16_t q_idx) +{ + bool used_wrap_counter; + int i, ret; + uint16_t used_idx; + uint64_t chunck_len; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_ring_inflight *inflight_vq; + struct rte_vhost_vring *vq; + struct inflight_blk_task *task; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + inflight_vq = &blk_vq->inflight_vq; + used_idx = inflight_vq->inflight_packed->old_used_idx; + used_wrap_counter = inflight_vq->inflight_packed->old_used_wrap_counter; + + task = rte_malloc(NULL, sizeof(*task), 0); + if (task) { + fprintf(stderr, "fail to allocate memory\n"); + return; + } + task->blk_task.vq = vq; + task->blk_task.bdev = ctrlr->bdev; + + for (i = 0; i < inflight_vq->resubmit_inflight->resubmit_num; i++) { + task->blk_task.head_idx = + inflight_vq->resubmit_inflight->resubmit_list[i].index; + task->inflight_desc = + &inflight_vq->inflight_packed->desc[task->blk_task.head_idx]; + task->blk_task.iovs_cnt = 0; + task->blk_task.data_len = 0; + task->blk_task.req = NULL; + task->blk_task.status = NULL; + + /* does not support indirect descriptors */ + assert((task->inflight_desc->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->inflight_desc->len; + task->blk_task.req = (void *)(uintptr_t)gpa_to_vva(task->blk_task.bdev->vid, + task->inflight_desc->addr, + &chunck_len); + if (!task->blk_task.req || chunck_len != task->inflight_desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + + task->inflight_desc = inflight_desc_get_next(task->inflight_packed, + task->inflight_desc); + if (!inflight_desc_has_next(task->inflight_desc)) { + task->blk_task.dxfer_dir = BLK_DIR_NONE; + chunck_len = task->inflight_desc->len; + task->blk_task.status = (void *)(uintptr_t) + gpa_to_vva(task->blk_task.bdev->vid, + task->inflight_desc->addr, + &chunck_len); + if (!task->blk_task.status || + chunck_len != task->inflight_desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + } else { + task->blk_task.readtype = + inflight_desc_is_wr(task->inflight_desc); + inflight_process_payload_chain_packed(task); + } + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, &task->blk_task); + if (ret) { + /* invalid response */ + *task->blk_task.status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->blk_task.status = VIRTIO_BLK_S_OK; + } + + inflight_submit_completion_packed(task, q_idx, &used_idx, + &used_wrap_counter); + + blk_vq->last_used_idx = used_idx; + blk_vq->used_wrap_counter = used_wrap_counter; + } + + rte_free(task); +} + +static 
struct vring_desc * +descriptor_get_next_split(struct vring_desc *vq_desc, + struct vring_desc *cur_desc) +{ + return &vq_desc[cur_desc->next]; +} + +static bool +descriptor_has_next_split(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_NEXT); +} + +static bool +descriptor_is_wr_split(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +static void +vhost_process_payload_chain_split(struct vhost_blk_task *task) +{ + void *data; + uint64_t chunck_len; + + task->iovs_cnt = 0; + + do { + chunck_len = task->desc_split->len; + data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!data || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + return; + } + + task->iovs[task->iovs_cnt].iov_base = data; + task->iovs[task->iovs_cnt].iov_len = task->desc_split->len; + task->data_len += task->desc_split->len; + task->iovs_cnt++; + task->desc_split = + descriptor_get_next_split(task->vq->desc, task->desc_split); + } while (descriptor_has_next_split(task->desc_split)); + + chunck_len = task->desc_split->len; + task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_split->len) + fprintf(stderr, "failed to translate desc address.\n"); +} + +static void +submit_completion_split(struct vhost_blk_task *task, uint32_t vid, uint32_t q_idx) +{ + struct rte_vhost_vring *vq; + struct vring_used *used; + + vq = task->vq; + used = vq->used; + + rte_vhost_set_last_inflight_io_split(vid, q_idx, task->req_idx); + + /* Fill out the next entry in the "used" ring. id = the + * index of the descriptor that contained the blk request. + * len = the total amount of data transferred for the blk + * request. We must report the correct len, for variable + * length blk CDBs, where we may return less data than + * allocated by the guest VM. + */ + used->ring[used->idx & (vq->size - 1)].id = task->req_idx; + used->ring[used->idx & (vq->size - 1)].len = task->data_len; + rte_compiler_barrier(); + used->idx++; + rte_compiler_barrier(); + + rte_vhost_clr_inflight_desc_split(vid, q_idx, used->idx, task->req_idx); + + /* Send an interrupt back to the guest VM so that it knows + * a completion is ready to be processed. 
+ */ + rte_vhost_vring_call(task->bdev->vid, q_idx); +} + +static void +submit_inflight_vq_split(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + struct vhost_blk_queue *blk_vq; + struct rte_vhost_ring_inflight *inflight_vq; + struct rte_vhost_resubmit_info *resubmit_inflight; + struct rte_vhost_resubmit_desc *resubmit_list; + struct vhost_blk_task *task; + int i, req_idx; + uint64_t chunck_len; + int ret; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + inflight_vq = &blk_vq->inflight_vq; + resubmit_inflight = inflight_vq->resubmit_inflight; + resubmit_list = resubmit_inflight->resubmit_list; + + task = rte_zmalloc(NULL, sizeof(*task), 0); + assert(task != NULL); + + task->ctrlr = ctrlr; + task->bdev = ctrlr->bdev; + task->vq = &blk_vq->vq; + + for (i = 0; i < resubmit_inflight->resubmit_num; i++) { + req_idx = resubmit_list[i].index; + task->req_idx = req_idx; + task->desc_split = &task->vq->desc[task->req_idx]; + task->iovs_cnt = 0; + task->data_len = 0; + task->req = NULL; + task->desc_split = NULL; + task->status = NULL; + + /* does not support indirect descriptors */ + assert((task->desc_split->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->desc_split->len; + task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->req || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + + task->desc_split = descriptor_get_next_split(task->vq->desc, + task->desc_split); + if (!descriptor_has_next_split(task->desc_split)) { + task->dxfer_dir = BLK_DIR_NONE; + chunck_len = task->desc_split->len; + task->status = (void *)(uintptr_t) + gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + } else { + task->readtype = descriptor_is_wr_split(task->desc_split); + vhost_process_payload_chain_split(task); + } + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task); + if (ret) { + /* invalid response */ + *task->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->status = VIRTIO_BLK_S_OK; + } + submit_completion_split(task, ctrlr->bdev->vid, q_idx); + } + + rte_free(task); +} + +static void +process_requestq_split(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + int ret; + int req_idx; + uint16_t last_idx; + uint64_t chunck_len; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + struct vhost_blk_task *task; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + vq = &blk_vq->vq; + + task = rte_zmalloc(NULL, sizeof(*task), 0); + assert(task != NULL); + task->ctrlr = ctrlr; + task->bdev = ctrlr->bdev; + task->vq = vq; + + while (vq->avail->idx != blk_vq->last_avail_idx) { + last_idx = blk_vq->last_avail_idx & (vq->size - 1); + req_idx = vq->avail->ring[last_idx]; + task->req_idx = req_idx; + task->desc_split = &task->vq->desc[task->req_idx]; + task->iovs_cnt = 0; + task->data_len = 0; + task->req = NULL; + task->status = NULL; + + rte_vhost_set_inflight_desc_split(ctrlr->bdev->vid, q_idx, task->req_idx); + + /* does not support indirect descriptors */ + assert((task->desc_split->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->desc_split->len; + task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->req || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc 
address.\n"); + rte_free(task); + return; + } + + task->desc_split = descriptor_get_next_split(task->vq->desc, task->desc_split); + if (!descriptor_has_next_split(task->desc_split)) { + task->dxfer_dir = BLK_DIR_NONE; + chunck_len = task->desc_split->len; + task->status = (void *)(uintptr_t) + gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + } else { + task->readtype = descriptor_is_wr_split(task->desc_split); + vhost_process_payload_chain_split(task); + } + blk_vq->last_avail_idx++; + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task); + if (ret) { + /* invalid response */ + *task->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->status = VIRTIO_BLK_S_OK; + } + + submit_completion_split(task, ctrlr->bdev->vid, q_idx); + } + + rte_free(task); +} + +static void * +ctrlr_worker(void *arg) +{ + struct vhost_blk_ctrlr *ctrlr = (struct vhost_blk_ctrlr *)arg; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_ring_inflight *inflight_vq; + cpu_set_t cpuset; + pthread_t thread; + int i, ret; + + fprintf(stdout, "Ctrlr Worker Thread start\n"); + + if (ctrlr == NULL || ctrlr->bdev == NULL) { + fprintf(stderr, "%s: Error, invalid argument passed to worker thread\n", + __func__); + exit(0); + } + + thread = pthread_self(); + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + blk_vq = &ctrlr->bdev->queues[i]; + inflight_vq = &blk_vq->inflight_vq; + if (inflight_vq->resubmit_inflight != NULL && + inflight_vq->resubmit_inflight->resubmit_num != 0) { + if (ctrlr->packed) + submit_inflight_vq_packed(ctrlr, i); + else + submit_inflight_vq_split(ctrlr, i); + } + } + + while (!g_should_stop && ctrlr->bdev != NULL) { + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + if (ctrlr->packed) + process_requestq_packed(ctrlr, i); + else + process_requestq_split(ctrlr, i); + } + } + + fprintf(stdout, "Ctrlr Worker Thread Exiting\n"); + sem_post(&exit_sem); + return NULL; +} + +static int +new_device(int vid) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + pthread_t tid; + int i, ret; + uint64_t features; + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + if (ctrlr->started) + return 0; + + ctrlr->bdev->vid = vid; + ctrlr->packed = rte_vhost_vq_is_packed(vid); + + ret = rte_vhost_get_mem_table(vid, &ctrlr->mem); + if (ret) + fprintf(stderr, "Get Controller memory region failed\n"); + assert(ctrlr->mem != NULL); + + /* Disable Notifications and init last idx */ + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + rte_vhost_enable_guest_notification(vid, i, 0); + + blk_vq = &ctrlr->bdev->queues[i]; + vq = &blk_vq->vq; + ret = rte_vhost_get_vring_base(ctrlr->bdev->vid, i, + &blk_vq->last_avail_idx, + &blk_vq->last_used_idx); + assert(ret == 0); + if (ctrlr->packed) { + ret = rte_vhost_get_vring_base_counter(ctrlr->bdev->vid, i, + &blk_vq->avail_wrap_counter, + &blk_vq->used_wrap_counter); + assert(ret == 0); + } + + ret = rte_vhost_get_vhost_vring(ctrlr->bdev->vid, i, vq); + assert(ret == 0); + + ret = rte_vhost_get_vhost_ring_inflight(ctrlr->bdev->vid, i, + &blk_vq->inflight_vq); + assert(ret == 0); + + if (ctrlr->packed) { + /* for the reconnection */ + ret = 
rte_vhost_get_vring_base_from_inflight(ctrlr->bdev->vid, i, + &blk_vq->last_avail_idx, + &blk_vq->last_used_idx); + assert(ret == 0); + rte_vhost_get_vring_base_counter_from_inflight(ctrlr->bdev->vid, i, + &blk_vq->avail_wrap_counter, + &blk_vq->used_wrap_counter); + assert(ret == 0); + } + } + + /* start polling vring */ + g_should_stop = 0; + fprintf(stdout, "New Device %s, Device ID %d\n", path, vid); + if (pthread_create(&tid, NULL, &ctrlr_worker, ctrlr) < 0) { + fprintf(stderr, "Worker Thread Started Failed\n"); + return -1; + } + + /* device has been started */ + ctrlr->started = 1; + pthread_detach(tid); + return 0; +} + +static void +destroy_device(int vid) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + int i, ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Destroy Ctrlr Failed\n"); + return; + } + fprintf(stdout, "Destroy %s Device ID %d\n", path, vid); + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Destroy Ctrlr Failed\n"); + return; + } + + if (!ctrlr->started) + return; + + g_should_stop = 1; + + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + blk_vq = &ctrlr->bdev->queues[i]; + rte_vhost_set_vring_base(ctrlr->bdev->vid, i, + blk_vq->last_avail_idx, blk_vq->last_used_idx); + if (ctrlr->packed) { + fprintf(stderr, "destroy counter avail is %d and used is %d\n", + blk_vq->avail_wrap_counter, blk_vq->used_wrap_counter); + rte_vhost_set_vring_base_counter(ctrlr->bdev->vid, i, + blk_vq->avail_wrap_counter, + blk_vq->used_wrap_counter); + } + } + + free(ctrlr->mem); + + ctrlr->started = 0; + sem_wait(&exit_sem); +} + +static int +new_connection(int vid) +{ + /* extend the proper features for block device */ + vhost_session_install_rte_compat_hooks(vid); +} + +struct vhost_device_ops vhost_blk_device_ops = { + .new_device = new_device, + .destroy_device = destroy_device, + .new_connection = new_connection, +}; + +static struct vhost_block_dev * +vhost_blk_bdev_construct(const char *bdev_name, const char *bdev_serial, + uint32_t blk_size, uint64_t blk_cnt, + bool wce_enable) +{ + struct vhost_block_dev *bdev; + + bdev = rte_zmalloc(NULL, sizeof(*bdev), RTE_CACHE_LINE_SIZE); + if (!bdev) + return NULL; + + strncpy(bdev->name, bdev_name, sizeof(bdev->name)); + strncpy(bdev->product_name, bdev_serial, sizeof(bdev->product_name)); + bdev->blocklen = blk_size; + bdev->blockcnt = blk_cnt; + bdev->write_cache = wce_enable; + + fprintf(stdout, "blocklen=%d, blockcnt=%d\n", bdev->blocklen, bdev->blockcnt); + + /* use memory as disk storage space */ + bdev->data = rte_zmalloc(NULL, blk_cnt * blk_size, 0); + if (!bdev->data) { + fprintf(stderr, "no enough reseverd huge memory for disk\n"); + free(bdev); + return NULL; + } + + return bdev; +} + +static struct vhost_blk_ctrlr * +vhost_blk_ctrlr_construct(const char *ctrlr_name) +{ + int ret; + struct vhost_blk_ctrlr *ctrlr; + char *path; + char cwd[PATH_MAX]; + + /* always use current directory */ + path = getcwd(cwd, PATH_MAX); + if (!path) { + fprintf(stderr, "Cannot get current working directory\n"); + return NULL; + } + snprintf(dev_pathname, sizeof(dev_pathname), "%s/%s", path, ctrlr_name); + + if (access(dev_pathname, F_OK) != -1) { + if (unlink(dev_pathname) != 0) + rte_exit(EXIT_FAILURE, "Cannot remove %s.\n", + dev_pathname); + } + + if (rte_vhost_driver_register(dev_pathname, 0) != 0) { + fprintf(stderr, "socket %s already exists\n", dev_pathname); + return NULL; + } + + ret = 
rte_vhost_driver_set_features(dev_pathname, VHOST_BLK_FEATURES); + if (ret != 0) { + fprintf(stderr, "Set vhost driver features failed\n"); + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + /* set proper features */ + vhost_dev_install_rte_compat_hooks(dev_pathname); + + ctrlr = rte_zmalloc(NULL, sizeof(*ctrlr), RTE_CACHE_LINE_SIZE); + if (!ctrlr) { + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + /* hardcoded block device information with 128MiB */ + ctrlr->bdev = vhost_blk_bdev_construct("malloc0", "vhost_blk_malloc0", + 4096, 32768, 0); + if (!ctrlr->bdev) { + rte_free(ctrlr); + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + rte_vhost_driver_callback_register(dev_pathname, + &vhost_blk_device_ops); + + return ctrlr; +} + +static void +signal_handler(__rte_unused int signum) +{ + struct vhost_blk_ctrlr *ctrlr; + + if (access(dev_pathname, F_OK) == 0) + unlink(dev_pathname); + + g_should_stop = 1; + + ctrlr = vhost_blk_ctrlr_find(NULL); + if (ctrlr != NULL) { + fprintf(stderr, "never come in\n"); + if (ctrlr->bdev != NULL) { + rte_free(ctrlr->bdev->data); + rte_free(ctrlr->bdev); + } + rte_free(ctrlr); + } + + rte_vhost_driver_unregister(dev_pathname); + exit(0); +} + +int main(int argc, char *argv[]) +{ + int ret; + + signal(SIGINT, signal_handler); + + /* init EAL */ + ret = rte_eal_init(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); + + g_vhost_ctrlr = vhost_blk_ctrlr_construct("vhost.socket"); + if (g_vhost_ctrlr == NULL) { + fprintf(stderr, "Construct vhost blk controller failed\n"); + return 0; + } + + if (sem_init(&exit_sem, 0, 0) < 0) { + fprintf(stderr, "Error init exit_sem\n"); + return -1; + } + + rte_vhost_driver_start(dev_pathname); + + /* loop for exit the application */ + while (1) + sleep(1); + + return 0; +} + diff --git a/examples/vhost_blk/vhost_blk.h b/examples/vhost_blk/vhost_blk.h new file mode 100644 index 000000000..6f6d20398 --- /dev/null +++ b/examples/vhost_blk/vhost_blk.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _VHOST_BLK_H_ +#define _VHOST_BLK_H_ + +#include <stdio.h> +#include <sys/uio.h> +#include <stdint.h> +#include <linux/virtio_blk.h> +#include <linux/virtio_ring.h> + +#include <rte_vhost.h> + +struct vhost_blk_queue { + struct rte_vhost_vring vq; + struct rte_vhost_ring_inflight inflight_vq; + uint16_t last_avail_idx; + uint16_t last_used_idx; + bool avail_wrap_counter; + bool used_wrap_counter; +}; + +#define NUM_OF_BLK_QUEUES 1 + +#ifndef VIRTIO_F_RING_PACKED +#define VIRTIO_F_RING_PACKED 34 +#endif + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +struct vhost_block_dev { + /** ID for vhost library. */ + int vid; + /** Queues for the block device */ + struct vhost_blk_queue queues[NUM_OF_BLK_QUEUES]; + /** Unique name for this block device. */ + char name[64]; + + /** Unique product name for this kind of block device. 
*/ + char product_name[256]; + + /** Size in bytes of a logical block for the backend */ + uint32_t blocklen; + + /** Number of blocks */ + uint64_t blockcnt; + + /** write cache enabled, not used at the moment */ + int write_cache; + + /** use memory as disk storage space */ + uint8_t *data; +}; + +struct vhost_blk_ctrlr { + uint8_t started; + uint8_t packed; + uint8_t need_restart; + /** Only support 1 LUN for the example */ + struct vhost_block_dev *bdev; + /** VM memory region */ + struct rte_vhost_memory *mem; +} __rte_cache_aligned; + +#define VHOST_BLK_MAX_IOVS 128 + +enum blk_data_dir { + BLK_DIR_NONE = 0, + BLK_DIR_TO_DEV = 1, + BLK_DIR_FROM_DEV = 2, +}; + +struct vhost_blk_task { + uint8_t readtype; + uint8_t req_idx; + uint16_t head_idx; + uint16_t last_idx; + uint16_t inflight_idx; + uint16_t buffer_id; + uint32_t dxfer_dir; + uint32_t data_len; + struct virtio_blk_outhdr *req; + volatile uint8_t *status; + struct iovec iovs[VHOST_BLK_MAX_IOVS]; + uint32_t iovs_cnt; + struct vring_packed_desc *desc_packed; + struct vring_desc *desc_split; + struct rte_vhost_vring *vq; + struct vhost_block_dev *bdev; + struct vhost_blk_ctrlr *ctrlr; +}; + +struct inflight_blk_task { + struct vhost_blk_task blk_task; + struct inflight_desc_packed *inflight_desc; + struct inflight_info_packed *inflight_packed; +}; + +struct vhost_blk_ctrlr *g_vhost_ctrlr; +struct vhost_device_ops vhost_blk_device_ops; +int g_should_stop; + +int vhost_bdev_process_blk_commands(struct vhost_block_dev *bdev, + struct vhost_blk_task *task); + +void vhost_session_install_rte_compat_hooks(uint32_t vid); + +void vhost_dev_install_rte_compat_hooks(const char *path); + +struct vhost_blk_ctrlr *vhost_blk_ctrlr_find(const char *ctrlr_name); + +#endif /* _VHOST_blk_H_ */ diff --git a/examples/vhost_blk/vhost_blk_compat.c b/examples/vhost_blk/vhost_blk_compat.c new file mode 100644 index 000000000..405b091d7 --- /dev/null +++ b/examples/vhost_blk/vhost_blk_compat.c @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _VHOST_BLK_COMPAT_H_ +#define _VHOST_BLK_COMPAT_H_ + +#include <sys/uio.h> +#include <stdint.h> +#include <linux/virtio_blk.h> +#include <linux/virtio_ring.h> + +#include <rte_vhost.h> +#include "vhost_blk.h" +#include "blk_spec.h" + +#define VHOST_MAX_VQUEUES 256 +#define SPDK_VHOST_MAX_VQ_SIZE 1024 + +#define VHOST_USER_GET_CONFIG 24 +#define VHOST_USER_SET_CONFIG 25 + +static int +vhost_blk_get_config(struct vhost_block_dev *bdev, uint8_t *config, + uint32_t len) +{ + struct virtio_blk_config blkcfg; + uint32_t blk_size; + uint64_t blkcnt; + + if (bdev == NULL) { + /* We can't just return -1 here as this GET_CONFIG message might + * be caused by a QEMU VM reboot. Returning -1 will indicate an + * error to QEMU, who might then decide to terminate itself. + * We don't want that. A simple reboot shouldn't break the system. + * + * Presenting a block device with block size 0 and block count 0 + * doesn't cause any problems on QEMU side and the virtio-pci + * device is even still available inside the VM, but there will + * be no block device created for it - the kernel drivers will + * silently reject it. 
+ */ + blk_size = 0; + blkcnt = 0; + } else { + blk_size = bdev->blocklen; + blkcnt = bdev->blockcnt; + } + + memset(&blkcfg, 0, sizeof(blkcfg)); + blkcfg.blk_size = blk_size; + /* minimum I/O size in blocks */ + blkcfg.min_io_size = 1; + /* expressed in 512 Bytes sectors */ + blkcfg.capacity = (blkcnt * blk_size) / 512; + /* QEMU can overwrite this value when started */ + blkcfg.num_queues = VHOST_MAX_VQUEUES; + + fprintf(stdout, "block device:blk_size = %d, blkcnt = %d\n", blk_size, blkcnt); + + memcpy(config, &blkcfg, min(len, sizeof(blkcfg))); + + return 0; +} + +static enum rte_vhost_msg_result +extern_vhost_pre_msg_handler(int vid, void *_msg) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_user_msg *msg = _msg; + int ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + return -1; + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + switch (msg->request) { + case VHOST_USER_GET_VRING_BASE: + if (!g_should_stop && ctrlr->started) + vhost_blk_device_ops.destroy_device(vid); + break; + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ADDR: + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_KICK: + if (!g_should_stop && ctrlr->started) + vhost_blk_device_ops.destroy_device(vid); + break; + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_MEM_TABLE: + if (!g_should_stop && ctrlr->started) { + vhost_blk_device_ops.destroy_device(vid); + ctrlr->need_restart = 1; + } + break; + case VHOST_USER_GET_CONFIG: { + int rc = 0; + + rc = vhost_blk_get_config(ctrlr->bdev, + msg->payload.cfg.region, msg->payload.cfg.size); + if (rc != 0) + msg->size = 0; + + return RTE_VHOST_MSG_RESULT_REPLY; + } + case VHOST_USER_SET_CONFIG: + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +static enum rte_vhost_msg_result +extern_vhost_post_msg_handler(int vid, void *_msg) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_user_msg *msg = _msg; + int ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + return -1; + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + if (ctrlr->need_restart) { + vhost_blk_device_ops.new_device(vid); + ctrlr->need_restart = 0; + } + + switch (msg->request) { + case VHOST_USER_SET_FEATURES: + break; + case VHOST_USER_SET_VRING_KICK: + /* vhost-user spec tells us to start polling a queue after receiving + * its SET_VRING_KICK message. Let's do it! 
+ */ + if (g_should_stop && !ctrlr->started) + vhost_blk_device_ops.new_device(vid); + break; + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +struct rte_vhost_user_extern_ops g_extern_vhost_ops = { + .pre_msg_handle = extern_vhost_pre_msg_handler, + .post_msg_handle = extern_vhost_post_msg_handler, +}; + +void +vhost_session_install_rte_compat_hooks(uint32_t vid) +{ + int rc; + + rc = rte_vhost_extern_callback_register(vid, &g_extern_vhost_ops, NULL); + if (rc != 0) + fprintf(stderr, + "rte_vhost_extern_callback_register() failed for vid = %d\n", + vid); +} + +void +vhost_dev_install_rte_compat_hooks(const char *path) +{ + uint64_t protocol_features = 0; + + rte_vhost_driver_get_protocol_features(path, &protocol_features); + protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_CONFIG); + protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); + rte_vhost_driver_set_protocol_features(path, protocol_features); +} + +#endif -- 2.17.2 ^ permalink raw reply [flat|nested] 4+ messages in thread
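For reference, the split-ring code in the patch above keeps the inflight log consistent with the used ring by following a fixed order: mark the descriptor inflight when it is taken from the avail ring, record it as the last inflight entry just before filling the used ring, publish the used index, and only then clear the inflight entry. The following is a minimal sketch of that order using the same library calls as the example; vid, queue_id and req_idx are placeholders, and descriptor translation, request execution and error handling are omitted.

#include <rte_atomic.h>
#include <rte_vhost.h>

/*
 * Sketch only: what matters for crash recovery is the order of the
 * inflight calls relative to the used-ring update.
 */
static void
complete_one_split_request(int vid, uint16_t queue_id, uint16_t req_idx,
			   struct rte_vhost_vring *vq, uint32_t data_len)
{
	/* 1. Descriptor popped from the avail ring: log it as inflight. */
	rte_vhost_set_inflight_desc_split(vid, queue_id, req_idx);

	/* ... translate the descriptor chain and execute the block request ... */

	/* 2. Record it as the last inflight entry before touching the used ring. */
	rte_vhost_set_last_inflight_io_split(vid, queue_id, req_idx);

	/* 3. Publish the completion in the used ring. */
	vq->used->ring[vq->used->idx & (vq->size - 1)].id = req_idx;
	vq->used->ring[vq->used->idx & (vq->size - 1)].len = data_len;
	rte_compiler_barrier();
	vq->used->idx++;
	rte_compiler_barrier();

	/* 4. Clear the inflight entry now that the used index is visible. */
	rte_vhost_clr_inflight_desc_split(vid, queue_id, vq->used->idx, req_idx);

	/* Notify the guest that a completion is ready. */
	rte_vhost_vring_call(vid, queue_id);
}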
[parent not found: <20190725212335.9675>]
* [dpdk-dev] [PATCH v4 0/2] vhost: support inflight share memory protocol feature [not found] <20190725212335.9675> @ 2019-07-31 20:40 ` JinYu 2019-07-31 20:40 ` [dpdk-dev] [PATCH v4 2/2] vhost: Add vhost-user-blk example which support inflight JinYu 0 siblings, 1 reply; 4+ messages in thread From: JinYu @ 2019-07-31 20:40 UTC (permalink / raw) To: dev; +Cc: changpeng.liu, maxime.coquelin, tiwei.bie, zhihong.wang, JinYu This patches introduces two new messages VHOST_USER_GET_INFLIGHT_FD and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared buffer between qemu and backend. This shared buffer is used to process inflight I/O when backend reconnect Now It can both support split and packed ring. The example code show how these APIs work and the test has passed. JinYu (2): vhost: support inflight share memory protocol feature vhost: Add vhost-user-blk example which support inflight examples/vhost_blk/Makefile | 67 ++ examples/vhost_blk/blk.c | 122 +++ examples/vhost_blk/blk_spec.h | 95 ++ examples/vhost_blk/meson.build | 20 + examples/vhost_blk/vhost_blk.c | 1272 ++++++++++++++++++++++++ examples/vhost_blk/vhost_blk.h | 114 +++ examples/vhost_blk/vhost_blk_compat.c | 193 ++++ lib/librte_vhost/rte_vhost.h | 301 +++++- lib/librte_vhost/rte_vhost_version.map | 12 + lib/librte_vhost/vhost.c | 399 +++++++- lib/librte_vhost/vhost.h | 54 +- lib/librte_vhost/vhost_user.c | 423 +++++++- lib/librte_vhost/vhost_user.h | 13 +- 13 files changed, 3056 insertions(+), 29 deletions(-) create mode 100644 examples/vhost_blk/Makefile create mode 100644 examples/vhost_blk/blk.c create mode 100644 examples/vhost_blk/blk_spec.h create mode 100644 examples/vhost_blk/meson.build create mode 100644 examples/vhost_blk/vhost_blk.c create mode 100644 examples/vhost_blk/vhost_blk.h create mode 100644 examples/vhost_blk/vhost_blk_compat.c -- 2.17.2 ^ permalink raw reply [flat|nested] 4+ messages in thread
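The reconnect path this cover letter refers to works in two stages in the vhost-blk example of patch 2/2 below: the backend first obtains the per-queue inflight region with rte_vhost_get_vhost_ring_inflight(), then replays every entry in the resubmit list prepared by the vhost library before it resumes polling the avail ring. A simplified split-ring outline of that flow follows; resubmit_one_split() is a hypothetical helper standing in for the request handling done in submit_inflight_vq_split() in the example.

#include <rte_vhost.h>

/*
 * Sketch of the split-ring resubmit flow on reconnect. The rte_vhost calls
 * are the ones used by the example; resubmit_one_split() is a placeholder.
 */
static void
recover_inflight_split(int vid, uint16_t queue_id)
{
	struct rte_vhost_ring_inflight inflight_vq;
	struct rte_vhost_resubmit_info *resubmit;
	uint16_t i;

	/* Map the shared inflight region for this virtqueue. */
	if (rte_vhost_get_vhost_ring_inflight(vid, queue_id, &inflight_vq) != 0)
		return;

	resubmit = inflight_vq.resubmit_inflight;
	if (resubmit == NULL || resubmit->resubmit_num == 0)
		return;

	/* Replay the requests that were still inflight before the restart. */
	for (i = 0; i < resubmit->resubmit_num; i++)
		resubmit_one_split(vid, queue_id,
				   resubmit->resubmit_list[i].index);
}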
* [dpdk-dev] [PATCH v4 2/2] vhost: Add vhost-user-blk example which support inflight 2019-07-31 20:40 ` [dpdk-dev] [PATCH v4 0/2] vhost: support inflight share memory protocol feature JinYu @ 2019-07-31 20:40 ` JinYu 0 siblings, 0 replies; 4+ messages in thread From: JinYu @ 2019-07-31 20:40 UTC (permalink / raw) To: dev; +Cc: changpeng.liu, maxime.coquelin, tiwei.bie, zhihong.wang, JinYu A vhost-user-blk example that support inflight feature. It uses the new APIs that introduced in the first patch, so It can show how there APIs work to support inflight feature. Signed-off-by: Jin Yu <jin.yu@intel.com> --- V1 - add the case. V2 - add the rte_vhost prefix. V3 - add packed ring support --- examples/vhost_blk/Makefile | 67 ++ examples/vhost_blk/blk.c | 122 +++ examples/vhost_blk/blk_spec.h | 95 ++ examples/vhost_blk/meson.build | 20 + examples/vhost_blk/vhost_blk.c | 1272 +++++++++++++++++++++++++ examples/vhost_blk/vhost_blk.h | 114 +++ examples/vhost_blk/vhost_blk_compat.c | 193 ++++ 7 files changed, 1883 insertions(+) create mode 100644 examples/vhost_blk/Makefile create mode 100644 examples/vhost_blk/blk.c create mode 100644 examples/vhost_blk/blk_spec.h create mode 100644 examples/vhost_blk/meson.build create mode 100644 examples/vhost_blk/vhost_blk.c create mode 100644 examples/vhost_blk/vhost_blk.h create mode 100644 examples/vhost_blk/vhost_blk_compat.c diff --git a/examples/vhost_blk/Makefile b/examples/vhost_blk/Makefile new file mode 100644 index 000000000..52e9befd8 --- /dev/null +++ b/examples/vhost_blk/Makefile @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2017 Intel Corporation + +# binary name +APP = vhost-blk + +# all source are stored in SRCS-y +SRCS-y := blk.c vhost_blk.c vhost_blk_compat.c + +# Build using pkg-config variables if possible +$(shell pkg-config --exists libdpdk) +ifeq ($(.SHELLSTATUS),0) + +all: shared +.PHONY: shared static +shared: build/$(APP)-shared + ln -sf $(APP)-shared build/$(APP) +static: build/$(APP)-static + ln -sf $(APP)-static build/$(APP) + +CFLAGS += -D_FILE_OFFSET_BITS=64 +LDFLAGS += -pthread + +PC_FILE := $(shell pkg-config --path libdpdk) +CFLAGS += -O3 $(shell pkg-config --cflags libdpdk) +LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk) +LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk) + +build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build + $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED) + +build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build + $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC) + +build: + @mkdir -p $@ + +.PHONY: clean +clean: + rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared + test -d build && rmdir -p build || true + +else # Build using legacy build system + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +# Default target, detect a build directory, by looking for a path with a .config +RTE_TARGET ?= $(notdir $(abspath $(dir $(firstword $(wildcard $(RTE_SDK)/*/.config))))) + +include $(RTE_SDK)/mk/rte.vars.mk + +ifneq ($(CONFIG_RTE_EXEC_ENV_LINUX),y) +$(info This application can only operate in a linux environment, \ +please change the definition of the RTE_TARGET environment variable) +all: +else + +CFLAGS += -D_FILE_OFFSET_BITS=64 +CFLAGS += -O2 +#CFLAGS += $(WERROR_FLAGS) + +include $(RTE_SDK)/mk/rte.extapp.mk + +endif +endif diff --git a/examples/vhost_blk/blk.c b/examples/vhost_blk/blk.c new file mode 100644 index 000000000..3ecd0e206 --- /dev/null +++ 
b/examples/vhost_blk/blk.c @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +/** + * This work is largely based on the "vhost-user-blk" implementation by + * SPDK(https://github.com/spdk/spdk). + */ + +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <assert.h> +#include <ctype.h> +#include <string.h> +#include <stddef.h> + +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_log.h> +#include <rte_malloc.h> +#include <rte_byteorder.h> +#include <rte_string_fns.h> + +#include "vhost_blk.h" +#include "blk_spec.h" + +static void +vhost_strcpy_pad(void *dst, const char *src, size_t size, int pad) +{ + size_t len; + + len = strlen(src); + if (len < size) { + memcpy(dst, src, len); + memset((char *)dst + len, pad, size - len); + } else { + memcpy(dst, src, size); + } +} + +static int +vhost_bdev_blk_readwrite(struct vhost_block_dev *bdev, + struct vhost_blk_task *task, + uint64_t lba_512, __rte_unused uint32_t xfer_len) +{ + uint32_t i; + uint64_t offset; + uint32_t nbytes = 0; + + offset = lba_512 * 512; + + for (i = 0; i < task->iovs_cnt; i++) { + if (task->dxfer_dir == BLK_DIR_TO_DEV) + memcpy(bdev->data + offset, task->iovs[i].iov_base, + task->iovs[i].iov_len); + else + memcpy(task->iovs[i].iov_base, bdev->data + offset, + task->iovs[i].iov_len); + offset += task->iovs[i].iov_len; + nbytes += task->iovs[i].iov_len; + } + + return nbytes; +} + +int +vhost_bdev_process_blk_commands(struct vhost_block_dev *bdev, + struct vhost_blk_task *task) +{ + int used_len; + + if (unlikely(task->data_len > (bdev->blockcnt * bdev->blocklen))) { + fprintf(stderr, "read or write beyond capacity\n"); + return VIRTIO_BLK_S_UNSUPP; + } + + switch (task->req->type) { + case VIRTIO_BLK_T_IN: + if (unlikely(task->data_len == 0 || (task->data_len & (512 - 1)) != 0)) { + fprintf(stderr, + "%s - passed IO buffer is not multiple of 512b" + "(req_idx = %"PRIu16").\n", + task->req->type ? "WRITE" : "READ", task->head_idx); + return VIRTIO_BLK_S_UNSUPP; + } + + task->dxfer_dir = BLK_DIR_FROM_DEV; + vhost_bdev_blk_readwrite(bdev, task, + task->req->sector, task->data_len); + break; + case VIRTIO_BLK_T_OUT: + if (unlikely(task->data_len == 0 || (task->data_len & (512 - 1)) != 0)) { + fprintf(stderr, + "%s - passed IO buffer is not multiple of 512b" + "(req_idx = %"PRIu16").\n", + task->req->type ? 
"WRITE" : "READ", task->head_idx); + return VIRTIO_BLK_S_UNSUPP; + } + + if (task->readtype) { + fprintf(stderr, "type isn't right\n"); + return VIRTIO_BLK_S_IOERR; + } + task->dxfer_dir = BLK_DIR_TO_DEV; + vhost_bdev_blk_readwrite(bdev, task, + task->req->sector, task->data_len); + break; + case VIRTIO_BLK_T_GET_ID: + if (!task->iovs_cnt || task->data_len) + return VIRTIO_BLK_S_UNSUPP; + used_len = min(VIRTIO_BLK_ID_BYTES, task->data_len); + vhost_strcpy_pad(task->iovs[0].iov_base, + bdev->product_name, used_len, ' '); + break; + default: + fprintf(stderr, "unsupported cmd\n"); + return VIRTIO_BLK_S_UNSUPP; + } + + return VIRTIO_BLK_S_OK; +} diff --git a/examples/vhost_blk/blk_spec.h b/examples/vhost_blk/blk_spec.h new file mode 100644 index 000000000..5875e2f86 --- /dev/null +++ b/examples/vhost_blk/blk_spec.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _BLK_SPEC_H +#define _BLK_SPEC_H + +#include <stdint.h> + +#ifndef VHOST_USER_MEMORY_MAX_NREGIONS +#define VHOST_USER_MEMORY_MAX_NREGIONS 8 +#endif + +#ifndef VHOST_USER_MAX_CONFIG_SIZE +#define VHOST_USER_MAX_CONFIG_SIZE 256 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_CONFIG +#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 +#endif + +#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ + +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 +#define VIRTIO_BLK_T_FLUSH 4 +#define VIRTIO_BLK_T_GET_ID 8 +#define VIRTIO_BLK_T_DISCARD 11 +#define VIRTIO_BLK_T_WRITE_ZEROES 13 + +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_MAX +}; + +/** Get/set config msg payload */ +struct vhost_user_config { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +}; + +/** Fixed-size vhost_memory struct */ +struct vhost_memory_padded { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[VHOST_USER_MEMORY_MAX_NREGIONS]; +}; + +struct vhost_user_msg { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /**< the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_memory_padded memory; + struct vhost_user_config cfg; + } payload; +} __attribute((packed)); + +#endif diff --git a/examples/vhost_blk/meson.build b/examples/vhost_blk/meson.build new file mode 100644 index 000000000..028aa4f62 --- /dev/null +++ b/examples/vhost_blk/meson.build @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +# meson file, for building this example 
as part of a main DPDK build. +# +# To build this example as a standalone application with an already-installed +# DPDK instance, use 'make' + +if not is_linux + build = false +endif + +if not cc.has_header('linux/virtio_blk.h') + build = false +endif + +deps += 'vhost' +sources = files( + 'blk.c', 'vhost_blk.c', 'vhost_blk_compat.c' +) diff --git a/examples/vhost_blk/vhost_blk.c b/examples/vhost_blk/vhost_blk.c new file mode 100644 index 000000000..d0dab4f8b --- /dev/null +++ b/examples/vhost_blk/vhost_blk.c @@ -0,0 +1,1272 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include <stdint.h> +#include <unistd.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <semaphore.h> +#include <linux/virtio_blk.h> +#include <linux/virtio_ring.h> + +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_log.h> +#include <rte_malloc.h> +#include <rte_vhost.h> + +#include "vhost_blk.h" +#include "blk_spec.h" + +#define VIRTQ_DESC_F_NEXT 1 +#define VIRTQ_DESC_F_AVAIL (1 << 7) +#define VIRTQ_DESC_F_USED (1 << 15) + +#define MAX_TASK 12 + +#define VHOST_BLK_FEATURES ((1ULL << VIRTIO_F_RING_PACKED) | \ + (1ULL << VIRTIO_F_VERSION_1) |\ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) + +/* Path to folder where character device will be created. Can be set by user. */ +static char dev_pathname[PATH_MAX] = ""; +static sem_t exit_sem; + +struct vhost_blk_ctrlr * +vhost_blk_ctrlr_find(const char *ctrlr_name) +{ + /* currently we only support 1 socket file fd */ + return g_vhost_ctrlr; +} + +static uint64_t gpa_to_vva(int vid, uint64_t gpa, uint64_t *len) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + int ret = 0; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + assert(ret != 0); + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + assert(ctrlr != NULL); + } + + assert(ctrlr->mem != NULL); + + return rte_vhost_va_from_guest_pa(ctrlr->mem, gpa, len); +} + +static struct vring_packed_desc * +descriptor_get_next_packed(struct rte_vhost_vring *vq, + uint16_t *idx) +{ + if (vq->desc_packed[*idx & (vq->size - 1)].flags & VIRTQ_DESC_F_NEXT) { + *idx += 1; + return &vq->desc_packed[*idx & (vq->size - 1)]; + } + + return NULL; +} + +static bool +descriptor_has_next_packed(struct vring_packed_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_NEXT); +} + +static bool +descriptor_is_wr_packed(struct vring_packed_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +static struct inflight_desc_packed * +inflight_desc_get_next(struct inflight_info_packed *inflight_packed, + struct inflight_desc_packed *cur_desc) +{ + if (!!(cur_desc->flags & VIRTQ_DESC_F_NEXT)) { + return &inflight_packed->desc[cur_desc->next]; + } + + return NULL; +} + +static bool +inflight_desc_has_next(struct inflight_desc_packed *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_NEXT); +} + +static bool +inflight_desc_is_wr(struct inflight_desc_packed *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +static void +inflight_process_payload_chain_packed(struct inflight_blk_task *task) +{ + void *data; + uint64_t chunck_len; + + task->blk_task.iovs_cnt = 0; + + do { + chunck_len = task->inflight_desc->len; + data = (void *)(uintptr_t)gpa_to_vva(task->blk_task.bdev->vid, + task->inflight_desc->addr, + &chunck_len); + if (!data || chunck_len != 
task->inflight_desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + return; + } + + task->blk_task.iovs[task->blk_task.iovs_cnt].iov_base = data; + task->blk_task.iovs[task->blk_task.iovs_cnt].iov_len = + task->inflight_desc->len; + task->blk_task.data_len += task->inflight_desc->len; + task->blk_task.iovs_cnt++; + task->inflight_desc = inflight_desc_get_next(task->inflight_packed, + task->inflight_desc); + } while (inflight_desc_has_next(task->inflight_desc)); + + chunck_len = task->inflight_desc->len; + task->blk_task.status = (void *)(uintptr_t)gpa_to_vva(task->blk_task.bdev->vid, + task->inflight_desc->addr, + &chunck_len); + if (!task->blk_task.status || chunck_len != task->inflight_desc->len) + fprintf(stderr, "failed to translate desc address.\n"); +} + +static void +inflight_submit_completion_packed(struct inflight_blk_task *task, + uint32_t q_idx, uint16_t *used_id, + bool *used_wrap_counter) +{ + struct vhost_blk_ctrlr *ctrlr; + struct rte_vhost_vring *vq; + struct vring_packed_desc *desc; + uint16_t flags; + uint16_t entry_num; + int ret; + + ctrlr = vhost_blk_ctrlr_find(dev_pathname); + vq = task->blk_task.vq; + + ret = rte_vhost_set_last_inflight_io_packed(ctrlr->bdev->vid, q_idx, + task->blk_task.head_idx); + if (ret != 0) + fprintf(stderr, "fail to set last inflight io\n"); + + desc = &vq->desc_packed[*used_id]; + desc->id = task->blk_task.buffer_id; + rte_compiler_barrier(); + if (*used_wrap_counter) { + desc->flags = desc->flags | VIRTQ_DESC_F_AVAIL | + VIRTQ_DESC_F_USED; + } else { + desc->flags = desc->flags & ~(VIRTQ_DESC_F_AVAIL | + VIRTQ_DESC_F_USED); + } + + *used_id += task->blk_task.iovs_cnt + 2; + if (*used_id > vq->size) { + *used_id &= vq->size - 1; + *used_wrap_counter = !(*used_wrap_counter); + } + + ret = rte_vhost_clr_inflight_desc_packed(ctrlr->bdev->vid, q_idx, + task->blk_task.head_idx); + if (ret != 0) + fprintf(stderr, "fail to clear inflight io\n"); + + /* Send an interrupt back to the guest VM so that it knows + * a completion is ready to be processed. + */ + rte_vhost_vring_call(task->blk_task.bdev->vid, q_idx); +} + +static void +submit_completion_packed(struct vhost_blk_task *task, uint32_t q_idx, + uint16_t *used_id, bool *used_wrap_counter) +{ + struct vhost_blk_ctrlr *ctrlr; + struct rte_vhost_vring *vq; + struct vring_packed_desc *desc; + uint16_t entry_num; + int ret; + + ctrlr = vhost_blk_ctrlr_find(dev_pathname); + vq = task->vq; + + ret = rte_vhost_set_last_inflight_io_packed(ctrlr->bdev->vid, q_idx, + task->inflight_idx); + if (ret != 0) + fprintf(stderr, "fail to set last inflight io\n"); + + desc = &vq->desc_packed[*used_id]; + desc->id = task->buffer_id; + rte_compiler_barrier(); + if (*used_wrap_counter) { + desc->flags = desc->flags | VIRTQ_DESC_F_AVAIL | + VIRTQ_DESC_F_USED; + } else { + desc->flags = desc->flags & ~(VIRTQ_DESC_F_AVAIL | + VIRTQ_DESC_F_USED); + } + rte_compiler_barrier(); + + *used_id += task->iovs_cnt + 2; + if (*used_id >= vq->size) { + *used_id &= vq->size - 1; + *used_wrap_counter = !(*used_wrap_counter); + } + + ret = rte_vhost_clr_inflight_desc_packed(ctrlr->bdev->vid, q_idx, + task->inflight_idx); + if (ret != 0) + fprintf(stderr, "fail to clear inflight io\n"); + + /* Send an interrupt back to the guest VM so that it knows + * a completion is ready to be processed. 
+ */ + rte_vhost_vring_call(task->bdev->vid, q_idx); +} + +static void +vhost_process_payload_chain_packed(struct vhost_blk_task *task, uint16_t *idx) +{ + void *data; + uint64_t chunck_len; + + task->iovs_cnt = 0; + + do { + chunck_len = task->desc_packed->len; + data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_packed->addr, + &chunck_len); + if (!data || chunck_len != task->desc_packed->len) { + fprintf(stderr, "failed to translate desc address.\n"); + return; + } + + task->iovs[task->iovs_cnt].iov_base = data; + task->iovs[task->iovs_cnt].iov_len = task->desc_packed->len; + task->data_len += task->desc_packed->len; + task->iovs_cnt++; + task->desc_packed = descriptor_get_next_packed(task->vq, idx); + } while (descriptor_has_next_packed(task->desc_packed)); + + task->last_idx = *idx & (task->vq->size - 1); + chunck_len = task->desc_packed->len; + task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_packed->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_packed->len) + fprintf(stderr, "failed to translate desc address.\n"); +} + + +static int +descriptor_is_available(struct rte_vhost_vring *vring, uint16_t idx, + bool avail_wrap_counter) +{ + uint16_t flags = vring->desc_packed[idx].flags; + + return ((!!(flags & VIRTQ_DESC_F_AVAIL) == avail_wrap_counter) && + (!!(flags & VIRTQ_DESC_F_USED) != avail_wrap_counter)); +} + +static int +descriptor_is_used(struct rte_vhost_vring *vring, uint16_t idx, + bool used_wrap_counter) +{ + uint16_t flags = vring->desc_packed[idx].flags; + + return ((!!(flags & VIRTQ_DESC_F_AVAIL) == used_wrap_counter) && + (!!(flags & VIRTQ_DESC_F_USED) == used_wrap_counter)); +} + +static void +process_requestq_packed(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + bool avail_wrap_counter, used_wrap_counter; + uint16_t avail_idx, used_idx; + int ret; + uint64_t chunck_len; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + struct vhost_blk_task *task; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + vq = &blk_vq->vq; + + avail_idx = blk_vq->last_avail_idx; + avail_wrap_counter = blk_vq->avail_wrap_counter; + used_idx = blk_vq->last_used_idx; + used_wrap_counter = blk_vq->used_wrap_counter; + + task = rte_zmalloc(NULL, sizeof(*task), 0); + assert(task != NULL); + task->vq = vq; + task->bdev = ctrlr->bdev; + + while (descriptor_is_available(vq, avail_idx, avail_wrap_counter)) { + task->head_idx = avail_idx; + task->desc_packed = &task->vq->desc_packed[task->head_idx]; + task->iovs_cnt = 0; + task->data_len = 0; + task->req = NULL; + task->status = NULL; + + /* does not support indirect descriptors */ + assert((task->desc_packed->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->desc_packed->len; + task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_packed->addr, + &chunck_len); + if (!task->req || chunck_len != task->desc_packed->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + + task->desc_packed = descriptor_get_next_packed(task->vq, &avail_idx); + assert(task->desc_packed != NULL); + if (!descriptor_has_next_packed(task->desc_packed)) { + task->dxfer_dir = BLK_DIR_NONE; + task->last_idx = avail_idx & (vq->size - 1); + chunck_len = task->desc_packed->len; + task->status = (void *)(uintptr_t) + gpa_to_vva(task->bdev->vid, + task->desc_packed->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_packed->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } 
+ } else { + task->readtype = descriptor_is_wr_packed(task->desc_packed); + vhost_process_payload_chain_packed(task, &avail_idx); + } + task->buffer_id = vq->desc_packed[task->last_idx].id; + rte_vhost_set_inflight_desc_packed(ctrlr->bdev->vid, q_idx, + task->head_idx, + task->last_idx, + &task->inflight_idx); + + if (++avail_idx >= vq->size) { + avail_idx &= vq->size - 1; + avail_wrap_counter = !avail_wrap_counter; + } + blk_vq->last_avail_idx = avail_idx; + blk_vq->avail_wrap_counter = avail_wrap_counter; + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task); + if (ret) { + /* invalid response */ + *task->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->status = VIRTIO_BLK_S_OK; + } + + submit_completion_packed(task, q_idx, &used_idx, &used_wrap_counter); + blk_vq->last_used_idx = used_idx; + blk_vq->used_wrap_counter = used_wrap_counter; + } + + rte_free(task); +} + +static void +submit_inflight_vq_packed(struct vhost_blk_ctrlr *ctrlr, uint16_t q_idx) +{ + bool used_wrap_counter; + int req_idx, ret; + uint16_t used_idx; + uint64_t chunck_len; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_ring_inflight *inflight_vq; + struct rte_vhost_vring *vq; + struct inflight_blk_task *task; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + vq = &blk_vq->vq; + inflight_vq = &blk_vq->inflight_vq; + used_idx = blk_vq->last_used_idx; + used_wrap_counter = blk_vq->used_wrap_counter; + + task = rte_malloc(NULL, sizeof(*task), 0); + if (!task) { + fprintf(stderr, "fail to allocate memory\n"); + return; + } + task->blk_task.vq = vq; + task->blk_task.bdev = ctrlr->bdev; + task->inflight_packed = inflight_vq->inflight_packed; + + while (inflight_vq->resubmit_inflight->resubmit_num-- > 0) { + req_idx = inflight_vq->resubmit_inflight->resubmit_num; + task->blk_task.head_idx = + inflight_vq->resubmit_inflight->resubmit_list[req_idx].index; + task->inflight_desc = + &inflight_vq->inflight_packed->desc[task->blk_task.head_idx]; + task->blk_task.iovs_cnt = 0; + task->blk_task.data_len = 0; + task->blk_task.req = NULL; + task->blk_task.status = NULL; + + /* update the avail idx too as it's initial value equals to used idx */ + blk_vq->last_avail_idx += task->inflight_desc->num; + if (blk_vq->last_avail_idx >= vq->size) { + blk_vq->last_avail_idx &= vq->size - 1; + blk_vq->avail_wrap_counter = !blk_vq->avail_wrap_counter; + } + + /* does not support indirect descriptors */ + assert(task->inflight_desc != NULL); + assert((task->inflight_desc->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->inflight_desc->len;; + task->blk_task.req = (void *)(uintptr_t) + gpa_to_vva(task->blk_task.bdev->vid, + task->inflight_desc->addr, + &chunck_len); + if (!task->blk_task.req || chunck_len != task->inflight_desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + + task->inflight_desc = inflight_desc_get_next(task->inflight_packed, + task->inflight_desc); + assert(task->inflight_desc != NULL); + if (!inflight_desc_has_next(task->inflight_desc)) { + task->blk_task.dxfer_dir = BLK_DIR_NONE; + chunck_len = task->inflight_desc->len; + task->blk_task.status = (void *)(uintptr_t) + gpa_to_vva(task->blk_task.bdev->vid, + task->inflight_desc->addr, + &chunck_len); + if (!task->blk_task.status || + chunck_len != task->inflight_desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + } else { + task->blk_task.readtype = + inflight_desc_is_wr(task->inflight_desc); + 
inflight_process_payload_chain_packed(task); + } + + task->blk_task.buffer_id = task->inflight_desc->id; + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, &task->blk_task); + if (ret) + /* invalid response */ + *task->blk_task.status = VIRTIO_BLK_S_IOERR; + else + /* successfully */ + *task->blk_task.status = VIRTIO_BLK_S_OK; + + inflight_submit_completion_packed(task, q_idx, &used_idx, + &used_wrap_counter); + + blk_vq->last_used_idx = used_idx; + blk_vq->used_wrap_counter = used_wrap_counter; + } + + rte_free(task); +} + +static void +process_batch_requests_packed(struct vhost_blk_ctrlr *ctrlr, uint16_t q_idx) +{ + bool avail_wrap_counter, used_wrap_counter; + uint16_t avail_idx, used_idx; + int ret, i, j; + uint64_t chunck_len; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + struct vhost_blk_task *task[MAX_TASK] = {NULL}; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + vq = &blk_vq->vq; + + avail_idx = blk_vq->last_avail_idx; + avail_wrap_counter = blk_vq->avail_wrap_counter; + used_idx = blk_vq->last_used_idx; + used_wrap_counter = blk_vq->used_wrap_counter; + + for (i = 0; i < MAX_TASK; i++) { + task[i] = rte_zmalloc(NULL, sizeof(struct vhost_blk_task), 0); + assert(task[i] != NULL); + task[i]->vq = vq; + task[i]->bdev = ctrlr->bdev; + } + + i = 0; + j = 0; + while (j++ < 1000000 && + descriptor_is_available(vq, avail_idx, avail_wrap_counter) && + i < MAX_TASK) { + task[i]->head_idx = avail_idx; + task[i]->desc_packed = &task[i]->vq->desc_packed[task[i]->head_idx]; + task[i]->iovs_cnt = 0; + task[i]->data_len = 0; + task[i]->req = NULL; + task[i]->status = NULL; + + /* does not support indirect descriptors */ + assert((task[i]->desc_packed->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task[i]->desc_packed->len; + task[i]->req = (void *)(uintptr_t)gpa_to_vva(task[i]->bdev->vid, + task[i]->desc_packed->addr, + &chunck_len); + if (!task[i]->req || chunck_len != task[i]->desc_packed->len) { + fprintf(stderr, "failed to translate desc address.\n"); + goto free_task; + } + + task[i]->desc_packed = descriptor_get_next_packed(task[i]->vq, &avail_idx); + assert(task[i]->desc_packed != NULL); + if (!descriptor_has_next_packed(task[i]->desc_packed)) { + task[i]->dxfer_dir = BLK_DIR_NONE; + task[i]->last_idx = avail_idx & (vq->size - 1); + chunck_len = task[i]->desc_packed->len; + task[i]->status = (void *)(uintptr_t) + gpa_to_vva(task[i]->bdev->vid, + task[i]->desc_packed->addr, + &chunck_len); + if (!task[i]->status || chunck_len != task[i]->desc_packed->len) { + fprintf(stderr, "failed to translate desc address.\n"); + goto free_task; + } + } else { + task[i]->readtype = descriptor_is_wr_packed(task[i]->desc_packed); + vhost_process_payload_chain_packed(task[i], &avail_idx); + } + task[i]->buffer_id = vq->desc_packed[task[i]->last_idx].id; + rte_vhost_set_inflight_desc_packed(ctrlr->bdev->vid, q_idx, + task[i]->head_idx, + task[i]->last_idx, + &task[i]->inflight_idx); + + if (++avail_idx >= vq->size) { + avail_idx &= vq->size - 1; + avail_wrap_counter = !avail_wrap_counter; + } + blk_vq->last_avail_idx = avail_idx; + blk_vq->avail_wrap_counter = avail_wrap_counter; + + i++; + if (g_should_stop == 1) { + goto free_task; + } + } + + for (j = 0; j < i; j++) { + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task[j]); + if (ret) { + /* invalid response */ + *task[j]->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task[j]->status = VIRTIO_BLK_S_OK; + } + + submit_completion_packed(task[j], q_idx, &used_idx, &used_wrap_counter); + 
blk_vq->last_used_idx = used_idx; + blk_vq->used_wrap_counter = used_wrap_counter; + + if (g_should_stop == 1) + goto free_task; + } + +free_task: + for (j = 0; j < MAX_TASK; j++) + rte_free(task[j]); +} + +static struct vring_desc * +descriptor_get_next_split(struct vring_desc *vq_desc, + struct vring_desc *cur_desc) +{ + return &vq_desc[cur_desc->next]; +} + +static bool +descriptor_has_next_split(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_NEXT); +} + +static bool +descriptor_is_wr_split(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +static void +vhost_process_payload_chain_split(struct vhost_blk_task *task) +{ + void *data; + uint64_t chunck_len; + + task->iovs_cnt = 0; + + do { + chunck_len = task->desc_split->len; + data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!data || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + return; + } + + task->iovs[task->iovs_cnt].iov_base = data; + task->iovs[task->iovs_cnt].iov_len = task->desc_split->len; + task->data_len += task->desc_split->len; + task->iovs_cnt++; + task->desc_split = + descriptor_get_next_split(task->vq->desc, task->desc_split); + } while (descriptor_has_next_split(task->desc_split)); + + chunck_len = task->desc_split->len; + task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_split->len) + fprintf(stderr, "failed to translate desc address.\n"); +} + +static void +submit_completion_split(struct vhost_blk_task *task, uint32_t vid, uint32_t q_idx) +{ + struct rte_vhost_vring *vq; + struct vring_used *used; + + vq = task->vq; + used = vq->used; + + rte_vhost_set_last_inflight_io_split(vid, q_idx, task->req_idx); + + /* Fill out the next entry in the "used" ring. id = the + * index of the descriptor that contained the blk request. + * len = the total amount of data transferred for the blk + * request. We must report the correct len, for variable + * length blk CDBs, where we may return less data than + * allocated by the guest VM. + */ + used->ring[used->idx & (vq->size - 1)].id = task->req_idx; + used->ring[used->idx & (vq->size - 1)].len = task->data_len; + rte_compiler_barrier(); + used->idx++; + rte_compiler_barrier(); + + rte_vhost_clr_inflight_desc_split(vid, q_idx, used->idx, task->req_idx); + + /* Send an interrupt back to the guest VM so that it knows + * a completion is ready to be processed. 
+ */ + rte_vhost_vring_call(task->bdev->vid, q_idx); +} + +static void +submit_inflight_vq_split(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + struct vhost_blk_queue *blk_vq; + struct rte_vhost_ring_inflight *inflight_vq; + struct rte_vhost_resubmit_info *resubmit_inflight; + struct rte_vhost_resubmit_desc *resubmit_list; + struct vhost_blk_task *task; + int req_idx; + uint64_t chunck_len; + int ret; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + inflight_vq = &blk_vq->inflight_vq; + resubmit_inflight = inflight_vq->resubmit_inflight; + resubmit_list = resubmit_inflight->resubmit_list; + + task = rte_zmalloc(NULL, sizeof(*task), 0); + assert(task != NULL); + + task->ctrlr = ctrlr; + task->bdev = ctrlr->bdev; + task->vq = &blk_vq->vq; + + while (resubmit_inflight->resubmit_num-- > 0) { + req_idx = resubmit_list[resubmit_inflight->resubmit_num].index; + task->req_idx = req_idx; + task->desc_split = &task->vq->desc[task->req_idx]; + task->iovs_cnt = 0; + task->data_len = 0; + task->req = NULL; + task->status = NULL; + + /* does not support indirect descriptors */ + assert(task->desc_split != NULL); + assert((task->desc_split->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->desc_split->len; + task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->req || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + + task->desc_split = descriptor_get_next_split(task->vq->desc, + task->desc_split); + if (!descriptor_has_next_split(task->desc_split)) { + task->dxfer_dir = BLK_DIR_NONE; + chunck_len = task->desc_split->len; + task->status = (void *)(uintptr_t) + gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + } else { + task->readtype = descriptor_is_wr_split(task->desc_split); + vhost_process_payload_chain_split(task); + } + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task); + if (ret) { + /* invalid response */ + *task->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->status = VIRTIO_BLK_S_OK; + } + submit_completion_split(task, ctrlr->bdev->vid, q_idx); + } + + rte_free(task); +} + +static void +process_requestq_split(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + int ret; + int req_idx; + uint16_t last_idx; + uint64_t chunck_len; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + struct vhost_blk_task *task; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + vq = &blk_vq->vq; + + task = rte_zmalloc(NULL, sizeof(*task), 0); + assert(task != NULL); + task->ctrlr = ctrlr; + task->bdev = ctrlr->bdev; + task->vq = vq; + + while (vq->avail->idx != blk_vq->last_avail_idx) { + last_idx = blk_vq->last_avail_idx & (vq->size - 1); + req_idx = vq->avail->ring[last_idx]; + task->req_idx = req_idx; + task->desc_split = &task->vq->desc[task->req_idx]; + task->iovs_cnt = 0; + task->data_len = 0; + task->req = NULL; + task->status = NULL; + + rte_vhost_set_inflight_desc_split(ctrlr->bdev->vid, q_idx, task->req_idx); + + /* does not support indirect descriptors */ + assert((task->desc_split->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->desc_split->len; + task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->req || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to 
translate desc address.\n"); + rte_free(task); + return; + } + + task->desc_split = descriptor_get_next_split(task->vq->desc, + task->desc_split); + if (!descriptor_has_next_split(task->desc_split)) { + task->dxfer_dir = BLK_DIR_NONE; + chunck_len = task->desc_split->len; + task->status = (void *)(uintptr_t) + gpa_to_vva(task->bdev->vid, + task->desc_split->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + } else { + task->readtype = descriptor_is_wr_split(task->desc_split); + vhost_process_payload_chain_split(task); + } + blk_vq->last_avail_idx++; + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task); + if (ret) { + /* invalid response */ + *task->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->status = VIRTIO_BLK_S_OK; + } + + submit_completion_split(task, ctrlr->bdev->vid, q_idx); + } + + rte_free(task); +} + +static void +process_batch_requests_split(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + int i, j; + int ret; + int req_idx; + uint16_t last_idx; + uint64_t chunck_len; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + struct vhost_blk_task *task[MAX_TASK] = {NULL}; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + vq = &blk_vq->vq; + + for (i = 0; i < MAX_TASK; i++) { + task[i] = rte_zmalloc(NULL, sizeof(struct vhost_blk_task), 0); + assert(task[i] != NULL); + task[i]->ctrlr = ctrlr; + task[i]->bdev = ctrlr->bdev; + task[i]->vq = vq; + } + + i = 0; + j = 0; + while (j++ < 1000000 && vq->avail->idx != blk_vq->last_avail_idx && i < MAX_TASK) { + last_idx = blk_vq->last_avail_idx & (vq->size - 1); + req_idx = vq->avail->ring[last_idx]; + task[i]->req_idx = req_idx; + task[i]->desc_split = &task[i]->vq->desc[task[i]->req_idx]; + task[i]->iovs_cnt = 0; + task[i]->data_len = 0; + task[i]->req = NULL; + task[i]->status = NULL; + + rte_vhost_set_inflight_desc_split(ctrlr->bdev->vid, q_idx, task[i]->req_idx); + + /* does not support indirect descriptors */ + assert((task[i]->desc_split->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task[i]->desc_split->len; + task[i]->req = (void *)(uintptr_t)gpa_to_vva(task[i]->bdev->vid, + task[i]->desc_split->addr, + &chunck_len); + if (!task[i]->req || chunck_len != task[i]->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + goto free_tasks; + } + + task[i]->desc_split = descriptor_get_next_split(task[i]->vq->desc, + task[i]->desc_split); + if (!descriptor_has_next_split(task[i]->desc_split)) { + task[i]->dxfer_dir = BLK_DIR_NONE; + chunck_len = task[i]->desc_split->len; + task[i]->status = (void *)(uintptr_t) + gpa_to_vva(task[i]->bdev->vid, + task[i]->desc_split->addr, + &chunck_len); + if (!task[i]->status || chunck_len != task[i]->desc_split->len) { + fprintf(stderr, "failed to translate desc address.\n"); + goto free_tasks; + } + } else { + task[i]->readtype = descriptor_is_wr_split(task[i]->desc_split); + vhost_process_payload_chain_split(task[i]); + } + blk_vq->last_avail_idx++; + i++; + + if (g_should_stop == 1) + goto free_tasks; + } + + for (j = 0; j < i; j++) { + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task[j]); + if (ret) { + /* invalid response */ + *task[j]->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task[j]->status = VIRTIO_BLK_S_OK; + } + + submit_completion_split(task[j], ctrlr->bdev->vid, q_idx); + + if (g_should_stop == 1) + goto free_tasks; + } + +free_tasks: + for (i = 0; i < MAX_TASK; i++) + 
rte_free(task[i]); +} + +static void * +ctrlr_worker(void *arg) +{ + struct vhost_blk_ctrlr *ctrlr = (struct vhost_blk_ctrlr *)arg; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_ring_inflight *inflight_vq; + cpu_set_t cpuset; + pthread_t thread; + int i, ret; + + fprintf(stdout, "Ctrlr Worker Thread start\n"); + + if (ctrlr == NULL || ctrlr->bdev == NULL) { + fprintf(stderr, "%s: Error, invalid argument passed to worker thread\n", + __func__); + exit(0); + } + + thread = pthread_self(); + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + blk_vq = &ctrlr->bdev->queues[i]; + inflight_vq = &blk_vq->inflight_vq; + if (inflight_vq->resubmit_inflight != NULL && + inflight_vq->resubmit_inflight->resubmit_num != 0) { + if (ctrlr->packed) + submit_inflight_vq_packed(ctrlr, i); + else + submit_inflight_vq_split(ctrlr, i); + } + } + + while (!g_should_stop && ctrlr->bdev != NULL) { + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + if (ctrlr->packed) + process_requestq_packed(ctrlr, i); + else + process_requestq_split(ctrlr, i); + } + } + + g_should_stop = 2; + fprintf(stdout, "Ctrlr Worker Thread Exiting\n"); + sem_post(&exit_sem); + return NULL; +} + +static int +new_device(int vid) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + pthread_t tid; + int i, ret; + uint64_t features; + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + if (ctrlr->started) + return 0; + + ctrlr->bdev->vid = vid; + ctrlr->packed = rte_vhost_vq_is_packed(vid); + + ret = rte_vhost_get_mem_table(vid, &ctrlr->mem); + if (ret) + fprintf(stderr, "Get Controller memory region failed\n"); + assert(ctrlr->mem != NULL); + + /* Disable Notifications and init last idx */ + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + rte_vhost_enable_guest_notification(vid, i, 0); + + blk_vq = &ctrlr->bdev->queues[i]; + vq = &blk_vq->vq; + ret = rte_vhost_get_vring_base(ctrlr->bdev->vid, i, + &blk_vq->last_avail_idx, + &blk_vq->last_used_idx); + assert(ret == 0); + if (ctrlr->packed) { + ret = rte_vhost_get_vring_base_counter(ctrlr->bdev->vid, i, + &blk_vq->avail_wrap_counter, + &blk_vq->used_wrap_counter); + assert(ret == 0); + } + + ret = rte_vhost_get_vhost_vring(ctrlr->bdev->vid, i, vq); + assert(ret == 0); + + ret = rte_vhost_get_vhost_ring_inflight(ctrlr->bdev->vid, i, + &blk_vq->inflight_vq); + assert(ret == 0); + + if (ctrlr->packed) { + /* for the reconnection */ + ret = rte_vhost_get_vring_base_from_inflight(ctrlr->bdev->vid, i, + &blk_vq->last_avail_idx, + &blk_vq->last_used_idx); + assert(ret == 0); + rte_vhost_get_vring_base_counter_from_inflight(ctrlr->bdev->vid, + i, &blk_vq->avail_wrap_counter, + &blk_vq->used_wrap_counter); + assert(ret == 0); + } + } + + /* start polling vring */ + g_should_stop = 0; + fprintf(stdout, "New Device %s, Device ID %d\n", path, vid); + if (pthread_create(&tid, NULL, &ctrlr_worker, ctrlr) < 0) { + fprintf(stderr, "Worker Thread Started Failed\n"); + return -1; + } + + /* device has been started */ + ctrlr->started = 1; + pthread_detach(tid); + return 0; +} + +static void +destroy_device(int vid) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + int i, ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Destroy Ctrlr Failed\n"); + return; + } + 
fprintf(stdout, "Destroy %s Device ID %d\n", path, vid); + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Destroy Ctrlr Failed\n"); + return; + } + + if (!ctrlr->started) + return; + + g_should_stop = 1; + while(g_should_stop != 2); + + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + blk_vq = &ctrlr->bdev->queues[i]; + rte_vhost_set_vring_base(ctrlr->bdev->vid, i, + blk_vq->last_avail_idx, blk_vq->last_used_idx); + if (ctrlr->packed) { + rte_vhost_set_vring_base_counter(ctrlr->bdev->vid, i, + blk_vq->avail_wrap_counter, + blk_vq->used_wrap_counter); + } + } + + free(ctrlr->mem); + + ctrlr->started = 0; + sem_wait(&exit_sem); +} + +static int +new_connection(int vid) +{ + /* extend the proper features for block device */ + vhost_session_install_rte_compat_hooks(vid); +} + +struct vhost_device_ops vhost_blk_device_ops = { + .new_device = new_device, + .destroy_device = destroy_device, + .new_connection = new_connection, +}; + +static struct vhost_block_dev * +vhost_blk_bdev_construct(const char *bdev_name, const char *bdev_serial, + uint32_t blk_size, uint64_t blk_cnt, + bool wce_enable) +{ + struct vhost_block_dev *bdev; + + bdev = rte_zmalloc(NULL, sizeof(*bdev), RTE_CACHE_LINE_SIZE); + if (!bdev) + return NULL; + + strncpy(bdev->name, bdev_name, sizeof(bdev->name)); + strncpy(bdev->product_name, bdev_serial, sizeof(bdev->product_name)); + bdev->blocklen = blk_size; + bdev->blockcnt = blk_cnt; + bdev->write_cache = wce_enable; + + fprintf(stdout, "blocklen=%d, blockcnt=%d\n", bdev->blocklen, bdev->blockcnt); + + /* use memory as disk storage space */ + bdev->data = rte_zmalloc(NULL, blk_cnt * blk_size, 0); + if (!bdev->data) { + fprintf(stderr, "no enough reseverd huge memory for disk\n"); + free(bdev); + return NULL; + } + + return bdev; +} + +static struct vhost_blk_ctrlr * +vhost_blk_ctrlr_construct(const char *ctrlr_name) +{ + int ret; + struct vhost_blk_ctrlr *ctrlr; + char *path; + char cwd[PATH_MAX]; + + /* always use current directory */ + path = getcwd(cwd, PATH_MAX); + if (!path) { + fprintf(stderr, "Cannot get current working directory\n"); + return NULL; + } + snprintf(dev_pathname, sizeof(dev_pathname), "%s/%s", path, ctrlr_name); + + if (access(dev_pathname, F_OK) != -1) { + if (unlink(dev_pathname) != 0) + rte_exit(EXIT_FAILURE, "Cannot remove %s.\n", + dev_pathname); + } + + if (rte_vhost_driver_register(dev_pathname, 0) != 0) { + fprintf(stderr, "socket %s already exists\n", dev_pathname); + return NULL; + } + + ret = rte_vhost_driver_set_features(dev_pathname, VHOST_BLK_FEATURES); + if (ret != 0) { + fprintf(stderr, "Set vhost driver features failed\n"); + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + /* set proper features */ + vhost_dev_install_rte_compat_hooks(dev_pathname); + + ctrlr = rte_zmalloc(NULL, sizeof(*ctrlr), RTE_CACHE_LINE_SIZE); + if (!ctrlr) { + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + /* hardcoded block device information with 128MiB */ + ctrlr->bdev = vhost_blk_bdev_construct("malloc0", "vhost_blk_malloc0", + 4096, 32768, 0); + if (!ctrlr->bdev) { + rte_free(ctrlr); + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + rte_vhost_driver_callback_register(dev_pathname, + &vhost_blk_device_ops); + + return ctrlr; +} + +static void +signal_handler(__rte_unused int signum) +{ + struct vhost_blk_ctrlr *ctrlr; + + if (access(dev_pathname, F_OK) == 0) + unlink(dev_pathname); + + g_should_stop = 1; + + ctrlr = vhost_blk_ctrlr_find(NULL); + if (ctrlr != NULL) { + if (ctrlr->bdev != 
NULL) { + rte_free(ctrlr->bdev->data); + rte_free(ctrlr->bdev); + } + rte_free(ctrlr); + } + + rte_vhost_driver_unregister(dev_pathname); + exit(0); +} + +int main(int argc, char *argv[]) +{ + int ret; + + signal(SIGINT, signal_handler); + + /* init EAL */ + ret = rte_eal_init(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); + + g_vhost_ctrlr = vhost_blk_ctrlr_construct("vhost.socket"); + if (g_vhost_ctrlr == NULL) { + fprintf(stderr, "Construct vhost blk controller failed\n"); + return 0; + } + + if (sem_init(&exit_sem, 0, 0) < 0) { + fprintf(stderr, "Error init exit_sem\n"); + return -1; + } + + rte_vhost_driver_start(dev_pathname); + + /* loop for exit the application */ + while (1) + sleep(1); + + return 0; +} + diff --git a/examples/vhost_blk/vhost_blk.h b/examples/vhost_blk/vhost_blk.h new file mode 100644 index 000000000..6f6d20398 --- /dev/null +++ b/examples/vhost_blk/vhost_blk.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _VHOST_BLK_H_ +#define _VHOST_BLK_H_ + +#include <stdio.h> +#include <sys/uio.h> +#include <stdint.h> +#include <linux/virtio_blk.h> +#include <linux/virtio_ring.h> + +#include <rte_vhost.h> + +struct vhost_blk_queue { + struct rte_vhost_vring vq; + struct rte_vhost_ring_inflight inflight_vq; + uint16_t last_avail_idx; + uint16_t last_used_idx; + bool avail_wrap_counter; + bool used_wrap_counter; +}; + +#define NUM_OF_BLK_QUEUES 1 + +#ifndef VIRTIO_F_RING_PACKED +#define VIRTIO_F_RING_PACKED 34 +#endif + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +struct vhost_block_dev { + /** ID for vhost library. */ + int vid; + /** Queues for the block device */ + struct vhost_blk_queue queues[NUM_OF_BLK_QUEUES]; + /** Unique name for this block device. */ + char name[64]; + + /** Unique product name for this kind of block device. 
*/ + char product_name[256]; + + /** Size in bytes of a logical block for the backend */ + uint32_t blocklen; + + /** Number of blocks */ + uint64_t blockcnt; + + /** write cache enabled, not used at the moment */ + int write_cache; + + /** use memory as disk storage space */ + uint8_t *data; +}; + +struct vhost_blk_ctrlr { + uint8_t started; + uint8_t packed; + uint8_t need_restart; + /** Only support 1 LUN for the example */ + struct vhost_block_dev *bdev; + /** VM memory region */ + struct rte_vhost_memory *mem; +} __rte_cache_aligned; + +#define VHOST_BLK_MAX_IOVS 128 + +enum blk_data_dir { + BLK_DIR_NONE = 0, + BLK_DIR_TO_DEV = 1, + BLK_DIR_FROM_DEV = 2, +}; + +struct vhost_blk_task { + uint8_t readtype; + uint8_t req_idx; + uint16_t head_idx; + uint16_t last_idx; + uint16_t inflight_idx; + uint16_t buffer_id; + uint32_t dxfer_dir; + uint32_t data_len; + struct virtio_blk_outhdr *req; + volatile uint8_t *status; + struct iovec iovs[VHOST_BLK_MAX_IOVS]; + uint32_t iovs_cnt; + struct vring_packed_desc *desc_packed; + struct vring_desc *desc_split; + struct rte_vhost_vring *vq; + struct vhost_block_dev *bdev; + struct vhost_blk_ctrlr *ctrlr; +}; + +struct inflight_blk_task { + struct vhost_blk_task blk_task; + struct inflight_desc_packed *inflight_desc; + struct inflight_info_packed *inflight_packed; +}; + +struct vhost_blk_ctrlr *g_vhost_ctrlr; +struct vhost_device_ops vhost_blk_device_ops; +int g_should_stop; + +int vhost_bdev_process_blk_commands(struct vhost_block_dev *bdev, + struct vhost_blk_task *task); + +void vhost_session_install_rte_compat_hooks(uint32_t vid); + +void vhost_dev_install_rte_compat_hooks(const char *path); + +struct vhost_blk_ctrlr *vhost_blk_ctrlr_find(const char *ctrlr_name); + +#endif /* _VHOST_blk_H_ */ diff --git a/examples/vhost_blk/vhost_blk_compat.c b/examples/vhost_blk/vhost_blk_compat.c new file mode 100644 index 000000000..405b091d7 --- /dev/null +++ b/examples/vhost_blk/vhost_blk_compat.c @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _VHOST_BLK_COMPAT_H_ +#define _VHOST_BLK_COMPAT_H_ + +#include <sys/uio.h> +#include <stdint.h> +#include <linux/virtio_blk.h> +#include <linux/virtio_ring.h> + +#include <rte_vhost.h> +#include "vhost_blk.h" +#include "blk_spec.h" + +#define VHOST_MAX_VQUEUES 256 +#define SPDK_VHOST_MAX_VQ_SIZE 1024 + +#define VHOST_USER_GET_CONFIG 24 +#define VHOST_USER_SET_CONFIG 25 + +static int +vhost_blk_get_config(struct vhost_block_dev *bdev, uint8_t *config, + uint32_t len) +{ + struct virtio_blk_config blkcfg; + uint32_t blk_size; + uint64_t blkcnt; + + if (bdev == NULL) { + /* We can't just return -1 here as this GET_CONFIG message might + * be caused by a QEMU VM reboot. Returning -1 will indicate an + * error to QEMU, who might then decide to terminate itself. + * We don't want that. A simple reboot shouldn't break the system. + * + * Presenting a block device with block size 0 and block count 0 + * doesn't cause any problems on QEMU side and the virtio-pci + * device is even still available inside the VM, but there will + * be no block device created for it - the kernel drivers will + * silently reject it. 
+ */ + blk_size = 0; + blkcnt = 0; + } else { + blk_size = bdev->blocklen; + blkcnt = bdev->blockcnt; + } + + memset(&blkcfg, 0, sizeof(blkcfg)); + blkcfg.blk_size = blk_size; + /* minimum I/O size in blocks */ + blkcfg.min_io_size = 1; + /* expressed in 512 Bytes sectors */ + blkcfg.capacity = (blkcnt * blk_size) / 512; + /* QEMU can overwrite this value when started */ + blkcfg.num_queues = VHOST_MAX_VQUEUES; + + fprintf(stdout, "block device:blk_size = %d, blkcnt = %d\n", blk_size, blkcnt); + + memcpy(config, &blkcfg, min(len, sizeof(blkcfg))); + + return 0; +} + +static enum rte_vhost_msg_result +extern_vhost_pre_msg_handler(int vid, void *_msg) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_user_msg *msg = _msg; + int ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + return -1; + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + switch (msg->request) { + case VHOST_USER_GET_VRING_BASE: + if (!g_should_stop && ctrlr->started) + vhost_blk_device_ops.destroy_device(vid); + break; + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ADDR: + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_KICK: + if (!g_should_stop && ctrlr->started) + vhost_blk_device_ops.destroy_device(vid); + break; + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_MEM_TABLE: + if (!g_should_stop && ctrlr->started) { + vhost_blk_device_ops.destroy_device(vid); + ctrlr->need_restart = 1; + } + break; + case VHOST_USER_GET_CONFIG: { + int rc = 0; + + rc = vhost_blk_get_config(ctrlr->bdev, + msg->payload.cfg.region, msg->payload.cfg.size); + if (rc != 0) + msg->size = 0; + + return RTE_VHOST_MSG_RESULT_REPLY; + } + case VHOST_USER_SET_CONFIG: + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +static enum rte_vhost_msg_result +extern_vhost_post_msg_handler(int vid, void *_msg) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_user_msg *msg = _msg; + int ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + return -1; + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + if (ctrlr->need_restart) { + vhost_blk_device_ops.new_device(vid); + ctrlr->need_restart = 0; + } + + switch (msg->request) { + case VHOST_USER_SET_FEATURES: + break; + case VHOST_USER_SET_VRING_KICK: + /* vhost-user spec tells us to start polling a queue after receiving + * its SET_VRING_KICK message. Let's do it! 
+ */ + if (g_should_stop && !ctrlr->started) + vhost_blk_device_ops.new_device(vid); + break; + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +struct rte_vhost_user_extern_ops g_extern_vhost_ops = { + .pre_msg_handle = extern_vhost_pre_msg_handler, + .post_msg_handle = extern_vhost_post_msg_handler, +}; + +void +vhost_session_install_rte_compat_hooks(uint32_t vid) +{ + int rc; + + rc = rte_vhost_extern_callback_register(vid, &g_extern_vhost_ops, NULL); + if (rc != 0) + fprintf(stderr, + "rte_vhost_extern_callback_register() failed for vid = %d\n", + vid); +} + +void +vhost_dev_install_rte_compat_hooks(const char *path) +{ + uint64_t protocol_features = 0; + + rte_vhost_driver_get_protocol_features(path, &protocol_features); + protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_CONFIG); + protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); + rte_vhost_driver_set_protocol_features(path, protocol_features); +} + +#endif -- 2.17.2 ^ permalink raw reply [flat|nested] 4+ messages in thread
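For readers following the split-ring path in the example above, the inflight bookkeeping reduces to a fixed ordering around the used-ring update. The sketch below condenses what process_requestq_split() and submit_completion_split() do with the rte_vhost_*_inflight_*_split() calls; it is not a complete backend, and the helper name handle_one_request() and the data_len parameter are illustrative only (the real example carries that state inside struct vhost_blk_task).

#include <stdint.h>
#include <linux/virtio_ring.h>
#include <rte_vhost.h>

/* Sketch only: service one split-ring request with inflight tracking.
 * The ordering matters for crash recovery:
 *   1. mark the head descriptor inflight before doing the I/O,
 *   2. mark it as the last inflight I/O before filling the used ring,
 *   3. clear the inflight entry only after used->idx has been bumped.
 */
static void
handle_one_request(int vid, uint16_t q_idx, struct rte_vhost_vring *vq,
		   uint16_t req_idx, uint32_t data_len)
{
	struct vring_used *used = vq->used;

	/* step 1: record that this descriptor chain is being processed */
	rte_vhost_set_inflight_desc_split(vid, q_idx, req_idx);

	/* ... translate the descriptor chain and perform the block I/O ... */

	/* step 2: about to complete; remember it as the last inflight I/O */
	rte_vhost_set_last_inflight_io_split(vid, q_idx, req_idx);

	/* fill the next used entry, then publish it to the guest */
	used->ring[used->idx & (vq->size - 1)].id = req_idx;
	used->ring[used->idx & (vq->size - 1)].len = data_len;
	rte_compiler_barrier();
	used->idx++;
	rte_compiler_barrier();

	/* step 3: completion is visible to the guest; drop the inflight entry */
	rte_vhost_clr_inflight_desc_split(vid, q_idx, used->idx, req_idx);

	/* notify the guest that a completion is ready */
	rte_vhost_vring_call(vid, q_idx);
}

On reconnect, the resubmit path (submit_inflight_vq_split() in the example) walks inflight_vq->resubmit_inflight->resubmit_list[] to replay any descriptors that were still marked inflight, and only then resumes polling the avail ring.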