From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by inbox.dpdk.org (Postfix) with ESMTP id A2D52A00E6 for ; Wed, 10 Jul 2019 04:58:01 +0200 (CEST) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id CF8621B9A6; Wed, 10 Jul 2019 04:58:00 +0200 (CEST) Received: from mga06.intel.com (mga06.intel.com [134.134.136.31]) by dpdk.org (Postfix) with ESMTP id DA8301B9A5 for ; Wed, 10 Jul 2019 04:57:58 +0200 (CEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga104.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 09 Jul 2019 19:57:57 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.63,473,1557212400"; d="scan'208";a="317223795" Received: from storage36.sh.intel.com ([10.67.110.166]) by orsmga004.jf.intel.com with ESMTP; 09 Jul 2019 19:57:55 -0700 From: JinYu To: dev@dpdk.org Cc: changpeng.liu@intel.com, maxime.coquelin@redhat.com, tiwei.bie@intel.com, zhihong.wang@intel.com, JinYu Date: Wed, 10 Jul 2019 18:43:56 +0800 Message-Id: <20190710104356.6580-2-jin.yu@intel.com> X-Mailer: git-send-email 2.17.2 In-Reply-To: <20190710104356.6580-1-jin.yu@intel.com> References: <20190708183959.50293> <20190710104356.6580-1-jin.yu@intel.com> Subject: [dpdk-dev] [PATCH v2 2/2] vhost: Add vhost-user-blk example which support inflight X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" A vhost-user-blk example that supports the inflight feature. It uses the new APIs introduced in the first patch, so it can show how these APIs work to support the inflight feature. Signed-off-by: JinYu --- V1 - add the case. 
--- examples/vhost_blk/Makefile | 67 +++ examples/vhost_blk/blk.c | 118 ++++++ examples/vhost_blk/blk_spec.h | 95 +++++ examples/vhost_blk/meson.build | 20 + examples/vhost_blk/vhost_blk.c | 589 ++++++++++++++++++++++++++ examples/vhost_blk/vhost_blk.h | 96 +++++ examples/vhost_blk/vhost_blk_compat.c | 193 +++++++++ 7 files changed, 1178 insertions(+) create mode 100644 examples/vhost_blk/Makefile create mode 100644 examples/vhost_blk/blk.c create mode 100644 examples/vhost_blk/blk_spec.h create mode 100644 examples/vhost_blk/meson.build create mode 100644 examples/vhost_blk/vhost_blk.c create mode 100644 examples/vhost_blk/vhost_blk.h create mode 100644 examples/vhost_blk/vhost_blk_compat.c diff --git a/examples/vhost_blk/Makefile b/examples/vhost_blk/Makefile new file mode 100644 index 000000000..52e9befd8 --- /dev/null +++ b/examples/vhost_blk/Makefile @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2017 Intel Corporation + +# binary name +APP = vhost-blk + +# all source are stored in SRCS-y +SRCS-y := blk.c vhost_blk.c vhost_blk_compat.c + +# Build using pkg-config variables if possible +$(shell pkg-config --exists libdpdk) +ifeq ($(.SHELLSTATUS),0) + +all: shared +.PHONY: shared static +shared: build/$(APP)-shared + ln -sf $(APP)-shared build/$(APP) +static: build/$(APP)-static + ln -sf $(APP)-static build/$(APP) + +CFLAGS += -D_FILE_OFFSET_BITS=64 +LDFLAGS += -pthread + +PC_FILE := $(shell pkg-config --path libdpdk) +CFLAGS += -O3 $(shell pkg-config --cflags libdpdk) +LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk) +LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk) + +build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build + $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED) + +build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build + $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC) + +build: + @mkdir -p $@ + +.PHONY: clean +clean: + rm -f build/$(APP) build/$(APP)-static 
build/$(APP)-shared + test -d build && rmdir -p build || true + +else # Build using legacy build system + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +# Default target, detect a build directory, by looking for a path with a .config +RTE_TARGET ?= $(notdir $(abspath $(dir $(firstword $(wildcard $(RTE_SDK)/*/.config))))) + +include $(RTE_SDK)/mk/rte.vars.mk + +ifneq ($(CONFIG_RTE_EXEC_ENV_LINUX),y) +$(info This application can only operate in a linux environment, \ +please change the definition of the RTE_TARGET environment variable) +all: +else + +CFLAGS += -D_FILE_OFFSET_BITS=64 +CFLAGS += -O2 +#CFLAGS += $(WERROR_FLAGS) + +include $(RTE_SDK)/mk/rte.extapp.mk + +endif +endif diff --git a/examples/vhost_blk/blk.c b/examples/vhost_blk/blk.c new file mode 100644 index 000000000..768792bf9 --- /dev/null +++ b/examples/vhost_blk/blk.c @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +/** + * This work is largely based on the "vhost-user-blk" implementation by + * SPDK(https://github.com/spdk/spdk). 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "vhost_blk.h" +#include "blk_spec.h" + +static void +vhost_strcpy_pad(void *dst, const char *src, size_t size, int pad) +{ + size_t len; + + len = strlen(src); + if (len < size) { + memcpy(dst, src, len); + memset((char *)dst + len, pad, size - len); + } else { + memcpy(dst, src, size); + } +} + +static int +vhost_bdev_blk_readwrite(struct vhost_block_dev *bdev, + struct vhost_blk_task *task, + uint64_t lba_512, __rte_unused uint32_t xfer_len) +{ + uint32_t i; + uint64_t offset; + uint32_t nbytes = 0; + + offset = lba_512 * 512; + + for (i = 0; i < task->iovs_cnt; i++) { + if (task->dxfer_dir == BLK_DIR_TO_DEV) + memcpy(bdev->data + offset, task->iovs[i].iov_base, + task->iovs[i].iov_len); + else + memcpy(task->iovs[i].iov_base, bdev->data + offset, + task->iovs[i].iov_len); + offset += task->iovs[i].iov_len; + nbytes += task->iovs[i].iov_len; + } + + return nbytes; +} + +int +vhost_bdev_process_blk_commands(struct vhost_block_dev *bdev, + struct vhost_blk_task *task) +{ + int used_len; + + if (unlikely(task->data_len > (bdev->blockcnt * bdev->blocklen))) { + fprintf(stderr, "read or write beyond capacity\n"); + return VIRTIO_BLK_S_UNSUPP; + } + + switch (task->req->type) { + case VIRTIO_BLK_T_IN: + if (unlikely(task->data_len == 0 || (task->data_len & (512 - 1)) != 0)) { + fprintf(stderr, + "%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", + task->req->type ? "WRITE" : "READ", task->req_idx); + return VIRTIO_BLK_S_UNSUPP; + } + + task->dxfer_dir = BLK_DIR_FROM_DEV; + vhost_bdev_blk_readwrite(bdev, task, + task->req->sector, task->data_len); + break; + case VIRTIO_BLK_T_OUT: + if (unlikely(task->data_len == 0 || (task->data_len & (512 - 1)) != 0)) { + fprintf(stderr, + "%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", + task->req->type ? 
"WRITE" : "READ", task->req_idx); + return VIRTIO_BLK_S_UNSUPP; + } + + if (task->readtype) + return VIRTIO_BLK_S_IOERR; + task->dxfer_dir = BLK_DIR_TO_DEV; + vhost_bdev_blk_readwrite(bdev, task, + task->req->sector, task->data_len); + break; + case VIRTIO_BLK_T_GET_ID: + if (!task->iovs_cnt || task->data_len) { + return VIRTIO_BLK_S_UNSUPP; + } + used_len = min((size_t)VIRTIO_BLK_ID_BYTES, task->data_len); + vhost_strcpy_pad(task->iovs[0].iov_base, + bdev->product_name, used_len, ' '); + break; + default: + return VIRTIO_BLK_S_UNSUPP; + } + + return VIRTIO_BLK_S_OK; +} diff --git a/examples/vhost_blk/blk_spec.h b/examples/vhost_blk/blk_spec.h new file mode 100644 index 000000000..5875e2f86 --- /dev/null +++ b/examples/vhost_blk/blk_spec.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _BLK_SPEC_H +#define _BLK_SPEC_H + +#include + +#ifndef VHOST_USER_MEMORY_MAX_NREGIONS +#define VHOST_USER_MEMORY_MAX_NREGIONS 8 +#endif + +#ifndef VHOST_USER_MAX_CONFIG_SIZE +#define VHOST_USER_MAX_CONFIG_SIZE 256 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_CONFIG +#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 +#endif + +#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ + +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 +#define VIRTIO_BLK_T_FLUSH 4 +#define VIRTIO_BLK_T_GET_ID 8 +#define VIRTIO_BLK_T_DISCARD 11 +#define VIRTIO_BLK_T_WRITE_ZEROES 13 + +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + 
VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_MAX +}; + +/** Get/set config msg payload */ +struct vhost_user_config { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +}; + +/** Fixed-size vhost_memory struct */ +struct vhost_memory_padded { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[VHOST_USER_MEMORY_MAX_NREGIONS]; +}; + +struct vhost_user_msg { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /**< the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_memory_padded memory; + struct vhost_user_config cfg; + } payload; +} __attribute((packed)); + +#endif diff --git a/examples/vhost_blk/meson.build b/examples/vhost_blk/meson.build new file mode 100644 index 000000000..028aa4f62 --- /dev/null +++ b/examples/vhost_blk/meson.build @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +# meson file, for building this example as part of a main DPDK build. 
+# +# To build this example as a standalone application with an already-installed +# DPDK instance, use 'make' + +if not is_linux + build = false +endif + +if not cc.has_header('linux/virtio_blk.h') + build = false +endif + +deps += 'vhost' +sources = files( + 'blk.c', 'vhost_blk.c', 'vhost_blk_compat.c' +) diff --git a/examples/vhost_blk/vhost_blk.c b/examples/vhost_blk/vhost_blk.c new file mode 100644 index 000000000..8510b5aae --- /dev/null +++ b/examples/vhost_blk/vhost_blk.c @@ -0,0 +1,589 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "vhost_blk.h" +#include "blk_spec.h" + +#define VHOST_BLK_FEATURES ((1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VIRTIO_F_VERSION_1)) + +/* Path to folder where character device will be created. Can be set by user. 
*/ +static char dev_pathname[PATH_MAX] = ""; +static sem_t exit_sem; + +struct vhost_blk_ctrlr * +vhost_blk_ctrlr_find(const char *ctrlr_name) +{ + /* currently we only support 1 socket file fd */ + return g_vhost_ctrlr; +} + +static uint64_t gpa_to_vva(int vid, uint64_t gpa, uint64_t *len) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + int ret = 0; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + assert(ret != 0); + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + assert(ctrlr != NULL); + } + + assert(ctrlr->mem != NULL); + + return rte_vhost_va_from_guest_pa(ctrlr->mem, gpa, len); +} + +static struct vring_desc * +descriptor_get_next(struct vring_desc *vq_desc, struct vring_desc *cur_desc) +{ + return &vq_desc[cur_desc->next]; +} + +static bool +descriptor_has_next(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_NEXT); +} + +static bool +descriptor_is_wr(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +static void +submit_completion(struct vhost_blk_task *task, uint32_t vid, uint32_t q_idx) +{ + struct rte_vhost_vring *vq; + struct vring_used *used; + + vq = task->vq; + used = vq->used; + + rte_vhost_set_last_inflight_io_split(vid, q_idx, task->req_idx); + + /* Fill out the next entry in the "used" ring. id = the + * index of the descriptor that contained the blk request. + * len = the total amount of data transferred for the blk + * request. We must report the correct len, for variable + * length blk CDBs, where we may return less data than + * allocated by the guest VM. 
+ */ + used->ring[used->idx & (vq->size - 1)].id = task->req_idx; + used->ring[used->idx & (vq->size - 1)].len = task->data_len; + used->idx++; + + rte_vhost_clr_inflight_desc_split(vid, q_idx, used->idx, task->req_idx); + + /* Send an interrupt back to the guest VM so that it knows + * a completion is ready to be processed. + */ + rte_vhost_vring_call(task->bdev->vid, q_idx); +} + +static void +vhost_process_payload_chain(struct vhost_blk_task *task) +{ + void *data; + uint64_t chunck_len; + + task->iovs_cnt = 0; + + do { + chunck_len = task->desc->len; + data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc->addr, + &chunck_len); + if (!data || chunck_len != task->desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + return; + } + + task->iovs[task->iovs_cnt].iov_base = data; + task->iovs[task->iovs_cnt].iov_len = task->desc->len; + task->data_len += task->desc->len; + task->iovs_cnt++; + task->desc = descriptor_get_next(task->vq->desc, task->desc); + } while (descriptor_has_next(task->desc)); + + chunck_len = task->desc->len; + task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc->len) + fprintf(stderr, "failed to translate desc address.\n"); +} + +static struct vhost_block_dev * +vhost_blk_bdev_construct(const char *bdev_name, const char *bdev_serial, + uint32_t blk_size, uint64_t blk_cnt, + bool wce_enable) +{ + struct vhost_block_dev *bdev; + + bdev = rte_zmalloc(NULL, sizeof(*bdev), RTE_CACHE_LINE_SIZE); + if (!bdev) + return NULL; + + strncpy(bdev->name, bdev_name, sizeof(bdev->name)); + strncpy(bdev->product_name, bdev_serial, sizeof(bdev->product_name)); + bdev->blocklen = blk_size; + bdev->blockcnt = blk_cnt; + bdev->write_cache = wce_enable; + + fprintf(stdout, "blocklen=%d, blockcnt=%d\n", bdev->blocklen, bdev->blockcnt); + + /* use memory as disk storage space */ + bdev->data = rte_zmalloc(NULL, blk_cnt * blk_size, 0); + if 
(!bdev->data) { + fprintf(stderr, "no enough reseverd huge memory for disk\n"); + free(bdev); + return NULL; + } + + return bdev; +} + +static void +submit_inflight_vq(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + struct vhost_blk_queue *blk_vq; + struct rte_vhost_ring_inflight_split *inflight_vq; + struct resubmit_info *resubmit_inflight; + struct resubmit_desc *resubmit_list; + int i, req_idx; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + inflight_vq = &blk_vq->inflight_vq; + + resubmit_inflight = inflight_vq->resubmit_inflight_split; + resubmit_list = resubmit_inflight->resubmit_list; + + while (resubmit_list && resubmit_inflight->resubmit_num) { + struct vhost_blk_task *task; + uint64_t chunck_len; + int ret; + + i = (--resubmit_inflight->resubmit_num); + req_idx = resubmit_list[i].index; + + task = rte_zmalloc(NULL, sizeof(*task), 0); + assert(task != NULL); + + task->ctrlr = ctrlr; + task->bdev = ctrlr->bdev; + task->vq = &blk_vq->vq; + task->req_idx = req_idx; + task->desc = &task->vq->desc[task->req_idx]; + + /* does not support indirect descriptors */ + assert((task->desc->flags & VRING_DESC_F_INDIRECT) == 0); + + chunck_len = task->desc->len; + task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc->addr, + &chunck_len); + if (!task->req || chunck_len != task->desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + + task->desc = descriptor_get_next(task->vq->desc, task->desc); + if (!descriptor_has_next(task->desc)) { + task->dxfer_dir = BLK_DIR_NONE; + chunck_len = task->desc->len; + task->status = (void *)(uintptr_t) + gpa_to_vva(task->bdev->vid, + task->desc->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + } else { + task->readtype = descriptor_is_wr(task->desc); + vhost_process_payload_chain(task); + } + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, 
task); + if (ret) { + /* invalid response */ + *task->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->status = VIRTIO_BLK_S_OK; + } + submit_completion(task, ctrlr->bdev->vid, q_idx); + rte_free(task); + } +} + +static void +process_requestq(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx) +{ + int ret; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + + blk_vq = &ctrlr->bdev->queues[q_idx]; + vq = &blk_vq->vq; + + while (vq->avail->idx != blk_vq->last_avail_idx) { + int req_idx; + uint16_t last_idx; + struct vhost_blk_task *task; + uint64_t chunck_len; + + last_idx = blk_vq->last_avail_idx & (vq->size - 1); + req_idx = vq->avail->ring[last_idx]; + + task = rte_zmalloc(NULL, sizeof(*task), 0); + assert(task != NULL); + + task->ctrlr = ctrlr; + task->bdev = ctrlr->bdev; + task->vq = vq; + task->req_idx = req_idx; + task->desc = &task->vq->desc[task->req_idx]; + + rte_vhost_set_inflight_desc_split(ctrlr->bdev->vid, q_idx, last_idx); + + /* does not support indirect descriptors */ + assert((task->desc->flags & VRING_DESC_F_INDIRECT) == 0); + blk_vq->last_avail_idx++; + + chunck_len = task->desc->len; + task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid, + task->desc->addr, + &chunck_len); + if (!task->req || chunck_len != task->desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + + task->desc = descriptor_get_next(task->vq->desc, task->desc); + if (!descriptor_has_next(task->desc)) { + task->dxfer_dir = BLK_DIR_NONE; + chunck_len = task->desc->len; + task->status = (void *)(uintptr_t) + gpa_to_vva(task->bdev->vid, + task->desc->addr, + &chunck_len); + if (!task->status || chunck_len != task->desc->len) { + fprintf(stderr, "failed to translate desc address.\n"); + rte_free(task); + return; + } + } else { + task->readtype = descriptor_is_wr(task->desc); + vhost_process_payload_chain(task); + } + + ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task); + if (ret) { + /* 
invalid response */ + *task->status = VIRTIO_BLK_S_IOERR; + } else { + /* successfully */ + *task->status = VIRTIO_BLK_S_OK; + } + + submit_completion(task, ctrlr->bdev->vid, q_idx); + rte_free(task); + } +} + +/* Main framework for processing IOs */ +static void * +ctrlr_worker(void *arg) +{ + struct vhost_blk_ctrlr *ctrlr = (struct vhost_blk_ctrlr *)arg; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_ring_inflight_split *inflight_vq; + struct resubmit_info *resubmit_inflight; + cpu_set_t cpuset; + pthread_t thread; + int i, ret; + + fprintf(stdout, "Ctrlr Worker Thread start\n"); + + if (ctrlr == NULL || ctrlr->bdev == NULL) { + fprintf(stderr, "%s: Error, invalid argument passed to worker thread\n", + __func__); + exit(0); + } + + thread = pthread_self(); + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + blk_vq = &ctrlr->bdev->queues[i]; + inflight_vq = &blk_vq->inflight_vq; + resubmit_inflight = inflight_vq->resubmit_inflight_split; + if (resubmit_inflight && resubmit_inflight->resubmit_num) + submit_inflight_vq(ctrlr, i); + } + + while (!g_should_stop && ctrlr->bdev != NULL) { + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) + process_requestq(ctrlr, i); + } + + fprintf(stdout, "Ctrlr Worker Thread Exiting\n"); + sem_post(&exit_sem); + return NULL; +} + +static int +new_device(int vid) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + struct rte_vhost_ring_inflight_split *inflight_vq; + pthread_t tid; + int i, ret; + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + if (ctrlr->started) + return 0; + + ctrlr->bdev->vid = vid; + + ret = rte_vhost_get_mem_table(vid, &ctrlr->mem); + if (ret) + fprintf(stderr, "Get Controller memory region failed\n"); + assert(ctrlr->mem != NULL); + + /* Disable Notifications and init 
last idx */ + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + rte_vhost_enable_guest_notification(vid, i, 0); + + blk_vq = &ctrlr->bdev->queues[i]; + vq = &blk_vq->vq; + inflight_vq = &blk_vq->inflight_vq; + ret = rte_vhost_get_vring_base(ctrlr->bdev->vid, i, + &blk_vq->last_avail_idx, + &blk_vq->last_used_idx); + assert(ret == 0); + ret = rte_vhost_get_vhost_vring(ctrlr->bdev->vid, i, vq); + assert(ret == 0); + ret = rte_vhost_get_vhost_ring_inflight_split(ctrlr->bdev->vid, i, + inflight_vq); + assert(ret == 0); + } + + /* start polling vring */ + g_should_stop = 0; + fprintf(stdout, "New Device %s, Device ID %d\n", path, vid); + if (pthread_create(&tid, NULL, &ctrlr_worker, ctrlr) < 0) { + fprintf(stderr, "Worker Thread Started Failed\n"); + return -1; + } + + /* device has been started */ + ctrlr->started = 1; + pthread_detach(tid); + return 0; +} + +static void +destroy_device(int vid) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_blk_queue *blk_vq; + struct rte_vhost_vring *vq; + int i, ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Destroy Ctrlr Failed\n"); + return; + } + fprintf(stdout, "Destroy %s Device ID %d\n", path, vid); + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Destroy Ctrlr Failed\n"); + return; + } + + if (!ctrlr->started) + return; + + g_should_stop = 1; + + for (i = 0; i < NUM_OF_BLK_QUEUES; i++) { + blk_vq = &ctrlr->bdev->queues[i]; + rte_vhost_set_vring_base(ctrlr->bdev->vid, i, + blk_vq->last_avail_idx, blk_vq->last_used_idx); + } + + free(ctrlr->mem); + + ctrlr->started = 0; + sem_wait(&exit_sem); +} + +static int +new_connection(int vid) +{ + /* extend the proper features for block device */ + vhost_session_install_rte_compat_hooks(vid); +} + +struct vhost_device_ops vhost_blk_device_ops = { + .new_device = new_device, + .destroy_device = destroy_device, + .new_connection = new_connection, +}; + +static struct vhost_blk_ctrlr * 
+vhost_blk_ctrlr_construct(const char *ctrlr_name) +{ + int ret; + struct vhost_blk_ctrlr *ctrlr; + char *path; + char cwd[PATH_MAX]; + + /* always use current directory */ + path = getcwd(cwd, PATH_MAX); + if (!path) { + fprintf(stderr, "Cannot get current working directory\n"); + return NULL; + } + snprintf(dev_pathname, sizeof(dev_pathname), "%s/%s", path, ctrlr_name); + + if (access(dev_pathname, F_OK) != -1) { + if (unlink(dev_pathname) != 0) + rte_exit(EXIT_FAILURE, "Cannot remove %s.\n", + dev_pathname); + } + + if (rte_vhost_driver_register(dev_pathname, 0) != 0) { + fprintf(stderr, "socket %s already exists\n", dev_pathname); + return NULL; + } + + ret = rte_vhost_driver_set_features(dev_pathname, VHOST_BLK_FEATURES); + if (ret != 0) { + fprintf(stderr, "Set vhost driver features failed\n"); + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + /* set proper features */ + vhost_dev_install_rte_compat_hooks(dev_pathname); + + ctrlr = rte_zmalloc(NULL, sizeof(*ctrlr), RTE_CACHE_LINE_SIZE); + if (!ctrlr) { + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + /* hardcoded block device information with 128MiB */ + ctrlr->bdev = vhost_blk_bdev_construct("malloc0", "vhost_blk_malloc0", + 4096, 32768, 0); + if (!ctrlr->bdev) { + rte_free(ctrlr); + rte_vhost_driver_unregister(dev_pathname); + return NULL; + } + + rte_vhost_driver_callback_register(dev_pathname, + &vhost_blk_device_ops); + + return ctrlr; +} + +static void +signal_handler(__rte_unused int signum) +{ + struct vhost_blk_ctrlr *ctrlr; + + if (access(dev_pathname, F_OK) == 0) + unlink(dev_pathname); + + g_should_stop = 1; + + ctrlr = vhost_blk_ctrlr_find(NULL); + if (ctrlr != NULL) { + fprintf(stderr, "never come in\n"); + if (ctrlr->bdev != NULL) { + rte_free(ctrlr->bdev->data); + rte_free(ctrlr->bdev); + } + rte_free(ctrlr); + } + + rte_vhost_driver_unregister(dev_pathname); + exit(0); +} + +int main(int argc, char *argv[]) +{ + int ret; + + signal(SIGINT, signal_handler); 
+ + /* init EAL */ + ret = rte_eal_init(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); + + g_vhost_ctrlr = vhost_blk_ctrlr_construct("vhost.socket"); + if (g_vhost_ctrlr == NULL) { + fprintf(stderr, "Construct vhost blk controller failed\n"); + return 0; + } + + if (sem_init(&exit_sem, 0, 0) < 0) { + fprintf(stderr, "Error init exit_sem\n"); + return -1; + } + + rte_vhost_driver_start(dev_pathname); + + /* loop for exit the application */ + while (1) + sleep(1); + + return 0; +} + diff --git a/examples/vhost_blk/vhost_blk.h b/examples/vhost_blk/vhost_blk.h new file mode 100644 index 000000000..9df5856c9 --- /dev/null +++ b/examples/vhost_blk/vhost_blk.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _VHOST_BLK_H_ +#define _VHOST_BLK_H_ + +#include +#include +#include +#include +#include + +#include + +struct vhost_blk_queue { + struct rte_vhost_vring vq; + struct rte_vhost_ring_inflight_split inflight_vq; + uint16_t last_avail_idx; + uint16_t last_used_idx; +}; + +#define NUM_OF_BLK_QUEUES 1 + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +struct vhost_block_dev { + /** ID for vhost library. */ + int vid; + /** Queues for the block device */ + struct vhost_blk_queue queues[NUM_OF_BLK_QUEUES]; + /** Unique name for this block device. */ + char name[64]; + + /** Unique product name for this kind of block device. 
*/ + char product_name[256]; + + /** Size in bytes of a logical block for the backend */ + uint32_t blocklen; + + /** Number of blocks */ + uint64_t blockcnt; + + /** write cache enabled, not used at the moment */ + int write_cache; + + /** use memory as disk storage space */ + uint8_t *data; +}; + +struct vhost_blk_ctrlr { + uint8_t started; + uint8_t need_restart; + /** Only support 1 LUN for the example */ + struct vhost_block_dev *bdev; + /** VM memory region */ + struct rte_vhost_memory *mem; +} __rte_cache_aligned; + +#define VHOST_BLK_MAX_IOVS 128 + +enum blk_data_dir { + BLK_DIR_NONE = 0, + BLK_DIR_TO_DEV = 1, + BLK_DIR_FROM_DEV = 2, +}; + +struct vhost_blk_task { + uint8_t readtype; + int req_idx; + uint32_t dxfer_dir; + uint32_t data_len; + struct virtio_blk_outhdr *req; + volatile uint8_t *status; + struct iovec iovs[VHOST_BLK_MAX_IOVS]; + uint32_t iovs_cnt; + struct vring_desc *desc; + struct rte_vhost_vring *vq; + struct vhost_block_dev *bdev; + struct vhost_blk_ctrlr *ctrlr; +}; + +struct vhost_blk_ctrlr *g_vhost_ctrlr; +struct vhost_device_ops vhost_blk_device_ops; +int g_should_stop; + +int vhost_bdev_process_blk_commands(struct vhost_block_dev *bdev, + struct vhost_blk_task *task); + +void vhost_session_install_rte_compat_hooks(uint32_t vid); + +void vhost_dev_install_rte_compat_hooks(const char *path); + +struct vhost_blk_ctrlr* vhost_blk_ctrlr_find(const char *ctrlr_name); + +#endif /* _VHOST_blk_H_ */ diff --git a/examples/vhost_blk/vhost_blk_compat.c b/examples/vhost_blk/vhost_blk_compat.c new file mode 100644 index 000000000..405b091d7 --- /dev/null +++ b/examples/vhost_blk/vhost_blk_compat.c @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _VHOST_BLK_COMPAT_H_ +#define _VHOST_BLK_COMPAT_H_ + +#include +#include +#include +#include + +#include +#include "vhost_blk.h" +#include "blk_spec.h" + +#define VHOST_MAX_VQUEUES 256 +#define SPDK_VHOST_MAX_VQ_SIZE 1024 + +#define 
VHOST_USER_GET_CONFIG 24 +#define VHOST_USER_SET_CONFIG 25 + +static int +vhost_blk_get_config(struct vhost_block_dev *bdev, uint8_t *config, + uint32_t len) +{ + struct virtio_blk_config blkcfg; + uint32_t blk_size; + uint64_t blkcnt; + + if (bdev == NULL) { + /* We can't just return -1 here as this GET_CONFIG message might + * be caused by a QEMU VM reboot. Returning -1 will indicate an + * error to QEMU, who might then decide to terminate itself. + * We don't want that. A simple reboot shouldn't break the system. + * + * Presenting a block device with block size 0 and block count 0 + * doesn't cause any problems on QEMU side and the virtio-pci + * device is even still available inside the VM, but there will + * be no block device created for it - the kernel drivers will + * silently reject it. + */ + blk_size = 0; + blkcnt = 0; + } else { + blk_size = bdev->blocklen; + blkcnt = bdev->blockcnt; + } + + memset(&blkcfg, 0, sizeof(blkcfg)); + blkcfg.blk_size = blk_size; + /* minimum I/O size in blocks */ + blkcfg.min_io_size = 1; + /* expressed in 512 Bytes sectors */ + blkcfg.capacity = (blkcnt * blk_size) / 512; + /* QEMU can overwrite this value when started */ + blkcfg.num_queues = VHOST_MAX_VQUEUES; + + fprintf(stdout, "block device:blk_size = %d, blkcnt = %d\n", blk_size, blkcnt); + + memcpy(config, &blkcfg, min(len, sizeof(blkcfg))); + + return 0; +} + +static enum rte_vhost_msg_result +extern_vhost_pre_msg_handler(int vid, void *_msg) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_user_msg *msg = _msg; + int ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + return -1; + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + switch (msg->request) { + case VHOST_USER_GET_VRING_BASE: + if (!g_should_stop && ctrlr->started) + vhost_blk_device_ops.destroy_device(vid); + break; + case 
VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ADDR: + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_KICK: + if (!g_should_stop && ctrlr->started) + vhost_blk_device_ops.destroy_device(vid); + break; + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_MEM_TABLE: + if (!g_should_stop && ctrlr->started) { + vhost_blk_device_ops.destroy_device(vid); + ctrlr->need_restart = 1; + } + break; + case VHOST_USER_GET_CONFIG: { + int rc = 0; + + rc = vhost_blk_get_config(ctrlr->bdev, + msg->payload.cfg.region, msg->payload.cfg.size); + if (rc != 0) + msg->size = 0; + + return RTE_VHOST_MSG_RESULT_REPLY; + } + case VHOST_USER_SET_CONFIG: + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +static enum rte_vhost_msg_result +extern_vhost_post_msg_handler(int vid, void *_msg) +{ + char path[PATH_MAX]; + struct vhost_blk_ctrlr *ctrlr; + struct vhost_user_msg *msg = _msg; + int ret; + + ret = rte_vhost_get_ifname(vid, path, PATH_MAX); + if (ret) { + fprintf(stderr, "Cannot get socket name\n"); + return -1; + } + + ctrlr = vhost_blk_ctrlr_find(path); + if (!ctrlr) { + fprintf(stderr, "Controller is not ready\n"); + return -1; + } + + if (ctrlr->need_restart) { + vhost_blk_device_ops.new_device(vid); + ctrlr->need_restart = 0; + } + + switch (msg->request) { + case VHOST_USER_SET_FEATURES: + break; + case VHOST_USER_SET_VRING_KICK: + /* vhost-user spec tells us to start polling a queue after receiving + * its SET_VRING_KICK message. Let's do it! 
+ */ + if (g_should_stop && !ctrlr->started) + vhost_blk_device_ops.new_device(vid); + break; + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +struct rte_vhost_user_extern_ops g_extern_vhost_ops = { + .pre_msg_handle = extern_vhost_pre_msg_handler, + .post_msg_handle = extern_vhost_post_msg_handler, +}; + +void +vhost_session_install_rte_compat_hooks(uint32_t vid) +{ + int rc; + + rc = rte_vhost_extern_callback_register(vid, &g_extern_vhost_ops, NULL); + if (rc != 0) + fprintf(stderr, + "rte_vhost_extern_callback_register() failed for vid = %d\n", + vid); +} + +void +vhost_dev_install_rte_compat_hooks(const char *path) +{ + uint64_t protocol_features = 0; + + rte_vhost_driver_get_protocol_features(path, &protocol_features); + protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_CONFIG); + protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); + rte_vhost_driver_set_protocol_features(path, protocol_features); +} + +#endif -- 2.17.2